问题描述:Registry 中存储的镜像数量过多,占用了大量磁盘空间,最终导致磁盘使用率达到 100%,造成服务异常(如无法推送新镜像、拉取镜像超时等)。

张开发
2026/4/9 1:02:21 15 分钟阅读

分享文章

问题描述:Registry 中存储的镜像数量过多,占用了大量磁盘空间,最终导致磁盘使用率达到 100%,造成服务异常(如无法推送新镜像、拉取镜像超时等)。
解决方案

代码逻辑

查询待清理镜像:从数据库获取所有已标记为软删除(is_deleted = 1)且创建时间超过指定天数的镜像记录,生成待清理清单。

安全检查:对于每个待清理镜像,通过 Registry API 获取其 manifest digest,并检查该 digest 是否被多个 tag 引用。只有当引用数为 1(即该 manifest 仅被当前 tag 使用)时才执行删除操作,避免误删仍被其他 tag 依赖的镜像。

删除 manifest:调用 Registry API 的 DELETE /v2/<name>/manifests/<digest> 接口,删除镜像的 manifest 文件。

释放存储空间:删除 manifest 后,镜像的底层层(blob)并不会立即删除。需要手动运行 Registry 自带的垃圾回收(GC)命令,根据引用计数清理不再被任何 manifest 引用的 blob,从而真正释放磁盘空间。

共享层保护:如果多个镜像共享相同的基础层,删除其中一个镜像的 manifest 不会影响其他镜像对该基础层的引用。GC 执行时会保留引用计数大于 0 的 blob,确保共享层不被误删。

总结

删除操作:删除的是 manifest 文件(相当于镜像的"目录清单"),不是直接删层(blob)。

手动 GC:才会真正删除不再被任何 manifest 引用的 blob。

Registry 维护引用计数(每个 blob 被哪些 manifest 引用)。

共享层(如基础层 L):只要还有至少一个 manifest 引用它,GC 就不会删除它。

每层有唯一的内容摘要(digest)。整个镜像也有一个唯一的 digest,即 manifest digest。

完整代码

```python
import pymysql
from datetime import datetime, timedelta
import requests
import logging
import argparse
import os

# Registry 配置(请根据实际环境修改)
registry_url = "http://your-registry-host:port"
registry_host = "your-registry-host:port"


def setup_logger():
    logger = logging.getLogger("image_cleanup")
    logger.setLevel(logging.DEBUG)

    # 清除旧的 handler 防止重复
    if logger.hasHandlers():
        logger.handlers.clear()

    file_handler = logging.FileHandler("image_cleanup.log", encoding="utf-8")
    file_handler.setLevel(logging.DEBUG)

    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)

    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(formatter)
    console_handler.setFormatter(formatter)

    logger.addHandler(file_handler)
    logger.addHandler(console_handler)
    return logger


logger = setup_logger()


def connect_to_db():
    try:
        conn = pymysql.connect(
            host="your-db-host",
            user="your-db-user",
            password="your-db-password",
            database="your-db-name",
            charset="utf8mb4"
        )
        return conn
    except Exception as e:
        logger.error(f"Failed to connect to the database: {e}")
        exit(1)


def query_images(days):
    conn = connect_to_db()
    cursor = conn.cursor()
    query = """
        SELECT name, project_name,
               DATE_ADD(create_time, INTERVAL 8 HOUR) as adjusted_create_time,
               user_name, real_tag
        FROM image
        WHERE is_deleted = 1
    """
    params = []
    if days > 0:
        date_threshold = datetime.now() - timedelta(days=days)
        date_threshold_str = date_threshold.strftime("%Y-%m-%d %H:%M:%S")
        query += " AND create_time < %s"
        params.append(date_threshold_str)

    try:
        cursor.execute(query, params)
        results = cursor.fetchall()
    except Exception as e:
        logger.error(f"Failed to execute query: {e}")
        results = []
    finally:
        cursor.close()
        conn.close()
    return results


def get_tag_digest(repo, tag):
    url = f"{registry_url}/v2/{repo}/manifests/{tag}"
    headers = {
        "Accept": "application/vnd.docker.distribution.manifest.v2+json, application/vnd.oci.image.manifest.v1+json"
    }
    try:
        resp = requests.get(url, headers=headers, timeout=5)
        if resp.status_code == 200:
            return resp.headers.get("Docker-Content-Digest")
        else:
            logger.debug(f"Get digest failed for {repo}:{tag}, status: {resp.status_code}")
    except Exception as e:
        logger.error(f"Request error for {repo}:{tag}: {e}")
    return None


def get_all_tags(repo):
    url = f"{registry_url}/v2/{repo}/tags/list"
    try:
        resp = requests.get(url, timeout=5)
        if resp.status_code == 200:
            return resp.json().get("tags", [])
    except Exception as e:
        logger.error(f"Failed to get tags for {repo}: {e}")
    return []


def get_digest_reference_count(repo, digest):
    if not digest: return 0
    tags = get_all_tags(repo)
    count = 0
    for tag in tags:
        if get_tag_digest(repo, tag) == digest:
            count += 1
    return count


def safe_delete_image(repo, tag):
    logger.info(f"Attempting to delete: {repo}:{tag}")
    digest = get_tag_digest(repo, tag)
    if not digest:
        logger.warning(f"Cannot get digest for {repo}:{tag}, skipping.")
        return False

    # 检查是否有其他标签引用同一个镜像层
    ref_count = get_digest_reference_count(repo, digest)
    if ref_count > 1:
        logger.warning(f"Digest {digest} is referenced by {ref_count} tags, skipping deletion of {repo}:{tag}")
        return False

    # 执行删除
    delete_url = f"{registry_url}/v2/{repo}/manifests/{digest}"
    logger.debug(f"Delete URL: {delete_url}")
    try:
        resp = requests.delete(delete_url)
        if resp.status_code in (200, 202):
            return True
        else:
            logger.error(f"Delete API returned status {resp.status_code}: {resp.text}")
            return False
    except Exception as e:
        logger.error(f"Delete request failed: {e}")
        return False


def parse_image_string(full_image_string):
    """
    解析类似 your-registry-host:port/my-nginx:v1.0 的字符串
    返回 (repo, tag)
    """
    try:
        # 1. 去掉可能存在的协议头 (http://)
        if full_image_string.startswith("http"):
            full_image_string = full_image_string.split("//", 1)[1]

        # 2. 分割 域名/仓库路径
        if "/" not in full_image_string:
            return None, None
        _, path_part = full_image_string.split("/", 1)

        # 3. 分割 仓库名:标签
        if ":" not in path_part:
            return path_part, "latest"
        repo, tag = path_part.rsplit(":", 1)
        return repo, tag
    except Exception as e:
        logger.error(f"Failed to parse image string {full_image_string}: {e}")
        return None, None


def main():
    parser = argparse.ArgumentParser(description="Image Cleanup Script")
    parser.add_argument("action", choices=["list", "rm"], help="Action to perform: list or rm")
    parser.add_argument("param", help="Days for list or file path for rm")
    args = parser.parse_args()

    if args.action == "list":
        days = int(args.param)
        results = query_images(days)
        with open("output.txt", "w", encoding="utf-8") as file:
            for row in results:
                name, project_name, adjusted_create_time, user_name, real_tag = row
                repository = name
                tag = real_tag
                name_with_prefix = f"{registry_host}/{repository}:{tag}"
                line = "\t".join([name_with_prefix, str(project_name), str(adjusted_create_time), str(user_name)])
                file.write(line + "\n")
                logger.info(f"Listed image: {name_with_prefix}")

    elif args.action == "rm":
        file_path = args.param
        if not os.path.isfile(file_path):
            logger.error(f"File not found: {file_path}")
            return
        with open(file_path, "r", encoding="utf-8") as file:
            for line in file:
                line = line.strip()
                if not line:
                    continue
                parts = line.split("\t")
                full_image = parts[0]
                logger.debug(f"Processing line: {line}")
                logger.debug(f"Extracted image: {full_image}")
                repository, tag = parse_image_string(full_image)
                if not repository or not tag:
                    logger.warning(f"Invalid image format in line: {line}")
                    continue
                if safe_delete_image(repository, tag):
                    logger.info(f"Successfully deleted: {full_image}")
                else:
                    logger.warning(f"Failed to delete: {full_image}")


if __name__ == "__main__":
    main()
```

更多文章