commit 22941965a71a75c83d27356f833ea780bb226975 from: Stefan Sperling date: Sun May 25 16:29:27 2025 UTC improve redundant pack detection during cleanup Instead of storing all referenced object IDs in a set, use the pack index of the large pack file we have created to check whether an object exists. Apply the modification time check only to packs which contain unreferenced objects. Otherwise we can leave many duplicate pack files behind. I observed this when all refs were packed and the packed-refs file was days old. ok op@ commit - 1b11aa0ac1ed83c510d6c78fc5279380dbcebe73 commit + 22941965a71a75c83d27356f833ea780bb226975 blob - 5c9aba1c8075530d47066c6456ea7b74a36a449f blob + 683c93b529c8fd992f426ac0b30fbb719910c71c --- lib/repository_admin.c +++ lib/repository_admin.c @@ -1282,8 +1282,8 @@ done: static const struct got_error * purge_redundant_pack(struct got_repository *repo, const char *packidx_path, - int dry_run, int ignore_mtime, time_t max_mtime, - int *remove, off_t *size_before, off_t *size_after) + int dry_run, int ignore_mtime, time_t max_mtime, int *remove, + int has_unref_objects, off_t *size_before, off_t *size_after) { static const char *ext[] = {".idx", ".pack", ".rev", ".bitmap", ".promisor", ".mtimes"}; @@ -1294,19 +1294,21 @@ purge_redundant_pack(struct got_repository *repo, cons if (strlcpy(path, packidx_path, sizeof(path)) >= sizeof(path)) return got_error(GOT_ERR_NO_SPACE); - /* - * Do not delete pack files which are younger than our maximum - * modification time threshold. This prevents a race where a - * new pack file which is being added to the repository - * concurrently would be deleted. - */ - if (fstatat(got_repo_get_fd(repo), path, &sb, 0) == -1) { - if (errno == ENOENT) - return NULL; - return got_error_from_errno2("fstatat", path); + if (*remove && !ignore_mtime && has_unref_objects) { + /* + * Do not delete pack files which contain unreferenced objects + * and are younger than our modification time threshold. + * This prevents a race where a new pack file which is being + * added to the repository concurrently would be deleted. + */ + if (fstatat(got_repo_get_fd(repo), path, &sb, 0) == -1) { + if (errno == ENOENT) + return NULL; + return got_error_from_errno2("fstatat", path); + } + if (sb.st_mtime > max_mtime) + *remove = 0; } - if (!ignore_mtime && sb.st_mtime > max_mtime) - *remove = 0; /* * For compatibility with Git, if a matching .keep file exist @@ -1354,9 +1356,9 @@ purge_redundant_pack(struct got_repository *repo, cons } static const struct got_error * -pack_is_redundant(int *redundant, struct got_repository *repo, - struct got_object_idset *traversed_ids, - const char *packidx_path, struct got_object_idset *idset) +pack_is_redundant(int *redundant, int *unref_objects, + struct got_repository *repo, struct got_object_idset *traversed_ids, + struct got_packidx *large_packidx, const char *packidx_path) { const struct got_error *err; struct got_packidx *packidx; @@ -1364,8 +1366,10 @@ pack_is_redundant(int *redundant, struct got_repositor struct got_object_id id; size_t i, nobjects; size_t digest_len = got_hash_digest_length(repo->algo); + int idx; - *redundant = 1; + *redundant = 0; + *unref_objects = 0; err = got_repo_get_packidx(&packidx, packidx_path, repo); if (err) @@ -1379,18 +1383,24 @@ pack_is_redundant(int *redundant, struct got_repositor memcpy(&id.hash, pid, digest_len); id.algo = repo->algo; - if (got_object_idset_contains(idset, &id)) - continue; + /* + * Pack files which contain unknown objects must be kept + * if they meet our modification time threshold. + */ + if (*unref_objects == 0 && + !got_object_idset_contains(traversed_ids, &id)) + *unref_objects = 1; - if (!got_object_idset_contains(traversed_ids, &id)) - continue; - - *redundant = 0; - err = got_object_idset_add(idset, &id, NULL); - if (err) - return err; + /* + * A pack file is not redundant if any referenced object + * it contains has no valid index in our large pack file, + */ + idx = got_packidx_get_object_idx(large_packidx, &id); + if (idx == -1 && got_object_idset_contains(traversed_ids, &id)) + return NULL; } + *redundant = 1; return NULL; } @@ -1419,18 +1429,19 @@ static const struct got_error * repo_purge_redundant_packfiles(struct got_repository *repo, struct got_object_idset *traversed_ids, off_t *size_before, off_t *size_after, int dry_run, int ignore_mtime, - time_t max_mtime, int nloose, int ncommits, int npurged, + time_t max_mtime, struct got_object_id *large_pack_hash, + int nloose, int ncommits, int npurged, struct got_ratelimit *rl, got_cleanup_progress_cb progress_cb, void *progress_arg, got_cancel_cb cancel_cb, void *cancel_arg) { const struct got_error *err; struct pack_info *pinfo, *sorted = NULL; - struct got_packidx *packidx; - struct got_object_idset *idset = NULL; + struct got_packidx *packidx, *large_packidx = NULL; struct got_pathlist_entry *pe; size_t i, npacks; - int remove, redundant_packs = 0; + int redundant_packs = 0, large_idxpath_len; + char *id_str = NULL, *large_idxpath = NULL; npacks = 0; RB_FOREACH(pe, got_pathlist_head, &repo->packidx_paths) @@ -1442,6 +1453,16 @@ repo_purge_redundant_packfiles(struct got_repository * sorted = calloc(npacks, sizeof(*sorted)); if (sorted == NULL) return got_error_from_errno("calloc"); + + err = got_object_id_str(&id_str, large_pack_hash); + if (err) + goto done; + large_idxpath_len = asprintf(&large_idxpath, "%s/pack-%s.idx", + GOT_OBJECTS_PACK_DIR, id_str); + if (large_idxpath_len == -1) { + err = got_error_from_errno("asprintf"); + goto done; + } i = 0; RB_FOREACH(pe, got_pathlist_head, &repo->packidx_paths) { @@ -1456,28 +1477,40 @@ repo_purge_redundant_packfiles(struct got_repository * } qsort(sorted, npacks, sizeof(*sorted), pack_info_cmp); - idset = got_object_idset_alloc(); - if (idset == NULL) { - err = got_error_from_errno("got_object_idset_alloc"); + /* + * Open the index of the large pack file we just created. + * Any other pack file composed only of objects which are also + * contained in our large pack is now redundant and can be purged. + */ + err = got_packidx_open(&large_packidx, got_repo_get_fd(repo), + large_idxpath, 1, repo->algo); + if (err) goto done; - } for (i = 0; i < npacks; ++i) { + int is_redundant = 0, has_unref_objects = 0; + if (cancel_cb) { err = (*cancel_cb)(cancel_arg); if (err) break; } - err = pack_is_redundant(&remove, repo, traversed_ids, - sorted[i].path, idset); - if (err) - goto done; + /* Do not delete the large pack file we just created. */ + if (got_path_cmp(sorted[i].path, large_idxpath, + sorted[i].path_len, large_idxpath_len) != 0) { + err = pack_is_redundant(&is_redundant, + &has_unref_objects, repo, traversed_ids, + large_packidx, sorted[i].path); + if (err) + goto done; + } err = purge_redundant_pack(repo, sorted[i].path, dry_run, - ignore_mtime, max_mtime, &remove, size_before, size_after); + ignore_mtime, max_mtime, &is_redundant, has_unref_objects, + size_before, size_after); if (err) goto done; - if (!remove) + if (!is_redundant) continue; err = report_cleanup_progress(progress_cb, progress_arg, rl, ncommits, nloose, npurged, ++redundant_packs); @@ -1494,8 +1527,10 @@ repo_purge_redundant_packfiles(struct got_repository * } done: free(sorted); - if (idset) - got_object_idset_free(idset); + if (large_packidx) + got_packidx_close(large_packidx); + free(large_idxpath); + free(id_str); return err; } @@ -1617,8 +1652,8 @@ got_repo_cleanup(struct got_repository *repo, err = repo_purge_redundant_packfiles(repo, traversed_ids, pack_before, pack_after, dry_run, ignore_mtime, max_mtime, - *nloose, *ncommits, npurged, &rl, progress_cb, progress_arg, - cancel_cb, cancel_arg); + &pack_hash, *nloose, *ncommits, npurged, + &rl, progress_cb, progress_arg, cancel_cb, cancel_arg); if (err) goto done;