commit 8c35ff140a39c592c937c6ddbf5e723e95a6fbfa from: Stefan Sperling date: Thu Nov 19 15:37:46 2020 UTC implement custom atomizer for blame to reuse data and mappings across commits commit - 89dc8b78ad19e890f6dd1fe4259652a32ff89b16 commit + 8c35ff140a39c592c937c6ddbf5e723e95a6fbfa blob - 18552faa524fd738b0f8f26b302df1e5785530ef blob + af7e1af4697f37ad206124a5aef6deec87e7134f --- lib/blame.c +++ lib/blame.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -50,24 +51,41 @@ struct got_blame_line { }; struct got_blame { - FILE *f; - off_t size; struct diff_config *cfg; - off_t filesize; - int nlines; - int nannotated; + int nlines; /* number of lines in file being blamed */ + int nannotated; /* number of lines already annotated */ struct got_blame_line *lines; /* one per line */ - off_t *line_offsets; /* one per line */ int ncommits; /* + * These change with every traversed commit. After diffing + * commits N:N-1, in preparation for diffing commits N-1:N-2, + * data for commit N is retained and flipped into data for N-1. + * + */ + FILE *f1; /* older version from commit N-1. */ + FILE *f2; /* newer version from commit N. */ + unsigned char *map1; + unsigned char *map2; + off_t size1; + off_t size2; + int nlines1; + int nlines2; + off_t *line_offsets1; + off_t *line_offsets2; + + /* * Map line numbers of an older version of the file to valid line - * numbers in blame->f. This map is updated with each commit we - * traverse throughout the file's history. - * Lines mapped to -1 do not correspond to any line in blame->f. + * numbers in the version of the file being blamed. This map is + * updated with each commit we traverse throughout the file's history. + * Lines mapped to -1 do not correspond to any line in the version + * being blamed.
*/ + int *linemap1; int *linemap2; - int nlines2; + + struct diff_data *data1; + struct diff_data *data2; }; static const struct got_error * @@ -94,8 +112,8 @@ annotate_line(struct got_blame *blame, int lineno, str } static const struct got_error * -blame_changes(struct got_blame *blame, int *linemap1, - struct diff_result *diff_result, struct got_object_id *commit_id, +blame_changes(struct got_blame *blame, struct diff_result *diff_result, + struct got_object_id *commit_id, const struct got_error *(*cb)(void *, int, int, struct got_object_id *), void *arg) { @@ -123,14 +141,15 @@ blame_changes(struct got_blame *blame, int *linemap1, if (left_count == right_count) { for (j = 0; j < left_count; j++) { - linemap1[idx1++] = blame->linemap2[idx2++]; + blame->linemap1[idx1++] = + blame->linemap2[idx2++]; } continue; } if (right_count == 0) { for (j = 0; j < left_count; j++) { - linemap1[idx1++] = -1; + blame->linemap1[idx1++] = -1; } continue; } @@ -149,6 +168,32 @@ blame_changes(struct got_blame *blame, int *linemap1, } static const struct got_error * +blame_prepare_file(FILE *f, unsigned char **p, off_t *size, + int *nlines, off_t **line_offsets, struct diff_data *diff_data, + const struct diff_config *cfg, struct got_blob_object *blob) +{ + const struct got_error *err = NULL; + int rc; + + err = got_object_blob_dump_to_file(size, nlines, line_offsets, + f, blob); + if (err) + return err; + +#ifndef GOT_DIFF_NO_MMAP + *p = mmap(NULL, *size, PROT_READ, MAP_PRIVATE, fileno(f), 0); + if (*p == MAP_FAILED) +#endif + *p = NULL; /* fall back on file I/O */ + + rc = diff_atomize_file(diff_data, cfg, f, *p, *size, 0); + if (rc) + return got_error_set_errno(rc, "diff_atomize_file"); + + return NULL; +} + +static const struct got_error * blame_commit(struct got_blame *blame, struct got_object_id *id, const char *path, struct got_repository *repo, const struct got_error *(*cb)(void *, int, int, struct got_object_id *), @@ -157,13 +202,9 @@ blame_commit(struct got_blame *blame,
struct got_objec const struct got_error *err = NULL; struct got_commit_object *commit = NULL; struct got_object_qid *pid = NULL; - struct got_object_id *blob_id = NULL, *pblob_id = NULL; - struct got_blob_object *blob = NULL, *pblob = NULL; - struct got_diffreg_result *diffreg_result = NULL; - FILE *f1 = NULL, *f2 = NULL; - off_t size1, size2; - int nlines1, nlines2; - int *linemap1 = NULL; + struct got_object_id *pblob_id = NULL; + struct got_blob_object *pblob = NULL; + struct diff_result *diff_result = NULL; err = got_object_open_as_commit(&commit, repo, id); if (err) @@ -175,91 +216,60 @@ blame_commit(struct got_blame *blame, struct got_objec return NULL; } - err = got_object_id_by_path(&blob_id, repo, id, path); + err = got_object_id_by_path(&pblob_id, repo, pid->id, path); if (err) { if (err->code == GOT_ERR_NO_TREE_ENTRY) err = NULL; goto done; } - err = got_object_open_as_blob(&blob, repo, blob_id, 8192); + err = got_object_open_as_blob(&pblob, repo, pblob_id, 8192); if (err) goto done; - f2 = got_opentemp(); - if (f2 == NULL) { + blame->f1 = got_opentemp(); + if (blame->f1 == NULL) { err = got_error_from_errno("got_opentemp"); goto done; } - err = got_object_blob_dump_to_file(&size2, &nlines2, NULL, - f2, blob); + + err = blame_prepare_file(blame->f1, &blame->map1, &blame->size1, + &blame->nlines1, &blame->line_offsets1, blame->data1, + blame->cfg, pblob); if (err) goto done; - err = got_object_id_by_path(&pblob_id, repo, pid->id, path); - if (err) { - if (err->code == GOT_ERR_NO_TREE_ENTRY) - err = NULL; + diff_result = diff_main(blame->cfg, blame->data1, blame->data2); + if (diff_result == NULL) { + err = got_error_set_errno(ENOMEM, "malloc"); goto done; } - - err = got_object_open_as_blob(&pblob, repo, pblob_id, 8192); - if (err) + if (diff_result->rc != DIFF_RC_OK) { + err = got_error_set_errno(diff_result->rc, "diff"); goto done; - - f1 = got_opentemp(); - if (f1 == NULL) { - err = got_error_from_errno("got_opentemp"); - goto done; } - err =
got_object_blob_dump_to_file(&size1, &nlines1, NULL, f1, pblob); - if (err) - goto done; - - err = got_diff_files(&diffreg_result, f1, "", f2, "", - 0, 0, NULL); - if (err) - goto done; - if (diffreg_result->result->chunks.len > 0) { - if (nlines1 > 0) { - linemap1 = calloc(nlines1, sizeof(*linemap1)); - if (linemap1 == NULL) { + if (diff_result->chunks.len > 0) { + if (blame->nlines1 > 0) { + blame->linemap1 = calloc(blame->nlines1, + sizeof(*blame->linemap1)); + if (blame->linemap1 == NULL) { - err = got_error_from_errno("malloc"); + err = got_error_from_errno("calloc"); goto done; } } - err = blame_changes(blame, linemap1, - diffreg_result->result, id, cb, arg); - if (err) { - free(linemap1); + err = blame_changes(blame, diff_result, id, cb, arg); + if (err) goto done; - } - if (linemap1) { - free(blame->linemap2); - blame->linemap2 = linemap1; - blame->nlines2 = nlines1; - } } else if (cb) err = cb(arg, blame->nlines, -1, id); done: - if (diffreg_result) { - const struct got_error *free_err; - free_err = got_diffreg_result_free(diffreg_result); - if (free_err && err == NULL) - err = free_err; - } + if (diff_result) + diff_result_free(diff_result); if (commit) got_object_commit_close(commit); - free(blob_id); free(pblob_id); - if (blob) - got_object_blob_close(blob); if (pblob) got_object_blob_close(pblob); - if (f1 && fclose(f1) != 0 && err == NULL) - err = got_error_from_errno("fclose"); - if (f2 && fclose(f2) != 0 && err == NULL) - err = got_error_from_errno("fclose"); return err; } @@ -268,23 +278,213 @@ blame_close(struct got_blame *blame) { const struct got_error *err = NULL; - if (blame->f && fclose(blame->f) != 0 && err == NULL) + diff_data_free(blame->data1); + free(blame->data1); + diff_data_free(blame->data2); + free(blame->data2); + if (blame->map1) { + if (munmap(blame->map1, blame->size1) == -1 && err == NULL) + err = got_error_from_errno("munmap"); + } + if (blame->map2) { + if (munmap(blame->map2, blame->size2) == -1 && err == NULL) + err = 
got_error_from_errno("munmap"); + } + if
(blame->f1 && fclose(blame->f1) != 0 && err == NULL) err = got_error_from_errno("fclose"); + if (blame->f2 && fclose(blame->f2) != 0 && err == NULL) + err = got_error_from_errno("fclose"); free(blame->lines); + free(blame->line_offsets1); + free(blame->line_offsets2); + free(blame->linemap1); free(blame->linemap2); free(blame->cfg); free(blame); return err; +} + +static int +atomize_file(struct diff_data *d, FILE *f, off_t filesize, int nlines, + off_t *line_offsets) +{ + int i, rc = DIFF_RC_OK; + + ARRAYLIST_INIT(d->atoms, nlines); + + for (i = 0; i < nlines; i++) { + struct diff_atom *atom; + off_t len, pos = line_offsets[i]; + unsigned int hash = 0; + int j; + + ARRAYLIST_ADD(atom, d->atoms); + if (atom == NULL) { + rc = errno; + break; + } + + if (i < nlines - 1) + len = line_offsets[i + 1] - pos; + else + len = filesize - pos; + + if (fseeko(f, pos, SEEK_SET) == -1) { + rc = errno; + break; + } + for (j = 0; j < len; j++) { + int c = fgetc(f); + if (c == EOF) { + if (feof(f)) + rc = EIO; /* unexpected EOF */ + else + rc = errno; + goto done; + } + + hash = diff_atom_hash_update(hash, (unsigned char)c); + } + *atom = (struct diff_atom){ + .root = d, + .pos = pos, + .at = NULL, /* atom data is not memory-mapped */ + .len = len, + .hash = hash, + }; + } +done: + if (rc) + ARRAYLIST_FREE(d->atoms); + + return rc; } +static int +atomize_file_mmap(struct diff_data *d, unsigned char *p, + off_t filesize, int nlines, off_t *line_offsets) +{ + int i, rc = DIFF_RC_OK; + + ARRAYLIST_INIT(d->atoms, nlines); + + for (i = 0; i < nlines; i++) { + struct diff_atom *atom; + off_t len, pos = line_offsets[i]; + unsigned int hash = 0; + int j; + + ARRAYLIST_ADD(atom, d->atoms); + if (atom == NULL) { + rc = errno; + break; + } + + if (i < nlines - 1) + len = line_offsets[i + 1] - pos; + else + len = filesize - pos; + + for (j = 0; j < len; j++) + hash = diff_atom_hash_update(hash, p[pos + j]); + + *atom = (struct diff_atom){ + .root = d, + .pos = pos, + .at = &p[pos], + .len =
len, + .hash = hash, + }; + } + + if (rc) + ARRAYLIST_FREE(d->atoms); + + return rc; +} + +/* Implements diff_atomize_func_t */ +static int +blame_atomize_file(void *arg, struct diff_data *d) +{ + struct got_blame *blame = arg; + + if (d->f == blame->f1) { + if (blame->map1) + return atomize_file_mmap(d, blame->map1, + blame->size1, blame->nlines1, + blame->line_offsets1); + else + return atomize_file(d, blame->f1, blame->size1, + blame->nlines1, blame->line_offsets1); + } else if (d->f == blame->f2) { + if (d->atoms.len > 0) { + /* Re-use data from previous commit. */ + return DIFF_RC_OK; + } + if (blame->map2) + return atomize_file_mmap(d, blame->map2, + blame->size2, blame->nlines2, + blame->line_offsets2); + else + return atomize_file(d, blame->f2, blame->size2, + blame->nlines2, blame->line_offsets2); + } + + return DIFF_RC_OK; +} + static const struct got_error * +close_file2_and_reuse_file1(struct got_blame *blame) +{ + struct diff_data *d; + + free(blame->line_offsets2); + blame->line_offsets2 = blame->line_offsets1; + blame->line_offsets1 = NULL; + + free(blame->linemap2); + blame->linemap2 = blame->linemap1; + blame->linemap1 = NULL; + + if (blame->map2) { + if (munmap(blame->map2, blame->size2) == -1) + return got_error_from_errno("munmap"); + } + /* Hand the mapping over even if f2 had none; map1 must not linger. */ + blame->map2 = blame->map1; + blame->map1 = NULL; + blame->size2 = blame->size1; + blame->size1 = 0; + + if (blame->f2 && fclose(blame->f2) == EOF) + return got_error_from_errno("fclose"); + blame->f2 = blame->f1; + blame->f1 = NULL; + + blame->nlines2 = blame->nlines1; + blame->nlines1 = 0; + + /* + * line_offsets2 took over line_offsets1 above; do not free it again. 
+ */ + + diff_data_free(blame->data2); /* does not free pointer itself */ + memset(blame->data2, 0, sizeof(*blame->data2)); + d = blame->data2; + blame->data2 = blame->data1; + blame->data1 = d; + + return NULL; +} + +static const struct got_error * blame_open(struct got_blame **blamep, const char *path, struct got_object_id
*start_commit_id, struct got_repository *repo, const struct got_error *(*cb)(void *, int, int, struct got_object_id *), void *arg, got_cancel_cb cancel_cb, void *cancel_arg) { const struct got_error *err = NULL; - struct got_object *obj = NULL; struct got_object_id *obj_id = NULL; struct got_blob_object *blob = NULL; struct got_blame *blame = NULL; @@ -298,42 +498,49 @@ blame_open(struct got_blame **blamep, const char *path if (err) goto done; - err = got_object_open(&obj, repo, obj_id); + err = got_object_open_as_blob(&blob, repo, obj_id, 8192); if (err) goto done; - if (obj->type != GOT_OBJ_TYPE_BLOB) { - err = got_error_path(path, GOT_ERR_OBJ_TYPE); + blame = calloc(1, sizeof(*blame)); + if (blame == NULL) { + err = got_error_from_errno("calloc"); goto done; } - err = got_object_blob_open(&blob, repo, obj, 8192); - if (err) + blame->data1 = calloc(1, sizeof(*blame->data1)); + if (blame->data1 == NULL) { + err = got_error_from_errno("calloc"); goto done; - - blame = calloc(1, sizeof(*blame)); - if (blame == NULL) { + } + blame->data2 = calloc(1, sizeof(*blame->data2)); + if (blame->data2 == NULL) { err = got_error_from_errno("calloc"); goto done; } - blame->f = got_opentemp(); - if (blame->f == NULL) { + blame->f2 = got_opentemp(); + if (blame->f2 == NULL) { err = got_error_from_errno("got_opentemp"); goto done; } - err = got_object_blob_dump_to_file(&blame->filesize, &blame->nlines, - &blame->line_offsets, blame->f, blob); - if (err || blame->nlines == 0) - goto done; - err = got_diff_get_config(&blame->cfg, GOT_DIFF_ALGORITHM_PATIENCE, - NULL, NULL); + blame_atomize_file, blame); if (err) goto done; + err = blame_prepare_file(blame->f2, &blame->map2, &blame->size2, + &blame->nlines2, &blame->line_offsets2, blame->data2, + blame->cfg, blob); + blame->nlines = blame->nlines2; + if (err || blame->nlines == 0) + goto done; + + got_object_blob_close(blob); + blob = NULL; + /* Don't include \n at EOF in the blame line count.
*/ - if (blame->line_offsets[blame->nlines - 1] == blame->filesize) + if (blame->line_offsets2[blame->nlines - 1] == blame->size2) blame->nlines--; blame->lines = calloc(blame->nlines, sizeof(*blame->lines)); @@ -342,7 +549,6 @@ blame_open(struct got_blame **blamep, const char *path goto done; } - blame->nlines2 = blame->nlines; blame->linemap2 = calloc(blame->nlines2, sizeof(*blame->linemap2)); if (blame->linemap2 == NULL) { err = got_error_from_errno("calloc"); @@ -380,6 +586,10 @@ blame_open(struct got_blame **blamep, const char *path } if (blame->nannotated == blame->nlines) break; + + err = close_file2_and_reuse_file1(blame); + if (err) + goto done; } } @@ -396,8 +606,6 @@ done: if (graph) got_commit_graph_close(graph); free(obj_id); - if (obj) - got_object_close(obj); if (blob) got_object_blob_close(blob); if (err) {