commit ca6fcbdce0110845f1195d2d1f5a14ed52a45baf from: Neels Hofmeyr date: Tue Oct 20 01:05:46 2020 UTC patience: make it easy to switch impls for mark_unique commit - 926387ed74169349bdfa40445192f2b23d815244 commit + ca6fcbdce0110845f1195d2d1f5a14ed52a45baf blob - ab51ce7a36d7cfb1b9e50d047239e9b760dba83b blob + 7dcfd74d9e824db631a9d3bc775f8e4ff365720c --- lib/diff_patience.c +++ lib/diff_patience.c @@ -29,8 +29,18 @@ #include "diff_internal.h" #include "diff_debug.h" +/* Algorithm to find unique lines: + * 0: stupidly iterate atoms + * 1: qsort + * 2: mergesort + */ +#define UNIQUE_STRATEGY 2 + /* Per-atom state for the Patience Diff algorithm */ struct atom_patience { +#if UNIQUE_STRATEGY == 0 + bool unique_here; +#endif bool unique_in_both; struct diff_atom *pos_in_other; struct diff_atom *prev_stack; @@ -46,7 +56,136 @@ struct atom_patience { (((struct atom_patience*)((ATOM)->root->current->algo_data))\ [diff_atom_idx((ATOM)->root->current, ATOM)]) -int diff_atoms_mergesort_compar(const void *_a, const void *_b) +#if UNIQUE_STRATEGY == 0 + +/* Stupid iteration and comparison of all atoms */ +static int +diff_atoms_mark_unique(struct diff_data *d, unsigned int *unique_count) +{ + struct diff_atom *i; + unsigned int count = 0; + diff_data_foreach_atom(i, d) { + PATIENCE(i).unique_here = true; + PATIENCE(i).unique_in_both = true; + count++; + } + diff_data_foreach_atom(i, d) { + struct diff_atom *j; + + if (!PATIENCE(i).unique_here) + continue; + + diff_data_foreach_atom_from(i + 1, j, d) { + bool same; + int r = diff_atom_same(&same, i, j); + if (r) + return r; + if (!same) + continue; + if (PATIENCE(i).unique_here) { + PATIENCE(i).unique_here = false; + PATIENCE(i).unique_in_both = false; + count--; + } + PATIENCE(j).unique_here = false; + PATIENCE(j).unique_in_both = false; + count--; + } + } + if (unique_count) + *unique_count = count; + return 0; +} + +/* Mark those lines as PATIENCE(atom).unique_in_both = true that appear exactly + * once in each side. */ +static int +diff_atoms_mark_unique_in_both(struct diff_data *left, struct diff_data *right, + unsigned int *unique_in_both_count) +{ + /* Derive the final unique_in_both count without needing an explicit + * iteration. So this is just some optimiziation to save one iteration + * in the end. */ + unsigned int unique_in_both; + int r; + + r = diff_atoms_mark_unique(left, &unique_in_both); + if (r) + return r; + r = diff_atoms_mark_unique(right, NULL); + if (r) + return r; + + debug("unique_in_both %u\n", unique_in_both); + + struct diff_atom *i; + diff_data_foreach_atom(i, left) { + if (!PATIENCE(i).unique_here) + continue; + struct diff_atom *j; + int found_in_b = 0; + diff_data_foreach_atom(j, right) { + bool same; + int r = diff_atom_same(&same, i, j); + if (r) + return r; + if (!same) + continue; + if (!PATIENCE(j).unique_here) { + found_in_b = 2; /* or more */ + break; + } else { + found_in_b = 1; + PATIENCE(j).pos_in_other = i; + PATIENCE(i).pos_in_other = j; + } + } + + if (found_in_b == 0 || found_in_b > 1) { + PATIENCE(i).unique_in_both = false; + unique_in_both--; + debug("unique_in_both %u (%d) ", unique_in_both, + found_in_b); + debug_dump_atom(left, NULL, i); + } + } + + /* Still need to unmark right[*]->patience.unique_in_both for atoms that + * don't exist in left */ + diff_data_foreach_atom(i, right) { + if (!PATIENCE(i).unique_here + || !PATIENCE(i).unique_in_both) + continue; + struct diff_atom *j; + bool found_in_a = false; + diff_data_foreach_atom(j, left) { + bool same; + int r; + if (!PATIENCE(j).unique_in_both) + continue; + r = diff_atom_same(&same, i, j); + if (r) + return r; + if (!same) + continue; + found_in_a = true; + break; + } + + if (!found_in_a) + PATIENCE(i).unique_in_both = false; + } + + if (unique_in_both_count) + *unique_in_both_count = unique_in_both; + return 0; +} + +#else /* UNIQUE_STRATEGY != 0 */ + +/* Use an optimized sorting algorithm (qsort, mergesort) to find unique lines */ + +int diff_atoms_compar(const void *_a, const void *_b) { const struct diff_atom *a = *(struct diff_atom**)_a; const struct diff_atom *b = *(struct diff_atom**)_b; @@ -54,7 +193,7 @@ int diff_atoms_mergesort_compar(const void *_a, const int rc = 0; /* If there's been an error (e.g. I/O error) in a previous compar, we - * have no way to abort the mergesort but just report the rc and stop + * have no way to abort the sort but just report the rc and stop * comparing. Make sure to catch errors on either side. If atoms are * from more than one diff_data, make sure the error, if any, spreads * to all of them, so we can cut short all future comparisons. */ @@ -93,14 +232,14 @@ int diff_atoms_mergesort_compar(const void *_a, const /* Sort an array of struct diff_atom* in-place. */ static int diff_atoms_sort(struct diff_atom *atoms[], - size_t atoms_count) + size_t atoms_count) { - /* - * Use mergesort(3) bcause it provides stable ordering - * even if two or more elements are equal. - */ +#if UNIQUE_STRATEGY == 1 + qsort(atoms, atoms_count, sizeof(struct diff_atom*), diff_atoms_compar); +#else mergesort(atoms, atoms_count, sizeof(struct diff_atom*), - diff_atoms_mergesort_compar); + diff_atoms_compar); +#endif return atoms[0]->root->err; } @@ -179,6 +318,7 @@ free_and_exit: free(all_atoms); return rc; } +#endif /* UNIQUE_STRATEGY != 0 */ static int diff_atoms_swallow_identical_neighbors(struct diff_data *left,