commit cd25827e24b8fad0a3f4a89b72fbdba86bc2c5d1 from: Neels Hofmeyr date: Sun Sep 20 14:15:51 2020 UTC myers_divide: stop traversing snakes after reasonable max effort commit - 8546b0450f31c11e9433f5d5c6dc1d79c86107ed commit + cd25827e24b8fad0a3f4a89b72fbdba86bc2c5d1 blob - f56135e983c75dc1d4d176da40de71623e9d95b9 blob + 0e5a1532ffd05811db5e9dc363a73df4406cf063 --- lib/diff_myers.c +++ lib/diff_myers.c @@ -51,9 +51,6 @@ struct diff_box { unsigned int right_end; }; -#define diff_box_empty(DIFF_SNAKE) ((DIFF_SNAKE)->left_end == 0) - - /* If the two contents of a file are A B C D E and X B C Y, * the Myers diff graph looks like: * @@ -750,6 +747,16 @@ diff_divide_myers_backward(bool *found_midpoint, debug_dump_myers_graph(left, right, NULL, kd_forward, d, kd_backward, d); return 0; +} + +/* Integer square root approximation */ +static int +shift_sqrt(int val) +{ + int i; + for (i = 1; val > 0; val >>= 2) + i <<= 1; + return i; } /* Myers "Divide et Impera": tracing forwards from the start and backwards from @@ -783,6 +790,7 @@ diff_algo_myers_divide(const struct diff_algo_config * kd_buf[i] = -1; int *kd_forward = kd_buf; int *kd_backward = kd_buf + kd_len; + int max_effort = shift_sqrt(max/2); /* The 'k' axis in Myers spans positive and negative indexes, so point * the kd to the middle. @@ -809,7 +817,107 @@ diff_algo_myers_divide(const struct diff_algo_config * if (r) return r; if (found_midpoint) + break; + + /* Limit the effort spent looking for a mid snake. If files have + * very few lines in common, the effort spent to find nice mid + * snakes is just not worth it, the diff result will still be + * essentially minus everything on the left, plus everything on + * the right, with a few useless matches here and there. */ + if (d > max_effort) { + /* pick the furthest reaching point from + * kd_forward and kd_backward, and use that as a + * midpoint, to not step into another diff algo + * recursion with unchanged box. */ + int delta = (int)right->atoms.len - (int)left->atoms.len; + int x; + int y; + int i; + int best_forward_i = 0; + int best_forward_distance = 0; + int best_backward_i = 0; + int best_backward_distance = 0; + int distance; + int best_forward_x; + int best_forward_y; + int best_backward_x; + int best_backward_y; + + debug("~~~ d = %d > max_effort = %d\n", d, max_effort); + + for (i = d; i >= -d; i -= 2) { + if (i >= -(int)right->atoms.len && i <= (int)left->atoms.len) { + x = kd_forward[i]; + y = xk_to_y(x, i); + distance = x + y; + if (distance > best_forward_distance) { + best_forward_distance = distance; + best_forward_i = i; + } + } + + if (i >= -(int)left->atoms.len && i <= (int)right->atoms.len) { + x = kd_backward[i]; + y = xc_to_y(x, i, delta); + distance = (right->atoms.len - x) + + (left->atoms.len - y); + if (distance >= best_backward_distance) { + best_backward_distance = distance; + best_backward_i = i; + } + } + } + + /* The myers-divide didn't meet in the middle. We just + * figured out the places where the forward path + * advanced the most, and the backward path advanced the + * most. Just divide at whichever one of those two is better. + * + * o-o + * | + * o + * \ + * o + * \ + * F <-- cut here + * + * + * + * or here --> B + * \ + * o + * \ + * o + * | + * o-o + */ + best_forward_x = kd_forward[best_forward_i]; + best_forward_y = xk_to_y(best_forward_x, best_forward_i); + best_backward_x = kd_backward[best_backward_i]; + best_backward_y = xc_to_y(x, best_backward_i, delta); + + if (best_forward_distance >= best_backward_distance) { + x = best_forward_x; + y = best_forward_y; + } else { + x = best_backward_x; + y = best_backward_y; + } + + debug("max_effort cut at x=%d y=%d\n", x, y); + if (x < 0 || y < 0 + || x > left->atoms.len || y > right->atoms.len) + break; + + found_midpoint = true; + mid_snake = (struct diff_box){ + .left_start = x, + .left_end = x, + .right_start = y, + .right_end = y, + }; break; + } } if (!found_midpoint) { @@ -862,7 +970,7 @@ diff_algo_myers_divide(const struct diff_algo_config * /* else: left_section_len == 0 and right_section_len == 0, i.e. * nothing before the mid-snake. */ - if (!diff_box_empty(&mid_snake)) { + if (mid_snake.left_end > mid_snake.left_start) { /* The midpoint is a "snake", i.e. on a section of * identical data on both sides: that section * immediately becomes a solved diff chunk. */