1 /* Myers diff algorithm implementation, invented by Eugene W. Myers [1].
2 * Implementations of both the Myers Divide Et Impera (using linear space)
3 * and the canonical Myers algorithm (using quadratic space). */
5 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
26 #include <diff/arraylist.h>
27 #include <diff/diff_main.h>
31 /* Myers' diff algorithm [1] is nicely explained in [2].
32 * [1] http://www.xmailserver.org/diff2.pdf
33 * [2] https://blog.jcoglan.com/2017/02/12/the-myers-diff-algorithm-part-1/ ff.
35 * Myers approaches finding the smallest diff as a graph problem.
36 * The crux is that the original algorithm requires quadratic amount of memory:
37 * both sides' lengths added, and that squared. So if we're diffing lines of
38 * text, two files with 1000 lines each would blow up to a matrix of about
39 * 2000 * 2000 ints of state, about 16 Mb of RAM to figure out 2 kb of text.
40 * The solution is using Myers' "divide and conquer" extension algorithm, which
41 * does the original traversal from both ends of the files to reach a middle
42 * where these "snakes" touch, hence does not need to backtrace the traversal,
43 * and so gets away with only keeping a single column of that huge state matrix
48 unsigned int left_start;
49 unsigned int left_end;
50 unsigned int right_start;
51 unsigned int right_end;
54 #define diff_box_empty(DIFF_SNAKE) ((DIFF_SNAKE)->left_end == 0)
57 /* If the two contents of a file are A B C D E and X B C Y,
58 * the Myers diff graph looks like:
76 * Moving right means delete an atom from the left-hand-side,
77 * Moving down means add an atom from the right-hand-side.
78 * Diagonals indicate identical atoms on both sides, the challenge is to use as
79 * many diagonals as possible.
81 * The original Myers algorithm walks all the way from the top left to the
82 * bottom right, remembers all steps, and then backtraces to find the shortest
83 * path. However, that requires keeping the entire graph in memory, which needs
86 * Myers adds a variant that uses linear space -- note, not linear time, only
87 * linear space: walk forward and backward, find a meeting point in the middle,
88 * and recurse on the two separate sections. This is called "divide and
91 * d: the step number, starting with 0, a.k.a. the distance from the starting
93 * k: relative index in the state array for the forward scan, indicating on
94 * which diagonal through the diff graph we currently are.
95 * c: relative index in the state array for the backward scan, indicating the
96 * diagonal number from the bottom up.
98 * The "divide and conquer" traversal through the Myers graph looks like this:
101 * ----+--------------------------------------------
109 * 1 | 1,0 4,3 >= 4,3 5,4<-- 0
111 * 0 | -->0,0 3,3 4,4 -1
113 * -1 | 0,1 1,2 3,4 -2
118 * | forward-> <-backward
120 * x,y pairs here are the coordinates in the Myers graph:
121 * x = atom index in left-side source, y = atom index in the right-side source.
123 * Only one forward column and one backward column are kept in mem, each need at
124 * most left.len + 1 + right.len items. Note that each d step occupies either
125 * the even or the odd items of a column: if e.g. the previous column is in the
126 * odd items, the next column is formed in the even items, without overwriting
127 * the previous column's results.
129 * Also note that from the diagonal index k and the x coordinate, the y
130 * coordinate can be derived:
132 * Hence the state array only needs to keep the x coordinate, i.e. the position
133 * in the left-hand file, and the y coordinate, i.e. position in the right-hand
134 * file, is derived from the index in the state array.
136 * The two traces meet at 4,3, the first step (here found in the forward
137 * traversal) where a forward position is on or past a backward traced position
138 * on the same diagonal.
140 * This divides the problem space into:
150 * 3 o-o-o-o-*-o *: forward and backward meet here
154 * Doing the same on each section lead to:
160 * 1 o-b b: backward d=1 first reaches here (sliding up the snake)
161 * B \ f: then forward d=2 reaches here (sliding down the snake)
162 * 2 o As result, the box from b to f is found to be identical;
163 * C \ leaving a top box from 0,0 to 1,1 and a bottom trivial
164 * 3 f-o tail 3,3 to 4,3.
168 * 4 o *: forward and backward meet here
170 * and solving the last top left box gives:
186 #define xk_to_y(X, K) ((X) - (K))
187 #define xc_to_y(X, C, DELTA) ((X) - (C) + (DELTA))
188 #define k_to_c(K, DELTA) ((K) + (DELTA))
189 #define c_to_k(C, DELTA) ((C) - (DELTA))
191 /* Do one forwards step in the "divide and conquer" graph traversal.
192 * left: the left side to diff.
193 * right: the right side to diff against.
194 * kd_forward: the traversal state for forwards traversal, modified by this
196 * This is carried over between invocations with increasing d.
197 * kd_forward points at the center of the state array, allowing
199 * kd_backward: the traversal state for backwards traversal, to find a meeting
201 * Since forwards is done first, kd_backward will be valid for d -
203 * kd_backward points at the center of the state array, allowing
205 * d: Step or distance counter, indicating for what value of d the kd_forward
206 * should be populated.
207 * For d == 0, kd_forward[0] is initialized, i.e. the first invocation should
209 * meeting_snake: resulting meeting point, if any.
210 * Return true when a meeting point has been identified.
213 diff_divide_myers_forward(bool *found_midpoint,
214 struct diff_data *left, struct diff_data *right,
215 int *kd_forward, int *kd_backward, int d,
216 struct diff_box *meeting_snake)
218 int delta = (int)right->atoms.len - (int)left->atoms.len;
221 *found_midpoint = false;
223 debug("-- %s d=%d\n", __func__, d);
225 for (k = d; k >= -d; k -= 2) {
226 if (k < -(int)right->atoms.len || k > (int)left->atoms.len) {
227 /* This diagonal is completely outside of the Myers
228 * graph, don't calculate it. */
229 if (k < -(int)right->atoms.len)
230 debug(" %d k < -(int)right->atoms.len %d\n", k,
231 -(int)right->atoms.len);
233 debug(" %d k > left->atoms.len %d\n", k,
236 /* We are traversing negatively, and already
237 * below the entire graph, nothing will come of
245 debug("- k = %d\n", k);
247 /* This is the initializing step. There is no prev_k
248 * yet, get the initial x from the top left of the Myers
252 /* Favoring "-" lines first means favoring moving rightwards in
254 * For this, all k should derive from k - 1, only the bottom
255 * most k derive from k + 1:
258 * ----+----------------
260 * 2 | 2,0 <-- from prev_k = 2 - 1 = 1
266 * -1 | 0,1 <-- bottom most for d=1 from
267 * | \\ prev_k = -1 + 1 = 0
268 * -2 | 0,2 <-- bottom most for d=2 from
269 * prev_k = -2 + 1 = -1
271 * Except when a k + 1 from a previous run already means a
272 * further advancement in the graph.
273 * If k == d, there is no k + 1 and k - 1 is the only option.
274 * If k < d, use k + 1 in case that yields a larger x. Also use
275 * k + 1 if k - 1 is outside the graph.
279 || (k - 1 >= -(int)right->atoms.len
280 && kd_forward[k - 1] >= kd_forward[k + 1]))) {
281 /* Advance from k - 1.
282 * From position prev_k, step to the right in the Myers
286 int prev_x = kd_forward[prev_k];
289 /* The bottom most one.
290 * From position prev_k, step to the bottom in the Myers
292 * Incrementing y is achieved by decrementing k while
293 * keeping the same x.
294 * (since we're deriving y from y = x - k).
297 int prev_x = kd_forward[prev_k];
301 int x_before_slide = x;
302 /* Slide down any snake that we might find here. */
303 while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len) {
305 int r = diff_atom_same(&same,
306 &left->atoms.head[x],
316 if (x_before_slide != x) {
317 debug(" down %d similar lines\n", x - x_before_slide);
322 for (fi = d; fi >= k; fi--) {
323 debug("kd_forward[%d] = (%d, %d)\n", fi,
324 kd_forward[fi], kd_forward[fi] - fi);
328 if (x < 0 || x > left->atoms.len
329 || xk_to_y(x, k) < 0 || xk_to_y(x, k) > right->atoms.len)
332 /* Figured out a new forwards traversal, see if this has gone
333 * onto or even past a preceding backwards traversal.
335 * If the delta in length is odd, then d and backwards_d hit the
336 * same state indexes:
338 * ----+---------------- ----------------
348 * 0 | -->0,0 3,3====4,4 -1
355 * If the delta is even, they end up off-by-one, i.e. on
356 * different diagonals:
359 * ----+---------------- ----------------
367 * 0 | -->0,0 3,3 4,4<-- 0
374 * So in the forward path, we can only match up diagonals when
377 if ((delta & 1) == 0)
379 /* Forwards is done first, so the backwards one was still at
380 * d - 1. Can't do this for d == 0. */
381 int backwards_d = d - 1;
385 debug("backwards_d = %d\n", backwards_d);
387 /* If both sides have the same length, forward and backward
388 * start on the same diagonal, meaning the backwards state index
390 * As soon as the lengths are not the same, the backwards
391 * traversal starts on a different diagonal, and c = k shifted
392 * by the difference in length.
394 int c = k_to_c(k, delta);
396 /* When the file sizes are very different, the traversal trees
397 * start on far distant diagonals.
398 * They don't necessarily meet straight on. See whether this
399 * forward value is on a diagonal that is also valid in
400 * kd_backward[], and match them if so. */
401 if (c >= -backwards_d && c <= backwards_d) {
402 /* Current k is on a diagonal that exists in
403 * kd_backward[]. If the two x positions have met or
404 * passed (forward walked onto or past backward), then
405 * we've found a midpoint / a mid-box.
407 * When forwards and backwards traversals meet, the
408 * endpoints of the mid-snake are not the two points in
409 * kd_forward and kd_backward, but rather the section
410 * that was slid (if any) of the current
411 * forward/backward traversal only.
435 * The forward traversal reached M from the top and slid
436 * downwards to A. The backward traversal already
437 * reached X, which is not a straight line from M
438 * anymore, so picking a mid-snake from M to X would
441 * The correct mid-snake is between M and A. M is where
442 * the forward traversal hit the diagonal that the
443 * backward traversal has already passed, and A is what
444 * it reaches when sliding down identical lines.
446 int backward_x = kd_backward[c];
447 debug("Compare: k=%d c=%d is (%d,%d) >= (%d,%d)?\n",
448 k, c, x, xk_to_y(x, k), backward_x,
449 xc_to_y(backward_x, c, delta));
450 if (x >= backward_x) {
451 *meeting_snake = (struct diff_box){
452 .left_start = x_before_slide,
454 .right_start = xc_to_y(x_before_slide,
456 .right_end = xk_to_y(x, k),
458 debug("HIT x=(%u,%u) - y=(%u,%u)\n",
459 meeting_snake->left_start,
460 meeting_snake->right_start,
461 meeting_snake->left_end,
462 meeting_snake->right_end);
463 debug_dump_myers_graph(left, right, NULL,
466 *found_midpoint = true;
472 debug_dump_myers_graph(left, right, NULL, kd_forward, d,
477 /* Do one backwards step in the "divide and conquer" graph traversal.
478 * left: the left side to diff.
479 * right: the right side to diff against.
480 * kd_forward: the traversal state for forwards traversal, to find a meeting
482 * Since forwards is done first, after this, both kd_forward and
483 * kd_backward will be valid for d.
484 * kd_forward points at the center of the state array, allowing
486 * kd_backward: the traversal state for backwards traversal, to find a meeting
488 * This is carried over between invocations with increasing d.
489 * kd_backward points at the center of the state array, allowing
491 * d: Step or distance counter, indicating for what value of d the kd_backward
492 * should be populated.
493 * Before the first invocation, kd_backward[0] shall point at the bottom
494 * right of the Myers graph (left.len, right.len).
495 * The first invocation will be for d == 1.
496 * meeting_snake: resulting meeting point, if any.
497 * Return true when a meeting point has been identified.
500 diff_divide_myers_backward(bool *found_midpoint,
501 struct diff_data *left, struct diff_data *right,
502 int *kd_forward, int *kd_backward, int d,
503 struct diff_box *meeting_snake)
505 int delta = (int)right->atoms.len - (int)left->atoms.len;
509 *found_midpoint = false;
511 debug("-- %s d=%d\n", __func__, d);
513 for (c = d; c >= -d; c -= 2) {
514 if (c < -(int)left->atoms.len || c > (int)right->atoms.len) {
515 /* This diagonal is completely outside of the Myers
516 * graph, don't calculate it. */
517 if (c < -(int)left->atoms.len)
518 debug(" %d c < -(int)left->atoms.len %d\n", c,
519 -(int)left->atoms.len);
521 debug(" %d c > right->atoms.len %d\n", c,
524 /* We are traversing negatively, and already
525 * below the entire graph, nothing will come of
533 debug("- c = %d\n", c);
535 /* This is the initializing step. There is no prev_c
536 * yet, get the initial x from the bottom right of the
540 /* Favoring "-" lines first means favoring moving rightwards in
542 * For this, all c should derive from c - 1, only the bottom
543 * most c derive from c + 1:
546 * ---------------------------------------------------
550 * from prev_c = c - 1 --> 5,2 2
556 * bottom most for d=1 from c + 1 --> 4,4 -1
558 * bottom most for d=2 --> 3,4 -2
560 * Except when a c + 1 from a previous run already means a
561 * further advancement in the graph.
562 * If c == d, there is no c + 1 and c - 1 is the only option.
563 * If c < d, use c + 1 in case that yields a larger x.
564 * Also use c + 1 if c - 1 is outside the graph.
566 else if (c > -d && (c == d
567 || (c - 1 >= -(int)right->atoms.len
568 && kd_backward[c - 1] <= kd_backward[c + 1]))) {
570 * From position prev_c, step upwards in the Myers
572 * Decrementing y is achieved by incrementing c while
573 * keeping the same x. (since we're deriving y from
574 * y = x - c + delta).
577 int prev_x = kd_backward[prev_c];
580 /* The bottom most one.
581 * From position prev_c, step to the left in the Myers
585 int prev_x = kd_backward[prev_c];
589 /* Slide up any snake that we might find here (sections of
590 * identical lines on both sides). */
591 debug("c=%d x-1=%d Yb-1=%d-1=%d\n", c, x-1, xc_to_y(x, c,
593 xc_to_y(x, c, delta)-1);
596 debug_dump_atom(left, right, &left->atoms.head[x-1]);
598 if (xc_to_y(x, c, delta) > 0) {
600 debug_dump_atom(right, left,
601 &right->atoms.head[xc_to_y(x, c, delta)-1]);
603 int x_before_slide = x;
604 while (x > 0 && xc_to_y(x, c, delta) > 0) {
606 int r = diff_atom_same(&same,
607 &left->atoms.head[x-1],
609 xc_to_y(x, c, delta)-1]);
617 if (x_before_slide != x) {
618 debug(" up %d similar lines\n", x_before_slide - x);
623 for (fi = d; fi >= c; fi--) {
624 debug("kd_backward[%d] = (%d, %d)\n",
627 kd_backward[fi] - fi + delta);
631 if (x < 0 || x > left->atoms.len
632 || xc_to_y(x, c, delta) < 0
633 || xc_to_y(x, c, delta) > right->atoms.len)
636 /* Figured out a new backwards traversal, see if this has gone
637 * onto or even past a preceding forwards traversal.
639 * If the delta in length is even, then d and backwards_d hit
640 * the same state indexes -- note how this is different from in
641 * the forwards traversal, because now both d are the same:
644 * ----+---------------- --------------------
654 * 0 | -->0,0 3,3====4,3 5,4<-- 0
661 * If the delta is odd, they end up off-by-one, i.e. on
662 * different diagonals.
663 * So in the backward path, we can only match up diagonals when
666 if ((delta & 1) != 0)
668 /* Forwards was done first, now both d are the same. */
671 /* As soon as the lengths are not the same, the
672 * backwards traversal starts on a different diagonal,
673 * and c = k shifted by the difference in length.
675 int k = c_to_k(c, delta);
677 /* When the file sizes are very different, the traversal trees
678 * start on far distant diagonals.
679 * They don't necessarily meet straight on. See whether this
680 * backward value is also on a valid diagonal in kd_forward[],
681 * and match them if so. */
682 if (k >= -forwards_d && k <= forwards_d) {
683 /* Current c is on a diagonal that exists in
684 * kd_forward[]. If the two x positions have met or
685 * passed (backward walked onto or past forward), then
686 * we've found a midpoint / a mid-box.
688 * When forwards and backwards traversals meet, the
689 * endpoints of the mid-snake are not the two points in
690 * kd_forward and kd_backward, but rather the section
691 * that was slid (if any) of the current
692 * forward/backward traversal only.
714 * The backward traversal reached M from the bottom and
715 * slid upwards. The forward traversal already reached
716 * X, which is not a straight line from M anymore, so
717 * picking a mid-snake from M to X would yield a
720 * The correct mid-snake is between M and A. M is where
721 * the backward traversal hit the diagonal that the
722 * forwards traversal has already passed, and A is what
723 * it reaches when sliding up identical lines.
726 int forward_x = kd_forward[k];
727 debug("Compare: k=%d c=%d is (%d,%d) >= (%d,%d)?\n",
728 k, c, forward_x, xk_to_y(forward_x, k),
729 x, xc_to_y(x, c, delta));
730 if (forward_x >= x) {
731 *meeting_snake = (struct diff_box){
733 .left_end = x_before_slide,
734 .right_start = xc_to_y(x, c, delta),
735 .right_end = xk_to_y(x_before_slide, k),
737 debug("HIT x=%u,%u - y=%u,%u\n",
738 meeting_snake->left_start,
739 meeting_snake->right_start,
740 meeting_snake->left_end,
741 meeting_snake->right_end);
742 debug_dump_myers_graph(left, right, NULL,
745 *found_midpoint = true;
750 debug_dump_myers_graph(left, right, NULL, kd_forward, d, kd_backward,
755 /* Myers "Divide et Impera": tracing forwards from the start and backwards from
756 * the end to find a midpoint that divides the problem into smaller chunks.
757 * Requires only linear amounts of memory. */
759 diff_algo_myers_divide(const struct diff_algo_config *algo_config,
760 struct diff_state *state)
763 struct diff_data *left = &state->left;
764 struct diff_data *right = &state->right;
766 debug("\n** %s\n", __func__);
771 debug_dump_myers_graph(left, right, NULL, NULL, 0, NULL, 0);
773 /* Allocate two columns of a Myers graph, one for the forward and one
774 * for the backward traversal. */
775 unsigned int max = left->atoms.len + right->atoms.len;
776 size_t kd_len = max + 1;
777 size_t kd_buf_size = kd_len << 1;
778 int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int));
782 for (i = 0; i < kd_buf_size; i++)
784 int *kd_forward = kd_buf;
785 int *kd_backward = kd_buf + kd_len;
787 /* The 'k' axis in Myers spans positive and negative indexes, so point
788 * the kd to the middle.
789 * It is then possible to index from -max/2 .. max/2. */
791 kd_backward += max/2;
794 struct diff_box mid_snake = {};
795 bool found_midpoint = false;
796 for (d = 0; d <= (max/2); d++) {
798 debug("-- d=%d\n", d);
799 r = diff_divide_myers_forward(&found_midpoint, left, right,
800 kd_forward, kd_backward, d,
806 r = diff_divide_myers_backward(&found_midpoint, left, right,
807 kd_forward, kd_backward, d,
815 if (!found_midpoint) {
816 /* Divide and conquer failed to find a meeting point. Use the
817 * fallback_algo defined in the algo_config (leave this to the
818 * caller). This is just paranoia/sanity, we normally should
819 * always find a midpoint.
821 debug(" no midpoint \n");
822 rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;
825 debug(" mid snake L: %u to %u of %u R: %u to %u of %u\n",
826 mid_snake.left_start, mid_snake.left_end, left->atoms.len,
827 mid_snake.right_start, mid_snake.right_end,
830 /* Section before the mid-snake. */
831 debug("Section before the mid-snake\n");
833 struct diff_atom *left_atom = &left->atoms.head[0];
834 unsigned int left_section_len = mid_snake.left_start;
835 struct diff_atom *right_atom = &right->atoms.head[0];
836 unsigned int right_section_len = mid_snake.right_start;
838 if (left_section_len && right_section_len) {
839 /* Record an unsolved chunk, the caller will apply
840 * inner_algo() on this chunk. */
841 if (!diff_state_add_chunk(state, false,
842 left_atom, left_section_len,
846 } else if (left_section_len && !right_section_len) {
847 /* Only left atoms and none on the right, they form a
848 * "minus" chunk, then. */
849 if (!diff_state_add_chunk(state, true,
850 left_atom, left_section_len,
853 } else if (!left_section_len && right_section_len) {
854 /* No left atoms, only atoms on the right, they form a
855 * "plus" chunk, then. */
856 if (!diff_state_add_chunk(state, true,
862 /* else: left_section_len == 0 and right_section_len == 0, i.e.
863 * nothing before the mid-snake. */
865 if (!diff_box_empty(&mid_snake)) {
866 /* The midpoint is a "snake", i.e. on a section of
867 * identical data on both sides: that section
868 * immediately becomes a solved diff chunk. */
869 debug("the mid-snake\n");
870 if (!diff_state_add_chunk(state, true,
871 &left->atoms.head[mid_snake.left_start],
872 mid_snake.left_end - mid_snake.left_start,
873 &right->atoms.head[mid_snake.right_start],
874 mid_snake.right_end - mid_snake.right_start))
878 /* Section after the mid-snake. */
879 debug("Section after the mid-snake\n");
880 debug(" left_end %u right_end %u\n",
881 mid_snake.left_end, mid_snake.right_end);
882 debug(" left_count %u right_count %u\n",
883 left->atoms.len, right->atoms.len);
884 left_atom = &left->atoms.head[mid_snake.left_end];
885 left_section_len = left->atoms.len - mid_snake.left_end;
886 right_atom = &right->atoms.head[mid_snake.right_end];
887 right_section_len = right->atoms.len - mid_snake.right_end;
889 if (left_section_len && right_section_len) {
890 /* Record an unsolved chunk, the caller will apply
891 * inner_algo() on this chunk. */
892 if (!diff_state_add_chunk(state, false,
893 left_atom, left_section_len,
897 } else if (left_section_len && !right_section_len) {
898 /* Only left atoms and none on the right, they form a
899 * "minus" chunk, then. */
900 if (!diff_state_add_chunk(state, true,
901 left_atom, left_section_len,
904 } else if (!left_section_len && right_section_len) {
905 /* No left atoms, only atoms on the right, they form a
906 * "plus" chunk, then. */
907 if (!diff_state_add_chunk(state, true,
913 /* else: left_section_len == 0 and right_section_len == 0, i.e.
914 * nothing after the mid-snake. */
921 debug("** END %s\n", __func__);
925 /* Myers Diff tracing from the start all the way through to the end, requiring
926 * quadratic amounts of memory. This can fail if the required space surpasses
927 * algo_config->permitted_state_size. */
929 diff_algo_myers(const struct diff_algo_config *algo_config,
930 struct diff_state *state)
932 /* do a diff_divide_myers_forward() without a _backward(), so that it
933 * walks forward across the entire files to reach the end. Keep each
934 * run's state, and do a final backtrace. */
936 struct diff_data *left = &state->left;
937 struct diff_data *right = &state->right;
939 debug("\n** %s\n", __func__);
944 debug_dump_myers_graph(left, right, NULL, NULL, 0, NULL, 0);
946 /* Allocate two columns of a Myers graph, one for the forward and one
947 * for the backward traversal. */
948 unsigned int max = left->atoms.len + right->atoms.len;
949 size_t kd_len = max + 1 + max;
950 size_t kd_buf_size = kd_len * kd_len;
951 size_t kd_state_size = kd_buf_size * sizeof(int);
952 debug("state size: %zu\n", kd_buf_size);
953 if (kd_buf_size < kd_len /* overflow? */
954 || kd_state_size > algo_config->permitted_state_size) {
955 debug("state size %zu > permitted_state_size %zu, use fallback_algo\n",
956 kd_state_size, algo_config->permitted_state_size);
957 return DIFF_RC_USE_DIFF_ALGO_FALLBACK;
960 int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int));
964 for (i = 0; i < kd_buf_size; i++)
967 /* The 'k' axis in Myers spans positive and negative indexes, so point
968 * the kd to the middle.
969 * It is then possible to index from -max .. max. */
970 int *kd_origin = kd_buf + max;
971 int *kd_column = kd_origin;
974 int backtrack_d = -1;
978 for (d = 0; d <= max; d++, kd_column += kd_len) {
979 debug("-- d=%d\n", d);
981 debug("-- %s d=%d\n", __func__, d);
983 for (k = d; k >= -d; k -= 2) {
984 if (k < -(int)right->atoms.len
985 || k > (int)left->atoms.len) {
986 /* This diagonal is completely outside of the
987 * Myers graph, don't calculate it. */
988 if (k < -(int)right->atoms.len)
990 " -(int)right->atoms.len %d\n",
991 k, -(int)right->atoms.len);
993 debug(" %d k > left->atoms.len %d\n", k,
996 /* We are traversing negatively, and
997 * already below the entire graph,
998 * nothing will come of this. */
1006 debug("- k = %d\n", k);
1008 /* This is the initializing step. There is no
1009 * prev_k yet, get the initial x from the top
1010 * left of the Myers graph. */
1013 int *kd_prev_column = kd_column - kd_len;
1015 /* Favoring "-" lines first means favoring
1016 * moving rightwards in the Myers graph.
1017 * For this, all k should derive from k - 1,
1018 * only the bottom most k derive from k + 1:
1021 * ----+----------------
1024 * | / prev_k = 2 - 1 = 1
1029 * -1 | 0,1 <-- bottom most for d=1
1030 * | \\ from prev_k = -1+1 = 0
1031 * -2 | 0,2 <-- bottom most for
1033 * prev_k = -2+1 = -1
1035 * Except when a k + 1 from a previous run
1036 * already means a further advancement in the
1038 * If k == d, there is no k + 1 and k - 1 is the
1040 * If k < d, use k + 1 in case that yields a
1041 * larger x. Also use k + 1 if k - 1 is outside
1046 || (k - 1 >= -(int)right->atoms.len
1047 && kd_prev_column[k - 1]
1048 >= kd_prev_column[k + 1]))) {
1049 /* Advance from k - 1.
1050 * From position prev_k, step to the
1051 * right in the Myers graph: x += 1.
1054 int prev_x = kd_prev_column[prev_k];
1057 /* The bottom most one.
1058 * From position prev_k, step to the
1059 * bottom in the Myers graph: y += 1.
1060 * Incrementing y is achieved by
1061 * decrementing k while keeping the same
1062 * x. (since we're deriving y from y =
1066 int prev_x = kd_prev_column[prev_k];
1071 /* Slide down any snake that we might find here. */
1072 while (x < left->atoms.len
1073 && xk_to_y(x, k) < right->atoms.len) {
1075 int r = diff_atom_same(&same,
1076 &left->atoms.head[x],
1089 for (fi = d; fi >= k; fi-=2) {
1090 debug("kd_column[%d] = (%d, %d)\n", fi,
1092 kd_column[fi] - fi);
1096 if (x == left->atoms.len
1097 && xk_to_y(x, k) == right->atoms.len) {
1101 debug("Reached the end at d = %d, k = %d\n",
1102 backtrack_d, backtrack_k);
1107 if (backtrack_d >= 0)
1111 debug_dump_myers_graph(left, right, kd_origin, NULL, 0, NULL, 0);
1113 /* backtrack. A matrix spanning from start to end of the file is ready:
1116 * ----+---------------------------------
1124 * 0 | -->0,0 3,3 4,4 --> backtrack_d = 4, backtrack_k = 0
1131 * From (4,4) backwards, find the previous position that is the largest, and remember it.
1134 for (d = backtrack_d, k = backtrack_k; d >= 0; d--) {
1138 /* When the best position is identified, remember it for that
1140 * That kd_column is no longer needed otherwise, so just
1141 * re-purpose kd_column[0] = x and kd_column[1] = y,
1142 * so that there is no need to allocate more memory.
1146 debug("Backtrack d=%d: xy=(%d, %d)\n",
1147 d, kd_column[0], kd_column[1]);
1149 /* Don't access memory before kd_buf */
1152 int *kd_prev_column = kd_column - kd_len;
1154 /* When y == 0, backtracking downwards (k-1) is the only way.
1155 * When x == 0, backtracking upwards (k+1) is the only way.
1158 * ----+---------------------------------
1166 * 0 | -->0,0 3,3 4,4 --> backtrack_d = 4,
1167 * | \ / \ backtrack_k = 0
1173 debug("prev[k-1] = %d,%d prev[k+1] = %d,%d\n",
1174 kd_prev_column[k-1], xk_to_y(kd_prev_column[k-1],k-1),
1175 kd_prev_column[k+1], xk_to_y(kd_prev_column[k+1],k+1));
1178 && kd_prev_column[k - 1] >= kd_prev_column[k + 1])) {
1180 debug("prev k=k-1=%d x=%d y=%d\n",
1181 k, kd_prev_column[k],
1182 xk_to_y(kd_prev_column[k], k));
1185 debug("prev k=k+1=%d x=%d y=%d\n",
1186 k, kd_prev_column[k],
1187 xk_to_y(kd_prev_column[k], k));
1189 kd_column = kd_prev_column;
1192 /* Forwards again, this time recording the diff chunks.
1193 * Definitely start from 0,0. kd_column[0] may actually point to the
1194 * bottom of a snake starting at 0,0 */
1198 kd_column = kd_origin;
1199 for (d = 0; d <= backtrack_d; d++, kd_column += kd_len) {
1200 int next_x = kd_column[0];
1201 int next_y = kd_column[1];
1202 debug("Forward track from xy(%d,%d) to xy(%d,%d)\n",
1203 x, y, next_x, next_y);
1205 struct diff_atom *left_atom = &left->atoms.head[x];
1206 int left_section_len = next_x - x;
1207 struct diff_atom *right_atom = &right->atoms.head[y];
1208 int right_section_len = next_y - y;
1211 if (left_section_len && right_section_len) {
1212 /* This must be a snake slide.
1213 * Snake slides have a straight line leading into them
1214 * (except when starting at (0,0)). Find out whether the
1215 * lead-in is horizontal or vertical:
1227 * If left_section_len > right_section_len, the lead-in
1228 * is horizontal, meaning first remove one atom from the
1229 * left before sliding down the snake.
1230 * If right_section_len > left_section_len, the lead-in
1231 * is vetical, so add one atom from the right before
1232 * sliding down the snake. */
1233 if (left_section_len == right_section_len + 1) {
1234 if (!diff_state_add_chunk(state, true,
1240 } else if (right_section_len == left_section_len + 1) {
1241 if (!diff_state_add_chunk(state, true,
1246 right_section_len--;
1247 } else if (left_section_len != right_section_len) {
1248 /* The numbers are making no sense. Should never
1250 rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;
1254 if (!diff_state_add_chunk(state, true,
1255 left_atom, left_section_len,
1259 } else if (left_section_len && !right_section_len) {
1260 /* Only left atoms and none on the right, they form a
1261 * "minus" chunk, then. */
1262 if (!diff_state_add_chunk(state, true,
1263 left_atom, left_section_len,
1266 } else if (!left_section_len && right_section_len) {
1267 /* No left atoms, only atoms on the right, they form a
1268 * "plus" chunk, then. */
1269 if (!diff_state_add_chunk(state, true,
1284 debug("** END %s rc=%d\n", __func__, rc);