Blob


1 /* Myers diff algorithm implementation, invented by Eugene W. Myers [1].
2 * Implementations of both the Myers Divide Et Impera (using linear space)
3 * and the canonical Myers algorithm (using quadratic space). */
4 /*
5 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 */
20 #include <inttypes.h>
21 #include <stdbool.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <errno.h>
26 #include <diff/arraylist.h>
27 #include <diff/diff_main.h>
29 #include "debug.h"
31 /* Myers' diff algorithm [1] is nicely explained in [2].
32 * [1] http://www.xmailserver.org/diff2.pdf
33 * [2] https://blog.jcoglan.com/2017/02/12/the-myers-diff-algorithm-part-1/ ff.
34 *
35 * Myers approaches finding the smallest diff as a graph problem.
36 * The crux is that the original algorithm requires quadratic amount of memory:
37 * both sides' lengths added, and that squared. So if we're diffing lines of
38 * text, two files with 1000 lines each would blow up to a matrix of about
39 * 2000 * 2000 ints of state, about 16 Mb of RAM to figure out 2 kb of text.
40 * The solution is using Myers' "divide and conquer" extension algorithm, which
41 * does the original traversal from both ends of the files to reach a middle
42 * where these "snakes" touch, hence does not need to backtrace the traversal,
43 * and so gets away with only keeping a single column of that huge state matrix
44 * in memory.
45 */
47 struct diff_box {
48 unsigned int left_start;
49 unsigned int left_end;
50 unsigned int right_start;
51 unsigned int right_end;
52 };
54 #define diff_box_empty(DIFF_SNAKE) ((DIFF_SNAKE)->left_end == 0)
57 /* If the two contents of a file are A B C D E and X B C Y,
58 * the Myers diff graph looks like:
59 *
60 * k0 k1
61 * \ \
62 * k-1 0 1 2 3 4 5
63 * \ A B C D E
64 * 0 o-o-o-o-o-o
65 * X | | | | | |
66 * 1 o-o-o-o-o-o
67 * B | |\| | | |
68 * 2 o-o-o-o-o-o
69 * C | | |\| | |
70 * 3 o-o-o-o-o-o
71 * Y | | | | | |\
72 * 4 o-o-o-o-o-o c1
73 * \ \
74 * c-1 c0
75 *
76 * Moving right means delete an atom from the left-hand-side,
77 * Moving down means add an atom from the right-hand-side.
78 * Diagonals indicate identical atoms on both sides, the challenge is to use as
79 * many diagonals as possible.
80 *
81 * The original Myers algorithm walks all the way from the top left to the
82 * bottom right, remembers all steps, and then backtraces to find the shortest
83 * path. However, that requires keeping the entire graph in memory, which needs
84 * quadratic space.
85 *
86 * Myers adds a variant that uses linear space -- note, not linear time, only
87 * linear space: walk forward and backward, find a meeting point in the middle,
88 * and recurse on the two separate sections. This is called "divide and
89 * conquer".
90 *
91 * d: the step number, starting with 0, a.k.a. the distance from the starting
92 * point.
93 * k: relative index in the state array for the forward scan, indicating on
94 * which diagonal through the diff graph we currently are.
95 * c: relative index in the state array for the backward scan, indicating the
96 * diagonal number from the bottom up.
97 *
98 * The "divide and conquer" traversal through the Myers graph looks like this:
99 *
100 * | d= 0 1 2 3 2 1 0
101 * ----+--------------------------------------------
102 * k= | c=
103 * 4 | 3
104 * |
105 * 3 | 3,0 5,2 2
106 * | / \
107 * 2 | 2,0 5,3 1
108 * | / \
109 * 1 | 1,0 4,3 >= 4,3 5,4<-- 0
110 * | / / \ /
111 * 0 | -->0,0 3,3 4,4 -1
112 * | \ / /
113 * -1 | 0,1 1,2 3,4 -2
114 * | \ /
115 * -2 | 0,2 -3
116 * | \
117 * | 0,3
118 * | forward-> <-backward
120 * x,y pairs here are the coordinates in the Myers graph:
121 * x = atom index in left-side source, y = atom index in the right-side source.
123 * Only one forward column and one backward column are kept in mem, each need at
124 * most left.len + 1 + right.len items. Note that each d step occupies either
125 * the even or the odd items of a column: if e.g. the previous column is in the
126 * odd items, the next column is formed in the even items, without overwriting
127 * the previous column's results.
129 * Also note that from the diagonal index k and the x coordinate, the y
130 * coordinate can be derived:
131 * y = x - k
132 * Hence the state array only needs to keep the x coordinate, i.e. the position
133 * in the left-hand file, and the y coordinate, i.e. position in the right-hand
134 * file, is derived from the index in the state array.
136 * The two traces meet at 4,3, the first step (here found in the forward
137 * traversal) where a forward position is on or past a backward traced position
138 * on the same diagonal.
140 * This divides the problem space into:
142 * 0 1 2 3 4 5
143 * A B C D E
144 * 0 o-o-o-o-o
145 * X | | | | |
146 * 1 o-o-o-o-o
147 * B | |\| | |
148 * 2 o-o-o-o-o
149 * C | | |\| |
150 * 3 o-o-o-o-*-o *: forward and backward meet here
151 * Y | |
152 * 4 o-o
154 * Doing the same on each section lead to:
156 * 0 1 2 3 4 5
157 * A B C D E
158 * 0 o-o
159 * X | |
160 * 1 o-b b: backward d=1 first reaches here (sliding up the snake)
161 * B \ f: then forward d=2 reaches here (sliding down the snake)
162 * 2 o As result, the box from b to f is found to be identical;
163 * C \ leaving a top box from 0,0 to 1,1 and a bottom trivial
164 * 3 f-o tail 3,3 to 4,3.
166 * 3 o-*
167 * Y |
168 * 4 o *: forward and backward meet here
170 * and solving the last top left box gives:
172 * 0 1 2 3 4 5
173 * A B C D E -A
174 * 0 o-o +X
175 * X | B
176 * 1 o C
177 * B \ -D
178 * 2 o -E
179 * C \ +Y
180 * 3 o-o-o
181 * Y |
182 * 4 o
184 */
186 #define xk_to_y(X, K) ((X) - (K))
187 #define xc_to_y(X, C, DELTA) ((X) - (C) + (DELTA))
188 #define k_to_c(K, DELTA) ((K) + (DELTA))
189 #define c_to_k(C, DELTA) ((C) - (DELTA))
191 /* Do one forwards step in the "divide and conquer" graph traversal.
192 * left: the left side to diff.
193 * right: the right side to diff against.
194 * kd_forward: the traversal state for forwards traversal, modified by this
195 * function.
196 * This is carried over between invocations with increasing d.
197 * kd_forward points at the center of the state array, allowing
198 * negative indexes.
199 * kd_backward: the traversal state for backwards traversal, to find a meeting
200 * point.
201 * Since forwards is done first, kd_backward will be valid for d -
202 * 1, not d.
203 * kd_backward points at the center of the state array, allowing
204 * negative indexes.
205 * d: Step or distance counter, indicating for what value of d the kd_forward
206 * should be populated.
207 * For d == 0, kd_forward[0] is initialized, i.e. the first invocation should
208 * be for d == 0.
209 * meeting_snake: resulting meeting point, if any.
210 * Return true when a meeting point has been identified.
211 */
212 static int
213 diff_divide_myers_forward(bool *found_midpoint,
214 struct diff_data *left, struct diff_data *right,
215 int *kd_forward, int *kd_backward, int d,
216 struct diff_box *meeting_snake)
218 int delta = (int)right->atoms.len - (int)left->atoms.len;
219 int k;
220 int x;
221 *found_midpoint = false;
223 debug("-- %s d=%d\n", __func__, d);
225 for (k = d; k >= -d; k -= 2) {
226 if (k < -(int)right->atoms.len || k > (int)left->atoms.len) {
227 /* This diagonal is completely outside of the Myers
228 * graph, don't calculate it. */
229 if (k < -(int)right->atoms.len)
230 debug(" %d k < -(int)right->atoms.len %d\n", k,
231 -(int)right->atoms.len);
232 else
233 debug(" %d k > left->atoms.len %d\n", k,
234 left->atoms.len);
235 if (k < 0) {
236 /* We are traversing negatively, and already
237 * below the entire graph, nothing will come of
238 * this. */
239 debug(" break");
240 break;
242 debug(" continue");
243 continue;
245 debug("- k = %d\n", k);
246 if (d == 0) {
247 /* This is the initializing step. There is no prev_k
248 * yet, get the initial x from the top left of the Myers
249 * graph. */
250 x = 0;
252 /* Favoring "-" lines first means favoring moving rightwards in
253 * the Myers graph.
254 * For this, all k should derive from k - 1, only the bottom
255 * most k derive from k + 1:
257 * | d= 0 1 2
258 * ----+----------------
259 * k= |
260 * 2 | 2,0 <-- from prev_k = 2 - 1 = 1
261 * | /
262 * 1 | 1,0
263 * | /
264 * 0 | -->0,0 3,3
265 * | \\ /
266 * -1 | 0,1 <-- bottom most for d=1 from
267 * | \\ prev_k = -1 + 1 = 0
268 * -2 | 0,2 <-- bottom most for d=2 from
269 * prev_k = -2 + 1 = -1
271 * Except when a k + 1 from a previous run already means a
272 * further advancement in the graph.
273 * If k == d, there is no k + 1 and k - 1 is the only option.
274 * If k < d, use k + 1 in case that yields a larger x. Also use
275 * k + 1 if k - 1 is outside the graph.
276 */
277 else if (k > -d
278 && (k == d
279 || (k - 1 >= -(int)right->atoms.len
280 && kd_forward[k - 1] >= kd_forward[k + 1]))) {
281 /* Advance from k - 1.
282 * From position prev_k, step to the right in the Myers
283 * graph: x += 1.
284 */
285 int prev_k = k - 1;
286 int prev_x = kd_forward[prev_k];
287 x = prev_x + 1;
288 } else {
289 /* The bottom most one.
290 * From position prev_k, step to the bottom in the Myers
291 * graph: y += 1.
292 * Incrementing y is achieved by decrementing k while
293 * keeping the same x.
294 * (since we're deriving y from y = x - k).
295 */
296 int prev_k = k + 1;
297 int prev_x = kd_forward[prev_k];
298 x = prev_x;
301 int x_before_slide = x;
302 /* Slide down any snake that we might find here. */
303 while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len) {
304 bool same;
305 int r = diff_atom_same(&same,
306 &left->atoms.head[x],
307 &right->atoms.head[
308 xk_to_y(x, k)]);
309 if (r)
310 return r;
311 if (!same)
312 break;
313 x++;
315 kd_forward[k] = x;
316 if (x_before_slide != x) {
317 debug(" down %d similar lines\n", x - x_before_slide);
320 if (DEBUG) {
321 int fi;
322 for (fi = d; fi >= k; fi--) {
323 debug("kd_forward[%d] = (%d, %d)\n", fi,
324 kd_forward[fi], kd_forward[fi] - fi);
328 if (x < 0 || x > left->atoms.len
329 || xk_to_y(x, k) < 0 || xk_to_y(x, k) > right->atoms.len)
330 continue;
332 /* Figured out a new forwards traversal, see if this has gone
333 * onto or even past a preceding backwards traversal.
335 * If the delta in length is odd, then d and backwards_d hit the
336 * same state indexes:
337 * | d= 0 1 2 1 0
338 * ----+---------------- ----------------
339 * k= | c=
340 * 4 | 3
341 * |
342 * 3 | 2
343 * | same
344 * 2 | 2,0====5,3 1
345 * | / \
346 * 1 | 1,0 5,4<-- 0
347 * | / /
348 * 0 | -->0,0 3,3====4,4 -1
349 * | \ /
350 * -1 | 0,1 -2
351 * | \
352 * -2 | 0,2 -3
353 * |
355 * If the delta is even, they end up off-by-one, i.e. on
356 * different diagonals:
358 * | d= 0 1 2 1 0
359 * ----+---------------- ----------------
360 * | c=
361 * 3 | 3
362 * |
363 * 2 | 2,0 off 2
364 * | / \\
365 * 1 | 1,0 4,3 1
366 * | / // \
367 * 0 | -->0,0 3,3 4,4<-- 0
368 * | \ / /
369 * -1 | 0,1 3,4 -1
370 * | \ //
371 * -2 | 0,2 -2
372 * |
374 * So in the forward path, we can only match up diagonals when
375 * the delta is odd.
376 */
377 if ((delta & 1) == 0)
378 continue;
379 /* Forwards is done first, so the backwards one was still at
380 * d - 1. Can't do this for d == 0. */
381 int backwards_d = d - 1;
382 if (backwards_d < 0)
383 continue;
385 debug("backwards_d = %d\n", backwards_d);
387 /* If both sides have the same length, forward and backward
388 * start on the same diagonal, meaning the backwards state index
389 * c == k.
390 * As soon as the lengths are not the same, the backwards
391 * traversal starts on a different diagonal, and c = k shifted
392 * by the difference in length.
393 */
394 int c = k_to_c(k, delta);
396 /* When the file sizes are very different, the traversal trees
397 * start on far distant diagonals.
398 * They don't necessarily meet straight on. See whether this
399 * forward value is on a diagonal that is also valid in
400 * kd_backward[], and match them if so. */
401 if (c >= -backwards_d && c <= backwards_d) {
402 /* Current k is on a diagonal that exists in
403 * kd_backward[]. If the two x positions have met or
404 * passed (forward walked onto or past backward), then
405 * we've found a midpoint / a mid-box.
407 * When forwards and backwards traversals meet, the
408 * endpoints of the mid-snake are not the two points in
409 * kd_forward and kd_backward, but rather the section
410 * that was slid (if any) of the current
411 * forward/backward traversal only.
413 * For example:
415 * o
416 * \
417 * o
418 * \
419 * o
420 * \
421 * o
422 * \
423 * X o o
424 * | | |
425 * o-o-o o
426 * \|
427 * M
428 * \
429 * o
430 * \
431 * A o
432 * | |
433 * o-o-o
435 * The forward traversal reached M from the top and slid
436 * downwards to A. The backward traversal already
437 * reached X, which is not a straight line from M
438 * anymore, so picking a mid-snake from M to X would
439 * yield a mistake.
441 * The correct mid-snake is between M and A. M is where
442 * the forward traversal hit the diagonal that the
443 * backward traversal has already passed, and A is what
444 * it reaches when sliding down identical lines.
445 */
446 int backward_x = kd_backward[c];
447 debug("Compare: k=%d c=%d is (%d,%d) >= (%d,%d)?\n",
448 k, c, x, xk_to_y(x, k), backward_x,
449 xc_to_y(backward_x, c, delta));
450 if (x >= backward_x) {
451 *meeting_snake = (struct diff_box){
452 .left_start = x_before_slide,
453 .left_end = x,
454 .right_start = xc_to_y(x_before_slide,
455 c, delta),
456 .right_end = xk_to_y(x, k),
457 };
458 debug("HIT x=(%u,%u) - y=(%u,%u)\n",
459 meeting_snake->left_start,
460 meeting_snake->right_start,
461 meeting_snake->left_end,
462 meeting_snake->right_end);
463 debug_dump_myers_graph(left, right, NULL,
464 kd_forward, d,
465 kd_backward, d-1);
466 *found_midpoint = true;
467 return 0;
472 debug_dump_myers_graph(left, right, NULL, kd_forward, d,
473 kd_backward, d-1);
474 return 0;
477 /* Do one backwards step in the "divide and conquer" graph traversal.
478 * left: the left side to diff.
479 * right: the right side to diff against.
480 * kd_forward: the traversal state for forwards traversal, to find a meeting
481 * point.
482 * Since forwards is done first, after this, both kd_forward and
483 * kd_backward will be valid for d.
484 * kd_forward points at the center of the state array, allowing
485 * negative indexes.
486 * kd_backward: the traversal state for backwards traversal, to find a meeting
487 * point.
488 * This is carried over between invocations with increasing d.
489 * kd_backward points at the center of the state array, allowing
490 * negative indexes.
491 * d: Step or distance counter, indicating for what value of d the kd_backward
492 * should be populated.
493 * Before the first invocation, kd_backward[0] shall point at the bottom
494 * right of the Myers graph (left.len, right.len).
495 * The first invocation will be for d == 1.
496 * meeting_snake: resulting meeting point, if any.
497 * Return true when a meeting point has been identified.
498 */
499 static int
500 diff_divide_myers_backward(bool *found_midpoint,
501 struct diff_data *left, struct diff_data *right,
502 int *kd_forward, int *kd_backward, int d,
503 struct diff_box *meeting_snake)
505 int delta = (int)right->atoms.len - (int)left->atoms.len;
506 int c;
507 int x;
509 *found_midpoint = false;
511 debug("-- %s d=%d\n", __func__, d);
513 for (c = d; c >= -d; c -= 2) {
514 if (c < -(int)left->atoms.len || c > (int)right->atoms.len) {
515 /* This diagonal is completely outside of the Myers
516 * graph, don't calculate it. */
517 if (c < -(int)left->atoms.len)
518 debug(" %d c < -(int)left->atoms.len %d\n", c,
519 -(int)left->atoms.len);
520 else
521 debug(" %d c > right->atoms.len %d\n", c,
522 right->atoms.len);
523 if (c < 0) {
524 /* We are traversing negatively, and already
525 * below the entire graph, nothing will come of
526 * this. */
527 debug(" break");
528 break;
530 debug(" continue");
531 continue;
533 debug("- c = %d\n", c);
534 if (d == 0) {
535 /* This is the initializing step. There is no prev_c
536 * yet, get the initial x from the bottom right of the
537 * Myers graph. */
538 x = left->atoms.len;
540 /* Favoring "-" lines first means favoring moving rightwards in
541 * the Myers graph.
542 * For this, all c should derive from c - 1, only the bottom
543 * most c derive from c + 1:
545 * 2 1 0
546 * ---------------------------------------------------
547 * c=
548 * 3
550 * from prev_c = c - 1 --> 5,2 2
551 * \
552 * 5,3 1
553 * \
554 * 4,3 5,4<-- 0
555 * \ /
556 * bottom most for d=1 from c + 1 --> 4,4 -1
557 * /
558 * bottom most for d=2 --> 3,4 -2
560 * Except when a c + 1 from a previous run already means a
561 * further advancement in the graph.
562 * If c == d, there is no c + 1 and c - 1 is the only option.
563 * If c < d, use c + 1 in case that yields a larger x.
564 * Also use c + 1 if c - 1 is outside the graph.
565 */
566 else if (c > -d && (c == d
567 || (c - 1 >= -(int)right->atoms.len
568 && kd_backward[c - 1] <= kd_backward[c + 1]))) {
569 /* A top one.
570 * From position prev_c, step upwards in the Myers
571 * graph: y -= 1.
572 * Decrementing y is achieved by incrementing c while
573 * keeping the same x. (since we're deriving y from
574 * y = x - c + delta).
575 */
576 int prev_c = c - 1;
577 int prev_x = kd_backward[prev_c];
578 x = prev_x;
579 } else {
580 /* The bottom most one.
581 * From position prev_c, step to the left in the Myers
582 * graph: x -= 1.
583 */
584 int prev_c = c + 1;
585 int prev_x = kd_backward[prev_c];
586 x = prev_x - 1;
589 /* Slide up any snake that we might find here (sections of
590 * identical lines on both sides). */
591 debug("c=%d x-1=%d Yb-1=%d-1=%d\n", c, x-1, xc_to_y(x, c,
592 delta),
593 xc_to_y(x, c, delta)-1);
594 if (x > 0) {
595 debug(" l=");
596 debug_dump_atom(left, right, &left->atoms.head[x-1]);
598 if (xc_to_y(x, c, delta) > 0) {
599 debug(" r=");
600 debug_dump_atom(right, left,
601 &right->atoms.head[xc_to_y(x, c, delta)-1]);
603 int x_before_slide = x;
604 while (x > 0 && xc_to_y(x, c, delta) > 0) {
605 bool same;
606 int r = diff_atom_same(&same,
607 &left->atoms.head[x-1],
608 &right->atoms.head[
609 xc_to_y(x, c, delta)-1]);
610 if (r)
611 return r;
612 if (!same)
613 break;
614 x--;
616 kd_backward[c] = x;
617 if (x_before_slide != x) {
618 debug(" up %d similar lines\n", x_before_slide - x);
621 if (DEBUG) {
622 int fi;
623 for (fi = d; fi >= c; fi--) {
624 debug("kd_backward[%d] = (%d, %d)\n",
625 fi,
626 kd_backward[fi],
627 kd_backward[fi] - fi + delta);
631 if (x < 0 || x > left->atoms.len
632 || xc_to_y(x, c, delta) < 0
633 || xc_to_y(x, c, delta) > right->atoms.len)
634 continue;
636 /* Figured out a new backwards traversal, see if this has gone
637 * onto or even past a preceding forwards traversal.
639 * If the delta in length is even, then d and backwards_d hit
640 * the same state indexes -- note how this is different from in
641 * the forwards traversal, because now both d are the same:
643 * | d= 0 1 2 2 1 0
644 * ----+---------------- --------------------
645 * k= | c=
646 * 4 |
647 * |
648 * 3 | 3
649 * | same
650 * 2 | 2,0====5,2 2
651 * | / \
652 * 1 | 1,0 5,3 1
653 * | / / \
654 * 0 | -->0,0 3,3====4,3 5,4<-- 0
655 * | \ / /
656 * -1 | 0,1 4,4 -1
657 * | \
658 * -2 | 0,2 -2
659 * |
660 * -3
661 * If the delta is odd, they end up off-by-one, i.e. on
662 * different diagonals.
663 * So in the backward path, we can only match up diagonals when
664 * the delta is even.
665 */
666 if ((delta & 1) != 0)
667 continue;
668 /* Forwards was done first, now both d are the same. */
669 int forwards_d = d;
671 /* As soon as the lengths are not the same, the
672 * backwards traversal starts on a different diagonal,
673 * and c = k shifted by the difference in length.
674 */
675 int k = c_to_k(c, delta);
677 /* When the file sizes are very different, the traversal trees
678 * start on far distant diagonals.
679 * They don't necessarily meet straight on. See whether this
680 * backward value is also on a valid diagonal in kd_forward[],
681 * and match them if so. */
682 if (k >= -forwards_d && k <= forwards_d) {
683 /* Current c is on a diagonal that exists in
684 * kd_forward[]. If the two x positions have met or
685 * passed (backward walked onto or past forward), then
686 * we've found a midpoint / a mid-box.
688 * When forwards and backwards traversals meet, the
689 * endpoints of the mid-snake are not the two points in
690 * kd_forward and kd_backward, but rather the section
691 * that was slid (if any) of the current
692 * forward/backward traversal only.
694 * For example:
696 * o-o-o
697 * | |
698 * o A
699 * | \
700 * o o
701 * \
702 * M
703 * |\
704 * o o-o-o
705 * | | |
706 * o o X
707 * \
708 * o
709 * \
710 * o
711 * \
712 * o
714 * The backward traversal reached M from the bottom and
715 * slid upwards. The forward traversal already reached
716 * X, which is not a straight line from M anymore, so
717 * picking a mid-snake from M to X would yield a
718 * mistake.
720 * The correct mid-snake is between M and A. M is where
721 * the backward traversal hit the diagonal that the
722 * forwards traversal has already passed, and A is what
723 * it reaches when sliding up identical lines.
724 */
726 int forward_x = kd_forward[k];
727 debug("Compare: k=%d c=%d is (%d,%d) >= (%d,%d)?\n",
728 k, c, forward_x, xk_to_y(forward_x, k),
729 x, xc_to_y(x, c, delta));
730 if (forward_x >= x) {
731 *meeting_snake = (struct diff_box){
732 .left_start = x,
733 .left_end = x_before_slide,
734 .right_start = xc_to_y(x, c, delta),
735 .right_end = xk_to_y(x_before_slide, k),
736 };
737 debug("HIT x=%u,%u - y=%u,%u\n",
738 meeting_snake->left_start,
739 meeting_snake->right_start,
740 meeting_snake->left_end,
741 meeting_snake->right_end);
742 debug_dump_myers_graph(left, right, NULL,
743 kd_forward, d,
744 kd_backward, d);
745 *found_midpoint = true;
746 return 0;
750 debug_dump_myers_graph(left, right, NULL, kd_forward, d, kd_backward,
751 d);
752 return 0;
755 /* Myers "Divide et Impera": tracing forwards from the start and backwards from
756 * the end to find a midpoint that divides the problem into smaller chunks.
757 * Requires only linear amounts of memory. */
758 int
759 diff_algo_myers_divide(const struct diff_algo_config *algo_config,
760 struct diff_state *state)
762 int rc = ENOMEM;
763 struct diff_data *left = &state->left;
764 struct diff_data *right = &state->right;
766 debug("\n** %s\n", __func__);
767 debug("left:\n");
768 debug_dump(left);
769 debug("right:\n");
770 debug_dump(right);
771 debug_dump_myers_graph(left, right, NULL, NULL, 0, NULL, 0);
773 /* Allocate two columns of a Myers graph, one for the forward and one
774 * for the backward traversal. */
775 unsigned int max = left->atoms.len + right->atoms.len;
776 size_t kd_len = max + 1;
777 size_t kd_buf_size = kd_len << 1;
778 int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int));
779 if (!kd_buf)
780 return ENOMEM;
781 int i;
782 for (i = 0; i < kd_buf_size; i++)
783 kd_buf[i] = -1;
784 int *kd_forward = kd_buf;
785 int *kd_backward = kd_buf + kd_len;
787 /* The 'k' axis in Myers spans positive and negative indexes, so point
788 * the kd to the middle.
789 * It is then possible to index from -max/2 .. max/2. */
790 kd_forward += max/2;
791 kd_backward += max/2;
793 int d;
794 struct diff_box mid_snake = {};
795 bool found_midpoint = false;
796 for (d = 0; d <= (max/2); d++) {
797 int r;
798 debug("-- d=%d\n", d);
799 r = diff_divide_myers_forward(&found_midpoint, left, right,
800 kd_forward, kd_backward, d,
801 &mid_snake);
802 if (r)
803 return r;
804 if (found_midpoint)
805 break;
806 r = diff_divide_myers_backward(&found_midpoint, left, right,
807 kd_forward, kd_backward, d,
808 &mid_snake);
809 if (r)
810 return r;
811 if (found_midpoint)
812 break;
815 if (!found_midpoint) {
816 /* Divide and conquer failed to find a meeting point. Use the
817 * fallback_algo defined in the algo_config (leave this to the
818 * caller). This is just paranoia/sanity, we normally should
819 * always find a midpoint.
820 */
821 debug(" no midpoint \n");
822 rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;
823 goto return_rc;
824 } else {
825 debug(" mid snake L: %u to %u of %u R: %u to %u of %u\n",
826 mid_snake.left_start, mid_snake.left_end, left->atoms.len,
827 mid_snake.right_start, mid_snake.right_end,
828 right->atoms.len);
830 /* Section before the mid-snake. */
831 debug("Section before the mid-snake\n");
833 struct diff_atom *left_atom = &left->atoms.head[0];
834 unsigned int left_section_len = mid_snake.left_start;
835 struct diff_atom *right_atom = &right->atoms.head[0];
836 unsigned int right_section_len = mid_snake.right_start;
838 if (left_section_len && right_section_len) {
839 /* Record an unsolved chunk, the caller will apply
840 * inner_algo() on this chunk. */
841 if (!diff_state_add_chunk(state, false,
842 left_atom, left_section_len,
843 right_atom,
844 right_section_len))
845 goto return_rc;
846 } else if (left_section_len && !right_section_len) {
847 /* Only left atoms and none on the right, they form a
848 * "minus" chunk, then. */
849 if (!diff_state_add_chunk(state, true,
850 left_atom, left_section_len,
851 right_atom, 0))
852 goto return_rc;
853 } else if (!left_section_len && right_section_len) {
854 /* No left atoms, only atoms on the right, they form a
855 * "plus" chunk, then. */
856 if (!diff_state_add_chunk(state, true,
857 left_atom, 0,
858 right_atom,
859 right_section_len))
860 goto return_rc;
862 /* else: left_section_len == 0 and right_section_len == 0, i.e.
863 * nothing before the mid-snake. */
865 if (!diff_box_empty(&mid_snake)) {
866 /* The midpoint is a "snake", i.e. on a section of
867 * identical data on both sides: that section
868 * immediately becomes a solved diff chunk. */
869 debug("the mid-snake\n");
870 if (!diff_state_add_chunk(state, true,
871 &left->atoms.head[mid_snake.left_start],
872 mid_snake.left_end - mid_snake.left_start,
873 &right->atoms.head[mid_snake.right_start],
874 mid_snake.right_end - mid_snake.right_start))
875 goto return_rc;
878 /* Section after the mid-snake. */
879 debug("Section after the mid-snake\n");
880 debug(" left_end %u right_end %u\n",
881 mid_snake.left_end, mid_snake.right_end);
882 debug(" left_count %u right_count %u\n",
883 left->atoms.len, right->atoms.len);
884 left_atom = &left->atoms.head[mid_snake.left_end];
885 left_section_len = left->atoms.len - mid_snake.left_end;
886 right_atom = &right->atoms.head[mid_snake.right_end];
887 right_section_len = right->atoms.len - mid_snake.right_end;
889 if (left_section_len && right_section_len) {
890 /* Record an unsolved chunk, the caller will apply
891 * inner_algo() on this chunk. */
892 if (!diff_state_add_chunk(state, false,
893 left_atom, left_section_len,
894 right_atom,
895 right_section_len))
896 goto return_rc;
897 } else if (left_section_len && !right_section_len) {
898 /* Only left atoms and none on the right, they form a
899 * "minus" chunk, then. */
900 if (!diff_state_add_chunk(state, true,
901 left_atom, left_section_len,
902 right_atom, 0))
903 goto return_rc;
904 } else if (!left_section_len && right_section_len) {
905 /* No left atoms, only atoms on the right, they form a
906 * "plus" chunk, then. */
907 if (!diff_state_add_chunk(state, true,
908 left_atom, 0,
909 right_atom,
910 right_section_len))
911 goto return_rc;
913 /* else: left_section_len == 0 and right_section_len == 0, i.e.
914 * nothing after the mid-snake. */
917 rc = DIFF_RC_OK;
919 return_rc:
920 free(kd_buf);
921 debug("** END %s\n", __func__);
922 return rc;
925 /* Myers Diff tracing from the start all the way through to the end, requiring
926 * quadratic amounts of memory. This can fail if the required space surpasses
927 * algo_config->permitted_state_size. */
928 int
929 diff_algo_myers(const struct diff_algo_config *algo_config,
930 struct diff_state *state)
932 /* do a diff_divide_myers_forward() without a _backward(), so that it
933 * walks forward across the entire files to reach the end. Keep each
934 * run's state, and do a final backtrace. */
935 int rc = ENOMEM;
936 struct diff_data *left = &state->left;
937 struct diff_data *right = &state->right;
939 debug("\n** %s\n", __func__);
940 debug("left:\n");
941 debug_dump(left);
942 debug("right:\n");
943 debug_dump(right);
944 debug_dump_myers_graph(left, right, NULL, NULL, 0, NULL, 0);
946 /* Allocate two columns of a Myers graph, one for the forward and one
947 * for the backward traversal. */
948 unsigned int max = left->atoms.len + right->atoms.len;
949 size_t kd_len = max + 1 + max;
950 size_t kd_buf_size = kd_len * kd_len;
951 size_t kd_state_size = kd_buf_size * sizeof(int);
952 debug("state size: %zu\n", kd_buf_size);
953 if (kd_buf_size < kd_len /* overflow? */
954 || kd_state_size > algo_config->permitted_state_size) {
955 debug("state size %zu > permitted_state_size %zu, use fallback_algo\n",
956 kd_state_size, algo_config->permitted_state_size);
957 return DIFF_RC_USE_DIFF_ALGO_FALLBACK;
960 int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int));
961 if (!kd_buf)
962 return ENOMEM;
963 int i;
964 for (i = 0; i < kd_buf_size; i++)
965 kd_buf[i] = -1;
967 /* The 'k' axis in Myers spans positive and negative indexes, so point
968 * the kd to the middle.
969 * It is then possible to index from -max .. max. */
970 int *kd_origin = kd_buf + max;
971 int *kd_column = kd_origin;
973 int d;
974 int backtrack_d = -1;
975 int backtrack_k = 0;
976 int k;
977 int x, y;
978 for (d = 0; d <= max; d++, kd_column += kd_len) {
979 debug("-- d=%d\n", d);
981 debug("-- %s d=%d\n", __func__, d);
983 for (k = d; k >= -d; k -= 2) {
984 if (k < -(int)right->atoms.len
985 || k > (int)left->atoms.len) {
986 /* This diagonal is completely outside of the
987 * Myers graph, don't calculate it. */
988 if (k < -(int)right->atoms.len)
989 debug(" %d k <"
990 " -(int)right->atoms.len %d\n",
991 k, -(int)right->atoms.len);
992 else
993 debug(" %d k > left->atoms.len %d\n", k,
994 left->atoms.len);
995 if (k < 0) {
996 /* We are traversing negatively, and
997 * already below the entire graph,
998 * nothing will come of this. */
999 debug(" break");
1000 break;
1002 debug(" continue");
1003 continue;
1006 debug("- k = %d\n", k);
1007 if (d == 0) {
1008 /* This is the initializing step. There is no
1009 * prev_k yet, get the initial x from the top
1010 * left of the Myers graph. */
1011 x = 0;
1012 } else {
1013 int *kd_prev_column = kd_column - kd_len;
1015 /* Favoring "-" lines first means favoring
1016 * moving rightwards in the Myers graph.
1017 * For this, all k should derive from k - 1,
1018 * only the bottom most k derive from k + 1:
1020 * | d= 0 1 2
1021 * ----+----------------
1022 * k= |
1023 * 2 | 2,0 <-- from
1024 * | / prev_k = 2 - 1 = 1
1025 * 1 | 1,0
1026 * | /
1027 * 0 | -->0,0 3,3
1028 * | \\ /
1029 * -1 | 0,1 <-- bottom most for d=1
1030 * | \\ from prev_k = -1+1 = 0
1031 * -2 | 0,2 <-- bottom most for
1032 * d=2 from
1033 * prev_k = -2+1 = -1
1035 * Except when a k + 1 from a previous run
1036 * already means a further advancement in the
1037 * graph.
1038 * If k == d, there is no k + 1 and k - 1 is the
1039 * only option.
1040 * If k < d, use k + 1 in case that yields a
1041 * larger x. Also use k + 1 if k - 1 is outside
1042 * the graph.
1044 if (k > -d
1045 && (k == d
1046 || (k - 1 >= -(int)right->atoms.len
1047 && kd_prev_column[k - 1]
1048 >= kd_prev_column[k + 1]))) {
1049 /* Advance from k - 1.
1050 * From position prev_k, step to the
1051 * right in the Myers graph: x += 1.
1053 int prev_k = k - 1;
1054 int prev_x = kd_prev_column[prev_k];
1055 x = prev_x + 1;
1056 } else {
1057 /* The bottom most one.
1058 * From position prev_k, step to the
1059 * bottom in the Myers graph: y += 1.
1060 * Incrementing y is achieved by
1061 * decrementing k while keeping the same
1062 * x. (since we're deriving y from y =
1063 * x - k).
1065 int prev_k = k + 1;
1066 int prev_x = kd_prev_column[prev_k];
1067 x = prev_x;
1071 /* Slide down any snake that we might find here. */
1072 while (x < left->atoms.len
1073 && xk_to_y(x, k) < right->atoms.len) {
1074 bool same;
1075 int r = diff_atom_same(&same,
1076 &left->atoms.head[x],
1077 &right->atoms.head[
1078 xk_to_y(x, k)]);
1079 if (r)
1080 return r;
1081 if (!same)
1082 break;
1083 x++;
1085 kd_column[k] = x;
1087 if (DEBUG) {
1088 int fi;
1089 for (fi = d; fi >= k; fi-=2) {
1090 debug("kd_column[%d] = (%d, %d)\n", fi,
1091 kd_column[fi],
1092 kd_column[fi] - fi);
1096 if (x == left->atoms.len
1097 && xk_to_y(x, k) == right->atoms.len) {
1098 /* Found a path */
1099 backtrack_d = d;
1100 backtrack_k = k;
1101 debug("Reached the end at d = %d, k = %d\n",
1102 backtrack_d, backtrack_k);
1103 break;
1107 if (backtrack_d >= 0)
1108 break;
1111 debug_dump_myers_graph(left, right, kd_origin, NULL, 0, NULL, 0);
1113 /* backtrack. A matrix spanning from start to end of the file is ready:
1115 * | d= 0 1 2 3 4
1116 * ----+---------------------------------
1117 * k= |
1118 * 3 |
1119 * |
1120 * 2 | 2,0
1121 * | /
1122 * 1 | 1,0 4,3
1123 * | / / \
1124 * 0 | -->0,0 3,3 4,4 --> backtrack_d = 4, backtrack_k = 0
1125 * | \ / \
1126 * -1 | 0,1 3,4
1127 * | \
1128 * -2 | 0,2
1129 * |
1131 * From (4,4) backwards, find the previous position that is the largest, and remember it.
1134 for (d = backtrack_d, k = backtrack_k; d >= 0; d--) {
1135 x = kd_column[k];
1136 y = xk_to_y(x, k);
1138 /* When the best position is identified, remember it for that
1139 * kd_column.
1140 * That kd_column is no longer needed otherwise, so just
1141 * re-purpose kd_column[0] = x and kd_column[1] = y,
1142 * so that there is no need to allocate more memory.
1144 kd_column[0] = x;
1145 kd_column[1] = y;
1146 debug("Backtrack d=%d: xy=(%d, %d)\n",
1147 d, kd_column[0], kd_column[1]);
1149 /* Don't access memory before kd_buf */
1150 if (d == 0)
1151 break;
1152 int *kd_prev_column = kd_column - kd_len;
1154 /* When y == 0, backtracking downwards (k-1) is the only way.
1155 * When x == 0, backtracking upwards (k+1) is the only way.
1157 * | d= 0 1 2 3 4
1158 * ----+---------------------------------
1159 * k= |
1160 * 3 |
1161 * | ..y == 0
1162 * 2 | 2,0
1163 * | /
1164 * 1 | 1,0 4,3
1165 * | / / \
1166 * 0 | -->0,0 3,3 4,4 --> backtrack_d = 4,
1167 * | \ / \ backtrack_k = 0
1168 * -1 | 0,1 3,4
1169 * | \
1170 * -2 | 0,2__
1171 * | x == 0
1173 debug("prev[k-1] = %d,%d prev[k+1] = %d,%d\n",
1174 kd_prev_column[k-1], xk_to_y(kd_prev_column[k-1],k-1),
1175 kd_prev_column[k+1], xk_to_y(kd_prev_column[k+1],k+1));
1176 if (y == 0
1177 || (x > 0
1178 && kd_prev_column[k - 1] >= kd_prev_column[k + 1])) {
1179 k = k - 1;
1180 debug("prev k=k-1=%d x=%d y=%d\n",
1181 k, kd_prev_column[k],
1182 xk_to_y(kd_prev_column[k], k));
1183 } else {
1184 k = k + 1;
1185 debug("prev k=k+1=%d x=%d y=%d\n",
1186 k, kd_prev_column[k],
1187 xk_to_y(kd_prev_column[k], k));
1189 kd_column = kd_prev_column;
1192 /* Forwards again, this time recording the diff chunks.
1193 * Definitely start from 0,0. kd_column[0] may actually point to the
1194 * bottom of a snake starting at 0,0 */
1195 x = 0;
1196 y = 0;
1198 kd_column = kd_origin;
1199 for (d = 0; d <= backtrack_d; d++, kd_column += kd_len) {
1200 int next_x = kd_column[0];
1201 int next_y = kd_column[1];
1202 debug("Forward track from xy(%d,%d) to xy(%d,%d)\n",
1203 x, y, next_x, next_y);
1205 struct diff_atom *left_atom = &left->atoms.head[x];
1206 int left_section_len = next_x - x;
1207 struct diff_atom *right_atom = &right->atoms.head[y];
1208 int right_section_len = next_y - y;
1210 rc = ENOMEM;
1211 if (left_section_len && right_section_len) {
1212 /* This must be a snake slide.
1213 * Snake slides have a straight line leading into them
1214 * (except when starting at (0,0)). Find out whether the
1215 * lead-in is horizontal or vertical:
1217 * left
1218 * ---------->
1219 * |
1220 * r| o-o o
1221 * i| \ |
1222 * g| o o
1223 * h| \ \
1224 * t| o o
1225 * v
1227 * If left_section_len > right_section_len, the lead-in
1228 * is horizontal, meaning first remove one atom from the
1229 * left before sliding down the snake.
1230 * If right_section_len > left_section_len, the lead-in
1231 * is vetical, so add one atom from the right before
1232 * sliding down the snake. */
1233 if (left_section_len == right_section_len + 1) {
1234 if (!diff_state_add_chunk(state, true,
1235 left_atom, 1,
1236 right_atom, 0))
1237 goto return_rc;
1238 left_atom++;
1239 left_section_len--;
1240 } else if (right_section_len == left_section_len + 1) {
1241 if (!diff_state_add_chunk(state, true,
1242 left_atom, 0,
1243 right_atom, 1))
1244 goto return_rc;
1245 right_atom++;
1246 right_section_len--;
1247 } else if (left_section_len != right_section_len) {
1248 /* The numbers are making no sense. Should never
1249 * happen. */
1250 rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;
1251 goto return_rc;
1254 if (!diff_state_add_chunk(state, true,
1255 left_atom, left_section_len,
1256 right_atom,
1257 right_section_len))
1258 goto return_rc;
1259 } else if (left_section_len && !right_section_len) {
1260 /* Only left atoms and none on the right, they form a
1261 * "minus" chunk, then. */
1262 if (!diff_state_add_chunk(state, true,
1263 left_atom, left_section_len,
1264 right_atom, 0))
1265 goto return_rc;
1266 } else if (!left_section_len && right_section_len) {
1267 /* No left atoms, only atoms on the right, they form a
1268 * "plus" chunk, then. */
1269 if (!diff_state_add_chunk(state, true,
1270 left_atom, 0,
1271 right_atom,
1272 right_section_len))
1273 goto return_rc;
1276 x = next_x;
1277 y = next_y;
1280 rc = DIFF_RC_OK;
1282 return_rc:
1283 free(kd_buf);
1284 debug("** END %s rc=%d\n", __func__, rc);
1285 return rc;