Blob


1 /* Implementation of the Patience Diff algorithm invented by Bram Cohen:
2 * Divide a diff problem into smaller chunks by an LCS of common-unique lines. */
3 /*
4 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
19 #include <assert.h>
20 #include <inttypes.h>
21 #include <errno.h>
22 #include <stdbool.h>
23 #include <stdio.h>
24 #include <stdlib.h>
26 #include <diff/arraylist.h>
27 #include <diff/diff_main.h>
29 #include "diff_debug.h"
31 /* Set unique_here = true for all atoms that exist exactly once in this list. */
32 static int
33 diff_atoms_mark_unique(struct diff_data *d, unsigned int *unique_count)
34 {
35 struct diff_atom *i;
36 unsigned int count = 0;
37 diff_data_foreach_atom(i, d) {
38 i->patience.unique_here = true;
39 i->patience.unique_in_both = true;
40 count++;
41 }
42 diff_data_foreach_atom(i, d) {
43 struct diff_atom *j;
45 if (!i->patience.unique_here)
46 continue;
48 diff_data_foreach_atom_from(i + 1, j, d) {
49 bool same;
50 int r = diff_atom_same(&same, i, j);
51 if (r)
52 return r;
53 if (!same)
54 continue;
55 if (i->patience.unique_here) {
56 i->patience.unique_here = false;
57 i->patience.unique_in_both = false;
58 count--;
59 }
60 j->patience.unique_here = false;
61 j->patience.unique_in_both = false;
62 count--;
63 }
64 }
65 if (unique_count)
66 *unique_count = count;
67 return 0;
68 }
70 /* Mark those lines as atom->patience.unique_in_both = true that appear exactly
71 * once in each side. */
72 static int
73 diff_atoms_mark_unique_in_both(struct diff_data *left, struct diff_data *right,
74 unsigned int *unique_in_both_count)
75 {
76 /* Derive the final unique_in_both count without needing an explicit
77 * iteration. So this is just some optimiziation to save one iteration
78 * in the end. */
79 unsigned int unique_in_both;
80 int r;
82 r = diff_atoms_mark_unique(left, &unique_in_both);
83 if (r)
84 return r;
85 r = diff_atoms_mark_unique(right, NULL);
86 if (r)
87 return r;
89 debug("unique_in_both %u\n", unique_in_both);
91 struct diff_atom *i;
92 diff_data_foreach_atom(i, left) {
93 if (!i->patience.unique_here)
94 continue;
95 struct diff_atom *j;
96 int found_in_b = 0;
97 diff_data_foreach_atom(j, right) {
98 bool same;
99 int r = diff_atom_same(&same, i, j);
100 if (r)
101 return r;
102 if (!same)
103 continue;
104 if (!j->patience.unique_here) {
105 found_in_b = 2; /* or more */
106 break;
107 } else {
108 found_in_b = 1;
109 j->patience.pos_in_other = i;
110 i->patience.pos_in_other = j;
114 if (found_in_b == 0 || found_in_b > 1) {
115 i->patience.unique_in_both = false;
116 unique_in_both--;
117 debug("unique_in_both %u (%d) ", unique_in_both,
118 found_in_b);
119 debug_dump_atom(left, NULL, i);
123 /* Still need to unmark right[*]->patience.unique_in_both for atoms that
124 * don't exist in left */
125 diff_data_foreach_atom(i, right) {
126 if (!i->patience.unique_here
127 || !i->patience.unique_in_both)
128 continue;
129 struct diff_atom *j;
130 bool found_in_a = false;
131 diff_data_foreach_atom(j, left) {
132 bool same;
133 int r;
134 if (!j->patience.unique_in_both)
135 continue;
136 r = diff_atom_same(&same, i, j);
137 if (r)
138 return r;
139 if (!same)
140 continue;
141 found_in_a = true;
142 break;
145 if (!found_in_a)
146 i->patience.unique_in_both = false;
149 if (unique_in_both_count)
150 *unique_in_both_count = unique_in_both;
151 return 0;
154 static int
155 diff_atoms_swallow_identical_neighbors(struct diff_data *left,
156 struct diff_data *right,
157 unsigned int *unique_in_both_count)
159 debug("trivially combine identical lines"
160 " around unique_in_both lines\n");
162 unsigned int l_idx;
163 unsigned int next_l_idx;
164 unsigned int l_min = 0;
165 unsigned int r_min = 0;
166 for (l_idx = 0; l_idx < left->atoms.len; l_idx = next_l_idx) {
167 next_l_idx = l_idx + 1;
168 struct diff_atom *l = &left->atoms.head[l_idx];
170 if (!l->patience.unique_in_both)
171 continue;
173 debug("check identical lines around ");
174 debug_dump_atom(left, right, l);
176 unsigned int r_idx = diff_atom_idx(right,
177 l->patience.pos_in_other);
179 struct diff_range identical_l;
180 struct diff_range identical_r;
182 /* Swallow upwards.
183 * Each common-unique line swallows identical lines upwards and
184 * downwards.
185 * All common-unique lines that were part of the identical lines
186 * following below were already swallowed in the previous
187 * iteration, so we will never hit another common-unique line
188 * above. */
189 for (identical_l.start = l_idx, identical_r.start = r_idx;
190 identical_l.start > l_min && identical_r.start > r_min;
191 identical_l.start--, identical_r.start--) {
192 bool same;
193 int r = diff_atom_same(&same,
194 &left->atoms.head[identical_l.start - 1],
195 &right->atoms.head[identical_r.start - 1]);
196 if (r)
197 return r;
198 if (!same)
199 break;
202 /* Swallow downwards */
203 for (identical_l.end = l_idx + 1, identical_r.end = r_idx + 1;
204 identical_l.end < left->atoms.len
205 && identical_r.end < right->atoms.len;
206 identical_l.end++, identical_r.end++,
207 next_l_idx++) {
208 struct diff_atom *l_end;
209 struct diff_atom *r_end;
210 bool same;
211 int r = diff_atom_same(&same,
212 &left->atoms.head[identical_l.end],
213 &right->atoms.head[identical_r.end]);
214 if (r)
215 return r;
216 if (!same)
217 break;
218 l_end = &left->atoms.head[identical_l.end];
219 r_end = &right->atoms.head[identical_r.end];
220 if (!l_end->patience.unique_in_both)
221 continue;
222 /* Part of a chunk of identical lines, remove from
223 * listing of unique_in_both lines */
224 l_end->patience.unique_in_both = false;
225 r_end->patience.unique_in_both = false;
226 (*unique_in_both_count)--;
229 l->patience.identical_lines = identical_l;
230 l->patience.pos_in_other->patience.identical_lines =
231 identical_r;
233 l_min = identical_l.end;
234 r_min = identical_r.end;
236 if (!diff_range_empty(&l->patience.identical_lines)) {
237 debug("common-unique line at l=%u r=%u swallowed"
238 " identical lines l=%u-%u r=%u-%u\n",
239 l_idx, r_idx,
240 identical_l.start, identical_l.end,
241 identical_r.start, identical_r.end);
243 debug("next_l_idx = %u\n", next_l_idx);
245 return 0;
248 /* binary search to find the stack to put this atom "card" on. */
249 static int
250 find_target_stack(struct diff_atom *atom,
251 struct diff_atom **patience_stacks,
252 unsigned int patience_stacks_count)
254 unsigned int lo = 0;
255 unsigned int hi = patience_stacks_count;
256 while (lo < hi) {
257 unsigned int mid = (lo + hi) >> 1;
259 if (patience_stacks[mid]->patience.pos_in_other
260 < atom->patience.pos_in_other)
261 lo = mid + 1;
262 else
263 hi = mid;
265 return lo;
268 /* Among the lines that appear exactly once in each side, find the longest
269 * streak that appear in both files in the same order (with other stuff allowed
270 * to interleave). Use patience sort for that, as in the Patience Diff
271 * algorithm.
272 * See https://bramcohen.livejournal.com/73318.html and, for a much more
273 * detailed explanation,
274 * https://blog.jcoglan.com/2017/09/19/the-patience-diff-algorithm/ */
275 int
276 diff_algo_patience(const struct diff_algo_config *algo_config,
277 struct diff_state *state)
279 int rc;
281 struct diff_data *left = &state->left;
282 struct diff_data *right = &state->right;
284 unsigned int unique_in_both_count;
286 debug("\n** %s\n", __func__);
288 /* Find those lines that appear exactly once in 'left' and exactly once
289 * in 'right'. */
290 rc = diff_atoms_mark_unique_in_both(left, right, &unique_in_both_count);
291 if (rc)
292 return rc;
294 debug("unique_in_both_count %u\n", unique_in_both_count);
295 debug("left:\n");
296 debug_dump(left);
297 debug("right:\n");
298 debug_dump(right);
300 if (!unique_in_both_count) {
301 /* Cannot apply Patience, tell the caller to use fallback_algo
302 * instead. */
303 return DIFF_RC_USE_DIFF_ALGO_FALLBACK;
306 rc = diff_atoms_swallow_identical_neighbors(left, right,
307 &unique_in_both_count);
308 if (rc)
309 return rc;
310 debug("After swallowing identical neighbors: unique_in_both = %u\n",
311 unique_in_both_count);
313 rc = ENOMEM;
315 /* An array of Longest Common Sequence is the result of the below
316 * subscope: */
317 unsigned int lcs_count = 0;
318 struct diff_atom **lcs = NULL;
319 struct diff_atom *lcs_tail = NULL;
322 /* This subscope marks the lifetime of the atom_pointers
323 * allocation */
325 /* One chunk of storage for atom pointers */
326 struct diff_atom **atom_pointers;
327 atom_pointers = recallocarray(NULL, 0, unique_in_both_count * 2,
328 sizeof(struct diff_atom*));
330 /* Half for the list of atoms that still need to be put on
331 * stacks */
332 struct diff_atom **uniques = atom_pointers;
334 /* Half for the patience sort state's "card stacks" -- we
335 * remember only each stack's topmost "card" */
336 struct diff_atom **patience_stacks;
337 patience_stacks = atom_pointers + unique_in_both_count;
338 unsigned int patience_stacks_count = 0;
340 /* Take all common, unique items from 'left' ... */
342 struct diff_atom *atom;
343 struct diff_atom **uniques_end = uniques;
344 diff_data_foreach_atom(atom, left) {
345 if (!atom->patience.unique_in_both)
346 continue;
347 *uniques_end = atom;
348 uniques_end++;
351 /* ...and sort them to the order found in 'right'.
352 * The idea is to find the leftmost stack that has a higher line
353 * number and add it to the stack's top.
354 * If there is no such stack, open a new one on the right. The
355 * line number is derived from the atom*, which are array items
356 * and hence reflect the relative position in the source file.
357 * So we got the common-uniques from 'left' and sort them
358 * according to atom->patience.pos_in_other. */
359 unsigned int i;
360 for (i = 0; i < unique_in_both_count; i++) {
361 atom = uniques[i];
362 unsigned int target_stack;
363 target_stack = find_target_stack(atom, patience_stacks,
364 patience_stacks_count);
365 assert(target_stack <= patience_stacks_count);
366 patience_stacks[target_stack] = atom;
367 if (target_stack == patience_stacks_count)
368 patience_stacks_count++;
370 /* Record a back reference to the next stack on the
371 * left, which will form the final longest sequence
372 * later. */
373 atom->patience.prev_stack = target_stack ?
374 patience_stacks[target_stack - 1] : NULL;
378 /* backtrace through prev_stack references to form the final
379 * longest common sequence */
380 lcs_tail = patience_stacks[patience_stacks_count - 1];
381 lcs_count = patience_stacks_count;
383 /* uniques and patience_stacks are no longer needed.
384 * Backpointers are in atom->patience.prev_stack */
385 free(atom_pointers);
388 lcs = recallocarray(NULL, 0, lcs_count, sizeof(struct diff_atom*));
389 struct diff_atom **lcs_backtrace_pos = &lcs[lcs_count - 1];
390 struct diff_atom *atom;
391 for (atom = lcs_tail; atom; atom = atom->patience.prev_stack, lcs_backtrace_pos--) {
392 assert(lcs_backtrace_pos >= lcs);
393 *lcs_backtrace_pos = atom;
396 unsigned int i;
397 if (DEBUG) {
398 debug("\npatience LCS:\n");
399 for (i = 0; i < lcs_count; i++) {
400 debug_dump_atom(left, right, lcs[i]);
405 /* TODO: For each common-unique line found (now listed in lcs), swallow
406 * lines upwards and downwards that are identical on each side. Requires
407 * a way to represent atoms being glued to adjacent atoms. */
409 debug("\ntraverse LCS, possibly recursing:\n");
411 /* Now we have pinned positions in both files at which it makes sense to
412 * divide the diff problem into smaller chunks. Go into the next round:
413 * look at each section in turn, trying to again find common-unique
414 * lines in those smaller sections. As soon as no more are found, the
415 * remaining smaller sections are solved by Myers. */
416 unsigned int left_pos = 0;
417 unsigned int right_pos = 0;
418 for (i = 0; i <= lcs_count; i++) {
419 struct diff_atom *atom;
420 struct diff_atom *atom_r;
421 unsigned int left_idx;
422 unsigned int right_idx;
424 if (i < lcs_count) {
425 atom = lcs[i];
426 atom_r = atom->patience.pos_in_other;
427 debug("lcs[%u] = left[%ld] = right[%ld]\n", i,
428 diff_atom_idx(left, atom), diff_atom_idx(right, atom_r));
429 left_idx = atom->patience.identical_lines.start;
430 right_idx = atom_r->patience.identical_lines.start;
431 debug(" identical lines l %u-%u r %u-%u\n",
432 atom->patience.identical_lines.start, atom->patience.identical_lines.end,
433 atom_r->patience.identical_lines.start, atom_r->patience.identical_lines.end);
434 } else {
435 atom = NULL;
436 atom_r = NULL;
437 left_idx = left->atoms.len;
438 right_idx = right->atoms.len;
441 /* 'atom' now marks an atom that matches on both sides according
442 * to patience-diff (a common-unique identical atom in both
443 * files).
444 * Handle the section before and the atom itself; the section
445 * after will be handled by the next loop iteration -- note that
446 * i loops to last element + 1 ("i <= lcs_count"), so that there
447 * will be another final iteration to pick up the last remaining
448 * items after the last LCS atom.
449 * The sections before might also be empty on left and/or right.
450 * left_pos and right_pos mark the indexes of the first atoms
451 * that have not yet been handled in the previous loop
452 * iteration. left_idx and right_idx mark the indexes of the
453 * matching atom on left and right, respectively. */
455 debug("iteration %u left_pos %u left_idx %u"
456 " right_pos %u right_idx %u\n",
457 i, left_pos, left_idx, right_pos, right_idx);
459 /* Section before the matching atom */
460 struct diff_atom *left_atom = &left->atoms.head[left_pos];
461 unsigned int left_section_len = left_idx - left_pos;
463 struct diff_atom *right_atom = &(right->atoms.head[right_pos]);
464 unsigned int right_section_len = right_idx - right_pos;
466 if (left_section_len && right_section_len) {
467 /* Record an unsolved chunk, the caller will apply
468 * inner_algo() on this chunk. */
469 if (!diff_state_add_chunk(state, false,
470 left_atom, left_section_len,
471 right_atom,
472 right_section_len))
473 goto return_rc;
474 } else if (left_section_len && !right_section_len) {
475 /* Only left atoms and none on the right, they form a
476 * "minus" chunk, then. */
477 if (!diff_state_add_chunk(state, true,
478 left_atom, left_section_len,
479 right_atom, 0))
480 goto return_rc;
481 } else if (!left_section_len && right_section_len) {
482 /* No left atoms, only atoms on the right, they form a
483 * "plus" chunk, then. */
484 if (!diff_state_add_chunk(state, true,
485 left_atom, 0,
486 right_atom, right_section_len))
487 goto return_rc;
489 /* else: left_section_len == 0 and right_section_len == 0, i.e.
490 * nothing here. */
492 /* The atom found to match on both sides forms a chunk of equals
493 * on each side. In the very last iteration of this loop, there
494 * is no matching atom, we were just cleaning out the remaining
495 * lines. */
496 if (atom) {
497 void *ok;
498 ok = diff_state_add_chunk(state, true,
499 left->atoms.head
500 + atom->patience.identical_lines.start,
501 diff_range_len(&atom->patience.identical_lines),
502 right->atoms.head
503 + atom_r->patience.identical_lines.start,
504 diff_range_len(&atom_r->patience.identical_lines));
505 if (!ok)
506 goto return_rc;
507 left_pos = atom->patience.identical_lines.end;
508 right_pos = atom_r->patience.identical_lines.end;
509 } else {
510 left_pos = left_idx + 1;
511 right_pos = right_idx + 1;
513 debug("end of iteration %u left_pos %u left_idx %u"
514 " right_pos %u right_idx %u\n",
515 i, left_pos, left_idx, right_pos, right_idx);
517 debug("** END %s\n", __func__);
519 rc = DIFF_RC_OK;
521 return_rc:
522 free(lcs);
523 return rc;