1 /* Implementation of the Patience Diff algorithm invented by Bram Cohen:
2 * Divide a diff problem into smaller chunks by an LCS of common-unique lines. */
4 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
26 #include <diff/arraylist.h>
27 #include <diff/diff_main.h>
29 #include "diff_debug.h"
31 /* Set unique_here = true for all atoms that exist exactly once in this list. */
33 diff_atoms_mark_unique(struct diff_data *d, unsigned int *unique_count)
36 unsigned int count = 0;
37 diff_data_foreach_atom(i, d) {
38 i->patience.unique_here = true;
39 i->patience.unique_in_both = true;
42 diff_data_foreach_atom(i, d) {
45 if (!i->patience.unique_here)
48 diff_data_foreach_atom_from(i + 1, j, d) {
50 int r = diff_atom_same(&same, i, j);
55 if (i->patience.unique_here) {
56 i->patience.unique_here = false;
57 i->patience.unique_in_both = false;
60 j->patience.unique_here = false;
61 j->patience.unique_in_both = false;
66 *unique_count = count;
70 /* Mark those lines as atom->patience.unique_in_both = true that appear exactly
71 * once in each side. */
73 diff_atoms_mark_unique_in_both(struct diff_data *left, struct diff_data *right,
74 unsigned int *unique_in_both_count)
76 /* Derive the final unique_in_both count without needing an explicit
77 * iteration. So this is just some optimiziation to save one iteration
79 unsigned int unique_in_both;
82 r = diff_atoms_mark_unique(left, &unique_in_both);
85 r = diff_atoms_mark_unique(right, NULL);
89 debug("unique_in_both %u\n", unique_in_both);
92 diff_data_foreach_atom(i, left) {
93 if (!i->patience.unique_here)
97 diff_data_foreach_atom(j, right) {
99 int r = diff_atom_same(&same, i, j);
104 if (!j->patience.unique_here) {
105 found_in_b = 2; /* or more */
109 j->patience.pos_in_other = i;
110 i->patience.pos_in_other = j;
114 if (found_in_b == 0 || found_in_b > 1) {
115 i->patience.unique_in_both = false;
117 debug("unique_in_both %u (%d) ", unique_in_both,
119 debug_dump_atom(left, NULL, i);
123 /* Still need to unmark right[*]->patience.unique_in_both for atoms that
124 * don't exist in left */
125 diff_data_foreach_atom(i, right) {
126 if (!i->patience.unique_here
127 || !i->patience.unique_in_both)
130 bool found_in_a = false;
131 diff_data_foreach_atom(j, left) {
134 if (!j->patience.unique_in_both)
136 r = diff_atom_same(&same, i, j);
146 i->patience.unique_in_both = false;
149 if (unique_in_both_count)
150 *unique_in_both_count = unique_in_both;
155 diff_atoms_swallow_identical_neighbors(struct diff_data *left,
156 struct diff_data *right,
157 unsigned int *unique_in_both_count)
159 debug("trivially combine identical lines"
160 " around unique_in_both lines\n");
163 unsigned int next_l_idx;
164 unsigned int l_min = 0;
165 unsigned int r_min = 0;
166 for (l_idx = 0; l_idx < left->atoms.len; l_idx = next_l_idx) {
167 next_l_idx = l_idx + 1;
168 struct diff_atom *l = &left->atoms.head[l_idx];
170 if (!l->patience.unique_in_both)
173 debug("check identical lines around ");
174 debug_dump_atom(left, right, l);
176 unsigned int r_idx = diff_atom_idx(right,
177 l->patience.pos_in_other);
179 struct diff_range identical_l;
180 struct diff_range identical_r;
183 * Each common-unique line swallows identical lines upwards and
185 * All common-unique lines that were part of the identical lines
186 * following below were already swallowed in the previous
187 * iteration, so we will never hit another common-unique line
189 for (identical_l.start = l_idx, identical_r.start = r_idx;
190 identical_l.start > l_min && identical_r.start > r_min;
191 identical_l.start--, identical_r.start--) {
193 int r = diff_atom_same(&same,
194 &left->atoms.head[identical_l.start - 1],
195 &right->atoms.head[identical_r.start - 1]);
202 /* Swallow downwards */
203 for (identical_l.end = l_idx + 1, identical_r.end = r_idx + 1;
204 identical_l.end < left->atoms.len
205 && identical_r.end < right->atoms.len;
206 identical_l.end++, identical_r.end++,
208 struct diff_atom *l_end;
209 struct diff_atom *r_end;
211 int r = diff_atom_same(&same,
212 &left->atoms.head[identical_l.end],
213 &right->atoms.head[identical_r.end]);
218 l_end = &left->atoms.head[identical_l.end];
219 r_end = &right->atoms.head[identical_r.end];
220 if (!l_end->patience.unique_in_both)
222 /* Part of a chunk of identical lines, remove from
223 * listing of unique_in_both lines */
224 l_end->patience.unique_in_both = false;
225 r_end->patience.unique_in_both = false;
226 (*unique_in_both_count)--;
229 l->patience.identical_lines = identical_l;
230 l->patience.pos_in_other->patience.identical_lines =
233 l_min = identical_l.end;
234 r_min = identical_r.end;
236 if (!diff_range_empty(&l->patience.identical_lines)) {
237 debug("common-unique line at l=%u r=%u swallowed"
238 " identical lines l=%u-%u r=%u-%u\n",
240 identical_l.start, identical_l.end,
241 identical_r.start, identical_r.end);
243 debug("next_l_idx = %u\n", next_l_idx);
248 /* binary search to find the stack to put this atom "card" on. */
250 find_target_stack(struct diff_atom *atom,
251 struct diff_atom **patience_stacks,
252 unsigned int patience_stacks_count)
255 unsigned int hi = patience_stacks_count;
257 unsigned int mid = (lo + hi) >> 1;
259 if (patience_stacks[mid]->patience.pos_in_other
260 < atom->patience.pos_in_other)
268 /* Among the lines that appear exactly once in each side, find the longest
269 * streak that appear in both files in the same order (with other stuff allowed
270 * to interleave). Use patience sort for that, as in the Patience Diff
272 * See https://bramcohen.livejournal.com/73318.html and, for a much more
273 * detailed explanation,
274 * https://blog.jcoglan.com/2017/09/19/the-patience-diff-algorithm/ */
276 diff_algo_patience(const struct diff_algo_config *algo_config,
277 struct diff_state *state)
281 struct diff_data *left = &state->left;
282 struct diff_data *right = &state->right;
284 unsigned int unique_in_both_count;
286 debug("\n** %s\n", __func__);
288 /* Find those lines that appear exactly once in 'left' and exactly once
290 rc = diff_atoms_mark_unique_in_both(left, right, &unique_in_both_count);
294 debug("unique_in_both_count %u\n", unique_in_both_count);
300 if (!unique_in_both_count) {
301 /* Cannot apply Patience, tell the caller to use fallback_algo
303 return DIFF_RC_USE_DIFF_ALGO_FALLBACK;
306 rc = diff_atoms_swallow_identical_neighbors(left, right,
307 &unique_in_both_count);
310 debug("After swallowing identical neighbors: unique_in_both = %u\n",
311 unique_in_both_count);
315 /* An array of Longest Common Sequence is the result of the below
317 unsigned int lcs_count = 0;
318 struct diff_atom **lcs = NULL;
319 struct diff_atom *lcs_tail = NULL;
322 /* This subscope marks the lifetime of the atom_pointers
325 /* One chunk of storage for atom pointers */
326 struct diff_atom **atom_pointers;
327 atom_pointers = recallocarray(NULL, 0, unique_in_both_count * 2,
328 sizeof(struct diff_atom*));
330 /* Half for the list of atoms that still need to be put on
332 struct diff_atom **uniques = atom_pointers;
334 /* Half for the patience sort state's "card stacks" -- we
335 * remember only each stack's topmost "card" */
336 struct diff_atom **patience_stacks;
337 patience_stacks = atom_pointers + unique_in_both_count;
338 unsigned int patience_stacks_count = 0;
340 /* Take all common, unique items from 'left' ... */
342 struct diff_atom *atom;
343 struct diff_atom **uniques_end = uniques;
344 diff_data_foreach_atom(atom, left) {
345 if (!atom->patience.unique_in_both)
351 /* ...and sort them to the order found in 'right'.
352 * The idea is to find the leftmost stack that has a higher line
353 * number and add it to the stack's top.
354 * If there is no such stack, open a new one on the right. The
355 * line number is derived from the atom*, which are array items
356 * and hence reflect the relative position in the source file.
357 * So we got the common-uniques from 'left' and sort them
358 * according to atom->patience.pos_in_other. */
360 for (i = 0; i < unique_in_both_count; i++) {
362 unsigned int target_stack;
363 target_stack = find_target_stack(atom, patience_stacks,
364 patience_stacks_count);
365 assert(target_stack <= patience_stacks_count);
366 patience_stacks[target_stack] = atom;
367 if (target_stack == patience_stacks_count)
368 patience_stacks_count++;
370 /* Record a back reference to the next stack on the
371 * left, which will form the final longest sequence
373 atom->patience.prev_stack = target_stack ?
374 patience_stacks[target_stack - 1] : NULL;
378 /* backtrace through prev_stack references to form the final
379 * longest common sequence */
380 lcs_tail = patience_stacks[patience_stacks_count - 1];
381 lcs_count = patience_stacks_count;
383 /* uniques and patience_stacks are no longer needed.
384 * Backpointers are in atom->patience.prev_stack */
388 lcs = recallocarray(NULL, 0, lcs_count, sizeof(struct diff_atom*));
389 struct diff_atom **lcs_backtrace_pos = &lcs[lcs_count - 1];
390 struct diff_atom *atom;
391 for (atom = lcs_tail; atom; atom = atom->patience.prev_stack, lcs_backtrace_pos--) {
392 assert(lcs_backtrace_pos >= lcs);
393 *lcs_backtrace_pos = atom;
398 debug("\npatience LCS:\n");
399 for (i = 0; i < lcs_count; i++) {
400 debug_dump_atom(left, right, lcs[i]);
405 /* TODO: For each common-unique line found (now listed in lcs), swallow
406 * lines upwards and downwards that are identical on each side. Requires
407 * a way to represent atoms being glued to adjacent atoms. */
409 debug("\ntraverse LCS, possibly recursing:\n");
411 /* Now we have pinned positions in both files at which it makes sense to
412 * divide the diff problem into smaller chunks. Go into the next round:
413 * look at each section in turn, trying to again find common-unique
414 * lines in those smaller sections. As soon as no more are found, the
415 * remaining smaller sections are solved by Myers. */
416 unsigned int left_pos = 0;
417 unsigned int right_pos = 0;
418 for (i = 0; i <= lcs_count; i++) {
419 struct diff_atom *atom;
420 struct diff_atom *atom_r;
421 unsigned int left_idx;
422 unsigned int right_idx;
426 atom_r = atom->patience.pos_in_other;
427 debug("lcs[%u] = left[%ld] = right[%ld]\n", i,
428 diff_atom_idx(left, atom), diff_atom_idx(right, atom_r));
429 left_idx = atom->patience.identical_lines.start;
430 right_idx = atom_r->patience.identical_lines.start;
431 debug(" identical lines l %u-%u r %u-%u\n",
432 atom->patience.identical_lines.start, atom->patience.identical_lines.end,
433 atom_r->patience.identical_lines.start, atom_r->patience.identical_lines.end);
437 left_idx = left->atoms.len;
438 right_idx = right->atoms.len;
441 /* 'atom' now marks an atom that matches on both sides according
442 * to patience-diff (a common-unique identical atom in both
444 * Handle the section before and the atom itself; the section
445 * after will be handled by the next loop iteration -- note that
446 * i loops to last element + 1 ("i <= lcs_count"), so that there
447 * will be another final iteration to pick up the last remaining
448 * items after the last LCS atom.
449 * The sections before might also be empty on left and/or right.
450 * left_pos and right_pos mark the indexes of the first atoms
451 * that have not yet been handled in the previous loop
452 * iteration. left_idx and right_idx mark the indexes of the
453 * matching atom on left and right, respectively. */
455 debug("iteration %u left_pos %u left_idx %u"
456 " right_pos %u right_idx %u\n",
457 i, left_pos, left_idx, right_pos, right_idx);
459 /* Section before the matching atom */
460 struct diff_atom *left_atom = &left->atoms.head[left_pos];
461 unsigned int left_section_len = left_idx - left_pos;
463 struct diff_atom *right_atom = &(right->atoms.head[right_pos]);
464 unsigned int right_section_len = right_idx - right_pos;
466 if (left_section_len && right_section_len) {
467 /* Record an unsolved chunk, the caller will apply
468 * inner_algo() on this chunk. */
469 if (!diff_state_add_chunk(state, false,
470 left_atom, left_section_len,
474 } else if (left_section_len && !right_section_len) {
475 /* Only left atoms and none on the right, they form a
476 * "minus" chunk, then. */
477 if (!diff_state_add_chunk(state, true,
478 left_atom, left_section_len,
481 } else if (!left_section_len && right_section_len) {
482 /* No left atoms, only atoms on the right, they form a
483 * "plus" chunk, then. */
484 if (!diff_state_add_chunk(state, true,
486 right_atom, right_section_len))
489 /* else: left_section_len == 0 and right_section_len == 0, i.e.
492 /* The atom found to match on both sides forms a chunk of equals
493 * on each side. In the very last iteration of this loop, there
494 * is no matching atom, we were just cleaning out the remaining
498 ok = diff_state_add_chunk(state, true,
500 + atom->patience.identical_lines.start,
501 diff_range_len(&atom->patience.identical_lines),
503 + atom_r->patience.identical_lines.start,
504 diff_range_len(&atom_r->patience.identical_lines));
507 left_pos = atom->patience.identical_lines.end;
508 right_pos = atom_r->patience.identical_lines.end;
510 left_pos = left_idx + 1;
511 right_pos = right_idx + 1;
513 debug("end of iteration %u left_pos %u left_idx %u"
514 " right_pos %u right_idx %u\n",
515 i, left_pos, left_idx, right_pos, right_idx);
517 debug("** END %s\n", __func__);