Blob


1 /* Generic infrastructure to implement various diff algorithms (implementation). */
2 /*
3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
19 #include <sys/queue.h>
20 #include <ctype.h>
21 #include <errno.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24 #include <stdbool.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <limits.h>
28 #include <unistd.h>
30 #include <assert.h>
32 #include <arraylist.h>
33 #include <diff_main.h>
35 #include "got_compat.h"
37 #include "diff_internal.h"
38 #include "diff_debug.h"
40 inline enum diff_chunk_type
41 diff_chunk_type(const struct diff_chunk *chunk)
42 {
43 if (!chunk->left_count && !chunk->right_count)
44 return CHUNK_EMPTY;
45 if (!chunk->solved)
46 return CHUNK_ERROR;
47 if (!chunk->right_count)
48 return CHUNK_MINUS;
49 if (!chunk->left_count)
50 return CHUNK_PLUS;
51 if (chunk->left_count != chunk->right_count)
52 return CHUNK_ERROR;
53 return CHUNK_SAME;
54 }
56 static int
57 read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len)
58 {
59 int r;
60 if (fseeko(f, at_pos, SEEK_SET) == -1)
61 return errno;
62 r = fread(buf, sizeof(char), len, f);
63 if ((r == 0 || r < len) && ferror(f))
64 return EIO;
65 if (r != len)
66 return EIO;
67 return 0;
68 }
70 static int
71 buf_cmp(const unsigned char *left, size_t left_len,
72 const unsigned char *right, size_t right_len,
73 bool ignore_whitespace)
74 {
75 int cmp;
77 if (ignore_whitespace) {
78 int il = 0, ir = 0;
79 while (il < left_len && ir < right_len) {
80 unsigned char cl = left[il];
81 unsigned char cr = right[ir];
83 if (isspace((unsigned char)cl) && il < left_len) {
84 il++;
85 continue;
86 }
87 if (isspace((unsigned char)cr) && ir < right_len) {
88 ir++;
89 continue;
90 }
92 if (cl > cr)
93 return 1;
94 if (cr > cl)
95 return -1;
96 il++;
97 ir++;
98 }
99 while (il < left_len) {
100 unsigned char cl = left[il++];
101 if (!isspace((unsigned char)cl))
102 return 1;
104 while (ir < right_len) {
105 unsigned char cr = right[ir++];
106 if (!isspace((unsigned char)cr))
107 return -1;
110 return 0;
113 cmp = memcmp(left, right, MIN(left_len, right_len));
114 if (cmp)
115 return cmp;
116 if (left_len == right_len)
117 return 0;
118 return (left_len > right_len) ? 1 : -1;
121 int
122 diff_atom_cmp(int *cmp,
123 const struct diff_atom *left,
124 const struct diff_atom *right)
126 off_t remain_left, remain_right;
127 int flags = (left->root->diff_flags | right->root->diff_flags);
128 bool ignore_whitespace = (flags & DIFF_FLAG_IGNORE_WHITESPACE);
130 if (!left->len && !right->len) {
131 *cmp = 0;
132 return 0;
134 if (!ignore_whitespace) {
135 if (!right->len) {
136 *cmp = 1;
137 return 0;
139 if (!left->len) {
140 *cmp = -1;
141 return 0;
145 if (left->at != NULL && right->at != NULL) {
146 *cmp = buf_cmp(left->at, left->len, right->at, right->len,
147 ignore_whitespace);
148 return 0;
151 remain_left = left->len;
152 remain_right = right->len;
153 while (remain_left > 0 || remain_right > 0) {
154 const size_t chunksz = 8192;
155 unsigned char buf_left[chunksz], buf_right[chunksz];
156 const uint8_t *p_left, *p_right;
157 off_t n_left, n_right;
158 ssize_t r;
160 if (!remain_right) {
161 *cmp = 1;
162 return 0;
164 if (!remain_left) {
165 *cmp = -1;
166 return 0;
169 n_left = MIN(chunksz, remain_left);
170 n_right = MIN(chunksz, remain_right);
172 if (left->at == NULL) {
173 r = read_at(left->root->f,
174 left->pos + (left->len - remain_left),
175 buf_left, n_left);
176 if (r) {
177 *cmp = 0;
178 return r;
180 p_left = buf_left;
181 } else {
182 p_left = left->at + (left->len - remain_left);
185 if (right->at == NULL) {
186 r = read_at(right->root->f,
187 right->pos + (right->len - remain_right),
188 buf_right, n_right);
189 if (r) {
190 *cmp = 0;
191 return r;
193 p_right = buf_right;
194 } else {
195 p_right = right->at + (right->len - remain_right);
198 r = buf_cmp(p_left, n_left, p_right, n_right,
199 ignore_whitespace);
200 if (r) {
201 *cmp = r;
202 return 0;
205 remain_left -= n_left;
206 remain_right -= n_right;
209 *cmp = 0;
210 return 0;
213 int
214 diff_atom_same(bool *same,
215 const struct diff_atom *left,
216 const struct diff_atom *right)
218 int cmp;
219 int r;
220 if (left->hash != right->hash) {
221 *same = false;
222 return 0;
224 r = diff_atom_cmp(&cmp, left, right);
225 if (r) {
226 *same = true;
227 return r;
229 *same = (cmp == 0);
230 return 0;
233 static struct diff_chunk *
234 diff_state_add_solved_chunk(struct diff_state *state,
235 const struct diff_chunk *chunk)
237 diff_chunk_arraylist_t *result;
238 struct diff_chunk *new_chunk;
239 enum diff_chunk_type last_t;
240 enum diff_chunk_type new_t;
241 struct diff_chunk *last;
243 /* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk
244 * never directly follows a plus chunk. */
245 result = &state->result->chunks;
247 last_t = result->len ? diff_chunk_type(&result->head[result->len - 1])
248 : CHUNK_EMPTY;
249 new_t = diff_chunk_type(chunk);
251 debug("ADD %s chunk #%u:\n", chunk->solved ? "solved" : "UNSOLVED",
252 result->len);
253 debug("L\n");
254 debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count);
255 debug("R\n");
256 debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count);
258 if (result->len) {
259 last = &result->head[result->len - 1];
260 assert(chunk->left_start
261 == last->left_start + last->left_count);
262 assert(chunk->right_start
263 == last->right_start + last->right_count);
266 if (new_t == last_t) {
267 new_chunk = &result->head[result->len - 1];
268 new_chunk->left_count += chunk->left_count;
269 new_chunk->right_count += chunk->right_count;
270 debug(" - added chunk touches previous one of same type, joined:\n");
271 debug("L\n");
272 debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
273 debug("R\n");
274 debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
275 } else if (last_t == CHUNK_PLUS && new_t == CHUNK_MINUS) {
276 enum diff_chunk_type prev_last_t =
277 result->len > 1 ?
278 diff_chunk_type(&result->head[result->len - 2])
279 : CHUNK_EMPTY;
280 /* If a minus-chunk follows a plus-chunk, place it above the plus-chunk->
281 * Is the one before that also a minus? combine. */
282 if (prev_last_t == CHUNK_MINUS) {
283 new_chunk = &result->head[result->len - 2];
284 new_chunk->left_count += chunk->left_count;
285 new_chunk->right_count += chunk->right_count;
287 debug(" - added minus-chunk follows plus-chunk,"
288 " put before that plus-chunk and joined"
289 " with preceding minus-chunk:\n");
290 debug("L\n");
291 debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
292 debug("R\n");
293 debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
294 } else {
295 ARRAYLIST_INSERT(new_chunk, *result, result->len - 1);
296 if (!new_chunk)
297 return NULL;
298 *new_chunk = *chunk;
300 /* The new minus chunk indicates to which position on
301 * the right it corresponds, even though it doesn't add
302 * any lines on the right. By moving above a plus chunk,
303 * that position on the right has shifted. */
304 last = &result->head[result->len - 1];
305 new_chunk->right_start = last->right_start;
307 debug(" - added minus-chunk follows plus-chunk,"
308 " put before that plus-chunk\n");
311 /* That last_t == CHUNK_PLUS indicates to which position on the
312 * left it corresponds, even though it doesn't add any lines on
313 * the left. By inserting/extending the prev_last_t ==
314 * CHUNK_MINUS, that position on the left has shifted. */
315 last = &result->head[result->len - 1];
316 last->left_start = new_chunk->left_start
317 + new_chunk->left_count;
319 } else {
320 ARRAYLIST_ADD(new_chunk, *result);
321 if (!new_chunk)
322 return NULL;
323 *new_chunk = *chunk;
325 return new_chunk;
328 /* Even if a left or right side is empty, diff output may need to know the
329 * position in that file.
330 * So left_start or right_start must never be NULL -- pass left_count or
331 * right_count as zero to indicate staying at that position without consuming
332 * any lines. */
333 struct diff_chunk *
334 diff_state_add_chunk(struct diff_state *state, bool solved,
335 struct diff_atom *left_start, unsigned int left_count,
336 struct diff_atom *right_start, unsigned int right_count)
338 struct diff_chunk *new_chunk;
339 struct diff_chunk chunk = {
340 .solved = solved,
341 .left_start = left_start,
342 .left_count = left_count,
343 .right_start = right_start,
344 .right_count = right_count,
345 };
347 /* An unsolved chunk means store as intermediate result for later
348 * re-iteration.
349 * If there already are intermediate results, that means even a
350 * following solved chunk needs to go to intermediate results, so that
351 * it is later put in the final correct position in solved chunks.
352 */
353 if (!solved || state->temp_result.len) {
354 /* Append to temp_result */
355 debug("ADD %s chunk to temp result:\n",
356 chunk.solved ? "solved" : "UNSOLVED");
357 debug("L\n");
358 debug_dump_atoms(&state->left, left_start, left_count);
359 debug("R\n");
360 debug_dump_atoms(&state->right, right_start, right_count);
361 ARRAYLIST_ADD(new_chunk, state->temp_result);
362 if (!new_chunk)
363 return NULL;
364 *new_chunk = chunk;
365 return new_chunk;
368 return diff_state_add_solved_chunk(state, &chunk);
371 static void
372 diff_data_init_root(struct diff_data *d, FILE *f, const uint8_t *data,
373 unsigned long long len, int diff_flags)
375 *d = (struct diff_data){
376 .f = f,
377 .pos = 0,
378 .data = data,
379 .len = len,
380 .root = d,
381 .diff_flags = diff_flags,
382 };
385 void
386 diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
387 struct diff_atom *from_atom, unsigned int atoms_count)
389 struct diff_atom *last_atom;
391 debug("diff_data %p parent %p from_atom %p atoms_count %u\n",
392 d, parent, from_atom, atoms_count);
393 debug(" from_atom ");
394 debug_dump_atom(parent, NULL, from_atom);
396 if (atoms_count == 0) {
397 *d = (struct diff_data){
398 .f = NULL,
399 .pos = 0,
400 .data = NULL,
401 .len = 0,
402 .root = parent->root,
403 .atoms.head = NULL,
404 .atoms.len = atoms_count,
405 };
407 return;
410 last_atom = from_atom + atoms_count - 1;
411 *d = (struct diff_data){
412 .f = NULL,
413 .pos = from_atom->pos,
414 .data = from_atom->at,
415 .len = (last_atom->pos + last_atom->len) - from_atom->pos,
416 .root = parent->root,
417 .atoms.head = from_atom,
418 .atoms.len = atoms_count,
419 };
421 debug("subsection:\n");
422 debug_dump(d);
425 void
426 diff_data_free(struct diff_data *diff_data)
428 if (!diff_data)
429 return;
430 if (diff_data->atoms.allocated)
431 ARRAYLIST_FREE(diff_data->atoms);
434 int
435 diff_algo_none(const struct diff_algo_config *algo_config,
436 struct diff_state *state)
438 debug("\n** %s\n", __func__);
439 debug("left:\n");
440 debug_dump(&state->left);
441 debug("right:\n");
442 debug_dump(&state->right);
443 debug_dump_myers_graph(&state->left, &state->right, NULL, NULL, 0, NULL,
444 0);
446 /* Add a chunk of equal lines, if any */
447 struct diff_atom *l = state->left.atoms.head;
448 unsigned int l_len = state->left.atoms.len;
449 struct diff_atom *r = state->right.atoms.head;
450 unsigned int r_len = state->right.atoms.len;
451 unsigned int equal_atoms_start = 0;
452 unsigned int equal_atoms_end = 0;
453 unsigned int l_idx = 0;
454 unsigned int r_idx = 0;
456 while (equal_atoms_start < l_len
457 && equal_atoms_start < r_len) {
458 int err;
459 bool same;
460 err = diff_atom_same(&same, &l[equal_atoms_start],
461 &r[equal_atoms_start]);
462 if (err)
463 return err;
464 if (!same)
465 break;
466 equal_atoms_start++;
468 while (equal_atoms_end < (l_len - equal_atoms_start)
469 && equal_atoms_end < (r_len - equal_atoms_start)) {
470 int err;
471 bool same;
472 err = diff_atom_same(&same, &l[l_len - 1 - equal_atoms_end],
473 &r[r_len - 1 - equal_atoms_end]);
474 if (err)
475 return err;
476 if (!same)
477 break;
478 equal_atoms_end++;
481 /* Add a chunk of equal lines at the start */
482 if (equal_atoms_start) {
483 if (!diff_state_add_chunk(state, true,
484 l, equal_atoms_start,
485 r, equal_atoms_start))
486 return ENOMEM;
487 l_idx += equal_atoms_start;
488 r_idx += equal_atoms_start;
491 /* Add a "minus" chunk with all lines from the left. */
492 if (equal_atoms_start + equal_atoms_end < l_len) {
493 unsigned int add_len = l_len - equal_atoms_start - equal_atoms_end;
494 if (!diff_state_add_chunk(state, true,
495 &l[l_idx], add_len,
496 &r[r_idx], 0))
497 return ENOMEM;
498 l_idx += add_len;
501 /* Add a "plus" chunk with all lines from the right. */
502 if (equal_atoms_start + equal_atoms_end < r_len) {
503 unsigned int add_len = r_len - equal_atoms_start - equal_atoms_end;
504 if (!diff_state_add_chunk(state, true,
505 &l[l_idx], 0,
506 &r[r_idx], add_len))
507 return ENOMEM;
508 r_idx += add_len;
511 /* Add a chunk of equal lines at the end */
512 if (equal_atoms_end) {
513 if (!diff_state_add_chunk(state, true,
514 &l[l_idx], equal_atoms_end,
515 &r[r_idx], equal_atoms_end))
516 return ENOMEM;
519 return DIFF_RC_OK;
522 static int
523 diff_run_algo(const struct diff_algo_config *algo_config,
524 struct diff_state *state)
526 int rc;
528 if (!algo_config || !algo_config->impl
529 || !state->recursion_depth_left
530 || !state->left.atoms.len || !state->right.atoms.len) {
531 debug("Fall back to diff_algo_none():%s%s%s\n",
532 (!algo_config || !algo_config->impl) ? " no-cfg" : "",
533 (!state->recursion_depth_left) ? " max-depth" : "",
534 (!state->left.atoms.len || !state->right.atoms.len)?
535 " trivial" : "");
536 return diff_algo_none(algo_config, state);
539 ARRAYLIST_FREE(state->temp_result);
540 ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE);
541 rc = algo_config->impl(algo_config, state);
542 switch (rc) {
543 case DIFF_RC_USE_DIFF_ALGO_FALLBACK:
544 debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n",
545 algo_config->fallback_algo);
546 rc = diff_run_algo(algo_config->fallback_algo, state);
547 goto return_rc;
549 case DIFF_RC_OK:
550 /* continue below */
551 break;
553 default:
554 /* some error happened */
555 goto return_rc;
558 /* Pick up any diff chunks that are still unsolved and feed to
559 * inner_algo. inner_algo will solve unsolved chunks and append to
560 * result, and subsequent solved chunks on this level are then appended
561 * to result afterwards. */
562 int i;
563 for (i = 0; i < state->temp_result.len; i++) {
564 struct diff_chunk *c = &state->temp_result.head[i];
565 if (c->solved) {
566 diff_state_add_solved_chunk(state, c);
567 continue;
570 /* c is an unsolved chunk, feed to inner_algo */
571 struct diff_state inner_state = {
572 .result = state->result,
573 .recursion_depth_left = state->recursion_depth_left - 1,
574 .kd_buf = state->kd_buf,
575 .kd_buf_size = state->kd_buf_size,
576 };
577 diff_data_init_subsection(&inner_state.left, &state->left,
578 c->left_start, c->left_count);
579 diff_data_init_subsection(&inner_state.right, &state->right,
580 c->right_start, c->right_count);
582 rc = diff_run_algo(algo_config->inner_algo, &inner_state);
583 state->kd_buf = inner_state.kd_buf;
584 state->kd_buf_size = inner_state.kd_buf_size;
585 if (rc != DIFF_RC_OK)
586 goto return_rc;
589 rc = DIFF_RC_OK;
590 return_rc:
591 ARRAYLIST_FREE(state->temp_result);
592 return rc;
595 int
596 diff_atomize_file(struct diff_data *d,
597 const struct diff_config *config,
598 FILE *f, const uint8_t *data, off_t len, int diff_flags)
600 if (!config->atomize_func)
601 return EINVAL;
603 diff_data_init_root(d, f, data, len, diff_flags);
605 return config->atomize_func(config->atomize_func_data, d);
609 struct diff_result *
610 diff_main(const struct diff_config *config, struct diff_data *left,
611 struct diff_data *right)
613 struct diff_result *result = malloc(sizeof(struct diff_result));
614 if (!result)
615 return NULL;
617 *result = (struct diff_result){};
618 result->left = left;
619 result->right = right;
621 struct diff_state state = {
622 .result = result,
623 .recursion_depth_left = config->max_recursion_depth ?
624 config->max_recursion_depth : UINT_MAX,
625 .kd_buf = NULL,
626 .kd_buf_size = 0,
627 };
628 diff_data_init_subsection(&state.left, left,
629 left->atoms.head,
630 left->atoms.len);
631 diff_data_init_subsection(&state.right, right,
632 right->atoms.head,
633 right->atoms.len);
635 result->rc = diff_run_algo(config->algo, &state);
636 free(state.kd_buf);
638 return result;
641 void
642 diff_result_free(struct diff_result *result)
644 if (!result)
645 return;
646 ARRAYLIST_FREE(result->chunks);
647 free(result);
650 int
651 diff_result_contains_printable_chunks(struct diff_result *result)
653 struct diff_chunk *c;
654 enum diff_chunk_type t;
655 int i;
657 for (i = 0; i < result->chunks.len; i++) {
658 c = &result->chunks.head[i];
659 t = diff_chunk_type(c);
660 if (t == CHUNK_MINUS || t == CHUNK_PLUS)
661 return 1;
664 return 0;