Blob


1 /* Generic infrastructure to implement various diff algorithms (implementation). */
2 /*
3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
19 #include <sys/queue.h>
20 #include <ctype.h>
21 #include <errno.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24 #include <stdbool.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <limits.h>
28 #include <unistd.h>
30 #include <assert.h>
32 #include <arraylist.h>
33 #include <diff_main.h>
35 #include "got_compat.h"
37 #include "diff_internal.h"
38 #include "diff_debug.h"
40 static int
41 read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len)
42 {
43 int r;
44 if (fseeko(f, at_pos, SEEK_SET) == -1)
45 return errno;
46 r = fread(buf, sizeof(char), len, f);
47 if ((r == 0 || r < len) && ferror(f))
48 return EIO;
49 if (r != len)
50 return EIO;
51 return 0;
52 }
54 static int
55 buf_cmp(const unsigned char *left, size_t left_len,
56 const unsigned char *right, size_t right_len,
57 bool ignore_whitespace)
58 {
59 int cmp;
61 if (ignore_whitespace) {
62 int il = 0, ir = 0;
63 while (il < left_len && ir < right_len) {
64 unsigned char cl = left[il];
65 unsigned char cr = right[ir];
67 if (isspace((unsigned char)cl) && il < left_len) {
68 il++;
69 continue;
70 }
71 if (isspace((unsigned char)cr) && ir < right_len) {
72 ir++;
73 continue;
74 }
76 if (cl > cr)
77 return 1;
78 if (cr > cl)
79 return -1;
80 il++;
81 ir++;
82 }
83 while (il < left_len) {
84 unsigned char cl = left[il++];
85 if (!isspace((unsigned char)cl))
86 return 1;
87 }
88 while (ir < right_len) {
89 unsigned char cr = right[ir++];
90 if (!isspace((unsigned char)cr))
91 return -1;
92 }
94 return 0;
95 }
97 cmp = memcmp(left, right, MIN(left_len, right_len));
98 if (cmp)
99 return cmp;
100 if (left_len == right_len)
101 return 0;
102 return (left_len > right_len) ? 1 : -1;
105 int
106 diff_atom_cmp(int *cmp,
107 const struct diff_atom *left,
108 const struct diff_atom *right)
110 off_t remain_left, remain_right;
111 int flags = (left->root->diff_flags | right->root->diff_flags);
112 bool ignore_whitespace = (flags & DIFF_FLAG_IGNORE_WHITESPACE);
114 if (!left->len && !right->len) {
115 *cmp = 0;
116 return 0;
118 if (!ignore_whitespace) {
119 if (!right->len) {
120 *cmp = 1;
121 return 0;
123 if (!left->len) {
124 *cmp = -1;
125 return 0;
129 if (left->at != NULL && right->at != NULL) {
130 *cmp = buf_cmp(left->at, left->len, right->at, right->len,
131 ignore_whitespace);
132 return 0;
135 remain_left = left->len;
136 remain_right = right->len;
137 while (remain_left > 0 || remain_right > 0) {
138 const size_t chunksz = 8192;
139 unsigned char buf_left[chunksz], buf_right[chunksz];
140 const uint8_t *p_left, *p_right;
141 off_t n_left, n_right;
142 ssize_t r;
144 if (!remain_right) {
145 *cmp = 1;
146 return 0;
148 if (!remain_left) {
149 *cmp = -1;
150 return 0;
153 n_left = MIN(chunksz, remain_left);
154 n_right = MIN(chunksz, remain_right);
156 if (left->at == NULL) {
157 r = read_at(left->root->f,
158 left->pos + (left->len - remain_left),
159 buf_left, n_left);
160 if (r) {
161 *cmp = 0;
162 return r;
164 p_left = buf_left;
165 } else {
166 p_left = left->at + (left->len - remain_left);
169 if (right->at == NULL) {
170 r = read_at(right->root->f,
171 right->pos + (right->len - remain_right),
172 buf_right, n_right);
173 if (r) {
174 *cmp = 0;
175 return r;
177 p_right = buf_right;
178 } else {
179 p_right = right->at + (right->len - remain_right);
182 r = buf_cmp(p_left, n_left, p_right, n_right,
183 ignore_whitespace);
184 if (r) {
185 *cmp = r;
186 return 0;
189 remain_left -= n_left;
190 remain_right -= n_right;
193 *cmp = 0;
194 return 0;
197 int
198 diff_atom_same(bool *same,
199 const struct diff_atom *left,
200 const struct diff_atom *right)
202 int cmp;
203 int r;
204 if (left->hash != right->hash) {
205 *same = false;
206 return 0;
208 r = diff_atom_cmp(&cmp, left, right);
209 if (r) {
210 *same = true;
211 return r;
213 *same = (cmp == 0);
214 return 0;
217 static struct diff_chunk *
218 diff_state_add_solved_chunk(struct diff_state *state,
219 const struct diff_chunk *chunk)
221 diff_chunk_arraylist_t *result;
222 struct diff_chunk *new_chunk;
223 enum diff_chunk_type last_t;
224 enum diff_chunk_type new_t;
225 struct diff_chunk *last;
227 /* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk
228 * never directly follows a plus chunk. */
229 result = &state->result->chunks;
231 last_t = result->len ? diff_chunk_type(&result->head[result->len - 1])
232 : CHUNK_EMPTY;
233 new_t = diff_chunk_type(chunk);
235 debug("ADD %s chunk #%u:\n", chunk->solved ? "solved" : "UNSOLVED",
236 result->len);
237 debug("L\n");
238 debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count);
239 debug("R\n");
240 debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count);
242 if (result->len) {
243 last = &result->head[result->len - 1];
244 assert(chunk->left_start
245 == last->left_start + last->left_count);
246 assert(chunk->right_start
247 == last->right_start + last->right_count);
250 if (new_t == last_t) {
251 new_chunk = &result->head[result->len - 1];
252 new_chunk->left_count += chunk->left_count;
253 new_chunk->right_count += chunk->right_count;
254 debug(" - added chunk touches previous one of same type, joined:\n");
255 debug("L\n");
256 debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
257 debug("R\n");
258 debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
259 } else if (last_t == CHUNK_PLUS && new_t == CHUNK_MINUS) {
260 enum diff_chunk_type prev_last_t =
261 result->len > 1 ?
262 diff_chunk_type(&result->head[result->len - 2])
263 : CHUNK_EMPTY;
264 /* If a minus-chunk follows a plus-chunk, place it above the plus-chunk->
265 * Is the one before that also a minus? combine. */
266 if (prev_last_t == CHUNK_MINUS) {
267 new_chunk = &result->head[result->len - 2];
268 new_chunk->left_count += chunk->left_count;
269 new_chunk->right_count += chunk->right_count;
271 debug(" - added minus-chunk follows plus-chunk,"
272 " put before that plus-chunk and joined"
273 " with preceding minus-chunk:\n");
274 debug("L\n");
275 debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
276 debug("R\n");
277 debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
278 } else {
279 ARRAYLIST_INSERT(new_chunk, *result, result->len - 1);
280 if (!new_chunk)
281 return NULL;
282 *new_chunk = *chunk;
284 /* The new minus chunk indicates to which position on
285 * the right it corresponds, even though it doesn't add
286 * any lines on the right. By moving above a plus chunk,
287 * that position on the right has shifted. */
288 last = &result->head[result->len - 1];
289 new_chunk->right_start = last->right_start;
291 debug(" - added minus-chunk follows plus-chunk,"
292 " put before that plus-chunk\n");
295 /* That last_t == CHUNK_PLUS indicates to which position on the
296 * left it corresponds, even though it doesn't add any lines on
297 * the left. By inserting/extending the prev_last_t ==
298 * CHUNK_MINUS, that position on the left has shifted. */
299 last = &result->head[result->len - 1];
300 last->left_start = new_chunk->left_start
301 + new_chunk->left_count;
303 } else {
304 ARRAYLIST_ADD(new_chunk, *result);
305 if (!new_chunk)
306 return NULL;
307 *new_chunk = *chunk;
309 return new_chunk;
312 /* Even if a left or right side is empty, diff output may need to know the
313 * position in that file.
314 * So left_start or right_start must never be NULL -- pass left_count or
315 * right_count as zero to indicate staying at that position without consuming
316 * any lines. */
317 struct diff_chunk *
318 diff_state_add_chunk(struct diff_state *state, bool solved,
319 struct diff_atom *left_start, unsigned int left_count,
320 struct diff_atom *right_start, unsigned int right_count)
322 struct diff_chunk *new_chunk;
323 struct diff_chunk chunk = {
324 .solved = solved,
325 .left_start = left_start,
326 .left_count = left_count,
327 .right_start = right_start,
328 .right_count = right_count,
329 };
331 /* An unsolved chunk means store as intermediate result for later
332 * re-iteration.
333 * If there already are intermediate results, that means even a
334 * following solved chunk needs to go to intermediate results, so that
335 * it is later put in the final correct position in solved chunks.
336 */
337 if (!solved || state->temp_result.len) {
338 /* Append to temp_result */
339 debug("ADD %s chunk to temp result:\n",
340 chunk.solved ? "solved" : "UNSOLVED");
341 debug("L\n");
342 debug_dump_atoms(&state->left, left_start, left_count);
343 debug("R\n");
344 debug_dump_atoms(&state->right, right_start, right_count);
345 ARRAYLIST_ADD(new_chunk, state->temp_result);
346 if (!new_chunk)
347 return NULL;
348 *new_chunk = chunk;
349 return new_chunk;
352 return diff_state_add_solved_chunk(state, &chunk);
355 static void
356 diff_data_init_root(struct diff_data *d, FILE *f, const uint8_t *data,
357 unsigned long long len, int diff_flags)
359 *d = (struct diff_data){
360 .f = f,
361 .pos = 0,
362 .data = data,
363 .len = len,
364 .root = d,
365 .diff_flags = diff_flags,
366 };
369 void
370 diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
371 struct diff_atom *from_atom, unsigned int atoms_count)
373 struct diff_atom *last_atom;
375 debug("diff_data %p parent %p from_atom %p atoms_count %u\n",
376 d, parent, from_atom, atoms_count);
377 debug(" from_atom ");
378 debug_dump_atom(parent, NULL, from_atom);
380 if (atoms_count == 0) {
381 *d = (struct diff_data){
382 .f = NULL,
383 .pos = 0,
384 .data = NULL,
385 .len = 0,
386 .root = parent->root,
387 .atoms.head = NULL,
388 .atoms.len = atoms_count,
389 };
391 return;
394 last_atom = from_atom + atoms_count - 1;
395 *d = (struct diff_data){
396 .f = NULL,
397 .pos = from_atom->pos,
398 .data = from_atom->at,
399 .len = (last_atom->pos + last_atom->len) - from_atom->pos,
400 .root = parent->root,
401 .atoms.head = from_atom,
402 .atoms.len = atoms_count,
403 };
405 debug("subsection:\n");
406 debug_dump(d);
409 void
410 diff_data_free(struct diff_data *diff_data)
412 if (!diff_data)
413 return;
414 if (diff_data->atoms.allocated)
415 ARRAYLIST_FREE(diff_data->atoms);
418 int
419 diff_algo_none(const struct diff_algo_config *algo_config,
420 struct diff_state *state)
422 debug("\n** %s\n", __func__);
423 debug("left:\n");
424 debug_dump(&state->left);
425 debug("right:\n");
426 debug_dump(&state->right);
427 debug_dump_myers_graph(&state->left, &state->right, NULL, NULL, 0, NULL,
428 0);
430 /* Add a chunk of equal lines, if any */
431 struct diff_atom *l = state->left.atoms.head;
432 unsigned int l_len = state->left.atoms.len;
433 struct diff_atom *r = state->right.atoms.head;
434 unsigned int r_len = state->right.atoms.len;
435 unsigned int equal_atoms_start = 0;
436 unsigned int equal_atoms_end = 0;
437 unsigned int l_idx = 0;
438 unsigned int r_idx = 0;
440 while (equal_atoms_start < l_len
441 && equal_atoms_start < r_len) {
442 int err;
443 bool same;
444 err = diff_atom_same(&same, &l[equal_atoms_start],
445 &r[equal_atoms_start]);
446 if (err)
447 return err;
448 if (!same)
449 break;
450 equal_atoms_start++;
452 while (equal_atoms_end < (l_len - equal_atoms_start)
453 && equal_atoms_end < (r_len - equal_atoms_start)) {
454 int err;
455 bool same;
456 err = diff_atom_same(&same, &l[l_len - 1 - equal_atoms_end],
457 &r[r_len - 1 - equal_atoms_end]);
458 if (err)
459 return err;
460 if (!same)
461 break;
462 equal_atoms_end++;
465 /* Add a chunk of equal lines at the start */
466 if (equal_atoms_start) {
467 if (!diff_state_add_chunk(state, true,
468 l, equal_atoms_start,
469 r, equal_atoms_start))
470 return ENOMEM;
471 l_idx += equal_atoms_start;
472 r_idx += equal_atoms_start;
475 /* Add a "minus" chunk with all lines from the left. */
476 if (equal_atoms_start + equal_atoms_end < l_len) {
477 unsigned int add_len = l_len - equal_atoms_start - equal_atoms_end;
478 if (!diff_state_add_chunk(state, true,
479 &l[l_idx], add_len,
480 &r[r_idx], 0))
481 return ENOMEM;
482 l_idx += add_len;
485 /* Add a "plus" chunk with all lines from the right. */
486 if (equal_atoms_start + equal_atoms_end < r_len) {
487 unsigned int add_len = r_len - equal_atoms_start - equal_atoms_end;
488 if (!diff_state_add_chunk(state, true,
489 &l[l_idx], 0,
490 &r[r_idx], add_len))
491 return ENOMEM;
492 r_idx += add_len;
495 /* Add a chunk of equal lines at the end */
496 if (equal_atoms_end) {
497 if (!diff_state_add_chunk(state, true,
498 &l[l_idx], equal_atoms_end,
499 &r[r_idx], equal_atoms_end))
500 return ENOMEM;
503 return DIFF_RC_OK;
506 static int
507 diff_run_algo(const struct diff_algo_config *algo_config,
508 struct diff_state *state)
510 int rc;
512 if (!algo_config || !algo_config->impl
513 || !state->recursion_depth_left
514 || !state->left.atoms.len || !state->right.atoms.len) {
515 debug("Fall back to diff_algo_none():%s%s%s\n",
516 (!algo_config || !algo_config->impl) ? " no-cfg" : "",
517 (!state->recursion_depth_left) ? " max-depth" : "",
518 (!state->left.atoms.len || !state->right.atoms.len)?
519 " trivial" : "");
520 return diff_algo_none(algo_config, state);
523 ARRAYLIST_FREE(state->temp_result);
524 ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE);
525 rc = algo_config->impl(algo_config, state);
526 switch (rc) {
527 case DIFF_RC_USE_DIFF_ALGO_FALLBACK:
528 debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n",
529 algo_config->fallback_algo);
530 rc = diff_run_algo(algo_config->fallback_algo, state);
531 goto return_rc;
533 case DIFF_RC_OK:
534 /* continue below */
535 break;
537 default:
538 /* some error happened */
539 goto return_rc;
542 /* Pick up any diff chunks that are still unsolved and feed to
543 * inner_algo. inner_algo will solve unsolved chunks and append to
544 * result, and subsequent solved chunks on this level are then appended
545 * to result afterwards. */
546 int i;
547 for (i = 0; i < state->temp_result.len; i++) {
548 struct diff_chunk *c = &state->temp_result.head[i];
549 if (c->solved) {
550 diff_state_add_solved_chunk(state, c);
551 continue;
554 /* c is an unsolved chunk, feed to inner_algo */
555 struct diff_state inner_state = {
556 .result = state->result,
557 .recursion_depth_left = state->recursion_depth_left - 1,
558 .kd_buf = state->kd_buf,
559 .kd_buf_size = state->kd_buf_size,
560 };
561 diff_data_init_subsection(&inner_state.left, &state->left,
562 c->left_start, c->left_count);
563 diff_data_init_subsection(&inner_state.right, &state->right,
564 c->right_start, c->right_count);
566 rc = diff_run_algo(algo_config->inner_algo, &inner_state);
567 state->kd_buf = inner_state.kd_buf;
568 state->kd_buf_size = inner_state.kd_buf_size;
569 if (rc != DIFF_RC_OK)
570 goto return_rc;
573 rc = DIFF_RC_OK;
574 return_rc:
575 ARRAYLIST_FREE(state->temp_result);
576 return rc;
579 int
580 diff_atomize_file(struct diff_data *d,
581 const struct diff_config *config,
582 FILE *f, const uint8_t *data, off_t len, int diff_flags)
584 if (!config->atomize_func)
585 return EINVAL;
587 diff_data_init_root(d, f, data, len, diff_flags);
589 return config->atomize_func(config->atomize_func_data, d);
593 struct diff_result *
594 diff_main(const struct diff_config *config, struct diff_data *left,
595 struct diff_data *right)
597 struct diff_result *result = malloc(sizeof(struct diff_result));
598 if (!result)
599 return NULL;
601 *result = (struct diff_result){};
602 result->left = left;
603 result->right = right;
605 struct diff_state state = {
606 .result = result,
607 .recursion_depth_left = config->max_recursion_depth ?
608 config->max_recursion_depth : UINT_MAX,
609 .kd_buf = NULL,
610 .kd_buf_size = 0,
611 };
612 diff_data_init_subsection(&state.left, left,
613 left->atoms.head,
614 left->atoms.len);
615 diff_data_init_subsection(&state.right, right,
616 right->atoms.head,
617 right->atoms.len);
619 result->rc = diff_run_algo(config->algo, &state);
620 free(state.kd_buf);
622 return result;
625 void
626 diff_result_free(struct diff_result *result)
628 if (!result)
629 return;
630 ARRAYLIST_FREE(result->chunks);
631 free(result);
634 int
635 diff_result_contains_printable_chunks(struct diff_result *result)
637 struct diff_chunk *c;
638 enum diff_chunk_type t;
639 int i;
641 for (i = 0; i < result->chunks.len; i++) {
642 c = &result->chunks.head[i];
643 t = diff_chunk_type(c);
644 if (t == CHUNK_MINUS || t == CHUNK_PLUS)
645 return 1;
648 return 0;