Blob


1 /* Generic infrastructure to implement various diff algorithms (implementation). */
2 /*
3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
19 #include <ctype.h>
20 #include <errno.h>
21 #include <stdint.h>
22 #include <stdlib.h>
23 #include <stdbool.h>
24 #include <stdio.h>
25 #include <string.h>
26 #include <limits.h>
27 #include <unistd.h>
29 #include <assert.h>
31 #include <arraylist.h>
32 #include <diff_main.h>
34 #include "got_compat.h"
36 #include "diff_internal.h"
37 #include "diff_debug.h"
39 static int
40 read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len)
41 {
42 int r;
43 if (fseeko(f, at_pos, SEEK_SET) == -1)
44 return errno;
45 r = fread(buf, sizeof(char), len, f);
46 if ((r == 0 || r < len) && ferror(f))
47 return errno;
48 if (r != len)
49 return EIO;
50 return 0;
51 }
53 static int
54 buf_cmp(const unsigned char *left, size_t left_len,
55 const unsigned char *right, size_t right_len,
56 bool ignore_whitespace)
57 {
58 int cmp;
60 if (ignore_whitespace) {
61 int il = 0, ir = 0;
62 while (il < left_len && ir < right_len) {
63 unsigned char cl = left[il];
64 unsigned char cr = right[ir];
66 if (isspace(cl) && il < left_len) {
67 il++;
68 continue;
69 }
70 if (isspace(cr) && ir < right_len) {
71 ir++;
72 continue;
73 }
75 if (cl > cr)
76 return 1;
77 if (cr > cl)
78 return -1;
79 il++;
80 ir++;
81 }
82 while (il < left_len) {
83 unsigned char cl = left[il++];
84 if (!isspace(cl))
85 return 1;
86 }
87 while (ir < right_len) {
88 unsigned char cr = right[ir++];
89 if (!isspace(cr))
90 return -1;
91 }
93 return 0;
94 }
96 cmp = memcmp(left, right, MIN(left_len, right_len));
97 if (cmp)
98 return cmp;
99 if (left_len == right_len)
100 return 0;
101 return (left_len > right_len) ? 1 : -1;
104 int
105 diff_atom_cmp(int *cmp,
106 const struct diff_atom *left,
107 const struct diff_atom *right)
109 off_t remain_left, remain_right;
110 int flags = (left->root->diff_flags | right->root->diff_flags);
111 bool ignore_whitespace = (flags & DIFF_FLAG_IGNORE_WHITESPACE);
113 if (!left->len && !right->len) {
114 *cmp = 0;
115 return 0;
117 if (!ignore_whitespace) {
118 if (!right->len) {
119 *cmp = 1;
120 return 0;
122 if (!left->len) {
123 *cmp = -1;
124 return 0;
128 if (left->at != NULL && right->at != NULL) {
129 *cmp = buf_cmp(left->at, left->len, right->at, right->len,
130 ignore_whitespace);
131 return 0;
134 remain_left = left->len;
135 remain_right = right->len;
136 while (remain_left > 0 || remain_right > 0) {
137 const size_t chunksz = 8192;
138 unsigned char buf_left[chunksz], buf_right[chunksz];
139 const uint8_t *p_left, *p_right;
140 off_t n_left, n_right;
141 ssize_t r;
143 if (!remain_right) {
144 *cmp = 1;
145 return 0;
147 if (!remain_left) {
148 *cmp = -1;
149 return 0;
152 n_left = MIN(chunksz, remain_left);
153 n_right = MIN(chunksz, remain_right);
155 if (left->at == NULL) {
156 r = read_at(left->root->f,
157 left->pos + (left->len - remain_left),
158 buf_left, n_left);
159 if (r) {
160 *cmp = 0;
161 return r;
163 p_left = buf_left;
164 } else {
165 p_left = left->at + (left->len - remain_left);
168 if (right->at == NULL) {
169 r = read_at(right->root->f,
170 right->pos + (right->len - remain_right),
171 buf_right, n_right);
172 if (r) {
173 *cmp = 0;
174 return r;
176 p_right = buf_right;
177 } else {
178 p_right = right->at + (right->len - remain_right);
181 r = buf_cmp(p_left, n_left, p_right, n_right,
182 ignore_whitespace);
183 if (r) {
184 *cmp = r;
185 return 0;
188 remain_left -= n_left;
189 remain_right -= n_right;
192 *cmp = 0;
193 return 0;
196 int
197 diff_atom_same(bool *same,
198 const struct diff_atom *left,
199 const struct diff_atom *right)
201 int cmp;
202 int r;
203 if (left->hash != right->hash) {
204 *same = false;
205 return 0;
207 r = diff_atom_cmp(&cmp, left, right);
208 if (r) {
209 *same = true;
210 return r;
212 *same = (cmp == 0);
213 return 0;
216 static struct diff_chunk *
217 diff_state_add_solved_chunk(struct diff_state *state,
218 const struct diff_chunk *chunk)
220 diff_chunk_arraylist_t *result;
221 struct diff_chunk *new_chunk;
222 enum diff_chunk_type last_t;
223 enum diff_chunk_type new_t;
224 struct diff_chunk *last;
226 /* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk
227 * never directly follows a plus chunk. */
228 result = &state->result->chunks;
230 last_t = result->len ? diff_chunk_type(&result->head[result->len - 1])
231 : CHUNK_EMPTY;
232 new_t = diff_chunk_type(chunk);
234 debug("ADD %s chunk #%u:\n", chunk->solved ? "solved" : "UNSOLVED",
235 result->len);
236 debug("L\n");
237 debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count);
238 debug("R\n");
239 debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count);
241 if (result->len) {
242 last = &result->head[result->len - 1];
243 assert(chunk->left_start
244 == last->left_start + last->left_count);
245 assert(chunk->right_start
246 == last->right_start + last->right_count);
249 if (new_t == last_t) {
250 new_chunk = &result->head[result->len - 1];
251 new_chunk->left_count += chunk->left_count;
252 new_chunk->right_count += chunk->right_count;
253 debug(" - added chunk touches previous one of same type, joined:\n");
254 debug("L\n");
255 debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
256 debug("R\n");
257 debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
258 } else if (last_t == CHUNK_PLUS && new_t == CHUNK_MINUS) {
259 enum diff_chunk_type prev_last_t =
260 result->len > 1 ?
261 diff_chunk_type(&result->head[result->len - 2])
262 : CHUNK_EMPTY;
263 /* If a minus-chunk follows a plus-chunk, place it above the plus-chunk->
264 * Is the one before that also a minus? combine. */
265 if (prev_last_t == CHUNK_MINUS) {
266 new_chunk = &result->head[result->len - 2];
267 new_chunk->left_count += chunk->left_count;
268 new_chunk->right_count += chunk->right_count;
270 debug(" - added minus-chunk follows plus-chunk,"
271 " put before that plus-chunk and joined"
272 " with preceding minus-chunk:\n");
273 debug("L\n");
274 debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
275 debug("R\n");
276 debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
277 } else {
278 ARRAYLIST_INSERT(new_chunk, *result, result->len - 1);
279 if (!new_chunk)
280 return NULL;
281 *new_chunk = *chunk;
283 /* The new minus chunk indicates to which position on
284 * the right it corresponds, even though it doesn't add
285 * any lines on the right. By moving above a plus chunk,
286 * that position on the right has shifted. */
287 last = &result->head[result->len - 1];
288 new_chunk->right_start = last->right_start;
290 debug(" - added minus-chunk follows plus-chunk,"
291 " put before that plus-chunk\n");
294 /* That last_t == CHUNK_PLUS indicates to which position on the
295 * left it corresponds, even though it doesn't add any lines on
296 * the left. By inserting/extending the prev_last_t ==
297 * CHUNK_MINUS, that position on the left has shifted. */
298 last = &result->head[result->len - 1];
299 last->left_start = new_chunk->left_start
300 + new_chunk->left_count;
302 } else {
303 ARRAYLIST_ADD(new_chunk, *result);
304 if (!new_chunk)
305 return NULL;
306 *new_chunk = *chunk;
308 return new_chunk;
311 /* Even if a left or right side is empty, diff output may need to know the
312 * position in that file.
313 * So left_start or right_start must never be NULL -- pass left_count or
314 * right_count as zero to indicate staying at that position without consuming
315 * any lines. */
316 struct diff_chunk *
317 diff_state_add_chunk(struct diff_state *state, bool solved,
318 struct diff_atom *left_start, unsigned int left_count,
319 struct diff_atom *right_start, unsigned int right_count)
321 struct diff_chunk *new_chunk;
322 struct diff_chunk chunk = {
323 .solved = solved,
324 .left_start = left_start,
325 .left_count = left_count,
326 .right_start = right_start,
327 .right_count = right_count,
328 };
330 /* An unsolved chunk means store as intermediate result for later
331 * re-iteration.
332 * If there already are intermediate results, that means even a
333 * following solved chunk needs to go to intermediate results, so that
334 * it is later put in the final correct position in solved chunks.
335 */
336 if (!solved || state->temp_result.len) {
337 /* Append to temp_result */
338 debug("ADD %s chunk to temp result:\n",
339 chunk.solved ? "solved" : "UNSOLVED");
340 debug("L\n");
341 debug_dump_atoms(&state->left, left_start, left_count);
342 debug("R\n");
343 debug_dump_atoms(&state->right, right_start, right_count);
344 ARRAYLIST_ADD(new_chunk, state->temp_result);
345 if (!new_chunk)
346 return NULL;
347 *new_chunk = chunk;
348 return new_chunk;
351 return diff_state_add_solved_chunk(state, &chunk);
354 void
355 diff_data_init_root(struct diff_data *d, FILE *f, const uint8_t *data,
356 unsigned long long len, int diff_flags)
358 *d = (struct diff_data){
359 .f = f,
360 .pos = 0,
361 .data = data,
362 .len = len,
363 .root = d,
364 .diff_flags = diff_flags,
365 };
368 void
369 diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
370 struct diff_atom *from_atom, unsigned int atoms_count)
372 struct diff_atom *last_atom;
374 debug("diff_data %p parent %p from_atom %p atoms_count %u\n",
375 d, parent, from_atom, atoms_count);
376 debug(" from_atom ");
377 debug_dump_atom(parent, NULL, from_atom);
379 if (atoms_count == 0) {
380 *d = (struct diff_data){
381 .f = NULL,
382 .pos = 0,
383 .data = NULL,
384 .len = 0,
385 .root = parent->root,
386 .atoms.head = NULL,
387 .atoms.len = atoms_count,
388 };
390 return;
393 last_atom = from_atom + atoms_count - 1;
394 *d = (struct diff_data){
395 .f = NULL,
396 .pos = from_atom->pos,
397 .data = from_atom->at,
398 .len = (last_atom->pos + last_atom->len) - from_atom->pos,
399 .root = parent->root,
400 .atoms.head = from_atom,
401 .atoms.len = atoms_count,
402 };
404 debug("subsection:\n");
405 debug_dump(d);
408 void
409 diff_data_free(struct diff_data *diff_data)
411 if (!diff_data)
412 return;
413 if (diff_data->atoms.allocated)
414 ARRAYLIST_FREE(diff_data->atoms);
417 int
418 diff_algo_none(const struct diff_algo_config *algo_config,
419 struct diff_state *state)
421 debug("\n** %s\n", __func__);
422 debug("left:\n");
423 debug_dump(&state->left);
424 debug("right:\n");
425 debug_dump(&state->right);
426 debug_dump_myers_graph(&state->left, &state->right, NULL, NULL, 0, NULL,
427 0);
429 /* Add a chunk of equal lines, if any */
430 struct diff_atom *l = state->left.atoms.head;
431 unsigned int l_len = state->left.atoms.len;
432 struct diff_atom *r = state->right.atoms.head;
433 unsigned int r_len = state->right.atoms.len;
434 unsigned int equal_atoms_start = 0;
435 unsigned int equal_atoms_end = 0;
436 unsigned int l_idx = 0;
437 unsigned int r_idx = 0;
439 while (equal_atoms_start < l_len
440 && equal_atoms_start < r_len) {
441 int err;
442 bool same;
443 err = diff_atom_same(&same, &l[equal_atoms_start],
444 &r[equal_atoms_start]);
445 if (err)
446 return err;
447 if (!same)
448 break;
449 equal_atoms_start++;
451 while (equal_atoms_end < (l_len - equal_atoms_start)
452 && equal_atoms_end < (r_len - equal_atoms_start)) {
453 int err;
454 bool same;
455 err = diff_atom_same(&same, &l[l_len - 1 - equal_atoms_end],
456 &r[r_len - 1 - equal_atoms_end]);
457 if (err)
458 return err;
459 if (!same)
460 break;
461 equal_atoms_end++;
464 /* Add a chunk of equal lines at the start */
465 if (equal_atoms_start) {
466 if (!diff_state_add_chunk(state, true,
467 l, equal_atoms_start,
468 r, equal_atoms_start))
469 return ENOMEM;
470 l_idx += equal_atoms_start;
471 r_idx += equal_atoms_start;
474 /* Add a "minus" chunk with all lines from the left. */
475 if (equal_atoms_start + equal_atoms_end < l_len) {
476 unsigned int add_len = l_len - equal_atoms_start - equal_atoms_end;
477 if (!diff_state_add_chunk(state, true,
478 &l[l_idx], add_len,
479 &r[r_idx], 0))
480 return ENOMEM;
481 l_idx += add_len;
484 /* Add a "plus" chunk with all lines from the right. */
485 if (equal_atoms_start + equal_atoms_end < r_len) {
486 unsigned int add_len = r_len - equal_atoms_start - equal_atoms_end;
487 if (!diff_state_add_chunk(state, true,
488 &l[l_idx], 0,
489 &r[r_idx], add_len))
490 return ENOMEM;
491 r_idx += add_len;
494 /* Add a chunk of equal lines at the end */
495 if (equal_atoms_end) {
496 if (!diff_state_add_chunk(state, true,
497 &l[l_idx], equal_atoms_end,
498 &r[r_idx], equal_atoms_end))
499 return ENOMEM;
502 return DIFF_RC_OK;
505 int
506 diff_run_algo(const struct diff_algo_config *algo_config,
507 struct diff_state *state)
509 int rc;
511 if (!algo_config || !algo_config->impl
512 || !state->recursion_depth_left
513 || !state->left.atoms.len || !state->right.atoms.len) {
514 debug("Fall back to diff_algo_none():%s%s%s\n",
515 (!algo_config || !algo_config->impl) ? " no-cfg" : "",
516 (!state->recursion_depth_left) ? " max-depth" : "",
517 (!state->left.atoms.len || !state->right.atoms.len)?
518 " trivial" : "");
519 return diff_algo_none(algo_config, state);
522 ARRAYLIST_FREE(state->temp_result);
523 ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE);
524 rc = algo_config->impl(algo_config, state);
525 switch (rc) {
526 case DIFF_RC_USE_DIFF_ALGO_FALLBACK:
527 debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n",
528 algo_config->fallback_algo);
529 rc = diff_run_algo(algo_config->fallback_algo, state);
530 goto return_rc;
532 case DIFF_RC_OK:
533 /* continue below */
534 break;
536 default:
537 /* some error happened */
538 goto return_rc;
541 /* Pick up any diff chunks that are still unsolved and feed to
542 * inner_algo. inner_algo will solve unsolved chunks and append to
543 * result, and subsequent solved chunks on this level are then appended
544 * to result afterwards. */
545 int i;
546 for (i = 0; i < state->temp_result.len; i++) {
547 struct diff_chunk *c = &state->temp_result.head[i];
548 if (c->solved) {
549 diff_state_add_solved_chunk(state, c);
550 continue;
553 /* c is an unsolved chunk, feed to inner_algo */
554 struct diff_state inner_state = {
555 .result = state->result,
556 .recursion_depth_left = state->recursion_depth_left - 1,
557 .kd_buf = state->kd_buf,
558 .kd_buf_size = state->kd_buf_size,
559 };
560 diff_data_init_subsection(&inner_state.left, &state->left,
561 c->left_start, c->left_count);
562 diff_data_init_subsection(&inner_state.right, &state->right,
563 c->right_start, c->right_count);
565 rc = diff_run_algo(algo_config->inner_algo, &inner_state);
566 state->kd_buf = inner_state.kd_buf;
567 state->kd_buf_size = inner_state.kd_buf_size;
568 if (rc != DIFF_RC_OK)
569 goto return_rc;
572 rc = DIFF_RC_OK;
573 return_rc:
574 ARRAYLIST_FREE(state->temp_result);
575 return rc;
578 int
579 diff_atomize_file(struct diff_data *d,
580 const struct diff_config *config,
581 FILE *f, const uint8_t *data, off_t len, int diff_flags)
583 if (!config->atomize_func)
584 return EINVAL;
586 diff_data_init_root(d, f, data, len, diff_flags);
588 return config->atomize_func(config->atomize_func_data, d);
592 struct diff_result *
593 diff_main(const struct diff_config *config, struct diff_data *left,
594 struct diff_data *right)
596 struct diff_result *result = malloc(sizeof(struct diff_result));
597 if (!result)
598 return NULL;
600 *result = (struct diff_result){};
601 result->left = left;
602 result->right = right;
604 struct diff_state state = {
605 .result = result,
606 .recursion_depth_left = config->max_recursion_depth ? : UINT_MAX,
607 .kd_buf = NULL,
608 .kd_buf_size = 0,
609 };
610 diff_data_init_subsection(&state.left, left,
611 left->atoms.head,
612 left->atoms.len);
613 diff_data_init_subsection(&state.right, right,
614 right->atoms.head,
615 right->atoms.len);
617 result->rc = diff_run_algo(config->algo, &state);
618 free(state.kd_buf);
620 return result;
623 void
624 diff_result_free(struct diff_result *result)
626 if (!result)
627 return;
628 ARRAYLIST_FREE(result->chunks);
629 free(result);