Blob


/* Generic infrastructure to implement various diff algorithms (implementation). */
/*
 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
19 #include <sys/queue.h>
20 #include <ctype.h>
21 #include <errno.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24 #include <stdbool.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <limits.h>
28 #include <unistd.h>
30 #include <assert.h>
32 #include <arraylist.h>
33 #include <diff_main.h>
35 #include "diff_internal.h"
36 #include "diff_debug.h"
/*
 * Read exactly len bytes from stream f, starting at absolute offset at_pos,
 * into buf.
 *
 * Returns 0 on success. If seeking fails, returns the errno left by fseeko().
 * If the stream reports a read error, returns the errno left by fread(), or
 * EIO when fread() did not set one. A short read without a stream error
 * (for instance reaching end-of-file early) returns EIO.
 */
static int
read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len)
{
	size_t nread;

	if (fseeko(f, at_pos, SEEK_SET) == -1)
		return errno;

	/* Reset errno so a stale value is never reported as the cause of a
	 * read error, and so a read error can never be returned as 0. */
	errno = 0;
	nread = fread(buf, 1, len, f);
	if (nread == len)
		return 0;

	if (ferror(f) && errno != 0)
		return errno;
	return EIO;
}
/*
 * Compare two byte buffers.
 *
 * Without ignore_whitespace, the common prefix is compared with memcmp();
 * if it is identical, the shorter buffer sorts first.
 *
 * With ignore_whitespace, every byte for which isspace() is true is skipped
 * on both sides and only the remaining bytes are compared, so buffers that
 * differ only in whitespace compare equal.
 *
 * Returns 0 when the buffers are considered equal, a positive value when
 * left sorts after right, and a negative value when left sorts before right.
 */
static int
buf_cmp(const unsigned char *left, size_t left_len,
	const unsigned char *right, size_t right_len,
	bool ignore_whitespace)
{
	size_t common_len;
	int cmp;

	if (ignore_whitespace) {
		/* Indices are size_t so they can be compared against the
		 * lengths without signed/unsigned conversion. */
		size_t il = 0, ir = 0;

		while (il < left_len && ir < right_len) {
			unsigned char cl = left[il];
			unsigned char cr = right[ir];

			if (isspace(cl)) {
				il++;
				continue;
			}
			if (isspace(cr)) {
				ir++;
				continue;
			}

			if (cl > cr)
				return 1;
			if (cr > cl)
				return -1;
			il++;
			ir++;
		}

		/* One side is used up; the other side is still equal as long
		 * as whatever remains of it is whitespace only. */
		while (il < left_len) {
			if (!isspace(left[il++]))
				return 1;
		}
		while (ir < right_len) {
			if (!isspace(right[ir++]))
				return -1;
		}

		return 0;
	}

	common_len = (left_len < right_len) ? left_len : right_len;
	cmp = memcmp(left, right, common_len);
	if (cmp)
		return cmp;
	if (left_len == right_len)
		return 0;
	return (left_len > right_len) ? 1 : -1;
}
103 int
104 diff_atom_cmp(int *cmp,
105 const struct diff_atom *left,
106 const struct diff_atom *right)
108 off_t remain_left, remain_right;
109 int flags = (left->root->diff_flags | right->root->diff_flags);
110 bool ignore_whitespace = (flags & DIFF_FLAG_IGNORE_WHITESPACE);
112 if (!left->len && !right->len) {
113 *cmp = 0;
114 return 0;
116 if (!ignore_whitespace) {
117 if (!right->len) {
118 *cmp = 1;
119 return 0;
121 if (!left->len) {
122 *cmp = -1;
123 return 0;
127 if (left->at != NULL && right->at != NULL) {
128 *cmp = buf_cmp(left->at, left->len, right->at, right->len,
129 ignore_whitespace);
130 return 0;
133 remain_left = left->len;
134 remain_right = right->len;
135 while (remain_left > 0 || remain_right > 0) {
136 const size_t chunksz = 8192;
137 unsigned char buf_left[chunksz], buf_right[chunksz];
138 const uint8_t *p_left, *p_right;
139 off_t n_left, n_right;
140 ssize_t r;
142 if (!remain_right) {
143 *cmp = 1;
144 return 0;
146 if (!remain_left) {
147 *cmp = -1;
148 return 0;
151 n_left = MIN(chunksz, remain_left);
152 n_right = MIN(chunksz, remain_right);
154 if (left->at == NULL) {
155 r = read_at(left->root->f,
156 left->pos + (left->len - remain_left),
157 buf_left, n_left);
158 if (r) {
159 *cmp = 0;
160 return r;
162 p_left = buf_left;
163 } else {
164 p_left = left->at + (left->len - remain_left);
167 if (right->at == NULL) {
168 r = read_at(right->root->f,
169 right->pos + (right->len - remain_right),
170 buf_right, n_right);
171 if (r) {
172 *cmp = 0;
173 return r;
175 p_right = buf_right;
176 } else {
177 p_right = right->at + (right->len - remain_right);
180 r = buf_cmp(p_left, n_left, p_right, n_right,
181 ignore_whitespace);
182 if (r) {
183 *cmp = r;
184 return 0;
187 remain_left -= n_left;
188 remain_right -= n_right;
191 *cmp = 0;
192 return 0;
195 int
196 diff_atom_same(bool *same,
197 const struct diff_atom *left,
198 const struct diff_atom *right)
200 int cmp;
201 int r;
202 if (left->hash != right->hash) {
203 *same = false;
204 return 0;
206 r = diff_atom_cmp(&cmp, left, right);
207 if (r) {
208 *same = true;
209 return r;
211 *same = (cmp == 0);
212 return 0;
215 static struct diff_chunk *
216 diff_state_add_solved_chunk(struct diff_state *state,
217 const struct diff_chunk *chunk)
219 diff_chunk_arraylist_t *result;
220 struct diff_chunk *new_chunk;
221 enum diff_chunk_type last_t;
222 enum diff_chunk_type new_t;
224 /* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk
225 * never directly follows a plus chunk. */
226 result = &state->result->chunks;
228 last_t = diff_chunk_type(&result->head[result->len - 1]);
229 new_t = diff_chunk_type(chunk);
231 debug("Add %s chunk:\n", chunk->solved ? "solved" : "UNSOLVED");
232 debug("L\n");
233 debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count);
234 debug("R\n");
235 debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count);
237 if (new_t == last_t) {
238 new_chunk = &result->head[result->len - 1];
239 new_chunk->left_count += chunk->left_count;
240 new_chunk->right_count += chunk->right_count;
241 debug(" - added chunk touches previous one of same type, joined:\n");
242 debug("L\n");
243 debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
244 debug("R\n");
245 debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
246 } else if (last_t == CHUNK_PLUS && new_t == CHUNK_MINUS) {
247 enum diff_chunk_type prev_last_t =
248 result->len > 1 ?
249 diff_chunk_type(&result->head[result->len - 2])
250 : CHUNK_EMPTY;
251 /* If a minus-chunk follows a plus-chunk, place it above the plus-chunk->
252 * Is the one before that also a minus? combine. */
253 if (prev_last_t == CHUNK_MINUS) {
254 new_chunk = &result->head[result->len - 2];
255 new_chunk->left_count += chunk->left_count;
256 new_chunk->right_count += chunk->right_count;
258 debug(" - added minus-chunk follows plus-chunk,"
259 " put before that plus-chunk and joined"
260 " with preceding minus-chunk:\n");
261 debug("L\n");
262 debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
263 debug("R\n");
264 debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
265 } else {
266 ARRAYLIST_INSERT(new_chunk, *result, result->len - 1);
267 if (!new_chunk)
268 return NULL;
269 *new_chunk = *chunk;
271 debug(" - added minus-chunk follows plus-chunk,"
272 " put before that plus-chunk\n");
274 } else {
275 ARRAYLIST_ADD(new_chunk, *result);
276 if (!new_chunk)
277 return NULL;
278 *new_chunk = *chunk;
280 return new_chunk;
283 /* Even if a left or right side is empty, diff output may need to know the
284 * position in that file.
285 * So left_start or right_start must never be NULL -- pass left_count or
286 * right_count as zero to indicate staying at that position without consuming
287 * any lines. */
288 struct diff_chunk *
289 diff_state_add_chunk(struct diff_state *state, bool solved,
290 struct diff_atom *left_start, unsigned int left_count,
291 struct diff_atom *right_start, unsigned int right_count)
293 diff_chunk_arraylist_t *result = NULL;
294 struct diff_chunk *new_chunk;
295 struct diff_chunk chunk = {
296 .solved = solved,
297 .left_start = left_start,
298 .left_count = left_count,
299 .right_start = right_start,
300 .right_count = right_count,
301 };
303 if (!solved || state->temp_result.len) {
304 /* Append to temp_result */
305 debug("append to temp_result\n");
306 result = &state->temp_result;
307 } else if (!state->result->chunks.len) {
308 /* Append to final result */
309 result = &state->result->chunks;
310 debug("Add first chunk:\n");
311 debug("L\n");
312 debug_dump_atoms(&state->left, left_start, left_count);
313 debug("R\n");
314 debug_dump_atoms(&state->right, right_start, right_count);
316 if (result) {
317 ARRAYLIST_ADD(new_chunk, *result);
318 if (!new_chunk)
319 return NULL;
320 *new_chunk = chunk;
321 return new_chunk;
324 return diff_state_add_solved_chunk(state, &chunk);
327 void
328 diff_data_init_root(struct diff_data *d, FILE *f, const uint8_t *data,
329 unsigned long long len, int diff_flags)
331 *d = (struct diff_data){
332 .f = f,
333 .pos = 0,
334 .data = data,
335 .len = len,
336 .root = d,
337 .diff_flags = diff_flags,
338 };
341 void
342 diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
343 struct diff_atom *from_atom, unsigned int atoms_count)
345 struct diff_atom *last_atom;
347 if (atoms_count == 0) {
348 *d = (struct diff_data){
349 .f = NULL,
350 .pos = 0,
351 .data = NULL,
352 .len = 0,
353 .root = parent->root,
354 .atoms.head = NULL,
355 .atoms.len = atoms_count,
356 };
358 return;
361 last_atom = from_atom + atoms_count - 1;
362 *d = (struct diff_data){
363 .f = NULL,
364 .pos = from_atom->pos,
365 .data = from_atom->at,
366 .len = (last_atom->pos + last_atom->len) - from_atom->pos,
367 .root = parent->root,
368 .atoms.head = from_atom,
369 .atoms.len = atoms_count,
370 };
372 debug("subsection:\n");
373 debug_dump(d);
376 void
377 diff_data_free(struct diff_data *diff_data)
379 if (!diff_data)
380 return;
381 if (diff_data->atoms.allocated)
382 ARRAYLIST_FREE(diff_data->atoms);
385 int
386 diff_algo_none(const struct diff_algo_config *algo_config,
387 struct diff_state *state)
389 debug("\n** %s\n", __func__);
390 debug("left:\n");
391 debug_dump(&state->left);
392 debug("right:\n");
393 debug_dump(&state->right);
394 debug_dump_myers_graph(&state->left, &state->right, NULL, NULL, 0, NULL,
395 0);
397 /* Add a chunk of equal lines, if any */
398 struct diff_atom *l = state->left.atoms.head;
399 unsigned int l_len = state->left.atoms.len;
400 struct diff_atom *r = state->right.atoms.head;
401 unsigned int r_len = state->right.atoms.len;
402 unsigned int equal_atoms_start = 0;
403 unsigned int equal_atoms_end = 0;
404 unsigned int l_idx = 0;
405 unsigned int r_idx = 0;
407 while (equal_atoms_start < l_len
408 && equal_atoms_start < r_len) {
409 int err;
410 bool same;
411 err = diff_atom_same(&same, &l[equal_atoms_start],
412 &r[equal_atoms_start]);
413 if (err)
414 return err;
415 if (!same)
416 break;
417 equal_atoms_start++;
419 while (equal_atoms_end < (l_len - equal_atoms_start)
420 && equal_atoms_end < (r_len - equal_atoms_start)) {
421 int err;
422 bool same;
423 err = diff_atom_same(&same, &l[l_len - 1 - equal_atoms_end],
424 &r[r_len - 1 - equal_atoms_end]);
425 if (err)
426 return err;
427 if (!same)
428 break;
429 equal_atoms_end++;
432 /* Add a chunk of equal lines at the start */
433 if (equal_atoms_start) {
434 if (!diff_state_add_chunk(state, true,
435 l, equal_atoms_start,
436 r, equal_atoms_start))
437 return ENOMEM;
438 l_idx += equal_atoms_start;
439 r_idx += equal_atoms_start;
442 /* Add a "minus" chunk with all lines from the left. */
443 if (equal_atoms_start + equal_atoms_end < l_len) {
444 unsigned int add_len = l_len - equal_atoms_start - equal_atoms_end;
445 if (!diff_state_add_chunk(state, true,
446 &l[l_idx], add_len,
447 &r[r_idx], 0))
448 return ENOMEM;
449 l_idx += add_len;
452 /* Add a "plus" chunk with all lines from the right. */
453 if (equal_atoms_start + equal_atoms_end < r_len) {
454 unsigned int add_len = r_len - equal_atoms_start - equal_atoms_end;
455 if (!diff_state_add_chunk(state, true,
456 &l[l_idx], 0,
457 &r[r_idx], add_len))
458 return ENOMEM;
459 r_idx += add_len;
462 /* Add a chunk of equal lines at the end */
463 if (equal_atoms_end) {
464 if (!diff_state_add_chunk(state, true,
465 &l[l_idx], equal_atoms_end,
466 &r[r_idx], equal_atoms_end))
467 return ENOMEM;
470 return DIFF_RC_OK;
473 int
474 diff_run_algo(const struct diff_algo_config *algo_config,
475 struct diff_state *state)
477 int rc;
479 if (!algo_config || !algo_config->impl
480 || !state->recursion_depth_left
481 || !state->left.atoms.len || !state->right.atoms.len) {
482 debug("Fall back to diff_algo_none():%s%s%s\n",
483 (!algo_config || !algo_config->impl) ? " no-cfg" : "",
484 (!state->recursion_depth_left) ? " max-depth" : "",
485 (!state->left.atoms.len || !state->right.atoms.len)?
486 " trivial" : "");
487 return diff_algo_none(algo_config, state);
490 ARRAYLIST_FREE(state->temp_result);
491 ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE);
492 rc = algo_config->impl(algo_config, state);
493 switch (rc) {
494 case DIFF_RC_USE_DIFF_ALGO_FALLBACK:
495 debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n",
496 algo_config->fallback_algo);
497 rc = diff_run_algo(algo_config->fallback_algo, state);
498 goto return_rc;
500 case DIFF_RC_OK:
501 /* continue below */
502 break;
504 default:
505 /* some error happened */
506 goto return_rc;
509 /* Pick up any diff chunks that are still unsolved and feed to
510 * inner_algo. inner_algo will solve unsolved chunks and append to
511 * result, and subsequent solved chunks on this level are then appended
512 * to result afterwards. */
513 int i;
514 for (i = 0; i < state->temp_result.len; i++) {
515 struct diff_chunk *c = &state->temp_result.head[i];
516 if (c->solved) {
517 diff_state_add_solved_chunk(state, c);
518 continue;
521 /* c is an unsolved chunk, feed to inner_algo */
522 struct diff_state inner_state = {
523 .result = state->result,
524 .recursion_depth_left = state->recursion_depth_left - 1,
525 .kd_buf = state->kd_buf,
526 .kd_buf_size = state->kd_buf_size,
527 };
528 diff_data_init_subsection(&inner_state.left, &state->left,
529 c->left_start, c->left_count);
530 diff_data_init_subsection(&inner_state.right, &state->right,
531 c->right_start, c->right_count);
533 rc = diff_run_algo(algo_config->inner_algo, &inner_state);
534 state->kd_buf = inner_state.kd_buf;
535 state->kd_buf_size = inner_state.kd_buf_size;
536 if (rc != DIFF_RC_OK)
537 goto return_rc;
540 rc = DIFF_RC_OK;
541 return_rc:
542 ARRAYLIST_FREE(state->temp_result);
543 return rc;
546 int
547 diff_atomize_file(struct diff_data *d,
548 const struct diff_config *config,
549 FILE *f, const uint8_t *data, off_t len, int diff_flags)
551 if (!config->atomize_func)
552 return EINVAL;
554 diff_data_init_root(d, f, data, len, diff_flags);
556 return config->atomize_func(config->atomize_func_data, d);
560 struct diff_result *
561 diff_main(const struct diff_config *config, struct diff_data *left,
562 struct diff_data *right)
564 struct diff_result *result = malloc(sizeof(struct diff_result));
565 if (!result)
566 return NULL;
568 *result = (struct diff_result){};
569 result->left = left;
570 result->right = right;
572 struct diff_state state = {
573 .result = result,
574 .recursion_depth_left = config->max_recursion_depth ? : 32,
575 .kd_buf = NULL,
576 .kd_buf_size = 0,
577 };
578 diff_data_init_subsection(&state.left, left,
579 left->atoms.head,
580 left->atoms.len);
581 diff_data_init_subsection(&state.right, right,
582 right->atoms.head,
583 right->atoms.len);
585 result->rc = diff_run_algo(config->algo, &state);
586 free(state.kd_buf);
588 return result;
591 void
592 diff_result_free(struct diff_result *result)
594 if (!result)
595 return;
596 ARRAYLIST_FREE(result->chunks);
597 free(result);