1 /* Generic infrastructure to implement various diff algorithms (implementation). */
3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
19 #include <sys/queue.h>
32 #include <arraylist.h>
33 #include <diff_main.h>
35 #include "diff_internal.h"
36 #include "diff_debug.h"
39 read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len)
42 if (fseeko(f, at_pos, SEEK_SET) == -1)
44 r = fread(buf, sizeof(char), len, f);
45 if ((r == 0 || r < len) && ferror(f))
53 buf_cmp(const unsigned char *left, size_t left_len,
54 const unsigned char *right, size_t right_len,
55 bool ignore_whitespace)
59 if (ignore_whitespace) {
61 while (il < left_len && ir < right_len) {
62 unsigned char cl = left[il];
63 unsigned char cr = right[ir];
65 if (isspace(cl) && il < left_len) {
69 if (isspace(cr) && ir < right_len) {
81 while (il < left_len) {
82 unsigned char cl = left[il++];
86 while (ir < right_len) {
87 unsigned char cr = right[ir++];
95 cmp = memcmp(left, right, MIN(left_len, right_len));
98 if (left_len == right_len)
100 return (left_len > right_len) ? 1 : -1;
104 diff_atom_cmp(int *cmp,
105 const struct diff_atom *left,
106 const struct diff_atom *right)
108 off_t remain_left, remain_right;
109 int flags = (left->root->diff_flags | right->root->diff_flags);
110 bool ignore_whitespace = (flags & DIFF_FLAG_IGNORE_WHITESPACE);
112 if (!ignore_whitespace) {
113 if (!left->len && !right->len) {
127 if (left->at != NULL && right->at != NULL) {
128 *cmp = buf_cmp(left->at, left->len, right->at, right->len,
133 remain_left = left->len;
134 remain_right = right->len;
135 while (remain_left > 0 || remain_right > 0) {
136 const size_t chunksz = 8192;
137 unsigned char buf_left[chunksz], buf_right[chunksz];
138 const uint8_t *p_left, *p_right;
139 off_t n_left, n_right;
151 n_left = MIN(chunksz, remain_left);
152 n_right = MIN(chunksz, remain_right);
154 if (left->at == NULL) {
155 r = read_at(left->root->f,
156 left->pos + (left->len - remain_left),
164 p_left = left->at + (left->len - remain_left);
167 if (right->at == NULL) {
168 r = read_at(right->root->f,
169 right->pos + (right->len - remain_right),
177 p_right = right->at + (right->len - remain_right);
180 r = buf_cmp(p_left, n_left, p_right, n_right,
187 remain_left -= n_left;
188 remain_right -= n_right;
196 diff_atom_same(bool *same,
197 const struct diff_atom *left,
198 const struct diff_atom *right)
201 int r = diff_atom_cmp(&cmp, left, right);
210 static struct diff_chunk *
211 diff_state_add_solved_chunk(struct diff_state *state,
212 const struct diff_chunk *chunk)
214 diff_chunk_arraylist_t *result;
215 struct diff_chunk *new_chunk;
216 enum diff_chunk_type last_t;
217 enum diff_chunk_type new_t;
219 /* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk
220 * never directly follows a plus chunk. */
221 result = &state->result->chunks;
223 last_t = diff_chunk_type(&result->head[result->len - 1]);
224 new_t = diff_chunk_type(chunk);
226 debug("Add %s chunk:\n", chunk->solved ? "solved" : "UNSOLVED");
228 debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count);
230 debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count);
232 if (new_t == last_t) {
233 new_chunk = &result->head[result->len - 1];
234 new_chunk->left_count += chunk->left_count;
235 new_chunk->right_count += chunk->right_count;
236 debug(" - added chunk touches previous one of same type, joined:\n");
238 debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
240 debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
241 } else if (last_t == CHUNK_PLUS && new_t == CHUNK_MINUS) {
242 enum diff_chunk_type prev_last_t =
244 diff_chunk_type(&result->head[result->len - 2])
246 /* If a minus-chunk follows a plus-chunk, place it above the plus-chunk->
247 * Is the one before that also a minus? combine. */
248 if (prev_last_t == CHUNK_MINUS) {
249 new_chunk = &result->head[result->len - 2];
250 new_chunk->left_count += chunk->left_count;
251 new_chunk->right_count += chunk->right_count;
253 debug(" - added minus-chunk follows plus-chunk,"
254 " put before that plus-chunk and joined"
255 " with preceding minus-chunk:\n");
257 debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
259 debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
261 ARRAYLIST_INSERT(new_chunk, *result, result->len - 1);
266 debug(" - added minus-chunk follows plus-chunk,"
267 " put before that plus-chunk\n");
270 ARRAYLIST_ADD(new_chunk, *result);
278 /* Even if a left or right side is empty, diff output may need to know the
279 * position in that file.
280 * So left_start or right_start must never be NULL -- pass left_count or
281 * right_count as zero to indicate staying at that position without consuming
284 diff_state_add_chunk(struct diff_state *state, bool solved,
285 struct diff_atom *left_start, unsigned int left_count,
286 struct diff_atom *right_start, unsigned int right_count)
288 diff_chunk_arraylist_t *result = NULL;
289 struct diff_chunk *new_chunk;
290 struct diff_chunk chunk = {
292 .left_start = left_start,
293 .left_count = left_count,
294 .right_start = right_start,
295 .right_count = right_count,
298 if (!solved || state->temp_result.len) {
299 /* Append to temp_result */
300 debug("append to temp_result\n");
301 result = &state->temp_result;
302 } else if (!state->result->chunks.len) {
303 /* Append to final result */
304 result = &state->result->chunks;
305 debug("Add first chunk:\n");
307 debug_dump_atoms(&state->left, left_start, left_count);
309 debug_dump_atoms(&state->right, right_start, right_count);
312 ARRAYLIST_ADD(new_chunk, *result);
319 return diff_state_add_solved_chunk(state, &chunk);
323 diff_data_init_root(struct diff_data *d, FILE *f, const uint8_t *data,
324 unsigned long long len, int diff_flags)
326 *d = (struct diff_data){
332 .diff_flags = diff_flags,
337 diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
338 struct diff_atom *from_atom, unsigned int atoms_count)
340 struct diff_atom *last_atom;
342 if (atoms_count == 0) {
343 *d = (struct diff_data){
348 .root = parent->root,
350 .atoms.len = atoms_count,
356 last_atom = from_atom + atoms_count - 1;
357 *d = (struct diff_data){
359 .pos = from_atom->pos,
360 .data = from_atom->at,
361 .len = (last_atom->pos + last_atom->len) - from_atom->pos,
362 .root = parent->root,
363 .atoms.head = from_atom,
364 .atoms.len = atoms_count,
367 debug("subsection:\n");
372 diff_data_free(struct diff_data *diff_data)
376 if (diff_data->atoms.allocated)
377 ARRAYLIST_FREE(diff_data->atoms);
381 diff_algo_none(const struct diff_algo_config *algo_config,
382 struct diff_state *state)
384 debug("\n** %s\n", __func__);
386 debug_dump(&state->left);
388 debug_dump(&state->right);
389 debug_dump_myers_graph(&state->left, &state->right, NULL, NULL, 0, NULL,
392 /* Add a chunk of equal lines, if any */
393 struct diff_atom *l = state->left.atoms.head;
394 unsigned int l_len = state->left.atoms.len;
395 struct diff_atom *r = state->right.atoms.head;
396 unsigned int r_len = state->right.atoms.len;
397 unsigned int equal_atoms_start = 0;
398 unsigned int equal_atoms_end = 0;
399 unsigned int l_idx = 0;
400 unsigned int r_idx = 0;
402 while (equal_atoms_start < l_len
403 && equal_atoms_start < r_len) {
406 err = diff_atom_same(&same, &l[equal_atoms_start],
407 &r[equal_atoms_start]);
414 while (equal_atoms_end < (l_len - equal_atoms_start)
415 && equal_atoms_end < (r_len - equal_atoms_start)) {
418 err = diff_atom_same(&same, &l[l_len - 1 - equal_atoms_end],
419 &r[r_len - 1 - equal_atoms_end]);
427 /* Add a chunk of equal lines at the start */
428 if (equal_atoms_start) {
429 if (!diff_state_add_chunk(state, true,
430 l, equal_atoms_start,
431 r, equal_atoms_start))
433 l_idx += equal_atoms_start;
434 r_idx += equal_atoms_start;
437 /* Add a "minus" chunk with all lines from the left. */
438 if (equal_atoms_start + equal_atoms_end < l_len) {
439 unsigned int add_len = l_len - equal_atoms_start - equal_atoms_end;
440 if (!diff_state_add_chunk(state, true,
447 /* Add a "plus" chunk with all lines from the right. */
448 if (equal_atoms_start + equal_atoms_end < r_len) {
449 unsigned int add_len = r_len - equal_atoms_start - equal_atoms_end;
450 if (!diff_state_add_chunk(state, true,
457 /* Add a chunk of equal lines at the end */
458 if (equal_atoms_end) {
459 if (!diff_state_add_chunk(state, true,
460 &l[l_idx], equal_atoms_end,
461 &r[r_idx], equal_atoms_end))
469 diff_run_algo(const struct diff_algo_config *algo_config,
470 struct diff_state *state)
474 if (!algo_config || !algo_config->impl
475 || !state->recursion_depth_left
476 || !state->left.atoms.len || !state->right.atoms.len) {
477 debug("Fall back to diff_algo_none():%s%s%s\n",
478 (!algo_config || !algo_config->impl) ? " no-cfg" : "",
479 (!state->recursion_depth_left) ? " max-depth" : "",
480 (!state->left.atoms.len || !state->right.atoms.len)?
482 return diff_algo_none(algo_config, state);
485 ARRAYLIST_FREE(state->temp_result);
486 ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE);
487 rc = algo_config->impl(algo_config, state);
489 case DIFF_RC_USE_DIFF_ALGO_FALLBACK:
490 debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n",
491 algo_config->fallback_algo);
492 rc = diff_run_algo(algo_config->fallback_algo, state);
500 /* some error happened */
504 /* Pick up any diff chunks that are still unsolved and feed to
505 * inner_algo. inner_algo will solve unsolved chunks and append to
506 * result, and subsequent solved chunks on this level are then appended
507 * to result afterwards. */
509 for (i = 0; i < state->temp_result.len; i++) {
510 struct diff_chunk *c = &state->temp_result.head[i];
512 diff_state_add_solved_chunk(state, c);
516 /* c is an unsolved chunk, feed to inner_algo */
517 struct diff_state inner_state = {
518 .result = state->result,
519 .recursion_depth_left = state->recursion_depth_left - 1,
521 diff_data_init_subsection(&inner_state.left, &state->left,
522 c->left_start, c->left_count);
523 diff_data_init_subsection(&inner_state.right, &state->right,
524 c->right_start, c->right_count);
526 rc = diff_run_algo(algo_config->inner_algo, &inner_state);
528 if (rc != DIFF_RC_OK)
534 ARRAYLIST_FREE(state->temp_result);
539 diff_main(const struct diff_config *config,
540 FILE *left_f, const uint8_t *left_data, off_t left_len,
541 FILE *right_f, const uint8_t *right_data, off_t right_len,
544 struct diff_result *result = malloc(sizeof(struct diff_result));
548 *result = (struct diff_result){};
549 diff_data_init_root(&result->left, left_f, left_data, left_len,
551 diff_data_init_root(&result->right, right_f, right_data, right_len,
554 if (!config->atomize_func) {
559 result->rc = config->atomize_func(config->atomize_func_data,
560 &result->left, &result->right);
561 if (result->rc != DIFF_RC_OK)
564 struct diff_state state = {
566 .recursion_depth_left = config->max_recursion_depth ? : 32,
568 diff_data_init_subsection(&state.left, &result->left,
569 result->left.atoms.head,
570 result->left.atoms.len);
571 diff_data_init_subsection(&state.right, &result->right,
572 result->right.atoms.head,
573 result->right.atoms.len);
575 result->rc = diff_run_algo(config->algo, &state);
581 diff_result_free(struct diff_result *result)
585 diff_data_free(&result->left);
586 diff_data_free(&result->right);
587 ARRAYLIST_FREE(result->chunks);