Blob


1 /* Generic infrastructure to implement various diff algorithms (implementation). */
2 /*
3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
19 #include <sys/queue.h>
20 #include <ctype.h>
21 #include <errno.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24 #include <stdbool.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <limits.h>
28 #include <unistd.h>
30 #include <assert.h>
32 #include <diff/arraylist.h>
33 #include <diff/diff_main.h>
35 #include "diff_debug.h"
37 static int
38 read_at(int fd, int at_pos, unsigned char *buf, size_t len)
39 {
40 int r;
41 if (lseek(fd, at_pos, SEEK_SET) == -1)
42 return errno;
43 r = read(fd, buf, len);
44 if (r == -1)
45 return errno;
46 if (r != len)
47 return EIO;
48 return 0;
49 }
51 static int
52 buf_cmp(const unsigned char *left, size_t left_len,
53 const unsigned char *right, size_t right_len,
54 bool ignore_whitespace)
55 {
56 int cmp;
58 if (ignore_whitespace) {
59 int il = 0, ir = 0;
60 while (il < left_len && ir < right_len) {
61 unsigned char cl = left[il];
62 unsigned char cr = right[ir];
64 if (isspace(cl) && il < left_len) {
65 il++;
66 continue;
67 }
68 if (isspace(cr) && ir < right_len) {
69 ir++;
70 continue;
71 }
73 if (cl > cr)
74 return 1;
75 if (cr > cl)
76 return -1;
77 il++;
78 ir++;
79 }
80 while (il < left_len) {
81 unsigned char cl = left[il++];
82 if (!isspace(cl))
83 return 1;
84 }
85 while (ir < right_len) {
86 unsigned char cr = right[ir++];
87 if (!isspace(cr))
88 return -1;
89 }
91 return 0;
92 }
94 cmp = memcmp(left, right, MIN(left_len, right_len));
95 if (cmp)
96 return cmp;
97 if (left_len == right_len)
98 return 0;
99 return (left_len > right_len) ? 1 : -1;
102 int
103 diff_atom_cmp(int *cmp,
104 const struct diff_atom *left,
105 const struct diff_atom *right)
107 off_t remain_left, remain_right;
108 bool ignore_whitespace;
110 ignore_whitespace = (left->d->root->ignore_whitespace ||
111 right->d->root->ignore_whitespace);
113 if (!ignore_whitespace) {
114 if (!left->len && !right->len) {
115 *cmp = 0;
116 return 0;
118 if (!right->len) {
119 *cmp = 1;
120 return 0;
122 if (!left->len) {
123 *cmp = -1;
124 return 0;
128 if (left->at != NULL && right->at != NULL) {
129 *cmp = buf_cmp(left->at, left->len, right->at, right->len,
130 ignore_whitespace);
131 return 0;
134 remain_left = left->len;
135 remain_right = right->len;
136 while (remain_left > 0 || remain_right > 0) {
137 const size_t chunksz = 8192;
138 unsigned char buf_left[chunksz], buf_right[chunksz];
139 const uint8_t *p_left, *p_right;
140 off_t n_left, n_right;
141 ssize_t r;
143 if (!remain_right) {
144 *cmp = 1;
145 return 0;
147 if (!remain_left) {
148 *cmp = -1;
149 return 0;
152 n_left = MIN(chunksz, remain_left);
153 n_right = MIN(chunksz, remain_right);
155 if (left->at == NULL) {
156 r = read_at(left->d->root->fd,
157 left->pos + (left->len - remain_left),
158 buf_left, n_left);
159 if (r) {
160 *cmp = 0;
161 return r;
163 p_left = buf_left;
164 } else {
165 p_left = left->at + (left->len - remain_left);
168 if (right->at == NULL) {
169 r = read_at(right->d->root->fd,
170 right->pos + (right->len - remain_right),
171 buf_right, n_right);
172 if (r) {
173 *cmp = 0;
174 return r;
176 p_right = buf_right;
177 } else {
178 p_right = right->at + (right->len - remain_right);
181 r = buf_cmp(p_left, n_left, p_right, n_right,
182 ignore_whitespace);
183 if (r) {
184 *cmp = r;
185 return 0;
188 remain_left -= n_left;
189 remain_right -= n_right;
192 *cmp = 0;
193 return 0;
196 int
197 diff_atom_same(bool *same,
198 const struct diff_atom *left,
199 const struct diff_atom *right)
201 int cmp;
202 int r = diff_atom_cmp(&cmp, left, right);
203 if (r) {
204 *same = true;
205 return r;
207 *same = (cmp == 0);
208 return 0;
211 /* Even if a left or right side is empty, diff output may need to know the
212 * position in that file.
213 * So left_start or right_start must never be NULL -- pass left_count or
214 * right_count as zero to indicate staying at that position without consuming
215 * any lines. */
216 struct diff_chunk *
217 diff_state_add_chunk(struct diff_state *state, bool solved,
218 struct diff_atom *left_start, unsigned int left_count,
219 struct diff_atom *right_start, unsigned int right_count)
221 diff_chunk_arraylist_t *result = NULL;
222 struct diff_chunk *new_chunk;
223 struct diff_chunk chunk = {
224 .solved = solved,
225 .left_start = left_start,
226 .left_count = left_count,
227 .right_start = right_start,
228 .right_count = right_count,
229 };
230 enum diff_chunk_type last_t;
231 enum diff_chunk_type prev_last_t;
232 enum diff_chunk_type new_t;
234 if (!solved || state->temp_result.len) {
235 /* Append to temp_result */
236 result = &state->temp_result;
237 } else if (!state->result->chunks.len) {
238 /* Append to final result */
239 result = &state->result->chunks;
241 if (result) {
242 ARRAYLIST_ADD(new_chunk, *result);
243 if (!new_chunk)
244 return NULL;
245 *new_chunk = chunk;
246 goto chunk_added;
249 /* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk
250 * never directly follows a plus chunk. */
251 result = &state->result->chunks;
253 prev_last_t = result->len > 1 ?
254 diff_chunk_type(&result->head[result->len - 2]) : CHUNK_EMPTY;
255 last_t = diff_chunk_type(&result->head[result->len - 1]);
256 new_t = diff_chunk_type(&chunk);
258 if (new_t == last_t) {
259 new_chunk = &result->head[result->len - 1];
260 new_chunk->left_count += chunk.left_count;
261 new_chunk->right_count += chunk.right_count;
262 } else if (last_t == CHUNK_PLUS && new_t == CHUNK_MINUS) {
263 /* If a minus-chunk follows a plus-chunk, place it above the plus-chunk.
264 * Is the one before that also a minus? combine. */
265 if (prev_last_t == CHUNK_MINUS) {
266 new_chunk = &result->head[result->len - 2];
267 new_chunk->left_count += chunk.left_count;
268 new_chunk->right_count += chunk.right_count;
269 } else {
270 ARRAYLIST_INSERT(new_chunk, *result, result->len - 2);
271 if (!new_chunk)
272 return NULL;
273 *new_chunk = chunk;
275 } else {
276 ARRAYLIST_ADD(new_chunk, *result);
277 if (!new_chunk)
278 return NULL;
279 *new_chunk = chunk;
280 goto chunk_added;
283 chunk_added:
284 debug("Add %s chunk:\n", new_chunk->solved ? "solved" : "UNSOLVED");
285 debug("L\n");
286 debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
287 debug("R\n");
288 debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
289 return new_chunk;
292 void
293 diff_data_init_root(struct diff_data *d, int fd, const uint8_t *data,
294 unsigned long long len, bool ignore_whitespace)
296 *d = (struct diff_data){
297 .fd = fd,
298 .pos = 0,
299 .data = data,
300 .len = len,
301 .root = d,
302 .ignore_whitespace = ignore_whitespace,
303 };
306 void
307 diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
308 struct diff_atom *from_atom, unsigned int atoms_count)
310 struct diff_atom *last_atom = from_atom + atoms_count - 1;
311 *d = (struct diff_data){
312 .fd = -1,
313 .pos = from_atom->pos,
314 .data = from_atom->at,
315 .len = (last_atom->pos + last_atom->len) - from_atom->pos,
316 .root = parent->root,
317 .atoms.head = from_atom,
318 .atoms.len = atoms_count,
319 };
321 debug("subsection:\n");
322 debug_dump(d);
325 void
326 diff_data_free(struct diff_data *diff_data)
328 if (!diff_data)
329 return;
330 if (diff_data->atoms.allocated)
331 ARRAYLIST_FREE(diff_data->atoms);
334 int
335 diff_algo_none(const struct diff_algo_config *algo_config,
336 struct diff_state *state)
338 debug("\n** %s\n", __func__);
339 debug("left:\n");
340 debug_dump(&state->left);
341 debug("right:\n");
342 debug_dump(&state->right);
343 debug_dump_myers_graph(&state->left, &state->right, NULL, NULL, 0, NULL,
344 0);
346 /* Add a chunk of equal lines, if any */
347 unsigned int equal_atoms = 0;
348 while (equal_atoms < state->left.atoms.len
349 && equal_atoms < state->right.atoms.len) {
350 int r;
351 bool same;
352 r = diff_atom_same(&same, &state->left.atoms.head[equal_atoms],
353 &state->right.atoms.head[equal_atoms]);
354 if (r)
355 return r;
356 if (!same)
357 break;
358 equal_atoms++;
360 if (equal_atoms) {
361 if (!diff_state_add_chunk(state, true,
362 &state->left.atoms.head[0],
363 equal_atoms,
364 &state->right.atoms.head[0],
365 equal_atoms))
366 return ENOMEM;
369 /* Add a "minus" chunk with all lines from the left. */
370 if (equal_atoms < state->left.atoms.len) {
371 if (!diff_state_add_chunk(state, true,
372 &state->left.atoms.head[equal_atoms],
373 state->left.atoms.len - equal_atoms,
374 NULL, 0))
375 return ENOMEM;
378 /* Add a "plus" chunk with all lines from the right. */
379 if (equal_atoms < state->right.atoms.len) {
380 if (!diff_state_add_chunk(state, true,
381 NULL, 0,
382 &state->right.atoms.head[equal_atoms],
383 state->right.atoms.len - equal_atoms))
384 return ENOMEM;
386 return DIFF_RC_OK;
389 int
390 diff_run_algo(const struct diff_algo_config *algo_config,
391 struct diff_state *state)
393 int rc;
394 ARRAYLIST_FREE(state->temp_result);
396 if (!algo_config || !algo_config->impl
397 || !state->recursion_depth_left) {
398 debug("MAX RECURSION REACHED, just dumping diff chunks\n");
399 return diff_algo_none(algo_config, state);
402 ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE);
403 rc = algo_config->impl(algo_config, state);
404 switch (rc) {
405 case DIFF_RC_USE_DIFF_ALGO_FALLBACK:
406 debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n",
407 algo_config->fallback_algo);
408 rc = diff_run_algo(algo_config->fallback_algo, state);
409 goto return_rc;
411 case DIFF_RC_OK:
412 /* continue below */
413 break;
415 default:
416 /* some error happened */
417 goto return_rc;
420 /* Pick up any diff chunks that are still unsolved and feed to
421 * inner_algo. inner_algo will solve unsolved chunks and append to
422 * result, and subsequent solved chunks on this level are then appended
423 * to result afterwards. */
424 int i;
425 for (i = 0; i < state->temp_result.len; i++) {
426 struct diff_chunk *c = &state->temp_result.head[i];
427 if (c->solved) {
428 struct diff_chunk *final_c;
429 ARRAYLIST_ADD(final_c, state->result->chunks);
430 if (!final_c) {
431 rc = ENOMEM;
432 goto return_rc;
434 *final_c = *c;
435 continue;
438 /* c is an unsolved chunk, feed to inner_algo */
439 struct diff_state inner_state = {
440 .result = state->result,
441 .recursion_depth_left = state->recursion_depth_left - 1,
442 };
443 diff_data_init_subsection(&inner_state.left, &state->left,
444 c->left_start, c->left_count);
445 diff_data_init_subsection(&inner_state.right, &state->right,
446 c->right_start, c->right_count);
448 rc = diff_run_algo(algo_config->inner_algo, &inner_state);
450 if (rc != DIFF_RC_OK)
451 goto return_rc;
454 rc = DIFF_RC_OK;
455 return_rc:
456 ARRAYLIST_FREE(state->temp_result);
457 return rc;
460 struct diff_result *
461 diff_main(const struct diff_config *config,
462 int left_fd, const uint8_t *left_data, off_t left_len,
463 int right_fd, const uint8_t *right_data, off_t right_len,
464 bool ignore_whitespace)
466 struct diff_result *result = malloc(sizeof(struct diff_result));
467 if (!result)
468 return NULL;
470 *result = (struct diff_result){};
471 diff_data_init_root(&result->left, left_fd, left_data, left_len,
472 ignore_whitespace);
473 diff_data_init_root(&result->right, right_fd, right_data, right_len,
474 ignore_whitespace);
476 if (!config->atomize_func) {
477 result->rc = EINVAL;
478 return result;
481 result->rc = config->atomize_func(config->atomize_func_data,
482 &result->left, &result->right);
483 if (result->rc != DIFF_RC_OK)
484 return result;
486 struct diff_state state = {
487 .result = result,
488 .recursion_depth_left = config->max_recursion_depth ? : 32,
489 };
490 diff_data_init_subsection(&state.left, &result->left,
491 result->left.atoms.head,
492 result->left.atoms.len);
493 diff_data_init_subsection(&state.right, &result->right,
494 result->right.atoms.head,
495 result->right.atoms.len);
497 result->rc = diff_run_algo(config->algo, &state);
499 return result;
502 void
503 diff_result_free(struct diff_result *result)
505 if (!result)
506 return;
507 diff_data_free(&result->left);
508 diff_data_free(&result->right);
509 ARRAYLIST_FREE(result->chunks);
510 free(result);