commit 55974fa638c18026f21cd0739d3bc6df41a94a2c from: Martin Pieuchot date: Tue Feb 11 13:27:47 2020 UTC Move all sources to a single place commit - bb7fb738462f1e54a85659098b77605727b9dee0 commit + 55974fa638c18026f21cd0739d3bc6df41a94a2c blob - 2ff357ceee0eed252b191258de3fea36fa4ae92b blob + 3cdc42b334160159d59a29028e6fc531223ba48e --- README +++ README @@ -19,11 +19,5 @@ Many thanks for this valuable door opener! The source code itself is not based on the code found in those blogs, but written from scratch with the knowledge gained. -Compile: - OpenBSD: - make -C diff - Linux: - make -f linux_Makefile -C diff - Test: make -C test/ blob - /dev/null blob + 3a89e2f6754fad430d6111cd5acc08420f47f111 (mode 644) --- /dev/null +++ Makefile @@ -0,0 +1,38 @@ +.PATH:${.CURDIR}/../lib + +.include "diff-version.mk" + +PROG= diff +SRCS= \ + diff.c \ + diff_atomize_text.c \ + diff_main.c \ + diff_myers.c \ + diff_patience.c \ + diff_output.c \ + diff_output_plain.c \ + diff_output_unidiff.c \ + ${END} +MAN = ${PROG}.1 + +CPPFLAGS = -I${.CURDIR}/../include -I${.CURDIR}/../lib + +.if defined(PROFILE) +LDADD = -lutil_p -lz_p -lc_p +.else +LDADD = -lutil -lz +.endif + +.if ${DIFF_RELEASE} != "Yes" +NOMAN = Yes +.endif + +realinstall: + ${INSTALL} ${INSTALL_COPY} -o ${BINOWN} -g ${BINGRP} \ + -m ${BINMODE} ${PROG} ${BINDIR}/${PROG} + +dist: + mkdir ../diff-${DIFF_VERSION}/diff + cp ${SRCS} ${MAN} ../diff-${DIFF_VERSION}/diff + +.include blob - b93701332fb1013c6f09d620e138075dc268925c (mode 644) blob + /dev/null --- diff/Makefile +++ /dev/null @@ -1,38 +0,0 @@ -.PATH:${.CURDIR}/../lib - -.include "../diff-version.mk" - -PROG= diff -SRCS= \ - diff.c \ - diff_atomize_text.c \ - diff_main.c \ - diff_myers.c \ - diff_patience.c \ - diff_output.c \ - diff_output_plain.c \ - diff_output_unidiff.c \ - ${END} -MAN = ${PROG}.1 - -CPPFLAGS = -I${.CURDIR}/../include -I${.CURDIR}/../lib - -.if defined(PROFILE) -LDADD = -lutil_p -lz_p -lc_p -.else -LDADD = -lutil -lz -.endif - -.if ${DIFF_RELEASE} != "Yes" -NOMAN = Yes -.endif - -realinstall: - ${INSTALL} ${INSTALL_COPY} -o ${BINOWN} -g ${BINGRP} \ - -m ${BINMODE} ${PROG} ${BINDIR}/${PROG} - -dist: - mkdir ../diff-${DIFF_VERSION}/diff - cp ${SRCS} ${MAN} ../diff-${DIFF_VERSION}/diff - -.include blob - 86b3c54681283baa034bfbeeb19d4b2ee19ef50a (mode 644) blob + /dev/null --- diff/diff.c +++ /dev/null @@ -1,143 +0,0 @@ -/* Commandline diff utility to test diff implementations. */ -/* - * Copyright (c) 2018 Martin Pieuchot - * Copyright (c) 2020 Neels Hofmeyr - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include - -#include -#include -#include -#include -#include -#include - -#ifdef __linux__ -/* stupid shims to compile and test on linux */ -#define __dead - -static const char *getprogname() -{ - return "diff"; -} -#endif - -__dead void usage(void); -int diffreg(char *, char *, int); -char *mmapfile(const char *, struct stat *); - -__dead void -usage(void) -{ - fprintf(stderr, "usage: %s file1 file2\n", getprogname()); - exit(1); -} - -int -main(int argc, char *argv[]) -{ - int ch; - - while ((ch = getopt(argc, argv, "")) != -1) { - switch (ch) { - default: - usage(); - } - } - - argc -= optind; - argv += optind; - - if (argc != 2) - usage(); - - return diffreg(argv[0], argv[1], 0); -} - -#include -#include - -const struct diff_algo_config myers, patience, myers_divide; - -const struct diff_algo_config myers = (struct diff_algo_config){ - .impl = diff_algo_myers, - .permitted_state_size = 1024 * 1024 * sizeof(int), - .fallback_algo = &patience, -}; - -const struct diff_algo_config patience = (struct diff_algo_config){ - .impl = diff_algo_patience, - .inner_algo = &patience, // After subdivision, do Patience again. - .fallback_algo = &myers_divide, // If subdivision failed, do Myers Divide et Impera. -}; - -const struct diff_algo_config myers_divide = (struct diff_algo_config){ - .impl = diff_algo_myers_divide, - .inner_algo = &myers, // When division succeeded, start from the top. - // (fallback_algo = NULL implies diff_algo_none). -}; - -const struct diff_config diff_config = { - .atomize_func = diff_atomize_text_by_line, - .algo = &myers, -}; - -int -diffreg(char *file1, char *file2, int flags) -{ - char *str1, *str2; - struct stat st1, st2; - struct diff_input_info info = { - .left_path = file1, - .right_path = file2, - }; - - str1 = mmapfile(file1, &st1); - str2 = mmapfile(file2, &st2); - - diff_unidiff(stdout, &diff_config, &info, str1, st1.st_size, str2, st2.st_size, 3); - - munmap(str1, st1.st_size); - munmap(str2, st2.st_size); - - return 0; -} - -char * -mmapfile(const char *path, struct stat *st) -{ - int fd; - char *p; - - fd = open(path, O_RDONLY); - if (fd == -1) - err(2, "%s", path); - - if (fstat(fd, st) == -1) - err(2, "%s", path); - - if ((uintmax_t)st->st_size > SIZE_MAX) - errx(2, "%s: file too big to fit memory", path); - - p = mmap(NULL, st->st_size, PROT_READ, MAP_PRIVATE, fd, 0); - if (p == MAP_FAILED) - err(2, "mmap"); - - close(fd); - - return p; -} blob - 62c7103d70ac32f292cc05f62c782305dd8798f6 (mode 644) blob + /dev/null --- diff/linux_Makefile +++ /dev/null @@ -1,13 +0,0 @@ -CFLAGS = -fsanitize=address -fsanitize=undefined -g -O3 - -diff: diff.c ../lib/libdiff.a - gcc $(CFLAGS) -I../include -o $@ $^ - - -../lib/libdiff.a: ../lib/*.[hc] ../include/diff/*.h - $(MAKE) -f linux_Makefile -C ../lib - -.PHONY: clean -clean: - rm diff - $(MAKE) -f linux_Makefile -C ../lib clean blob - /dev/null blob + 395468a8c29ce7c6d5ca5283a1a1aa247acb3d87 (mode 644) --- /dev/null +++ arraylist.h @@ -0,0 +1,96 @@ +/* Auto-reallocating array for arbitrary member types. */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#pragma once + +#ifdef __linux__ +/* stupid shims to compile and test on linux */ +#include +static inline void *reallocarray(void *buf, size_t n, size_t member_size) +{ + return realloc(buf, n * member_size); +} +static inline void *recallocarray(void *buf, size_t oldn, size_t n, size_t member_size) +{ + buf = reallocarray(buf, n, member_size); + bzero(((char*)buf) + (oldn * member_size), (n - oldn) * member_size); + return buf; +} +#endif + + +/* Usage: + * + * ARRAYLIST(any_type_t) list; + * // OR + * typedef ARRAYLIST(any_type_t) any_type_list_t; + * any_type_list_t list; + * + * ARRAYLIST_INIT(list, 128); // < number of (at first unused) members to add on each realloc + * any_type_t *x; + * while (bar) { + * ARRAYLIST_ADD(x, list); // < enlarges the allocated array as needed; list.head may change due to realloc + * if (!x) + * abort(); + * *x = random_foo_value; + * } + * for (i = 0; i < list.len; i++) + * printf("%s", foo_to_str(list.head[i])); + * ARRAYLIST_FREE(list); + */ +#define ARRAYLIST(MEMBER_TYPE) \ + struct { \ + MEMBER_TYPE *head; \ + unsigned int len; \ + unsigned int allocated; \ + unsigned int alloc_blocksize; \ + } + +#define ARRAYLIST_INIT(ARRAY_LIST, ALLOC_BLOCKSIZE) do { \ + (ARRAY_LIST).head = NULL; \ + (ARRAY_LIST).len = 0; \ + (ARRAY_LIST).allocated = 0; \ + (ARRAY_LIST).alloc_blocksize = ALLOC_BLOCKSIZE; \ + } while(0) + +#define ARRAYLIST_ADD(NEW_ITEM_P, ARRAY_LIST) do { \ + if ((ARRAY_LIST).len && !(ARRAY_LIST).allocated) { \ + NEW_ITEM_P = NULL; \ + break; \ + } \ + if ((ARRAY_LIST).head == NULL || (ARRAY_LIST).allocated < (ARRAY_LIST).len + 1) { \ + (ARRAY_LIST).allocated += (ARRAY_LIST).alloc_blocksize ? : 8; \ + (ARRAY_LIST).head = recallocarray((ARRAY_LIST).head, (ARRAY_LIST).len, \ + (ARRAY_LIST).allocated, sizeof(*(ARRAY_LIST).head)); \ + }; \ + if (!(ARRAY_LIST).head || (ARRAY_LIST).allocated < (ARRAY_LIST).len + 1) { \ + NEW_ITEM_P = NULL; \ + break; \ + } \ + (NEW_ITEM_P) = &(ARRAY_LIST).head[(ARRAY_LIST).len]; \ + (ARRAY_LIST).len++; \ + } while (0) + +#define ARRAYLIST_CLEAR(ARRAY_LIST) \ + (ARRAY_LIST).len = 0 + +#define ARRAYLIST_FREE(ARRAY_LIST) \ + do { \ + if ((ARRAY_LIST).head && (ARRAY_LIST).allocated) \ + free((ARRAY_LIST).head); \ + ARRAYLIST_INIT(ARRAY_LIST, (ARRAY_LIST).alloc_blocksize); \ + } while(0) blob - 395468a8c29ce7c6d5ca5283a1a1aa247acb3d87 (mode 644) blob + /dev/null --- include/diff/arraylist.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Auto-reallocating array for arbitrary member types. */ -/* - * Copyright (c) 2020 Neels Hofmeyr - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#pragma once - -#ifdef __linux__ -/* stupid shims to compile and test on linux */ -#include -static inline void *reallocarray(void *buf, size_t n, size_t member_size) -{ - return realloc(buf, n * member_size); -} -static inline void *recallocarray(void *buf, size_t oldn, size_t n, size_t member_size) -{ - buf = reallocarray(buf, n, member_size); - bzero(((char*)buf) + (oldn * member_size), (n - oldn) * member_size); - return buf; -} -#endif - - -/* Usage: - * - * ARRAYLIST(any_type_t) list; - * // OR - * typedef ARRAYLIST(any_type_t) any_type_list_t; - * any_type_list_t list; - * - * ARRAYLIST_INIT(list, 128); // < number of (at first unused) members to add on each realloc - * any_type_t *x; - * while (bar) { - * ARRAYLIST_ADD(x, list); // < enlarges the allocated array as needed; list.head may change due to realloc - * if (!x) - * abort(); - * *x = random_foo_value; - * } - * for (i = 0; i < list.len; i++) - * printf("%s", foo_to_str(list.head[i])); - * ARRAYLIST_FREE(list); - */ -#define ARRAYLIST(MEMBER_TYPE) \ - struct { \ - MEMBER_TYPE *head; \ - unsigned int len; \ - unsigned int allocated; \ - unsigned int alloc_blocksize; \ - } - -#define ARRAYLIST_INIT(ARRAY_LIST, ALLOC_BLOCKSIZE) do { \ - (ARRAY_LIST).head = NULL; \ - (ARRAY_LIST).len = 0; \ - (ARRAY_LIST).allocated = 0; \ - (ARRAY_LIST).alloc_blocksize = ALLOC_BLOCKSIZE; \ - } while(0) - -#define ARRAYLIST_ADD(NEW_ITEM_P, ARRAY_LIST) do { \ - if ((ARRAY_LIST).len && !(ARRAY_LIST).allocated) { \ - NEW_ITEM_P = NULL; \ - break; \ - } \ - if ((ARRAY_LIST).head == NULL || (ARRAY_LIST).allocated < (ARRAY_LIST).len + 1) { \ - (ARRAY_LIST).allocated += (ARRAY_LIST).alloc_blocksize ? : 8; \ - (ARRAY_LIST).head = recallocarray((ARRAY_LIST).head, (ARRAY_LIST).len, \ - (ARRAY_LIST).allocated, sizeof(*(ARRAY_LIST).head)); \ - }; \ - if (!(ARRAY_LIST).head || (ARRAY_LIST).allocated < (ARRAY_LIST).len + 1) { \ - NEW_ITEM_P = NULL; \ - break; \ - } \ - (NEW_ITEM_P) = &(ARRAY_LIST).head[(ARRAY_LIST).len]; \ - (ARRAY_LIST).len++; \ - } while (0) - -#define ARRAYLIST_CLEAR(ARRAY_LIST) \ - (ARRAY_LIST).len = 0 - -#define ARRAYLIST_FREE(ARRAY_LIST) \ - do { \ - if ((ARRAY_LIST).head && (ARRAY_LIST).allocated) \ - free((ARRAY_LIST).head); \ - ARRAYLIST_INIT(ARRAY_LIST, (ARRAY_LIST).alloc_blocksize); \ - } while(0) blob - f6dcef58bb67e1a01ce4a633f7928a184a56ebb1 (mode 644) blob + /dev/null --- include/diff/diff_main.h +++ /dev/null @@ -1,287 +0,0 @@ -/* Generic infrastructure to implement various diff algorithms. */ -/* - * Copyright (c) 2020 Neels Hofmeyr - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#pragma once - -#include -#include -#include -#include -#include - -#include - -#ifndef MAX -#define MAX(A,B) ((A)>(B)?(A):(B)) -#endif -#ifndef MIN -#define MIN(A,B) ((A)<(B)?(A):(B)) -#endif - -struct range { - int start; - int end; -}; - -static inline bool range_empty(const struct range *r) -{ - return r->start == r->end; -} - -static inline bool ranges_touch(const struct range *a, const struct range *b) -{ - return (a->end >= b->start) && (a->start <= b->end); -} - -static inline void ranges_merge(struct range *a, const struct range *b) -{ - *a = (struct range){ - .start = MIN(a->start, b->start), - .end = MAX(a->end, b->end), - }; -} - -static inline int range_len(const struct range *r) -{ - if (!r) - return 0; - return r->end - r->start; -} - -/* List of all possible return codes of a diff invocation. */ -enum diff_rc { - DIFF_RC_USE_DIFF_ALGO_FALLBACK = -1, - DIFF_RC_OK = 0, - DIFF_RC_ENOTSUP = ENOTSUP, - DIFF_RC_ENOMEM = ENOMEM, - DIFF_RC_EINVAL = EINVAL, -}; - -struct diff_atom { - const uint8_t *at; - size_t len; - - /* This hash is just a very cheap speed up for finding *mismatching* atoms. When hashes match, we still need to - * compare entire atoms to find out whether they are indeed identical or not. */ - unsigned int hash; - - /* State for the Patience Diff algorithm */ - /* TODO: keep a separate array for the patience state */ - struct { - bool unique_here; - bool unique_in_both; - struct diff_atom *pos_in_other; - struct diff_atom *prev_stack; - struct range identical_lines; - } patience; -}; - -static inline bool diff_atom_same(const struct diff_atom *left, const struct diff_atom *right) -{ - return left->hash == right->hash - && left->len == right->len - && memcmp(left->at, right->at, left->len) == 0; -} - -/* For each file, there is a "root" struct diff_data referencing the entire file, which the atoms are parsed from. In - * recursion of diff algorithm, there may be "child" struct diff_data only referencing a subsection of the file, - * re-using the atoms parsing. For "root" structs, atoms_allocated will be nonzero, indicating that the array of atoms - * is owned by that struct. For "child" structs, atoms_allocated == 0, to indicate that the struct is referencing a - * subset of atoms. */ -struct diff_data { - const uint8_t *data; - size_t len; - ARRAYLIST(struct diff_atom) atoms; - struct diff_data *root; -}; - -void diff_data_free(struct diff_data *diff_data); - -/* The atom's index in the entire file. For atoms divided by lines of text, this yields the line number (starting with - * 0). Also works for diff_data that reference only a subsection of a file, always reflecting the global position in - * the file (and not the relative position within the subsection). */ -#define diff_atom_root_idx(DIFF_DATA, ATOM) \ - ((ATOM) ? (ATOM) - ((DIFF_DATA)->root->atoms.head) : (DIFF_DATA)->root->atoms.len) - -/* The atom's index within DIFF_DATA. For atoms divided by lines of text, this yields the line number (starting with - * 0). */ -#define diff_atom_idx(DIFF_DATA, ATOM) \ - ((ATOM) ? (ATOM) - ((DIFF_DATA)->atoms.head) : (DIFF_DATA)->atoms.len) - -#define foreach_diff_atom(ATOM, FIRST_ATOM, COUNT) \ - for ((ATOM) = (FIRST_ATOM); \ - (ATOM) && ((ATOM) >= (FIRST_ATOM)) && ((ATOM) - (FIRST_ATOM) < (COUNT)); \ - (ATOM)++) - -#define diff_data_foreach_atom(ATOM, DIFF_DATA) \ - foreach_diff_atom(ATOM, (DIFF_DATA)->atoms.head, (DIFF_DATA)->atoms.len) - -#define diff_data_foreach_atom_from(FROM, ATOM, DIFF_DATA) \ - for ((ATOM) = (FROM); \ - (ATOM) && ((ATOM) >= (DIFF_DATA)->atoms.head) && ((ATOM) - (DIFF_DATA)->atoms.head < (DIFF_DATA)->atoms.len); \ - (ATOM)++) - -/* A diff chunk represents a set of atoms on the left and/or a set of atoms on the right. - * - * If solved == false: - * The diff algorithm has divided the source file, and this is a chunk that the inner_algo should run on next. - * The lines on the left should be diffed against the lines on the right. - * (If there are no left lines or no right lines, it implies solved == true, because there is nothing to diff.) - * - * If solved == true: - * If there are only left atoms, it is a chunk removing atoms from the left ("a minus chunk"). - * If there are only right atoms, it is a chunk adding atoms from the right ("a plus chunk"). - * If there are both left and right lines, it is a chunk of equal content on both sides, - * and left_count == right_count: - * - * - foo } - * - bar }-- diff_chunk{ left_start = &left.atoms.head[0], left_count = 3, - * - baz } right_start = NULL, right_count = 0 } - * moo } - * goo }-- diff_chunk{ left_start = &left.atoms.head[3], left_count = 3, - * zoo } right_start = &right.atoms.head[0], right_count = 3 } - * +loo } - * +roo }-- diff_chunk{ left_start = NULL, left_count = 0, - * +too } right_start = &right.atoms.head[3], right_count = 3 } - * - */ -struct diff_chunk { - bool solved; - struct diff_atom *left_start; - unsigned int left_count; - struct diff_atom *right_start; - unsigned int right_count; -}; - -typedef ARRAYLIST(struct diff_chunk) diff_chunk_arraylist_t; -#define DIFF_RESULT_ALLOC_BLOCKSIZE 128 - -struct diff_result { - enum diff_rc rc; - struct diff_data left; - struct diff_data right; - diff_chunk_arraylist_t chunks; -}; - -struct diff_state { - /* The final result passed to the original diff caller. */ - struct diff_result *result; - - /* The root diff_data is in result->left,right, these are (possibly) subsections of the root data. */ - struct diff_data left; - struct diff_data right; - - unsigned int recursion_depth_left; - - /* Remaining chunks from one diff algorithm pass, if any solved == false chunks came up. */ - diff_chunk_arraylist_t temp_result; -}; - -struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved, - struct diff_atom *left_start, unsigned int left_count, - struct diff_atom *right_start, unsigned int right_count); - -/* Signature of a utility function to divide both source files into diff atoms. - * It is possible that a (future) algorithm requires both source files to decide on atom split points, hence this gets - * both left and right to atomize at the same time. - * An example is diff_atomize_text_by_line() in diff_atomize_text.c. - * - * func_data: context pointer (free to be used by implementation). - * left: struct diff_data with left->data and left->len already set up, and left->atoms to be created. - * right: struct diff_data with right->data and right->len already set up, and right->atoms to be created. - */ -typedef enum diff_rc (*diff_atomize_func_t)(void *func_data, struct diff_data *left, struct diff_data *right); - -extern enum diff_rc diff_atomize_text_by_line(void *func_data, struct diff_data *left, struct diff_data *right); - -struct diff_algo_config; -typedef enum diff_rc (*diff_algo_impl_t)(const struct diff_algo_config *algo_config, struct diff_state *state); - -/* Form a result with all left-side removed and all right-side added, i.e. no actual diff algorithm involved. */ -enum diff_rc diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state); - -/* Myers Diff tracing from the start all the way through to the end, requiring quadratic amounts of memory. This can - * fail if the required space surpasses algo_config->permitted_state_size. */ -extern enum diff_rc diff_algo_myers(const struct diff_algo_config *algo_config, struct diff_state *state); - -/* Myers "Divide et Impera": tracing forwards from the start and backwards from the end to find a midpoint that divides - * the problem into smaller chunks. Requires only linear amounts of memory. */ -extern enum diff_rc diff_algo_myers_divide(const struct diff_algo_config *algo_config, struct diff_state *state); - -/* Patience Diff algorithm, which divides a larger diff into smaller chunks. For very specific scenarios, it may lead to - * a complete diff result by itself, but needs a fallback algo to solve chunks that don't have common-unique atoms. */ -extern enum diff_rc diff_algo_patience(const struct diff_algo_config *algo_config, struct diff_state *state); - -/* Diff algorithms to use, possibly nested. For example: - * - * struct diff_algo_config myers, patience, myers_divide; - * - * myers = (struct diff_algo_config){ - * .impl = diff_algo_myers, - * .permitted_state_size = 32 * 1024 * 1024, - * .fallback_algo = &patience, // when too large, do diff_algo_patience - * }; - * - * patience = (struct diff_algo_config){ - * .impl = diff_algo_patience, - * .inner_algo = &patience, // After subdivision, do Patience again. - * .fallback_algo = &myers_divide, // If subdivision failed, do Myers Divide et Impera. - * }; - * - * myers_divide = (struct diff_algo_config){ - * .impl = diff_algo_myers_divide, - * .inner_algo = &myers, // When division succeeded, start from the top. - * // (fallback_algo = NULL implies diff_algo_none). - * }; - * - * struct diff_config config = { - * .algo = &myers, - * ... - * }; - * diff_main(&config, ...); - */ -struct diff_algo_config { - diff_algo_impl_t impl; - - /* Fail this algo if it would use more than this amount of memory, and instead use fallback_algo - * (diff_algo_myers). permitted_state_size == 0 means no limitation. */ - size_t permitted_state_size; - - /* For algorithms that divide into smaller chunks, use this algorithm to solve the divided chunks. */ - const struct diff_algo_config *inner_algo; - - /* If the algorithm fails (e.g. diff_algo_myers_if_small needs too large state, or diff_algo_patience can't find - * any common-unique atoms), then use this algorithm instead. */ - const struct diff_algo_config *fallback_algo; -}; - -struct diff_config { - diff_atomize_func_t atomize_func; - void *atomize_func_data; - - const struct diff_algo_config *algo; - - /* How deep to step into subdivisions of a source file, a paranoia / safety measure to guard against infinite - * loops through diff algorithms. When the maximum recursion is reached, employ diff_algo_none (i.e. remove all - * left atoms and add all right atoms). */ - unsigned int max_recursion_depth; -}; - -struct diff_result *diff_main(const struct diff_config *config, - const uint8_t *left_data, size_t left_len, - const uint8_t *right_data, size_t right_len); -void diff_result_free(struct diff_result *result); blob - 5d07e74d868098aab6819c41c9499f0f1b30040c (mode 644) blob + /dev/null --- include/diff/diff_output.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Diff output generators and invocation shims. */ -/* - * Copyright (c) 2020 Neels Hofmeyr - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#pragma once - -#include -#include - -struct diff_input_info { - const char *arbitrary_info; - const char *left_path; - const char *right_path; -}; - -enum diff_rc diff_output_plain(FILE *dest, const struct diff_input_info *info, - const struct diff_result *result); -enum diff_rc diff_plain(FILE *dest, const struct diff_config *diff_config, - const struct diff_input_info *info, - const char *left, int left_len, const char *right, int right_len); - -enum diff_rc diff_output_unidiff(FILE *dest, const struct diff_input_info *info, - const struct diff_result *result, unsigned int context_lines); -enum diff_rc diff_unidiff(FILE *dest, const struct diff_config *diff_config, - const struct diff_input_info *info, - const char *left, int left_len, const char *right, int right_len, - unsigned int context_lines); - -enum diff_rc diff_output_info(FILE *dest, const struct diff_input_info *info); -void diff_output_lines(FILE *dest, const char *prefix, struct diff_atom *start_atom, unsigned int count); blob - /dev/null blob + ec47f65416456d6fa54228e53e8ccea1fd888fba (mode 644) --- /dev/null +++ debug.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#pragma once + +#define DEBUG 0 + +#if DEBUG +#include +#define print(args...) fprintf(stderr, ##args) +#define debug print +#define debug_dump dump +#define debug_dump_atom dump_atom +#define debug_dump_atoms dump_atoms + +static inline void dump_atom(const struct diff_data *left, const struct diff_data *right, const struct diff_atom *atom) +{ + if (!atom) { + print("NULL atom\n"); + return; + } + if (left) + print(" %3ld", diff_atom_root_idx(left, atom)); + if (right && atom->patience.pos_in_other) + print(" %3ld", diff_atom_root_idx(right, atom->patience.pos_in_other)); + + print(" %s%s '", atom->patience.unique_here ? "u" : " ", atom->patience.unique_in_both ? "c" : " "); + const char *s; + for (s = atom->at; s < (const char*)(atom->at + atom->len); s++) { + if (*s == '\r') + print("\\r"); + else if (*s == '\n') + print("\\n"); + else if ((*s < 32 || *s >= 127) && (*s != '\t')) + print("\\x%02x", *s); + else + print("%c", *s); + } + print("'\n"); +} + +static inline void dump_atoms(const struct diff_data *d, struct diff_atom *atom, unsigned int count) +{ + if (count > 42) { + dump_atoms(d, atom, 20); + print("[%u lines skipped]\n", count - 20 - 20); + dump_atoms(d, atom + count - 20, 20); + return; + } else { + struct diff_atom *i; + foreach_diff_atom(i, atom, count) { + dump_atom(d, NULL, i); + } + } +} + +static inline void dump(struct diff_data *d) +{ + dump_atoms(d, d->atoms.head, d->atoms.len); +} + +static inline void dump_myers_graph(const struct diff_data *l, const struct diff_data *r, + int *kd) +{ + int x; + int y; + print(" "); + for (x = 0; x <= l->atoms.len; x++) + print("%2d", x); + print("\n"); + + for (y = 0; y <= r->atoms.len; y++) { + print("%2d ", y); + for (x = 0; x <= l->atoms.len; x++) { + + /* print d advancements from kd, if any. */ + int d = -1; + if (kd) { + int max = l->atoms.len + r->atoms.len; + size_t kd_len = max + 1 + max; + int *kd_pos = kd; + int di; +#define xk_to_y(X, K) ((X) - (K)) + for (di = 0; di < max; di++) { + int ki; + for (ki = di; ki >= -di; ki -= 2) { + if (x == kd_pos[ki] + && y == xk_to_y(x, ki)) { + d = di; + break; + } + } + if (d >= 0) + break; + kd_pos += kd_len; + } + } + if (d >= 0) + print("%d", d); + else + print("o"); + if (x < l->atoms.len && d < 10) + print("-"); + } + print("\n"); + if (y == r->atoms.len) + break; + + print(" "); + for (x = 0; x < l->atoms.len; x++) { + if (diff_atom_same(&l->atoms.head[x], &r->atoms.head[y])) + print("|\\"); + else + print("| "); + } + print("|\n"); + } +} + +static inline void debug_dump_myers_graph(const struct diff_data *l, const struct diff_data *r, + int *kd) +{ + if (l->atoms.len > 99 || r->atoms.len > 99) + return; + dump_myers_graph(l, r, kd); +} + +#else +#define debug(args...) +#define debug_dump(args...) +#define debug_dump_atom(args...) +#define debug_dump_atoms(args...) +#define debug_dump_myers_graph(args...) +#endif blob - ec47f65416456d6fa54228e53e8ccea1fd888fba (mode 644) blob + /dev/null --- lib/debug.h +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2020 Neels Hofmeyr - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#pragma once - -#define DEBUG 0 - -#if DEBUG -#include -#define print(args...) fprintf(stderr, ##args) -#define debug print -#define debug_dump dump -#define debug_dump_atom dump_atom -#define debug_dump_atoms dump_atoms - -static inline void dump_atom(const struct diff_data *left, const struct diff_data *right, const struct diff_atom *atom) -{ - if (!atom) { - print("NULL atom\n"); - return; - } - if (left) - print(" %3ld", diff_atom_root_idx(left, atom)); - if (right && atom->patience.pos_in_other) - print(" %3ld", diff_atom_root_idx(right, atom->patience.pos_in_other)); - - print(" %s%s '", atom->patience.unique_here ? "u" : " ", atom->patience.unique_in_both ? "c" : " "); - const char *s; - for (s = atom->at; s < (const char*)(atom->at + atom->len); s++) { - if (*s == '\r') - print("\\r"); - else if (*s == '\n') - print("\\n"); - else if ((*s < 32 || *s >= 127) && (*s != '\t')) - print("\\x%02x", *s); - else - print("%c", *s); - } - print("'\n"); -} - -static inline void dump_atoms(const struct diff_data *d, struct diff_atom *atom, unsigned int count) -{ - if (count > 42) { - dump_atoms(d, atom, 20); - print("[%u lines skipped]\n", count - 20 - 20); - dump_atoms(d, atom + count - 20, 20); - return; - } else { - struct diff_atom *i; - foreach_diff_atom(i, atom, count) { - dump_atom(d, NULL, i); - } - } -} - -static inline void dump(struct diff_data *d) -{ - dump_atoms(d, d->atoms.head, d->atoms.len); -} - -static inline void dump_myers_graph(const struct diff_data *l, const struct diff_data *r, - int *kd) -{ - int x; - int y; - print(" "); - for (x = 0; x <= l->atoms.len; x++) - print("%2d", x); - print("\n"); - - for (y = 0; y <= r->atoms.len; y++) { - print("%2d ", y); - for (x = 0; x <= l->atoms.len; x++) { - - /* print d advancements from kd, if any. */ - int d = -1; - if (kd) { - int max = l->atoms.len + r->atoms.len; - size_t kd_len = max + 1 + max; - int *kd_pos = kd; - int di; -#define xk_to_y(X, K) ((X) - (K)) - for (di = 0; di < max; di++) { - int ki; - for (ki = di; ki >= -di; ki -= 2) { - if (x == kd_pos[ki] - && y == xk_to_y(x, ki)) { - d = di; - break; - } - } - if (d >= 0) - break; - kd_pos += kd_len; - } - } - if (d >= 0) - print("%d", d); - else - print("o"); - if (x < l->atoms.len && d < 10) - print("-"); - } - print("\n"); - if (y == r->atoms.len) - break; - - print(" "); - for (x = 0; x < l->atoms.len; x++) { - if (diff_atom_same(&l->atoms.head[x], &r->atoms.head[y])) - print("|\\"); - else - print("| "); - } - print("|\n"); - } -} - -static inline void debug_dump_myers_graph(const struct diff_data *l, const struct diff_data *r, - int *kd) -{ - if (l->atoms.len > 99 || r->atoms.len > 99) - return; - dump_myers_graph(l, r, kd); -} - -#else -#define debug(args...) -#define debug_dump(args...) -#define debug_dump_atom(args...) -#define debug_dump_atoms(args...) -#define debug_dump_myers_graph(args...) -#endif blob - 964c1353da1b055ae761ae6961c45ea19e16d032 (mode 644) blob + /dev/null --- lib/diff_atomize_text.c +++ /dev/null @@ -1,75 +0,0 @@ -/* Split source by line breaks, and calculate a simplistic checksum. */ -/* - * Copyright (c) 2020 Neels Hofmeyr - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include - -static int diff_data_atomize_text_lines(struct diff_data *d) -{ - const uint8_t *pos = d->data; - const uint8_t *end = pos + d->len; - enum diff_rc rc; - - unsigned int array_size_estimate = d->len / 50; - unsigned int pow2 = 1; - while (array_size_estimate >>= 1) - pow2++; - - ARRAYLIST_INIT(d->atoms, 1 << pow2); - - while (pos < end) { - const uint8_t *line_end = pos; - unsigned int hash = 0; - - while (line_end < end && *line_end != '\r' && *line_end != '\n') { - hash = hash * 23 + *line_end; - line_end++; - } - - /* When not at the end of data, the line ending char ('\r' or '\n') must follow */ - if (line_end < end) - line_end++; - /* If that was an '\r', also pull in any following '\n' */ - if (line_end[0] == '\r' && line_end < end && line_end[1] == '\n') - line_end++; - - /* Record the found line as diff atom */ - struct diff_atom *atom; - ARRAYLIST_ADD(atom, d->atoms); - if (!atom) - return DIFF_RC_ENOMEM; - - *atom = (struct diff_atom){ - .at = pos, - .len = line_end - pos, - .hash = hash, - }; - - /* Starting point for next line: */ - pos = line_end; - } - - return DIFF_RC_OK; -} - -enum diff_rc diff_atomize_text_by_line(void *func_data, struct diff_data *left, struct diff_data *right) -{ - enum diff_rc rc; - rc = diff_data_atomize_text_lines(left); - if (rc != DIFF_RC_OK) - return rc; - return diff_data_atomize_text_lines(right); -} blob - 285bd40d5941fe84c5d9a9bd437a4a9f981ead13 (mode 644) blob + /dev/null --- lib/diff_main.c +++ /dev/null @@ -1,246 +0,0 @@ -/* Generic infrastructure to implement various diff algorithms (implementation). */ -/* - * Copyright (c) 2020 Neels Hofmeyr - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include "debug.h" - -/* Even if a left or right side is empty, diff output may need to know the position in that file. - * So left_start or right_start must never be NULL -- pass left_count or right_count as zero to indicate staying at that - * position without consuming any lines. */ -struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved, - struct diff_atom *left_start, unsigned int left_count, - struct diff_atom *right_start, unsigned int right_count) -{ - struct diff_chunk *chunk; - diff_chunk_arraylist_t *result; - - if (solved && !state->temp_result.len) - result = &state->result->chunks; - else - result = &state->temp_result; - - ARRAYLIST_ADD(chunk, *result); - if (!chunk) - return NULL; - *chunk = (struct diff_chunk){ - .solved = solved, - .left_start = left_start, - .left_count = left_count, - .right_start = right_start, - .right_count = right_count, - }; - - debug("Add %s chunk:\n", chunk->solved ? "solved" : "UNSOLVED"); - debug("L\n"); - debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count); - debug("R\n"); - debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count); - return chunk; -} - -void diff_data_init_root(struct diff_data *d, const uint8_t *data, size_t len) -{ - *d = (struct diff_data){ - .data = data, - .len = len, - .root = d, - }; -} - -void diff_data_init_subsection(struct diff_data *d, struct diff_data *parent, - struct diff_atom *from_atom, unsigned int atoms_count) -{ - struct diff_atom *last_atom = from_atom + atoms_count - 1; - *d = (struct diff_data){ - .data = from_atom->at, - .len = (last_atom->at + last_atom->len) - from_atom->at, - .root = parent->root, - .atoms.head = from_atom, - .atoms.len = atoms_count, - }; - - debug("subsection:\n"); - debug_dump(d); -} - -void diff_data_free(struct diff_data *diff_data) -{ - if (!diff_data) - return; - if (diff_data->atoms.allocated) - ARRAYLIST_FREE(diff_data->atoms); -} - -enum diff_rc diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state) -{ - debug("\n** %s\n", __func__); - debug("left:\n"); - debug_dump(&state->left); - debug("right:\n"); - debug_dump(&state->right); - debug_dump_myers_graph(&state->left, &state->right, NULL); - - /* Add a chunk of equal lines, if any */ - unsigned int equal_atoms = 0; - while (equal_atoms < state->left.atoms.len && equal_atoms < state->right.atoms.len - && diff_atom_same(&state->left.atoms.head[equal_atoms], &state->right.atoms.head[equal_atoms])) - equal_atoms++; - if (equal_atoms) { - if (!diff_state_add_chunk(state, true, - &state->left.atoms.head[0], equal_atoms, - &state->right.atoms.head[0], equal_atoms)) - return DIFF_RC_ENOMEM; - } - - /* Add a "minus" chunk with all lines from the left. */ - if (equal_atoms < state->left.atoms.len) { - if (!diff_state_add_chunk(state, true, - &state->left.atoms.head[equal_atoms], - state->left.atoms.len - equal_atoms, - NULL, 0)) - return DIFF_RC_ENOMEM; - } - - /* Add a "plus" chunk with all lines from the right. */ - if (equal_atoms < state->right.atoms.len) { - if (!diff_state_add_chunk(state, true, - NULL, 0, - &state->right.atoms.head[equal_atoms], - state->right.atoms.len - equal_atoms)) - return DIFF_RC_ENOMEM; - } - return DIFF_RC_OK; -} - -enum diff_rc diff_run_algo(const struct diff_algo_config *algo_config, struct diff_state *state) -{ - enum diff_rc rc; - ARRAYLIST_FREE(state->temp_result); - - if (!algo_config || !algo_config->impl || !state->recursion_depth_left) { - debug("MAX RECURSION REACHED, just dumping diff chunks\n"); - return diff_algo_none(algo_config, state); - } - - ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE); - rc = algo_config->impl(algo_config, state); - switch (rc) { - case DIFF_RC_USE_DIFF_ALGO_FALLBACK: - debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n", algo_config->fallback_algo); - rc = diff_run_algo(algo_config->fallback_algo, state); - goto return_rc; - - case DIFF_RC_OK: - /* continue below */ - break; - - default: - /* some error happened */ - goto return_rc; - } - - /* Pick up any diff chunks that are still unsolved and feed to inner_algo. - * inner_algo will solve unsolved chunks and append to result, - * and subsequent solved chunks on this level are then appended to result afterwards. */ - int i; - for (i = 0; i < state->temp_result.len; i++) { - struct diff_chunk *c = &state->temp_result.head[i]; - if (c->solved) { - struct diff_chunk *final_c; - ARRAYLIST_ADD(final_c, state->result->chunks); - if (!final_c) { - rc = DIFF_RC_ENOMEM; - goto return_rc; - } - *final_c = *c; - continue; - } - - /* c is an unsolved chunk, feed to inner_algo */ - struct diff_state inner_state = { - .result = state->result, - .recursion_depth_left = state->recursion_depth_left - 1, - }; - diff_data_init_subsection(&inner_state.left, &state->left, c->left_start, c->left_count); - diff_data_init_subsection(&inner_state.right, &state->right, c->right_start, c->right_count); - - rc = diff_run_algo(algo_config->inner_algo, &inner_state); - - if (rc != DIFF_RC_OK) - goto return_rc; - } - - rc = DIFF_RC_OK; -return_rc: - ARRAYLIST_FREE(state->temp_result); - return rc; -} - -struct diff_result *diff_main(const struct diff_config *config, - const uint8_t *left_data, size_t left_len, - const uint8_t *right_data, size_t right_len) -{ - struct diff_result *result = malloc(sizeof(struct diff_result)); - if (!result) - return NULL; - - *result = (struct diff_result){}; - diff_data_init_root(&result->left, left_data, left_len); - diff_data_init_root(&result->right, right_data, right_len); - - if (!config->atomize_func) { - result->rc = DIFF_RC_EINVAL; - return result; - } - - result->rc = config->atomize_func(config->atomize_func_data, &result->left, &result->right); - if (result->rc != DIFF_RC_OK) - return result; - - struct diff_state state = { - .result = result, - .recursion_depth_left = config->max_recursion_depth ? : 1024, - }; - diff_data_init_subsection(&state.left, &result->left, result->left.atoms.head, result->left.atoms.len); - diff_data_init_subsection(&state.right, &result->right, result->right.atoms.head, result->right.atoms.len); - - result->rc = diff_run_algo(config->algo, &state); - - return result; -} - -void diff_result_free(struct diff_result *result) -{ - if (!result) - return; - diff_data_free(&result->left); - diff_data_free(&result->right); - ARRAYLIST_FREE(result->chunks); - free(result); -} blob - cba927a4a5813ed3a1afa1247227a9637271e176 (mode 644) blob + /dev/null --- lib/diff_myers.c +++ /dev/null @@ -1,1043 +0,0 @@ -/* Myers diff algorithm implementation, invented by Eugene W. Myers [1]. - * Implementations of both the Myers Divide Et Impera (using linear space) - * and the canonical Myers algorithm (using quadratic space). */ -/* - * Copyright (c) 2020 Neels Hofmeyr - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include - -#include "debug.h" - -/* Myers' diff algorithm [1] is nicely explained in [2]. - * [1] http://www.xmailserver.org/diff2.pdf - * [2] https://blog.jcoglan.com/2017/02/12/the-myers-diff-algorithm-part-1/ ff. - * - * Myers approaches finding the smallest diff as a graph problem. - * The crux is that the original algorithm requires quadratic amount of memory: - * both sides' lengths added, and that squared. So if we're diffing lines of text, two files with 1000 lines each would - * blow up to a matrix of about 2000 * 2000 ints of state, about 16 Mb of RAM to figure out 2 kb of text. - * The solution is using Myers' "divide and conquer" extension algorithm, which does the original traversal from both - * ends of the files to reach a middle where these "snakes" touch, hence does not need to backtrace the traversal, and - * so gets away with only keeping a single column of that huge state matrix in memory. - * - * Todo: the divide and conquer requires linear *space*, not necessarily linear *time*. It recurses, apparently doing - * multiple Myers passes, and also it apparently favors fragmented diffs in cases where chunks of text were moved to a - * different place. Up to a given count of diff atoms (text lines), it might be desirable to accept the quadratic memory - * usage, get nicer diffs and less re-iteration of the same data? - */ - -struct diff_box { - unsigned int left_start; - unsigned int left_end; - unsigned int right_start; - unsigned int right_end; -}; - -#define diff_box_empty(DIFF_SNAKE) ((DIFF_SNAKE)->left_end == 0) - - -/* If the two contents of a file are A B C D E and X B C Y, - * the Myers diff graph looks like: - * - * k0 k1 - * \ \ - * k-1 0 1 2 3 4 5 - * \ A B C D E - * 0 o-o-o-o-o-o - * X | | | | | | - * 1 o-o-o-o-o-o - * B | |\| | | | - * 2 o-o-o-o-o-o - * C | | |\| | | - * 3 o-o-o-o-o-o - * Y | | | | | |\ - * 4 o-o-o-o-o-o c1 - * \ \ - * c-1 c0 - * - * Moving right means delete an atom from the left-hand-side, - * Moving down means add an atom from the right-hand-side. - * Diagonals indicate identical atoms on both sides, the challenge is to use as many diagonals as possible. - * - * The original Myers algorithm walks all the way from the top left to the bottom right, remembers all steps, and then - * backtraces to find the shortest path. However, that requires keeping the entire graph in memory, which needs - * quadratic space. - * - * Myers adds a variant that uses linear space -- note, not linear time, only linear space: walk forward and backward, - * find a meeting point in the middle, and recurse on the two separate sections. This is called "divide and conquer". - * - * d: the step number, starting with 0, a.k.a. the distance from the starting point. - * k: relative index in the state array for the forward scan, indicating on which diagonal through the diff graph we - * currently are. - * c: relative index in the state array for the backward scan, indicating the diagonal number from the bottom up. - * - * The "divide and conquer" traversal through the Myers graph looks like this: - * - * | d= 0 1 2 3 2 1 0 - * ----+-------------------------------------------- - * k= | c= - * 4 | 3 - * | - * 3 | 3,0 5,2 2 - * | / \ - * 2 | 2,0 5,3 1 - * | / \ - * 1 | 1,0 4,3 >= 4,3 5,4<-- 0 - * | / / \ / - * 0 | -->0,0 3,3 4,4 -1 - * | \ / / - * -1 | 0,1 1,2 3,4 -2 - * | \ / - * -2 | 0,2 -3 - * | \ - * | 0,3 - * | forward-> <-backward - * - * x,y pairs here are the coordinates in the Myers graph: - * x = atom index in left-side source, y = atom index in the right-side source. - * - * Only one forward column and one backward column are kept in mem, each need at most left.len + 1 + right.len items. - * Note that each d step occupies either the even or the odd items of a column: if e.g. the previous column is in the - * odd items, the next column is formed in the even items, without overwriting the previous column's results. - * - * Also note that from the diagonal index k and the x coordinate, the y coordinate can be derived: - * y = x - k - * Hence the state array only needs to keep the x coordinate, i.e. the position in the left-hand file, and the y - * coordinate, i.e. position in the right-hand file, is derived from the index in the state array. - * - * The two traces meet at 4,3, the first step (here found in the forward traversal) where a forward position is on or - * past a backward traced position on the same diagonal. - * - * This divides the problem space into: - * - * 0 1 2 3 4 5 - * A B C D E - * 0 o-o-o-o-o - * X | | | | | - * 1 o-o-o-o-o - * B | |\| | | - * 2 o-o-o-o-o - * C | | |\| | - * 3 o-o-o-o-*-o *: forward and backward meet here - * Y | | - * 4 o-o - * - * Doing the same on each section lead to: - * - * 0 1 2 3 4 5 - * A B C D E - * 0 o-o - * X | | - * 1 o-b b: backward d=1 first reaches here (sliding up the snake) - * B \ f: then forward d=2 reaches here (sliding down the snake) - * 2 o As result, the box from b to f is found to be identical; - * C \ leaving a top box from 0,0 to 1,1 and a bottom trivial tail 3,3 to 4,3. - * 3 f-o - * - * 3 o-* - * Y | - * 4 o *: forward and backward meet here - * - * and solving the last top left box gives: - * - * 0 1 2 3 4 5 - * A B C D E -A - * 0 o-o +X - * X | B - * 1 o C - * B \ -D - * 2 o -E - * C \ +Y - * 3 o-o-o - * Y | - * 4 o - * - */ - -#define xk_to_y(X, K) ((X) - (K)) -#define xc_to_y(X, C, DELTA) ((X) - (C) + (DELTA)) -#define k_to_c(K, DELTA) ((K) + (DELTA)) -#define c_to_k(C, DELTA) ((C) - (DELTA)) - -/* Do one forwards step in the "divide and conquer" graph traversal. - * left: the left side to diff. - * right: the right side to diff against. - * kd_forward: the traversal state for forwards traversal, modified by this function. - * This is carried over between invocations with increasing d. - * kd_forward points at the center of the state array, allowing negative indexes. - * kd_backward: the traversal state for backwards traversal, to find a meeting point. - * Since forwards is done first, kd_backward will be valid for d - 1, not d. - * kd_backward points at the center of the state array, allowing negative indexes. - * d: Step or distance counter, indicating for what value of d the kd_forward should be populated. - * For d == 0, kd_forward[0] is initialized, i.e. the first invocation should be for d == 0. - * meeting_snake: resulting meeting point, if any. - */ -static void diff_divide_myers_forward(struct diff_data *left, struct diff_data *right, - int *kd_forward, int *kd_backward, int d, - struct diff_box *meeting_snake) -{ - int delta = (int)right->atoms.len - (int)left->atoms.len; - int prev_x; - int prev_y; - int k; - int x; - - debug("-- %s d=%d\n", __func__, d); - debug_dump_myers_graph(left, right, NULL); - - for (k = d; k >= -d; k -= 2) { - if (k < -(int)right->atoms.len || k > (int)left->atoms.len) { - /* This diagonal is completely outside of the Myers graph, don't calculate it. */ - if (k < -(int)right->atoms.len) - debug(" %d k < -(int)right->atoms.len %d\n", k, -(int)right->atoms.len); - else - debug(" %d k > left->atoms.len %d\n", k, left->atoms.len); - if (k < 0) { - /* We are traversing negatively, and already below the entire graph, nothing will come - * of this. */ - debug(" break"); - break; - } - debug(" continue"); - continue; - } - debug("- k = %d\n", k); - if (d == 0) { - /* This is the initializing step. There is no prev_k yet, get the initial x from the top left of - * the Myers graph. */ - x = 0; - } - /* Favoring "-" lines first means favoring moving rightwards in the Myers graph. - * For this, all k should derive from k - 1, only the bottom most k derive from k + 1: - * - * | d= 0 1 2 - * ----+---------------- - * k= | - * 2 | 2,0 <-- from prev_k = 2 - 1 = 1 - * | / - * 1 | 1,0 - * | / - * 0 | -->0,0 3,3 - * | \\ / - * -1 | 0,1 <-- bottom most for d=1 from prev_k = -1 + 1 = 0 - * | \\ - * -2 | 0,2 <-- bottom most for d=2 from prev_k = -2 + 1 = -1 - * - * Except when a k + 1 from a previous run already means a further advancement in the graph. - * If k == d, there is no k + 1 and k - 1 is the only option. - * If k < d, use k + 1 in case that yields a larger x. Also use k + 1 if k - 1 is outside the graph. - */ - else if (k > -d && (k == d - || (k - 1 >= -(int)right->atoms.len - && kd_forward[k - 1] >= kd_forward[k + 1]))) { - /* Advance from k - 1. - * From position prev_k, step to the right in the Myers graph: x += 1. - */ - int prev_k = k - 1; - prev_x = kd_forward[prev_k]; - prev_y = xk_to_y(prev_x, prev_k); - x = prev_x + 1; - } else { - /* The bottom most one. - * From position prev_k, step to the bottom in the Myers graph: y += 1. - * Incrementing y is achieved by decrementing k while keeping the same x. - * (since we're deriving y from y = x - k). - */ - int prev_k = k + 1; - prev_x = kd_forward[prev_k]; - prev_y = xk_to_y(prev_x, prev_k); - x = prev_x; - } - - /* Slide down any snake that we might find here. */ - while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len - && diff_atom_same(&left->atoms.head[x], &right->atoms.head[xk_to_y(x, k)])) - x++; - kd_forward[k] = x; - - if (DEBUG) { - int fi; - for (fi = d; fi >= k; fi--) { - debug("kd_forward[%d] = (%d, %d)\n", fi, kd_forward[fi], kd_forward[fi] - fi); - /* - if (kd_forward[fi] >= 0 && kd_forward[fi] < left->atoms.len) - debug_dump_atom(left, right, &left->atoms.head[kd_forward[fi]]); - else - debug("\n"); - if (kd_forward[fi]-fi >= 0 && kd_forward[fi]-fi < right->atoms.len) - debug_dump_atom(right, left, &right->atoms.head[kd_forward[fi]-fi]); - else - debug("\n"); - */ - } - } - - if (x < 0 || x > left->atoms.len - || xk_to_y(x, k) < 0 || xk_to_y(x, k) > right->atoms.len) - continue; - - /* Figured out a new forwards traversal, see if this has gone onto or even past a preceding backwards - * traversal. - * - * If the delta in length is odd, then d and backwards_d hit the same state indexes: - * | d= 0 1 2 1 0 - * ----+---------------- ---------------- - * k= | c= - * 4 | 3 - * | - * 3 | 2 - * | same - * 2 | 2,0====5,3 1 - * | / \ - * 1 | 1,0 5,4<-- 0 - * | / / - * 0 | -->0,0 3,3====4,4 -1 - * | \ / - * -1 | 0,1 -2 - * | \ - * -2 | 0,2 -3 - * | - * - * If the delta is even, they end up off-by-one, i.e. on different diagonals: - * - * | d= 0 1 2 1 0 - * ----+---------------- ---------------- - * | c= - * 3 | 3 - * | - * 2 | 2,0 off 2 - * | / \\ - * 1 | 1,0 4,3 1 - * | / // \ - * 0 | -->0,0 3,3 4,4<-- 0 - * | \ / / - * -1 | 0,1 3,4 -1 - * | \ // - * -2 | 0,2 -2 - * | - * - * So in the forward path, we can only match up diagonals when the delta is odd. - */ - /* Forwards is done first, so the backwards one was still at d - 1. Can't do this for d == 0. */ - int backwards_d = d - 1; - if ((delta & 1) && (backwards_d >= 0)) { - debug("backwards_d = %d\n", backwards_d); - - /* If both sides have the same length, forward and backward start on the same diagonal, meaning the - * backwards state index c == k. - * As soon as the lengths are not the same, the backwards traversal starts on a different diagonal, and - * c = k shifted by the difference in length. - */ - int c = k_to_c(k, delta); - - /* When the file sizes are very different, the traversal trees start on far distant diagonals. - * They don't necessarily meet straight on. See whether this forward value is on a diagonal that - * is also valid in kd_backward[], and match them if so. */ - if (c >= -backwards_d && c <= backwards_d) { - /* Current k is on a diagonal that exists in kd_backward[]. If the two x positions have - * met or passed (forward walked onto or past backward), then we've found a midpoint / a - * mid-box. - * - * But we need to avoid matching a situation like this: - * 0 1 - * x y - * 0 o-o-o - * x |\| | - * 1 o-o-o - * y | |\| - * 2 (B)o-o <--(B) backwards traversal reached here - * a | | | - * 3 o-o-o<-- prev_x, prev_y - * b | | | - * 4 o-o(F) <--(F) forwards traversal reached here - * x |\| | Now both are on the same diagonal and look like they passed, - * 5 o-o-o but actually they have sneaked past each other and have not met. - * y | |\| - * 6 o-o-o - * - * The solution is to notice that prev_x,prev_y were also already past (B). - */ - int backward_x = kd_backward[c]; - int backward_y = xc_to_y(backward_x, c, delta); - debug(" prev_x,y = (%d,%d) c%d:backward_x,y = (%d,%d) k%d:x,y = (%d,%d)\n", - prev_x, prev_y, c, backward_x, backward_y, k, x, xk_to_y(x, k)); - if (prev_x <= backward_x && prev_y <= backward_y - && x >= backward_x) { - *meeting_snake = (struct diff_box){ - .left_start = backward_x, - .left_end = x, - .right_start = xc_to_y(backward_x, c, delta), - .right_end = xk_to_y(x, k), - }; - debug("HIT x=(%u,%u) - y=(%u,%u)\n", - meeting_snake->left_start, - meeting_snake->right_start, - meeting_snake->left_end, - meeting_snake->right_end); - return; - } - } - } - } -} - -/* Do one backwards step in the "divide and conquer" graph traversal. - * left: the left side to diff. - * right: the right side to diff against. - * kd_forward: the traversal state for forwards traversal, to find a meeting point. - * Since forwards is done first, after this, both kd_forward and kd_backward will be valid for d. - * kd_forward points at the center of the state array, allowing negative indexes. - * kd_backward: the traversal state for backwards traversal, to find a meeting point. - * This is carried over between invocations with increasing d. - * kd_backward points at the center of the state array, allowing negative indexes. - * d: Step or distance counter, indicating for what value of d the kd_backward should be populated. - * Before the first invocation, kd_backward[0] shall point at the bottom right of the Myers graph - * (left.len, right.len). - * The first invocation will be for d == 1. - * meeting_snake: resulting meeting point, if any. - */ -static void diff_divide_myers_backward(struct diff_data *left, struct diff_data *right, - int *kd_forward, int *kd_backward, int d, - struct diff_box *meeting_snake) -{ - int delta = (int)right->atoms.len - (int)left->atoms.len; - int prev_x; - int prev_y; - int c; - int x; - - debug("-- %s d=%d\n", __func__, d); - debug_dump_myers_graph(left, right, NULL); - - for (c = d; c >= -d; c -= 2) { - if (c < -(int)left->atoms.len || c > (int)right->atoms.len) { - /* This diagonal is completely outside of the Myers graph, don't calculate it. */ - if (c < -(int)left->atoms.len) - debug(" %d c < -(int)left->atoms.len %d\n", c, -(int)left->atoms.len); - else - debug(" %d c > right->atoms.len %d\n", c, right->atoms.len); - if (c < 0) { - /* We are traversing negatively, and already below the entire graph, nothing will come - * of this. */ - debug(" break"); - break; - } - debug(" continue"); - continue; - } - debug("- c = %d\n", c); - if (d == 0) { - /* This is the initializing step. There is no prev_c yet, get the initial x from the bottom - * right of the Myers graph. */ - x = left->atoms.len; - } - /* Favoring "-" lines first means favoring moving rightwards in the Myers graph. - * For this, all c should derive from c - 1, only the bottom most c derive from c + 1: - * - * 2 1 0 - * --------------------------------------------------- - * c= - * 3 - * - * from prev_c = c - 1 --> 5,2 2 - * \ - * 5,3 1 - * \ - * 4,3 5,4<-- 0 - * \ / - * bottom most for d=1 from c + 1 --> 4,4 -1 - * / - * bottom most for d=2 --> 3,4 -2 - * - * Except when a c + 1 from a previous run already means a further advancement in the graph. - * If c == d, there is no c + 1 and c - 1 is the only option. - * If c < d, use c + 1 in case that yields a larger x. Also use c + 1 if c - 1 is outside the graph. - */ - else if (c > -d && (c == d - || (c - 1 >= -(int)right->atoms.len - && kd_backward[c - 1] <= kd_backward[c + 1]))) { - /* A top one. - * From position prev_c, step upwards in the Myers graph: y -= 1. - * Decrementing y is achieved by incrementing c while keeping the same x. - * (since we're deriving y from y = x - c + delta). - */ - int prev_c = c - 1; - prev_x = kd_backward[prev_c]; - prev_y = xc_to_y(prev_x, prev_c, delta); - x = prev_x; - } else { - /* The bottom most one. - * From position prev_c, step to the left in the Myers graph: x -= 1. - */ - int prev_c = c + 1; - prev_x = kd_backward[prev_c]; - prev_y = xc_to_y(prev_x, prev_c, delta); - x = prev_x - 1; - } - - /* Slide up any snake that we might find here. */ - debug("c=%d x-1=%d Yb-1=%d-1=%d\n", c, x-1, xc_to_y(x, c, delta), xc_to_y(x, c, delta)-1); - if (x > 0) { - debug(" l="); debug_dump_atom(left, right, &left->atoms.head[x-1]); - } - if (xc_to_y(x, c, delta) > 0) { - debug(" r="); debug_dump_atom(right, left, &right->atoms.head[xc_to_y(x, c, delta)-1]); - } - while (x > 0 && xc_to_y(x, c, delta) > 0 - && diff_atom_same(&left->atoms.head[x-1], &right->atoms.head[xc_to_y(x, c, delta)-1])) - x--; - kd_backward[c] = x; - - if (DEBUG) { - int fi; - for (fi = d; fi >= c; fi--) { - debug("kd_backward[%d] = (%d, %d)\n", fi, kd_backward[fi], - kd_backward[fi] - fi + delta); - /* - if (kd_backward[fi] >= 0 && kd_backward[fi] < left->atoms.len) - debug_dump_atom(left, right, &left->atoms.head[kd_backward[fi]]); - else - debug("\n"); - if (kd_backward[fi]-fi+delta >= 0 && kd_backward[fi]-fi+delta < right->atoms.len) - debug_dump_atom(right, left, &right->atoms.head[kd_backward[fi]-fi+delta]); - else - debug("\n"); - */ - } - } - - if (x < 0 || x > left->atoms.len - || xc_to_y(x, c, delta) < 0 || xc_to_y(x, c, delta) > right->atoms.len) - continue; - - /* Figured out a new backwards traversal, see if this has gone onto or even past a preceding forwards - * traversal. - * - * If the delta in length is even, then d and backwards_d hit the same state indexes -- note how this is - * different from in the forwards traversal, because now both d are the same: - * - * | d= 0 1 2 2 1 0 - * ----+---------------- -------------------- - * k= | c= - * 4 | - * | - * 3 | 3 - * | same - * 2 | 2,0====5,2 2 - * | / \ - * 1 | 1,0 5,3 1 - * | / / \ - * 0 | -->0,0 3,3====4,3 5,4<-- 0 - * | \ / / - * -1 | 0,1 4,4 -1 - * | \ - * -2 | 0,2 -2 - * | - * -3 - * If the delta is odd, they end up off-by-one, i.e. on different diagonals. - * So in the backward path, we can only match up diagonals when the delta is even. - */ - if ((delta & 1) == 0) { - /* Forwards was done first, now both d are the same. */ - int forwards_d = d; - - /* As soon as the lengths are not the same, the backwards traversal starts on a different diagonal, and - * c = k shifted by the difference in length. - */ - int k = c_to_k(c, delta); - - /* When the file sizes are very different, the traversal trees start on far distant diagonals. - * They don't necessarily meet straight on. See whether this backward value is also on a valid - * diagonal in kd_forward[], and match them if so. */ - if (k >= -forwards_d && k <= forwards_d) { - /* Current c is on a diagonal that exists in kd_forward[]. If the two x positions have - * met or passed (backward walked onto or past forward), then we've found a midpoint / a - * mid-box. */ - int forward_x = kd_forward[k]; - int forward_y = xk_to_y(forward_x, k); - debug("Compare %d to %d k=%d (x=%d,y=%d) to (x=%d,y=%d)\n", - forward_x, x, k, - forward_x, xk_to_y(forward_x, k), x, xc_to_y(x, c, delta)); - if (forward_x <= prev_x && forward_y <= prev_y - && forward_x >= x) { - *meeting_snake = (struct diff_box){ - .left_start = x, - .left_end = forward_x, - .right_start = xc_to_y(x, c, delta), - .right_end = xk_to_y(forward_x, k), - }; - debug("HIT x=%u,%u - y=%u,%u\n", - meeting_snake->left_start, - meeting_snake->right_start, - meeting_snake->left_end, - meeting_snake->right_end); - return; - } - } - } - } -} - -/* Myers "Divide et Impera": tracing forwards from the start and backwards from the end to find a midpoint that divides - * the problem into smaller chunks. Requires only linear amounts of memory. */ -enum diff_rc diff_algo_myers_divide(const struct diff_algo_config *algo_config, struct diff_state *state) -{ - enum diff_rc rc = DIFF_RC_ENOMEM; - struct diff_data *left = &state->left; - struct diff_data *right = &state->right; - - debug("\n** %s\n", __func__); - debug("left:\n"); - debug_dump(left); - debug("right:\n"); - debug_dump(right); - debug_dump_myers_graph(left, right, NULL); - - /* Allocate two columns of a Myers graph, one for the forward and one for the backward traversal. */ - unsigned int max = left->atoms.len + right->atoms.len; - size_t kd_len = max + 1; - size_t kd_buf_size = kd_len << 1; - int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int)); - if (!kd_buf) - return DIFF_RC_ENOMEM; - int i; - for (i = 0; i < kd_buf_size; i++) - kd_buf[i] = -1; - int *kd_forward = kd_buf; - int *kd_backward = kd_buf + kd_len; - - /* The 'k' axis in Myers spans positive and negative indexes, so point the kd to the middle. - * It is then possible to index from -max/2 .. max/2. */ - kd_forward += max/2; - kd_backward += max/2; - - int d; - struct diff_box mid_snake = {}; - for (d = 0; d <= (max/2); d++) { - debug("-- d=%d\n", d); - diff_divide_myers_forward(left, right, kd_forward, kd_backward, d, &mid_snake); - if (!diff_box_empty(&mid_snake)) - break; - diff_divide_myers_backward(left, right, kd_forward, kd_backward, d, &mid_snake); - if (!diff_box_empty(&mid_snake)) - break; - } - - if (diff_box_empty(&mid_snake)) { - /* Divide and conquer failed to find a meeting point. Use the fallback_algo defined in the algo_config - * (leave this to the caller). This is just paranoia/sanity, we normally should always find a midpoint. - */ - debug(" no midpoint \n"); - rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK; - goto return_rc; - } else { - debug(" mid snake L: %u to %u of %u R: %u to %u of %u\n", - mid_snake.left_start, mid_snake.left_end, left->atoms.len, - mid_snake.right_start, mid_snake.right_end, right->atoms.len); - - /* Section before the mid-snake. */ - debug("Section before the mid-snake\n"); - - struct diff_atom *left_atom = &left->atoms.head[0]; - unsigned int left_section_len = mid_snake.left_start; - struct diff_atom *right_atom = &right->atoms.head[0]; - unsigned int right_section_len = mid_snake.right_start; - - if (left_section_len && right_section_len) { - /* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */ - if (!diff_state_add_chunk(state, false, - left_atom, left_section_len, - right_atom, right_section_len)) - goto return_rc; - } else if (left_section_len && !right_section_len) { - /* Only left atoms and none on the right, they form a "minus" chunk, then. */ - if (!diff_state_add_chunk(state, true, - left_atom, left_section_len, - right_atom, 0)) - goto return_rc; - } else if (!left_section_len && right_section_len) { - /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */ - if (!diff_state_add_chunk(state, true, - left_atom, 0, - right_atom, right_section_len)) - goto return_rc; - } - /* else: left_section_len == 0 and right_section_len == 0, i.e. nothing before the mid-snake. */ - - /* the mid-snake, identical data on both sides: */ - debug("the mid-snake\n"); - if (!diff_state_add_chunk(state, true, - &left->atoms.head[mid_snake.left_start], - mid_snake.left_end - mid_snake.left_start, - &right->atoms.head[mid_snake.right_start], - mid_snake.right_end - mid_snake.right_start)) - goto return_rc; - - /* Section after the mid-snake. */ - debug("Section after the mid-snake\n"); - debug(" left_end %u right_end %u\n", mid_snake.left_end, mid_snake.right_end); - debug(" left_count %u right_count %u\n", left->atoms.len, right->atoms.len); - left_atom = &left->atoms.head[mid_snake.left_end]; - left_section_len = left->atoms.len - mid_snake.left_end; - right_atom = &right->atoms.head[mid_snake.right_end]; - right_section_len = right->atoms.len - mid_snake.right_end; - - if (left_section_len && right_section_len) { - /* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */ - if (!diff_state_add_chunk(state, false, - left_atom, left_section_len, - right_atom, right_section_len)) - goto return_rc; - } else if (left_section_len && !right_section_len) { - /* Only left atoms and none on the right, they form a "minus" chunk, then. */ - if (!diff_state_add_chunk(state, true, - left_atom, left_section_len, - right_atom, 0)) - goto return_rc; - } else if (!left_section_len && right_section_len) { - /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */ - if (!diff_state_add_chunk(state, true, - left_atom, 0, - right_atom, right_section_len)) - goto return_rc; - } - /* else: left_section_len == 0 and right_section_len == 0, i.e. nothing after the mid-snake. */ - } - - rc = DIFF_RC_OK; - -return_rc: - free(kd_buf); - debug("** END %s\n", __func__); - return rc; -} - -/* Myers Diff tracing from the start all the way through to the end, requiring quadratic amounts of memory. This can - * fail if the required space surpasses algo_config->permitted_state_size. */ -enum diff_rc diff_algo_myers(const struct diff_algo_config *algo_config, struct diff_state *state) -{ - /* do a diff_divide_myers_forward() without a _backward(), so that it walks forward across the entire - * files to reach the end. Keep each run's state, and do a final backtrace. */ - enum diff_rc rc = DIFF_RC_ENOMEM; - struct diff_data *left = &state->left; - struct diff_data *right = &state->right; - - debug("\n** %s\n", __func__); - debug("left:\n"); - debug_dump(left); - debug("right:\n"); - debug_dump(right); - debug_dump_myers_graph(left, right, NULL); - - /* Allocate two columns of a Myers graph, one for the forward and one for the backward traversal. */ - unsigned int max = left->atoms.len + right->atoms.len; - size_t kd_len = max + 1 + max; - size_t kd_buf_size = kd_len * kd_len; - debug("state size: %zu\n", kd_buf_size); - if (kd_buf_size < kd_len /* overflow? */ - || kd_buf_size * sizeof(int) > algo_config->permitted_state_size) { - debug("state size %zu > permitted_state_size %zu, use fallback_algo\n", - kd_buf_size, algo_config->permitted_state_size); - return DIFF_RC_USE_DIFF_ALGO_FALLBACK; - } - - int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int)); - if (!kd_buf) - return DIFF_RC_ENOMEM; - int i; - for (i = 0; i < kd_buf_size; i++) - kd_buf[i] = -1; - - /* The 'k' axis in Myers spans positive and negative indexes, so point the kd to the middle. - * It is then possible to index from -max .. max. */ - int *kd_origin = kd_buf + max; - int *kd_column = kd_origin; - - int d; - int backtrack_d = -1; - int backtrack_k = 0; - int k; - int x, y; - for (d = 0; d <= max; d++, kd_column += kd_len) { - debug("-- d=%d\n", d); - - debug("-- %s d=%d\n", __func__, d); - - for (k = d; k >= -d; k -= 2) { - if (k < -(int)right->atoms.len || k > (int)left->atoms.len) { - /* This diagonal is completely outside of the Myers graph, don't calculate it. */ - if (k < -(int)right->atoms.len) - debug(" %d k < -(int)right->atoms.len %d\n", k, -(int)right->atoms.len); - else - debug(" %d k > left->atoms.len %d\n", k, left->atoms.len); - if (k < 0) { - /* We are traversing negatively, and already below the entire graph, nothing will come - * of this. */ - debug(" break"); - break; - } - debug(" continue"); - continue; - } - - debug("- k = %d\n", k); - if (d == 0) { - /* This is the initializing step. There is no prev_k yet, get the initial x from the top left of - * the Myers graph. */ - x = 0; - } else { - int *kd_prev_column = kd_column - kd_len; - - /* Favoring "-" lines first means favoring moving rightwards in the Myers graph. - * For this, all k should derive from k - 1, only the bottom most k derive from k + 1: - * - * | d= 0 1 2 - * ----+---------------- - * k= | - * 2 | 2,0 <-- from prev_k = 2 - 1 = 1 - * | / - * 1 | 1,0 - * | / - * 0 | -->0,0 3,3 - * | \\ / - * -1 | 0,1 <-- bottom most for d=1 from prev_k = -1 + 1 = 0 - * | \\ - * -2 | 0,2 <-- bottom most for d=2 from prev_k = -2 + 1 = -1 - * - * Except when a k + 1 from a previous run already means a further advancement in the graph. - * If k == d, there is no k + 1 and k - 1 is the only option. - * If k < d, use k + 1 in case that yields a larger x. Also use k + 1 if k - 1 is outside the graph. - */ - if (k > -d && (k == d - || (k - 1 >= -(int)right->atoms.len - && kd_prev_column[k - 1] >= kd_prev_column[k + 1]))) { - /* Advance from k - 1. - * From position prev_k, step to the right in the Myers graph: x += 1. - */ - int prev_k = k - 1; - int prev_x = kd_prev_column[prev_k]; - x = prev_x + 1; - } else { - /* The bottom most one. - * From position prev_k, step to the bottom in the Myers graph: y += 1. - * Incrementing y is achieved by decrementing k while keeping the same x. - * (since we're deriving y from y = x - k). - */ - int prev_k = k + 1; - int prev_x = kd_prev_column[prev_k]; - x = prev_x; - } - } - - /* Slide down any snake that we might find here. */ - while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len - && diff_atom_same(&left->atoms.head[x], &right->atoms.head[xk_to_y(x, k)])) - x++; - kd_column[k] = x; - - if (DEBUG) { - int fi; - for (fi = d; fi >= k; fi-=2) { - debug("kd_column[%d] = (%d, %d)\n", fi, kd_column[fi], kd_column[fi] - fi); -#if 0 - if (kd_column[fi] >= 0 && kd_column[fi] < left->atoms.len) - debug_dump_atom(left, right, &left->atoms.head[kd_column[fi]]); - else - debug("\n"); - if (kd_column[fi]-fi >= 0 && kd_column[fi]-fi < right->atoms.len) - debug_dump_atom(right, left, &right->atoms.head[kd_column[fi]-fi]); - else - debug("\n"); -#endif - } - } - - if (x == left->atoms.len && xk_to_y(x, k) == right->atoms.len) { - /* Found a path */ - backtrack_d = d; - backtrack_k = k; - debug("Reached the end at d = %d, k = %d\n", - backtrack_d, backtrack_k); - break; - } - } - - if (backtrack_d >= 0) - break; - } - - debug_dump_myers_graph(left, right, kd_origin); - - /* backtrack. A matrix spanning from start to end of the file is ready: - * - * | d= 0 1 2 3 4 - * ----+--------------------------------- - * k= | - * 3 | - * | - * 2 | 2,0 - * | / - * 1 | 1,0 4,3 - * | / / \ - * 0 | -->0,0 3,3 4,4 --> backtrack_d = 4, backtrack_k = 0 - * | \ / \ - * -1 | 0,1 3,4 - * | \ - * -2 | 0,2 - * | - * - * From (4,4) backwards, find the previous position that is the largest, and remember it. - * - */ - for (d = backtrack_d, k = backtrack_k; d >= 0; d--) { - x = kd_column[k]; - y = xk_to_y(x, k); - - /* When the best position is identified, remember it for that kd_column. - * That kd_column is no longer needed otherwise, so just re-purpose kd_column[0] = x and kd_column[1] = y, - * so that there is no need to allocate more memory. - */ - kd_column[0] = x; - kd_column[1] = y; - debug("Backtrack d=%d: xy=(%d, %d)\n", - d, kd_column[0], kd_column[1]); - - /* Don't access memory before kd_buf */ - if (d == 0) - break; - int *kd_prev_column = kd_column - kd_len; - - /* When y == 0, backtracking downwards (k-1) is the only way. - * When x == 0, backtracking upwards (k+1) is the only way. - * - * | d= 0 1 2 3 4 - * ----+--------------------------------- - * k= | - * 3 | - * | ..y == 0 - * 2 | 2,0 - * | / - * 1 | 1,0 4,3 - * | / / \ - * 0 | -->0,0 3,3 4,4 --> backtrack_d = 4, backtrack_k = 0 - * | \ / \ - * -1 | 0,1 3,4 - * | \ - * -2 | 0,2__ - * | x == 0 - */ - debug("prev[k-1] = %d,%d prev[k+1] = %d,%d\n", - kd_prev_column[k-1], xk_to_y(kd_prev_column[k-1],k-1), - kd_prev_column[k+1], xk_to_y(kd_prev_column[k+1],k+1)); - if (y == 0 - || (x > 0 && kd_prev_column[k - 1] >= kd_prev_column[k + 1])) { - k = k - 1; - debug("prev k=k-1=%d x=%d y=%d\n", - k, kd_prev_column[k], xk_to_y(kd_prev_column[k], k)); - } else { - k = k + 1; - debug("prev k=k+1=%d x=%d y=%d\n", - k, kd_prev_column[k], xk_to_y(kd_prev_column[k], k)); - } - kd_column = kd_prev_column; - } - - /* Forwards again, this time recording the diff chunks. - * Definitely start from 0,0. kd_column[0] may actually point to the bottom of a snake starting at 0,0 */ - x = 0; - y = 0; - - kd_column = kd_origin; - for (d = 0; d <= backtrack_d; d++, kd_column += kd_len) { - int next_x = kd_column[0]; - int next_y = kd_column[1]; - debug("Forward track from xy(%d,%d) to xy(%d,%d)\n", - x, y, next_x, next_y); - - struct diff_atom *left_atom = &left->atoms.head[x]; - int left_section_len = next_x - x; - struct diff_atom *right_atom = &right->atoms.head[y]; - int right_section_len = next_y - y; - - rc = DIFF_RC_ENOMEM; - if (left_section_len && right_section_len) { - /* This must be a snake slide. - * Snake slides have a straight line leading into them (except when starting at (0,0)). Find - * out whether the lead-in is horizontal or vertical: - * - * left - * ----------> - * | - * r| o-o o - * i| \ | - * g| o o - * h| \ \ - * t| o o - * v - * - * If left_section_len > right_section_len, the lead-in is horizontal, meaning first - * remove one atom from the left before sliding down the snake. - * If right_section_len > left_section_len, the lead-in is vetical, so add one atom from - * the right before sliding down the snake. */ - if (left_section_len == right_section_len + 1) { - if (!diff_state_add_chunk(state, true, - left_atom, 1, - right_atom, 0)) - goto return_rc; - left_atom++; - left_section_len--; - } else if (right_section_len == left_section_len + 1) { - if (!diff_state_add_chunk(state, true, - left_atom, 0, - right_atom, 1)) - goto return_rc; - right_atom++; - right_section_len--; - } else if (left_section_len != right_section_len) { - /* The numbers are making no sense. Should never happen. */ - rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK; - goto return_rc; - } - - if (!diff_state_add_chunk(state, true, - left_atom, left_section_len, - right_atom, right_section_len)) - goto return_rc; - } else if (left_section_len && !right_section_len) { - /* Only left atoms and none on the right, they form a "minus" chunk, then. */ - if (!diff_state_add_chunk(state, true, - left_atom, left_section_len, - right_atom, 0)) - goto return_rc; - } else if (!left_section_len && right_section_len) { - /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */ - if (!diff_state_add_chunk(state, true, - left_atom, 0, - right_atom, right_section_len)) - goto return_rc; - } - - x = next_x; - y = next_y; - } - - rc = DIFF_RC_OK; - -return_rc: - free(kd_buf); - debug("** END %s rc=%d\n", __func__, rc); - return rc; -} blob - 4c76a249cac3df710e57ed404bb9a918acf41870 (mode 644) blob + /dev/null --- lib/diff_output.c +++ /dev/null @@ -1,52 +0,0 @@ -/* Common parts for printing diff output */ -/* - * Copyright (c) 2020 Neels Hofmeyr - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include - -void diff_output_lines(FILE *dest, const char *prefix, struct diff_atom *start_atom, unsigned int count) -{ - struct diff_atom *atom; - foreach_diff_atom(atom, start_atom, count) { - fprintf(dest, "%s", prefix); - int i; - unsigned int len = atom->len; - if (len && atom->at[len - 1] == '\n') { - len--; - if (len && atom->at[len - 1] == '\r') - len--; - } - - for (i = 0; i < len; i++) { - char c = atom->at[i]; - if ((c < 0x20 || c >= 0x7f) && c != '\t') - fprintf(dest, "\\x%02x", (unsigned char)c); - else - fprintf(dest, "%c", c); - } - fprintf(dest, "\n"); - } -} - -enum diff_rc diff_output_info(FILE *dest, const struct diff_input_info *info) -{ - if (info->arbitrary_info && *info->arbitrary_info) - fprintf(dest, "%s", info->arbitrary_info); - fprintf(dest, "--- %s\n+++ %s\n", - info->left_path ? : "a", - info->right_path ? : "b"); - return DIFF_RC_OK; -} blob - 796c3ee69f83bf338c67c8b310b27c2dea7501f1 (mode 644) blob + /dev/null --- lib/diff_output_plain.c +++ /dev/null @@ -1,54 +0,0 @@ -/* Output all lines of a diff_result. */ -/* - * Copyright (c) 2020 Neels Hofmeyr - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include - -enum diff_rc diff_output_plain(FILE *dest, const struct diff_input_info *info, - const struct diff_result *result) -{ - if (!result) - return DIFF_RC_EINVAL; - if (result->rc != DIFF_RC_OK) - return result->rc; - - diff_output_info(dest, info); - - int i; - for (i = 0; i < result->chunks.len; i++) { - struct diff_chunk *c = &result->chunks.head[i]; - if (c->left_count && c->right_count) - diff_output_lines(dest, c->solved ? " " : "?", c->left_start, c->left_count); - else if (c->left_count && !c->right_count) - diff_output_lines(dest, c->solved ? "-" : "?", c->left_start, c->left_count); - else if (c->right_count && !c->left_count) - diff_output_lines(dest, c->solved ? "+" : "?", c->right_start, c->right_count); - } - return DIFF_RC_OK; -} - -enum diff_rc diff_plain(FILE *dest, const struct diff_config *diff_config, - const struct diff_input_info *info, - const char *left, int left_len, const char *right, int right_len) -{ - enum diff_rc rc; - left_len = left_len < 0 ? strlen(left) : left_len; - right_len = right_len < 0 ? strlen(right) : right_len; - struct diff_result *result = diff_main(diff_config, left, left_len, right, right_len); - rc = diff_output_plain(dest, info, result); - diff_result_free(result); - return rc; -} blob - 986d06cbf5ec125bca1bfbce84d7cd3378501735 (mode 644) blob + /dev/null --- lib/diff_output_unidiff.c +++ /dev/null @@ -1,207 +0,0 @@ -/* Produce a unidiff output from a diff_result. */ -/* - * Copyright (c) 2020 Neels Hofmeyr - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include - -#include "debug.h" - -enum chunk_type { - CHUNK_EMPTY, - CHUNK_PLUS, - CHUNK_MINUS, - CHUNK_SAME, - CHUNK_WEIRD, -}; - -static inline enum chunk_type chunk_type(const struct diff_chunk *chunk) -{ - if (!chunk->left_count && !chunk->right_count) - return CHUNK_EMPTY; - if (!chunk->solved) - return CHUNK_WEIRD; - if (!chunk->right_count) - return CHUNK_MINUS; - if (!chunk->left_count) - return CHUNK_PLUS; - if (chunk->left_count != chunk->right_count) - return CHUNK_WEIRD; - return CHUNK_SAME; -} - -struct chunk_context { - struct range chunk; - struct range left, right; -}; - -static bool chunk_context_empty(const struct chunk_context *cc) -{ - return range_empty(&cc->chunk); -} - -static void chunk_context_get(struct chunk_context *cc, const struct diff_result *r, int chunk_idx, - int context_lines) -{ - const struct diff_chunk *c = &r->chunks.head[chunk_idx]; - int left_start = diff_atom_root_idx(&r->left, c->left_start); - int right_start = diff_atom_root_idx(&r->right, c->right_start); - - *cc = (struct chunk_context){ - .chunk = { - .start = chunk_idx, - .end = chunk_idx + 1, - }, - .left = { - .start = MAX(0, left_start - context_lines), - .end = MIN(r->left.atoms.len, left_start + c->left_count + context_lines), - }, - .right = { - .start = MAX(0, right_start - context_lines), - .end = MIN(r->right.atoms.len, right_start + c->right_count + context_lines), - }, - }; -} - -static bool chunk_contexts_touch(const struct chunk_context *cc, const struct chunk_context *other) -{ - return ranges_touch(&cc->chunk, &other->chunk) - || ranges_touch(&cc->left, &other->left) - || ranges_touch(&cc->right, &other->right); -} - -static void chunk_contexts_merge(struct chunk_context *cc, const struct chunk_context *other) -{ - ranges_merge(&cc->chunk, &other->chunk); - ranges_merge(&cc->left, &other->left); - ranges_merge(&cc->right, &other->right); -} - -static void diff_output_unidiff_chunk(FILE *dest, bool *info_printed, const struct diff_input_info *info, - const struct diff_result *result, const struct chunk_context *cc) -{ - if (range_empty(&cc->left) && range_empty(&cc->right)) - return; - - if (!(*info_printed)) { - diff_output_info(dest, info); - *info_printed = true; - } - - fprintf(dest, "@@ -%d,%d +%d,%d @@\n", - cc->left.start + 1, cc->left.end - cc->left.start, - cc->right.start + 1, cc->right.end - cc->right.start); - - /* Got the absolute line numbers where to start printing, and the index of the interesting (non-context) chunk. - * To print context lines above the interesting chunk, nipping on the previous chunk index may be necessary. - * It is guaranteed to be only context lines where left == right, so it suffices to look on the left. */ - const struct diff_chunk *first_chunk = &result->chunks.head[cc->chunk.start]; - int chunk_start_line = diff_atom_root_idx(&result->left, first_chunk->left_start); - if (cc->left.start < chunk_start_line) - diff_output_lines(dest, " ", &result->left.atoms.head[cc->left.start], - chunk_start_line - cc->left.start); - - /* Now write out all the joined chunks and contexts between them */ - int c_idx; - for (c_idx = cc->chunk.start; c_idx < cc->chunk.end; c_idx++) { - const struct diff_chunk *c = &result->chunks.head[c_idx]; - - if (c->left_count && c->right_count) - diff_output_lines(dest, c->solved ? " " : "?", c->left_start, c->left_count); - else if (c->left_count && !c->right_count) - diff_output_lines(dest, c->solved ? "-" : "?", c->left_start, c->left_count); - else if (c->right_count && !c->left_count) - diff_output_lines(dest, c->solved ? "+" : "?", c->right_start, c->right_count); - } - - /* Trailing context? */ - const struct diff_chunk *last_chunk = &result->chunks.head[cc->chunk.end - 1]; - int chunk_end_line = diff_atom_root_idx(&result->left, last_chunk->left_start + last_chunk->left_count); - if (cc->left.end > chunk_end_line) - diff_output_lines(dest, " ", &result->left.atoms.head[chunk_end_line], - cc->left.end - chunk_end_line); -} - -enum diff_rc diff_output_unidiff(FILE *dest, const struct diff_input_info *info, - const struct diff_result *result, unsigned int context_lines) -{ - if (!result) - return DIFF_RC_EINVAL; - if (result->rc != DIFF_RC_OK) - return result->rc; - - struct chunk_context cc = {}; - bool info_printed = false; - - int i; - for (i = 0; i < result->chunks.len; i++) { - struct diff_chunk *c = &result->chunks.head[i]; - enum chunk_type t = chunk_type(c); - - if (t == CHUNK_MINUS || t == CHUNK_PLUS) { - if (chunk_context_empty(&cc)) { - /* These are the first lines being printed. - * Note down the start point, any number of subsequent chunks may be joined up to this - * unidiff chunk by context lines or by being directly adjacent. */ - chunk_context_get(&cc, result, i, context_lines); - debug("new chunk to be printed: chunk %d-%d left %d-%d right %d-%d\n", - cc.chunk.start, cc.chunk.end, - cc.left.start, cc.left.end, cc.right.start, cc.right.end); - } else { - /* There already is a previous chunk noted down for being printed. - * Does it join up with this one? */ - struct chunk_context next; - chunk_context_get(&next, result, i, context_lines); - debug("new chunk to be printed: chunk %d-%d left %d-%d right %d-%d\n", - next.chunk.start, next.chunk.end, - next.left.start, next.left.end, next.right.start, next.right.end); - if (chunk_contexts_touch(&cc, &next)) { - /* This next context touches or overlaps the previous one, join. */ - chunk_contexts_merge(&cc, &next); - debug("new chunk to be printed touches previous chunk, now: left %d-%d right %d-%d\n", - cc.left.start, cc.left.end, cc.right.start, cc.right.end); - } else { - /* No touching, so the previous context is complete with a gap between it and - * this next one. Print the previous one and start fresh here. */ - debug("new chunk to be printed does not touch previous chunk; print left %d-%d right %d-%d\n", - cc.left.start, cc.left.end, cc.right.start, cc.right.end); - diff_output_unidiff_chunk(dest, &info_printed, info, result, &cc); - cc = next; - debug("new unprinted chunk is left %d-%d right %d-%d\n", - cc.left.start, cc.left.end, cc.right.start, cc.right.end); - } - } - } - - } - - if (!chunk_context_empty(&cc)) - diff_output_unidiff_chunk(dest, &info_printed, info, result, &cc); - return DIFF_RC_OK; -} - -enum diff_rc diff_unidiff(FILE *dest, const struct diff_config *diff_config, - const struct diff_input_info *info, - const char *left, int left_len, const char *right, int right_len, - unsigned int context_lines) -{ - enum diff_rc rc; - left_len = left_len < 0 ? strlen(left) : left_len; - right_len = right_len < 0 ? strlen(right) : right_len; - struct diff_result *result = diff_main(diff_config, left, left_len, right, right_len); - rc = diff_output_unidiff(dest, info, result, context_lines); - diff_result_free(result); - return rc; -} blob - 17a6bffe139d685b095825de47ef10ff51e508b5 (mode 644) blob + /dev/null --- lib/diff_patience.c +++ /dev/null @@ -1,414 +0,0 @@ -/* Implementation of the Patience Diff algorithm invented by Bram Cohen: - * Divide a diff problem into smaller chunks by an LCS of common-unique lines. */ -/* - * Copyright (c) 2020 Neels Hofmeyr - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include - -#include "debug.h" - -/* Set unique_here = true for all atoms that exist exactly once in this list. */ -static void diff_atoms_mark_unique(struct diff_data *d, unsigned int *unique_count) -{ - struct diff_atom *i; - unsigned int count = 0; - diff_data_foreach_atom(i, d) { - i->patience.unique_here = true; - i->patience.unique_in_both = true; - count++; - } - diff_data_foreach_atom(i, d) { - struct diff_atom *j; - - if (!i->patience.unique_here) - continue; - - diff_data_foreach_atom_from(i + 1, j, d) { - if (diff_atom_same(i, j)) { - if (i->patience.unique_here) { - i->patience.unique_here = false; - i->patience.unique_in_both = false; - count--; - } - j->patience.unique_here = false; - j->patience.unique_in_both = false; - count--; - } - } - } - if (unique_count) - *unique_count = count; -} - -/* Mark those lines as atom->patience.unique_in_both = true that appear exactly once in each side. */ -static void diff_atoms_mark_unique_in_both(struct diff_data *left, struct diff_data *right, - unsigned int *unique_in_both_count) -{ - /* Derive the final unique_in_both count without needing an explicit iteration. So this is just some - * optimiziation to save one iteration in the end. */ - unsigned int unique_in_both; - - diff_atoms_mark_unique(left, &unique_in_both); - diff_atoms_mark_unique(right, NULL); - - debug("unique_in_both %u\n", unique_in_both); - - struct diff_atom *i; - diff_data_foreach_atom(i, left) { - if (!i->patience.unique_here) - continue; - struct diff_atom *j; - int found_in_b = 0; - diff_data_foreach_atom(j, right) { - if (!diff_atom_same(i, j)) - continue; - if (!j->patience.unique_here) { - found_in_b = 2; /* or more */ - break; - } else { - found_in_b = 1; - j->patience.pos_in_other = i; - i->patience.pos_in_other = j; - } - } - - if (found_in_b == 0 || found_in_b > 1) { - i->patience.unique_in_both = false; - unique_in_both--; - debug("unique_in_both %u (%d) ", unique_in_both, found_in_b); - debug_dump_atom(left, NULL, i); - } - } - - /* Still need to unmark right[*]->patience.unique_in_both for atoms that don't exist in left */ - diff_data_foreach_atom(i, right) { - if (!i->patience.unique_here - || !i->patience.unique_in_both) - continue; - struct diff_atom *j; - bool found_in_a = false; - diff_data_foreach_atom(j, left) { - if (!j->patience.unique_in_both) - continue; - if (!diff_atom_same(i, j)) - continue; - found_in_a = true; - break; - } - - if (!found_in_a) - i->patience.unique_in_both = false; - } - - if (unique_in_both_count) - *unique_in_both_count = unique_in_both; -} - -static void diff_atoms_swallow_identical_neighbors(struct diff_data *left, struct diff_data *right, - unsigned int *unique_in_both_count) -{ - debug("trivially combine identical lines arount unique_in_both lines\n"); - - unsigned int l_idx; - unsigned int next_l_idx; - unsigned int l_min = 0; - unsigned int r_min = 0; - for (l_idx = 0; l_idx < left->atoms.len; l_idx = next_l_idx) { - next_l_idx = l_idx + 1; - struct diff_atom *l = &left->atoms.head[l_idx]; - - if (!l->patience.unique_in_both) - continue; - - debug("check identical lines around "); - debug_dump_atom(left, right, l); - - unsigned int r_idx = diff_atom_idx(right, l->patience.pos_in_other); - - struct range identical_l; - struct range identical_r; - - /* Swallow upwards. - * Each common-unique line swallows identical lines upwards and downwards. - * All common-unique lines that were part of the identical lines following below were already swallowed - * in the previous iteration, so we will never hit another common-unique line above. */ - for (identical_l.start = l_idx, identical_r.start = r_idx; - identical_l.start > l_min - && identical_r.start > r_min - && diff_atom_same(&left->atoms.head[identical_l.start - 1], - &right->atoms.head[identical_r.start - 1]); - identical_l.start--, identical_r.start--); - - /* Swallow downwards */ - for (identical_l.end = l_idx + 1, identical_r.end = r_idx + 1; - identical_l.end < left->atoms.len - && identical_r.end < right->atoms.len - && diff_atom_same(&left->atoms.head[identical_l.end], - &right->atoms.head[identical_r.end]); - identical_l.end++, identical_r.end++, - next_l_idx++) { - if (left->atoms.head[identical_l.end].patience.unique_in_both) { - /* Part of a chunk of identical lines, remove from listing of unique_in_both lines */ - left->atoms.head[identical_l.end].patience.unique_in_both = false; - right->atoms.head[identical_r.end].patience.unique_in_both = false; - (*unique_in_both_count)--; - } - } - - l->patience.identical_lines = identical_l; - l->patience.pos_in_other->patience.identical_lines = identical_r; - - l_min = identical_l.end; - r_min = identical_r.end; - - if (!range_empty(&l->patience.identical_lines)) { - debug("common-unique line at l=%u r=%u swallowed identical lines l=%u-%u r=%u-%u\n", - l_idx, r_idx, - identical_l.start, identical_l.end, - identical_r.start, identical_r.end); - } - debug("next_l_idx = %u\n", next_l_idx); - } -} - -/* Among the lines that appear exactly once in each side, find the longest streak that appear in both files in the same - * order (with other stuff allowed to interleave). Use patience sort for that, as in the Patience Diff algorithm. - * See https://bramcohen.livejournal.com/73318.html and, for a much more detailed explanation, - * https://blog.jcoglan.com/2017/09/19/the-patience-diff-algorithm/ */ -enum diff_rc diff_algo_patience(const struct diff_algo_config *algo_config, struct diff_state *state) -{ - enum diff_rc rc = DIFF_RC_ENOMEM; - - struct diff_data *left = &state->left; - struct diff_data *right = &state->right; - - unsigned int unique_in_both_count; - - debug("\n** %s\n", __func__); - - /* Find those lines that appear exactly once in 'left' and exactly once in 'right'. */ - diff_atoms_mark_unique_in_both(left, right, &unique_in_both_count); - - debug("unique_in_both_count %u\n", unique_in_both_count); - debug("left:\n"); - debug_dump(left); - debug("right:\n"); - debug_dump(right); - - if (!unique_in_both_count) { - /* Cannot apply Patience, tell the caller to use fallback_algo instead. */ - return DIFF_RC_USE_DIFF_ALGO_FALLBACK; - } - - diff_atoms_swallow_identical_neighbors(left, right, &unique_in_both_count); - debug("After swallowing identical neighbors: unique_in_both = %u\n", - unique_in_both_count); - - /* An array of Longest Common Sequence is the result of the below subscope: */ - unsigned int lcs_count = 0; - struct diff_atom **lcs = NULL; - struct diff_atom *lcs_tail = NULL; - - { - /* This subscope marks the lifetime of the atom_pointers allocation */ - - /* One chunk of storage for atom pointers */ - struct diff_atom **atom_pointers = recallocarray(NULL, 0, unique_in_both_count * 2, sizeof(struct diff_atom*)); - - /* Half for the list of atoms that still need to be put on stacks */ - struct diff_atom **uniques = atom_pointers; - - /* Half for the patience sort state's "card stacks" -- we remember only each stack's topmost "card" */ - struct diff_atom **patience_stacks = atom_pointers + unique_in_both_count; - unsigned int patience_stacks_count = 0; - - /* Take all common, unique items from 'left' ... */ - - struct diff_atom *atom; - struct diff_atom **uniques_end = uniques; - diff_data_foreach_atom(atom, left) { - if (!atom->patience.unique_in_both) - continue; - *uniques_end = atom; - uniques_end++; - } - - /* ...and sort them to the order found in 'right'. - * The idea is to find the leftmost stack that has a higher line number and add it to the stack's top. - * If there is no such stack, open a new one on the right. The line number is derived from the atom*, - * which are array items and hence reflect the relative position in the source file. So we got the - * common-uniques from 'left' and sort them according to atom->patience.pos_in_other. */ - unsigned int i; - for (i = 0; i < unique_in_both_count; i++) { - atom = uniques[i]; - unsigned int target_stack; - - if (!patience_stacks_count) - target_stack = 0; - else { - /* binary search to find the stack to put this atom "card" on. */ - unsigned int lo = 0; - unsigned int hi = patience_stacks_count; - while (lo < hi) { - unsigned int mid = (lo + hi) >> 1; - if (patience_stacks[mid]->patience.pos_in_other < atom->patience.pos_in_other) - lo = mid + 1; - else - hi = mid; - } - - target_stack = lo; - } - - assert(target_stack <= patience_stacks_count); - patience_stacks[target_stack] = atom; - if (target_stack == patience_stacks_count) - patience_stacks_count++; - - /* Record a back reference to the next stack on the left, which will form the final longest sequence - * later. */ - atom->patience.prev_stack = target_stack ? patience_stacks[target_stack - 1] : NULL; - - } - - /* backtrace through prev_stack references to form the final longest common sequence */ - lcs_tail = patience_stacks[patience_stacks_count - 1]; - lcs_count = patience_stacks_count; - - /* uniques and patience_stacks are no longer needed. Backpointers are in atom->patience.prev_stack */ - free(atom_pointers); - } - - lcs = recallocarray(NULL, 0, lcs_count, sizeof(struct diff_atom*)); - struct diff_atom **lcs_backtrace_pos = &lcs[lcs_count - 1]; - struct diff_atom *atom; - for (atom = lcs_tail; atom; atom = atom->patience.prev_stack, lcs_backtrace_pos--) { - assert(lcs_backtrace_pos >= lcs); - *lcs_backtrace_pos = atom; - } - - unsigned int i; - if (DEBUG) { - debug("\npatience LCS:\n"); - for (i = 0; i < lcs_count; i++) { - debug_dump_atom(left, right, lcs[i]); - } - } - - - /* TODO: For each common-unique line found (now listed in lcs), swallow lines upwards and downwards that are - * identical on each side. Requires a way to represent atoms being glued to adjacent atoms. */ - - debug("\ntraverse LCS, possibly recursing:\n"); - - /* Now we have pinned positions in both files at which it makes sense to divide the diff problem into smaller - * chunks. Go into the next round: look at each section in turn, trying to again find common-unique lines in - * those smaller sections. As soon as no more are found, the remaining smaller sections are solved by Myers. */ - unsigned int left_pos = 0; - unsigned int right_pos = 0; - for (i = 0; i <= lcs_count; i++) { - struct diff_atom *atom; - struct diff_atom *atom_r; - unsigned int left_idx; - unsigned int right_idx; - - if (i < lcs_count) { - atom = lcs[i]; - atom_r = atom->patience.pos_in_other; - debug("lcs[%u] = left[%u] = right[%u]\n", i, - diff_atom_idx(left, atom), diff_atom_idx(right, atom_r)); - left_idx = atom->patience.identical_lines.start; - right_idx = atom_r->patience.identical_lines.start; - debug(" identical lines l %u-%u r %u-%u\n", - atom->patience.identical_lines.start, atom->patience.identical_lines.end, - atom_r->patience.identical_lines.start, atom_r->patience.identical_lines.end); - } else { - atom = NULL; - atom_r = NULL; - left_idx = left->atoms.len; - right_idx = right->atoms.len; - } - - /* 'atom' now marks an atom that matches on both sides according to patience-diff - * (a common-unique identical atom in both files). - * Handle the section before and the atom itself; the section after will be handled by the next loop - * iteration -- note that i loops to last element + 1 ("i <= lcs_count"), so that there will be another - * final iteration to pick up the last remaining items after the last LCS atom. - * The sections before might also be empty on left and/or right. - * left_pos and right_pos mark the indexes of the first atoms that have not yet been handled in the - * previous loop iteration. - * left_idx and right_idx mark the indexes of the matching atom on left and right, respectively. */ - - debug("iteration %u left_pos %u left_idx %u right_pos %u right_idx %u\n", - i, left_pos, left_idx, right_pos, right_idx); - - struct diff_chunk *chunk; - - /* Section before the matching atom */ - struct diff_atom *left_atom = &left->atoms.head[left_pos]; - unsigned int left_section_len = left_idx - left_pos; - - struct diff_atom *right_atom = &(right->atoms.head[right_pos]); - unsigned int right_section_len = right_idx - right_pos; - - if (left_section_len && right_section_len) { - /* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */ - if (!diff_state_add_chunk(state, false, - left_atom, left_section_len, - right_atom, right_section_len)) - goto return_rc; - } else if (left_section_len && !right_section_len) { - /* Only left atoms and none on the right, they form a "minus" chunk, then. */ - if (!diff_state_add_chunk(state, true, - left_atom, left_section_len, - right_atom, 0)) - goto return_rc; - } else if (!left_section_len && right_section_len) { - /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */ - if (!diff_state_add_chunk(state, true, - left_atom, 0, - right_atom, right_section_len)) - goto return_rc; - } - /* else: left_section_len == 0 and right_section_len == 0, i.e. nothing here. */ - - /* The atom found to match on both sides forms a chunk of equals on each side. In the very last - * iteration of this loop, there is no matching atom, we were just cleaning out the remaining lines. */ - if (atom) { - if (!diff_state_add_chunk(state, true, - left->atoms.head + atom->patience.identical_lines.start, - range_len(&atom->patience.identical_lines), - right->atoms.head + atom_r->patience.identical_lines.start, - range_len(&atom_r->patience.identical_lines))) - goto return_rc; - left_pos = atom->patience.identical_lines.end; - right_pos = atom_r->patience.identical_lines.end; - } else { - left_pos = left_idx + 1; - right_pos = right_idx + 1; - } - debug("end of iteration %u left_pos %u left_idx %u right_pos %u right_idx %u\n", - i, left_pos, left_idx, right_pos, right_idx); - } - debug("** END %s\n", __func__); - - rc = DIFF_RC_OK; - -return_rc: - free(lcs); - return rc; -} blob - a6ff8bd1ed65c0db6c0feaf7866523a9d9f52948 (mode 644) blob + /dev/null --- lib/linux_Makefile +++ /dev/null @@ -1,24 +0,0 @@ -srcs = \ - diff_atomize_text.c \ - diff_main.c \ - diff_myers.c \ - diff_patience.c \ - diff_output.c \ - diff_output_plain.c \ - diff_output_unidiff.c \ - $(END) - -objs = $(srcs:.c=.o) - -libdiff.a: $(objs) - ar rcs $@ $^ - -CFLAGS = -fsanitize=address -fsanitize=undefined -g -O3 - -%.o: %.c ./*.h ../include/diff/*.h - gcc $(CFLAGS) -I../include -o $@ -c $< - -.PHONY: clean -clean: - -rm $(objs) - -rm libdiff.a blob - b4a9acb0cdc71e2d714c76d3f510873470cb8236 (mode 644) blob + /dev/null --- man/diff.1 +++ /dev/null @@ -1,47 +0,0 @@ -.\" $OpenBSD$ -.\" -.\" Copyright (c) 2018 Martin Pieuchot -.\" Copyright (c) 2020 Neels Hofmeyr -.\" -.\" Permission to use, copy, modify, and distribute this software for any -.\" purpose with or without fee is hereby granted, provided that the above -.\" copyright notice and this permission notice appear in all copies. -.\" -.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -.\" -.Dd $Mdocdate: August 28 2017 $ -.Dt DIFF 1 -.Os -.Sh NAME -.Nm diff -.Nd compare files -.Sh SYNOPSIS -.Nm diff -.Ar file1 file2 -.Sh DESCRIPTION -The -.Nm -utility compares the contents of -.Ar file1 -and -.Ar file2 -line by line. -.Sh EXIT STATUS -The -.Nm -utility exits with one of the following values: -.Pp -.Bl -tag -width Ds -offset indent -compact -.It 0 -No differences were found. -.It 1 -Differences were found. -.It >1 -An error occurred. -.El blob - /dev/null blob + b4a9acb0cdc71e2d714c76d3f510873470cb8236 (mode 644) --- /dev/null +++ diff.1 @@ -0,0 +1,47 @@ +.\" $OpenBSD$ +.\" +.\" Copyright (c) 2018 Martin Pieuchot +.\" Copyright (c) 2020 Neels Hofmeyr +.\" +.\" Permission to use, copy, modify, and distribute this software for any +.\" purpose with or without fee is hereby granted, provided that the above +.\" copyright notice and this permission notice appear in all copies. +.\" +.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +.\" +.Dd $Mdocdate: August 28 2017 $ +.Dt DIFF 1 +.Os +.Sh NAME +.Nm diff +.Nd compare files +.Sh SYNOPSIS +.Nm diff +.Ar file1 file2 +.Sh DESCRIPTION +The +.Nm +utility compares the contents of +.Ar file1 +and +.Ar file2 +line by line. +.Sh EXIT STATUS +The +.Nm +utility exits with one of the following values: +.Pp +.Bl -tag -width Ds -offset indent -compact +.It 0 +No differences were found. +.It 1 +Differences were found. +.It >1 +An error occurred. +.El blob - 17f8df091e1bb71385f342f2e783b09347df7d75 blob + 480a582f5d2848c9cb614cf98532f97a5bfb5ba2 --- test/verify_all.sh +++ test/verify_all.sh @@ -1,6 +1,6 @@ #!/bin/sh -diff_prog="../diff/diff" +diff_prog="../obj/diff" diff_type=unidiff blob - /dev/null blob + bbe26f81080d982f5d5a2678f29193e5c70df6fd (mode 644) --- /dev/null +++ diff.c @@ -0,0 +1,143 @@ +/* Commandline diff utility to test diff implementations. */ +/* + * Copyright (c) 2018 Martin Pieuchot + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "diff_main.h" +#include "diff_output.h" + +#ifdef __linux__ +/* stupid shims to compile and test on linux */ +#define __dead + +static const char *getprogname() +{ + return "diff"; +} +#endif + +__dead void usage(void); +int diffreg(char *, char *, int); +char *mmapfile(const char *, struct stat *); + +__dead void +usage(void) +{ + fprintf(stderr, "usage: %s file1 file2\n", getprogname()); + exit(1); +} + +int +main(int argc, char *argv[]) +{ + int ch; + + while ((ch = getopt(argc, argv, "")) != -1) { + switch (ch) { + default: + usage(); + } + } + + argc -= optind; + argv += optind; + + if (argc != 2) + usage(); + + return diffreg(argv[0], argv[1], 0); +} + +const struct diff_algo_config myers, patience, myers_divide; + +const struct diff_algo_config myers = (struct diff_algo_config){ + .impl = diff_algo_myers, + .permitted_state_size = 1024 * 1024 * sizeof(int), + .fallback_algo = &patience, +}; + +const struct diff_algo_config patience = (struct diff_algo_config){ + .impl = diff_algo_patience, + .inner_algo = &patience, // After subdivision, do Patience again. + .fallback_algo = &myers_divide, // If subdivision failed, do Myers Divide et Impera. +}; + +const struct diff_algo_config myers_divide = (struct diff_algo_config){ + .impl = diff_algo_myers_divide, + .inner_algo = &myers, // When division succeeded, start from the top. + // (fallback_algo = NULL implies diff_algo_none). +}; + +const struct diff_config diff_config = { + .atomize_func = diff_atomize_text_by_line, + .algo = &myers, +}; + +int +diffreg(char *file1, char *file2, int flags) +{ + char *str1, *str2; + struct stat st1, st2; + struct diff_input_info info = { + .left_path = file1, + .right_path = file2, + }; + + str1 = mmapfile(file1, &st1); + str2 = mmapfile(file2, &st2); + + diff_unidiff(stdout, &diff_config, &info, str1, st1.st_size, str2, st2.st_size, 3); + + munmap(str1, st1.st_size); + munmap(str2, st2.st_size); + + return 0; +} + +char * +mmapfile(const char *path, struct stat *st) +{ + int fd; + char *p; + + fd = open(path, O_RDONLY); + if (fd == -1) + err(2, "%s", path); + + if (fstat(fd, st) == -1) + err(2, "%s", path); + + if ((uintmax_t)st->st_size > SIZE_MAX) + errx(2, "%s: file too big to fit memory", path); + + p = mmap(NULL, st->st_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (p == MAP_FAILED) + err(2, "mmap"); + + close(fd); + + return p; +} blob - /dev/null blob + 9aa20ee96f002f8a10c60b035b8e62be1dff769b (mode 644) --- /dev/null +++ diff_atomize_text.c @@ -0,0 +1,75 @@ +/* Split source by line breaks, and calculate a simplistic checksum. */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "diff_main.h" + +static int diff_data_atomize_text_lines(struct diff_data *d) +{ + const uint8_t *pos = d->data; + const uint8_t *end = pos + d->len; + enum diff_rc rc; + + unsigned int array_size_estimate = d->len / 50; + unsigned int pow2 = 1; + while (array_size_estimate >>= 1) + pow2++; + + ARRAYLIST_INIT(d->atoms, 1 << pow2); + + while (pos < end) { + const uint8_t *line_end = pos; + unsigned int hash = 0; + + while (line_end < end && *line_end != '\r' && *line_end != '\n') { + hash = hash * 23 + *line_end; + line_end++; + } + + /* When not at the end of data, the line ending char ('\r' or '\n') must follow */ + if (line_end < end) + line_end++; + /* If that was an '\r', also pull in any following '\n' */ + if (line_end[0] == '\r' && line_end < end && line_end[1] == '\n') + line_end++; + + /* Record the found line as diff atom */ + struct diff_atom *atom; + ARRAYLIST_ADD(atom, d->atoms); + if (!atom) + return DIFF_RC_ENOMEM; + + *atom = (struct diff_atom){ + .at = pos, + .len = line_end - pos, + .hash = hash, + }; + + /* Starting point for next line: */ + pos = line_end; + } + + return DIFF_RC_OK; +} + +enum diff_rc diff_atomize_text_by_line(void *func_data, struct diff_data *left, struct diff_data *right) +{ + enum diff_rc rc; + rc = diff_data_atomize_text_lines(left); + if (rc != DIFF_RC_OK) + return rc; + return diff_data_atomize_text_lines(right); +} blob - /dev/null blob + 29fdd78115f0ea2392af08b51e5d57d450d9dd1b (mode 644) --- /dev/null +++ diff_main.c @@ -0,0 +1,245 @@ +/* Generic infrastructure to implement various diff algorithms (implementation). */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "diff_main.h" +#include "debug.h" + +/* Even if a left or right side is empty, diff output may need to know the position in that file. + * So left_start or right_start must never be NULL -- pass left_count or right_count as zero to indicate staying at that + * position without consuming any lines. */ +struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved, + struct diff_atom *left_start, unsigned int left_count, + struct diff_atom *right_start, unsigned int right_count) +{ + struct diff_chunk *chunk; + diff_chunk_arraylist_t *result; + + if (solved && !state->temp_result.len) + result = &state->result->chunks; + else + result = &state->temp_result; + + ARRAYLIST_ADD(chunk, *result); + if (!chunk) + return NULL; + *chunk = (struct diff_chunk){ + .solved = solved, + .left_start = left_start, + .left_count = left_count, + .right_start = right_start, + .right_count = right_count, + }; + + debug("Add %s chunk:\n", chunk->solved ? "solved" : "UNSOLVED"); + debug("L\n"); + debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count); + debug("R\n"); + debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count); + return chunk; +} + +void diff_data_init_root(struct diff_data *d, const uint8_t *data, size_t len) +{ + *d = (struct diff_data){ + .data = data, + .len = len, + .root = d, + }; +} + +void diff_data_init_subsection(struct diff_data *d, struct diff_data *parent, + struct diff_atom *from_atom, unsigned int atoms_count) +{ + struct diff_atom *last_atom = from_atom + atoms_count - 1; + *d = (struct diff_data){ + .data = from_atom->at, + .len = (last_atom->at + last_atom->len) - from_atom->at, + .root = parent->root, + .atoms.head = from_atom, + .atoms.len = atoms_count, + }; + + debug("subsection:\n"); + debug_dump(d); +} + +void diff_data_free(struct diff_data *diff_data) +{ + if (!diff_data) + return; + if (diff_data->atoms.allocated) + ARRAYLIST_FREE(diff_data->atoms); +} + +enum diff_rc diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state) +{ + debug("\n** %s\n", __func__); + debug("left:\n"); + debug_dump(&state->left); + debug("right:\n"); + debug_dump(&state->right); + debug_dump_myers_graph(&state->left, &state->right, NULL); + + /* Add a chunk of equal lines, if any */ + unsigned int equal_atoms = 0; + while (equal_atoms < state->left.atoms.len && equal_atoms < state->right.atoms.len + && diff_atom_same(&state->left.atoms.head[equal_atoms], &state->right.atoms.head[equal_atoms])) + equal_atoms++; + if (equal_atoms) { + if (!diff_state_add_chunk(state, true, + &state->left.atoms.head[0], equal_atoms, + &state->right.atoms.head[0], equal_atoms)) + return DIFF_RC_ENOMEM; + } + + /* Add a "minus" chunk with all lines from the left. */ + if (equal_atoms < state->left.atoms.len) { + if (!diff_state_add_chunk(state, true, + &state->left.atoms.head[equal_atoms], + state->left.atoms.len - equal_atoms, + NULL, 0)) + return DIFF_RC_ENOMEM; + } + + /* Add a "plus" chunk with all lines from the right. */ + if (equal_atoms < state->right.atoms.len) { + if (!diff_state_add_chunk(state, true, + NULL, 0, + &state->right.atoms.head[equal_atoms], + state->right.atoms.len - equal_atoms)) + return DIFF_RC_ENOMEM; + } + return DIFF_RC_OK; +} + +enum diff_rc diff_run_algo(const struct diff_algo_config *algo_config, struct diff_state *state) +{ + enum diff_rc rc; + ARRAYLIST_FREE(state->temp_result); + + if (!algo_config || !algo_config->impl || !state->recursion_depth_left) { + debug("MAX RECURSION REACHED, just dumping diff chunks\n"); + return diff_algo_none(algo_config, state); + } + + ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE); + rc = algo_config->impl(algo_config, state); + switch (rc) { + case DIFF_RC_USE_DIFF_ALGO_FALLBACK: + debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n", algo_config->fallback_algo); + rc = diff_run_algo(algo_config->fallback_algo, state); + goto return_rc; + + case DIFF_RC_OK: + /* continue below */ + break; + + default: + /* some error happened */ + goto return_rc; + } + + /* Pick up any diff chunks that are still unsolved and feed to inner_algo. + * inner_algo will solve unsolved chunks and append to result, + * and subsequent solved chunks on this level are then appended to result afterwards. */ + int i; + for (i = 0; i < state->temp_result.len; i++) { + struct diff_chunk *c = &state->temp_result.head[i]; + if (c->solved) { + struct diff_chunk *final_c; + ARRAYLIST_ADD(final_c, state->result->chunks); + if (!final_c) { + rc = DIFF_RC_ENOMEM; + goto return_rc; + } + *final_c = *c; + continue; + } + + /* c is an unsolved chunk, feed to inner_algo */ + struct diff_state inner_state = { + .result = state->result, + .recursion_depth_left = state->recursion_depth_left - 1, + }; + diff_data_init_subsection(&inner_state.left, &state->left, c->left_start, c->left_count); + diff_data_init_subsection(&inner_state.right, &state->right, c->right_start, c->right_count); + + rc = diff_run_algo(algo_config->inner_algo, &inner_state); + + if (rc != DIFF_RC_OK) + goto return_rc; + } + + rc = DIFF_RC_OK; +return_rc: + ARRAYLIST_FREE(state->temp_result); + return rc; +} + +struct diff_result *diff_main(const struct diff_config *config, + const uint8_t *left_data, size_t left_len, + const uint8_t *right_data, size_t right_len) +{ + struct diff_result *result = malloc(sizeof(struct diff_result)); + if (!result) + return NULL; + + *result = (struct diff_result){}; + diff_data_init_root(&result->left, left_data, left_len); + diff_data_init_root(&result->right, right_data, right_len); + + if (!config->atomize_func) { + result->rc = DIFF_RC_EINVAL; + return result; + } + + result->rc = config->atomize_func(config->atomize_func_data, &result->left, &result->right); + if (result->rc != DIFF_RC_OK) + return result; + + struct diff_state state = { + .result = result, + .recursion_depth_left = config->max_recursion_depth ? : 1024, + }; + diff_data_init_subsection(&state.left, &result->left, result->left.atoms.head, result->left.atoms.len); + diff_data_init_subsection(&state.right, &result->right, result->right.atoms.head, result->right.atoms.len); + + result->rc = diff_run_algo(config->algo, &state); + + return result; +} + +void diff_result_free(struct diff_result *result) +{ + if (!result) + return; + diff_data_free(&result->left); + diff_data_free(&result->right); + ARRAYLIST_FREE(result->chunks); + free(result); +} blob - /dev/null blob + 933592cf5e6a33c4c1ace4d204f3fb494ab62d34 (mode 644) --- /dev/null +++ diff_main.h @@ -0,0 +1,287 @@ +/* Generic infrastructure to implement various diff algorithms. */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "arraylist.h" + +#ifndef MAX +#define MAX(A,B) ((A)>(B)?(A):(B)) +#endif +#ifndef MIN +#define MIN(A,B) ((A)<(B)?(A):(B)) +#endif + +struct range { + int start; + int end; +}; + +static inline bool range_empty(const struct range *r) +{ + return r->start == r->end; +} + +static inline bool ranges_touch(const struct range *a, const struct range *b) +{ + return (a->end >= b->start) && (a->start <= b->end); +} + +static inline void ranges_merge(struct range *a, const struct range *b) +{ + *a = (struct range){ + .start = MIN(a->start, b->start), + .end = MAX(a->end, b->end), + }; +} + +static inline int range_len(const struct range *r) +{ + if (!r) + return 0; + return r->end - r->start; +} + +/* List of all possible return codes of a diff invocation. */ +enum diff_rc { + DIFF_RC_USE_DIFF_ALGO_FALLBACK = -1, + DIFF_RC_OK = 0, + DIFF_RC_ENOTSUP = ENOTSUP, + DIFF_RC_ENOMEM = ENOMEM, + DIFF_RC_EINVAL = EINVAL, +}; + +struct diff_atom { + const uint8_t *at; + size_t len; + + /* This hash is just a very cheap speed up for finding *mismatching* atoms. When hashes match, we still need to + * compare entire atoms to find out whether they are indeed identical or not. */ + unsigned int hash; + + /* State for the Patience Diff algorithm */ + /* TODO: keep a separate array for the patience state */ + struct { + bool unique_here; + bool unique_in_both; + struct diff_atom *pos_in_other; + struct diff_atom *prev_stack; + struct range identical_lines; + } patience; +}; + +static inline bool diff_atom_same(const struct diff_atom *left, const struct diff_atom *right) +{ + return left->hash == right->hash + && left->len == right->len + && memcmp(left->at, right->at, left->len) == 0; +} + +/* For each file, there is a "root" struct diff_data referencing the entire file, which the atoms are parsed from. In + * recursion of diff algorithm, there may be "child" struct diff_data only referencing a subsection of the file, + * re-using the atoms parsing. For "root" structs, atoms_allocated will be nonzero, indicating that the array of atoms + * is owned by that struct. For "child" structs, atoms_allocated == 0, to indicate that the struct is referencing a + * subset of atoms. */ +struct diff_data { + const uint8_t *data; + size_t len; + ARRAYLIST(struct diff_atom) atoms; + struct diff_data *root; +}; + +void diff_data_free(struct diff_data *diff_data); + +/* The atom's index in the entire file. For atoms divided by lines of text, this yields the line number (starting with + * 0). Also works for diff_data that reference only a subsection of a file, always reflecting the global position in + * the file (and not the relative position within the subsection). */ +#define diff_atom_root_idx(DIFF_DATA, ATOM) \ + ((ATOM) ? (ATOM) - ((DIFF_DATA)->root->atoms.head) : (DIFF_DATA)->root->atoms.len) + +/* The atom's index within DIFF_DATA. For atoms divided by lines of text, this yields the line number (starting with + * 0). */ +#define diff_atom_idx(DIFF_DATA, ATOM) \ + ((ATOM) ? (ATOM) - ((DIFF_DATA)->atoms.head) : (DIFF_DATA)->atoms.len) + +#define foreach_diff_atom(ATOM, FIRST_ATOM, COUNT) \ + for ((ATOM) = (FIRST_ATOM); \ + (ATOM) && ((ATOM) >= (FIRST_ATOM)) && ((ATOM) - (FIRST_ATOM) < (COUNT)); \ + (ATOM)++) + +#define diff_data_foreach_atom(ATOM, DIFF_DATA) \ + foreach_diff_atom(ATOM, (DIFF_DATA)->atoms.head, (DIFF_DATA)->atoms.len) + +#define diff_data_foreach_atom_from(FROM, ATOM, DIFF_DATA) \ + for ((ATOM) = (FROM); \ + (ATOM) && ((ATOM) >= (DIFF_DATA)->atoms.head) && ((ATOM) - (DIFF_DATA)->atoms.head < (DIFF_DATA)->atoms.len); \ + (ATOM)++) + +/* A diff chunk represents a set of atoms on the left and/or a set of atoms on the right. + * + * If solved == false: + * The diff algorithm has divided the source file, and this is a chunk that the inner_algo should run on next. + * The lines on the left should be diffed against the lines on the right. + * (If there are no left lines or no right lines, it implies solved == true, because there is nothing to diff.) + * + * If solved == true: + * If there are only left atoms, it is a chunk removing atoms from the left ("a minus chunk"). + * If there are only right atoms, it is a chunk adding atoms from the right ("a plus chunk"). + * If there are both left and right lines, it is a chunk of equal content on both sides, + * and left_count == right_count: + * + * - foo } + * - bar }-- diff_chunk{ left_start = &left.atoms.head[0], left_count = 3, + * - baz } right_start = NULL, right_count = 0 } + * moo } + * goo }-- diff_chunk{ left_start = &left.atoms.head[3], left_count = 3, + * zoo } right_start = &right.atoms.head[0], right_count = 3 } + * +loo } + * +roo }-- diff_chunk{ left_start = NULL, left_count = 0, + * +too } right_start = &right.atoms.head[3], right_count = 3 } + * + */ +struct diff_chunk { + bool solved; + struct diff_atom *left_start; + unsigned int left_count; + struct diff_atom *right_start; + unsigned int right_count; +}; + +typedef ARRAYLIST(struct diff_chunk) diff_chunk_arraylist_t; +#define DIFF_RESULT_ALLOC_BLOCKSIZE 128 + +struct diff_result { + enum diff_rc rc; + struct diff_data left; + struct diff_data right; + diff_chunk_arraylist_t chunks; +}; + +struct diff_state { + /* The final result passed to the original diff caller. */ + struct diff_result *result; + + /* The root diff_data is in result->left,right, these are (possibly) subsections of the root data. */ + struct diff_data left; + struct diff_data right; + + unsigned int recursion_depth_left; + + /* Remaining chunks from one diff algorithm pass, if any solved == false chunks came up. */ + diff_chunk_arraylist_t temp_result; +}; + +struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved, + struct diff_atom *left_start, unsigned int left_count, + struct diff_atom *right_start, unsigned int right_count); + +/* Signature of a utility function to divide both source files into diff atoms. + * It is possible that a (future) algorithm requires both source files to decide on atom split points, hence this gets + * both left and right to atomize at the same time. + * An example is diff_atomize_text_by_line() in diff_atomize_text.c. + * + * func_data: context pointer (free to be used by implementation). + * left: struct diff_data with left->data and left->len already set up, and left->atoms to be created. + * right: struct diff_data with right->data and right->len already set up, and right->atoms to be created. + */ +typedef enum diff_rc (*diff_atomize_func_t)(void *func_data, struct diff_data *left, struct diff_data *right); + +extern enum diff_rc diff_atomize_text_by_line(void *func_data, struct diff_data *left, struct diff_data *right); + +struct diff_algo_config; +typedef enum diff_rc (*diff_algo_impl_t)(const struct diff_algo_config *algo_config, struct diff_state *state); + +/* Form a result with all left-side removed and all right-side added, i.e. no actual diff algorithm involved. */ +enum diff_rc diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state); + +/* Myers Diff tracing from the start all the way through to the end, requiring quadratic amounts of memory. This can + * fail if the required space surpasses algo_config->permitted_state_size. */ +extern enum diff_rc diff_algo_myers(const struct diff_algo_config *algo_config, struct diff_state *state); + +/* Myers "Divide et Impera": tracing forwards from the start and backwards from the end to find a midpoint that divides + * the problem into smaller chunks. Requires only linear amounts of memory. */ +extern enum diff_rc diff_algo_myers_divide(const struct diff_algo_config *algo_config, struct diff_state *state); + +/* Patience Diff algorithm, which divides a larger diff into smaller chunks. For very specific scenarios, it may lead to + * a complete diff result by itself, but needs a fallback algo to solve chunks that don't have common-unique atoms. */ +extern enum diff_rc diff_algo_patience(const struct diff_algo_config *algo_config, struct diff_state *state); + +/* Diff algorithms to use, possibly nested. For example: + * + * struct diff_algo_config myers, patience, myers_divide; + * + * myers = (struct diff_algo_config){ + * .impl = diff_algo_myers, + * .permitted_state_size = 32 * 1024 * 1024, + * .fallback_algo = &patience, // when too large, do diff_algo_patience + * }; + * + * patience = (struct diff_algo_config){ + * .impl = diff_algo_patience, + * .inner_algo = &patience, // After subdivision, do Patience again. + * .fallback_algo = &myers_divide, // If subdivision failed, do Myers Divide et Impera. + * }; + * + * myers_divide = (struct diff_algo_config){ + * .impl = diff_algo_myers_divide, + * .inner_algo = &myers, // When division succeeded, start from the top. + * // (fallback_algo = NULL implies diff_algo_none). + * }; + * + * struct diff_config config = { + * .algo = &myers, + * ... + * }; + * diff_main(&config, ...); + */ +struct diff_algo_config { + diff_algo_impl_t impl; + + /* Fail this algo if it would use more than this amount of memory, and instead use fallback_algo + * (diff_algo_myers). permitted_state_size == 0 means no limitation. */ + size_t permitted_state_size; + + /* For algorithms that divide into smaller chunks, use this algorithm to solve the divided chunks. */ + const struct diff_algo_config *inner_algo; + + /* If the algorithm fails (e.g. diff_algo_myers_if_small needs too large state, or diff_algo_patience can't find + * any common-unique atoms), then use this algorithm instead. */ + const struct diff_algo_config *fallback_algo; +}; + +struct diff_config { + diff_atomize_func_t atomize_func; + void *atomize_func_data; + + const struct diff_algo_config *algo; + + /* How deep to step into subdivisions of a source file, a paranoia / safety measure to guard against infinite + * loops through diff algorithms. When the maximum recursion is reached, employ diff_algo_none (i.e. remove all + * left atoms and add all right atoms). */ + unsigned int max_recursion_depth; +}; + +struct diff_result *diff_main(const struct diff_config *config, + const uint8_t *left_data, size_t left_len, + const uint8_t *right_data, size_t right_len); +void diff_result_free(struct diff_result *result); blob - /dev/null blob + 1021b3be22639ea353533646e72153bb03276f0e (mode 644) --- /dev/null +++ diff_myers.c @@ -0,0 +1,1042 @@ +/* Myers diff algorithm implementation, invented by Eugene W. Myers [1]. + * Implementations of both the Myers Divide Et Impera (using linear space) + * and the canonical Myers algorithm (using quadratic space). */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "diff_main.h" +#include "debug.h" + +/* Myers' diff algorithm [1] is nicely explained in [2]. + * [1] http://www.xmailserver.org/diff2.pdf + * [2] https://blog.jcoglan.com/2017/02/12/the-myers-diff-algorithm-part-1/ ff. + * + * Myers approaches finding the smallest diff as a graph problem. + * The crux is that the original algorithm requires quadratic amount of memory: + * both sides' lengths added, and that squared. So if we're diffing lines of text, two files with 1000 lines each would + * blow up to a matrix of about 2000 * 2000 ints of state, about 16 Mb of RAM to figure out 2 kb of text. + * The solution is using Myers' "divide and conquer" extension algorithm, which does the original traversal from both + * ends of the files to reach a middle where these "snakes" touch, hence does not need to backtrace the traversal, and + * so gets away with only keeping a single column of that huge state matrix in memory. + * + * Todo: the divide and conquer requires linear *space*, not necessarily linear *time*. It recurses, apparently doing + * multiple Myers passes, and also it apparently favors fragmented diffs in cases where chunks of text were moved to a + * different place. Up to a given count of diff atoms (text lines), it might be desirable to accept the quadratic memory + * usage, get nicer diffs and less re-iteration of the same data? + */ + +struct diff_box { + unsigned int left_start; + unsigned int left_end; + unsigned int right_start; + unsigned int right_end; +}; + +#define diff_box_empty(DIFF_SNAKE) ((DIFF_SNAKE)->left_end == 0) + + +/* If the two contents of a file are A B C D E and X B C Y, + * the Myers diff graph looks like: + * + * k0 k1 + * \ \ + * k-1 0 1 2 3 4 5 + * \ A B C D E + * 0 o-o-o-o-o-o + * X | | | | | | + * 1 o-o-o-o-o-o + * B | |\| | | | + * 2 o-o-o-o-o-o + * C | | |\| | | + * 3 o-o-o-o-o-o + * Y | | | | | |\ + * 4 o-o-o-o-o-o c1 + * \ \ + * c-1 c0 + * + * Moving right means delete an atom from the left-hand-side, + * Moving down means add an atom from the right-hand-side. + * Diagonals indicate identical atoms on both sides, the challenge is to use as many diagonals as possible. + * + * The original Myers algorithm walks all the way from the top left to the bottom right, remembers all steps, and then + * backtraces to find the shortest path. However, that requires keeping the entire graph in memory, which needs + * quadratic space. + * + * Myers adds a variant that uses linear space -- note, not linear time, only linear space: walk forward and backward, + * find a meeting point in the middle, and recurse on the two separate sections. This is called "divide and conquer". + * + * d: the step number, starting with 0, a.k.a. the distance from the starting point. + * k: relative index in the state array for the forward scan, indicating on which diagonal through the diff graph we + * currently are. + * c: relative index in the state array for the backward scan, indicating the diagonal number from the bottom up. + * + * The "divide and conquer" traversal through the Myers graph looks like this: + * + * | d= 0 1 2 3 2 1 0 + * ----+-------------------------------------------- + * k= | c= + * 4 | 3 + * | + * 3 | 3,0 5,2 2 + * | / \ + * 2 | 2,0 5,3 1 + * | / \ + * 1 | 1,0 4,3 >= 4,3 5,4<-- 0 + * | / / \ / + * 0 | -->0,0 3,3 4,4 -1 + * | \ / / + * -1 | 0,1 1,2 3,4 -2 + * | \ / + * -2 | 0,2 -3 + * | \ + * | 0,3 + * | forward-> <-backward + * + * x,y pairs here are the coordinates in the Myers graph: + * x = atom index in left-side source, y = atom index in the right-side source. + * + * Only one forward column and one backward column are kept in mem, each need at most left.len + 1 + right.len items. + * Note that each d step occupies either the even or the odd items of a column: if e.g. the previous column is in the + * odd items, the next column is formed in the even items, without overwriting the previous column's results. + * + * Also note that from the diagonal index k and the x coordinate, the y coordinate can be derived: + * y = x - k + * Hence the state array only needs to keep the x coordinate, i.e. the position in the left-hand file, and the y + * coordinate, i.e. position in the right-hand file, is derived from the index in the state array. + * + * The two traces meet at 4,3, the first step (here found in the forward traversal) where a forward position is on or + * past a backward traced position on the same diagonal. + * + * This divides the problem space into: + * + * 0 1 2 3 4 5 + * A B C D E + * 0 o-o-o-o-o + * X | | | | | + * 1 o-o-o-o-o + * B | |\| | | + * 2 o-o-o-o-o + * C | | |\| | + * 3 o-o-o-o-*-o *: forward and backward meet here + * Y | | + * 4 o-o + * + * Doing the same on each section lead to: + * + * 0 1 2 3 4 5 + * A B C D E + * 0 o-o + * X | | + * 1 o-b b: backward d=1 first reaches here (sliding up the snake) + * B \ f: then forward d=2 reaches here (sliding down the snake) + * 2 o As result, the box from b to f is found to be identical; + * C \ leaving a top box from 0,0 to 1,1 and a bottom trivial tail 3,3 to 4,3. + * 3 f-o + * + * 3 o-* + * Y | + * 4 o *: forward and backward meet here + * + * and solving the last top left box gives: + * + * 0 1 2 3 4 5 + * A B C D E -A + * 0 o-o +X + * X | B + * 1 o C + * B \ -D + * 2 o -E + * C \ +Y + * 3 o-o-o + * Y | + * 4 o + * + */ + +#define xk_to_y(X, K) ((X) - (K)) +#define xc_to_y(X, C, DELTA) ((X) - (C) + (DELTA)) +#define k_to_c(K, DELTA) ((K) + (DELTA)) +#define c_to_k(C, DELTA) ((C) - (DELTA)) + +/* Do one forwards step in the "divide and conquer" graph traversal. + * left: the left side to diff. + * right: the right side to diff against. + * kd_forward: the traversal state for forwards traversal, modified by this function. + * This is carried over between invocations with increasing d. + * kd_forward points at the center of the state array, allowing negative indexes. + * kd_backward: the traversal state for backwards traversal, to find a meeting point. + * Since forwards is done first, kd_backward will be valid for d - 1, not d. + * kd_backward points at the center of the state array, allowing negative indexes. + * d: Step or distance counter, indicating for what value of d the kd_forward should be populated. + * For d == 0, kd_forward[0] is initialized, i.e. the first invocation should be for d == 0. + * meeting_snake: resulting meeting point, if any. + */ +static void diff_divide_myers_forward(struct diff_data *left, struct diff_data *right, + int *kd_forward, int *kd_backward, int d, + struct diff_box *meeting_snake) +{ + int delta = (int)right->atoms.len - (int)left->atoms.len; + int prev_x; + int prev_y; + int k; + int x; + + debug("-- %s d=%d\n", __func__, d); + debug_dump_myers_graph(left, right, NULL); + + for (k = d; k >= -d; k -= 2) { + if (k < -(int)right->atoms.len || k > (int)left->atoms.len) { + /* This diagonal is completely outside of the Myers graph, don't calculate it. */ + if (k < -(int)right->atoms.len) + debug(" %d k < -(int)right->atoms.len %d\n", k, -(int)right->atoms.len); + else + debug(" %d k > left->atoms.len %d\n", k, left->atoms.len); + if (k < 0) { + /* We are traversing negatively, and already below the entire graph, nothing will come + * of this. */ + debug(" break"); + break; + } + debug(" continue"); + continue; + } + debug("- k = %d\n", k); + if (d == 0) { + /* This is the initializing step. There is no prev_k yet, get the initial x from the top left of + * the Myers graph. */ + x = 0; + } + /* Favoring "-" lines first means favoring moving rightwards in the Myers graph. + * For this, all k should derive from k - 1, only the bottom most k derive from k + 1: + * + * | d= 0 1 2 + * ----+---------------- + * k= | + * 2 | 2,0 <-- from prev_k = 2 - 1 = 1 + * | / + * 1 | 1,0 + * | / + * 0 | -->0,0 3,3 + * | \\ / + * -1 | 0,1 <-- bottom most for d=1 from prev_k = -1 + 1 = 0 + * | \\ + * -2 | 0,2 <-- bottom most for d=2 from prev_k = -2 + 1 = -1 + * + * Except when a k + 1 from a previous run already means a further advancement in the graph. + * If k == d, there is no k + 1 and k - 1 is the only option. + * If k < d, use k + 1 in case that yields a larger x. Also use k + 1 if k - 1 is outside the graph. + */ + else if (k > -d && (k == d + || (k - 1 >= -(int)right->atoms.len + && kd_forward[k - 1] >= kd_forward[k + 1]))) { + /* Advance from k - 1. + * From position prev_k, step to the right in the Myers graph: x += 1. + */ + int prev_k = k - 1; + prev_x = kd_forward[prev_k]; + prev_y = xk_to_y(prev_x, prev_k); + x = prev_x + 1; + } else { + /* The bottom most one. + * From position prev_k, step to the bottom in the Myers graph: y += 1. + * Incrementing y is achieved by decrementing k while keeping the same x. + * (since we're deriving y from y = x - k). + */ + int prev_k = k + 1; + prev_x = kd_forward[prev_k]; + prev_y = xk_to_y(prev_x, prev_k); + x = prev_x; + } + + /* Slide down any snake that we might find here. */ + while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len + && diff_atom_same(&left->atoms.head[x], &right->atoms.head[xk_to_y(x, k)])) + x++; + kd_forward[k] = x; + + if (DEBUG) { + int fi; + for (fi = d; fi >= k; fi--) { + debug("kd_forward[%d] = (%d, %d)\n", fi, kd_forward[fi], kd_forward[fi] - fi); + /* + if (kd_forward[fi] >= 0 && kd_forward[fi] < left->atoms.len) + debug_dump_atom(left, right, &left->atoms.head[kd_forward[fi]]); + else + debug("\n"); + if (kd_forward[fi]-fi >= 0 && kd_forward[fi]-fi < right->atoms.len) + debug_dump_atom(right, left, &right->atoms.head[kd_forward[fi]-fi]); + else + debug("\n"); + */ + } + } + + if (x < 0 || x > left->atoms.len + || xk_to_y(x, k) < 0 || xk_to_y(x, k) > right->atoms.len) + continue; + + /* Figured out a new forwards traversal, see if this has gone onto or even past a preceding backwards + * traversal. + * + * If the delta in length is odd, then d and backwards_d hit the same state indexes: + * | d= 0 1 2 1 0 + * ----+---------------- ---------------- + * k= | c= + * 4 | 3 + * | + * 3 | 2 + * | same + * 2 | 2,0====5,3 1 + * | / \ + * 1 | 1,0 5,4<-- 0 + * | / / + * 0 | -->0,0 3,3====4,4 -1 + * | \ / + * -1 | 0,1 -2 + * | \ + * -2 | 0,2 -3 + * | + * + * If the delta is even, they end up off-by-one, i.e. on different diagonals: + * + * | d= 0 1 2 1 0 + * ----+---------------- ---------------- + * | c= + * 3 | 3 + * | + * 2 | 2,0 off 2 + * | / \\ + * 1 | 1,0 4,3 1 + * | / // \ + * 0 | -->0,0 3,3 4,4<-- 0 + * | \ / / + * -1 | 0,1 3,4 -1 + * | \ // + * -2 | 0,2 -2 + * | + * + * So in the forward path, we can only match up diagonals when the delta is odd. + */ + /* Forwards is done first, so the backwards one was still at d - 1. Can't do this for d == 0. */ + int backwards_d = d - 1; + if ((delta & 1) && (backwards_d >= 0)) { + debug("backwards_d = %d\n", backwards_d); + + /* If both sides have the same length, forward and backward start on the same diagonal, meaning the + * backwards state index c == k. + * As soon as the lengths are not the same, the backwards traversal starts on a different diagonal, and + * c = k shifted by the difference in length. + */ + int c = k_to_c(k, delta); + + /* When the file sizes are very different, the traversal trees start on far distant diagonals. + * They don't necessarily meet straight on. See whether this forward value is on a diagonal that + * is also valid in kd_backward[], and match them if so. */ + if (c >= -backwards_d && c <= backwards_d) { + /* Current k is on a diagonal that exists in kd_backward[]. If the two x positions have + * met or passed (forward walked onto or past backward), then we've found a midpoint / a + * mid-box. + * + * But we need to avoid matching a situation like this: + * 0 1 + * x y + * 0 o-o-o + * x |\| | + * 1 o-o-o + * y | |\| + * 2 (B)o-o <--(B) backwards traversal reached here + * a | | | + * 3 o-o-o<-- prev_x, prev_y + * b | | | + * 4 o-o(F) <--(F) forwards traversal reached here + * x |\| | Now both are on the same diagonal and look like they passed, + * 5 o-o-o but actually they have sneaked past each other and have not met. + * y | |\| + * 6 o-o-o + * + * The solution is to notice that prev_x,prev_y were also already past (B). + */ + int backward_x = kd_backward[c]; + int backward_y = xc_to_y(backward_x, c, delta); + debug(" prev_x,y = (%d,%d) c%d:backward_x,y = (%d,%d) k%d:x,y = (%d,%d)\n", + prev_x, prev_y, c, backward_x, backward_y, k, x, xk_to_y(x, k)); + if (prev_x <= backward_x && prev_y <= backward_y + && x >= backward_x) { + *meeting_snake = (struct diff_box){ + .left_start = backward_x, + .left_end = x, + .right_start = xc_to_y(backward_x, c, delta), + .right_end = xk_to_y(x, k), + }; + debug("HIT x=(%u,%u) - y=(%u,%u)\n", + meeting_snake->left_start, + meeting_snake->right_start, + meeting_snake->left_end, + meeting_snake->right_end); + return; + } + } + } + } +} + +/* Do one backwards step in the "divide and conquer" graph traversal. + * left: the left side to diff. + * right: the right side to diff against. + * kd_forward: the traversal state for forwards traversal, to find a meeting point. + * Since forwards is done first, after this, both kd_forward and kd_backward will be valid for d. + * kd_forward points at the center of the state array, allowing negative indexes. + * kd_backward: the traversal state for backwards traversal, to find a meeting point. + * This is carried over between invocations with increasing d. + * kd_backward points at the center of the state array, allowing negative indexes. + * d: Step or distance counter, indicating for what value of d the kd_backward should be populated. + * Before the first invocation, kd_backward[0] shall point at the bottom right of the Myers graph + * (left.len, right.len). + * The first invocation will be for d == 1. + * meeting_snake: resulting meeting point, if any. + */ +static void diff_divide_myers_backward(struct diff_data *left, struct diff_data *right, + int *kd_forward, int *kd_backward, int d, + struct diff_box *meeting_snake) +{ + int delta = (int)right->atoms.len - (int)left->atoms.len; + int prev_x; + int prev_y; + int c; + int x; + + debug("-- %s d=%d\n", __func__, d); + debug_dump_myers_graph(left, right, NULL); + + for (c = d; c >= -d; c -= 2) { + if (c < -(int)left->atoms.len || c > (int)right->atoms.len) { + /* This diagonal is completely outside of the Myers graph, don't calculate it. */ + if (c < -(int)left->atoms.len) + debug(" %d c < -(int)left->atoms.len %d\n", c, -(int)left->atoms.len); + else + debug(" %d c > right->atoms.len %d\n", c, right->atoms.len); + if (c < 0) { + /* We are traversing negatively, and already below the entire graph, nothing will come + * of this. */ + debug(" break"); + break; + } + debug(" continue"); + continue; + } + debug("- c = %d\n", c); + if (d == 0) { + /* This is the initializing step. There is no prev_c yet, get the initial x from the bottom + * right of the Myers graph. */ + x = left->atoms.len; + } + /* Favoring "-" lines first means favoring moving rightwards in the Myers graph. + * For this, all c should derive from c - 1, only the bottom most c derive from c + 1: + * + * 2 1 0 + * --------------------------------------------------- + * c= + * 3 + * + * from prev_c = c - 1 --> 5,2 2 + * \ + * 5,3 1 + * \ + * 4,3 5,4<-- 0 + * \ / + * bottom most for d=1 from c + 1 --> 4,4 -1 + * / + * bottom most for d=2 --> 3,4 -2 + * + * Except when a c + 1 from a previous run already means a further advancement in the graph. + * If c == d, there is no c + 1 and c - 1 is the only option. + * If c < d, use c + 1 in case that yields a larger x. Also use c + 1 if c - 1 is outside the graph. + */ + else if (c > -d && (c == d + || (c - 1 >= -(int)right->atoms.len + && kd_backward[c - 1] <= kd_backward[c + 1]))) { + /* A top one. + * From position prev_c, step upwards in the Myers graph: y -= 1. + * Decrementing y is achieved by incrementing c while keeping the same x. + * (since we're deriving y from y = x - c + delta). + */ + int prev_c = c - 1; + prev_x = kd_backward[prev_c]; + prev_y = xc_to_y(prev_x, prev_c, delta); + x = prev_x; + } else { + /* The bottom most one. + * From position prev_c, step to the left in the Myers graph: x -= 1. + */ + int prev_c = c + 1; + prev_x = kd_backward[prev_c]; + prev_y = xc_to_y(prev_x, prev_c, delta); + x = prev_x - 1; + } + + /* Slide up any snake that we might find here. */ + debug("c=%d x-1=%d Yb-1=%d-1=%d\n", c, x-1, xc_to_y(x, c, delta), xc_to_y(x, c, delta)-1); + if (x > 0) { + debug(" l="); debug_dump_atom(left, right, &left->atoms.head[x-1]); + } + if (xc_to_y(x, c, delta) > 0) { + debug(" r="); debug_dump_atom(right, left, &right->atoms.head[xc_to_y(x, c, delta)-1]); + } + while (x > 0 && xc_to_y(x, c, delta) > 0 + && diff_atom_same(&left->atoms.head[x-1], &right->atoms.head[xc_to_y(x, c, delta)-1])) + x--; + kd_backward[c] = x; + + if (DEBUG) { + int fi; + for (fi = d; fi >= c; fi--) { + debug("kd_backward[%d] = (%d, %d)\n", fi, kd_backward[fi], + kd_backward[fi] - fi + delta); + /* + if (kd_backward[fi] >= 0 && kd_backward[fi] < left->atoms.len) + debug_dump_atom(left, right, &left->atoms.head[kd_backward[fi]]); + else + debug("\n"); + if (kd_backward[fi]-fi+delta >= 0 && kd_backward[fi]-fi+delta < right->atoms.len) + debug_dump_atom(right, left, &right->atoms.head[kd_backward[fi]-fi+delta]); + else + debug("\n"); + */ + } + } + + if (x < 0 || x > left->atoms.len + || xc_to_y(x, c, delta) < 0 || xc_to_y(x, c, delta) > right->atoms.len) + continue; + + /* Figured out a new backwards traversal, see if this has gone onto or even past a preceding forwards + * traversal. + * + * If the delta in length is even, then d and backwards_d hit the same state indexes -- note how this is + * different from in the forwards traversal, because now both d are the same: + * + * | d= 0 1 2 2 1 0 + * ----+---------------- -------------------- + * k= | c= + * 4 | + * | + * 3 | 3 + * | same + * 2 | 2,0====5,2 2 + * | / \ + * 1 | 1,0 5,3 1 + * | / / \ + * 0 | -->0,0 3,3====4,3 5,4<-- 0 + * | \ / / + * -1 | 0,1 4,4 -1 + * | \ + * -2 | 0,2 -2 + * | + * -3 + * If the delta is odd, they end up off-by-one, i.e. on different diagonals. + * So in the backward path, we can only match up diagonals when the delta is even. + */ + if ((delta & 1) == 0) { + /* Forwards was done first, now both d are the same. */ + int forwards_d = d; + + /* As soon as the lengths are not the same, the backwards traversal starts on a different diagonal, and + * c = k shifted by the difference in length. + */ + int k = c_to_k(c, delta); + + /* When the file sizes are very different, the traversal trees start on far distant diagonals. + * They don't necessarily meet straight on. See whether this backward value is also on a valid + * diagonal in kd_forward[], and match them if so. */ + if (k >= -forwards_d && k <= forwards_d) { + /* Current c is on a diagonal that exists in kd_forward[]. If the two x positions have + * met or passed (backward walked onto or past forward), then we've found a midpoint / a + * mid-box. */ + int forward_x = kd_forward[k]; + int forward_y = xk_to_y(forward_x, k); + debug("Compare %d to %d k=%d (x=%d,y=%d) to (x=%d,y=%d)\n", + forward_x, x, k, + forward_x, xk_to_y(forward_x, k), x, xc_to_y(x, c, delta)); + if (forward_x <= prev_x && forward_y <= prev_y + && forward_x >= x) { + *meeting_snake = (struct diff_box){ + .left_start = x, + .left_end = forward_x, + .right_start = xc_to_y(x, c, delta), + .right_end = xk_to_y(forward_x, k), + }; + debug("HIT x=%u,%u - y=%u,%u\n", + meeting_snake->left_start, + meeting_snake->right_start, + meeting_snake->left_end, + meeting_snake->right_end); + return; + } + } + } + } +} + +/* Myers "Divide et Impera": tracing forwards from the start and backwards from the end to find a midpoint that divides + * the problem into smaller chunks. Requires only linear amounts of memory. */ +enum diff_rc diff_algo_myers_divide(const struct diff_algo_config *algo_config, struct diff_state *state) +{ + enum diff_rc rc = DIFF_RC_ENOMEM; + struct diff_data *left = &state->left; + struct diff_data *right = &state->right; + + debug("\n** %s\n", __func__); + debug("left:\n"); + debug_dump(left); + debug("right:\n"); + debug_dump(right); + debug_dump_myers_graph(left, right, NULL); + + /* Allocate two columns of a Myers graph, one for the forward and one for the backward traversal. */ + unsigned int max = left->atoms.len + right->atoms.len; + size_t kd_len = max + 1; + size_t kd_buf_size = kd_len << 1; + int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int)); + if (!kd_buf) + return DIFF_RC_ENOMEM; + int i; + for (i = 0; i < kd_buf_size; i++) + kd_buf[i] = -1; + int *kd_forward = kd_buf; + int *kd_backward = kd_buf + kd_len; + + /* The 'k' axis in Myers spans positive and negative indexes, so point the kd to the middle. + * It is then possible to index from -max/2 .. max/2. */ + kd_forward += max/2; + kd_backward += max/2; + + int d; + struct diff_box mid_snake = {}; + for (d = 0; d <= (max/2); d++) { + debug("-- d=%d\n", d); + diff_divide_myers_forward(left, right, kd_forward, kd_backward, d, &mid_snake); + if (!diff_box_empty(&mid_snake)) + break; + diff_divide_myers_backward(left, right, kd_forward, kd_backward, d, &mid_snake); + if (!diff_box_empty(&mid_snake)) + break; + } + + if (diff_box_empty(&mid_snake)) { + /* Divide and conquer failed to find a meeting point. Use the fallback_algo defined in the algo_config + * (leave this to the caller). This is just paranoia/sanity, we normally should always find a midpoint. + */ + debug(" no midpoint \n"); + rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK; + goto return_rc; + } else { + debug(" mid snake L: %u to %u of %u R: %u to %u of %u\n", + mid_snake.left_start, mid_snake.left_end, left->atoms.len, + mid_snake.right_start, mid_snake.right_end, right->atoms.len); + + /* Section before the mid-snake. */ + debug("Section before the mid-snake\n"); + + struct diff_atom *left_atom = &left->atoms.head[0]; + unsigned int left_section_len = mid_snake.left_start; + struct diff_atom *right_atom = &right->atoms.head[0]; + unsigned int right_section_len = mid_snake.right_start; + + if (left_section_len && right_section_len) { + /* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */ + if (!diff_state_add_chunk(state, false, + left_atom, left_section_len, + right_atom, right_section_len)) + goto return_rc; + } else if (left_section_len && !right_section_len) { + /* Only left atoms and none on the right, they form a "minus" chunk, then. */ + if (!diff_state_add_chunk(state, true, + left_atom, left_section_len, + right_atom, 0)) + goto return_rc; + } else if (!left_section_len && right_section_len) { + /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */ + if (!diff_state_add_chunk(state, true, + left_atom, 0, + right_atom, right_section_len)) + goto return_rc; + } + /* else: left_section_len == 0 and right_section_len == 0, i.e. nothing before the mid-snake. */ + + /* the mid-snake, identical data on both sides: */ + debug("the mid-snake\n"); + if (!diff_state_add_chunk(state, true, + &left->atoms.head[mid_snake.left_start], + mid_snake.left_end - mid_snake.left_start, + &right->atoms.head[mid_snake.right_start], + mid_snake.right_end - mid_snake.right_start)) + goto return_rc; + + /* Section after the mid-snake. */ + debug("Section after the mid-snake\n"); + debug(" left_end %u right_end %u\n", mid_snake.left_end, mid_snake.right_end); + debug(" left_count %u right_count %u\n", left->atoms.len, right->atoms.len); + left_atom = &left->atoms.head[mid_snake.left_end]; + left_section_len = left->atoms.len - mid_snake.left_end; + right_atom = &right->atoms.head[mid_snake.right_end]; + right_section_len = right->atoms.len - mid_snake.right_end; + + if (left_section_len && right_section_len) { + /* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */ + if (!diff_state_add_chunk(state, false, + left_atom, left_section_len, + right_atom, right_section_len)) + goto return_rc; + } else if (left_section_len && !right_section_len) { + /* Only left atoms and none on the right, they form a "minus" chunk, then. */ + if (!diff_state_add_chunk(state, true, + left_atom, left_section_len, + right_atom, 0)) + goto return_rc; + } else if (!left_section_len && right_section_len) { + /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */ + if (!diff_state_add_chunk(state, true, + left_atom, 0, + right_atom, right_section_len)) + goto return_rc; + } + /* else: left_section_len == 0 and right_section_len == 0, i.e. nothing after the mid-snake. */ + } + + rc = DIFF_RC_OK; + +return_rc: + free(kd_buf); + debug("** END %s\n", __func__); + return rc; +} + +/* Myers Diff tracing from the start all the way through to the end, requiring quadratic amounts of memory. This can + * fail if the required space surpasses algo_config->permitted_state_size. */ +enum diff_rc diff_algo_myers(const struct diff_algo_config *algo_config, struct diff_state *state) +{ + /* do a diff_divide_myers_forward() without a _backward(), so that it walks forward across the entire + * files to reach the end. Keep each run's state, and do a final backtrace. */ + enum diff_rc rc = DIFF_RC_ENOMEM; + struct diff_data *left = &state->left; + struct diff_data *right = &state->right; + + debug("\n** %s\n", __func__); + debug("left:\n"); + debug_dump(left); + debug("right:\n"); + debug_dump(right); + debug_dump_myers_graph(left, right, NULL); + + /* Allocate two columns of a Myers graph, one for the forward and one for the backward traversal. */ + unsigned int max = left->atoms.len + right->atoms.len; + size_t kd_len = max + 1 + max; + size_t kd_buf_size = kd_len * kd_len; + debug("state size: %zu\n", kd_buf_size); + if (kd_buf_size < kd_len /* overflow? */ + || kd_buf_size * sizeof(int) > algo_config->permitted_state_size) { + debug("state size %zu > permitted_state_size %zu, use fallback_algo\n", + kd_buf_size, algo_config->permitted_state_size); + return DIFF_RC_USE_DIFF_ALGO_FALLBACK; + } + + int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int)); + if (!kd_buf) + return DIFF_RC_ENOMEM; + int i; + for (i = 0; i < kd_buf_size; i++) + kd_buf[i] = -1; + + /* The 'k' axis in Myers spans positive and negative indexes, so point the kd to the middle. + * It is then possible to index from -max .. max. */ + int *kd_origin = kd_buf + max; + int *kd_column = kd_origin; + + int d; + int backtrack_d = -1; + int backtrack_k = 0; + int k; + int x, y; + for (d = 0; d <= max; d++, kd_column += kd_len) { + debug("-- d=%d\n", d); + + debug("-- %s d=%d\n", __func__, d); + + for (k = d; k >= -d; k -= 2) { + if (k < -(int)right->atoms.len || k > (int)left->atoms.len) { + /* This diagonal is completely outside of the Myers graph, don't calculate it. */ + if (k < -(int)right->atoms.len) + debug(" %d k < -(int)right->atoms.len %d\n", k, -(int)right->atoms.len); + else + debug(" %d k > left->atoms.len %d\n", k, left->atoms.len); + if (k < 0) { + /* We are traversing negatively, and already below the entire graph, nothing will come + * of this. */ + debug(" break"); + break; + } + debug(" continue"); + continue; + } + + debug("- k = %d\n", k); + if (d == 0) { + /* This is the initializing step. There is no prev_k yet, get the initial x from the top left of + * the Myers graph. */ + x = 0; + } else { + int *kd_prev_column = kd_column - kd_len; + + /* Favoring "-" lines first means favoring moving rightwards in the Myers graph. + * For this, all k should derive from k - 1, only the bottom most k derive from k + 1: + * + * | d= 0 1 2 + * ----+---------------- + * k= | + * 2 | 2,0 <-- from prev_k = 2 - 1 = 1 + * | / + * 1 | 1,0 + * | / + * 0 | -->0,0 3,3 + * | \\ / + * -1 | 0,1 <-- bottom most for d=1 from prev_k = -1 + 1 = 0 + * | \\ + * -2 | 0,2 <-- bottom most for d=2 from prev_k = -2 + 1 = -1 + * + * Except when a k + 1 from a previous run already means a further advancement in the graph. + * If k == d, there is no k + 1 and k - 1 is the only option. + * If k < d, use k + 1 in case that yields a larger x. Also use k + 1 if k - 1 is outside the graph. + */ + if (k > -d && (k == d + || (k - 1 >= -(int)right->atoms.len + && kd_prev_column[k - 1] >= kd_prev_column[k + 1]))) { + /* Advance from k - 1. + * From position prev_k, step to the right in the Myers graph: x += 1. + */ + int prev_k = k - 1; + int prev_x = kd_prev_column[prev_k]; + x = prev_x + 1; + } else { + /* The bottom most one. + * From position prev_k, step to the bottom in the Myers graph: y += 1. + * Incrementing y is achieved by decrementing k while keeping the same x. + * (since we're deriving y from y = x - k). + */ + int prev_k = k + 1; + int prev_x = kd_prev_column[prev_k]; + x = prev_x; + } + } + + /* Slide down any snake that we might find here. */ + while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len + && diff_atom_same(&left->atoms.head[x], &right->atoms.head[xk_to_y(x, k)])) + x++; + kd_column[k] = x; + + if (DEBUG) { + int fi; + for (fi = d; fi >= k; fi-=2) { + debug("kd_column[%d] = (%d, %d)\n", fi, kd_column[fi], kd_column[fi] - fi); +#if 0 + if (kd_column[fi] >= 0 && kd_column[fi] < left->atoms.len) + debug_dump_atom(left, right, &left->atoms.head[kd_column[fi]]); + else + debug("\n"); + if (kd_column[fi]-fi >= 0 && kd_column[fi]-fi < right->atoms.len) + debug_dump_atom(right, left, &right->atoms.head[kd_column[fi]-fi]); + else + debug("\n"); +#endif + } + } + + if (x == left->atoms.len && xk_to_y(x, k) == right->atoms.len) { + /* Found a path */ + backtrack_d = d; + backtrack_k = k; + debug("Reached the end at d = %d, k = %d\n", + backtrack_d, backtrack_k); + break; + } + } + + if (backtrack_d >= 0) + break; + } + + debug_dump_myers_graph(left, right, kd_origin); + + /* backtrack. A matrix spanning from start to end of the file is ready: + * + * | d= 0 1 2 3 4 + * ----+--------------------------------- + * k= | + * 3 | + * | + * 2 | 2,0 + * | / + * 1 | 1,0 4,3 + * | / / \ + * 0 | -->0,0 3,3 4,4 --> backtrack_d = 4, backtrack_k = 0 + * | \ / \ + * -1 | 0,1 3,4 + * | \ + * -2 | 0,2 + * | + * + * From (4,4) backwards, find the previous position that is the largest, and remember it. + * + */ + for (d = backtrack_d, k = backtrack_k; d >= 0; d--) { + x = kd_column[k]; + y = xk_to_y(x, k); + + /* When the best position is identified, remember it for that kd_column. + * That kd_column is no longer needed otherwise, so just re-purpose kd_column[0] = x and kd_column[1] = y, + * so that there is no need to allocate more memory. + */ + kd_column[0] = x; + kd_column[1] = y; + debug("Backtrack d=%d: xy=(%d, %d)\n", + d, kd_column[0], kd_column[1]); + + /* Don't access memory before kd_buf */ + if (d == 0) + break; + int *kd_prev_column = kd_column - kd_len; + + /* When y == 0, backtracking downwards (k-1) is the only way. + * When x == 0, backtracking upwards (k+1) is the only way. + * + * | d= 0 1 2 3 4 + * ----+--------------------------------- + * k= | + * 3 | + * | ..y == 0 + * 2 | 2,0 + * | / + * 1 | 1,0 4,3 + * | / / \ + * 0 | -->0,0 3,3 4,4 --> backtrack_d = 4, backtrack_k = 0 + * | \ / \ + * -1 | 0,1 3,4 + * | \ + * -2 | 0,2__ + * | x == 0 + */ + debug("prev[k-1] = %d,%d prev[k+1] = %d,%d\n", + kd_prev_column[k-1], xk_to_y(kd_prev_column[k-1],k-1), + kd_prev_column[k+1], xk_to_y(kd_prev_column[k+1],k+1)); + if (y == 0 + || (x > 0 && kd_prev_column[k - 1] >= kd_prev_column[k + 1])) { + k = k - 1; + debug("prev k=k-1=%d x=%d y=%d\n", + k, kd_prev_column[k], xk_to_y(kd_prev_column[k], k)); + } else { + k = k + 1; + debug("prev k=k+1=%d x=%d y=%d\n", + k, kd_prev_column[k], xk_to_y(kd_prev_column[k], k)); + } + kd_column = kd_prev_column; + } + + /* Forwards again, this time recording the diff chunks. + * Definitely start from 0,0. kd_column[0] may actually point to the bottom of a snake starting at 0,0 */ + x = 0; + y = 0; + + kd_column = kd_origin; + for (d = 0; d <= backtrack_d; d++, kd_column += kd_len) { + int next_x = kd_column[0]; + int next_y = kd_column[1]; + debug("Forward track from xy(%d,%d) to xy(%d,%d)\n", + x, y, next_x, next_y); + + struct diff_atom *left_atom = &left->atoms.head[x]; + int left_section_len = next_x - x; + struct diff_atom *right_atom = &right->atoms.head[y]; + int right_section_len = next_y - y; + + rc = DIFF_RC_ENOMEM; + if (left_section_len && right_section_len) { + /* This must be a snake slide. + * Snake slides have a straight line leading into them (except when starting at (0,0)). Find + * out whether the lead-in is horizontal or vertical: + * + * left + * ----------> + * | + * r| o-o o + * i| \ | + * g| o o + * h| \ \ + * t| o o + * v + * + * If left_section_len > right_section_len, the lead-in is horizontal, meaning first + * remove one atom from the left before sliding down the snake. + * If right_section_len > left_section_len, the lead-in is vetical, so add one atom from + * the right before sliding down the snake. */ + if (left_section_len == right_section_len + 1) { + if (!diff_state_add_chunk(state, true, + left_atom, 1, + right_atom, 0)) + goto return_rc; + left_atom++; + left_section_len--; + } else if (right_section_len == left_section_len + 1) { + if (!diff_state_add_chunk(state, true, + left_atom, 0, + right_atom, 1)) + goto return_rc; + right_atom++; + right_section_len--; + } else if (left_section_len != right_section_len) { + /* The numbers are making no sense. Should never happen. */ + rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK; + goto return_rc; + } + + if (!diff_state_add_chunk(state, true, + left_atom, left_section_len, + right_atom, right_section_len)) + goto return_rc; + } else if (left_section_len && !right_section_len) { + /* Only left atoms and none on the right, they form a "minus" chunk, then. */ + if (!diff_state_add_chunk(state, true, + left_atom, left_section_len, + right_atom, 0)) + goto return_rc; + } else if (!left_section_len && right_section_len) { + /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */ + if (!diff_state_add_chunk(state, true, + left_atom, 0, + right_atom, right_section_len)) + goto return_rc; + } + + x = next_x; + y = next_y; + } + + rc = DIFF_RC_OK; + +return_rc: + free(kd_buf); + debug("** END %s rc=%d\n", __func__, rc); + return rc; +} blob - /dev/null blob + 8c9e498eb29496228f6ff282f223b721ba1a3d03 (mode 644) --- /dev/null +++ diff_output.c @@ -0,0 +1,52 @@ +/* Common parts for printing diff output */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "diff_output.h" + +void diff_output_lines(FILE *dest, const char *prefix, struct diff_atom *start_atom, unsigned int count) +{ + struct diff_atom *atom; + foreach_diff_atom(atom, start_atom, count) { + fprintf(dest, "%s", prefix); + int i; + unsigned int len = atom->len; + if (len && atom->at[len - 1] == '\n') { + len--; + if (len && atom->at[len - 1] == '\r') + len--; + } + + for (i = 0; i < len; i++) { + char c = atom->at[i]; + if ((c < 0x20 || c >= 0x7f) && c != '\t') + fprintf(dest, "\\x%02x", (unsigned char)c); + else + fprintf(dest, "%c", c); + } + fprintf(dest, "\n"); + } +} + +enum diff_rc diff_output_info(FILE *dest, const struct diff_input_info *info) +{ + if (info->arbitrary_info && *info->arbitrary_info) + fprintf(dest, "%s", info->arbitrary_info); + fprintf(dest, "--- %s\n+++ %s\n", + info->left_path ? : "a", + info->right_path ? : "b"); + return DIFF_RC_OK; +} blob - /dev/null blob + cb6854b078da31b5426b493fb1128b98361d923e (mode 644) --- /dev/null +++ diff_output.h @@ -0,0 +1,43 @@ +/* Diff output generators and invocation shims. */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#pragma once + +#include +#include "diff_main.h" + +struct diff_input_info { + const char *arbitrary_info; + const char *left_path; + const char *right_path; +}; + +enum diff_rc diff_output_plain(FILE *dest, const struct diff_input_info *info, + const struct diff_result *result); +enum diff_rc diff_plain(FILE *dest, const struct diff_config *diff_config, + const struct diff_input_info *info, + const char *left, int left_len, const char *right, int right_len); + +enum diff_rc diff_output_unidiff(FILE *dest, const struct diff_input_info *info, + const struct diff_result *result, unsigned int context_lines); +enum diff_rc diff_unidiff(FILE *dest, const struct diff_config *diff_config, + const struct diff_input_info *info, + const char *left, int left_len, const char *right, int right_len, + unsigned int context_lines); + +enum diff_rc diff_output_info(FILE *dest, const struct diff_input_info *info); +void diff_output_lines(FILE *dest, const char *prefix, struct diff_atom *start_atom, unsigned int count); blob - /dev/null blob + fa532a4c439692797a14d9c7e9e0c5c55c161029 (mode 644) --- /dev/null +++ diff_output_plain.c @@ -0,0 +1,54 @@ +/* Output all lines of a diff_result. */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "diff_output.h" + +enum diff_rc diff_output_plain(FILE *dest, const struct diff_input_info *info, + const struct diff_result *result) +{ + if (!result) + return DIFF_RC_EINVAL; + if (result->rc != DIFF_RC_OK) + return result->rc; + + diff_output_info(dest, info); + + int i; + for (i = 0; i < result->chunks.len; i++) { + struct diff_chunk *c = &result->chunks.head[i]; + if (c->left_count && c->right_count) + diff_output_lines(dest, c->solved ? " " : "?", c->left_start, c->left_count); + else if (c->left_count && !c->right_count) + diff_output_lines(dest, c->solved ? "-" : "?", c->left_start, c->left_count); + else if (c->right_count && !c->left_count) + diff_output_lines(dest, c->solved ? "+" : "?", c->right_start, c->right_count); + } + return DIFF_RC_OK; +} + +enum diff_rc diff_plain(FILE *dest, const struct diff_config *diff_config, + const struct diff_input_info *info, + const char *left, int left_len, const char *right, int right_len) +{ + enum diff_rc rc; + left_len = left_len < 0 ? strlen(left) : left_len; + right_len = right_len < 0 ? strlen(right) : right_len; + struct diff_result *result = diff_main(diff_config, left, left_len, right, right_len); + rc = diff_output_plain(dest, info, result); + diff_result_free(result); + return rc; +} blob - /dev/null blob + 4c95e6050fb2e06f84ee46543b0ebe1f83119115 (mode 644) --- /dev/null +++ diff_output_unidiff.c @@ -0,0 +1,206 @@ +/* Produce a unidiff output from a diff_result. */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "diff_output.h" +#include "debug.h" + +enum chunk_type { + CHUNK_EMPTY, + CHUNK_PLUS, + CHUNK_MINUS, + CHUNK_SAME, + CHUNK_WEIRD, +}; + +static inline enum chunk_type chunk_type(const struct diff_chunk *chunk) +{ + if (!chunk->left_count && !chunk->right_count) + return CHUNK_EMPTY; + if (!chunk->solved) + return CHUNK_WEIRD; + if (!chunk->right_count) + return CHUNK_MINUS; + if (!chunk->left_count) + return CHUNK_PLUS; + if (chunk->left_count != chunk->right_count) + return CHUNK_WEIRD; + return CHUNK_SAME; +} + +struct chunk_context { + struct range chunk; + struct range left, right; +}; + +static bool chunk_context_empty(const struct chunk_context *cc) +{ + return range_empty(&cc->chunk); +} + +static void chunk_context_get(struct chunk_context *cc, const struct diff_result *r, int chunk_idx, + int context_lines) +{ + const struct diff_chunk *c = &r->chunks.head[chunk_idx]; + int left_start = diff_atom_root_idx(&r->left, c->left_start); + int right_start = diff_atom_root_idx(&r->right, c->right_start); + + *cc = (struct chunk_context){ + .chunk = { + .start = chunk_idx, + .end = chunk_idx + 1, + }, + .left = { + .start = MAX(0, left_start - context_lines), + .end = MIN(r->left.atoms.len, left_start + c->left_count + context_lines), + }, + .right = { + .start = MAX(0, right_start - context_lines), + .end = MIN(r->right.atoms.len, right_start + c->right_count + context_lines), + }, + }; +} + +static bool chunk_contexts_touch(const struct chunk_context *cc, const struct chunk_context *other) +{ + return ranges_touch(&cc->chunk, &other->chunk) + || ranges_touch(&cc->left, &other->left) + || ranges_touch(&cc->right, &other->right); +} + +static void chunk_contexts_merge(struct chunk_context *cc, const struct chunk_context *other) +{ + ranges_merge(&cc->chunk, &other->chunk); + ranges_merge(&cc->left, &other->left); + ranges_merge(&cc->right, &other->right); +} + +static void diff_output_unidiff_chunk(FILE *dest, bool *info_printed, const struct diff_input_info *info, + const struct diff_result *result, const struct chunk_context *cc) +{ + if (range_empty(&cc->left) && range_empty(&cc->right)) + return; + + if (!(*info_printed)) { + diff_output_info(dest, info); + *info_printed = true; + } + + fprintf(dest, "@@ -%d,%d +%d,%d @@\n", + cc->left.start + 1, cc->left.end - cc->left.start, + cc->right.start + 1, cc->right.end - cc->right.start); + + /* Got the absolute line numbers where to start printing, and the index of the interesting (non-context) chunk. + * To print context lines above the interesting chunk, nipping on the previous chunk index may be necessary. + * It is guaranteed to be only context lines where left == right, so it suffices to look on the left. */ + const struct diff_chunk *first_chunk = &result->chunks.head[cc->chunk.start]; + int chunk_start_line = diff_atom_root_idx(&result->left, first_chunk->left_start); + if (cc->left.start < chunk_start_line) + diff_output_lines(dest, " ", &result->left.atoms.head[cc->left.start], + chunk_start_line - cc->left.start); + + /* Now write out all the joined chunks and contexts between them */ + int c_idx; + for (c_idx = cc->chunk.start; c_idx < cc->chunk.end; c_idx++) { + const struct diff_chunk *c = &result->chunks.head[c_idx]; + + if (c->left_count && c->right_count) + diff_output_lines(dest, c->solved ? " " : "?", c->left_start, c->left_count); + else if (c->left_count && !c->right_count) + diff_output_lines(dest, c->solved ? "-" : "?", c->left_start, c->left_count); + else if (c->right_count && !c->left_count) + diff_output_lines(dest, c->solved ? "+" : "?", c->right_start, c->right_count); + } + + /* Trailing context? */ + const struct diff_chunk *last_chunk = &result->chunks.head[cc->chunk.end - 1]; + int chunk_end_line = diff_atom_root_idx(&result->left, last_chunk->left_start + last_chunk->left_count); + if (cc->left.end > chunk_end_line) + diff_output_lines(dest, " ", &result->left.atoms.head[chunk_end_line], + cc->left.end - chunk_end_line); +} + +enum diff_rc diff_output_unidiff(FILE *dest, const struct diff_input_info *info, + const struct diff_result *result, unsigned int context_lines) +{ + if (!result) + return DIFF_RC_EINVAL; + if (result->rc != DIFF_RC_OK) + return result->rc; + + struct chunk_context cc = {}; + bool info_printed = false; + + int i; + for (i = 0; i < result->chunks.len; i++) { + struct diff_chunk *c = &result->chunks.head[i]; + enum chunk_type t = chunk_type(c); + + if (t == CHUNK_MINUS || t == CHUNK_PLUS) { + if (chunk_context_empty(&cc)) { + /* These are the first lines being printed. + * Note down the start point, any number of subsequent chunks may be joined up to this + * unidiff chunk by context lines or by being directly adjacent. */ + chunk_context_get(&cc, result, i, context_lines); + debug("new chunk to be printed: chunk %d-%d left %d-%d right %d-%d\n", + cc.chunk.start, cc.chunk.end, + cc.left.start, cc.left.end, cc.right.start, cc.right.end); + } else { + /* There already is a previous chunk noted down for being printed. + * Does it join up with this one? */ + struct chunk_context next; + chunk_context_get(&next, result, i, context_lines); + debug("new chunk to be printed: chunk %d-%d left %d-%d right %d-%d\n", + next.chunk.start, next.chunk.end, + next.left.start, next.left.end, next.right.start, next.right.end); + if (chunk_contexts_touch(&cc, &next)) { + /* This next context touches or overlaps the previous one, join. */ + chunk_contexts_merge(&cc, &next); + debug("new chunk to be printed touches previous chunk, now: left %d-%d right %d-%d\n", + cc.left.start, cc.left.end, cc.right.start, cc.right.end); + } else { + /* No touching, so the previous context is complete with a gap between it and + * this next one. Print the previous one and start fresh here. */ + debug("new chunk to be printed does not touch previous chunk; print left %d-%d right %d-%d\n", + cc.left.start, cc.left.end, cc.right.start, cc.right.end); + diff_output_unidiff_chunk(dest, &info_printed, info, result, &cc); + cc = next; + debug("new unprinted chunk is left %d-%d right %d-%d\n", + cc.left.start, cc.left.end, cc.right.start, cc.right.end); + } + } + } + + } + + if (!chunk_context_empty(&cc)) + diff_output_unidiff_chunk(dest, &info_printed, info, result, &cc); + return DIFF_RC_OK; +} + +enum diff_rc diff_unidiff(FILE *dest, const struct diff_config *diff_config, + const struct diff_input_info *info, + const char *left, int left_len, const char *right, int right_len, + unsigned int context_lines) +{ + enum diff_rc rc; + left_len = left_len < 0 ? strlen(left) : left_len; + right_len = right_len < 0 ? strlen(right) : right_len; + struct diff_result *result = diff_main(diff_config, left, left_len, right, right_len); + rc = diff_output_unidiff(dest, info, result, context_lines); + diff_result_free(result); + return rc; +} blob - /dev/null blob + 9d24bfe161fabd6c99cfd6b78be90df096a9d5ae (mode 644) --- /dev/null +++ diff_patience.c @@ -0,0 +1,414 @@ +/* Implementation of the Patience Diff algorithm invented by Bram Cohen: + * Divide a diff problem into smaller chunks by an LCS of common-unique lines. */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +#include "diff_main.h" +#include "debug.h" + +/* Set unique_here = true for all atoms that exist exactly once in this list. */ +static void diff_atoms_mark_unique(struct diff_data *d, unsigned int *unique_count) +{ + struct diff_atom *i; + unsigned int count = 0; + diff_data_foreach_atom(i, d) { + i->patience.unique_here = true; + i->patience.unique_in_both = true; + count++; + } + diff_data_foreach_atom(i, d) { + struct diff_atom *j; + + if (!i->patience.unique_here) + continue; + + diff_data_foreach_atom_from(i + 1, j, d) { + if (diff_atom_same(i, j)) { + if (i->patience.unique_here) { + i->patience.unique_here = false; + i->patience.unique_in_both = false; + count--; + } + j->patience.unique_here = false; + j->patience.unique_in_both = false; + count--; + } + } + } + if (unique_count) + *unique_count = count; +} + +/* Mark those lines as atom->patience.unique_in_both = true that appear exactly once in each side. */ +static void diff_atoms_mark_unique_in_both(struct diff_data *left, struct diff_data *right, + unsigned int *unique_in_both_count) +{ + /* Derive the final unique_in_both count without needing an explicit iteration. So this is just some + * optimiziation to save one iteration in the end. */ + unsigned int unique_in_both; + + diff_atoms_mark_unique(left, &unique_in_both); + diff_atoms_mark_unique(right, NULL); + + debug("unique_in_both %u\n", unique_in_both); + + struct diff_atom *i; + diff_data_foreach_atom(i, left) { + if (!i->patience.unique_here) + continue; + struct diff_atom *j; + int found_in_b = 0; + diff_data_foreach_atom(j, right) { + if (!diff_atom_same(i, j)) + continue; + if (!j->patience.unique_here) { + found_in_b = 2; /* or more */ + break; + } else { + found_in_b = 1; + j->patience.pos_in_other = i; + i->patience.pos_in_other = j; + } + } + + if (found_in_b == 0 || found_in_b > 1) { + i->patience.unique_in_both = false; + unique_in_both--; + debug("unique_in_both %u (%d) ", unique_in_both, found_in_b); + debug_dump_atom(left, NULL, i); + } + } + + /* Still need to unmark right[*]->patience.unique_in_both for atoms that don't exist in left */ + diff_data_foreach_atom(i, right) { + if (!i->patience.unique_here + || !i->patience.unique_in_both) + continue; + struct diff_atom *j; + bool found_in_a = false; + diff_data_foreach_atom(j, left) { + if (!j->patience.unique_in_both) + continue; + if (!diff_atom_same(i, j)) + continue; + found_in_a = true; + break; + } + + if (!found_in_a) + i->patience.unique_in_both = false; + } + + if (unique_in_both_count) + *unique_in_both_count = unique_in_both; +} + +static void diff_atoms_swallow_identical_neighbors(struct diff_data *left, struct diff_data *right, + unsigned int *unique_in_both_count) +{ + debug("trivially combine identical lines arount unique_in_both lines\n"); + + unsigned int l_idx; + unsigned int next_l_idx; + unsigned int l_min = 0; + unsigned int r_min = 0; + for (l_idx = 0; l_idx < left->atoms.len; l_idx = next_l_idx) { + next_l_idx = l_idx + 1; + struct diff_atom *l = &left->atoms.head[l_idx]; + + if (!l->patience.unique_in_both) + continue; + + debug("check identical lines around "); + debug_dump_atom(left, right, l); + + unsigned int r_idx = diff_atom_idx(right, l->patience.pos_in_other); + + struct range identical_l; + struct range identical_r; + + /* Swallow upwards. + * Each common-unique line swallows identical lines upwards and downwards. + * All common-unique lines that were part of the identical lines following below were already swallowed + * in the previous iteration, so we will never hit another common-unique line above. */ + for (identical_l.start = l_idx, identical_r.start = r_idx; + identical_l.start > l_min + && identical_r.start > r_min + && diff_atom_same(&left->atoms.head[identical_l.start - 1], + &right->atoms.head[identical_r.start - 1]); + identical_l.start--, identical_r.start--); + + /* Swallow downwards */ + for (identical_l.end = l_idx + 1, identical_r.end = r_idx + 1; + identical_l.end < left->atoms.len + && identical_r.end < right->atoms.len + && diff_atom_same(&left->atoms.head[identical_l.end], + &right->atoms.head[identical_r.end]); + identical_l.end++, identical_r.end++, + next_l_idx++) { + if (left->atoms.head[identical_l.end].patience.unique_in_both) { + /* Part of a chunk of identical lines, remove from listing of unique_in_both lines */ + left->atoms.head[identical_l.end].patience.unique_in_both = false; + right->atoms.head[identical_r.end].patience.unique_in_both = false; + (*unique_in_both_count)--; + } + } + + l->patience.identical_lines = identical_l; + l->patience.pos_in_other->patience.identical_lines = identical_r; + + l_min = identical_l.end; + r_min = identical_r.end; + + if (!range_empty(&l->patience.identical_lines)) { + debug("common-unique line at l=%u r=%u swallowed identical lines l=%u-%u r=%u-%u\n", + l_idx, r_idx, + identical_l.start, identical_l.end, + identical_r.start, identical_r.end); + } + debug("next_l_idx = %u\n", next_l_idx); + } +} + +/* Among the lines that appear exactly once in each side, find the longest streak that appear in both files in the same + * order (with other stuff allowed to interleave). Use patience sort for that, as in the Patience Diff algorithm. + * See https://bramcohen.livejournal.com/73318.html and, for a much more detailed explanation, + * https://blog.jcoglan.com/2017/09/19/the-patience-diff-algorithm/ */ +enum diff_rc diff_algo_patience(const struct diff_algo_config *algo_config, struct diff_state *state) +{ + enum diff_rc rc = DIFF_RC_ENOMEM; + + struct diff_data *left = &state->left; + struct diff_data *right = &state->right; + + unsigned int unique_in_both_count; + + debug("\n** %s\n", __func__); + + /* Find those lines that appear exactly once in 'left' and exactly once in 'right'. */ + diff_atoms_mark_unique_in_both(left, right, &unique_in_both_count); + + debug("unique_in_both_count %u\n", unique_in_both_count); + debug("left:\n"); + debug_dump(left); + debug("right:\n"); + debug_dump(right); + + if (!unique_in_both_count) { + /* Cannot apply Patience, tell the caller to use fallback_algo instead. */ + return DIFF_RC_USE_DIFF_ALGO_FALLBACK; + } + + diff_atoms_swallow_identical_neighbors(left, right, &unique_in_both_count); + debug("After swallowing identical neighbors: unique_in_both = %u\n", + unique_in_both_count); + + /* An array of Longest Common Sequence is the result of the below subscope: */ + unsigned int lcs_count = 0; + struct diff_atom **lcs = NULL; + struct diff_atom *lcs_tail = NULL; + + { + /* This subscope marks the lifetime of the atom_pointers allocation */ + + /* One chunk of storage for atom pointers */ + struct diff_atom **atom_pointers = recallocarray(NULL, 0, unique_in_both_count * 2, sizeof(struct diff_atom*)); + + /* Half for the list of atoms that still need to be put on stacks */ + struct diff_atom **uniques = atom_pointers; + + /* Half for the patience sort state's "card stacks" -- we remember only each stack's topmost "card" */ + struct diff_atom **patience_stacks = atom_pointers + unique_in_both_count; + unsigned int patience_stacks_count = 0; + + /* Take all common, unique items from 'left' ... */ + + struct diff_atom *atom; + struct diff_atom **uniques_end = uniques; + diff_data_foreach_atom(atom, left) { + if (!atom->patience.unique_in_both) + continue; + *uniques_end = atom; + uniques_end++; + } + + /* ...and sort them to the order found in 'right'. + * The idea is to find the leftmost stack that has a higher line number and add it to the stack's top. + * If there is no such stack, open a new one on the right. The line number is derived from the atom*, + * which are array items and hence reflect the relative position in the source file. So we got the + * common-uniques from 'left' and sort them according to atom->patience.pos_in_other. */ + unsigned int i; + for (i = 0; i < unique_in_both_count; i++) { + atom = uniques[i]; + unsigned int target_stack; + + if (!patience_stacks_count) + target_stack = 0; + else { + /* binary search to find the stack to put this atom "card" on. */ + unsigned int lo = 0; + unsigned int hi = patience_stacks_count; + while (lo < hi) { + unsigned int mid = (lo + hi) >> 1; + if (patience_stacks[mid]->patience.pos_in_other < atom->patience.pos_in_other) + lo = mid + 1; + else + hi = mid; + } + + target_stack = lo; + } + + assert(target_stack <= patience_stacks_count); + patience_stacks[target_stack] = atom; + if (target_stack == patience_stacks_count) + patience_stacks_count++; + + /* Record a back reference to the next stack on the left, which will form the final longest sequence + * later. */ + atom->patience.prev_stack = target_stack ? patience_stacks[target_stack - 1] : NULL; + + } + + /* backtrace through prev_stack references to form the final longest common sequence */ + lcs_tail = patience_stacks[patience_stacks_count - 1]; + lcs_count = patience_stacks_count; + + /* uniques and patience_stacks are no longer needed. Backpointers are in atom->patience.prev_stack */ + free(atom_pointers); + } + + lcs = recallocarray(NULL, 0, lcs_count, sizeof(struct diff_atom*)); + struct diff_atom **lcs_backtrace_pos = &lcs[lcs_count - 1]; + struct diff_atom *atom; + for (atom = lcs_tail; atom; atom = atom->patience.prev_stack, lcs_backtrace_pos--) { + assert(lcs_backtrace_pos >= lcs); + *lcs_backtrace_pos = atom; + } + + unsigned int i; + if (DEBUG) { + debug("\npatience LCS:\n"); + for (i = 0; i < lcs_count; i++) { + debug_dump_atom(left, right, lcs[i]); + } + } + + + /* TODO: For each common-unique line found (now listed in lcs), swallow lines upwards and downwards that are + * identical on each side. Requires a way to represent atoms being glued to adjacent atoms. */ + + debug("\ntraverse LCS, possibly recursing:\n"); + + /* Now we have pinned positions in both files at which it makes sense to divide the diff problem into smaller + * chunks. Go into the next round: look at each section in turn, trying to again find common-unique lines in + * those smaller sections. As soon as no more are found, the remaining smaller sections are solved by Myers. */ + unsigned int left_pos = 0; + unsigned int right_pos = 0; + for (i = 0; i <= lcs_count; i++) { + struct diff_atom *atom; + struct diff_atom *atom_r; + unsigned int left_idx; + unsigned int right_idx; + + if (i < lcs_count) { + atom = lcs[i]; + atom_r = atom->patience.pos_in_other; + debug("lcs[%u] = left[%u] = right[%u]\n", i, + diff_atom_idx(left, atom), diff_atom_idx(right, atom_r)); + left_idx = atom->patience.identical_lines.start; + right_idx = atom_r->patience.identical_lines.start; + debug(" identical lines l %u-%u r %u-%u\n", + atom->patience.identical_lines.start, atom->patience.identical_lines.end, + atom_r->patience.identical_lines.start, atom_r->patience.identical_lines.end); + } else { + atom = NULL; + atom_r = NULL; + left_idx = left->atoms.len; + right_idx = right->atoms.len; + } + + /* 'atom' now marks an atom that matches on both sides according to patience-diff + * (a common-unique identical atom in both files). + * Handle the section before and the atom itself; the section after will be handled by the next loop + * iteration -- note that i loops to last element + 1 ("i <= lcs_count"), so that there will be another + * final iteration to pick up the last remaining items after the last LCS atom. + * The sections before might also be empty on left and/or right. + * left_pos and right_pos mark the indexes of the first atoms that have not yet been handled in the + * previous loop iteration. + * left_idx and right_idx mark the indexes of the matching atom on left and right, respectively. */ + + debug("iteration %u left_pos %u left_idx %u right_pos %u right_idx %u\n", + i, left_pos, left_idx, right_pos, right_idx); + + struct diff_chunk *chunk; + + /* Section before the matching atom */ + struct diff_atom *left_atom = &left->atoms.head[left_pos]; + unsigned int left_section_len = left_idx - left_pos; + + struct diff_atom *right_atom = &(right->atoms.head[right_pos]); + unsigned int right_section_len = right_idx - right_pos; + + if (left_section_len && right_section_len) { + /* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */ + if (!diff_state_add_chunk(state, false, + left_atom, left_section_len, + right_atom, right_section_len)) + goto return_rc; + } else if (left_section_len && !right_section_len) { + /* Only left atoms and none on the right, they form a "minus" chunk, then. */ + if (!diff_state_add_chunk(state, true, + left_atom, left_section_len, + right_atom, 0)) + goto return_rc; + } else if (!left_section_len && right_section_len) { + /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */ + if (!diff_state_add_chunk(state, true, + left_atom, 0, + right_atom, right_section_len)) + goto return_rc; + } + /* else: left_section_len == 0 and right_section_len == 0, i.e. nothing here. */ + + /* The atom found to match on both sides forms a chunk of equals on each side. In the very last + * iteration of this loop, there is no matching atom, we were just cleaning out the remaining lines. */ + if (atom) { + if (!diff_state_add_chunk(state, true, + left->atoms.head + atom->patience.identical_lines.start, + range_len(&atom->patience.identical_lines), + right->atoms.head + atom_r->patience.identical_lines.start, + range_len(&atom_r->patience.identical_lines))) + goto return_rc; + left_pos = atom->patience.identical_lines.end; + right_pos = atom_r->patience.identical_lines.end; + } else { + left_pos = left_idx + 1; + right_pos = right_idx + 1; + } + debug("end of iteration %u left_pos %u left_idx %u right_pos %u right_idx %u\n", + i, left_pos, left_idx, right_pos, right_idx); + } + debug("** END %s\n", __func__); + + rc = DIFF_RC_OK; + +return_rc: + free(lcs); + return rc; +}