commit - bb7fb738462f1e54a85659098b77605727b9dee0
commit + 55974fa638c18026f21cd0739d3bc6df41a94a2c
blob - 2ff357ceee0eed252b191258de3fea36fa4ae92b
blob + 3cdc42b334160159d59a29028e6fc531223ba48e
--- README
+++ README
The source code itself is not based on the code found in those blogs, but
written from scratch with the knowledge gained.
-Compile:
- OpenBSD:
- make -C diff
- Linux:
- make -f linux_Makefile -C diff
-
Test:
make -C test/
blob - /dev/null
blob + 3a89e2f6754fad430d6111cd5acc08420f47f111 (mode 644)
--- /dev/null
+++ Makefile
+.PATH:${.CURDIR}/../lib
+
+.include "diff-version.mk"
+
+PROG= diff
+SRCS= \
+ diff.c \
+ diff_atomize_text.c \
+ diff_main.c \
+ diff_myers.c \
+ diff_patience.c \
+ diff_output.c \
+ diff_output_plain.c \
+ diff_output_unidiff.c \
+ ${END}
+MAN = ${PROG}.1
+
+CPPFLAGS = -I${.CURDIR}/../include -I${.CURDIR}/../lib
+
+.if defined(PROFILE)
+LDADD = -lutil_p -lz_p -lc_p
+.else
+LDADD = -lutil -lz
+.endif
+
+.if ${DIFF_RELEASE} != "Yes"
+NOMAN = Yes
+.endif
+
+realinstall:
+ ${INSTALL} ${INSTALL_COPY} -o ${BINOWN} -g ${BINGRP} \
+ -m ${BINMODE} ${PROG} ${BINDIR}/${PROG}
+
+dist:
+ mkdir ../diff-${DIFF_VERSION}/diff
+ cp ${SRCS} ${MAN} ../diff-${DIFF_VERSION}/diff
+
+.include <bsd.prog.mk>
blob - b93701332fb1013c6f09d620e138075dc268925c (mode 644)
blob + /dev/null
--- diff/Makefile
+++ /dev/null
-.PATH:${.CURDIR}/../lib
-
-.include "../diff-version.mk"
-
-PROG= diff
-SRCS= \
- diff.c \
- diff_atomize_text.c \
- diff_main.c \
- diff_myers.c \
- diff_patience.c \
- diff_output.c \
- diff_output_plain.c \
- diff_output_unidiff.c \
- ${END}
-MAN = ${PROG}.1
-
-CPPFLAGS = -I${.CURDIR}/../include -I${.CURDIR}/../lib
-
-.if defined(PROFILE)
-LDADD = -lutil_p -lz_p -lc_p
-.else
-LDADD = -lutil -lz
-.endif
-
-.if ${DIFF_RELEASE} != "Yes"
-NOMAN = Yes
-.endif
-
-realinstall:
- ${INSTALL} ${INSTALL_COPY} -o ${BINOWN} -g ${BINGRP} \
- -m ${BINMODE} ${PROG} ${BINDIR}/${PROG}
-
-dist:
- mkdir ../diff-${DIFF_VERSION}/diff
- cp ${SRCS} ${MAN} ../diff-${DIFF_VERSION}/diff
-
-.include <bsd.prog.mk>
blob - 86b3c54681283baa034bfbeeb19d4b2ee19ef50a (mode 644)
blob + /dev/null
--- diff/diff.c
+++ /dev/null
-/* Commandline diff utility to test diff implementations. */
-/*
- * Copyright (c) 2018 Martin Pieuchot
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <sys/mman.h>
-#include <sys/stat.h>
-
-#include <err.h>
-#include <fcntl.h>
-#include <inttypes.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-
-#ifdef __linux__
-/* stupid shims to compile and test on linux */
-#define __dead
-
-static const char *getprogname()
-{
- return "diff";
-}
-#endif
-
-__dead void usage(void);
-int diffreg(char *, char *, int);
-char *mmapfile(const char *, struct stat *);
-
-__dead void
-usage(void)
-{
- fprintf(stderr, "usage: %s file1 file2\n", getprogname());
- exit(1);
-}
-
-int
-main(int argc, char *argv[])
-{
- int ch;
-
- while ((ch = getopt(argc, argv, "")) != -1) {
- switch (ch) {
- default:
- usage();
- }
- }
-
- argc -= optind;
- argv += optind;
-
- if (argc != 2)
- usage();
-
- return diffreg(argv[0], argv[1], 0);
-}
-
-#include <diff/diff_main.h>
-#include <diff/diff_output.h>
-
-const struct diff_algo_config myers, patience, myers_divide;
-
-const struct diff_algo_config myers = (struct diff_algo_config){
- .impl = diff_algo_myers,
- .permitted_state_size = 1024 * 1024 * sizeof(int),
- .fallback_algo = &patience,
-};
-
-const struct diff_algo_config patience = (struct diff_algo_config){
- .impl = diff_algo_patience,
- .inner_algo = &patience, // After subdivision, do Patience again.
- .fallback_algo = &myers_divide, // If subdivision failed, do Myers Divide et Impera.
-};
-
-const struct diff_algo_config myers_divide = (struct diff_algo_config){
- .impl = diff_algo_myers_divide,
- .inner_algo = &myers, // When division succeeded, start from the top.
- // (fallback_algo = NULL implies diff_algo_none).
-};
-
-const struct diff_config diff_config = {
- .atomize_func = diff_atomize_text_by_line,
- .algo = &myers,
-};
-
-int
-diffreg(char *file1, char *file2, int flags)
-{
- char *str1, *str2;
- struct stat st1, st2;
- struct diff_input_info info = {
- .left_path = file1,
- .right_path = file2,
- };
-
- str1 = mmapfile(file1, &st1);
- str2 = mmapfile(file2, &st2);
-
- diff_unidiff(stdout, &diff_config, &info, str1, st1.st_size, str2, st2.st_size, 3);
-
- munmap(str1, st1.st_size);
- munmap(str2, st2.st_size);
-
- return 0;
-}
-
-char *
-mmapfile(const char *path, struct stat *st)
-{
- int fd;
- char *p;
-
- fd = open(path, O_RDONLY);
- if (fd == -1)
- err(2, "%s", path);
-
- if (fstat(fd, st) == -1)
- err(2, "%s", path);
-
- if ((uintmax_t)st->st_size > SIZE_MAX)
- errx(2, "%s: file too big to fit memory", path);
-
- p = mmap(NULL, st->st_size, PROT_READ, MAP_PRIVATE, fd, 0);
- if (p == MAP_FAILED)
- err(2, "mmap");
-
- close(fd);
-
- return p;
-}
blob - 62c7103d70ac32f292cc05f62c782305dd8798f6 (mode 644)
blob + /dev/null
--- diff/linux_Makefile
+++ /dev/null
-CFLAGS = -fsanitize=address -fsanitize=undefined -g -O3
-
-diff: diff.c ../lib/libdiff.a
- gcc $(CFLAGS) -I../include -o $@ $^
-
-
-../lib/libdiff.a: ../lib/*.[hc] ../include/diff/*.h
- $(MAKE) -f linux_Makefile -C ../lib
-
-.PHONY: clean
-clean:
- rm diff
- $(MAKE) -f linux_Makefile -C ../lib clean
blob - /dev/null
blob + 395468a8c29ce7c6d5ca5283a1a1aa247acb3d87 (mode 644)
--- /dev/null
+++ arraylist.h
+/* Auto-reallocating array for arbitrary member types. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __linux__
+/* stupid shims to compile and test on linux */
+#include <strings.h>
+static inline void *reallocarray(void *buf, size_t n, size_t member_size)
+{
+ return realloc(buf, n * member_size);
+}
+static inline void *recallocarray(void *buf, size_t oldn, size_t n, size_t member_size)
+{
+ buf = reallocarray(buf, n, member_size);
+ bzero(((char*)buf) + (oldn * member_size), (n - oldn) * member_size);
+ return buf;
+}
+#endif
+
+
+/* Usage:
+ *
+ * ARRAYLIST(any_type_t) list;
+ * // OR
+ * typedef ARRAYLIST(any_type_t) any_type_list_t;
+ * any_type_list_t list;
+ *
+ * ARRAYLIST_INIT(list, 128); // < number of (at first unused) members to add on each realloc
+ * any_type_t *x;
+ * while (bar) {
+ * ARRAYLIST_ADD(x, list); // < enlarges the allocated array as needed; list.head may change due to realloc
+ * if (!x)
+ * abort();
+ * *x = random_foo_value;
+ * }
+ * for (i = 0; i < list.len; i++)
+ * printf("%s", foo_to_str(list.head[i]));
+ * ARRAYLIST_FREE(list);
+ */
+#define ARRAYLIST(MEMBER_TYPE) \
+ struct { \
+ MEMBER_TYPE *head; \
+ unsigned int len; \
+ unsigned int allocated; \
+ unsigned int alloc_blocksize; \
+ }
+
+#define ARRAYLIST_INIT(ARRAY_LIST, ALLOC_BLOCKSIZE) do { \
+ (ARRAY_LIST).head = NULL; \
+ (ARRAY_LIST).len = 0; \
+ (ARRAY_LIST).allocated = 0; \
+ (ARRAY_LIST).alloc_blocksize = ALLOC_BLOCKSIZE; \
+ } while(0)
+
+#define ARRAYLIST_ADD(NEW_ITEM_P, ARRAY_LIST) do { \
+ if ((ARRAY_LIST).len && !(ARRAY_LIST).allocated) { \
+ NEW_ITEM_P = NULL; \
+ break; \
+ } \
+ if ((ARRAY_LIST).head == NULL || (ARRAY_LIST).allocated < (ARRAY_LIST).len + 1) { \
+ (ARRAY_LIST).allocated += (ARRAY_LIST).alloc_blocksize ? : 8; \
+ (ARRAY_LIST).head = recallocarray((ARRAY_LIST).head, (ARRAY_LIST).len, \
+ (ARRAY_LIST).allocated, sizeof(*(ARRAY_LIST).head)); \
+ }; \
+ if (!(ARRAY_LIST).head || (ARRAY_LIST).allocated < (ARRAY_LIST).len + 1) { \
+ NEW_ITEM_P = NULL; \
+ break; \
+ } \
+ (NEW_ITEM_P) = &(ARRAY_LIST).head[(ARRAY_LIST).len]; \
+ (ARRAY_LIST).len++; \
+ } while (0)
+
+#define ARRAYLIST_CLEAR(ARRAY_LIST) \
+ (ARRAY_LIST).len = 0
+
+#define ARRAYLIST_FREE(ARRAY_LIST) \
+ do { \
+ if ((ARRAY_LIST).head && (ARRAY_LIST).allocated) \
+ free((ARRAY_LIST).head); \
+ ARRAYLIST_INIT(ARRAY_LIST, (ARRAY_LIST).alloc_blocksize); \
+ } while(0)
blob - 395468a8c29ce7c6d5ca5283a1a1aa247acb3d87 (mode 644)
blob + /dev/null
--- include/diff/arraylist.h
+++ /dev/null
-/* Auto-reallocating array for arbitrary member types. */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#pragma once
-
-#ifdef __linux__
-/* stupid shims to compile and test on linux */
-#include <strings.h>
-static inline void *reallocarray(void *buf, size_t n, size_t member_size)
-{
- return realloc(buf, n * member_size);
-}
-static inline void *recallocarray(void *buf, size_t oldn, size_t n, size_t member_size)
-{
- buf = reallocarray(buf, n, member_size);
- bzero(((char*)buf) + (oldn * member_size), (n - oldn) * member_size);
- return buf;
-}
-#endif
-
-
-/* Usage:
- *
- * ARRAYLIST(any_type_t) list;
- * // OR
- * typedef ARRAYLIST(any_type_t) any_type_list_t;
- * any_type_list_t list;
- *
- * ARRAYLIST_INIT(list, 128); // < number of (at first unused) members to add on each realloc
- * any_type_t *x;
- * while (bar) {
- * ARRAYLIST_ADD(x, list); // < enlarges the allocated array as needed; list.head may change due to realloc
- * if (!x)
- * abort();
- * *x = random_foo_value;
- * }
- * for (i = 0; i < list.len; i++)
- * printf("%s", foo_to_str(list.head[i]));
- * ARRAYLIST_FREE(list);
- */
-#define ARRAYLIST(MEMBER_TYPE) \
- struct { \
- MEMBER_TYPE *head; \
- unsigned int len; \
- unsigned int allocated; \
- unsigned int alloc_blocksize; \
- }
-
-#define ARRAYLIST_INIT(ARRAY_LIST, ALLOC_BLOCKSIZE) do { \
- (ARRAY_LIST).head = NULL; \
- (ARRAY_LIST).len = 0; \
- (ARRAY_LIST).allocated = 0; \
- (ARRAY_LIST).alloc_blocksize = ALLOC_BLOCKSIZE; \
- } while(0)
-
-#define ARRAYLIST_ADD(NEW_ITEM_P, ARRAY_LIST) do { \
- if ((ARRAY_LIST).len && !(ARRAY_LIST).allocated) { \
- NEW_ITEM_P = NULL; \
- break; \
- } \
- if ((ARRAY_LIST).head == NULL || (ARRAY_LIST).allocated < (ARRAY_LIST).len + 1) { \
- (ARRAY_LIST).allocated += (ARRAY_LIST).alloc_blocksize ? : 8; \
- (ARRAY_LIST).head = recallocarray((ARRAY_LIST).head, (ARRAY_LIST).len, \
- (ARRAY_LIST).allocated, sizeof(*(ARRAY_LIST).head)); \
- }; \
- if (!(ARRAY_LIST).head || (ARRAY_LIST).allocated < (ARRAY_LIST).len + 1) { \
- NEW_ITEM_P = NULL; \
- break; \
- } \
- (NEW_ITEM_P) = &(ARRAY_LIST).head[(ARRAY_LIST).len]; \
- (ARRAY_LIST).len++; \
- } while (0)
-
-#define ARRAYLIST_CLEAR(ARRAY_LIST) \
- (ARRAY_LIST).len = 0
-
-#define ARRAYLIST_FREE(ARRAY_LIST) \
- do { \
- if ((ARRAY_LIST).head && (ARRAY_LIST).allocated) \
- free((ARRAY_LIST).head); \
- ARRAYLIST_INIT(ARRAY_LIST, (ARRAY_LIST).alloc_blocksize); \
- } while(0)
blob - f6dcef58bb67e1a01ce4a633f7928a184a56ebb1 (mode 644)
blob + /dev/null
--- include/diff/diff_main.h
+++ /dev/null
-/* Generic infrastructure to implement various diff algorithms. */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#pragma once
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <string.h>
-#include <errno.h>
-
-#include <diff/arraylist.h>
-
-#ifndef MAX
-#define MAX(A,B) ((A)>(B)?(A):(B))
-#endif
-#ifndef MIN
-#define MIN(A,B) ((A)<(B)?(A):(B))
-#endif
-
-struct range {
- int start;
- int end;
-};
-
-static inline bool range_empty(const struct range *r)
-{
- return r->start == r->end;
-}
-
-static inline bool ranges_touch(const struct range *a, const struct range *b)
-{
- return (a->end >= b->start) && (a->start <= b->end);
-}
-
-static inline void ranges_merge(struct range *a, const struct range *b)
-{
- *a = (struct range){
- .start = MIN(a->start, b->start),
- .end = MAX(a->end, b->end),
- };
-}
-
-static inline int range_len(const struct range *r)
-{
- if (!r)
- return 0;
- return r->end - r->start;
-}
-
-/* List of all possible return codes of a diff invocation. */
-enum diff_rc {
- DIFF_RC_USE_DIFF_ALGO_FALLBACK = -1,
- DIFF_RC_OK = 0,
- DIFF_RC_ENOTSUP = ENOTSUP,
- DIFF_RC_ENOMEM = ENOMEM,
- DIFF_RC_EINVAL = EINVAL,
-};
-
-struct diff_atom {
- const uint8_t *at;
- size_t len;
-
- /* This hash is just a very cheap speed up for finding *mismatching* atoms. When hashes match, we still need to
- * compare entire atoms to find out whether they are indeed identical or not. */
- unsigned int hash;
-
- /* State for the Patience Diff algorithm */
- /* TODO: keep a separate array for the patience state */
- struct {
- bool unique_here;
- bool unique_in_both;
- struct diff_atom *pos_in_other;
- struct diff_atom *prev_stack;
- struct range identical_lines;
- } patience;
-};
-
-static inline bool diff_atom_same(const struct diff_atom *left, const struct diff_atom *right)
-{
- return left->hash == right->hash
- && left->len == right->len
- && memcmp(left->at, right->at, left->len) == 0;
-}
-
-/* For each file, there is a "root" struct diff_data referencing the entire file, which the atoms are parsed from. In
- * recursion of diff algorithm, there may be "child" struct diff_data only referencing a subsection of the file,
- * re-using the atoms parsing. For "root" structs, atoms_allocated will be nonzero, indicating that the array of atoms
- * is owned by that struct. For "child" structs, atoms_allocated == 0, to indicate that the struct is referencing a
- * subset of atoms. */
-struct diff_data {
- const uint8_t *data;
- size_t len;
- ARRAYLIST(struct diff_atom) atoms;
- struct diff_data *root;
-};
-
-void diff_data_free(struct diff_data *diff_data);
-
-/* The atom's index in the entire file. For atoms divided by lines of text, this yields the line number (starting with
- * 0). Also works for diff_data that reference only a subsection of a file, always reflecting the global position in
- * the file (and not the relative position within the subsection). */
-#define diff_atom_root_idx(DIFF_DATA, ATOM) \
- ((ATOM) ? (ATOM) - ((DIFF_DATA)->root->atoms.head) : (DIFF_DATA)->root->atoms.len)
-
-/* The atom's index within DIFF_DATA. For atoms divided by lines of text, this yields the line number (starting with
- * 0). */
-#define diff_atom_idx(DIFF_DATA, ATOM) \
- ((ATOM) ? (ATOM) - ((DIFF_DATA)->atoms.head) : (DIFF_DATA)->atoms.len)
-
-#define foreach_diff_atom(ATOM, FIRST_ATOM, COUNT) \
- for ((ATOM) = (FIRST_ATOM); \
- (ATOM) && ((ATOM) >= (FIRST_ATOM)) && ((ATOM) - (FIRST_ATOM) < (COUNT)); \
- (ATOM)++)
-
-#define diff_data_foreach_atom(ATOM, DIFF_DATA) \
- foreach_diff_atom(ATOM, (DIFF_DATA)->atoms.head, (DIFF_DATA)->atoms.len)
-
-#define diff_data_foreach_atom_from(FROM, ATOM, DIFF_DATA) \
- for ((ATOM) = (FROM); \
- (ATOM) && ((ATOM) >= (DIFF_DATA)->atoms.head) && ((ATOM) - (DIFF_DATA)->atoms.head < (DIFF_DATA)->atoms.len); \
- (ATOM)++)
-
-/* A diff chunk represents a set of atoms on the left and/or a set of atoms on the right.
- *
- * If solved == false:
- * The diff algorithm has divided the source file, and this is a chunk that the inner_algo should run on next.
- * The lines on the left should be diffed against the lines on the right.
- * (If there are no left lines or no right lines, it implies solved == true, because there is nothing to diff.)
- *
- * If solved == true:
- * If there are only left atoms, it is a chunk removing atoms from the left ("a minus chunk").
- * If there are only right atoms, it is a chunk adding atoms from the right ("a plus chunk").
- * If there are both left and right lines, it is a chunk of equal content on both sides,
- * and left_count == right_count:
- *
- * - foo }
- * - bar }-- diff_chunk{ left_start = &left.atoms.head[0], left_count = 3,
- * - baz } right_start = NULL, right_count = 0 }
- * moo }
- * goo }-- diff_chunk{ left_start = &left.atoms.head[3], left_count = 3,
- * zoo } right_start = &right.atoms.head[0], right_count = 3 }
- * +loo }
- * +roo }-- diff_chunk{ left_start = NULL, left_count = 0,
- * +too } right_start = &right.atoms.head[3], right_count = 3 }
- *
- */
-struct diff_chunk {
- bool solved;
- struct diff_atom *left_start;
- unsigned int left_count;
- struct diff_atom *right_start;
- unsigned int right_count;
-};
-
-typedef ARRAYLIST(struct diff_chunk) diff_chunk_arraylist_t;
-#define DIFF_RESULT_ALLOC_BLOCKSIZE 128
-
-struct diff_result {
- enum diff_rc rc;
- struct diff_data left;
- struct diff_data right;
- diff_chunk_arraylist_t chunks;
-};
-
-struct diff_state {
- /* The final result passed to the original diff caller. */
- struct diff_result *result;
-
- /* The root diff_data is in result->left,right, these are (possibly) subsections of the root data. */
- struct diff_data left;
- struct diff_data right;
-
- unsigned int recursion_depth_left;
-
- /* Remaining chunks from one diff algorithm pass, if any solved == false chunks came up. */
- diff_chunk_arraylist_t temp_result;
-};
-
-struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved,
- struct diff_atom *left_start, unsigned int left_count,
- struct diff_atom *right_start, unsigned int right_count);
-
-/* Signature of a utility function to divide both source files into diff atoms.
- * It is possible that a (future) algorithm requires both source files to decide on atom split points, hence this gets
- * both left and right to atomize at the same time.
- * An example is diff_atomize_text_by_line() in diff_atomize_text.c.
- *
- * func_data: context pointer (free to be used by implementation).
- * left: struct diff_data with left->data and left->len already set up, and left->atoms to be created.
- * right: struct diff_data with right->data and right->len already set up, and right->atoms to be created.
- */
-typedef enum diff_rc (*diff_atomize_func_t)(void *func_data, struct diff_data *left, struct diff_data *right);
-
-extern enum diff_rc diff_atomize_text_by_line(void *func_data, struct diff_data *left, struct diff_data *right);
-
-struct diff_algo_config;
-typedef enum diff_rc (*diff_algo_impl_t)(const struct diff_algo_config *algo_config, struct diff_state *state);
-
-/* Form a result with all left-side removed and all right-side added, i.e. no actual diff algorithm involved. */
-enum diff_rc diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state);
-
-/* Myers Diff tracing from the start all the way through to the end, requiring quadratic amounts of memory. This can
- * fail if the required space surpasses algo_config->permitted_state_size. */
-extern enum diff_rc diff_algo_myers(const struct diff_algo_config *algo_config, struct diff_state *state);
-
-/* Myers "Divide et Impera": tracing forwards from the start and backwards from the end to find a midpoint that divides
- * the problem into smaller chunks. Requires only linear amounts of memory. */
-extern enum diff_rc diff_algo_myers_divide(const struct diff_algo_config *algo_config, struct diff_state *state);
-
-/* Patience Diff algorithm, which divides a larger diff into smaller chunks. For very specific scenarios, it may lead to
- * a complete diff result by itself, but needs a fallback algo to solve chunks that don't have common-unique atoms. */
-extern enum diff_rc diff_algo_patience(const struct diff_algo_config *algo_config, struct diff_state *state);
-
-/* Diff algorithms to use, possibly nested. For example:
- *
- * struct diff_algo_config myers, patience, myers_divide;
- *
- * myers = (struct diff_algo_config){
- * .impl = diff_algo_myers,
- * .permitted_state_size = 32 * 1024 * 1024,
- * .fallback_algo = &patience, // when too large, do diff_algo_patience
- * };
- *
- * patience = (struct diff_algo_config){
- * .impl = diff_algo_patience,
- * .inner_algo = &patience, // After subdivision, do Patience again.
- * .fallback_algo = &myers_divide, // If subdivision failed, do Myers Divide et Impera.
- * };
- *
- * myers_divide = (struct diff_algo_config){
- * .impl = diff_algo_myers_divide,
- * .inner_algo = &myers, // When division succeeded, start from the top.
- * // (fallback_algo = NULL implies diff_algo_none).
- * };
- *
- * struct diff_config config = {
- * .algo = &myers,
- * ...
- * };
- * diff_main(&config, ...);
- */
-struct diff_algo_config {
- diff_algo_impl_t impl;
-
- /* Fail this algo if it would use more than this amount of memory, and instead use fallback_algo
- * (diff_algo_myers). permitted_state_size == 0 means no limitation. */
- size_t permitted_state_size;
-
- /* For algorithms that divide into smaller chunks, use this algorithm to solve the divided chunks. */
- const struct diff_algo_config *inner_algo;
-
- /* If the algorithm fails (e.g. diff_algo_myers_if_small needs too large state, or diff_algo_patience can't find
- * any common-unique atoms), then use this algorithm instead. */
- const struct diff_algo_config *fallback_algo;
-};
-
-struct diff_config {
- diff_atomize_func_t atomize_func;
- void *atomize_func_data;
-
- const struct diff_algo_config *algo;
-
- /* How deep to step into subdivisions of a source file, a paranoia / safety measure to guard against infinite
- * loops through diff algorithms. When the maximum recursion is reached, employ diff_algo_none (i.e. remove all
- * left atoms and add all right atoms). */
- unsigned int max_recursion_depth;
-};
-
-struct diff_result *diff_main(const struct diff_config *config,
- const uint8_t *left_data, size_t left_len,
- const uint8_t *right_data, size_t right_len);
-void diff_result_free(struct diff_result *result);
blob - 5d07e74d868098aab6819c41c9499f0f1b30040c (mode 644)
blob + /dev/null
--- include/diff/diff_output.h
+++ /dev/null
-/* Diff output generators and invocation shims. */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <diff/diff_main.h>
-
-struct diff_input_info {
- const char *arbitrary_info;
- const char *left_path;
- const char *right_path;
-};
-
-enum diff_rc diff_output_plain(FILE *dest, const struct diff_input_info *info,
- const struct diff_result *result);
-enum diff_rc diff_plain(FILE *dest, const struct diff_config *diff_config,
- const struct diff_input_info *info,
- const char *left, int left_len, const char *right, int right_len);
-
-enum diff_rc diff_output_unidiff(FILE *dest, const struct diff_input_info *info,
- const struct diff_result *result, unsigned int context_lines);
-enum diff_rc diff_unidiff(FILE *dest, const struct diff_config *diff_config,
- const struct diff_input_info *info,
- const char *left, int left_len, const char *right, int right_len,
- unsigned int context_lines);
-
-enum diff_rc diff_output_info(FILE *dest, const struct diff_input_info *info);
-void diff_output_lines(FILE *dest, const char *prefix, struct diff_atom *start_atom, unsigned int count);
blob - /dev/null
blob + ec47f65416456d6fa54228e53e8ccea1fd888fba (mode 644)
--- /dev/null
+++ debug.h
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#pragma once
+
+#define DEBUG 0
+
+#if DEBUG
+#include <stdio.h>
+#define print(args...) fprintf(stderr, ##args)
+#define debug print
+#define debug_dump dump
+#define debug_dump_atom dump_atom
+#define debug_dump_atoms dump_atoms
+
+static inline void dump_atom(const struct diff_data *left, const struct diff_data *right, const struct diff_atom *atom)
+{
+ if (!atom) {
+ print("NULL atom\n");
+ return;
+ }
+ if (left)
+ print(" %3ld", diff_atom_root_idx(left, atom));
+ if (right && atom->patience.pos_in_other)
+ print(" %3ld", diff_atom_root_idx(right, atom->patience.pos_in_other));
+
+ print(" %s%s '", atom->patience.unique_here ? "u" : " ", atom->patience.unique_in_both ? "c" : " ");
+ const char *s;
+ for (s = atom->at; s < (const char*)(atom->at + atom->len); s++) {
+ if (*s == '\r')
+ print("\\r");
+ else if (*s == '\n')
+ print("\\n");
+ else if ((*s < 32 || *s >= 127) && (*s != '\t'))
+ print("\\x%02x", *s);
+ else
+ print("%c", *s);
+ }
+ print("'\n");
+}
+
+static inline void dump_atoms(const struct diff_data *d, struct diff_atom *atom, unsigned int count)
+{
+ if (count > 42) {
+ dump_atoms(d, atom, 20);
+ print("[%u lines skipped]\n", count - 20 - 20);
+ dump_atoms(d, atom + count - 20, 20);
+ return;
+ } else {
+ struct diff_atom *i;
+ foreach_diff_atom(i, atom, count) {
+ dump_atom(d, NULL, i);
+ }
+ }
+}
+
+static inline void dump(struct diff_data *d)
+{
+ dump_atoms(d, d->atoms.head, d->atoms.len);
+}
+
+static inline void dump_myers_graph(const struct diff_data *l, const struct diff_data *r,
+ int *kd)
+{
+ int x;
+ int y;
+ print(" ");
+ for (x = 0; x <= l->atoms.len; x++)
+ print("%2d", x);
+ print("\n");
+
+ for (y = 0; y <= r->atoms.len; y++) {
+ print("%2d ", y);
+ for (x = 0; x <= l->atoms.len; x++) {
+
+ /* print d advancements from kd, if any. */
+ int d = -1;
+ if (kd) {
+ int max = l->atoms.len + r->atoms.len;
+ size_t kd_len = max + 1 + max;
+ int *kd_pos = kd;
+ int di;
+#define xk_to_y(X, K) ((X) - (K))
+ for (di = 0; di < max; di++) {
+ int ki;
+ for (ki = di; ki >= -di; ki -= 2) {
+ if (x == kd_pos[ki]
+ && y == xk_to_y(x, ki)) {
+ d = di;
+ break;
+ }
+ }
+ if (d >= 0)
+ break;
+ kd_pos += kd_len;
+ }
+ }
+ if (d >= 0)
+ print("%d", d);
+ else
+ print("o");
+ if (x < l->atoms.len && d < 10)
+ print("-");
+ }
+ print("\n");
+ if (y == r->atoms.len)
+ break;
+
+ print(" ");
+ for (x = 0; x < l->atoms.len; x++) {
+ if (diff_atom_same(&l->atoms.head[x], &r->atoms.head[y]))
+ print("|\\");
+ else
+ print("| ");
+ }
+ print("|\n");
+ }
+}
+
+static inline void debug_dump_myers_graph(const struct diff_data *l, const struct diff_data *r,
+ int *kd)
+{
+ if (l->atoms.len > 99 || r->atoms.len > 99)
+ return;
+ dump_myers_graph(l, r, kd);
+}
+
+#else
+#define debug(args...)
+#define debug_dump(args...)
+#define debug_dump_atom(args...)
+#define debug_dump_atoms(args...)
+#define debug_dump_myers_graph(args...)
+#endif
blob - ec47f65416456d6fa54228e53e8ccea1fd888fba (mode 644)
blob + /dev/null
--- lib/debug.h
+++ /dev/null
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#pragma once
-
-#define DEBUG 0
-
-#if DEBUG
-#include <stdio.h>
-#define print(args...) fprintf(stderr, ##args)
-#define debug print
-#define debug_dump dump
-#define debug_dump_atom dump_atom
-#define debug_dump_atoms dump_atoms
-
-static inline void dump_atom(const struct diff_data *left, const struct diff_data *right, const struct diff_atom *atom)
-{
- if (!atom) {
- print("NULL atom\n");
- return;
- }
- if (left)
- print(" %3ld", diff_atom_root_idx(left, atom));
- if (right && atom->patience.pos_in_other)
- print(" %3ld", diff_atom_root_idx(right, atom->patience.pos_in_other));
-
- print(" %s%s '", atom->patience.unique_here ? "u" : " ", atom->patience.unique_in_both ? "c" : " ");
- const char *s;
- for (s = atom->at; s < (const char*)(atom->at + atom->len); s++) {
- if (*s == '\r')
- print("\\r");
- else if (*s == '\n')
- print("\\n");
- else if ((*s < 32 || *s >= 127) && (*s != '\t'))
- print("\\x%02x", *s);
- else
- print("%c", *s);
- }
- print("'\n");
-}
-
-static inline void dump_atoms(const struct diff_data *d, struct diff_atom *atom, unsigned int count)
-{
- if (count > 42) {
- dump_atoms(d, atom, 20);
- print("[%u lines skipped]\n", count - 20 - 20);
- dump_atoms(d, atom + count - 20, 20);
- return;
- } else {
- struct diff_atom *i;
- foreach_diff_atom(i, atom, count) {
- dump_atom(d, NULL, i);
- }
- }
-}
-
-static inline void dump(struct diff_data *d)
-{
- dump_atoms(d, d->atoms.head, d->atoms.len);
-}
-
-static inline void dump_myers_graph(const struct diff_data *l, const struct diff_data *r,
- int *kd)
-{
- int x;
- int y;
- print(" ");
- for (x = 0; x <= l->atoms.len; x++)
- print("%2d", x);
- print("\n");
-
- for (y = 0; y <= r->atoms.len; y++) {
- print("%2d ", y);
- for (x = 0; x <= l->atoms.len; x++) {
-
- /* print d advancements from kd, if any. */
- int d = -1;
- if (kd) {
- int max = l->atoms.len + r->atoms.len;
- size_t kd_len = max + 1 + max;
- int *kd_pos = kd;
- int di;
-#define xk_to_y(X, K) ((X) - (K))
- for (di = 0; di < max; di++) {
- int ki;
- for (ki = di; ki >= -di; ki -= 2) {
- if (x == kd_pos[ki]
- && y == xk_to_y(x, ki)) {
- d = di;
- break;
- }
- }
- if (d >= 0)
- break;
- kd_pos += kd_len;
- }
- }
- if (d >= 0)
- print("%d", d);
- else
- print("o");
- if (x < l->atoms.len && d < 10)
- print("-");
- }
- print("\n");
- if (y == r->atoms.len)
- break;
-
- print(" ");
- for (x = 0; x < l->atoms.len; x++) {
- if (diff_atom_same(&l->atoms.head[x], &r->atoms.head[y]))
- print("|\\");
- else
- print("| ");
- }
- print("|\n");
- }
-}
-
-static inline void debug_dump_myers_graph(const struct diff_data *l, const struct diff_data *r,
- int *kd)
-{
- if (l->atoms.len > 99 || r->atoms.len > 99)
- return;
- dump_myers_graph(l, r, kd);
-}
-
-#else
-#define debug(args...)
-#define debug_dump(args...)
-#define debug_dump_atom(args...)
-#define debug_dump_atoms(args...)
-#define debug_dump_myers_graph(args...)
-#endif
blob - 964c1353da1b055ae761ae6961c45ea19e16d032 (mode 644)
blob + /dev/null
--- lib/diff_atomize_text.c
+++ /dev/null
-/* Split source by line breaks, and calculate a simplistic checksum. */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <diff/diff_main.h>
-
-static int diff_data_atomize_text_lines(struct diff_data *d)
-{
- const uint8_t *pos = d->data;
- const uint8_t *end = pos + d->len;
- enum diff_rc rc;
-
- unsigned int array_size_estimate = d->len / 50;
- unsigned int pow2 = 1;
- while (array_size_estimate >>= 1)
- pow2++;
-
- ARRAYLIST_INIT(d->atoms, 1 << pow2);
-
- while (pos < end) {
- const uint8_t *line_end = pos;
- unsigned int hash = 0;
-
- while (line_end < end && *line_end != '\r' && *line_end != '\n') {
- hash = hash * 23 + *line_end;
- line_end++;
- }
-
- /* When not at the end of data, the line ending char ('\r' or '\n') must follow */
- if (line_end < end)
- line_end++;
- /* If that was an '\r', also pull in any following '\n' */
- if (line_end[0] == '\r' && line_end < end && line_end[1] == '\n')
- line_end++;
-
- /* Record the found line as diff atom */
- struct diff_atom *atom;
- ARRAYLIST_ADD(atom, d->atoms);
- if (!atom)
- return DIFF_RC_ENOMEM;
-
- *atom = (struct diff_atom){
- .at = pos,
- .len = line_end - pos,
- .hash = hash,
- };
-
- /* Starting point for next line: */
- pos = line_end;
- }
-
- return DIFF_RC_OK;
-}
-
-enum diff_rc diff_atomize_text_by_line(void *func_data, struct diff_data *left, struct diff_data *right)
-{
- enum diff_rc rc;
- rc = diff_data_atomize_text_lines(left);
- if (rc != DIFF_RC_OK)
- return rc;
- return diff_data_atomize_text_lines(right);
-}
blob - 285bd40d5941fe84c5d9a9bd437a4a9f981ead13 (mode 644)
blob + /dev/null
--- lib/diff_main.c
+++ /dev/null
-/* Generic infrastructure to implement various diff algorithms (implementation). */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-
-#include <sys/queue.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <stdio.h>
-#include <string.h>
-#include <limits.h>
-
-#include <assert.h>
-
-#include <diff/diff_main.h>
-
-#include "debug.h"
-
-/* Even if a left or right side is empty, diff output may need to know the position in that file.
- * So left_start or right_start must never be NULL -- pass left_count or right_count as zero to indicate staying at that
- * position without consuming any lines. */
-struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved,
- struct diff_atom *left_start, unsigned int left_count,
- struct diff_atom *right_start, unsigned int right_count)
-{
- struct diff_chunk *chunk;
- diff_chunk_arraylist_t *result;
-
- if (solved && !state->temp_result.len)
- result = &state->result->chunks;
- else
- result = &state->temp_result;
-
- ARRAYLIST_ADD(chunk, *result);
- if (!chunk)
- return NULL;
- *chunk = (struct diff_chunk){
- .solved = solved,
- .left_start = left_start,
- .left_count = left_count,
- .right_start = right_start,
- .right_count = right_count,
- };
-
- debug("Add %s chunk:\n", chunk->solved ? "solved" : "UNSOLVED");
- debug("L\n");
- debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count);
- debug("R\n");
- debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count);
- return chunk;
-}
-
-void diff_data_init_root(struct diff_data *d, const uint8_t *data, size_t len)
-{
- *d = (struct diff_data){
- .data = data,
- .len = len,
- .root = d,
- };
-}
-
-void diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
- struct diff_atom *from_atom, unsigned int atoms_count)
-{
- struct diff_atom *last_atom = from_atom + atoms_count - 1;
- *d = (struct diff_data){
- .data = from_atom->at,
- .len = (last_atom->at + last_atom->len) - from_atom->at,
- .root = parent->root,
- .atoms.head = from_atom,
- .atoms.len = atoms_count,
- };
-
- debug("subsection:\n");
- debug_dump(d);
-}
-
-void diff_data_free(struct diff_data *diff_data)
-{
- if (!diff_data)
- return;
- if (diff_data->atoms.allocated)
- ARRAYLIST_FREE(diff_data->atoms);
-}
-
-enum diff_rc diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state)
-{
- debug("\n** %s\n", __func__);
- debug("left:\n");
- debug_dump(&state->left);
- debug("right:\n");
- debug_dump(&state->right);
- debug_dump_myers_graph(&state->left, &state->right, NULL);
-
- /* Add a chunk of equal lines, if any */
- unsigned int equal_atoms = 0;
- while (equal_atoms < state->left.atoms.len && equal_atoms < state->right.atoms.len
- && diff_atom_same(&state->left.atoms.head[equal_atoms], &state->right.atoms.head[equal_atoms]))
- equal_atoms++;
- if (equal_atoms) {
- if (!diff_state_add_chunk(state, true,
- &state->left.atoms.head[0], equal_atoms,
- &state->right.atoms.head[0], equal_atoms))
- return DIFF_RC_ENOMEM;
- }
-
- /* Add a "minus" chunk with all lines from the left. */
- if (equal_atoms < state->left.atoms.len) {
- if (!diff_state_add_chunk(state, true,
- &state->left.atoms.head[equal_atoms],
- state->left.atoms.len - equal_atoms,
- NULL, 0))
- return DIFF_RC_ENOMEM;
- }
-
- /* Add a "plus" chunk with all lines from the right. */
- if (equal_atoms < state->right.atoms.len) {
- if (!diff_state_add_chunk(state, true,
- NULL, 0,
- &state->right.atoms.head[equal_atoms],
- state->right.atoms.len - equal_atoms))
- return DIFF_RC_ENOMEM;
- }
- return DIFF_RC_OK;
-}
-
-enum diff_rc diff_run_algo(const struct diff_algo_config *algo_config, struct diff_state *state)
-{
- enum diff_rc rc;
- ARRAYLIST_FREE(state->temp_result);
-
- if (!algo_config || !algo_config->impl || !state->recursion_depth_left) {
- debug("MAX RECURSION REACHED, just dumping diff chunks\n");
- return diff_algo_none(algo_config, state);
- }
-
- ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE);
- rc = algo_config->impl(algo_config, state);
- switch (rc) {
- case DIFF_RC_USE_DIFF_ALGO_FALLBACK:
- debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n", algo_config->fallback_algo);
- rc = diff_run_algo(algo_config->fallback_algo, state);
- goto return_rc;
-
- case DIFF_RC_OK:
- /* continue below */
- break;
-
- default:
- /* some error happened */
- goto return_rc;
- }
-
- /* Pick up any diff chunks that are still unsolved and feed to inner_algo.
- * inner_algo will solve unsolved chunks and append to result,
- * and subsequent solved chunks on this level are then appended to result afterwards. */
- int i;
- for (i = 0; i < state->temp_result.len; i++) {
- struct diff_chunk *c = &state->temp_result.head[i];
- if (c->solved) {
- struct diff_chunk *final_c;
- ARRAYLIST_ADD(final_c, state->result->chunks);
- if (!final_c) {
- rc = DIFF_RC_ENOMEM;
- goto return_rc;
- }
- *final_c = *c;
- continue;
- }
-
- /* c is an unsolved chunk, feed to inner_algo */
- struct diff_state inner_state = {
- .result = state->result,
- .recursion_depth_left = state->recursion_depth_left - 1,
- };
- diff_data_init_subsection(&inner_state.left, &state->left, c->left_start, c->left_count);
- diff_data_init_subsection(&inner_state.right, &state->right, c->right_start, c->right_count);
-
- rc = diff_run_algo(algo_config->inner_algo, &inner_state);
-
- if (rc != DIFF_RC_OK)
- goto return_rc;
- }
-
- rc = DIFF_RC_OK;
-return_rc:
- ARRAYLIST_FREE(state->temp_result);
- return rc;
-}
-
-struct diff_result *diff_main(const struct diff_config *config,
- const uint8_t *left_data, size_t left_len,
- const uint8_t *right_data, size_t right_len)
-{
- struct diff_result *result = malloc(sizeof(struct diff_result));
- if (!result)
- return NULL;
-
- *result = (struct diff_result){};
- diff_data_init_root(&result->left, left_data, left_len);
- diff_data_init_root(&result->right, right_data, right_len);
-
- if (!config->atomize_func) {
- result->rc = DIFF_RC_EINVAL;
- return result;
- }
-
- result->rc = config->atomize_func(config->atomize_func_data, &result->left, &result->right);
- if (result->rc != DIFF_RC_OK)
- return result;
-
- struct diff_state state = {
- .result = result,
- .recursion_depth_left = config->max_recursion_depth ? : 1024,
- };
- diff_data_init_subsection(&state.left, &result->left, result->left.atoms.head, result->left.atoms.len);
- diff_data_init_subsection(&state.right, &result->right, result->right.atoms.head, result->right.atoms.len);
-
- result->rc = diff_run_algo(config->algo, &state);
-
- return result;
-}
-
-void diff_result_free(struct diff_result *result)
-{
- if (!result)
- return;
- diff_data_free(&result->left);
- diff_data_free(&result->right);
- ARRAYLIST_FREE(result->chunks);
- free(result);
-}
blob - cba927a4a5813ed3a1afa1247227a9637271e176 (mode 644)
blob + /dev/null
--- lib/diff_myers.c
+++ /dev/null
-/* Myers diff algorithm implementation, invented by Eugene W. Myers [1].
- * Implementations of both the Myers Divide Et Impera (using linear space)
- * and the canonical Myers algorithm (using quadratic space). */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <diff/diff_main.h>
-
-#include "debug.h"
-
-/* Myers' diff algorithm [1] is nicely explained in [2].
- * [1] http://www.xmailserver.org/diff2.pdf
- * [2] https://blog.jcoglan.com/2017/02/12/the-myers-diff-algorithm-part-1/ ff.
- *
- * Myers approaches finding the smallest diff as a graph problem.
- * The crux is that the original algorithm requires quadratic amount of memory:
- * both sides' lengths added, and that squared. So if we're diffing lines of text, two files with 1000 lines each would
- * blow up to a matrix of about 2000 * 2000 ints of state, about 16 Mb of RAM to figure out 2 kb of text.
- * The solution is using Myers' "divide and conquer" extension algorithm, which does the original traversal from both
- * ends of the files to reach a middle where these "snakes" touch, hence does not need to backtrace the traversal, and
- * so gets away with only keeping a single column of that huge state matrix in memory.
- *
- * Todo: the divide and conquer requires linear *space*, not necessarily linear *time*. It recurses, apparently doing
- * multiple Myers passes, and also it apparently favors fragmented diffs in cases where chunks of text were moved to a
- * different place. Up to a given count of diff atoms (text lines), it might be desirable to accept the quadratic memory
- * usage, get nicer diffs and less re-iteration of the same data?
- */
-
-struct diff_box {
- unsigned int left_start;
- unsigned int left_end;
- unsigned int right_start;
- unsigned int right_end;
-};
-
-#define diff_box_empty(DIFF_SNAKE) ((DIFF_SNAKE)->left_end == 0)
-
-
-/* If the two contents of a file are A B C D E and X B C Y,
- * the Myers diff graph looks like:
- *
- * k0 k1
- * \ \
- * k-1 0 1 2 3 4 5
- * \ A B C D E
- * 0 o-o-o-o-o-o
- * X | | | | | |
- * 1 o-o-o-o-o-o
- * B | |\| | | |
- * 2 o-o-o-o-o-o
- * C | | |\| | |
- * 3 o-o-o-o-o-o
- * Y | | | | | |\
- * 4 o-o-o-o-o-o c1
- * \ \
- * c-1 c0
- *
- * Moving right means delete an atom from the left-hand-side,
- * Moving down means add an atom from the right-hand-side.
- * Diagonals indicate identical atoms on both sides, the challenge is to use as many diagonals as possible.
- *
- * The original Myers algorithm walks all the way from the top left to the bottom right, remembers all steps, and then
- * backtraces to find the shortest path. However, that requires keeping the entire graph in memory, which needs
- * quadratic space.
- *
- * Myers adds a variant that uses linear space -- note, not linear time, only linear space: walk forward and backward,
- * find a meeting point in the middle, and recurse on the two separate sections. This is called "divide and conquer".
- *
- * d: the step number, starting with 0, a.k.a. the distance from the starting point.
- * k: relative index in the state array for the forward scan, indicating on which diagonal through the diff graph we
- * currently are.
- * c: relative index in the state array for the backward scan, indicating the diagonal number from the bottom up.
- *
- * The "divide and conquer" traversal through the Myers graph looks like this:
- *
- * | d= 0 1 2 3 2 1 0
- * ----+--------------------------------------------
- * k= | c=
- * 4 | 3
- * |
- * 3 | 3,0 5,2 2
- * | / \
- * 2 | 2,0 5,3 1
- * | / \
- * 1 | 1,0 4,3 >= 4,3 5,4<-- 0
- * | / / \ /
- * 0 | -->0,0 3,3 4,4 -1
- * | \ / /
- * -1 | 0,1 1,2 3,4 -2
- * | \ /
- * -2 | 0,2 -3
- * | \
- * | 0,3
- * | forward-> <-backward
- *
- * x,y pairs here are the coordinates in the Myers graph:
- * x = atom index in left-side source, y = atom index in the right-side source.
- *
- * Only one forward column and one backward column are kept in mem, each need at most left.len + 1 + right.len items.
- * Note that each d step occupies either the even or the odd items of a column: if e.g. the previous column is in the
- * odd items, the next column is formed in the even items, without overwriting the previous column's results.
- *
- * Also note that from the diagonal index k and the x coordinate, the y coordinate can be derived:
- * y = x - k
- * Hence the state array only needs to keep the x coordinate, i.e. the position in the left-hand file, and the y
- * coordinate, i.e. position in the right-hand file, is derived from the index in the state array.
- *
- * The two traces meet at 4,3, the first step (here found in the forward traversal) where a forward position is on or
- * past a backward traced position on the same diagonal.
- *
- * This divides the problem space into:
- *
- * 0 1 2 3 4 5
- * A B C D E
- * 0 o-o-o-o-o
- * X | | | | |
- * 1 o-o-o-o-o
- * B | |\| | |
- * 2 o-o-o-o-o
- * C | | |\| |
- * 3 o-o-o-o-*-o *: forward and backward meet here
- * Y | |
- * 4 o-o
- *
- * Doing the same on each section lead to:
- *
- * 0 1 2 3 4 5
- * A B C D E
- * 0 o-o
- * X | |
- * 1 o-b b: backward d=1 first reaches here (sliding up the snake)
- * B \ f: then forward d=2 reaches here (sliding down the snake)
- * 2 o As result, the box from b to f is found to be identical;
- * C \ leaving a top box from 0,0 to 1,1 and a bottom trivial tail 3,3 to 4,3.
- * 3 f-o
- *
- * 3 o-*
- * Y |
- * 4 o *: forward and backward meet here
- *
- * and solving the last top left box gives:
- *
- * 0 1 2 3 4 5
- * A B C D E -A
- * 0 o-o +X
- * X | B
- * 1 o C
- * B \ -D
- * 2 o -E
- * C \ +Y
- * 3 o-o-o
- * Y |
- * 4 o
- *
- */
-
-#define xk_to_y(X, K) ((X) - (K))
-#define xc_to_y(X, C, DELTA) ((X) - (C) + (DELTA))
-#define k_to_c(K, DELTA) ((K) + (DELTA))
-#define c_to_k(C, DELTA) ((C) - (DELTA))
-
-/* Do one forwards step in the "divide and conquer" graph traversal.
- * left: the left side to diff.
- * right: the right side to diff against.
- * kd_forward: the traversal state for forwards traversal, modified by this function.
- * This is carried over between invocations with increasing d.
- * kd_forward points at the center of the state array, allowing negative indexes.
- * kd_backward: the traversal state for backwards traversal, to find a meeting point.
- * Since forwards is done first, kd_backward will be valid for d - 1, not d.
- * kd_backward points at the center of the state array, allowing negative indexes.
- * d: Step or distance counter, indicating for what value of d the kd_forward should be populated.
- * For d == 0, kd_forward[0] is initialized, i.e. the first invocation should be for d == 0.
- * meeting_snake: resulting meeting point, if any.
- */
-static void diff_divide_myers_forward(struct diff_data *left, struct diff_data *right,
- int *kd_forward, int *kd_backward, int d,
- struct diff_box *meeting_snake)
-{
- int delta = (int)right->atoms.len - (int)left->atoms.len;
- int prev_x;
- int prev_y;
- int k;
- int x;
-
- debug("-- %s d=%d\n", __func__, d);
- debug_dump_myers_graph(left, right, NULL);
-
- for (k = d; k >= -d; k -= 2) {
- if (k < -(int)right->atoms.len || k > (int)left->atoms.len) {
- /* This diagonal is completely outside of the Myers graph, don't calculate it. */
- if (k < -(int)right->atoms.len)
- debug(" %d k < -(int)right->atoms.len %d\n", k, -(int)right->atoms.len);
- else
- debug(" %d k > left->atoms.len %d\n", k, left->atoms.len);
- if (k < 0) {
- /* We are traversing negatively, and already below the entire graph, nothing will come
- * of this. */
- debug(" break");
- break;
- }
- debug(" continue");
- continue;
- }
- debug("- k = %d\n", k);
- if (d == 0) {
- /* This is the initializing step. There is no prev_k yet, get the initial x from the top left of
- * the Myers graph. */
- x = 0;
- }
- /* Favoring "-" lines first means favoring moving rightwards in the Myers graph.
- * For this, all k should derive from k - 1, only the bottom most k derive from k + 1:
- *
- * | d= 0 1 2
- * ----+----------------
- * k= |
- * 2 | 2,0 <-- from prev_k = 2 - 1 = 1
- * | /
- * 1 | 1,0
- * | /
- * 0 | -->0,0 3,3
- * | \\ /
- * -1 | 0,1 <-- bottom most for d=1 from prev_k = -1 + 1 = 0
- * | \\
- * -2 | 0,2 <-- bottom most for d=2 from prev_k = -2 + 1 = -1
- *
- * Except when a k + 1 from a previous run already means a further advancement in the graph.
- * If k == d, there is no k + 1 and k - 1 is the only option.
- * If k < d, use k + 1 in case that yields a larger x. Also use k + 1 if k - 1 is outside the graph.
- */
- else if (k > -d && (k == d
- || (k - 1 >= -(int)right->atoms.len
- && kd_forward[k - 1] >= kd_forward[k + 1]))) {
- /* Advance from k - 1.
- * From position prev_k, step to the right in the Myers graph: x += 1.
- */
- int prev_k = k - 1;
- prev_x = kd_forward[prev_k];
- prev_y = xk_to_y(prev_x, prev_k);
- x = prev_x + 1;
- } else {
- /* The bottom most one.
- * From position prev_k, step to the bottom in the Myers graph: y += 1.
- * Incrementing y is achieved by decrementing k while keeping the same x.
- * (since we're deriving y from y = x - k).
- */
- int prev_k = k + 1;
- prev_x = kd_forward[prev_k];
- prev_y = xk_to_y(prev_x, prev_k);
- x = prev_x;
- }
-
- /* Slide down any snake that we might find here. */
- while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len
- && diff_atom_same(&left->atoms.head[x], &right->atoms.head[xk_to_y(x, k)]))
- x++;
- kd_forward[k] = x;
-
- if (DEBUG) {
- int fi;
- for (fi = d; fi >= k; fi--) {
- debug("kd_forward[%d] = (%d, %d)\n", fi, kd_forward[fi], kd_forward[fi] - fi);
- /*
- if (kd_forward[fi] >= 0 && kd_forward[fi] < left->atoms.len)
- debug_dump_atom(left, right, &left->atoms.head[kd_forward[fi]]);
- else
- debug("\n");
- if (kd_forward[fi]-fi >= 0 && kd_forward[fi]-fi < right->atoms.len)
- debug_dump_atom(right, left, &right->atoms.head[kd_forward[fi]-fi]);
- else
- debug("\n");
- */
- }
- }
-
- if (x < 0 || x > left->atoms.len
- || xk_to_y(x, k) < 0 || xk_to_y(x, k) > right->atoms.len)
- continue;
-
- /* Figured out a new forwards traversal, see if this has gone onto or even past a preceding backwards
- * traversal.
- *
- * If the delta in length is odd, then d and backwards_d hit the same state indexes:
- * | d= 0 1 2 1 0
- * ----+---------------- ----------------
- * k= | c=
- * 4 | 3
- * |
- * 3 | 2
- * | same
- * 2 | 2,0====5,3 1
- * | / \
- * 1 | 1,0 5,4<-- 0
- * | / /
- * 0 | -->0,0 3,3====4,4 -1
- * | \ /
- * -1 | 0,1 -2
- * | \
- * -2 | 0,2 -3
- * |
- *
- * If the delta is even, they end up off-by-one, i.e. on different diagonals:
- *
- * | d= 0 1 2 1 0
- * ----+---------------- ----------------
- * | c=
- * 3 | 3
- * |
- * 2 | 2,0 off 2
- * | / \\
- * 1 | 1,0 4,3 1
- * | / // \
- * 0 | -->0,0 3,3 4,4<-- 0
- * | \ / /
- * -1 | 0,1 3,4 -1
- * | \ //
- * -2 | 0,2 -2
- * |
- *
- * So in the forward path, we can only match up diagonals when the delta is odd.
- */
- /* Forwards is done first, so the backwards one was still at d - 1. Can't do this for d == 0. */
- int backwards_d = d - 1;
- if ((delta & 1) && (backwards_d >= 0)) {
- debug("backwards_d = %d\n", backwards_d);
-
- /* If both sides have the same length, forward and backward start on the same diagonal, meaning the
- * backwards state index c == k.
- * As soon as the lengths are not the same, the backwards traversal starts on a different diagonal, and
- * c = k shifted by the difference in length.
- */
- int c = k_to_c(k, delta);
-
- /* When the file sizes are very different, the traversal trees start on far distant diagonals.
- * They don't necessarily meet straight on. See whether this forward value is on a diagonal that
- * is also valid in kd_backward[], and match them if so. */
- if (c >= -backwards_d && c <= backwards_d) {
- /* Current k is on a diagonal that exists in kd_backward[]. If the two x positions have
- * met or passed (forward walked onto or past backward), then we've found a midpoint / a
- * mid-box.
- *
- * But we need to avoid matching a situation like this:
- * 0 1
- * x y
- * 0 o-o-o
- * x |\| |
- * 1 o-o-o
- * y | |\|
- * 2 (B)o-o <--(B) backwards traversal reached here
- * a | | |
- * 3 o-o-o<-- prev_x, prev_y
- * b | | |
- * 4 o-o(F) <--(F) forwards traversal reached here
- * x |\| | Now both are on the same diagonal and look like they passed,
- * 5 o-o-o but actually they have sneaked past each other and have not met.
- * y | |\|
- * 6 o-o-o
- *
- * The solution is to notice that prev_x,prev_y were also already past (B).
- */
- int backward_x = kd_backward[c];
- int backward_y = xc_to_y(backward_x, c, delta);
- debug(" prev_x,y = (%d,%d) c%d:backward_x,y = (%d,%d) k%d:x,y = (%d,%d)\n",
- prev_x, prev_y, c, backward_x, backward_y, k, x, xk_to_y(x, k));
- if (prev_x <= backward_x && prev_y <= backward_y
- && x >= backward_x) {
- *meeting_snake = (struct diff_box){
- .left_start = backward_x,
- .left_end = x,
- .right_start = xc_to_y(backward_x, c, delta),
- .right_end = xk_to_y(x, k),
- };
- debug("HIT x=(%u,%u) - y=(%u,%u)\n",
- meeting_snake->left_start,
- meeting_snake->right_start,
- meeting_snake->left_end,
- meeting_snake->right_end);
- return;
- }
- }
- }
- }
-}
-
-/* Do one backwards step in the "divide and conquer" graph traversal.
- * left: the left side to diff.
- * right: the right side to diff against.
- * kd_forward: the traversal state for forwards traversal, to find a meeting point.
- * Since forwards is done first, after this, both kd_forward and kd_backward will be valid for d.
- * kd_forward points at the center of the state array, allowing negative indexes.
- * kd_backward: the traversal state for backwards traversal, to find a meeting point.
- * This is carried over between invocations with increasing d.
- * kd_backward points at the center of the state array, allowing negative indexes.
- * d: Step or distance counter, indicating for what value of d the kd_backward should be populated.
- * Before the first invocation, kd_backward[0] shall point at the bottom right of the Myers graph
- * (left.len, right.len).
- * The first invocation will be for d == 1.
- * meeting_snake: resulting meeting point, if any.
- */
-static void diff_divide_myers_backward(struct diff_data *left, struct diff_data *right,
- int *kd_forward, int *kd_backward, int d,
- struct diff_box *meeting_snake)
-{
- int delta = (int)right->atoms.len - (int)left->atoms.len;
- int prev_x;
- int prev_y;
- int c;
- int x;
-
- debug("-- %s d=%d\n", __func__, d);
- debug_dump_myers_graph(left, right, NULL);
-
- for (c = d; c >= -d; c -= 2) {
- if (c < -(int)left->atoms.len || c > (int)right->atoms.len) {
- /* This diagonal is completely outside of the Myers graph, don't calculate it. */
- if (c < -(int)left->atoms.len)
- debug(" %d c < -(int)left->atoms.len %d\n", c, -(int)left->atoms.len);
- else
- debug(" %d c > right->atoms.len %d\n", c, right->atoms.len);
- if (c < 0) {
- /* We are traversing negatively, and already below the entire graph, nothing will come
- * of this. */
- debug(" break");
- break;
- }
- debug(" continue");
- continue;
- }
- debug("- c = %d\n", c);
- if (d == 0) {
- /* This is the initializing step. There is no prev_c yet, get the initial x from the bottom
- * right of the Myers graph. */
- x = left->atoms.len;
- }
- /* Favoring "-" lines first means favoring moving rightwards in the Myers graph.
- * For this, all c should derive from c - 1, only the bottom most c derive from c + 1:
- *
- * 2 1 0
- * ---------------------------------------------------
- * c=
- * 3
- *
- * from prev_c = c - 1 --> 5,2 2
- * \
- * 5,3 1
- * \
- * 4,3 5,4<-- 0
- * \ /
- * bottom most for d=1 from c + 1 --> 4,4 -1
- * /
- * bottom most for d=2 --> 3,4 -2
- *
- * Except when a c + 1 from a previous run already means a further advancement in the graph.
- * If c == d, there is no c + 1 and c - 1 is the only option.
- * If c < d, use c + 1 in case that yields a larger x. Also use c + 1 if c - 1 is outside the graph.
- */
- else if (c > -d && (c == d
- || (c - 1 >= -(int)right->atoms.len
- && kd_backward[c - 1] <= kd_backward[c + 1]))) {
- /* A top one.
- * From position prev_c, step upwards in the Myers graph: y -= 1.
- * Decrementing y is achieved by incrementing c while keeping the same x.
- * (since we're deriving y from y = x - c + delta).
- */
- int prev_c = c - 1;
- prev_x = kd_backward[prev_c];
- prev_y = xc_to_y(prev_x, prev_c, delta);
- x = prev_x;
- } else {
- /* The bottom most one.
- * From position prev_c, step to the left in the Myers graph: x -= 1.
- */
- int prev_c = c + 1;
- prev_x = kd_backward[prev_c];
- prev_y = xc_to_y(prev_x, prev_c, delta);
- x = prev_x - 1;
- }
-
- /* Slide up any snake that we might find here. */
- debug("c=%d x-1=%d Yb-1=%d-1=%d\n", c, x-1, xc_to_y(x, c, delta), xc_to_y(x, c, delta)-1);
- if (x > 0) {
- debug(" l="); debug_dump_atom(left, right, &left->atoms.head[x-1]);
- }
- if (xc_to_y(x, c, delta) > 0) {
- debug(" r="); debug_dump_atom(right, left, &right->atoms.head[xc_to_y(x, c, delta)-1]);
- }
- while (x > 0 && xc_to_y(x, c, delta) > 0
- && diff_atom_same(&left->atoms.head[x-1], &right->atoms.head[xc_to_y(x, c, delta)-1]))
- x--;
- kd_backward[c] = x;
-
- if (DEBUG) {
- int fi;
- for (fi = d; fi >= c; fi--) {
- debug("kd_backward[%d] = (%d, %d)\n", fi, kd_backward[fi],
- kd_backward[fi] - fi + delta);
- /*
- if (kd_backward[fi] >= 0 && kd_backward[fi] < left->atoms.len)
- debug_dump_atom(left, right, &left->atoms.head[kd_backward[fi]]);
- else
- debug("\n");
- if (kd_backward[fi]-fi+delta >= 0 && kd_backward[fi]-fi+delta < right->atoms.len)
- debug_dump_atom(right, left, &right->atoms.head[kd_backward[fi]-fi+delta]);
- else
- debug("\n");
- */
- }
- }
-
- if (x < 0 || x > left->atoms.len
- || xc_to_y(x, c, delta) < 0 || xc_to_y(x, c, delta) > right->atoms.len)
- continue;
-
- /* Figured out a new backwards traversal, see if this has gone onto or even past a preceding forwards
- * traversal.
- *
- * If the delta in length is even, then d and backwards_d hit the same state indexes -- note how this is
- * different from in the forwards traversal, because now both d are the same:
- *
- * | d= 0 1 2 2 1 0
- * ----+---------------- --------------------
- * k= | c=
- * 4 |
- * |
- * 3 | 3
- * | same
- * 2 | 2,0====5,2 2
- * | / \
- * 1 | 1,0 5,3 1
- * | / / \
- * 0 | -->0,0 3,3====4,3 5,4<-- 0
- * | \ / /
- * -1 | 0,1 4,4 -1
- * | \
- * -2 | 0,2 -2
- * |
- * -3
- * If the delta is odd, they end up off-by-one, i.e. on different diagonals.
- * So in the backward path, we can only match up diagonals when the delta is even.
- */
- if ((delta & 1) == 0) {
- /* Forwards was done first, now both d are the same. */
- int forwards_d = d;
-
- /* As soon as the lengths are not the same, the backwards traversal starts on a different diagonal, and
- * c = k shifted by the difference in length.
- */
- int k = c_to_k(c, delta);
-
- /* When the file sizes are very different, the traversal trees start on far distant diagonals.
- * They don't necessarily meet straight on. See whether this backward value is also on a valid
- * diagonal in kd_forward[], and match them if so. */
- if (k >= -forwards_d && k <= forwards_d) {
- /* Current c is on a diagonal that exists in kd_forward[]. If the two x positions have
- * met or passed (backward walked onto or past forward), then we've found a midpoint / a
- * mid-box. */
- int forward_x = kd_forward[k];
- int forward_y = xk_to_y(forward_x, k);
- debug("Compare %d to %d k=%d (x=%d,y=%d) to (x=%d,y=%d)\n",
- forward_x, x, k,
- forward_x, xk_to_y(forward_x, k), x, xc_to_y(x, c, delta));
- if (forward_x <= prev_x && forward_y <= prev_y
- && forward_x >= x) {
- *meeting_snake = (struct diff_box){
- .left_start = x,
- .left_end = forward_x,
- .right_start = xc_to_y(x, c, delta),
- .right_end = xk_to_y(forward_x, k),
- };
- debug("HIT x=%u,%u - y=%u,%u\n",
- meeting_snake->left_start,
- meeting_snake->right_start,
- meeting_snake->left_end,
- meeting_snake->right_end);
- return;
- }
- }
- }
- }
-}
-
-/* Myers "Divide et Impera": tracing forwards from the start and backwards from the end to find a midpoint that divides
- * the problem into smaller chunks. Requires only linear amounts of memory. */
-enum diff_rc diff_algo_myers_divide(const struct diff_algo_config *algo_config, struct diff_state *state)
-{
- enum diff_rc rc = DIFF_RC_ENOMEM;
- struct diff_data *left = &state->left;
- struct diff_data *right = &state->right;
-
- debug("\n** %s\n", __func__);
- debug("left:\n");
- debug_dump(left);
- debug("right:\n");
- debug_dump(right);
- debug_dump_myers_graph(left, right, NULL);
-
- /* Allocate two columns of a Myers graph, one for the forward and one for the backward traversal. */
- unsigned int max = left->atoms.len + right->atoms.len;
- size_t kd_len = max + 1;
- size_t kd_buf_size = kd_len << 1;
- int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int));
- if (!kd_buf)
- return DIFF_RC_ENOMEM;
- int i;
- for (i = 0; i < kd_buf_size; i++)
- kd_buf[i] = -1;
- int *kd_forward = kd_buf;
- int *kd_backward = kd_buf + kd_len;
-
- /* The 'k' axis in Myers spans positive and negative indexes, so point the kd to the middle.
- * It is then possible to index from -max/2 .. max/2. */
- kd_forward += max/2;
- kd_backward += max/2;
-
- int d;
- struct diff_box mid_snake = {};
- for (d = 0; d <= (max/2); d++) {
- debug("-- d=%d\n", d);
- diff_divide_myers_forward(left, right, kd_forward, kd_backward, d, &mid_snake);
- if (!diff_box_empty(&mid_snake))
- break;
- diff_divide_myers_backward(left, right, kd_forward, kd_backward, d, &mid_snake);
- if (!diff_box_empty(&mid_snake))
- break;
- }
-
- if (diff_box_empty(&mid_snake)) {
- /* Divide and conquer failed to find a meeting point. Use the fallback_algo defined in the algo_config
- * (leave this to the caller). This is just paranoia/sanity, we normally should always find a midpoint.
- */
- debug(" no midpoint \n");
- rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;
- goto return_rc;
- } else {
- debug(" mid snake L: %u to %u of %u R: %u to %u of %u\n",
- mid_snake.left_start, mid_snake.left_end, left->atoms.len,
- mid_snake.right_start, mid_snake.right_end, right->atoms.len);
-
- /* Section before the mid-snake. */
- debug("Section before the mid-snake\n");
-
- struct diff_atom *left_atom = &left->atoms.head[0];
- unsigned int left_section_len = mid_snake.left_start;
- struct diff_atom *right_atom = &right->atoms.head[0];
- unsigned int right_section_len = mid_snake.right_start;
-
- if (left_section_len && right_section_len) {
- /* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */
- if (!diff_state_add_chunk(state, false,
- left_atom, left_section_len,
- right_atom, right_section_len))
- goto return_rc;
- } else if (left_section_len && !right_section_len) {
- /* Only left atoms and none on the right, they form a "minus" chunk, then. */
- if (!diff_state_add_chunk(state, true,
- left_atom, left_section_len,
- right_atom, 0))
- goto return_rc;
- } else if (!left_section_len && right_section_len) {
- /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
- if (!diff_state_add_chunk(state, true,
- left_atom, 0,
- right_atom, right_section_len))
- goto return_rc;
- }
- /* else: left_section_len == 0 and right_section_len == 0, i.e. nothing before the mid-snake. */
-
- /* the mid-snake, identical data on both sides: */
- debug("the mid-snake\n");
- if (!diff_state_add_chunk(state, true,
- &left->atoms.head[mid_snake.left_start],
- mid_snake.left_end - mid_snake.left_start,
- &right->atoms.head[mid_snake.right_start],
- mid_snake.right_end - mid_snake.right_start))
- goto return_rc;
-
- /* Section after the mid-snake. */
- debug("Section after the mid-snake\n");
- debug(" left_end %u right_end %u\n", mid_snake.left_end, mid_snake.right_end);
- debug(" left_count %u right_count %u\n", left->atoms.len, right->atoms.len);
- left_atom = &left->atoms.head[mid_snake.left_end];
- left_section_len = left->atoms.len - mid_snake.left_end;
- right_atom = &right->atoms.head[mid_snake.right_end];
- right_section_len = right->atoms.len - mid_snake.right_end;
-
- if (left_section_len && right_section_len) {
- /* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */
- if (!diff_state_add_chunk(state, false,
- left_atom, left_section_len,
- right_atom, right_section_len))
- goto return_rc;
- } else if (left_section_len && !right_section_len) {
- /* Only left atoms and none on the right, they form a "minus" chunk, then. */
- if (!diff_state_add_chunk(state, true,
- left_atom, left_section_len,
- right_atom, 0))
- goto return_rc;
- } else if (!left_section_len && right_section_len) {
- /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
- if (!diff_state_add_chunk(state, true,
- left_atom, 0,
- right_atom, right_section_len))
- goto return_rc;
- }
- /* else: left_section_len == 0 and right_section_len == 0, i.e. nothing after the mid-snake. */
- }
-
- rc = DIFF_RC_OK;
-
-return_rc:
- free(kd_buf);
- debug("** END %s\n", __func__);
- return rc;
-}
-
-/* Myers Diff tracing from the start all the way through to the end, requiring quadratic amounts of memory. This can
- * fail if the required space surpasses algo_config->permitted_state_size. */
-enum diff_rc diff_algo_myers(const struct diff_algo_config *algo_config, struct diff_state *state)
-{
- /* do a diff_divide_myers_forward() without a _backward(), so that it walks forward across the entire
- * files to reach the end. Keep each run's state, and do a final backtrace. */
- enum diff_rc rc = DIFF_RC_ENOMEM;
- struct diff_data *left = &state->left;
- struct diff_data *right = &state->right;
-
- debug("\n** %s\n", __func__);
- debug("left:\n");
- debug_dump(left);
- debug("right:\n");
- debug_dump(right);
- debug_dump_myers_graph(left, right, NULL);
-
- /* Allocate two columns of a Myers graph, one for the forward and one for the backward traversal. */
- unsigned int max = left->atoms.len + right->atoms.len;
- size_t kd_len = max + 1 + max;
- size_t kd_buf_size = kd_len * kd_len;
- debug("state size: %zu\n", kd_buf_size);
- if (kd_buf_size < kd_len /* overflow? */
- || kd_buf_size * sizeof(int) > algo_config->permitted_state_size) {
- debug("state size %zu > permitted_state_size %zu, use fallback_algo\n",
- kd_buf_size, algo_config->permitted_state_size);
- return DIFF_RC_USE_DIFF_ALGO_FALLBACK;
- }
-
- int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int));
- if (!kd_buf)
- return DIFF_RC_ENOMEM;
- int i;
- for (i = 0; i < kd_buf_size; i++)
- kd_buf[i] = -1;
-
- /* The 'k' axis in Myers spans positive and negative indexes, so point the kd to the middle.
- * It is then possible to index from -max .. max. */
- int *kd_origin = kd_buf + max;
- int *kd_column = kd_origin;
-
- int d;
- int backtrack_d = -1;
- int backtrack_k = 0;
- int k;
- int x, y;
- for (d = 0; d <= max; d++, kd_column += kd_len) {
- debug("-- d=%d\n", d);
-
- debug("-- %s d=%d\n", __func__, d);
-
- for (k = d; k >= -d; k -= 2) {
- if (k < -(int)right->atoms.len || k > (int)left->atoms.len) {
- /* This diagonal is completely outside of the Myers graph, don't calculate it. */
- if (k < -(int)right->atoms.len)
- debug(" %d k < -(int)right->atoms.len %d\n", k, -(int)right->atoms.len);
- else
- debug(" %d k > left->atoms.len %d\n", k, left->atoms.len);
- if (k < 0) {
- /* We are traversing negatively, and already below the entire graph, nothing will come
- * of this. */
- debug(" break");
- break;
- }
- debug(" continue");
- continue;
- }
-
- debug("- k = %d\n", k);
- if (d == 0) {
- /* This is the initializing step. There is no prev_k yet, get the initial x from the top left of
- * the Myers graph. */
- x = 0;
- } else {
- int *kd_prev_column = kd_column - kd_len;
-
- /* Favoring "-" lines first means favoring moving rightwards in the Myers graph.
- * For this, all k should derive from k - 1, only the bottom most k derive from k + 1:
- *
- * | d= 0 1 2
- * ----+----------------
- * k= |
- * 2 | 2,0 <-- from prev_k = 2 - 1 = 1
- * | /
- * 1 | 1,0
- * | /
- * 0 | -->0,0 3,3
- * | \\ /
- * -1 | 0,1 <-- bottom most for d=1 from prev_k = -1 + 1 = 0
- * | \\
- * -2 | 0,2 <-- bottom most for d=2 from prev_k = -2 + 1 = -1
- *
- * Except when a k + 1 from a previous run already means a further advancement in the graph.
- * If k == d, there is no k + 1 and k - 1 is the only option.
- * If k < d, use k + 1 in case that yields a larger x. Also use k + 1 if k - 1 is outside the graph.
- */
- if (k > -d && (k == d
- || (k - 1 >= -(int)right->atoms.len
- && kd_prev_column[k - 1] >= kd_prev_column[k + 1]))) {
- /* Advance from k - 1.
- * From position prev_k, step to the right in the Myers graph: x += 1.
- */
- int prev_k = k - 1;
- int prev_x = kd_prev_column[prev_k];
- x = prev_x + 1;
- } else {
- /* The bottom most one.
- * From position prev_k, step to the bottom in the Myers graph: y += 1.
- * Incrementing y is achieved by decrementing k while keeping the same x.
- * (since we're deriving y from y = x - k).
- */
- int prev_k = k + 1;
- int prev_x = kd_prev_column[prev_k];
- x = prev_x;
- }
- }
-
- /* Slide down any snake that we might find here. */
- while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len
- && diff_atom_same(&left->atoms.head[x], &right->atoms.head[xk_to_y(x, k)]))
- x++;
- kd_column[k] = x;
-
- if (DEBUG) {
- int fi;
- for (fi = d; fi >= k; fi-=2) {
- debug("kd_column[%d] = (%d, %d)\n", fi, kd_column[fi], kd_column[fi] - fi);
-#if 0
- if (kd_column[fi] >= 0 && kd_column[fi] < left->atoms.len)
- debug_dump_atom(left, right, &left->atoms.head[kd_column[fi]]);
- else
- debug("\n");
- if (kd_column[fi]-fi >= 0 && kd_column[fi]-fi < right->atoms.len)
- debug_dump_atom(right, left, &right->atoms.head[kd_column[fi]-fi]);
- else
- debug("\n");
-#endif
- }
- }
-
- if (x == left->atoms.len && xk_to_y(x, k) == right->atoms.len) {
- /* Found a path */
- backtrack_d = d;
- backtrack_k = k;
- debug("Reached the end at d = %d, k = %d\n",
- backtrack_d, backtrack_k);
- break;
- }
- }
-
- if (backtrack_d >= 0)
- break;
- }
-
- debug_dump_myers_graph(left, right, kd_origin);
-
- /* backtrack. A matrix spanning from start to end of the file is ready:
- *
- * | d= 0 1 2 3 4
- * ----+---------------------------------
- * k= |
- * 3 |
- * |
- * 2 | 2,0
- * | /
- * 1 | 1,0 4,3
- * | / / \
- * 0 | -->0,0 3,3 4,4 --> backtrack_d = 4, backtrack_k = 0
- * | \ / \
- * -1 | 0,1 3,4
- * | \
- * -2 | 0,2
- * |
- *
- * From (4,4) backwards, find the previous position that is the largest, and remember it.
- *
- */
- for (d = backtrack_d, k = backtrack_k; d >= 0; d--) {
- x = kd_column[k];
- y = xk_to_y(x, k);
-
- /* When the best position is identified, remember it for that kd_column.
- * That kd_column is no longer needed otherwise, so just re-purpose kd_column[0] = x and kd_column[1] = y,
- * so that there is no need to allocate more memory.
- */
- kd_column[0] = x;
- kd_column[1] = y;
- debug("Backtrack d=%d: xy=(%d, %d)\n",
- d, kd_column[0], kd_column[1]);
-
- /* Don't access memory before kd_buf */
- if (d == 0)
- break;
- int *kd_prev_column = kd_column - kd_len;
-
- /* When y == 0, backtracking downwards (k-1) is the only way.
- * When x == 0, backtracking upwards (k+1) is the only way.
- *
- * | d= 0 1 2 3 4
- * ----+---------------------------------
- * k= |
- * 3 |
- * | ..y == 0
- * 2 | 2,0
- * | /
- * 1 | 1,0 4,3
- * | / / \
- * 0 | -->0,0 3,3 4,4 --> backtrack_d = 4, backtrack_k = 0
- * | \ / \
- * -1 | 0,1 3,4
- * | \
- * -2 | 0,2__
- * | x == 0
- */
- debug("prev[k-1] = %d,%d prev[k+1] = %d,%d\n",
- kd_prev_column[k-1], xk_to_y(kd_prev_column[k-1],k-1),
- kd_prev_column[k+1], xk_to_y(kd_prev_column[k+1],k+1));
- if (y == 0
- || (x > 0 && kd_prev_column[k - 1] >= kd_prev_column[k + 1])) {
- k = k - 1;
- debug("prev k=k-1=%d x=%d y=%d\n",
- k, kd_prev_column[k], xk_to_y(kd_prev_column[k], k));
- } else {
- k = k + 1;
- debug("prev k=k+1=%d x=%d y=%d\n",
- k, kd_prev_column[k], xk_to_y(kd_prev_column[k], k));
- }
- kd_column = kd_prev_column;
- }
-
- /* Forwards again, this time recording the diff chunks.
- * Definitely start from 0,0. kd_column[0] may actually point to the bottom of a snake starting at 0,0 */
- x = 0;
- y = 0;
-
- kd_column = kd_origin;
- for (d = 0; d <= backtrack_d; d++, kd_column += kd_len) {
- int next_x = kd_column[0];
- int next_y = kd_column[1];
- debug("Forward track from xy(%d,%d) to xy(%d,%d)\n",
- x, y, next_x, next_y);
-
- struct diff_atom *left_atom = &left->atoms.head[x];
- int left_section_len = next_x - x;
- struct diff_atom *right_atom = &right->atoms.head[y];
- int right_section_len = next_y - y;
-
- rc = DIFF_RC_ENOMEM;
- if (left_section_len && right_section_len) {
- /* This must be a snake slide.
- * Snake slides have a straight line leading into them (except when starting at (0,0)). Find
- * out whether the lead-in is horizontal or vertical:
- *
- * left
- * ---------->
- * |
- * r| o-o o
- * i| \ |
- * g| o o
- * h| \ \
- * t| o o
- * v
- *
- * If left_section_len > right_section_len, the lead-in is horizontal, meaning first
- * remove one atom from the left before sliding down the snake.
- * If right_section_len > left_section_len, the lead-in is vetical, so add one atom from
- * the right before sliding down the snake. */
- if (left_section_len == right_section_len + 1) {
- if (!diff_state_add_chunk(state, true,
- left_atom, 1,
- right_atom, 0))
- goto return_rc;
- left_atom++;
- left_section_len--;
- } else if (right_section_len == left_section_len + 1) {
- if (!diff_state_add_chunk(state, true,
- left_atom, 0,
- right_atom, 1))
- goto return_rc;
- right_atom++;
- right_section_len--;
- } else if (left_section_len != right_section_len) {
- /* The numbers are making no sense. Should never happen. */
- rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;
- goto return_rc;
- }
-
- if (!diff_state_add_chunk(state, true,
- left_atom, left_section_len,
- right_atom, right_section_len))
- goto return_rc;
- } else if (left_section_len && !right_section_len) {
- /* Only left atoms and none on the right, they form a "minus" chunk, then. */
- if (!diff_state_add_chunk(state, true,
- left_atom, left_section_len,
- right_atom, 0))
- goto return_rc;
- } else if (!left_section_len && right_section_len) {
- /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
- if (!diff_state_add_chunk(state, true,
- left_atom, 0,
- right_atom, right_section_len))
- goto return_rc;
- }
-
- x = next_x;
- y = next_y;
- }
-
- rc = DIFF_RC_OK;
-
-return_rc:
- free(kd_buf);
- debug("** END %s rc=%d\n", __func__, rc);
- return rc;
-}
blob - 4c76a249cac3df710e57ed404bb9a918acf41870 (mode 644)
blob + /dev/null
--- lib/diff_output.c
+++ /dev/null
-/* Common parts for printing diff output */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <diff/diff_output.h>
-
-void diff_output_lines(FILE *dest, const char *prefix, struct diff_atom *start_atom, unsigned int count)
-{
- struct diff_atom *atom;
- foreach_diff_atom(atom, start_atom, count) {
- fprintf(dest, "%s", prefix);
- int i;
- unsigned int len = atom->len;
- if (len && atom->at[len - 1] == '\n') {
- len--;
- if (len && atom->at[len - 1] == '\r')
- len--;
- }
-
- for (i = 0; i < len; i++) {
- char c = atom->at[i];
- if ((c < 0x20 || c >= 0x7f) && c != '\t')
- fprintf(dest, "\\x%02x", (unsigned char)c);
- else
- fprintf(dest, "%c", c);
- }
- fprintf(dest, "\n");
- }
-}
-
-enum diff_rc diff_output_info(FILE *dest, const struct diff_input_info *info)
-{
- if (info->arbitrary_info && *info->arbitrary_info)
- fprintf(dest, "%s", info->arbitrary_info);
- fprintf(dest, "--- %s\n+++ %s\n",
- info->left_path ? : "a",
- info->right_path ? : "b");
- return DIFF_RC_OK;
-}
blob - 796c3ee69f83bf338c67c8b310b27c2dea7501f1 (mode 644)
blob + /dev/null
--- lib/diff_output_plain.c
+++ /dev/null
-/* Output all lines of a diff_result. */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <diff/diff_output.h>
-
-enum diff_rc diff_output_plain(FILE *dest, const struct diff_input_info *info,
- const struct diff_result *result)
-{
- if (!result)
- return DIFF_RC_EINVAL;
- if (result->rc != DIFF_RC_OK)
- return result->rc;
-
- diff_output_info(dest, info);
-
- int i;
- for (i = 0; i < result->chunks.len; i++) {
- struct diff_chunk *c = &result->chunks.head[i];
- if (c->left_count && c->right_count)
- diff_output_lines(dest, c->solved ? " " : "?", c->left_start, c->left_count);
- else if (c->left_count && !c->right_count)
- diff_output_lines(dest, c->solved ? "-" : "?", c->left_start, c->left_count);
- else if (c->right_count && !c->left_count)
- diff_output_lines(dest, c->solved ? "+" : "?", c->right_start, c->right_count);
- }
- return DIFF_RC_OK;
-}
-
-enum diff_rc diff_plain(FILE *dest, const struct diff_config *diff_config,
- const struct diff_input_info *info,
- const char *left, int left_len, const char *right, int right_len)
-{
- enum diff_rc rc;
- left_len = left_len < 0 ? strlen(left) : left_len;
- right_len = right_len < 0 ? strlen(right) : right_len;
- struct diff_result *result = diff_main(diff_config, left, left_len, right, right_len);
- rc = diff_output_plain(dest, info, result);
- diff_result_free(result);
- return rc;
-}
blob - 986d06cbf5ec125bca1bfbce84d7cd3378501735 (mode 644)
blob + /dev/null
--- lib/diff_output_unidiff.c
+++ /dev/null
-/* Produce a unidiff output from a diff_result. */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <diff/diff_output.h>
-
-#include "debug.h"
-
-enum chunk_type {
- CHUNK_EMPTY,
- CHUNK_PLUS,
- CHUNK_MINUS,
- CHUNK_SAME,
- CHUNK_WEIRD,
-};
-
-static inline enum chunk_type chunk_type(const struct diff_chunk *chunk)
-{
- if (!chunk->left_count && !chunk->right_count)
- return CHUNK_EMPTY;
- if (!chunk->solved)
- return CHUNK_WEIRD;
- if (!chunk->right_count)
- return CHUNK_MINUS;
- if (!chunk->left_count)
- return CHUNK_PLUS;
- if (chunk->left_count != chunk->right_count)
- return CHUNK_WEIRD;
- return CHUNK_SAME;
-}
-
-struct chunk_context {
- struct range chunk;
- struct range left, right;
-};
-
-static bool chunk_context_empty(const struct chunk_context *cc)
-{
- return range_empty(&cc->chunk);
-}
-
-static void chunk_context_get(struct chunk_context *cc, const struct diff_result *r, int chunk_idx,
- int context_lines)
-{
- const struct diff_chunk *c = &r->chunks.head[chunk_idx];
- int left_start = diff_atom_root_idx(&r->left, c->left_start);
- int right_start = diff_atom_root_idx(&r->right, c->right_start);
-
- *cc = (struct chunk_context){
- .chunk = {
- .start = chunk_idx,
- .end = chunk_idx + 1,
- },
- .left = {
- .start = MAX(0, left_start - context_lines),
- .end = MIN(r->left.atoms.len, left_start + c->left_count + context_lines),
- },
- .right = {
- .start = MAX(0, right_start - context_lines),
- .end = MIN(r->right.atoms.len, right_start + c->right_count + context_lines),
- },
- };
-}
-
-static bool chunk_contexts_touch(const struct chunk_context *cc, const struct chunk_context *other)
-{
- return ranges_touch(&cc->chunk, &other->chunk)
- || ranges_touch(&cc->left, &other->left)
- || ranges_touch(&cc->right, &other->right);
-}
-
-static void chunk_contexts_merge(struct chunk_context *cc, const struct chunk_context *other)
-{
- ranges_merge(&cc->chunk, &other->chunk);
- ranges_merge(&cc->left, &other->left);
- ranges_merge(&cc->right, &other->right);
-}
-
-static void diff_output_unidiff_chunk(FILE *dest, bool *info_printed, const struct diff_input_info *info,
- const struct diff_result *result, const struct chunk_context *cc)
-{
- if (range_empty(&cc->left) && range_empty(&cc->right))
- return;
-
- if (!(*info_printed)) {
- diff_output_info(dest, info);
- *info_printed = true;
- }
-
- fprintf(dest, "@@ -%d,%d +%d,%d @@\n",
- cc->left.start + 1, cc->left.end - cc->left.start,
- cc->right.start + 1, cc->right.end - cc->right.start);
-
- /* Got the absolute line numbers where to start printing, and the index of the interesting (non-context) chunk.
- * To print context lines above the interesting chunk, nipping on the previous chunk index may be necessary.
- * It is guaranteed to be only context lines where left == right, so it suffices to look on the left. */
- const struct diff_chunk *first_chunk = &result->chunks.head[cc->chunk.start];
- int chunk_start_line = diff_atom_root_idx(&result->left, first_chunk->left_start);
- if (cc->left.start < chunk_start_line)
- diff_output_lines(dest, " ", &result->left.atoms.head[cc->left.start],
- chunk_start_line - cc->left.start);
-
- /* Now write out all the joined chunks and contexts between them */
- int c_idx;
- for (c_idx = cc->chunk.start; c_idx < cc->chunk.end; c_idx++) {
- const struct diff_chunk *c = &result->chunks.head[c_idx];
-
- if (c->left_count && c->right_count)
- diff_output_lines(dest, c->solved ? " " : "?", c->left_start, c->left_count);
- else if (c->left_count && !c->right_count)
- diff_output_lines(dest, c->solved ? "-" : "?", c->left_start, c->left_count);
- else if (c->right_count && !c->left_count)
- diff_output_lines(dest, c->solved ? "+" : "?", c->right_start, c->right_count);
- }
-
- /* Trailing context? */
- const struct diff_chunk *last_chunk = &result->chunks.head[cc->chunk.end - 1];
- int chunk_end_line = diff_atom_root_idx(&result->left, last_chunk->left_start + last_chunk->left_count);
- if (cc->left.end > chunk_end_line)
- diff_output_lines(dest, " ", &result->left.atoms.head[chunk_end_line],
- cc->left.end - chunk_end_line);
-}
-
-enum diff_rc diff_output_unidiff(FILE *dest, const struct diff_input_info *info,
- const struct diff_result *result, unsigned int context_lines)
-{
- if (!result)
- return DIFF_RC_EINVAL;
- if (result->rc != DIFF_RC_OK)
- return result->rc;
-
- struct chunk_context cc = {};
- bool info_printed = false;
-
- int i;
- for (i = 0; i < result->chunks.len; i++) {
- struct diff_chunk *c = &result->chunks.head[i];
- enum chunk_type t = chunk_type(c);
-
- if (t == CHUNK_MINUS || t == CHUNK_PLUS) {
- if (chunk_context_empty(&cc)) {
- /* These are the first lines being printed.
- * Note down the start point, any number of subsequent chunks may be joined up to this
- * unidiff chunk by context lines or by being directly adjacent. */
- chunk_context_get(&cc, result, i, context_lines);
- debug("new chunk to be printed: chunk %d-%d left %d-%d right %d-%d\n",
- cc.chunk.start, cc.chunk.end,
- cc.left.start, cc.left.end, cc.right.start, cc.right.end);
- } else {
- /* There already is a previous chunk noted down for being printed.
- * Does it join up with this one? */
- struct chunk_context next;
- chunk_context_get(&next, result, i, context_lines);
- debug("new chunk to be printed: chunk %d-%d left %d-%d right %d-%d\n",
- next.chunk.start, next.chunk.end,
- next.left.start, next.left.end, next.right.start, next.right.end);
- if (chunk_contexts_touch(&cc, &next)) {
- /* This next context touches or overlaps the previous one, join. */
- chunk_contexts_merge(&cc, &next);
- debug("new chunk to be printed touches previous chunk, now: left %d-%d right %d-%d\n",
- cc.left.start, cc.left.end, cc.right.start, cc.right.end);
- } else {
- /* No touching, so the previous context is complete with a gap between it and
- * this next one. Print the previous one and start fresh here. */
- debug("new chunk to be printed does not touch previous chunk; print left %d-%d right %d-%d\n",
- cc.left.start, cc.left.end, cc.right.start, cc.right.end);
- diff_output_unidiff_chunk(dest, &info_printed, info, result, &cc);
- cc = next;
- debug("new unprinted chunk is left %d-%d right %d-%d\n",
- cc.left.start, cc.left.end, cc.right.start, cc.right.end);
- }
- }
- }
-
- }
-
- if (!chunk_context_empty(&cc))
- diff_output_unidiff_chunk(dest, &info_printed, info, result, &cc);
- return DIFF_RC_OK;
-}
-
-enum diff_rc diff_unidiff(FILE *dest, const struct diff_config *diff_config,
- const struct diff_input_info *info,
- const char *left, int left_len, const char *right, int right_len,
- unsigned int context_lines)
-{
- enum diff_rc rc;
- left_len = left_len < 0 ? strlen(left) : left_len;
- right_len = right_len < 0 ? strlen(right) : right_len;
- struct diff_result *result = diff_main(diff_config, left, left_len, right, right_len);
- rc = diff_output_unidiff(dest, info, result, context_lines);
- diff_result_free(result);
- return rc;
-}
blob - 17a6bffe139d685b095825de47ef10ff51e508b5 (mode 644)
blob + /dev/null
--- lib/diff_patience.c
+++ /dev/null
-/* Implementation of the Patience Diff algorithm invented by Bram Cohen:
- * Divide a diff problem into smaller chunks by an LCS of common-unique lines. */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <assert.h>
-#include <diff/diff_main.h>
-
-#include "debug.h"
-
-/* Set unique_here = true for all atoms that exist exactly once in this list. */
-static void diff_atoms_mark_unique(struct diff_data *d, unsigned int *unique_count)
-{
- struct diff_atom *i;
- unsigned int count = 0;
- diff_data_foreach_atom(i, d) {
- i->patience.unique_here = true;
- i->patience.unique_in_both = true;
- count++;
- }
- diff_data_foreach_atom(i, d) {
- struct diff_atom *j;
-
- if (!i->patience.unique_here)
- continue;
-
- diff_data_foreach_atom_from(i + 1, j, d) {
- if (diff_atom_same(i, j)) {
- if (i->patience.unique_here) {
- i->patience.unique_here = false;
- i->patience.unique_in_both = false;
- count--;
- }
- j->patience.unique_here = false;
- j->patience.unique_in_both = false;
- count--;
- }
- }
- }
- if (unique_count)
- *unique_count = count;
-}
-
-/* Mark those lines as atom->patience.unique_in_both = true that appear exactly once in each side. */
-static void diff_atoms_mark_unique_in_both(struct diff_data *left, struct diff_data *right,
- unsigned int *unique_in_both_count)
-{
- /* Derive the final unique_in_both count without needing an explicit iteration. So this is just some
- * optimiziation to save one iteration in the end. */
- unsigned int unique_in_both;
-
- diff_atoms_mark_unique(left, &unique_in_both);
- diff_atoms_mark_unique(right, NULL);
-
- debug("unique_in_both %u\n", unique_in_both);
-
- struct diff_atom *i;
- diff_data_foreach_atom(i, left) {
- if (!i->patience.unique_here)
- continue;
- struct diff_atom *j;
- int found_in_b = 0;
- diff_data_foreach_atom(j, right) {
- if (!diff_atom_same(i, j))
- continue;
- if (!j->patience.unique_here) {
- found_in_b = 2; /* or more */
- break;
- } else {
- found_in_b = 1;
- j->patience.pos_in_other = i;
- i->patience.pos_in_other = j;
- }
- }
-
- if (found_in_b == 0 || found_in_b > 1) {
- i->patience.unique_in_both = false;
- unique_in_both--;
- debug("unique_in_both %u (%d) ", unique_in_both, found_in_b);
- debug_dump_atom(left, NULL, i);
- }
- }
-
- /* Still need to unmark right[*]->patience.unique_in_both for atoms that don't exist in left */
- diff_data_foreach_atom(i, right) {
- if (!i->patience.unique_here
- || !i->patience.unique_in_both)
- continue;
- struct diff_atom *j;
- bool found_in_a = false;
- diff_data_foreach_atom(j, left) {
- if (!j->patience.unique_in_both)
- continue;
- if (!diff_atom_same(i, j))
- continue;
- found_in_a = true;
- break;
- }
-
- if (!found_in_a)
- i->patience.unique_in_both = false;
- }
-
- if (unique_in_both_count)
- *unique_in_both_count = unique_in_both;
-}
-
-static void diff_atoms_swallow_identical_neighbors(struct diff_data *left, struct diff_data *right,
- unsigned int *unique_in_both_count)
-{
- debug("trivially combine identical lines arount unique_in_both lines\n");
-
- unsigned int l_idx;
- unsigned int next_l_idx;
- unsigned int l_min = 0;
- unsigned int r_min = 0;
- for (l_idx = 0; l_idx < left->atoms.len; l_idx = next_l_idx) {
- next_l_idx = l_idx + 1;
- struct diff_atom *l = &left->atoms.head[l_idx];
-
- if (!l->patience.unique_in_both)
- continue;
-
- debug("check identical lines around ");
- debug_dump_atom(left, right, l);
-
- unsigned int r_idx = diff_atom_idx(right, l->patience.pos_in_other);
-
- struct range identical_l;
- struct range identical_r;
-
- /* Swallow upwards.
- * Each common-unique line swallows identical lines upwards and downwards.
- * All common-unique lines that were part of the identical lines following below were already swallowed
- * in the previous iteration, so we will never hit another common-unique line above. */
- for (identical_l.start = l_idx, identical_r.start = r_idx;
- identical_l.start > l_min
- && identical_r.start > r_min
- && diff_atom_same(&left->atoms.head[identical_l.start - 1],
- &right->atoms.head[identical_r.start - 1]);
- identical_l.start--, identical_r.start--);
-
- /* Swallow downwards */
- for (identical_l.end = l_idx + 1, identical_r.end = r_idx + 1;
- identical_l.end < left->atoms.len
- && identical_r.end < right->atoms.len
- && diff_atom_same(&left->atoms.head[identical_l.end],
- &right->atoms.head[identical_r.end]);
- identical_l.end++, identical_r.end++,
- next_l_idx++) {
- if (left->atoms.head[identical_l.end].patience.unique_in_both) {
- /* Part of a chunk of identical lines, remove from listing of unique_in_both lines */
- left->atoms.head[identical_l.end].patience.unique_in_both = false;
- right->atoms.head[identical_r.end].patience.unique_in_both = false;
- (*unique_in_both_count)--;
- }
- }
-
- l->patience.identical_lines = identical_l;
- l->patience.pos_in_other->patience.identical_lines = identical_r;
-
- l_min = identical_l.end;
- r_min = identical_r.end;
-
- if (!range_empty(&l->patience.identical_lines)) {
- debug("common-unique line at l=%u r=%u swallowed identical lines l=%u-%u r=%u-%u\n",
- l_idx, r_idx,
- identical_l.start, identical_l.end,
- identical_r.start, identical_r.end);
- }
- debug("next_l_idx = %u\n", next_l_idx);
- }
-}
-
-/* Among the lines that appear exactly once in each side, find the longest streak that appear in both files in the same
- * order (with other stuff allowed to interleave). Use patience sort for that, as in the Patience Diff algorithm.
- * See https://bramcohen.livejournal.com/73318.html and, for a much more detailed explanation,
- * https://blog.jcoglan.com/2017/09/19/the-patience-diff-algorithm/ */
-enum diff_rc diff_algo_patience(const struct diff_algo_config *algo_config, struct diff_state *state)
-{
- enum diff_rc rc = DIFF_RC_ENOMEM;
-
- struct diff_data *left = &state->left;
- struct diff_data *right = &state->right;
-
- unsigned int unique_in_both_count;
-
- debug("\n** %s\n", __func__);
-
- /* Find those lines that appear exactly once in 'left' and exactly once in 'right'. */
- diff_atoms_mark_unique_in_both(left, right, &unique_in_both_count);
-
- debug("unique_in_both_count %u\n", unique_in_both_count);
- debug("left:\n");
- debug_dump(left);
- debug("right:\n");
- debug_dump(right);
-
- if (!unique_in_both_count) {
- /* Cannot apply Patience, tell the caller to use fallback_algo instead. */
- return DIFF_RC_USE_DIFF_ALGO_FALLBACK;
- }
-
- diff_atoms_swallow_identical_neighbors(left, right, &unique_in_both_count);
- debug("After swallowing identical neighbors: unique_in_both = %u\n",
- unique_in_both_count);
-
- /* An array of Longest Common Sequence is the result of the below subscope: */
- unsigned int lcs_count = 0;
- struct diff_atom **lcs = NULL;
- struct diff_atom *lcs_tail = NULL;
-
- {
- /* This subscope marks the lifetime of the atom_pointers allocation */
-
- /* One chunk of storage for atom pointers */
- struct diff_atom **atom_pointers = recallocarray(NULL, 0, unique_in_both_count * 2, sizeof(struct diff_atom*));
-
- /* Half for the list of atoms that still need to be put on stacks */
- struct diff_atom **uniques = atom_pointers;
-
- /* Half for the patience sort state's "card stacks" -- we remember only each stack's topmost "card" */
- struct diff_atom **patience_stacks = atom_pointers + unique_in_both_count;
- unsigned int patience_stacks_count = 0;
-
- /* Take all common, unique items from 'left' ... */
-
- struct diff_atom *atom;
- struct diff_atom **uniques_end = uniques;
- diff_data_foreach_atom(atom, left) {
- if (!atom->patience.unique_in_both)
- continue;
- *uniques_end = atom;
- uniques_end++;
- }
-
- /* ...and sort them to the order found in 'right'.
- * The idea is to find the leftmost stack that has a higher line number and add it to the stack's top.
- * If there is no such stack, open a new one on the right. The line number is derived from the atom*,
- * which are array items and hence reflect the relative position in the source file. So we got the
- * common-uniques from 'left' and sort them according to atom->patience.pos_in_other. */
- unsigned int i;
- for (i = 0; i < unique_in_both_count; i++) {
- atom = uniques[i];
- unsigned int target_stack;
-
- if (!patience_stacks_count)
- target_stack = 0;
- else {
- /* binary search to find the stack to put this atom "card" on. */
- unsigned int lo = 0;
- unsigned int hi = patience_stacks_count;
- while (lo < hi) {
- unsigned int mid = (lo + hi) >> 1;
- if (patience_stacks[mid]->patience.pos_in_other < atom->patience.pos_in_other)
- lo = mid + 1;
- else
- hi = mid;
- }
-
- target_stack = lo;
- }
-
- assert(target_stack <= patience_stacks_count);
- patience_stacks[target_stack] = atom;
- if (target_stack == patience_stacks_count)
- patience_stacks_count++;
-
- /* Record a back reference to the next stack on the left, which will form the final longest sequence
- * later. */
- atom->patience.prev_stack = target_stack ? patience_stacks[target_stack - 1] : NULL;
-
- }
-
- /* backtrace through prev_stack references to form the final longest common sequence */
- lcs_tail = patience_stacks[patience_stacks_count - 1];
- lcs_count = patience_stacks_count;
-
- /* uniques and patience_stacks are no longer needed. Backpointers are in atom->patience.prev_stack */
- free(atom_pointers);
- }
-
- lcs = recallocarray(NULL, 0, lcs_count, sizeof(struct diff_atom*));
- struct diff_atom **lcs_backtrace_pos = &lcs[lcs_count - 1];
- struct diff_atom *atom;
- for (atom = lcs_tail; atom; atom = atom->patience.prev_stack, lcs_backtrace_pos--) {
- assert(lcs_backtrace_pos >= lcs);
- *lcs_backtrace_pos = atom;
- }
-
- unsigned int i;
- if (DEBUG) {
- debug("\npatience LCS:\n");
- for (i = 0; i < lcs_count; i++) {
- debug_dump_atom(left, right, lcs[i]);
- }
- }
-
-
- /* TODO: For each common-unique line found (now listed in lcs), swallow lines upwards and downwards that are
- * identical on each side. Requires a way to represent atoms being glued to adjacent atoms. */
-
- debug("\ntraverse LCS, possibly recursing:\n");
-
- /* Now we have pinned positions in both files at which it makes sense to divide the diff problem into smaller
- * chunks. Go into the next round: look at each section in turn, trying to again find common-unique lines in
- * those smaller sections. As soon as no more are found, the remaining smaller sections are solved by Myers. */
- unsigned int left_pos = 0;
- unsigned int right_pos = 0;
- for (i = 0; i <= lcs_count; i++) {
- struct diff_atom *atom;
- struct diff_atom *atom_r;
- unsigned int left_idx;
- unsigned int right_idx;
-
- if (i < lcs_count) {
- atom = lcs[i];
- atom_r = atom->patience.pos_in_other;
- debug("lcs[%u] = left[%u] = right[%u]\n", i,
- diff_atom_idx(left, atom), diff_atom_idx(right, atom_r));
- left_idx = atom->patience.identical_lines.start;
- right_idx = atom_r->patience.identical_lines.start;
- debug(" identical lines l %u-%u r %u-%u\n",
- atom->patience.identical_lines.start, atom->patience.identical_lines.end,
- atom_r->patience.identical_lines.start, atom_r->patience.identical_lines.end);
- } else {
- atom = NULL;
- atom_r = NULL;
- left_idx = left->atoms.len;
- right_idx = right->atoms.len;
- }
-
- /* 'atom' now marks an atom that matches on both sides according to patience-diff
- * (a common-unique identical atom in both files).
- * Handle the section before and the atom itself; the section after will be handled by the next loop
- * iteration -- note that i loops to last element + 1 ("i <= lcs_count"), so that there will be another
- * final iteration to pick up the last remaining items after the last LCS atom.
- * The sections before might also be empty on left and/or right.
- * left_pos and right_pos mark the indexes of the first atoms that have not yet been handled in the
- * previous loop iteration.
- * left_idx and right_idx mark the indexes of the matching atom on left and right, respectively. */
-
- debug("iteration %u left_pos %u left_idx %u right_pos %u right_idx %u\n",
- i, left_pos, left_idx, right_pos, right_idx);
-
- struct diff_chunk *chunk;
-
- /* Section before the matching atom */
- struct diff_atom *left_atom = &left->atoms.head[left_pos];
- unsigned int left_section_len = left_idx - left_pos;
-
- struct diff_atom *right_atom = &(right->atoms.head[right_pos]);
- unsigned int right_section_len = right_idx - right_pos;
-
- if (left_section_len && right_section_len) {
- /* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */
- if (!diff_state_add_chunk(state, false,
- left_atom, left_section_len,
- right_atom, right_section_len))
- goto return_rc;
- } else if (left_section_len && !right_section_len) {
- /* Only left atoms and none on the right, they form a "minus" chunk, then. */
- if (!diff_state_add_chunk(state, true,
- left_atom, left_section_len,
- right_atom, 0))
- goto return_rc;
- } else if (!left_section_len && right_section_len) {
- /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
- if (!diff_state_add_chunk(state, true,
- left_atom, 0,
- right_atom, right_section_len))
- goto return_rc;
- }
- /* else: left_section_len == 0 and right_section_len == 0, i.e. nothing here. */
-
- /* The atom found to match on both sides forms a chunk of equals on each side. In the very last
- * iteration of this loop, there is no matching atom, we were just cleaning out the remaining lines. */
- if (atom) {
- if (!diff_state_add_chunk(state, true,
- left->atoms.head + atom->patience.identical_lines.start,
- range_len(&atom->patience.identical_lines),
- right->atoms.head + atom_r->patience.identical_lines.start,
- range_len(&atom_r->patience.identical_lines)))
- goto return_rc;
- left_pos = atom->patience.identical_lines.end;
- right_pos = atom_r->patience.identical_lines.end;
- } else {
- left_pos = left_idx + 1;
- right_pos = right_idx + 1;
- }
- debug("end of iteration %u left_pos %u left_idx %u right_pos %u right_idx %u\n",
- i, left_pos, left_idx, right_pos, right_idx);
- }
- debug("** END %s\n", __func__);
-
- rc = DIFF_RC_OK;
-
-return_rc:
- free(lcs);
- return rc;
-}
blob - a6ff8bd1ed65c0db6c0feaf7866523a9d9f52948 (mode 644)
blob + /dev/null
--- lib/linux_Makefile
+++ /dev/null
-srcs = \
- diff_atomize_text.c \
- diff_main.c \
- diff_myers.c \
- diff_patience.c \
- diff_output.c \
- diff_output_plain.c \
- diff_output_unidiff.c \
- $(END)
-
-objs = $(srcs:.c=.o)
-
-libdiff.a: $(objs)
- ar rcs $@ $^
-
-CFLAGS = -fsanitize=address -fsanitize=undefined -g -O3
-
-%.o: %.c ./*.h ../include/diff/*.h
- gcc $(CFLAGS) -I../include -o $@ -c $<
-
-.PHONY: clean
-clean:
- -rm $(objs)
- -rm libdiff.a
blob - b4a9acb0cdc71e2d714c76d3f510873470cb8236 (mode 644)
blob + /dev/null
--- man/diff.1
+++ /dev/null
-.\" $OpenBSD$
-.\"
-.\" Copyright (c) 2018 Martin Pieuchot
-.\" Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
-.\"
-.\" Permission to use, copy, modify, and distribute this software for any
-.\" purpose with or without fee is hereby granted, provided that the above
-.\" copyright notice and this permission notice appear in all copies.
-.\"
-.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-.\"
-.Dd $Mdocdate: August 28 2017 $
-.Dt DIFF 1
-.Os
-.Sh NAME
-.Nm diff
-.Nd compare files
-.Sh SYNOPSIS
-.Nm diff
-.Ar file1 file2
-.Sh DESCRIPTION
-The
-.Nm
-utility compares the contents of
-.Ar file1
-and
-.Ar file2
-line by line.
-.Sh EXIT STATUS
-The
-.Nm
-utility exits with one of the following values:
-.Pp
-.Bl -tag -width Ds -offset indent -compact
-.It 0
-No differences were found.
-.It 1
-Differences were found.
-.It >1
-An error occurred.
-.El
blob - /dev/null
blob + b4a9acb0cdc71e2d714c76d3f510873470cb8236 (mode 644)
--- /dev/null
+++ diff.1
+.\" $OpenBSD$
+.\"
+.\" Copyright (c) 2018 Martin Pieuchot
+.\" Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.\"
+.Dd $Mdocdate: August 28 2017 $
+.Dt DIFF 1
+.Os
+.Sh NAME
+.Nm diff
+.Nd compare files
+.Sh SYNOPSIS
+.Nm diff
+.Ar file1 file2
+.Sh DESCRIPTION
+The
+.Nm
+utility compares the contents of
+.Ar file1
+and
+.Ar file2
+line by line.
+.Sh EXIT STATUS
+The
+.Nm
+utility exits with one of the following values:
+.Pp
+.Bl -tag -width Ds -offset indent -compact
+.It 0
+No differences were found.
+.It 1
+Differences were found.
+.It >1
+An error occurred.
+.El
blob - 17f8df091e1bb71385f342f2e783b09347df7d75
blob + 480a582f5d2848c9cb614cf98532f97a5bfb5ba2
--- test/verify_all.sh
+++ test/verify_all.sh
#!/bin/sh
-diff_prog="../diff/diff"
+diff_prog="../obj/diff"
diff_type=unidiff
blob - /dev/null
blob + bbe26f81080d982f5d5a2678f29193e5c70df6fd (mode 644)
--- /dev/null
+++ diff.c
+/* Commandline diff utility to test diff implementations. */
+/*
+ * Copyright (c) 2018 Martin Pieuchot
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+#include <err.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "diff_main.h"
+#include "diff_output.h"
+
+#ifdef __linux__
+/* stupid shims to compile and test on linux */
+#define __dead
+
+static const char *getprogname()
+{
+ return "diff";
+}
+#endif
+
+__dead void usage(void);
+int diffreg(char *, char *, int);
+char *mmapfile(const char *, struct stat *);
+
+__dead void
+usage(void)
+{
+ fprintf(stderr, "usage: %s file1 file2\n", getprogname());
+ exit(1);
+}
+
+int
+main(int argc, char *argv[])
+{
+ int ch;
+
+ while ((ch = getopt(argc, argv, "")) != -1) {
+ switch (ch) {
+ default:
+ usage();
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc != 2)
+ usage();
+
+ return diffreg(argv[0], argv[1], 0);
+}
+
+const struct diff_algo_config myers, patience, myers_divide;
+
+const struct diff_algo_config myers = (struct diff_algo_config){
+ .impl = diff_algo_myers,
+ .permitted_state_size = 1024 * 1024 * sizeof(int),
+ .fallback_algo = &patience,
+};
+
+const struct diff_algo_config patience = (struct diff_algo_config){
+ .impl = diff_algo_patience,
+ .inner_algo = &patience, // After subdivision, do Patience again.
+ .fallback_algo = &myers_divide, // If subdivision failed, do Myers Divide et Impera.
+};
+
+const struct diff_algo_config myers_divide = (struct diff_algo_config){
+ .impl = diff_algo_myers_divide,
+ .inner_algo = &myers, // When division succeeded, start from the top.
+ // (fallback_algo = NULL implies diff_algo_none).
+};
+
+const struct diff_config diff_config = {
+ .atomize_func = diff_atomize_text_by_line,
+ .algo = &myers,
+};
+
+int
+diffreg(char *file1, char *file2, int flags)
+{
+ char *str1, *str2;
+ struct stat st1, st2;
+ struct diff_input_info info = {
+ .left_path = file1,
+ .right_path = file2,
+ };
+
+ str1 = mmapfile(file1, &st1);
+ str2 = mmapfile(file2, &st2);
+
+ diff_unidiff(stdout, &diff_config, &info, str1, st1.st_size, str2, st2.st_size, 3);
+
+ munmap(str1, st1.st_size);
+ munmap(str2, st2.st_size);
+
+ return 0;
+}
+
+char *
+mmapfile(const char *path, struct stat *st)
+{
+ int fd;
+ char *p;
+
+ fd = open(path, O_RDONLY);
+ if (fd == -1)
+ err(2, "%s", path);
+
+ if (fstat(fd, st) == -1)
+ err(2, "%s", path);
+
+ if ((uintmax_t)st->st_size > SIZE_MAX)
+ errx(2, "%s: file too big to fit memory", path);
+
+ p = mmap(NULL, st->st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+ if (p == MAP_FAILED)
+ err(2, "mmap");
+
+ close(fd);
+
+ return p;
+}
blob - /dev/null
blob + 9aa20ee96f002f8a10c60b035b8e62be1dff769b (mode 644)
--- /dev/null
+++ diff_atomize_text.c
+/* Split source by line breaks, and calculate a simplistic checksum. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "diff_main.h"
+
+static int diff_data_atomize_text_lines(struct diff_data *d)
+{
+ const uint8_t *pos = d->data;
+ const uint8_t *end = pos + d->len;
+ enum diff_rc rc;
+
+ unsigned int array_size_estimate = d->len / 50;
+ unsigned int pow2 = 1;
+ while (array_size_estimate >>= 1)
+ pow2++;
+
+ ARRAYLIST_INIT(d->atoms, 1 << pow2);
+
+ while (pos < end) {
+ const uint8_t *line_end = pos;
+ unsigned int hash = 0;
+
+ while (line_end < end && *line_end != '\r' && *line_end != '\n') {
+ hash = hash * 23 + *line_end;
+ line_end++;
+ }
+
+ /* When not at the end of data, the line ending char ('\r' or '\n') must follow */
+ if (line_end < end)
+ line_end++;
+ /* If that was an '\r', also pull in any following '\n' */
+ if (line_end[0] == '\r' && line_end < end && line_end[1] == '\n')
+ line_end++;
+
+ /* Record the found line as diff atom */
+ struct diff_atom *atom;
+ ARRAYLIST_ADD(atom, d->atoms);
+ if (!atom)
+ return DIFF_RC_ENOMEM;
+
+ *atom = (struct diff_atom){
+ .at = pos,
+ .len = line_end - pos,
+ .hash = hash,
+ };
+
+ /* Starting point for next line: */
+ pos = line_end;
+ }
+
+ return DIFF_RC_OK;
+}
+
+enum diff_rc diff_atomize_text_by_line(void *func_data, struct diff_data *left, struct diff_data *right)
+{
+ enum diff_rc rc;
+ rc = diff_data_atomize_text_lines(left);
+ if (rc != DIFF_RC_OK)
+ return rc;
+ return diff_data_atomize_text_lines(right);
+}
blob - /dev/null
blob + 29fdd78115f0ea2392af08b51e5d57d450d9dd1b (mode 644)
--- /dev/null
+++ diff_main.c
+/* Generic infrastructure to implement various diff algorithms (implementation). */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+
+#include <sys/queue.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+
+#include <assert.h>
+
+#include "diff_main.h"
+#include "debug.h"
+
+/* Even if a left or right side is empty, diff output may need to know the position in that file.
+ * So left_start or right_start must never be NULL -- pass left_count or right_count as zero to indicate staying at that
+ * position without consuming any lines. */
+struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved,
+ struct diff_atom *left_start, unsigned int left_count,
+ struct diff_atom *right_start, unsigned int right_count)
+{
+ struct diff_chunk *chunk;
+ diff_chunk_arraylist_t *result;
+
+ if (solved && !state->temp_result.len)
+ result = &state->result->chunks;
+ else
+ result = &state->temp_result;
+
+ ARRAYLIST_ADD(chunk, *result);
+ if (!chunk)
+ return NULL;
+ *chunk = (struct diff_chunk){
+ .solved = solved,
+ .left_start = left_start,
+ .left_count = left_count,
+ .right_start = right_start,
+ .right_count = right_count,
+ };
+
+ debug("Add %s chunk:\n", chunk->solved ? "solved" : "UNSOLVED");
+ debug("L\n");
+ debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count);
+ debug("R\n");
+ debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count);
+ return chunk;
+}
+
+void diff_data_init_root(struct diff_data *d, const uint8_t *data, size_t len)
+{
+ *d = (struct diff_data){
+ .data = data,
+ .len = len,
+ .root = d,
+ };
+}
+
+void diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
+ struct diff_atom *from_atom, unsigned int atoms_count)
+{
+ struct diff_atom *last_atom = from_atom + atoms_count - 1;
+ *d = (struct diff_data){
+ .data = from_atom->at,
+ .len = (last_atom->at + last_atom->len) - from_atom->at,
+ .root = parent->root,
+ .atoms.head = from_atom,
+ .atoms.len = atoms_count,
+ };
+
+ debug("subsection:\n");
+ debug_dump(d);
+}
+
+void diff_data_free(struct diff_data *diff_data)
+{
+ if (!diff_data)
+ return;
+ if (diff_data->atoms.allocated)
+ ARRAYLIST_FREE(diff_data->atoms);
+}
+
+enum diff_rc diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state)
+{
+ debug("\n** %s\n", __func__);
+ debug("left:\n");
+ debug_dump(&state->left);
+ debug("right:\n");
+ debug_dump(&state->right);
+ debug_dump_myers_graph(&state->left, &state->right, NULL);
+
+ /* Add a chunk of equal lines, if any */
+ unsigned int equal_atoms = 0;
+ while (equal_atoms < state->left.atoms.len && equal_atoms < state->right.atoms.len
+ && diff_atom_same(&state->left.atoms.head[equal_atoms], &state->right.atoms.head[equal_atoms]))
+ equal_atoms++;
+ if (equal_atoms) {
+ if (!diff_state_add_chunk(state, true,
+ &state->left.atoms.head[0], equal_atoms,
+ &state->right.atoms.head[0], equal_atoms))
+ return DIFF_RC_ENOMEM;
+ }
+
+ /* Add a "minus" chunk with all lines from the left. */
+ if (equal_atoms < state->left.atoms.len) {
+ if (!diff_state_add_chunk(state, true,
+ &state->left.atoms.head[equal_atoms],
+ state->left.atoms.len - equal_atoms,
+ NULL, 0))
+ return DIFF_RC_ENOMEM;
+ }
+
+ /* Add a "plus" chunk with all lines from the right. */
+ if (equal_atoms < state->right.atoms.len) {
+ if (!diff_state_add_chunk(state, true,
+ NULL, 0,
+ &state->right.atoms.head[equal_atoms],
+ state->right.atoms.len - equal_atoms))
+ return DIFF_RC_ENOMEM;
+ }
+ return DIFF_RC_OK;
+}
+
+enum diff_rc diff_run_algo(const struct diff_algo_config *algo_config, struct diff_state *state)
+{
+ enum diff_rc rc;
+ ARRAYLIST_FREE(state->temp_result);
+
+ if (!algo_config || !algo_config->impl || !state->recursion_depth_left) {
+ debug("MAX RECURSION REACHED, just dumping diff chunks\n");
+ return diff_algo_none(algo_config, state);
+ }
+
+ ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE);
+ rc = algo_config->impl(algo_config, state);
+ switch (rc) {
+ case DIFF_RC_USE_DIFF_ALGO_FALLBACK:
+ debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n", algo_config->fallback_algo);
+ rc = diff_run_algo(algo_config->fallback_algo, state);
+ goto return_rc;
+
+ case DIFF_RC_OK:
+ /* continue below */
+ break;
+
+ default:
+ /* some error happened */
+ goto return_rc;
+ }
+
+ /* Pick up any diff chunks that are still unsolved and feed to inner_algo.
+ * inner_algo will solve unsolved chunks and append to result,
+ * and subsequent solved chunks on this level are then appended to result afterwards. */
+ int i;
+ for (i = 0; i < state->temp_result.len; i++) {
+ struct diff_chunk *c = &state->temp_result.head[i];
+ if (c->solved) {
+ struct diff_chunk *final_c;
+ ARRAYLIST_ADD(final_c, state->result->chunks);
+ if (!final_c) {
+ rc = DIFF_RC_ENOMEM;
+ goto return_rc;
+ }
+ *final_c = *c;
+ continue;
+ }
+
+ /* c is an unsolved chunk, feed to inner_algo */
+ struct diff_state inner_state = {
+ .result = state->result,
+ .recursion_depth_left = state->recursion_depth_left - 1,
+ };
+ diff_data_init_subsection(&inner_state.left, &state->left, c->left_start, c->left_count);
+ diff_data_init_subsection(&inner_state.right, &state->right, c->right_start, c->right_count);
+
+ rc = diff_run_algo(algo_config->inner_algo, &inner_state);
+
+ if (rc != DIFF_RC_OK)
+ goto return_rc;
+ }
+
+ rc = DIFF_RC_OK;
+return_rc:
+ ARRAYLIST_FREE(state->temp_result);
+ return rc;
+}
+
+struct diff_result *diff_main(const struct diff_config *config,
+ const uint8_t *left_data, size_t left_len,
+ const uint8_t *right_data, size_t right_len)
+{
+ struct diff_result *result = malloc(sizeof(struct diff_result));
+ if (!result)
+ return NULL;
+
+ *result = (struct diff_result){};
+ diff_data_init_root(&result->left, left_data, left_len);
+ diff_data_init_root(&result->right, right_data, right_len);
+
+ if (!config->atomize_func) {
+ result->rc = DIFF_RC_EINVAL;
+ return result;
+ }
+
+ result->rc = config->atomize_func(config->atomize_func_data, &result->left, &result->right);
+ if (result->rc != DIFF_RC_OK)
+ return result;
+
+ struct diff_state state = {
+ .result = result,
+ .recursion_depth_left = config->max_recursion_depth ? : 1024,
+ };
+ diff_data_init_subsection(&state.left, &result->left, result->left.atoms.head, result->left.atoms.len);
+ diff_data_init_subsection(&state.right, &result->right, result->right.atoms.head, result->right.atoms.len);
+
+ result->rc = diff_run_algo(config->algo, &state);
+
+ return result;
+}
+
+void diff_result_free(struct diff_result *result)
+{
+ if (!result)
+ return;
+ diff_data_free(&result->left);
+ diff_data_free(&result->right);
+ ARRAYLIST_FREE(result->chunks);
+ free(result);
+}
blob - /dev/null
blob + 933592cf5e6a33c4c1ace4d204f3fb494ab62d34 (mode 644)
--- /dev/null
+++ diff_main.h
+/* Generic infrastructure to implement various diff algorithms. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <errno.h>
+
+#include "arraylist.h"
+
+#ifndef MAX
+#define MAX(A,B) ((A)>(B)?(A):(B))
+#endif
+#ifndef MIN
+#define MIN(A,B) ((A)<(B)?(A):(B))
+#endif
+
+struct range {
+ int start;
+ int end;
+};
+
+static inline bool range_empty(const struct range *r)
+{
+ return r->start == r->end;
+}
+
+static inline bool ranges_touch(const struct range *a, const struct range *b)
+{
+ return (a->end >= b->start) && (a->start <= b->end);
+}
+
+static inline void ranges_merge(struct range *a, const struct range *b)
+{
+ *a = (struct range){
+ .start = MIN(a->start, b->start),
+ .end = MAX(a->end, b->end),
+ };
+}
+
+static inline int range_len(const struct range *r)
+{
+ if (!r)
+ return 0;
+ return r->end - r->start;
+}
+
+/* List of all possible return codes of a diff invocation. */
+enum diff_rc {
+ DIFF_RC_USE_DIFF_ALGO_FALLBACK = -1,
+ DIFF_RC_OK = 0,
+ DIFF_RC_ENOTSUP = ENOTSUP,
+ DIFF_RC_ENOMEM = ENOMEM,
+ DIFF_RC_EINVAL = EINVAL,
+};
+
+struct diff_atom {
+ const uint8_t *at;
+ size_t len;
+
+ /* This hash is just a very cheap speed up for finding *mismatching* atoms. When hashes match, we still need to
+ * compare entire atoms to find out whether they are indeed identical or not. */
+ unsigned int hash;
+
+ /* State for the Patience Diff algorithm */
+ /* TODO: keep a separate array for the patience state */
+ struct {
+ bool unique_here;
+ bool unique_in_both;
+ struct diff_atom *pos_in_other;
+ struct diff_atom *prev_stack;
+ struct range identical_lines;
+ } patience;
+};
+
+static inline bool diff_atom_same(const struct diff_atom *left, const struct diff_atom *right)
+{
+ return left->hash == right->hash
+ && left->len == right->len
+ && memcmp(left->at, right->at, left->len) == 0;
+}
+
+/* For each file, there is a "root" struct diff_data referencing the entire file, which the atoms are parsed from. In
+ * recursion of diff algorithm, there may be "child" struct diff_data only referencing a subsection of the file,
+ * re-using the atoms parsing. For "root" structs, atoms_allocated will be nonzero, indicating that the array of atoms
+ * is owned by that struct. For "child" structs, atoms_allocated == 0, to indicate that the struct is referencing a
+ * subset of atoms. */
+struct diff_data {
+ const uint8_t *data;
+ size_t len;
+ ARRAYLIST(struct diff_atom) atoms;
+ struct diff_data *root;
+};
+
+void diff_data_free(struct diff_data *diff_data);
+
+/* The atom's index in the entire file. For atoms divided by lines of text, this yields the line number (starting with
+ * 0). Also works for diff_data that reference only a subsection of a file, always reflecting the global position in
+ * the file (and not the relative position within the subsection). */
+#define diff_atom_root_idx(DIFF_DATA, ATOM) \
+ ((ATOM) ? (ATOM) - ((DIFF_DATA)->root->atoms.head) : (DIFF_DATA)->root->atoms.len)
+
+/* The atom's index within DIFF_DATA. For atoms divided by lines of text, this yields the line number (starting with
+ * 0). */
+#define diff_atom_idx(DIFF_DATA, ATOM) \
+ ((ATOM) ? (ATOM) - ((DIFF_DATA)->atoms.head) : (DIFF_DATA)->atoms.len)
+
+#define foreach_diff_atom(ATOM, FIRST_ATOM, COUNT) \
+ for ((ATOM) = (FIRST_ATOM); \
+ (ATOM) && ((ATOM) >= (FIRST_ATOM)) && ((ATOM) - (FIRST_ATOM) < (COUNT)); \
+ (ATOM)++)
+
+#define diff_data_foreach_atom(ATOM, DIFF_DATA) \
+ foreach_diff_atom(ATOM, (DIFF_DATA)->atoms.head, (DIFF_DATA)->atoms.len)
+
+#define diff_data_foreach_atom_from(FROM, ATOM, DIFF_DATA) \
+ for ((ATOM) = (FROM); \
+ (ATOM) && ((ATOM) >= (DIFF_DATA)->atoms.head) && ((ATOM) - (DIFF_DATA)->atoms.head < (DIFF_DATA)->atoms.len); \
+ (ATOM)++)
+
+/* A diff chunk represents a set of atoms on the left and/or a set of atoms on the right.
+ *
+ * If solved == false:
+ * The diff algorithm has divided the source file, and this is a chunk that the inner_algo should run on next.
+ * The lines on the left should be diffed against the lines on the right.
+ * (If there are no left lines or no right lines, it implies solved == true, because there is nothing to diff.)
+ *
+ * If solved == true:
+ * If there are only left atoms, it is a chunk removing atoms from the left ("a minus chunk").
+ * If there are only right atoms, it is a chunk adding atoms from the right ("a plus chunk").
+ * If there are both left and right lines, it is a chunk of equal content on both sides,
+ * and left_count == right_count:
+ *
+ * - foo }
+ * - bar }-- diff_chunk{ left_start = &left.atoms.head[0], left_count = 3,
+ * - baz } right_start = NULL, right_count = 0 }
+ * moo }
+ * goo }-- diff_chunk{ left_start = &left.atoms.head[3], left_count = 3,
+ * zoo } right_start = &right.atoms.head[0], right_count = 3 }
+ * +loo }
+ * +roo }-- diff_chunk{ left_start = NULL, left_count = 0,
+ * +too } right_start = &right.atoms.head[3], right_count = 3 }
+ *
+ */
+struct diff_chunk {
+ bool solved;
+ struct diff_atom *left_start;
+ unsigned int left_count;
+ struct diff_atom *right_start;
+ unsigned int right_count;
+};
+
+typedef ARRAYLIST(struct diff_chunk) diff_chunk_arraylist_t;
+#define DIFF_RESULT_ALLOC_BLOCKSIZE 128
+
+struct diff_result {
+ enum diff_rc rc;
+ struct diff_data left;
+ struct diff_data right;
+ diff_chunk_arraylist_t chunks;
+};
+
+struct diff_state {
+ /* The final result passed to the original diff caller. */
+ struct diff_result *result;
+
+ /* The root diff_data is in result->left,right, these are (possibly) subsections of the root data. */
+ struct diff_data left;
+ struct diff_data right;
+
+ unsigned int recursion_depth_left;
+
+ /* Remaining chunks from one diff algorithm pass, if any solved == false chunks came up. */
+ diff_chunk_arraylist_t temp_result;
+};
+
+struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved,
+ struct diff_atom *left_start, unsigned int left_count,
+ struct diff_atom *right_start, unsigned int right_count);
+
+/* Signature of a utility function to divide both source files into diff atoms.
+ * It is possible that a (future) algorithm requires both source files to decide on atom split points, hence this gets
+ * both left and right to atomize at the same time.
+ * An example is diff_atomize_text_by_line() in diff_atomize_text.c.
+ *
+ * func_data: context pointer (free to be used by implementation).
+ * left: struct diff_data with left->data and left->len already set up, and left->atoms to be created.
+ * right: struct diff_data with right->data and right->len already set up, and right->atoms to be created.
+ */
+typedef enum diff_rc (*diff_atomize_func_t)(void *func_data, struct diff_data *left, struct diff_data *right);
+
+extern enum diff_rc diff_atomize_text_by_line(void *func_data, struct diff_data *left, struct diff_data *right);
+
+struct diff_algo_config;
+typedef enum diff_rc (*diff_algo_impl_t)(const struct diff_algo_config *algo_config, struct diff_state *state);
+
+/* Form a result with all left-side removed and all right-side added, i.e. no actual diff algorithm involved. */
+enum diff_rc diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state);
+
+/* Myers Diff tracing from the start all the way through to the end, requiring quadratic amounts of memory. This can
+ * fail if the required space surpasses algo_config->permitted_state_size. */
+extern enum diff_rc diff_algo_myers(const struct diff_algo_config *algo_config, struct diff_state *state);
+
+/* Myers "Divide et Impera": tracing forwards from the start and backwards from the end to find a midpoint that divides
+ * the problem into smaller chunks. Requires only linear amounts of memory. */
+extern enum diff_rc diff_algo_myers_divide(const struct diff_algo_config *algo_config, struct diff_state *state);
+
+/* Patience Diff algorithm, which divides a larger diff into smaller chunks. For very specific scenarios, it may lead to
+ * a complete diff result by itself, but needs a fallback algo to solve chunks that don't have common-unique atoms. */
+extern enum diff_rc diff_algo_patience(const struct diff_algo_config *algo_config, struct diff_state *state);
+
+/* Diff algorithms to use, possibly nested. For example:
+ *
+ * struct diff_algo_config myers, patience, myers_divide;
+ *
+ * myers = (struct diff_algo_config){
+ * .impl = diff_algo_myers,
+ * .permitted_state_size = 32 * 1024 * 1024,
+ * .fallback_algo = &patience, // when too large, do diff_algo_patience
+ * };
+ *
+ * patience = (struct diff_algo_config){
+ * .impl = diff_algo_patience,
+ * .inner_algo = &patience, // After subdivision, do Patience again.
+ * .fallback_algo = &myers_divide, // If subdivision failed, do Myers Divide et Impera.
+ * };
+ *
+ * myers_divide = (struct diff_algo_config){
+ * .impl = diff_algo_myers_divide,
+ * .inner_algo = &myers, // When division succeeded, start from the top.
+ * // (fallback_algo = NULL implies diff_algo_none).
+ * };
+ *
+ * struct diff_config config = {
+ * .algo = &myers,
+ * ...
+ * };
+ * diff_main(&config, ...);
+ */
+struct diff_algo_config {
+ diff_algo_impl_t impl;
+
+ /* Fail this algo if it would use more than this amount of memory, and instead use fallback_algo
+ * (diff_algo_myers). permitted_state_size == 0 means no limitation. */
+ size_t permitted_state_size;
+
+ /* For algorithms that divide into smaller chunks, use this algorithm to solve the divided chunks. */
+ const struct diff_algo_config *inner_algo;
+
+ /* If the algorithm fails (e.g. diff_algo_myers_if_small needs too large state, or diff_algo_patience can't find
+ * any common-unique atoms), then use this algorithm instead. */
+ const struct diff_algo_config *fallback_algo;
+};
+
+struct diff_config {
+ diff_atomize_func_t atomize_func;
+ void *atomize_func_data;
+
+ const struct diff_algo_config *algo;
+
+ /* How deep to step into subdivisions of a source file, a paranoia / safety measure to guard against infinite
+ * loops through diff algorithms. When the maximum recursion is reached, employ diff_algo_none (i.e. remove all
+ * left atoms and add all right atoms). */
+ unsigned int max_recursion_depth;
+};
+
+struct diff_result *diff_main(const struct diff_config *config,
+ const uint8_t *left_data, size_t left_len,
+ const uint8_t *right_data, size_t right_len);
+void diff_result_free(struct diff_result *result);
blob - /dev/null
blob + 1021b3be22639ea353533646e72153bb03276f0e (mode 644)
--- /dev/null
+++ diff_myers.c
+/* Myers diff algorithm implementation, invented by Eugene W. Myers [1].
+ * Implementations of both the Myers Divide Et Impera (using linear space)
+ * and the canonical Myers algorithm (using quadratic space). */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "diff_main.h"
+#include "debug.h"
+
+/* Myers' diff algorithm [1] is nicely explained in [2].
+ * [1] http://www.xmailserver.org/diff2.pdf
+ * [2] https://blog.jcoglan.com/2017/02/12/the-myers-diff-algorithm-part-1/ ff.
+ *
+ * Myers approaches finding the smallest diff as a graph problem.
+ * The crux is that the original algorithm requires quadratic amount of memory:
+ * both sides' lengths added, and that squared. So if we're diffing lines of text, two files with 1000 lines each would
+ * blow up to a matrix of about 2000 * 2000 ints of state, about 16 Mb of RAM to figure out 2 kb of text.
+ * The solution is using Myers' "divide and conquer" extension algorithm, which does the original traversal from both
+ * ends of the files to reach a middle where these "snakes" touch, hence does not need to backtrace the traversal, and
+ * so gets away with only keeping a single column of that huge state matrix in memory.
+ *
+ * Todo: the divide and conquer requires linear *space*, not necessarily linear *time*. It recurses, apparently doing
+ * multiple Myers passes, and also it apparently favors fragmented diffs in cases where chunks of text were moved to a
+ * different place. Up to a given count of diff atoms (text lines), it might be desirable to accept the quadratic memory
+ * usage, get nicer diffs and less re-iteration of the same data?
+ */
+
+struct diff_box {
+ unsigned int left_start;
+ unsigned int left_end;
+ unsigned int right_start;
+ unsigned int right_end;
+};
+
+#define diff_box_empty(DIFF_SNAKE) ((DIFF_SNAKE)->left_end == 0)
+
+
+/* If the two contents of a file are A B C D E and X B C Y,
+ * the Myers diff graph looks like:
+ *
+ * k0 k1
+ * \ \
+ * k-1 0 1 2 3 4 5
+ * \ A B C D E
+ * 0 o-o-o-o-o-o
+ * X | | | | | |
+ * 1 o-o-o-o-o-o
+ * B | |\| | | |
+ * 2 o-o-o-o-o-o
+ * C | | |\| | |
+ * 3 o-o-o-o-o-o
+ * Y | | | | | |\
+ * 4 o-o-o-o-o-o c1
+ * \ \
+ * c-1 c0
+ *
+ * Moving right means delete an atom from the left-hand-side,
+ * Moving down means add an atom from the right-hand-side.
+ * Diagonals indicate identical atoms on both sides, the challenge is to use as many diagonals as possible.
+ *
+ * The original Myers algorithm walks all the way from the top left to the bottom right, remembers all steps, and then
+ * backtraces to find the shortest path. However, that requires keeping the entire graph in memory, which needs
+ * quadratic space.
+ *
+ * Myers adds a variant that uses linear space -- note, not linear time, only linear space: walk forward and backward,
+ * find a meeting point in the middle, and recurse on the two separate sections. This is called "divide and conquer".
+ *
+ * d: the step number, starting with 0, a.k.a. the distance from the starting point.
+ * k: relative index in the state array for the forward scan, indicating on which diagonal through the diff graph we
+ * currently are.
+ * c: relative index in the state array for the backward scan, indicating the diagonal number from the bottom up.
+ *
+ * The "divide and conquer" traversal through the Myers graph looks like this:
+ *
+ * | d= 0 1 2 3 2 1 0
+ * ----+--------------------------------------------
+ * k= | c=
+ * 4 | 3
+ * |
+ * 3 | 3,0 5,2 2
+ * | / \
+ * 2 | 2,0 5,3 1
+ * | / \
+ * 1 | 1,0 4,3 >= 4,3 5,4<-- 0
+ * | / / \ /
+ * 0 | -->0,0 3,3 4,4 -1
+ * | \ / /
+ * -1 | 0,1 1,2 3,4 -2
+ * | \ /
+ * -2 | 0,2 -3
+ * | \
+ * | 0,3
+ * | forward-> <-backward
+ *
+ * x,y pairs here are the coordinates in the Myers graph:
+ * x = atom index in left-side source, y = atom index in the right-side source.
+ *
+ * Only one forward column and one backward column are kept in mem, each need at most left.len + 1 + right.len items.
+ * Note that each d step occupies either the even or the odd items of a column: if e.g. the previous column is in the
+ * odd items, the next column is formed in the even items, without overwriting the previous column's results.
+ *
+ * Also note that from the diagonal index k and the x coordinate, the y coordinate can be derived:
+ * y = x - k
+ * Hence the state array only needs to keep the x coordinate, i.e. the position in the left-hand file, and the y
+ * coordinate, i.e. position in the right-hand file, is derived from the index in the state array.
+ *
+ * The two traces meet at 4,3, the first step (here found in the forward traversal) where a forward position is on or
+ * past a backward traced position on the same diagonal.
+ *
+ * This divides the problem space into:
+ *
+ * 0 1 2 3 4 5
+ * A B C D E
+ * 0 o-o-o-o-o
+ * X | | | | |
+ * 1 o-o-o-o-o
+ * B | |\| | |
+ * 2 o-o-o-o-o
+ * C | | |\| |
+ * 3 o-o-o-o-*-o *: forward and backward meet here
+ * Y | |
+ * 4 o-o
+ *
+ * Doing the same on each section lead to:
+ *
+ * 0 1 2 3 4 5
+ * A B C D E
+ * 0 o-o
+ * X | |
+ * 1 o-b b: backward d=1 first reaches here (sliding up the snake)
+ * B \ f: then forward d=2 reaches here (sliding down the snake)
+ * 2 o As result, the box from b to f is found to be identical;
+ * C \ leaving a top box from 0,0 to 1,1 and a bottom trivial tail 3,3 to 4,3.
+ * 3 f-o
+ *
+ * 3 o-*
+ * Y |
+ * 4 o *: forward and backward meet here
+ *
+ * and solving the last top left box gives:
+ *
+ * 0 1 2 3 4 5
+ * A B C D E -A
+ * 0 o-o +X
+ * X | B
+ * 1 o C
+ * B \ -D
+ * 2 o -E
+ * C \ +Y
+ * 3 o-o-o
+ * Y |
+ * 4 o
+ *
+ */
+
+#define xk_to_y(X, K) ((X) - (K))
+#define xc_to_y(X, C, DELTA) ((X) - (C) + (DELTA))
+#define k_to_c(K, DELTA) ((K) + (DELTA))
+#define c_to_k(C, DELTA) ((C) - (DELTA))
+
+/* Do one forwards step in the "divide and conquer" graph traversal.
+ * left: the left side to diff.
+ * right: the right side to diff against.
+ * kd_forward: the traversal state for forwards traversal, modified by this function.
+ * This is carried over between invocations with increasing d.
+ * kd_forward points at the center of the state array, allowing negative indexes.
+ * kd_backward: the traversal state for backwards traversal, to find a meeting point.
+ * Since forwards is done first, kd_backward will be valid for d - 1, not d.
+ * kd_backward points at the center of the state array, allowing negative indexes.
+ * d: Step or distance counter, indicating for what value of d the kd_forward should be populated.
+ * For d == 0, kd_forward[0] is initialized, i.e. the first invocation should be for d == 0.
+ * meeting_snake: resulting meeting point, if any.
+ */
+static void diff_divide_myers_forward(struct diff_data *left, struct diff_data *right,
+ int *kd_forward, int *kd_backward, int d,
+ struct diff_box *meeting_snake)
+{
+ int delta = (int)right->atoms.len - (int)left->atoms.len;
+ int prev_x;
+ int prev_y;
+ int k;
+ int x;
+
+ debug("-- %s d=%d\n", __func__, d);
+ debug_dump_myers_graph(left, right, NULL);
+
+ for (k = d; k >= -d; k -= 2) {
+ if (k < -(int)right->atoms.len || k > (int)left->atoms.len) {
+ /* This diagonal is completely outside of the Myers graph, don't calculate it. */
+ if (k < -(int)right->atoms.len)
+ debug(" %d k < -(int)right->atoms.len %d\n", k, -(int)right->atoms.len);
+ else
+ debug(" %d k > left->atoms.len %d\n", k, left->atoms.len);
+ if (k < 0) {
+ /* We are traversing negatively, and already below the entire graph, nothing will come
+ * of this. */
+ debug(" break");
+ break;
+ }
+ debug(" continue");
+ continue;
+ }
+ debug("- k = %d\n", k);
+ if (d == 0) {
+ /* This is the initializing step. There is no prev_k yet, get the initial x from the top left of
+ * the Myers graph. */
+ x = 0;
+ }
+ /* Favoring "-" lines first means favoring moving rightwards in the Myers graph.
+ * For this, all k should derive from k - 1, only the bottom most k derive from k + 1:
+ *
+ * | d= 0 1 2
+ * ----+----------------
+ * k= |
+ * 2 | 2,0 <-- from prev_k = 2 - 1 = 1
+ * | /
+ * 1 | 1,0
+ * | /
+ * 0 | -->0,0 3,3
+ * | \\ /
+ * -1 | 0,1 <-- bottom most for d=1 from prev_k = -1 + 1 = 0
+ * | \\
+ * -2 | 0,2 <-- bottom most for d=2 from prev_k = -2 + 1 = -1
+ *
+ * Except when a k + 1 from a previous run already means a further advancement in the graph.
+ * If k == d, there is no k + 1 and k - 1 is the only option.
+ * If k < d, use k + 1 in case that yields a larger x. Also use k + 1 if k - 1 is outside the graph.
+ */
+ else if (k > -d && (k == d
+ || (k - 1 >= -(int)right->atoms.len
+ && kd_forward[k - 1] >= kd_forward[k + 1]))) {
+ /* Advance from k - 1.
+ * From position prev_k, step to the right in the Myers graph: x += 1.
+ */
+ int prev_k = k - 1;
+ prev_x = kd_forward[prev_k];
+ prev_y = xk_to_y(prev_x, prev_k);
+ x = prev_x + 1;
+ } else {
+ /* The bottom most one.
+ * From position prev_k, step to the bottom in the Myers graph: y += 1.
+ * Incrementing y is achieved by decrementing k while keeping the same x.
+ * (since we're deriving y from y = x - k).
+ */
+ int prev_k = k + 1;
+ prev_x = kd_forward[prev_k];
+ prev_y = xk_to_y(prev_x, prev_k);
+ x = prev_x;
+ }
+
+ /* Slide down any snake that we might find here. */
+ while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len
+ && diff_atom_same(&left->atoms.head[x], &right->atoms.head[xk_to_y(x, k)]))
+ x++;
+ kd_forward[k] = x;
+
+ if (DEBUG) {
+ int fi;
+ for (fi = d; fi >= k; fi--) {
+ debug("kd_forward[%d] = (%d, %d)\n", fi, kd_forward[fi], kd_forward[fi] - fi);
+ /*
+ if (kd_forward[fi] >= 0 && kd_forward[fi] < left->atoms.len)
+ debug_dump_atom(left, right, &left->atoms.head[kd_forward[fi]]);
+ else
+ debug("\n");
+ if (kd_forward[fi]-fi >= 0 && kd_forward[fi]-fi < right->atoms.len)
+ debug_dump_atom(right, left, &right->atoms.head[kd_forward[fi]-fi]);
+ else
+ debug("\n");
+ */
+ }
+ }
+
+ if (x < 0 || x > left->atoms.len
+ || xk_to_y(x, k) < 0 || xk_to_y(x, k) > right->atoms.len)
+ continue;
+
+ /* Figured out a new forwards traversal, see if this has gone onto or even past a preceding backwards
+ * traversal.
+ *
+ * If the delta in length is odd, then d and backwards_d hit the same state indexes:
+ * | d= 0 1 2 1 0
+ * ----+---------------- ----------------
+ * k= | c=
+ * 4 | 3
+ * |
+ * 3 | 2
+ * | same
+ * 2 | 2,0====5,3 1
+ * | / \
+ * 1 | 1,0 5,4<-- 0
+ * | / /
+ * 0 | -->0,0 3,3====4,4 -1
+ * | \ /
+ * -1 | 0,1 -2
+ * | \
+ * -2 | 0,2 -3
+ * |
+ *
+ * If the delta is even, they end up off-by-one, i.e. on different diagonals:
+ *
+ * | d= 0 1 2 1 0
+ * ----+---------------- ----------------
+ * | c=
+ * 3 | 3
+ * |
+ * 2 | 2,0 off 2
+ * | / \\
+ * 1 | 1,0 4,3 1
+ * | / // \
+ * 0 | -->0,0 3,3 4,4<-- 0
+ * | \ / /
+ * -1 | 0,1 3,4 -1
+ * | \ //
+ * -2 | 0,2 -2
+ * |
+ *
+ * So in the forward path, we can only match up diagonals when the delta is odd.
+ */
+ /* Forwards is done first, so the backwards one was still at d - 1. Can't do this for d == 0. */
+ int backwards_d = d - 1;
+ if ((delta & 1) && (backwards_d >= 0)) {
+ debug("backwards_d = %d\n", backwards_d);
+
+ /* If both sides have the same length, forward and backward start on the same diagonal, meaning the
+ * backwards state index c == k.
+ * As soon as the lengths are not the same, the backwards traversal starts on a different diagonal, and
+ * c = k shifted by the difference in length.
+ */
+ int c = k_to_c(k, delta);
+
+ /* When the file sizes are very different, the traversal trees start on far distant diagonals.
+ * They don't necessarily meet straight on. See whether this forward value is on a diagonal that
+ * is also valid in kd_backward[], and match them if so. */
+ if (c >= -backwards_d && c <= backwards_d) {
+ /* Current k is on a diagonal that exists in kd_backward[]. If the two x positions have
+ * met or passed (forward walked onto or past backward), then we've found a midpoint / a
+ * mid-box.
+ *
+ * But we need to avoid matching a situation like this:
+ * 0 1
+ * x y
+ * 0 o-o-o
+ * x |\| |
+ * 1 o-o-o
+ * y | |\|
+ * 2 (B)o-o <--(B) backwards traversal reached here
+ * a | | |
+ * 3 o-o-o<-- prev_x, prev_y
+ * b | | |
+ * 4 o-o(F) <--(F) forwards traversal reached here
+ * x |\| | Now both are on the same diagonal and look like they passed,
+ * 5 o-o-o but actually they have sneaked past each other and have not met.
+ * y | |\|
+ * 6 o-o-o
+ *
+ * The solution is to notice that prev_x,prev_y were also already past (B).
+ */
+ int backward_x = kd_backward[c];
+ int backward_y = xc_to_y(backward_x, c, delta);
+ debug(" prev_x,y = (%d,%d) c%d:backward_x,y = (%d,%d) k%d:x,y = (%d,%d)\n",
+ prev_x, prev_y, c, backward_x, backward_y, k, x, xk_to_y(x, k));
+ if (prev_x <= backward_x && prev_y <= backward_y
+ && x >= backward_x) {
+ *meeting_snake = (struct diff_box){
+ .left_start = backward_x,
+ .left_end = x,
+ .right_start = xc_to_y(backward_x, c, delta),
+ .right_end = xk_to_y(x, k),
+ };
+ debug("HIT x=(%u,%u) - y=(%u,%u)\n",
+ meeting_snake->left_start,
+ meeting_snake->right_start,
+ meeting_snake->left_end,
+ meeting_snake->right_end);
+ return;
+ }
+ }
+ }
+ }
+}
+
+/* Do one backwards step in the "divide and conquer" graph traversal.
+ * left: the left side to diff.
+ * right: the right side to diff against.
+ * kd_forward: the traversal state for forwards traversal, to find a meeting point.
+ * Since forwards is done first, after this, both kd_forward and kd_backward will be valid for d.
+ * kd_forward points at the center of the state array, allowing negative indexes.
+ * kd_backward: the traversal state for backwards traversal, to find a meeting point.
+ * This is carried over between invocations with increasing d.
+ * kd_backward points at the center of the state array, allowing negative indexes.
+ * d: Step or distance counter, indicating for what value of d the kd_backward should be populated.
+ * Before the first invocation, kd_backward[0] shall point at the bottom right of the Myers graph
+ * (left.len, right.len).
+ * The first invocation will be for d == 1.
+ * meeting_snake: resulting meeting point, if any.
+ */
+static void diff_divide_myers_backward(struct diff_data *left, struct diff_data *right,
+ int *kd_forward, int *kd_backward, int d,
+ struct diff_box *meeting_snake)
+{
+ int delta = (int)right->atoms.len - (int)left->atoms.len;
+ int prev_x;
+ int prev_y;
+ int c;
+ int x;
+
+ debug("-- %s d=%d\n", __func__, d);
+ debug_dump_myers_graph(left, right, NULL);
+
+ for (c = d; c >= -d; c -= 2) {
+ if (c < -(int)left->atoms.len || c > (int)right->atoms.len) {
+ /* This diagonal is completely outside of the Myers graph, don't calculate it. */
+ if (c < -(int)left->atoms.len)
+ debug(" %d c < -(int)left->atoms.len %d\n", c, -(int)left->atoms.len);
+ else
+ debug(" %d c > right->atoms.len %d\n", c, right->atoms.len);
+ if (c < 0) {
+ /* We are traversing negatively, and already below the entire graph, nothing will come
+ * of this. */
+ debug(" break");
+ break;
+ }
+ debug(" continue");
+ continue;
+ }
+ debug("- c = %d\n", c);
+ if (d == 0) {
+ /* This is the initializing step. There is no prev_c yet, get the initial x from the bottom
+ * right of the Myers graph. */
+ x = left->atoms.len;
+ }
+ /* Favoring "-" lines first means favoring moving rightwards in the Myers graph.
+ * For this, all c should derive from c - 1, only the bottom most c derive from c + 1:
+ *
+ * 2 1 0
+ * ---------------------------------------------------
+ * c=
+ * 3
+ *
+ * from prev_c = c - 1 --> 5,2 2
+ * \
+ * 5,3 1
+ * \
+ * 4,3 5,4<-- 0
+ * \ /
+ * bottom most for d=1 from c + 1 --> 4,4 -1
+ * /
+ * bottom most for d=2 --> 3,4 -2
+ *
+ * Except when a c + 1 from a previous run already means a further advancement in the graph.
+ * If c == d, there is no c + 1 and c - 1 is the only option.
+ * If c < d, use c + 1 in case that yields a larger x. Also use c + 1 if c - 1 is outside the graph.
+ */
+ else if (c > -d && (c == d
+ || (c - 1 >= -(int)right->atoms.len
+ && kd_backward[c - 1] <= kd_backward[c + 1]))) {
+ /* A top one.
+ * From position prev_c, step upwards in the Myers graph: y -= 1.
+ * Decrementing y is achieved by incrementing c while keeping the same x.
+ * (since we're deriving y from y = x - c + delta).
+ */
+ int prev_c = c - 1;
+ prev_x = kd_backward[prev_c];
+ prev_y = xc_to_y(prev_x, prev_c, delta);
+ x = prev_x;
+ } else {
+ /* The bottom most one.
+ * From position prev_c, step to the left in the Myers graph: x -= 1.
+ */
+ int prev_c = c + 1;
+ prev_x = kd_backward[prev_c];
+ prev_y = xc_to_y(prev_x, prev_c, delta);
+ x = prev_x - 1;
+ }
+
+ /* Slide up any snake that we might find here. */
+ debug("c=%d x-1=%d Yb-1=%d-1=%d\n", c, x-1, xc_to_y(x, c, delta), xc_to_y(x, c, delta)-1);
+ if (x > 0) {
+ debug(" l="); debug_dump_atom(left, right, &left->atoms.head[x-1]);
+ }
+ if (xc_to_y(x, c, delta) > 0) {
+ debug(" r="); debug_dump_atom(right, left, &right->atoms.head[xc_to_y(x, c, delta)-1]);
+ }
+ while (x > 0 && xc_to_y(x, c, delta) > 0
+ && diff_atom_same(&left->atoms.head[x-1], &right->atoms.head[xc_to_y(x, c, delta)-1]))
+ x--;
+ kd_backward[c] = x;
+
+ if (DEBUG) {
+ int fi;
+ for (fi = d; fi >= c; fi--) {
+ debug("kd_backward[%d] = (%d, %d)\n", fi, kd_backward[fi],
+ kd_backward[fi] - fi + delta);
+ /*
+ if (kd_backward[fi] >= 0 && kd_backward[fi] < left->atoms.len)
+ debug_dump_atom(left, right, &left->atoms.head[kd_backward[fi]]);
+ else
+ debug("\n");
+ if (kd_backward[fi]-fi+delta >= 0 && kd_backward[fi]-fi+delta < right->atoms.len)
+ debug_dump_atom(right, left, &right->atoms.head[kd_backward[fi]-fi+delta]);
+ else
+ debug("\n");
+ */
+ }
+ }
+
+ if (x < 0 || x > left->atoms.len
+ || xc_to_y(x, c, delta) < 0 || xc_to_y(x, c, delta) > right->atoms.len)
+ continue;
+
+ /* Figured out a new backwards traversal, see if this has gone onto or even past a preceding forwards
+ * traversal.
+ *
+ * If the delta in length is even, then d and backwards_d hit the same state indexes -- note how this is
+ * different from in the forwards traversal, because now both d are the same:
+ *
+ * | d= 0 1 2 2 1 0
+ * ----+---------------- --------------------
+ * k= | c=
+ * 4 |
+ * |
+ * 3 | 3
+ * | same
+ * 2 | 2,0====5,2 2
+ * | / \
+ * 1 | 1,0 5,3 1
+ * | / / \
+ * 0 | -->0,0 3,3====4,3 5,4<-- 0
+ * | \ / /
+ * -1 | 0,1 4,4 -1
+ * | \
+ * -2 | 0,2 -2
+ * |
+ * -3
+ * If the delta is odd, they end up off-by-one, i.e. on different diagonals.
+ * So in the backward path, we can only match up diagonals when the delta is even.
+ */
+ if ((delta & 1) == 0) {
+ /* Forwards was done first, now both d are the same. */
+ int forwards_d = d;
+
+ /* As soon as the lengths are not the same, the backwards traversal starts on a different diagonal, and
+ * c = k shifted by the difference in length.
+ */
+ int k = c_to_k(c, delta);
+
+ /* When the file sizes are very different, the traversal trees start on far distant diagonals.
+ * They don't necessarily meet straight on. See whether this backward value is also on a valid
+ * diagonal in kd_forward[], and match them if so. */
+ if (k >= -forwards_d && k <= forwards_d) {
+ /* Current c is on a diagonal that exists in kd_forward[]. If the two x positions have
+ * met or passed (backward walked onto or past forward), then we've found a midpoint / a
+ * mid-box. */
+ int forward_x = kd_forward[k];
+ int forward_y = xk_to_y(forward_x, k);
+ debug("Compare %d to %d k=%d (x=%d,y=%d) to (x=%d,y=%d)\n",
+ forward_x, x, k,
+ forward_x, xk_to_y(forward_x, k), x, xc_to_y(x, c, delta));
+ if (forward_x <= prev_x && forward_y <= prev_y
+ && forward_x >= x) {
+ *meeting_snake = (struct diff_box){
+ .left_start = x,
+ .left_end = forward_x,
+ .right_start = xc_to_y(x, c, delta),
+ .right_end = xk_to_y(forward_x, k),
+ };
+ debug("HIT x=%u,%u - y=%u,%u\n",
+ meeting_snake->left_start,
+ meeting_snake->right_start,
+ meeting_snake->left_end,
+ meeting_snake->right_end);
+ return;
+ }
+ }
+ }
+ }
+}
+
+/* Myers "Divide et Impera": tracing forwards from the start and backwards from the end to find a midpoint that divides
+ * the problem into smaller chunks. Requires only linear amounts of memory. */
+enum diff_rc diff_algo_myers_divide(const struct diff_algo_config *algo_config, struct diff_state *state)
+{
+ enum diff_rc rc = DIFF_RC_ENOMEM;
+ struct diff_data *left = &state->left;
+ struct diff_data *right = &state->right;
+
+ debug("\n** %s\n", __func__);
+ debug("left:\n");
+ debug_dump(left);
+ debug("right:\n");
+ debug_dump(right);
+ debug_dump_myers_graph(left, right, NULL);
+
+ /* Allocate two columns of a Myers graph, one for the forward and one for the backward traversal. */
+ unsigned int max = left->atoms.len + right->atoms.len;
+ size_t kd_len = max + 1;
+ size_t kd_buf_size = kd_len << 1;
+ int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int));
+ if (!kd_buf)
+ return DIFF_RC_ENOMEM;
+ int i;
+ for (i = 0; i < kd_buf_size; i++)
+ kd_buf[i] = -1;
+ int *kd_forward = kd_buf;
+ int *kd_backward = kd_buf + kd_len;
+
+ /* The 'k' axis in Myers spans positive and negative indexes, so point the kd to the middle.
+ * It is then possible to index from -max/2 .. max/2. */
+ kd_forward += max/2;
+ kd_backward += max/2;
+
+ int d;
+ struct diff_box mid_snake = {};
+ for (d = 0; d <= (max/2); d++) {
+ debug("-- d=%d\n", d);
+ diff_divide_myers_forward(left, right, kd_forward, kd_backward, d, &mid_snake);
+ if (!diff_box_empty(&mid_snake))
+ break;
+ diff_divide_myers_backward(left, right, kd_forward, kd_backward, d, &mid_snake);
+ if (!diff_box_empty(&mid_snake))
+ break;
+ }
+
+ if (diff_box_empty(&mid_snake)) {
+ /* Divide and conquer failed to find a meeting point. Use the fallback_algo defined in the algo_config
+ * (leave this to the caller). This is just paranoia/sanity, we normally should always find a midpoint.
+ */
+ debug(" no midpoint \n");
+ rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;
+ goto return_rc;
+ } else {
+ debug(" mid snake L: %u to %u of %u R: %u to %u of %u\n",
+ mid_snake.left_start, mid_snake.left_end, left->atoms.len,
+ mid_snake.right_start, mid_snake.right_end, right->atoms.len);
+
+ /* Section before the mid-snake. */
+ debug("Section before the mid-snake\n");
+
+ struct diff_atom *left_atom = &left->atoms.head[0];
+ unsigned int left_section_len = mid_snake.left_start;
+ struct diff_atom *right_atom = &right->atoms.head[0];
+ unsigned int right_section_len = mid_snake.right_start;
+
+ if (left_section_len && right_section_len) {
+ /* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */
+ if (!diff_state_add_chunk(state, false,
+ left_atom, left_section_len,
+ right_atom, right_section_len))
+ goto return_rc;
+ } else if (left_section_len && !right_section_len) {
+ /* Only left atoms and none on the right, they form a "minus" chunk, then. */
+ if (!diff_state_add_chunk(state, true,
+ left_atom, left_section_len,
+ right_atom, 0))
+ goto return_rc;
+ } else if (!left_section_len && right_section_len) {
+ /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
+ if (!diff_state_add_chunk(state, true,
+ left_atom, 0,
+ right_atom, right_section_len))
+ goto return_rc;
+ }
+ /* else: left_section_len == 0 and right_section_len == 0, i.e. nothing before the mid-snake. */
+
+ /* the mid-snake, identical data on both sides: */
+ debug("the mid-snake\n");
+ if (!diff_state_add_chunk(state, true,
+ &left->atoms.head[mid_snake.left_start],
+ mid_snake.left_end - mid_snake.left_start,
+ &right->atoms.head[mid_snake.right_start],
+ mid_snake.right_end - mid_snake.right_start))
+ goto return_rc;
+
+ /* Section after the mid-snake. */
+ debug("Section after the mid-snake\n");
+ debug(" left_end %u right_end %u\n", mid_snake.left_end, mid_snake.right_end);
+ debug(" left_count %u right_count %u\n", left->atoms.len, right->atoms.len);
+ left_atom = &left->atoms.head[mid_snake.left_end];
+ left_section_len = left->atoms.len - mid_snake.left_end;
+ right_atom = &right->atoms.head[mid_snake.right_end];
+ right_section_len = right->atoms.len - mid_snake.right_end;
+
+ if (left_section_len && right_section_len) {
+ /* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */
+ if (!diff_state_add_chunk(state, false,
+ left_atom, left_section_len,
+ right_atom, right_section_len))
+ goto return_rc;
+ } else if (left_section_len && !right_section_len) {
+ /* Only left atoms and none on the right, they form a "minus" chunk, then. */
+ if (!diff_state_add_chunk(state, true,
+ left_atom, left_section_len,
+ right_atom, 0))
+ goto return_rc;
+ } else if (!left_section_len && right_section_len) {
+ /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
+ if (!diff_state_add_chunk(state, true,
+ left_atom, 0,
+ right_atom, right_section_len))
+ goto return_rc;
+ }
+ /* else: left_section_len == 0 and right_section_len == 0, i.e. nothing after the mid-snake. */
+ }
+
+ rc = DIFF_RC_OK;
+
+return_rc:
+ free(kd_buf);
+ debug("** END %s\n", __func__);
+ return rc;
+}
+
+/* Myers Diff tracing from the start all the way through to the end, requiring quadratic amounts of memory. This can
+ * fail if the required space surpasses algo_config->permitted_state_size. */
+enum diff_rc diff_algo_myers(const struct diff_algo_config *algo_config, struct diff_state *state)
+{
+ /* do a diff_divide_myers_forward() without a _backward(), so that it walks forward across the entire
+ * files to reach the end. Keep each run's state, and do a final backtrace. */
+ enum diff_rc rc = DIFF_RC_ENOMEM;
+ struct diff_data *left = &state->left;
+ struct diff_data *right = &state->right;
+
+ debug("\n** %s\n", __func__);
+ debug("left:\n");
+ debug_dump(left);
+ debug("right:\n");
+ debug_dump(right);
+ debug_dump_myers_graph(left, right, NULL);
+
+ /* Allocate two columns of a Myers graph, one for the forward and one for the backward traversal. */
+ unsigned int max = left->atoms.len + right->atoms.len;
+ size_t kd_len = max + 1 + max;
+ size_t kd_buf_size = kd_len * kd_len;
+ debug("state size: %zu\n", kd_buf_size);
+ if (kd_buf_size < kd_len /* overflow? */
+ || kd_buf_size * sizeof(int) > algo_config->permitted_state_size) {
+ debug("state size %zu > permitted_state_size %zu, use fallback_algo\n",
+ kd_buf_size, algo_config->permitted_state_size);
+ return DIFF_RC_USE_DIFF_ALGO_FALLBACK;
+ }
+
+ int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int));
+ if (!kd_buf)
+ return DIFF_RC_ENOMEM;
+ int i;
+ for (i = 0; i < kd_buf_size; i++)
+ kd_buf[i] = -1;
+
+ /* The 'k' axis in Myers spans positive and negative indexes, so point the kd to the middle.
+ * It is then possible to index from -max .. max. */
+ int *kd_origin = kd_buf + max;
+ int *kd_column = kd_origin;
+
+ int d;
+ int backtrack_d = -1;
+ int backtrack_k = 0;
+ int k;
+ int x, y;
+ for (d = 0; d <= max; d++, kd_column += kd_len) {
+ debug("-- d=%d\n", d);
+
+ debug("-- %s d=%d\n", __func__, d);
+
+ for (k = d; k >= -d; k -= 2) {
+ if (k < -(int)right->atoms.len || k > (int)left->atoms.len) {
+ /* This diagonal is completely outside of the Myers graph, don't calculate it. */
+ if (k < -(int)right->atoms.len)
+ debug(" %d k < -(int)right->atoms.len %d\n", k, -(int)right->atoms.len);
+ else
+ debug(" %d k > left->atoms.len %d\n", k, left->atoms.len);
+ if (k < 0) {
+ /* We are traversing negatively, and already below the entire graph, nothing will come
+ * of this. */
+ debug(" break");
+ break;
+ }
+ debug(" continue");
+ continue;
+ }
+
+ debug("- k = %d\n", k);
+ if (d == 0) {
+ /* This is the initializing step. There is no prev_k yet, get the initial x from the top left of
+ * the Myers graph. */
+ x = 0;
+ } else {
+ int *kd_prev_column = kd_column - kd_len;
+
+ /* Favoring "-" lines first means favoring moving rightwards in the Myers graph.
+ * For this, all k should derive from k - 1, only the bottom most k derive from k + 1:
+ *
+ * | d= 0 1 2
+ * ----+----------------
+ * k= |
+ * 2 | 2,0 <-- from prev_k = 2 - 1 = 1
+ * | /
+ * 1 | 1,0
+ * | /
+ * 0 | -->0,0 3,3
+ * | \\ /
+ * -1 | 0,1 <-- bottom most for d=1 from prev_k = -1 + 1 = 0
+ * | \\
+ * -2 | 0,2 <-- bottom most for d=2 from prev_k = -2 + 1 = -1
+ *
+ * Except when a k + 1 from a previous run already means a further advancement in the graph.
+ * If k == d, there is no k + 1 and k - 1 is the only option.
+ * If k < d, use k + 1 in case that yields a larger x. Also use k + 1 if k - 1 is outside the graph.
+ */
+ if (k > -d && (k == d
+ || (k - 1 >= -(int)right->atoms.len
+ && kd_prev_column[k - 1] >= kd_prev_column[k + 1]))) {
+ /* Advance from k - 1.
+ * From position prev_k, step to the right in the Myers graph: x += 1.
+ */
+ int prev_k = k - 1;
+ int prev_x = kd_prev_column[prev_k];
+ x = prev_x + 1;
+ } else {
+ /* The bottom most one.
+ * From position prev_k, step to the bottom in the Myers graph: y += 1.
+ * Incrementing y is achieved by decrementing k while keeping the same x.
+ * (since we're deriving y from y = x - k).
+ */
+ int prev_k = k + 1;
+ int prev_x = kd_prev_column[prev_k];
+ x = prev_x;
+ }
+ }
+
+ /* Slide down any snake that we might find here. */
+ while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len
+ && diff_atom_same(&left->atoms.head[x], &right->atoms.head[xk_to_y(x, k)]))
+ x++;
+ kd_column[k] = x;
+
+ if (DEBUG) {
+ int fi;
+ for (fi = d; fi >= k; fi-=2) {
+ debug("kd_column[%d] = (%d, %d)\n", fi, kd_column[fi], kd_column[fi] - fi);
+#if 0
+ if (kd_column[fi] >= 0 && kd_column[fi] < left->atoms.len)
+ debug_dump_atom(left, right, &left->atoms.head[kd_column[fi]]);
+ else
+ debug("\n");
+ if (kd_column[fi]-fi >= 0 && kd_column[fi]-fi < right->atoms.len)
+ debug_dump_atom(right, left, &right->atoms.head[kd_column[fi]-fi]);
+ else
+ debug("\n");
+#endif
+ }
+ }
+
+ if (x == left->atoms.len && xk_to_y(x, k) == right->atoms.len) {
+ /* Found a path */
+ backtrack_d = d;
+ backtrack_k = k;
+ debug("Reached the end at d = %d, k = %d\n",
+ backtrack_d, backtrack_k);
+ break;
+ }
+ }
+
+ if (backtrack_d >= 0)
+ break;
+ }
+
+ debug_dump_myers_graph(left, right, kd_origin);
+
+ /* backtrack. A matrix spanning from start to end of the file is ready:
+ *
+ * | d= 0 1 2 3 4
+ * ----+---------------------------------
+ * k= |
+ * 3 |
+ * |
+ * 2 | 2,0
+ * | /
+ * 1 | 1,0 4,3
+ * | / / \
+ * 0 | -->0,0 3,3 4,4 --> backtrack_d = 4, backtrack_k = 0
+ * | \ / \
+ * -1 | 0,1 3,4
+ * | \
+ * -2 | 0,2
+ * |
+ *
+ * From (4,4) backwards, find the previous position that is the largest, and remember it.
+ *
+ */
+ for (d = backtrack_d, k = backtrack_k; d >= 0; d--) {
+ x = kd_column[k];
+ y = xk_to_y(x, k);
+
+ /* When the best position is identified, remember it for that kd_column.
+ * That kd_column is no longer needed otherwise, so just re-purpose kd_column[0] = x and kd_column[1] = y,
+ * so that there is no need to allocate more memory.
+ */
+ kd_column[0] = x;
+ kd_column[1] = y;
+ debug("Backtrack d=%d: xy=(%d, %d)\n",
+ d, kd_column[0], kd_column[1]);
+
+ /* Don't access memory before kd_buf */
+ if (d == 0)
+ break;
+ int *kd_prev_column = kd_column - kd_len;
+
+ /* When y == 0, backtracking downwards (k-1) is the only way.
+ * When x == 0, backtracking upwards (k+1) is the only way.
+ *
+ * | d= 0 1 2 3 4
+ * ----+---------------------------------
+ * k= |
+ * 3 |
+ * | ..y == 0
+ * 2 | 2,0
+ * | /
+ * 1 | 1,0 4,3
+ * | / / \
+ * 0 | -->0,0 3,3 4,4 --> backtrack_d = 4, backtrack_k = 0
+ * | \ / \
+ * -1 | 0,1 3,4
+ * | \
+ * -2 | 0,2__
+ * | x == 0
+ */
+ debug("prev[k-1] = %d,%d prev[k+1] = %d,%d\n",
+ kd_prev_column[k-1], xk_to_y(kd_prev_column[k-1],k-1),
+ kd_prev_column[k+1], xk_to_y(kd_prev_column[k+1],k+1));
+ if (y == 0
+ || (x > 0 && kd_prev_column[k - 1] >= kd_prev_column[k + 1])) {
+ k = k - 1;
+ debug("prev k=k-1=%d x=%d y=%d\n",
+ k, kd_prev_column[k], xk_to_y(kd_prev_column[k], k));
+ } else {
+ k = k + 1;
+ debug("prev k=k+1=%d x=%d y=%d\n",
+ k, kd_prev_column[k], xk_to_y(kd_prev_column[k], k));
+ }
+ kd_column = kd_prev_column;
+ }
+
+ /* Forwards again, this time recording the diff chunks.
+ * Definitely start from 0,0. kd_column[0] may actually point to the bottom of a snake starting at 0,0 */
+ x = 0;
+ y = 0;
+
+ kd_column = kd_origin;
+ for (d = 0; d <= backtrack_d; d++, kd_column += kd_len) {
+ int next_x = kd_column[0];
+ int next_y = kd_column[1];
+ debug("Forward track from xy(%d,%d) to xy(%d,%d)\n",
+ x, y, next_x, next_y);
+
+ struct diff_atom *left_atom = &left->atoms.head[x];
+ int left_section_len = next_x - x;
+ struct diff_atom *right_atom = &right->atoms.head[y];
+ int right_section_len = next_y - y;
+
+ rc = DIFF_RC_ENOMEM;
+ if (left_section_len && right_section_len) {
+ /* This must be a snake slide.
+ * Snake slides have a straight line leading into them (except when starting at (0,0)). Find
+ * out whether the lead-in is horizontal or vertical:
+ *
+ * left
+ * ---------->
+ * |
+ * r| o-o o
+ * i| \ |
+ * g| o o
+ * h| \ \
+ * t| o o
+ * v
+ *
+ * If left_section_len > right_section_len, the lead-in is horizontal, meaning first
+ * remove one atom from the left before sliding down the snake.
+ * If right_section_len > left_section_len, the lead-in is vetical, so add one atom from
+ * the right before sliding down the snake. */
+ if (left_section_len == right_section_len + 1) {
+ if (!diff_state_add_chunk(state, true,
+ left_atom, 1,
+ right_atom, 0))
+ goto return_rc;
+ left_atom++;
+ left_section_len--;
+ } else if (right_section_len == left_section_len + 1) {
+ if (!diff_state_add_chunk(state, true,
+ left_atom, 0,
+ right_atom, 1))
+ goto return_rc;
+ right_atom++;
+ right_section_len--;
+ } else if (left_section_len != right_section_len) {
+ /* The numbers are making no sense. Should never happen. */
+ rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;
+ goto return_rc;
+ }
+
+ if (!diff_state_add_chunk(state, true,
+ left_atom, left_section_len,
+ right_atom, right_section_len))
+ goto return_rc;
+ } else if (left_section_len && !right_section_len) {
+ /* Only left atoms and none on the right, they form a "minus" chunk, then. */
+ if (!diff_state_add_chunk(state, true,
+ left_atom, left_section_len,
+ right_atom, 0))
+ goto return_rc;
+ } else if (!left_section_len && right_section_len) {
+ /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
+ if (!diff_state_add_chunk(state, true,
+ left_atom, 0,
+ right_atom, right_section_len))
+ goto return_rc;
+ }
+
+ x = next_x;
+ y = next_y;
+ }
+
+ rc = DIFF_RC_OK;
+
+return_rc:
+ free(kd_buf);
+ debug("** END %s rc=%d\n", __func__, rc);
+ return rc;
+}
blob - /dev/null
blob + 8c9e498eb29496228f6ff282f223b721ba1a3d03 (mode 644)
--- /dev/null
+++ diff_output.c
+/* Common parts for printing diff output */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "diff_output.h"
+
+void diff_output_lines(FILE *dest, const char *prefix, struct diff_atom *start_atom, unsigned int count)
+{
+ struct diff_atom *atom;
+ foreach_diff_atom(atom, start_atom, count) {
+ fprintf(dest, "%s", prefix);
+ int i;
+ unsigned int len = atom->len;
+ if (len && atom->at[len - 1] == '\n') {
+ len--;
+ if (len && atom->at[len - 1] == '\r')
+ len--;
+ }
+
+ for (i = 0; i < len; i++) {
+ char c = atom->at[i];
+ if ((c < 0x20 || c >= 0x7f) && c != '\t')
+ fprintf(dest, "\\x%02x", (unsigned char)c);
+ else
+ fprintf(dest, "%c", c);
+ }
+ fprintf(dest, "\n");
+ }
+}
+
+enum diff_rc diff_output_info(FILE *dest, const struct diff_input_info *info)
+{
+ if (info->arbitrary_info && *info->arbitrary_info)
+ fprintf(dest, "%s", info->arbitrary_info);
+ fprintf(dest, "--- %s\n+++ %s\n",
+ info->left_path ? : "a",
+ info->right_path ? : "b");
+ return DIFF_RC_OK;
+}
blob - /dev/null
blob + cb6854b078da31b5426b493fb1128b98361d923e (mode 644)
--- /dev/null
+++ diff_output.h
+/* Diff output generators and invocation shims. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include "diff_main.h"
+
+struct diff_input_info {
+ const char *arbitrary_info;
+ const char *left_path;
+ const char *right_path;
+};
+
+enum diff_rc diff_output_plain(FILE *dest, const struct diff_input_info *info,
+ const struct diff_result *result);
+enum diff_rc diff_plain(FILE *dest, const struct diff_config *diff_config,
+ const struct diff_input_info *info,
+ const char *left, int left_len, const char *right, int right_len);
+
+enum diff_rc diff_output_unidiff(FILE *dest, const struct diff_input_info *info,
+ const struct diff_result *result, unsigned int context_lines);
+enum diff_rc diff_unidiff(FILE *dest, const struct diff_config *diff_config,
+ const struct diff_input_info *info,
+ const char *left, int left_len, const char *right, int right_len,
+ unsigned int context_lines);
+
+enum diff_rc diff_output_info(FILE *dest, const struct diff_input_info *info);
+void diff_output_lines(FILE *dest, const char *prefix, struct diff_atom *start_atom, unsigned int count);
blob - /dev/null
blob + fa532a4c439692797a14d9c7e9e0c5c55c161029 (mode 644)
--- /dev/null
+++ diff_output_plain.c
+/* Output all lines of a diff_result. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "diff_output.h"
+
+enum diff_rc diff_output_plain(FILE *dest, const struct diff_input_info *info,
+ const struct diff_result *result)
+{
+ if (!result)
+ return DIFF_RC_EINVAL;
+ if (result->rc != DIFF_RC_OK)
+ return result->rc;
+
+ diff_output_info(dest, info);
+
+ int i;
+ for (i = 0; i < result->chunks.len; i++) {
+ struct diff_chunk *c = &result->chunks.head[i];
+ if (c->left_count && c->right_count)
+ diff_output_lines(dest, c->solved ? " " : "?", c->left_start, c->left_count);
+ else if (c->left_count && !c->right_count)
+ diff_output_lines(dest, c->solved ? "-" : "?", c->left_start, c->left_count);
+ else if (c->right_count && !c->left_count)
+ diff_output_lines(dest, c->solved ? "+" : "?", c->right_start, c->right_count);
+ }
+ return DIFF_RC_OK;
+}
+
+enum diff_rc diff_plain(FILE *dest, const struct diff_config *diff_config,
+ const struct diff_input_info *info,
+ const char *left, int left_len, const char *right, int right_len)
+{
+ enum diff_rc rc;
+ left_len = left_len < 0 ? strlen(left) : left_len;
+ right_len = right_len < 0 ? strlen(right) : right_len;
+ struct diff_result *result = diff_main(diff_config, left, left_len, right, right_len);
+ rc = diff_output_plain(dest, info, result);
+ diff_result_free(result);
+ return rc;
+}
blob - /dev/null
blob + 4c95e6050fb2e06f84ee46543b0ebe1f83119115 (mode 644)
--- /dev/null
+++ diff_output_unidiff.c
+/* Produce a unidiff output from a diff_result. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "diff_output.h"
+#include "debug.h"
+
+enum chunk_type {
+ CHUNK_EMPTY,
+ CHUNK_PLUS,
+ CHUNK_MINUS,
+ CHUNK_SAME,
+ CHUNK_WEIRD,
+};
+
+static inline enum chunk_type chunk_type(const struct diff_chunk *chunk)
+{
+ if (!chunk->left_count && !chunk->right_count)
+ return CHUNK_EMPTY;
+ if (!chunk->solved)
+ return CHUNK_WEIRD;
+ if (!chunk->right_count)
+ return CHUNK_MINUS;
+ if (!chunk->left_count)
+ return CHUNK_PLUS;
+ if (chunk->left_count != chunk->right_count)
+ return CHUNK_WEIRD;
+ return CHUNK_SAME;
+}
+
+struct chunk_context {
+ struct range chunk;
+ struct range left, right;
+};
+
+static bool chunk_context_empty(const struct chunk_context *cc)
+{
+ return range_empty(&cc->chunk);
+}
+
+static void chunk_context_get(struct chunk_context *cc, const struct diff_result *r, int chunk_idx,
+ int context_lines)
+{
+ const struct diff_chunk *c = &r->chunks.head[chunk_idx];
+ int left_start = diff_atom_root_idx(&r->left, c->left_start);
+ int right_start = diff_atom_root_idx(&r->right, c->right_start);
+
+ *cc = (struct chunk_context){
+ .chunk = {
+ .start = chunk_idx,
+ .end = chunk_idx + 1,
+ },
+ .left = {
+ .start = MAX(0, left_start - context_lines),
+ .end = MIN(r->left.atoms.len, left_start + c->left_count + context_lines),
+ },
+ .right = {
+ .start = MAX(0, right_start - context_lines),
+ .end = MIN(r->right.atoms.len, right_start + c->right_count + context_lines),
+ },
+ };
+}
+
+static bool chunk_contexts_touch(const struct chunk_context *cc, const struct chunk_context *other)
+{
+ return ranges_touch(&cc->chunk, &other->chunk)
+ || ranges_touch(&cc->left, &other->left)
+ || ranges_touch(&cc->right, &other->right);
+}
+
+static void chunk_contexts_merge(struct chunk_context *cc, const struct chunk_context *other)
+{
+ ranges_merge(&cc->chunk, &other->chunk);
+ ranges_merge(&cc->left, &other->left);
+ ranges_merge(&cc->right, &other->right);
+}
+
+static void diff_output_unidiff_chunk(FILE *dest, bool *info_printed, const struct diff_input_info *info,
+ const struct diff_result *result, const struct chunk_context *cc)
+{
+ if (range_empty(&cc->left) && range_empty(&cc->right))
+ return;
+
+ if (!(*info_printed)) {
+ diff_output_info(dest, info);
+ *info_printed = true;
+ }
+
+ fprintf(dest, "@@ -%d,%d +%d,%d @@\n",
+ cc->left.start + 1, cc->left.end - cc->left.start,
+ cc->right.start + 1, cc->right.end - cc->right.start);
+
+ /* Got the absolute line numbers where to start printing, and the index of the interesting (non-context) chunk.
+ * To print context lines above the interesting chunk, nipping on the previous chunk index may be necessary.
+ * It is guaranteed to be only context lines where left == right, so it suffices to look on the left. */
+ const struct diff_chunk *first_chunk = &result->chunks.head[cc->chunk.start];
+ int chunk_start_line = diff_atom_root_idx(&result->left, first_chunk->left_start);
+ if (cc->left.start < chunk_start_line)
+ diff_output_lines(dest, " ", &result->left.atoms.head[cc->left.start],
+ chunk_start_line - cc->left.start);
+
+ /* Now write out all the joined chunks and contexts between them */
+ int c_idx;
+ for (c_idx = cc->chunk.start; c_idx < cc->chunk.end; c_idx++) {
+ const struct diff_chunk *c = &result->chunks.head[c_idx];
+
+ if (c->left_count && c->right_count)
+ diff_output_lines(dest, c->solved ? " " : "?", c->left_start, c->left_count);
+ else if (c->left_count && !c->right_count)
+ diff_output_lines(dest, c->solved ? "-" : "?", c->left_start, c->left_count);
+ else if (c->right_count && !c->left_count)
+ diff_output_lines(dest, c->solved ? "+" : "?", c->right_start, c->right_count);
+ }
+
+ /* Trailing context? */
+ const struct diff_chunk *last_chunk = &result->chunks.head[cc->chunk.end - 1];
+ int chunk_end_line = diff_atom_root_idx(&result->left, last_chunk->left_start + last_chunk->left_count);
+ if (cc->left.end > chunk_end_line)
+ diff_output_lines(dest, " ", &result->left.atoms.head[chunk_end_line],
+ cc->left.end - chunk_end_line);
+}
+
+enum diff_rc diff_output_unidiff(FILE *dest, const struct diff_input_info *info,
+ const struct diff_result *result, unsigned int context_lines)
+{
+ if (!result)
+ return DIFF_RC_EINVAL;
+ if (result->rc != DIFF_RC_OK)
+ return result->rc;
+
+ struct chunk_context cc = {};
+ bool info_printed = false;
+
+ int i;
+ for (i = 0; i < result->chunks.len; i++) {
+ struct diff_chunk *c = &result->chunks.head[i];
+ enum chunk_type t = chunk_type(c);
+
+ if (t == CHUNK_MINUS || t == CHUNK_PLUS) {
+ if (chunk_context_empty(&cc)) {
+ /* These are the first lines being printed.
+ * Note down the start point, any number of subsequent chunks may be joined up to this
+ * unidiff chunk by context lines or by being directly adjacent. */
+ chunk_context_get(&cc, result, i, context_lines);
+ debug("new chunk to be printed: chunk %d-%d left %d-%d right %d-%d\n",
+ cc.chunk.start, cc.chunk.end,
+ cc.left.start, cc.left.end, cc.right.start, cc.right.end);
+ } else {
+ /* There already is a previous chunk noted down for being printed.
+ * Does it join up with this one? */
+ struct chunk_context next;
+ chunk_context_get(&next, result, i, context_lines);
+ debug("new chunk to be printed: chunk %d-%d left %d-%d right %d-%d\n",
+ next.chunk.start, next.chunk.end,
+ next.left.start, next.left.end, next.right.start, next.right.end);
+ if (chunk_contexts_touch(&cc, &next)) {
+ /* This next context touches or overlaps the previous one, join. */
+ chunk_contexts_merge(&cc, &next);
+ debug("new chunk to be printed touches previous chunk, now: left %d-%d right %d-%d\n",
+ cc.left.start, cc.left.end, cc.right.start, cc.right.end);
+ } else {
+ /* No touching, so the previous context is complete with a gap between it and
+ * this next one. Print the previous one and start fresh here. */
+ debug("new chunk to be printed does not touch previous chunk; print left %d-%d right %d-%d\n",
+ cc.left.start, cc.left.end, cc.right.start, cc.right.end);
+ diff_output_unidiff_chunk(dest, &info_printed, info, result, &cc);
+ cc = next;
+ debug("new unprinted chunk is left %d-%d right %d-%d\n",
+ cc.left.start, cc.left.end, cc.right.start, cc.right.end);
+ }
+ }
+ }
+
+ }
+
+ if (!chunk_context_empty(&cc))
+ diff_output_unidiff_chunk(dest, &info_printed, info, result, &cc);
+ return DIFF_RC_OK;
+}
+
+enum diff_rc diff_unidiff(FILE *dest, const struct diff_config *diff_config,
+ const struct diff_input_info *info,
+ const char *left, int left_len, const char *right, int right_len,
+ unsigned int context_lines)
+{
+ enum diff_rc rc;
+ left_len = left_len < 0 ? strlen(left) : left_len;
+ right_len = right_len < 0 ? strlen(right) : right_len;
+ struct diff_result *result = diff_main(diff_config, left, left_len, right, right_len);
+ rc = diff_output_unidiff(dest, info, result, context_lines);
+ diff_result_free(result);
+ return rc;
+}
blob - /dev/null
blob + 9d24bfe161fabd6c99cfd6b78be90df096a9d5ae (mode 644)
--- /dev/null
+++ diff_patience.c
+/* Implementation of the Patience Diff algorithm invented by Bram Cohen:
+ * Divide a diff problem into smaller chunks by an LCS of common-unique lines. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <assert.h>
+
+#include "diff_main.h"
+#include "debug.h"
+
+/* Set unique_here = true for all atoms that exist exactly once in this list. */
+static void diff_atoms_mark_unique(struct diff_data *d, unsigned int *unique_count)
+{
+ struct diff_atom *i;
+ unsigned int count = 0;
+ diff_data_foreach_atom(i, d) {
+ i->patience.unique_here = true;
+ i->patience.unique_in_both = true;
+ count++;
+ }
+ diff_data_foreach_atom(i, d) {
+ struct diff_atom *j;
+
+ if (!i->patience.unique_here)
+ continue;
+
+ diff_data_foreach_atom_from(i + 1, j, d) {
+ if (diff_atom_same(i, j)) {
+ if (i->patience.unique_here) {
+ i->patience.unique_here = false;
+ i->patience.unique_in_both = false;
+ count--;
+ }
+ j->patience.unique_here = false;
+ j->patience.unique_in_both = false;
+ count--;
+ }
+ }
+ }
+ if (unique_count)
+ *unique_count = count;
+}
+
+/* Mark those lines as atom->patience.unique_in_both = true that appear exactly once in each side. */
+static void diff_atoms_mark_unique_in_both(struct diff_data *left, struct diff_data *right,
+ unsigned int *unique_in_both_count)
+{
+ /* Derive the final unique_in_both count without needing an explicit iteration. So this is just some
+ * optimiziation to save one iteration in the end. */
+ unsigned int unique_in_both;
+
+ diff_atoms_mark_unique(left, &unique_in_both);
+ diff_atoms_mark_unique(right, NULL);
+
+ debug("unique_in_both %u\n", unique_in_both);
+
+ struct diff_atom *i;
+ diff_data_foreach_atom(i, left) {
+ if (!i->patience.unique_here)
+ continue;
+ struct diff_atom *j;
+ int found_in_b = 0;
+ diff_data_foreach_atom(j, right) {
+ if (!diff_atom_same(i, j))
+ continue;
+ if (!j->patience.unique_here) {
+ found_in_b = 2; /* or more */
+ break;
+ } else {
+ found_in_b = 1;
+ j->patience.pos_in_other = i;
+ i->patience.pos_in_other = j;
+ }
+ }
+
+ if (found_in_b == 0 || found_in_b > 1) {
+ i->patience.unique_in_both = false;
+ unique_in_both--;
+ debug("unique_in_both %u (%d) ", unique_in_both, found_in_b);
+ debug_dump_atom(left, NULL, i);
+ }
+ }
+
+ /* Still need to unmark right[*]->patience.unique_in_both for atoms that don't exist in left */
+ diff_data_foreach_atom(i, right) {
+ if (!i->patience.unique_here
+ || !i->patience.unique_in_both)
+ continue;
+ struct diff_atom *j;
+ bool found_in_a = false;
+ diff_data_foreach_atom(j, left) {
+ if (!j->patience.unique_in_both)
+ continue;
+ if (!diff_atom_same(i, j))
+ continue;
+ found_in_a = true;
+ break;
+ }
+
+ if (!found_in_a)
+ i->patience.unique_in_both = false;
+ }
+
+ if (unique_in_both_count)
+ *unique_in_both_count = unique_in_both;
+}
+
+static void diff_atoms_swallow_identical_neighbors(struct diff_data *left, struct diff_data *right,
+ unsigned int *unique_in_both_count)
+{
+ debug("trivially combine identical lines arount unique_in_both lines\n");
+
+ unsigned int l_idx;
+ unsigned int next_l_idx;
+ unsigned int l_min = 0;
+ unsigned int r_min = 0;
+ for (l_idx = 0; l_idx < left->atoms.len; l_idx = next_l_idx) {
+ next_l_idx = l_idx + 1;
+ struct diff_atom *l = &left->atoms.head[l_idx];
+
+ if (!l->patience.unique_in_both)
+ continue;
+
+ debug("check identical lines around ");
+ debug_dump_atom(left, right, l);
+
+ unsigned int r_idx = diff_atom_idx(right, l->patience.pos_in_other);
+
+ struct range identical_l;
+ struct range identical_r;
+
+ /* Swallow upwards.
+ * Each common-unique line swallows identical lines upwards and downwards.
+ * All common-unique lines that were part of the identical lines following below were already swallowed
+ * in the previous iteration, so we will never hit another common-unique line above. */
+ for (identical_l.start = l_idx, identical_r.start = r_idx;
+ identical_l.start > l_min
+ && identical_r.start > r_min
+ && diff_atom_same(&left->atoms.head[identical_l.start - 1],
+ &right->atoms.head[identical_r.start - 1]);
+ identical_l.start--, identical_r.start--);
+
+ /* Swallow downwards */
+ for (identical_l.end = l_idx + 1, identical_r.end = r_idx + 1;
+ identical_l.end < left->atoms.len
+ && identical_r.end < right->atoms.len
+ && diff_atom_same(&left->atoms.head[identical_l.end],
+ &right->atoms.head[identical_r.end]);
+ identical_l.end++, identical_r.end++,
+ next_l_idx++) {
+ if (left->atoms.head[identical_l.end].patience.unique_in_both) {
+ /* Part of a chunk of identical lines, remove from listing of unique_in_both lines */
+ left->atoms.head[identical_l.end].patience.unique_in_both = false;
+ right->atoms.head[identical_r.end].patience.unique_in_both = false;
+ (*unique_in_both_count)--;
+ }
+ }
+
+ l->patience.identical_lines = identical_l;
+ l->patience.pos_in_other->patience.identical_lines = identical_r;
+
+ l_min = identical_l.end;
+ r_min = identical_r.end;
+
+ if (!range_empty(&l->patience.identical_lines)) {
+ debug("common-unique line at l=%u r=%u swallowed identical lines l=%u-%u r=%u-%u\n",
+ l_idx, r_idx,
+ identical_l.start, identical_l.end,
+ identical_r.start, identical_r.end);
+ }
+ debug("next_l_idx = %u\n", next_l_idx);
+ }
+}
+
+/* Among the lines that appear exactly once in each side, find the longest streak that appear in both files in the same
+ * order (with other stuff allowed to interleave). Use patience sort for that, as in the Patience Diff algorithm.
+ * See https://bramcohen.livejournal.com/73318.html and, for a much more detailed explanation,
+ * https://blog.jcoglan.com/2017/09/19/the-patience-diff-algorithm/ */
+enum diff_rc diff_algo_patience(const struct diff_algo_config *algo_config, struct diff_state *state)
+{
+ enum diff_rc rc = DIFF_RC_ENOMEM;
+
+ struct diff_data *left = &state->left;
+ struct diff_data *right = &state->right;
+
+ unsigned int unique_in_both_count;
+
+ debug("\n** %s\n", __func__);
+
+ /* Find those lines that appear exactly once in 'left' and exactly once in 'right'. */
+ diff_atoms_mark_unique_in_both(left, right, &unique_in_both_count);
+
+ debug("unique_in_both_count %u\n", unique_in_both_count);
+ debug("left:\n");
+ debug_dump(left);
+ debug("right:\n");
+ debug_dump(right);
+
+ if (!unique_in_both_count) {
+ /* Cannot apply Patience, tell the caller to use fallback_algo instead. */
+ return DIFF_RC_USE_DIFF_ALGO_FALLBACK;
+ }
+
+ diff_atoms_swallow_identical_neighbors(left, right, &unique_in_both_count);
+ debug("After swallowing identical neighbors: unique_in_both = %u\n",
+ unique_in_both_count);
+
+ /* An array of Longest Common Sequence is the result of the below subscope: */
+ unsigned int lcs_count = 0;
+ struct diff_atom **lcs = NULL;
+ struct diff_atom *lcs_tail = NULL;
+
+ {
+ /* This subscope marks the lifetime of the atom_pointers allocation */
+
+ /* One chunk of storage for atom pointers */
+ struct diff_atom **atom_pointers = recallocarray(NULL, 0, unique_in_both_count * 2, sizeof(struct diff_atom*));
+
+ /* Half for the list of atoms that still need to be put on stacks */
+ struct diff_atom **uniques = atom_pointers;
+
+ /* Half for the patience sort state's "card stacks" -- we remember only each stack's topmost "card" */
+ struct diff_atom **patience_stacks = atom_pointers + unique_in_both_count;
+ unsigned int patience_stacks_count = 0;
+
+ /* Take all common, unique items from 'left' ... */
+
+ struct diff_atom *atom;
+ struct diff_atom **uniques_end = uniques;
+ diff_data_foreach_atom(atom, left) {
+ if (!atom->patience.unique_in_both)
+ continue;
+ *uniques_end = atom;
+ uniques_end++;
+ }
+
+ /* ...and sort them to the order found in 'right'.
+ * The idea is to find the leftmost stack that has a higher line number and add it to the stack's top.
+ * If there is no such stack, open a new one on the right. The line number is derived from the atom*,
+ * which are array items and hence reflect the relative position in the source file. So we got the
+ * common-uniques from 'left' and sort them according to atom->patience.pos_in_other. */
+ unsigned int i;
+ for (i = 0; i < unique_in_both_count; i++) {
+ atom = uniques[i];
+ unsigned int target_stack;
+
+ if (!patience_stacks_count)
+ target_stack = 0;
+ else {
+ /* binary search to find the stack to put this atom "card" on. */
+ unsigned int lo = 0;
+ unsigned int hi = patience_stacks_count;
+ while (lo < hi) {
+ unsigned int mid = (lo + hi) >> 1;
+ if (patience_stacks[mid]->patience.pos_in_other < atom->patience.pos_in_other)
+ lo = mid + 1;
+ else
+ hi = mid;
+ }
+
+ target_stack = lo;
+ }
+
+ assert(target_stack <= patience_stacks_count);
+ patience_stacks[target_stack] = atom;
+ if (target_stack == patience_stacks_count)
+ patience_stacks_count++;
+
+ /* Record a back reference to the next stack on the left, which will form the final longest sequence
+ * later. */
+ atom->patience.prev_stack = target_stack ? patience_stacks[target_stack - 1] : NULL;
+
+ }
+
+ /* backtrace through prev_stack references to form the final longest common sequence */
+ lcs_tail = patience_stacks[patience_stacks_count - 1];
+ lcs_count = patience_stacks_count;
+
+ /* uniques and patience_stacks are no longer needed. Backpointers are in atom->patience.prev_stack */
+ free(atom_pointers);
+ }
+
+ lcs = recallocarray(NULL, 0, lcs_count, sizeof(struct diff_atom*));
+ struct diff_atom **lcs_backtrace_pos = &lcs[lcs_count - 1];
+ struct diff_atom *atom;
+ for (atom = lcs_tail; atom; atom = atom->patience.prev_stack, lcs_backtrace_pos--) {
+ assert(lcs_backtrace_pos >= lcs);
+ *lcs_backtrace_pos = atom;
+ }
+
+ unsigned int i;
+ if (DEBUG) {
+ debug("\npatience LCS:\n");
+ for (i = 0; i < lcs_count; i++) {
+ debug_dump_atom(left, right, lcs[i]);
+ }
+ }
+
+
+ /* TODO: For each common-unique line found (now listed in lcs), swallow lines upwards and downwards that are
+ * identical on each side. Requires a way to represent atoms being glued to adjacent atoms. */
+
+ debug("\ntraverse LCS, possibly recursing:\n");
+
+ /* Now we have pinned positions in both files at which it makes sense to divide the diff problem into smaller
+ * chunks. Go into the next round: look at each section in turn, trying to again find common-unique lines in
+ * those smaller sections. As soon as no more are found, the remaining smaller sections are solved by Myers. */
+ unsigned int left_pos = 0;
+ unsigned int right_pos = 0;
+ for (i = 0; i <= lcs_count; i++) {
+ struct diff_atom *atom;
+ struct diff_atom *atom_r;
+ unsigned int left_idx;
+ unsigned int right_idx;
+
+ if (i < lcs_count) {
+ atom = lcs[i];
+ atom_r = atom->patience.pos_in_other;
+ debug("lcs[%u] = left[%u] = right[%u]\n", i,
+ diff_atom_idx(left, atom), diff_atom_idx(right, atom_r));
+ left_idx = atom->patience.identical_lines.start;
+ right_idx = atom_r->patience.identical_lines.start;
+ debug(" identical lines l %u-%u r %u-%u\n",
+ atom->patience.identical_lines.start, atom->patience.identical_lines.end,
+ atom_r->patience.identical_lines.start, atom_r->patience.identical_lines.end);
+ } else {
+ atom = NULL;
+ atom_r = NULL;
+ left_idx = left->atoms.len;
+ right_idx = right->atoms.len;
+ }
+
+ /* 'atom' now marks an atom that matches on both sides according to patience-diff
+ * (a common-unique identical atom in both files).
+ * Handle the section before and the atom itself; the section after will be handled by the next loop
+ * iteration -- note that i loops to last element + 1 ("i <= lcs_count"), so that there will be another
+ * final iteration to pick up the last remaining items after the last LCS atom.
+ * The sections before might also be empty on left and/or right.
+ * left_pos and right_pos mark the indexes of the first atoms that have not yet been handled in the
+ * previous loop iteration.
+ * left_idx and right_idx mark the indexes of the matching atom on left and right, respectively. */
+
+ debug("iteration %u left_pos %u left_idx %u right_pos %u right_idx %u\n",
+ i, left_pos, left_idx, right_pos, right_idx);
+
+ struct diff_chunk *chunk;
+
+ /* Section before the matching atom */
+ struct diff_atom *left_atom = &left->atoms.head[left_pos];
+ unsigned int left_section_len = left_idx - left_pos;
+
+ struct diff_atom *right_atom = &(right->atoms.head[right_pos]);
+ unsigned int right_section_len = right_idx - right_pos;
+
+ if (left_section_len && right_section_len) {
+ /* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */
+ if (!diff_state_add_chunk(state, false,
+ left_atom, left_section_len,
+ right_atom, right_section_len))
+ goto return_rc;
+ } else if (left_section_len && !right_section_len) {
+ /* Only left atoms and none on the right, they form a "minus" chunk, then. */
+ if (!diff_state_add_chunk(state, true,
+ left_atom, left_section_len,
+ right_atom, 0))
+ goto return_rc;
+ } else if (!left_section_len && right_section_len) {
+ /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
+ if (!diff_state_add_chunk(state, true,
+ left_atom, 0,
+ right_atom, right_section_len))
+ goto return_rc;
+ }
+ /* else: left_section_len == 0 and right_section_len == 0, i.e. nothing here. */
+
+ /* The atom found to match on both sides forms a chunk of equals on each side. In the very last
+ * iteration of this loop, there is no matching atom, we were just cleaning out the remaining lines. */
+ if (atom) {
+ if (!diff_state_add_chunk(state, true,
+ left->atoms.head + atom->patience.identical_lines.start,
+ range_len(&atom->patience.identical_lines),
+ right->atoms.head + atom_r->patience.identical_lines.start,
+ range_len(&atom_r->patience.identical_lines)))
+ goto return_rc;
+ left_pos = atom->patience.identical_lines.end;
+ right_pos = atom_r->patience.identical_lines.end;
+ } else {
+ left_pos = left_idx + 1;
+ right_pos = right_idx + 1;
+ }
+ debug("end of iteration %u left_pos %u left_idx %u right_pos %u right_idx %u\n",
+ i, left_pos, left_idx, right_pos, right_idx);
+ }
+ debug("** END %s\n", __func__);
+
+ rc = DIFF_RC_OK;
+
+return_rc:
+ free(lcs);
+ return rc;
+}