Gotweb

Commit Diff

Commit:: 55974fa638c18026f21cd0739d3bc6df41a94a2c
From:: Martin Pieuchot <mpi@openbsd.org>
Date:: Tue Feb 11 13:27:47 2020 UTC
Message:: Move all sources to a single place
Actions:: Patch | Tree
commit - bb7fb738462f1e54a85659098b77605727b9dee0
commit + 55974fa638c18026f21cd0739d3bc6df41a94a2c
blob - 2ff357ceee0eed252b191258de3fea36fa4ae92b
blob + 3cdc42b334160159d59a29028e6fc531223ba48e
--- README
+++ README
@@ -19,11 +19,5 @@ Many thanks for this valuable door opener!
 The source code itself is not based on the code found in those blogs, but
 written from scratch with the knowledge gained.
 
-Compile:
-  OpenBSD:
-    make -C diff
-  Linux:
-    make -f linux_Makefile -C diff
-
 Test:
   make -C test/
blob - /dev/null
blob + 3a89e2f6754fad430d6111cd5acc08420f47f111 (mode 644)
--- /dev/null
+++ Makefile
@@ -0,0 +1,38 @@
+.PATH:${.CURDIR}/../lib
+
+.include "diff-version.mk"
+
+PROG=	diff
+SRCS= \
+	diff.c \
+	diff_atomize_text.c \
+	diff_main.c \
+	diff_myers.c \
+	diff_patience.c \
+	diff_output.c \
+	diff_output_plain.c \
+	diff_output_unidiff.c \
+	${END}
+MAN =	${PROG}.1
+
+CPPFLAGS = -I${.CURDIR}/../include -I${.CURDIR}/../lib
+
+.if defined(PROFILE)
+LDADD = -lutil_p -lz_p -lc_p
+.else
+LDADD = -lutil -lz
+.endif
+
+.if ${DIFF_RELEASE} != "Yes"
+NOMAN = Yes
+.endif
+
+realinstall:
+	${INSTALL} ${INSTALL_COPY} -o ${BINOWN} -g ${BINGRP} \
+	-m ${BINMODE} ${PROG} ${BINDIR}/${PROG}
+
+dist:
+	mkdir ../diff-${DIFF_VERSION}/diff
+	cp ${SRCS} ${MAN} ../diff-${DIFF_VERSION}/diff
+
+.include <bsd.prog.mk>
blob - b93701332fb1013c6f09d620e138075dc268925c (mode 644)
blob + /dev/null
--- diff/Makefile
+++ /dev/null
@@ -1,38 +0,0 @@
-.PATH:${.CURDIR}/../lib
-
-.include "../diff-version.mk"
-
-PROG=	diff
-SRCS= \
-	diff.c \
-	diff_atomize_text.c \
-	diff_main.c \
-	diff_myers.c \
-	diff_patience.c \
-	diff_output.c \
-	diff_output_plain.c \
-	diff_output_unidiff.c \
-	${END}
-MAN =	${PROG}.1
-
-CPPFLAGS = -I${.CURDIR}/../include -I${.CURDIR}/../lib
-
-.if defined(PROFILE)
-LDADD = -lutil_p -lz_p -lc_p
-.else
-LDADD = -lutil -lz
-.endif
-
-.if ${DIFF_RELEASE} != "Yes"
-NOMAN = Yes
-.endif
-
-realinstall:
-	${INSTALL} ${INSTALL_COPY} -o ${BINOWN} -g ${BINGRP} \
-	-m ${BINMODE} ${PROG} ${BINDIR}/${PROG}
-
-dist:
-	mkdir ../diff-${DIFF_VERSION}/diff
-	cp ${SRCS} ${MAN} ../diff-${DIFF_VERSION}/diff
-
-.include <bsd.prog.mk>
blob - 86b3c54681283baa034bfbeeb19d4b2ee19ef50a (mode 644)
blob + /dev/null
--- diff/diff.c
+++ /dev/null
@@ -1,143 +0,0 @@
-/* Commandline diff utility to test diff implementations. */
-/*
- * Copyright (c) 2018 Martin Pieuchot
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <sys/mman.h>
-#include <sys/stat.h>
-
-#include <err.h>
-#include <fcntl.h>
-#include <inttypes.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-
-#ifdef __linux__
-/* stupid shims to compile and test on linux */
-#define __dead
-
-static const char *getprogname()
-{
-	return "diff";
-}
-#endif
-
-__dead void	 usage(void);
-int		 diffreg(char *, char *, int);
-char		*mmapfile(const char *, struct stat *);
-
-__dead void
-usage(void)
-{
-	fprintf(stderr, "usage: %s file1 file2\n", getprogname());
-	exit(1);
-}
-
-int
-main(int argc, char *argv[])
-{
-	int ch;
-
-	while ((ch = getopt(argc, argv, "")) != -1) {
-		switch (ch) {
-		default:
-			usage();
-		}
-	}
-
-	argc -= optind;
-	argv += optind;
-
-	if (argc != 2)
-		usage();
-
-	return diffreg(argv[0], argv[1], 0);
-}
-
-#include <diff/diff_main.h>
-#include <diff/diff_output.h>
-
-const struct diff_algo_config myers, patience, myers_divide;
-
-const struct diff_algo_config myers = (struct diff_algo_config){
-	.impl = diff_algo_myers,
-	.permitted_state_size = 1024 * 1024 * sizeof(int),
-	.fallback_algo = &patience,
-};
-
-const struct diff_algo_config patience = (struct diff_algo_config){
-	.impl = diff_algo_patience,
-	.inner_algo = &patience,	// After subdivision, do Patience again.
-	.fallback_algo = &myers_divide, // If subdivision failed, do Myers Divide et Impera.
-};
-
-const struct diff_algo_config myers_divide = (struct diff_algo_config){
-	.impl = diff_algo_myers_divide,
-	.inner_algo = &myers,		// When division succeeded, start from the top.
-					// (fallback_algo = NULL implies diff_algo_none).
-};
-
-const struct diff_config diff_config = {
-	.atomize_func = diff_atomize_text_by_line,
-	.algo = &myers,
-};
-
-int
-diffreg(char *file1, char *file2, int flags)
-{
-	char *str1, *str2;
-	struct stat st1, st2;
-	struct diff_input_info info = {
-		.left_path = file1,
-		.right_path = file2,
-	};
-
-	str1 = mmapfile(file1, &st1);
-	str2 = mmapfile(file2, &st2);
-
-	diff_unidiff(stdout, &diff_config, &info, str1, st1.st_size, str2, st2.st_size, 3);
-
-	munmap(str1, st1.st_size);
-	munmap(str2, st2.st_size);
-
-	return 0;
-}
-
-char *
-mmapfile(const char *path, struct stat *st)
-{
-	int			 fd;
-	char			*p;
-
-	fd = open(path, O_RDONLY);
-	if (fd == -1)
-		err(2, "%s", path);
-
-	if (fstat(fd, st) == -1)
-		err(2, "%s", path);
-
-	if ((uintmax_t)st->st_size > SIZE_MAX)
-		errx(2, "%s: file too big to fit memory", path);
-
-	p = mmap(NULL, st->st_size, PROT_READ, MAP_PRIVATE, fd, 0);
-	if (p == MAP_FAILED)
-		err(2, "mmap");
-
-	close(fd);
-
-	return p;
-}
blob - 62c7103d70ac32f292cc05f62c782305dd8798f6 (mode 644)
blob + /dev/null
--- diff/linux_Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-CFLAGS = -fsanitize=address -fsanitize=undefined -g -O3
-
-diff: diff.c ../lib/libdiff.a
-	gcc $(CFLAGS) -I../include -o $@ $^
-
-
-../lib/libdiff.a: ../lib/*.[hc] ../include/diff/*.h
-	$(MAKE) -f linux_Makefile -C ../lib
-
-.PHONY: clean
-clean:
-	rm diff
-	$(MAKE) -f linux_Makefile -C ../lib clean
blob - /dev/null
blob + 395468a8c29ce7c6d5ca5283a1a1aa247acb3d87 (mode 644)
--- /dev/null
+++ arraylist.h
@@ -0,0 +1,96 @@
+/* Auto-reallocating array for arbitrary member types. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __linux__
+/* stupid shims to compile and test on linux */
+#include <strings.h>
+static inline void *reallocarray(void *buf, size_t n, size_t member_size)
+{
+	return realloc(buf, n * member_size);
+}
+static inline void *recallocarray(void *buf, size_t oldn, size_t n, size_t member_size)
+{
+	buf = reallocarray(buf, n, member_size);
+	bzero(((char*)buf) + (oldn * member_size), (n - oldn) * member_size);
+	return buf;
+}
+#endif
+
+
+/* Usage:
+ *
+ * ARRAYLIST(any_type_t) list;
+ * // OR
+ * typedef ARRAYLIST(any_type_t) any_type_list_t;
+ * any_type_list_t list;
+ *
+ * ARRAYLIST_INIT(list, 128); // < number of (at first unused) members to add on each realloc
+ * any_type_t *x;
+ * while (bar) {
+ *         ARRAYLIST_ADD(x, list); // < enlarges the allocated array as needed; list.head may change due to realloc
+ *         if (!x)
+ *                 abort();
+ *         *x = random_foo_value;
+ * }
+ * for (i = 0; i < list.len; i++)
+ *         printf("%s", foo_to_str(list.head[i]));
+ * ARRAYLIST_FREE(list);
+ */
+#define ARRAYLIST(MEMBER_TYPE) \
+	struct { \
+		MEMBER_TYPE *head; \
+		unsigned int len; \
+		unsigned int allocated; \
+		unsigned int alloc_blocksize; \
+	}
+
+#define ARRAYLIST_INIT(ARRAY_LIST, ALLOC_BLOCKSIZE) do { \
+		(ARRAY_LIST).head = NULL; \
+		(ARRAY_LIST).len = 0; \
+		(ARRAY_LIST).allocated = 0; \
+		(ARRAY_LIST).alloc_blocksize = ALLOC_BLOCKSIZE; \
+	} while(0)
+
+#define ARRAYLIST_ADD(NEW_ITEM_P, ARRAY_LIST) do { \
+		if ((ARRAY_LIST).len && !(ARRAY_LIST).allocated) { \
+			NEW_ITEM_P = NULL; \
+			break; \
+		} \
+		if ((ARRAY_LIST).head == NULL || (ARRAY_LIST).allocated < (ARRAY_LIST).len + 1) { \
+			(ARRAY_LIST).allocated += (ARRAY_LIST).alloc_blocksize ? : 8; \
+			(ARRAY_LIST).head = recallocarray((ARRAY_LIST).head, (ARRAY_LIST).len, \
+							  (ARRAY_LIST).allocated, sizeof(*(ARRAY_LIST).head)); \
+		}; \
+		if (!(ARRAY_LIST).head || (ARRAY_LIST).allocated < (ARRAY_LIST).len + 1) { \
+			NEW_ITEM_P = NULL; \
+			break; \
+		} \
+		(NEW_ITEM_P) = &(ARRAY_LIST).head[(ARRAY_LIST).len]; \
+		(ARRAY_LIST).len++; \
+	} while (0)
+
+#define ARRAYLIST_CLEAR(ARRAY_LIST) \
+	(ARRAY_LIST).len = 0
+
+#define ARRAYLIST_FREE(ARRAY_LIST) \
+	do { \
+		if ((ARRAY_LIST).head && (ARRAY_LIST).allocated) \
+			free((ARRAY_LIST).head); \
+		ARRAYLIST_INIT(ARRAY_LIST, (ARRAY_LIST).alloc_blocksize); \
+	} while(0)
blob - 395468a8c29ce7c6d5ca5283a1a1aa247acb3d87 (mode 644)
blob + /dev/null
--- include/diff/arraylist.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Auto-reallocating array for arbitrary member types. */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#pragma once
-
-#ifdef __linux__
-/* stupid shims to compile and test on linux */
-#include <strings.h>
-static inline void *reallocarray(void *buf, size_t n, size_t member_size)
-{
-	return realloc(buf, n * member_size);
-}
-static inline void *recallocarray(void *buf, size_t oldn, size_t n, size_t member_size)
-{
-	buf = reallocarray(buf, n, member_size);
-	bzero(((char*)buf) + (oldn * member_size), (n - oldn) * member_size);
-	return buf;
-}
-#endif
-
-
-/* Usage:
- *
- * ARRAYLIST(any_type_t) list;
- * // OR
- * typedef ARRAYLIST(any_type_t) any_type_list_t;
- * any_type_list_t list;
- *
- * ARRAYLIST_INIT(list, 128); // < number of (at first unused) members to add on each realloc
- * any_type_t *x;
- * while (bar) {
- *         ARRAYLIST_ADD(x, list); // < enlarges the allocated array as needed; list.head may change due to realloc
- *         if (!x)
- *                 abort();
- *         *x = random_foo_value;
- * }
- * for (i = 0; i < list.len; i++)
- *         printf("%s", foo_to_str(list.head[i]));
- * ARRAYLIST_FREE(list);
- */
-#define ARRAYLIST(MEMBER_TYPE) \
-	struct { \
-		MEMBER_TYPE *head; \
-		unsigned int len; \
-		unsigned int allocated; \
-		unsigned int alloc_blocksize; \
-	}
-
-#define ARRAYLIST_INIT(ARRAY_LIST, ALLOC_BLOCKSIZE) do { \
-		(ARRAY_LIST).head = NULL; \
-		(ARRAY_LIST).len = 0; \
-		(ARRAY_LIST).allocated = 0; \
-		(ARRAY_LIST).alloc_blocksize = ALLOC_BLOCKSIZE; \
-	} while(0)
-
-#define ARRAYLIST_ADD(NEW_ITEM_P, ARRAY_LIST) do { \
-		if ((ARRAY_LIST).len && !(ARRAY_LIST).allocated) { \
-			NEW_ITEM_P = NULL; \
-			break; \
-		} \
-		if ((ARRAY_LIST).head == NULL || (ARRAY_LIST).allocated < (ARRAY_LIST).len + 1) { \
-			(ARRAY_LIST).allocated += (ARRAY_LIST).alloc_blocksize ? : 8; \
-			(ARRAY_LIST).head = recallocarray((ARRAY_LIST).head, (ARRAY_LIST).len, \
-							  (ARRAY_LIST).allocated, sizeof(*(ARRAY_LIST).head)); \
-		}; \
-		if (!(ARRAY_LIST).head || (ARRAY_LIST).allocated < (ARRAY_LIST).len + 1) { \
-			NEW_ITEM_P = NULL; \
-			break; \
-		} \
-		(NEW_ITEM_P) = &(ARRAY_LIST).head[(ARRAY_LIST).len]; \
-		(ARRAY_LIST).len++; \
-	} while (0)
-
-#define ARRAYLIST_CLEAR(ARRAY_LIST) \
-	(ARRAY_LIST).len = 0
-
-#define ARRAYLIST_FREE(ARRAY_LIST) \
-	do { \
-		if ((ARRAY_LIST).head && (ARRAY_LIST).allocated) \
-			free((ARRAY_LIST).head); \
-		ARRAYLIST_INIT(ARRAY_LIST, (ARRAY_LIST).alloc_blocksize); \
-	} while(0)
blob - f6dcef58bb67e1a01ce4a633f7928a184a56ebb1 (mode 644)
blob + /dev/null
--- include/diff/diff_main.h
+++ /dev/null
@@ -1,287 +0,0 @@
-/* Generic infrastructure to implement various diff algorithms. */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#pragma once
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <string.h>
-#include <errno.h>
-
-#include <diff/arraylist.h>
-
-#ifndef MAX
-#define MAX(A,B) ((A)>(B)?(A):(B))
-#endif
-#ifndef MIN
-#define MIN(A,B) ((A)<(B)?(A):(B))
-#endif
-
-struct range {
-	int start;
-	int end;
-};
-
-static inline bool range_empty(const struct range *r)
-{
-	return r->start == r->end;
-}
-
-static inline bool ranges_touch(const struct range *a, const struct range *b)
-{
-	return (a->end >= b->start) && (a->start <= b->end);
-}
-
-static inline void ranges_merge(struct range *a, const struct range *b)
-{
-	*a = (struct range){
-		.start = MIN(a->start, b->start),
-		.end = MAX(a->end, b->end),
-	};
-}
-
-static inline int range_len(const struct range *r)
-{
-	if (!r)
-		return 0;
-	return r->end - r->start;
-}
-
-/* List of all possible return codes of a diff invocation. */
-enum diff_rc {
-	DIFF_RC_USE_DIFF_ALGO_FALLBACK = -1,
-	DIFF_RC_OK = 0,
-	DIFF_RC_ENOTSUP = ENOTSUP,
-	DIFF_RC_ENOMEM = ENOMEM,
-	DIFF_RC_EINVAL = EINVAL,
-};
-
-struct diff_atom {
-	const uint8_t *at;
-	size_t len;
-
-	/* This hash is just a very cheap speed up for finding *mismatching* atoms. When hashes match, we still need to
-	 * compare entire atoms to find out whether they are indeed identical or not. */
-	unsigned int hash;
-
-	/* State for the Patience Diff algorithm */
-	/* TODO: keep a separate array for the patience state */
-	struct {
-		bool unique_here;
-		bool unique_in_both;
-		struct diff_atom *pos_in_other;
-		struct diff_atom *prev_stack;
-		struct range identical_lines;
-	} patience;
-};
-
-static inline bool diff_atom_same(const struct diff_atom *left, const struct diff_atom *right)
-{
-	return left->hash == right->hash
-		&& left->len == right->len
-		&& memcmp(left->at, right->at, left->len) == 0;
-}
-
-/* For each file, there is a "root" struct diff_data referencing the entire file, which the atoms are parsed from. In
- * recursion of diff algorithm, there may be "child" struct diff_data only referencing a subsection of the file,
- * re-using the atoms parsing. For "root" structs, atoms_allocated will be nonzero, indicating that the array of atoms
- * is owned by that struct. For "child" structs, atoms_allocated == 0, to indicate that the struct is referencing a
- * subset of atoms. */
-struct diff_data {
-	const uint8_t *data;
-	size_t len;
-	ARRAYLIST(struct diff_atom) atoms;
-	struct diff_data *root;
-};
-
-void diff_data_free(struct diff_data *diff_data);
-
-/* The atom's index in the entire file. For atoms divided by lines of text, this yields the line number (starting with
- * 0). Also works for diff_data that reference only a subsection of a file, always reflecting the global position in
- * the file (and not the relative position within the subsection). */
-#define diff_atom_root_idx(DIFF_DATA, ATOM) \
-	((ATOM) ? (ATOM) - ((DIFF_DATA)->root->atoms.head) : (DIFF_DATA)->root->atoms.len)
-
-/* The atom's index within DIFF_DATA. For atoms divided by lines of text, this yields the line number (starting with
- * 0). */
-#define diff_atom_idx(DIFF_DATA, ATOM) \
-	((ATOM) ? (ATOM) - ((DIFF_DATA)->atoms.head) : (DIFF_DATA)->atoms.len)
-
-#define foreach_diff_atom(ATOM, FIRST_ATOM, COUNT) \
-	for ((ATOM) = (FIRST_ATOM); \
-	     (ATOM) && ((ATOM) >= (FIRST_ATOM)) && ((ATOM) - (FIRST_ATOM) < (COUNT)); \
-	     (ATOM)++)
-
-#define diff_data_foreach_atom(ATOM, DIFF_DATA) \
-	foreach_diff_atom(ATOM, (DIFF_DATA)->atoms.head, (DIFF_DATA)->atoms.len)
-
-#define diff_data_foreach_atom_from(FROM, ATOM, DIFF_DATA) \
-	for ((ATOM) = (FROM); \
-	     (ATOM) && ((ATOM) >= (DIFF_DATA)->atoms.head) && ((ATOM) - (DIFF_DATA)->atoms.head < (DIFF_DATA)->atoms.len); \
-	     (ATOM)++)
-
-/* A diff chunk represents a set of atoms on the left and/or a set of atoms on the right.
- *
- * If solved == false:
- * The diff algorithm has divided the source file, and this is a chunk that the inner_algo should run on next.
- * The lines on the left should be diffed against the lines on the right.
- * (If there are no left lines or no right lines, it implies solved == true, because there is nothing to diff.)
- *
- * If solved == true:
- * If there are only left atoms, it is a chunk removing atoms from the left ("a minus chunk").
- * If there are only right atoms, it is a chunk adding atoms from the right ("a plus chunk").
- * If there are both left and right lines, it is a chunk of equal content on both sides,
- * and left_count == right_count:
- *
- * - foo  }
- * - bar  }-- diff_chunk{ left_start = &left.atoms.head[0], left_count = 3,
- * - baz  }               right_start = NULL, right_count = 0 }
- *   moo    }
- *   goo    }-- diff_chunk{ left_start = &left.atoms.head[3], left_count = 3,
- *   zoo    }               right_start = &right.atoms.head[0], right_count = 3 }
- *  +loo      }
- *  +roo      }-- diff_chunk{ left_start = NULL, left_count = 0,
- *  +too      }               right_start = &right.atoms.head[3], right_count = 3 }
- *
- */
-struct diff_chunk {
-	bool solved;
-	struct diff_atom *left_start;
-	unsigned int left_count;
-	struct diff_atom *right_start;
-	unsigned int right_count;
-};
-
-typedef ARRAYLIST(struct diff_chunk) diff_chunk_arraylist_t;
-#define DIFF_RESULT_ALLOC_BLOCKSIZE 128
-
-struct diff_result {
-	enum diff_rc rc;
-	struct diff_data left;
-	struct diff_data right;
-	diff_chunk_arraylist_t chunks;
-};
-
-struct diff_state {
-	/* The final result passed to the original diff caller. */
-	struct diff_result *result;
-
-	/* The root diff_data is in result->left,right, these are (possibly) subsections of the root data. */
-	struct diff_data left;
-	struct diff_data right;
-
-	unsigned int recursion_depth_left;
-
-	/* Remaining chunks from one diff algorithm pass, if any solved == false chunks came up. */
-	diff_chunk_arraylist_t temp_result;
-};
-
-struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved,
-					struct diff_atom *left_start, unsigned int left_count,
-					struct diff_atom *right_start, unsigned int right_count);
-
-/* Signature of a utility function to divide both source files into diff atoms.
- * It is possible that a (future) algorithm requires both source files to decide on atom split points, hence this gets
- * both left and right to atomize at the same time.
- * An example is diff_atomize_text_by_line() in diff_atomize_text.c.
- *
- * func_data: context pointer (free to be used by implementation).
- * left: struct diff_data with left->data and left->len already set up, and left->atoms to be created.
- * right: struct diff_data with right->data and right->len already set up, and right->atoms to be created.
- */
-typedef enum diff_rc (*diff_atomize_func_t)(void *func_data, struct diff_data *left, struct diff_data *right);
-
-extern enum diff_rc diff_atomize_text_by_line(void *func_data, struct diff_data *left, struct diff_data *right);
-
-struct diff_algo_config;
-typedef enum diff_rc (*diff_algo_impl_t)(const struct diff_algo_config *algo_config, struct diff_state *state);
-
-/* Form a result with all left-side removed and all right-side added, i.e. no actual diff algorithm involved. */
-enum diff_rc diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state);
-
-/* Myers Diff tracing from the start all the way through to the end, requiring quadratic amounts of memory. This can
- * fail if the required space surpasses algo_config->permitted_state_size. */
-extern enum diff_rc diff_algo_myers(const struct diff_algo_config *algo_config, struct diff_state *state);
-
-/* Myers "Divide et Impera": tracing forwards from the start and backwards from the end to find a midpoint that divides
- * the problem into smaller chunks. Requires only linear amounts of memory. */
-extern enum diff_rc diff_algo_myers_divide(const struct diff_algo_config *algo_config, struct diff_state *state);
-
-/* Patience Diff algorithm, which divides a larger diff into smaller chunks. For very specific scenarios, it may lead to
- * a complete diff result by itself, but needs a fallback algo to solve chunks that don't have common-unique atoms. */
-extern enum diff_rc diff_algo_patience(const struct diff_algo_config *algo_config, struct diff_state *state);
-
-/* Diff algorithms to use, possibly nested. For example:
- *
- * struct diff_algo_config myers, patience, myers_divide;
- *
- * myers = (struct diff_algo_config){
- *         .impl = diff_algo_myers,
- *         .permitted_state_size = 32 * 1024 * 1024,
- *         .fallback_algo = &patience, // when too large, do diff_algo_patience
- * };
- *
- * patience = (struct diff_algo_config){
- *         .impl = diff_algo_patience,
- *         .inner_algo = &patience,        // After subdivision, do Patience again.
- *         .fallback_algo = &myers_divide, // If subdivision failed, do Myers Divide et Impera.
- * };
- *
- * myers_divide = (struct diff_algo_config){
- *         .impl = diff_algo_myers_divide,
- *         .inner_algo = &myers,           // When division succeeded, start from the top.
- *                                         // (fallback_algo = NULL implies diff_algo_none).
- * };
- *
- * struct diff_config config = {
- *         .algo = &myers,
- *         ...
- * };
- * diff_main(&config, ...);
- */
-struct diff_algo_config {
-	diff_algo_impl_t impl;
-
-	/* Fail this algo if it would use more than this amount of memory, and instead use fallback_algo
-	 * (diff_algo_myers). permitted_state_size == 0 means no limitation. */
-	size_t permitted_state_size;
-
-	/* For algorithms that divide into smaller chunks, use this algorithm to solve the divided chunks. */
-	const struct diff_algo_config *inner_algo;
-
-	/* If the algorithm fails (e.g. diff_algo_myers_if_small needs too large state, or diff_algo_patience can't find
-	 * any common-unique atoms), then use this algorithm instead. */
-	const struct diff_algo_config *fallback_algo;
-};
-
-struct diff_config {
-	diff_atomize_func_t atomize_func;
-	void *atomize_func_data;
-
-	const struct diff_algo_config *algo;
-
-	/* How deep to step into subdivisions of a source file, a paranoia / safety measure to guard against infinite
-	 * loops through diff algorithms. When the maximum recursion is reached, employ diff_algo_none (i.e. remove all
-	 * left atoms and add all right atoms). */
-	unsigned int max_recursion_depth;
-};
-
-struct diff_result *diff_main(const struct diff_config *config,
-			      const uint8_t *left_data, size_t left_len,
-			      const uint8_t *right_data, size_t right_len);
-void diff_result_free(struct diff_result *result);
blob - 5d07e74d868098aab6819c41c9499f0f1b30040c (mode 644)
blob + /dev/null
--- include/diff/diff_output.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Diff output generators and invocation shims. */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <diff/diff_main.h>
-
-struct diff_input_info {
-	const char *arbitrary_info;
-	const char *left_path;
-	const char *right_path;
-};
-
-enum diff_rc diff_output_plain(FILE *dest, const struct diff_input_info *info,
-			       const struct diff_result *result);
-enum diff_rc diff_plain(FILE *dest, const struct diff_config *diff_config,
-			const struct diff_input_info *info,
-			const char *left, int left_len, const char *right, int right_len);
-
-enum diff_rc diff_output_unidiff(FILE *dest, const struct diff_input_info *info,
-				 const struct diff_result *result, unsigned int context_lines);
-enum diff_rc diff_unidiff(FILE *dest, const struct diff_config *diff_config,
-			  const struct diff_input_info *info,
-			  const char *left, int left_len, const char *right, int right_len,
-			  unsigned int context_lines);
-
-enum diff_rc diff_output_info(FILE *dest, const struct diff_input_info *info);
-void diff_output_lines(FILE *dest, const char *prefix, struct diff_atom *start_atom, unsigned int count);
blob - /dev/null
blob + ec47f65416456d6fa54228e53e8ccea1fd888fba (mode 644)
--- /dev/null
+++ debug.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#pragma once
+
+#define DEBUG 0
+
+#if DEBUG
+#include <stdio.h>
+#define print(args...) fprintf(stderr, ##args)
+#define debug print
+#define debug_dump dump
+#define debug_dump_atom dump_atom
+#define debug_dump_atoms dump_atoms
+
+static inline void dump_atom(const struct diff_data *left, const struct diff_data *right, const struct diff_atom *atom)
+{
+	if (!atom) {
+		print("NULL atom\n");
+		return;
+	}
+	if (left)
+		print(" %3ld", diff_atom_root_idx(left, atom));
+	if (right && atom->patience.pos_in_other)
+		print(" %3ld", diff_atom_root_idx(right, atom->patience.pos_in_other));
+
+	print(" %s%s '", atom->patience.unique_here ? "u" : " ", atom->patience.unique_in_both ? "c" : " ");
+	const char *s;
+	for (s = atom->at; s < (const char*)(atom->at + atom->len); s++) {
+		if (*s == '\r')
+			print("\\r");
+		else if (*s == '\n')
+			print("\\n");
+		else if ((*s < 32 || *s >= 127) && (*s != '\t'))
+			print("\\x%02x", *s);
+		else
+			print("%c", *s);
+	}
+	print("'\n");
+}
+
+static inline void dump_atoms(const struct diff_data *d, struct diff_atom *atom, unsigned int count)
+{
+	if (count > 42) {
+		dump_atoms(d, atom, 20);
+		print("[%u lines skipped]\n", count - 20 - 20);
+		dump_atoms(d, atom + count - 20, 20);
+		return;
+	} else {
+		struct diff_atom *i;
+		foreach_diff_atom(i, atom, count) {
+			dump_atom(d, NULL, i);
+		}
+	}
+}
+
+static inline void dump(struct diff_data *d)
+{
+	dump_atoms(d, d->atoms.head, d->atoms.len);
+}
+
+static inline void dump_myers_graph(const struct diff_data *l, const struct diff_data *r,
+				    int *kd)
+{
+	int x;
+	int y;
+	print("  ");
+	for (x = 0; x <= l->atoms.len; x++)
+		print("%2d", x);
+	print("\n");
+
+	for (y = 0; y <= r->atoms.len; y++) {
+		print("%2d ", y);
+		for (x = 0; x <= l->atoms.len; x++) {
+
+			/* print d advancements from kd, if any. */
+			int d = -1;
+			if (kd) {
+				int max = l->atoms.len + r->atoms.len;
+				size_t kd_len = max + 1 + max;
+				int *kd_pos = kd;
+				int di;
+#define xk_to_y(X, K) ((X) - (K))
+				for (di = 0; di < max; di++) {
+					int ki;
+					for (ki = di; ki >= -di; ki -= 2) {
+						if (x == kd_pos[ki]
+						    && y == xk_to_y(x, ki)) {
+							d = di;
+							break;
+						}
+					}
+					if (d >= 0)
+						break;
+					kd_pos += kd_len;
+				}
+			}
+			if (d >= 0)
+				print("%d", d);
+			else
+				print("o");
+			if (x < l->atoms.len && d < 10)
+				print("-");
+		}
+		print("\n");
+		if (y == r->atoms.len)
+			break;
+
+		print("   ");
+		for (x = 0; x < l->atoms.len; x++) {
+			if (diff_atom_same(&l->atoms.head[x], &r->atoms.head[y]))
+				print("|\\");
+			else
+				print("| ");
+		}
+		print("|\n");
+	}
+}
+
+static inline void debug_dump_myers_graph(const struct diff_data *l, const struct diff_data *r,
+				    int *kd)
+{
+	if (l->atoms.len > 99 || r->atoms.len > 99)
+		return;
+	dump_myers_graph(l, r, kd);
+}
+
+#else
+#define debug(args...)
+#define debug_dump(args...)
+#define debug_dump_atom(args...)
+#define debug_dump_atoms(args...)
+#define debug_dump_myers_graph(args...)
+#endif
blob - ec47f65416456d6fa54228e53e8ccea1fd888fba (mode 644)
blob + /dev/null
--- lib/debug.h
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#pragma once
-
-#define DEBUG 0
-
-#if DEBUG
-#include <stdio.h>
-#define print(args...) fprintf(stderr, ##args)
-#define debug print
-#define debug_dump dump
-#define debug_dump_atom dump_atom
-#define debug_dump_atoms dump_atoms
-
-static inline void dump_atom(const struct diff_data *left, const struct diff_data *right, const struct diff_atom *atom)
-{
-	if (!atom) {
-		print("NULL atom\n");
-		return;
-	}
-	if (left)
-		print(" %3ld", diff_atom_root_idx(left, atom));
-	if (right && atom->patience.pos_in_other)
-		print(" %3ld", diff_atom_root_idx(right, atom->patience.pos_in_other));
-
-	print(" %s%s '", atom->patience.unique_here ? "u" : " ", atom->patience.unique_in_both ? "c" : " ");
-	const char *s;
-	for (s = atom->at; s < (const char*)(atom->at + atom->len); s++) {
-		if (*s == '\r')
-			print("\\r");
-		else if (*s == '\n')
-			print("\\n");
-		else if ((*s < 32 || *s >= 127) && (*s != '\t'))
-			print("\\x%02x", *s);
-		else
-			print("%c", *s);
-	}
-	print("'\n");
-}
-
-static inline void dump_atoms(const struct diff_data *d, struct diff_atom *atom, unsigned int count)
-{
-	if (count > 42) {
-		dump_atoms(d, atom, 20);
-		print("[%u lines skipped]\n", count - 20 - 20);
-		dump_atoms(d, atom + count - 20, 20);
-		return;
-	} else {
-		struct diff_atom *i;
-		foreach_diff_atom(i, atom, count) {
-			dump_atom(d, NULL, i);
-		}
-	}
-}
-
-static inline void dump(struct diff_data *d)
-{
-	dump_atoms(d, d->atoms.head, d->atoms.len);
-}
-
-static inline void dump_myers_graph(const struct diff_data *l, const struct diff_data *r,
-				    int *kd)
-{
-	int x;
-	int y;
-	print("  ");
-	for (x = 0; x <= l->atoms.len; x++)
-		print("%2d", x);
-	print("\n");
-
-	for (y = 0; y <= r->atoms.len; y++) {
-		print("%2d ", y);
-		for (x = 0; x <= l->atoms.len; x++) {
-
-			/* print d advancements from kd, if any. */
-			int d = -1;
-			if (kd) {
-				int max = l->atoms.len + r->atoms.len;
-				size_t kd_len = max + 1 + max;
-				int *kd_pos = kd;
-				int di;
-#define xk_to_y(X, K) ((X) - (K))
-				for (di = 0; di < max; di++) {
-					int ki;
-					for (ki = di; ki >= -di; ki -= 2) {
-						if (x == kd_pos[ki]
-						    && y == xk_to_y(x, ki)) {
-							d = di;
-							break;
-						}
-					}
-					if (d >= 0)
-						break;
-					kd_pos += kd_len;
-				}
-			}
-			if (d >= 0)
-				print("%d", d);
-			else
-				print("o");
-			if (x < l->atoms.len && d < 10)
-				print("-");
-		}
-		print("\n");
-		if (y == r->atoms.len)
-			break;
-
-		print("   ");
-		for (x = 0; x < l->atoms.len; x++) {
-			if (diff_atom_same(&l->atoms.head[x], &r->atoms.head[y]))
-				print("|\\");
-			else
-				print("| ");
-		}
-		print("|\n");
-	}
-}
-
-static inline void debug_dump_myers_graph(const struct diff_data *l, const struct diff_data *r,
-				    int *kd)
-{
-	if (l->atoms.len > 99 || r->atoms.len > 99)
-		return;
-	dump_myers_graph(l, r, kd);
-}
-
-#else
-#define debug(args...)
-#define debug_dump(args...)
-#define debug_dump_atom(args...)
-#define debug_dump_atoms(args...)
-#define debug_dump_myers_graph(args...)
-#endif
blob - 964c1353da1b055ae761ae6961c45ea19e16d032 (mode 644)
blob + /dev/null
--- lib/diff_atomize_text.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/* Split source by line breaks, and calculate a simplistic checksum. */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <diff/diff_main.h>
-
-static int diff_data_atomize_text_lines(struct diff_data *d)
-{
-	const uint8_t *pos = d->data;
-	const uint8_t *end = pos + d->len;
-	enum diff_rc rc;
-
-	unsigned int array_size_estimate = d->len / 50;
-	unsigned int pow2 = 1;
-	while (array_size_estimate >>= 1)
-		pow2++;
-
-	ARRAYLIST_INIT(d->atoms, 1 << pow2);
-
-	while (pos < end) {
-		const uint8_t *line_end = pos;
-		unsigned int hash = 0;
-
-		while (line_end < end && *line_end != '\r' && *line_end != '\n') {
-			hash = hash * 23 + *line_end;
-			line_end++;
-		}
-
-		/* When not at the end of data, the line ending char ('\r' or '\n') must follow */
-		if (line_end < end)
-			line_end++;
-		/* If that was an '\r', also pull in any following '\n' */
-		if (line_end[0] == '\r' && line_end < end && line_end[1] == '\n')
-			line_end++;
-
-		/* Record the found line as diff atom */
-		struct diff_atom *atom;
-		ARRAYLIST_ADD(atom, d->atoms);
-		if (!atom)
-			return DIFF_RC_ENOMEM;
-
-		*atom = (struct diff_atom){
-			.at = pos,
-			.len = line_end - pos,
-			.hash = hash,
-		};
-
-		/* Starting point for next line: */
-		pos = line_end;
-	}
-
-	return DIFF_RC_OK;
-}
-
-enum diff_rc diff_atomize_text_by_line(void *func_data, struct diff_data *left, struct diff_data *right)
-{
-	enum diff_rc rc;
-	rc = diff_data_atomize_text_lines(left);
-	if (rc != DIFF_RC_OK)
-		return rc;
-	return diff_data_atomize_text_lines(right);
-}
blob - 285bd40d5941fe84c5d9a9bd437a4a9f981ead13 (mode 644)
blob + /dev/null
--- lib/diff_main.c
+++ /dev/null
@@ -1,246 +0,0 @@
-/* Generic infrastructure to implement various diff algorithms (implementation). */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-
-#include <sys/queue.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <stdio.h>
-#include <string.h>
-#include <limits.h>
-
-#include <assert.h>
-
-#include <diff/diff_main.h>
-
-#include "debug.h"
-
-/* Even if a left or right side is empty, diff output may need to know the position in that file.
- * So left_start or right_start must never be NULL -- pass left_count or right_count as zero to indicate staying at that
- * position without consuming any lines. */
-struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved,
-					struct diff_atom *left_start, unsigned int left_count,
-					struct diff_atom *right_start, unsigned int right_count)
-{
-	struct diff_chunk *chunk;
-	diff_chunk_arraylist_t *result;
-
-	if (solved && !state->temp_result.len)
-		result = &state->result->chunks;
-	else
-		result = &state->temp_result;
-
-	ARRAYLIST_ADD(chunk, *result);
-	if (!chunk)
-		return NULL;
-	*chunk = (struct diff_chunk){
-		.solved = solved,
-		.left_start = left_start,
-		.left_count = left_count,
-		.right_start = right_start,
-		.right_count = right_count,
-	};
-
-	debug("Add %s chunk:\n", chunk->solved ? "solved" : "UNSOLVED");
-	debug("L\n");
-	debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count);
-	debug("R\n");
-	debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count);
-	return chunk;
-}
-
-void diff_data_init_root(struct diff_data *d, const uint8_t *data, size_t len)
-{
-	*d = (struct diff_data){
-		.data = data,
-		.len = len,
-		.root = d,
-	};
-}
-
-void diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
-			       struct diff_atom *from_atom, unsigned int atoms_count)
-{
-	struct diff_atom *last_atom = from_atom + atoms_count - 1;
-	*d = (struct diff_data){
-		.data = from_atom->at,
-		.len = (last_atom->at + last_atom->len) - from_atom->at,
-		.root = parent->root,
-		.atoms.head = from_atom,
-		.atoms.len = atoms_count,
-	};
-
-	debug("subsection:\n");
-	debug_dump(d);
-}
-
-void diff_data_free(struct diff_data *diff_data)
-{
-	if (!diff_data)
-		return;
-	if (diff_data->atoms.allocated)
-		ARRAYLIST_FREE(diff_data->atoms);
-}
-
-enum diff_rc diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state)
-{
-	debug("\n** %s\n", __func__);
-	debug("left:\n");
-	debug_dump(&state->left);
-	debug("right:\n");
-	debug_dump(&state->right);
-	debug_dump_myers_graph(&state->left, &state->right, NULL);
-
-	/* Add a chunk of equal lines, if any */
-	unsigned int equal_atoms = 0;
-	while (equal_atoms < state->left.atoms.len && equal_atoms < state->right.atoms.len
-	       && diff_atom_same(&state->left.atoms.head[equal_atoms], &state->right.atoms.head[equal_atoms]))
-		equal_atoms++;
-	if (equal_atoms) {
-		if (!diff_state_add_chunk(state, true,
-					  &state->left.atoms.head[0], equal_atoms,
-					  &state->right.atoms.head[0], equal_atoms))
-			return DIFF_RC_ENOMEM;
-	}
-
-	/* Add a "minus" chunk with all lines from the left. */
-	if (equal_atoms < state->left.atoms.len) {
-		if (!diff_state_add_chunk(state, true,
-					  &state->left.atoms.head[equal_atoms],
-					  state->left.atoms.len - equal_atoms,
-					  NULL, 0))
-		    return DIFF_RC_ENOMEM;
-	}
-
-	/* Add a "plus" chunk with all lines from the right. */
-	if (equal_atoms < state->right.atoms.len) {
-		if (!diff_state_add_chunk(state, true,
-					  NULL, 0,
-					  &state->right.atoms.head[equal_atoms],
-					  state->right.atoms.len - equal_atoms))
-		return DIFF_RC_ENOMEM;
-	}
-	return DIFF_RC_OK;
-}
-
-enum diff_rc diff_run_algo(const struct diff_algo_config *algo_config, struct diff_state *state)
-{
-	enum diff_rc rc;
-	ARRAYLIST_FREE(state->temp_result);
-
-	if (!algo_config || !algo_config->impl || !state->recursion_depth_left) {
-		debug("MAX RECURSION REACHED, just dumping diff chunks\n");
-		return diff_algo_none(algo_config, state);
-	}
-
-	ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE);
-	rc = algo_config->impl(algo_config, state);
-	switch (rc) {
-	case DIFF_RC_USE_DIFF_ALGO_FALLBACK:
-		debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n", algo_config->fallback_algo);
-		rc = diff_run_algo(algo_config->fallback_algo, state);
-		goto return_rc;
-
-	case DIFF_RC_OK:
-		/* continue below */
-		break;
-
-	default:
-		/* some error happened */
-		goto return_rc;
-	}
-
-	/* Pick up any diff chunks that are still unsolved and feed to inner_algo.
-	 * inner_algo will solve unsolved chunks and append to result,
-	 * and subsequent solved chunks on this level are then appended to result afterwards. */
-	int i;
-	for (i = 0; i < state->temp_result.len; i++) {
-		struct diff_chunk *c = &state->temp_result.head[i];
-		if (c->solved) {
-			struct diff_chunk *final_c;
-			ARRAYLIST_ADD(final_c, state->result->chunks);
-			if (!final_c) {
-				rc = DIFF_RC_ENOMEM;
-				goto return_rc;
-			}
-			*final_c = *c;
-			continue;
-		}
-
-		/* c is an unsolved chunk, feed to inner_algo */
-		struct diff_state inner_state = {
-			.result = state->result,
-			.recursion_depth_left = state->recursion_depth_left - 1,
-		};
-		diff_data_init_subsection(&inner_state.left, &state->left, c->left_start, c->left_count);
-		diff_data_init_subsection(&inner_state.right, &state->right, c->right_start, c->right_count);
-
-		rc = diff_run_algo(algo_config->inner_algo, &inner_state);
-
-		if (rc != DIFF_RC_OK)
-			goto return_rc;
-	}
-
-	rc = DIFF_RC_OK;
-return_rc:
-	ARRAYLIST_FREE(state->temp_result);
-	return rc;
-}
-
-struct diff_result *diff_main(const struct diff_config *config,
-			      const uint8_t *left_data, size_t left_len,
-			      const uint8_t *right_data, size_t right_len)
-{
-	struct diff_result *result = malloc(sizeof(struct diff_result));
-	if (!result)
-		return NULL;
-
-	*result = (struct diff_result){};
-	diff_data_init_root(&result->left, left_data, left_len);
-	diff_data_init_root(&result->right, right_data, right_len);
-
-	if (!config->atomize_func) {
-		result->rc = DIFF_RC_EINVAL;
-		return result;
-	}
-
-	result->rc = config->atomize_func(config->atomize_func_data, &result->left, &result->right);
-	if (result->rc != DIFF_RC_OK)
-		return result;
-
-	struct diff_state state = {
-		.result = result,
-		.recursion_depth_left = config->max_recursion_depth ? : 1024,
-	};
-	diff_data_init_subsection(&state.left, &result->left, result->left.atoms.head, result->left.atoms.len);
-	diff_data_init_subsection(&state.right, &result->right, result->right.atoms.head, result->right.atoms.len);
-
-	result->rc = diff_run_algo(config->algo, &state);
-
-	return result;
-}
-
-void diff_result_free(struct diff_result *result)
-{
-	if (!result)
-		return;
-	diff_data_free(&result->left);
-	diff_data_free(&result->right);
-	ARRAYLIST_FREE(result->chunks);
-	free(result);
-}
blob - cba927a4a5813ed3a1afa1247227a9637271e176 (mode 644)
blob + /dev/null
--- lib/diff_myers.c
+++ /dev/null
@@ -1,1043 +0,0 @@
-/* Myers diff algorithm implementation, invented by Eugene W. Myers [1].
- * Implementations of both the Myers Divide Et Impera (using linear space)
- * and the canonical Myers algorithm (using quadratic space). */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <diff/diff_main.h>
-
-#include "debug.h"
-
-/* Myers' diff algorithm [1] is nicely explained in [2].
- * [1] http://www.xmailserver.org/diff2.pdf
- * [2] https://blog.jcoglan.com/2017/02/12/the-myers-diff-algorithm-part-1/ ff.
- *
- * Myers approaches finding the smallest diff as a graph problem.
- * The crux is that the original algorithm requires quadratic amount of memory:
- * both sides' lengths added, and that squared. So if we're diffing lines of text, two files with 1000 lines each would
- * blow up to a matrix of about 2000 * 2000 ints of state, about 16 Mb of RAM to figure out 2 kb of text.
- * The solution is using Myers' "divide and conquer" extension algorithm, which does the original traversal from both
- * ends of the files to reach a middle where these "snakes" touch, hence does not need to backtrace the traversal, and
- * so gets away with only keeping a single column of that huge state matrix in memory.
- *
- * Todo: the divide and conquer requires linear *space*, not necessarily linear *time*. It recurses, apparently doing
- * multiple Myers passes, and also it apparently favors fragmented diffs in cases where chunks of text were moved to a
- * different place. Up to a given count of diff atoms (text lines), it might be desirable to accept the quadratic memory
- * usage, get nicer diffs and less re-iteration of the same data?
- */
-
-struct diff_box {
-	unsigned int left_start;
-	unsigned int left_end;
-	unsigned int right_start;
-	unsigned int right_end;
-};
-
-#define diff_box_empty(DIFF_SNAKE) ((DIFF_SNAKE)->left_end == 0)
-
-
-/* If the two contents of a file are A B C D E and X B C Y,
- * the Myers diff graph looks like:
- *
- *   k0  k1
- *    \   \
- * k-1     0 1 2 3 4 5
- *   \      A B C D E
- *     0   o-o-o-o-o-o
- *      X  | | | | | |
- *     1   o-o-o-o-o-o
- *      B  | |\| | | |
- *     2   o-o-o-o-o-o
- *      C  | | |\| | |
- *     3   o-o-o-o-o-o
- *      Y  | | | | | |\
- *     4   o-o-o-o-o-o c1
- *                  \ \
- *                 c-1 c0
- *
- * Moving right means delete an atom from the left-hand-side,
- * Moving down means add an atom from the right-hand-side.
- * Diagonals indicate identical atoms on both sides, the challenge is to use as many diagonals as possible.
- *
- * The original Myers algorithm walks all the way from the top left to the bottom right, remembers all steps, and then
- * backtraces to find the shortest path. However, that requires keeping the entire graph in memory, which needs
- * quadratic space.
- *
- * Myers adds a variant that uses linear space -- note, not linear time, only linear space: walk forward and backward,
- * find a meeting point in the middle, and recurse on the two separate sections. This is called "divide and conquer".
- *
- * d: the step number, starting with 0, a.k.a. the distance from the starting point.
- * k: relative index in the state array for the forward scan, indicating on which diagonal through the diff graph we
- *    currently are.
- * c: relative index in the state array for the backward scan, indicating the diagonal number from the bottom up.
- *
- * The "divide and conquer" traversal through the Myers graph looks like this:
- *
- *      | d=   0   1   2   3      2   1   0
- *  ----+--------------------------------------------
- *  k=  |                                      c=
- *   4  |                                       3
- *      |
- *   3  |                 3,0    5,2            2
- *      |                /          \
- *   2  |             2,0            5,3        1
- *      |            /                 \
- *   1  |         1,0     4,3 >= 4,3    5,4<--  0
- *      |        /       /          \  /
- *   0  |  -->0,0     3,3            4,4       -1
- *      |        \   /              /
- *  -1  |         0,1     1,2    3,4           -2
- *      |            \   /
- *  -2  |             0,2                      -3
- *      |                \
- *      |                 0,3
- *      |  forward->                 <-backward
- *
- * x,y pairs here are the coordinates in the Myers graph:
- * x = atom index in left-side source, y = atom index in the right-side source.
- *
- * Only one forward column and one backward column are kept in mem, each need at most left.len + 1 + right.len items.
- * Note that each d step occupies either the even or the odd items of a column: if e.g. the previous column is in the
- * odd items, the next column is formed in the even items, without overwriting the previous column's results.
- *
- * Also note that from the diagonal index k and the x coordinate, the y coordinate can be derived:
- *    y = x - k
- * Hence the state array only needs to keep the x coordinate, i.e. the position in the left-hand file, and the y
- * coordinate, i.e. position in the right-hand file, is derived from the index in the state array.
- *
- * The two traces meet at 4,3, the first step (here found in the forward traversal) where a forward position is on or
- * past a backward traced position on the same diagonal.
- *
- * This divides the problem space into:
- *
- *         0 1 2 3 4 5
- *          A B C D E
- *     0   o-o-o-o-o
- *      X  | | | | |
- *     1   o-o-o-o-o
- *      B  | |\| | |
- *     2   o-o-o-o-o
- *      C  | | |\| |
- *     3   o-o-o-o-*-o   *: forward and backward meet here
- *      Y          | |
- *     4           o-o
- *
- * Doing the same on each section lead to:
- *
- *         0 1 2 3 4 5
- *          A B C D E
- *     0   o-o
- *      X  | |
- *     1   o-b    b: backward d=1 first reaches here (sliding up the snake)
- *      B     \   f: then forward d=2 reaches here (sliding down the snake)
- *     2       o     As result, the box from b to f is found to be identical;
- *      C       \    leaving a top box from 0,0 to 1,1 and a bottom trivial tail 3,3 to 4,3.
- *     3         f-o
- *
- *     3           o-*
- *      Y            |
- *     4             o   *: forward and backward meet here
- *
- * and solving the last top left box gives:
- *
- *         0 1 2 3 4 5
- *          A B C D E           -A
- *     0   o-o                  +X
- *      X    |                   B
- *     1     o                   C
- *      B     \                 -D
- *     2       o                -E
- *      C       \               +Y
- *     3         o-o-o
- *      Y            |
- *     4             o
- *
- */
-
-#define xk_to_y(X, K) ((X) - (K))
-#define xc_to_y(X, C, DELTA) ((X) - (C) + (DELTA))
-#define k_to_c(K, DELTA) ((K) + (DELTA))
-#define c_to_k(C, DELTA) ((C) - (DELTA))
-
-/* Do one forwards step in the "divide and conquer" graph traversal.
- * left: the left side to diff.
- * right: the right side to diff against.
- * kd_forward: the traversal state for forwards traversal, modified by this function.
- *             This is carried over between invocations with increasing d.
- *             kd_forward points at the center of the state array, allowing negative indexes.
- * kd_backward: the traversal state for backwards traversal, to find a meeting point.
- *              Since forwards is done first, kd_backward will be valid for d - 1, not d.
- *              kd_backward points at the center of the state array, allowing negative indexes.
- * d: Step or distance counter, indicating for what value of d the kd_forward should be populated.
- *    For d == 0, kd_forward[0] is initialized, i.e. the first invocation should be for d == 0.
- * meeting_snake: resulting meeting point, if any.
- */
-static void diff_divide_myers_forward(struct diff_data *left, struct diff_data *right,
-				      int *kd_forward, int *kd_backward, int d,
-				      struct diff_box *meeting_snake)
-{
-	int delta = (int)right->atoms.len - (int)left->atoms.len;
-	int prev_x;
-	int prev_y;
-	int k;
-	int x;
-
-	debug("-- %s d=%d\n", __func__, d);
-	debug_dump_myers_graph(left, right, NULL);
-
-	for (k = d; k >= -d; k -= 2) {
-		if (k < -(int)right->atoms.len || k > (int)left->atoms.len) {
-			/* This diagonal is completely outside of the Myers graph, don't calculate it. */
-			if (k < -(int)right->atoms.len)
-				debug(" %d k < -(int)right->atoms.len %d\n", k, -(int)right->atoms.len);
-			else
-				debug(" %d k > left->atoms.len %d\n", k, left->atoms.len);
-			if (k < 0) {
-				/* We are traversing negatively, and already below the entire graph, nothing will come
-				 * of this. */
-				debug(" break");
-				break;
-			}
-			debug(" continue");
-			continue;
-		}
-		debug("- k = %d\n", k);
-		if (d == 0) {
-			/* This is the initializing step. There is no prev_k yet, get the initial x from the top left of
-			 * the Myers graph. */
-			x = 0;
-		}
-		/* Favoring "-" lines first means favoring moving rightwards in the Myers graph.
-		 * For this, all k should derive from k - 1, only the bottom most k derive from k + 1:
-		 *
-		 *      | d=   0   1   2
-		 *  ----+----------------
-		 *  k=  |
-		 *   2  |             2,0 <-- from prev_k = 2 - 1 = 1
-		 *      |            /
-		 *   1  |         1,0
-		 *      |        /
-		 *   0  |  -->0,0     3,3
-		 *      |       \\   /
-		 *  -1  |         0,1 <-- bottom most for d=1 from prev_k = -1 + 1 = 0
-		 *      |           \\
-		 *  -2  |             0,2 <-- bottom most for d=2 from prev_k = -2 + 1 = -1
-		 *
-		 * Except when a k + 1 from a previous run already means a further advancement in the graph.
-		 * If k == d, there is no k + 1 and k - 1 is the only option.
-		 * If k < d, use k + 1 in case that yields a larger x. Also use k + 1 if k - 1 is outside the graph.
-		 */
-		else if (k > -d && (k == d
-				    || (k - 1 >= -(int)right->atoms.len
-					&& kd_forward[k - 1] >= kd_forward[k + 1]))) {
-			/* Advance from k - 1.
-			 * From position prev_k, step to the right in the Myers graph: x += 1.
-			 */
-			int prev_k = k - 1;
-			prev_x = kd_forward[prev_k];
-			prev_y = xk_to_y(prev_x, prev_k);
-			x = prev_x + 1;
-		} else {
-			/* The bottom most one.
-			 * From position prev_k, step to the bottom in the Myers graph: y += 1.
-			 * Incrementing y is achieved by decrementing k while keeping the same x.
-			 * (since we're deriving y from y = x - k).
-			 */
-			int prev_k = k + 1;
-			prev_x = kd_forward[prev_k];
-			prev_y = xk_to_y(prev_x, prev_k);
-			x = prev_x;
-		}
-
-		/* Slide down any snake that we might find here. */
-		while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len
-		       && diff_atom_same(&left->atoms.head[x], &right->atoms.head[xk_to_y(x, k)]))
-		       x++;
-		kd_forward[k] = x;
-
-		if (DEBUG) {
-			int fi;
-			for (fi = d; fi >= k; fi--) {
-				debug("kd_forward[%d] = (%d, %d)\n", fi, kd_forward[fi], kd_forward[fi] - fi);
-				/*
-				if (kd_forward[fi] >= 0 && kd_forward[fi] < left->atoms.len)
-					debug_dump_atom(left, right, &left->atoms.head[kd_forward[fi]]);
-				else
-					debug("\n");
-				if (kd_forward[fi]-fi >= 0 && kd_forward[fi]-fi < right->atoms.len)
-					debug_dump_atom(right, left, &right->atoms.head[kd_forward[fi]-fi]);
-				else
-					debug("\n");
-				*/
-			}
-		}
-
-		if (x < 0 || x > left->atoms.len
-		    || xk_to_y(x, k) < 0 || xk_to_y(x, k) > right->atoms.len)
-			continue;
-
-		/* Figured out a new forwards traversal, see if this has gone onto or even past a preceding backwards
-		 * traversal.
-		 *
-		 * If the delta in length is odd, then d and backwards_d hit the same state indexes:
-		 *      | d=   0   1   2      1   0
-		 *  ----+----------------    ----------------
-		 *  k=  |                              c=
-		 *   4  |                               3
-		 *      |
-		 *   3  |                               2
-		 *      |                same
-		 *   2  |             2,0====5,3        1
-		 *      |            /          \
-		 *   1  |         1,0            5,4<-- 0
-		 *      |        /              /
-		 *   0  |  -->0,0     3,3====4,4       -1
-		 *      |        \   /
-		 *  -1  |         0,1                  -2
-		 *      |            \
-		 *  -2  |             0,2              -3
-		 *      |
-		 *
-		 * If the delta is even, they end up off-by-one, i.e. on different diagonals:
-		 *
-		 *      | d=   0   1   2    1   0
-		 *  ----+----------------  ----------------
-		 *      |                            c=
-		 *   3  |                             3
-		 *      |
-		 *   2  |             2,0 off         2
-		 *      |            /   \\
-		 *   1  |         1,0      4,3        1
-		 *      |        /       //   \
-		 *   0  |  -->0,0     3,3      4,4<-- 0
-		 *      |        \   /        /
-		 *  -1  |         0,1      3,4       -1
-		 *      |            \   //
-		 *  -2  |             0,2            -2
-		 *      |
-		 *
-		 * So in the forward path, we can only match up diagonals when the delta is odd.
-		 */
-		 /* Forwards is done first, so the backwards one was still at d - 1. Can't do this for d == 0. */
-		int backwards_d = d - 1;
-		if ((delta & 1) && (backwards_d >= 0)) {
-			debug("backwards_d = %d\n", backwards_d);
-
-			/* If both sides have the same length, forward and backward start on the same diagonal, meaning the
-			 * backwards state index c == k.
-			 * As soon as the lengths are not the same, the backwards traversal starts on a different diagonal, and
-			 * c = k shifted by the difference in length.
-			 */
-			int c = k_to_c(k, delta);
-
-			/* When the file sizes are very different, the traversal trees start on far distant diagonals.
-			 * They don't necessarily meet straight on. See whether this forward value is on a diagonal that
-			 * is also valid in kd_backward[], and match them if so. */
-			if (c >= -backwards_d && c <= backwards_d) {
-				/* Current k is on a diagonal that exists in kd_backward[]. If the two x positions have
-				 * met or passed (forward walked onto or past backward), then we've found a midpoint / a
-				 * mid-box.
-				 *
-				 * But we need to avoid matching a situation like this:
-				 *       0  1
-				 *        x y
-				 *   0   o-o-o
-				 *     x |\| |
-				 *   1   o-o-o
-				 *     y | |\|
-				 *   2  (B)o-o  <--(B) backwards traversal reached here
-				 *     a | | |
-				 *   3   o-o-o<-- prev_x, prev_y
-				 *     b | | |
-				 *   4   o-o(F) <--(F) forwards traversal  reached here
-				 *     x |\| |     Now both are on the same diagonal and look like they passed,
-				 *   5   o-o-o     but actually they have sneaked past each other and have not met.
-				 *     y | |\|
-				 *   6   o-o-o
-				 *
-				 * The solution is to notice that prev_x,prev_y were also already past (B).
-				 */
-				int backward_x = kd_backward[c];
-				int backward_y = xc_to_y(backward_x, c, delta);
-				debug(" prev_x,y = (%d,%d)  c%d:backward_x,y = (%d,%d)  k%d:x,y = (%d,%d)\n",
-				      prev_x, prev_y, c, backward_x, backward_y, k, x, xk_to_y(x, k));
-				if (prev_x <= backward_x && prev_y <= backward_y
-				    && x >= backward_x) {
-					*meeting_snake = (struct diff_box){
-						.left_start = backward_x,
-						.left_end = x,
-						.right_start = xc_to_y(backward_x, c, delta),
-						.right_end = xk_to_y(x, k),
-					};
-					debug("HIT x=(%u,%u) - y=(%u,%u)\n",
-					      meeting_snake->left_start,
-					      meeting_snake->right_start,
-					      meeting_snake->left_end,
-					      meeting_snake->right_end);
-					return;
-				}
-			}
-		}
-	}
-}
-
-/* Do one backwards step in the "divide and conquer" graph traversal.
- * left: the left side to diff.
- * right: the right side to diff against.
- * kd_forward: the traversal state for forwards traversal, to find a meeting point.
- *             Since forwards is done first, after this, both kd_forward and kd_backward will be valid for d.
- *             kd_forward points at the center of the state array, allowing negative indexes.
- * kd_backward: the traversal state for backwards traversal, to find a meeting point.
- *              This is carried over between invocations with increasing d.
- *              kd_backward points at the center of the state array, allowing negative indexes.
- * d: Step or distance counter, indicating for what value of d the kd_backward should be populated.
- *    Before the first invocation, kd_backward[0] shall point at the bottom right of the Myers graph
- *    (left.len, right.len).
- *    The first invocation will be for d == 1.
- * meeting_snake: resulting meeting point, if any.
- */
-static void diff_divide_myers_backward(struct diff_data *left, struct diff_data *right,
-				       int *kd_forward, int *kd_backward, int d,
-				       struct diff_box *meeting_snake)
-{
-	int delta = (int)right->atoms.len - (int)left->atoms.len;
-	int prev_x;
-	int prev_y;
-	int c;
-	int x;
-
-	debug("-- %s d=%d\n", __func__, d);
-	debug_dump_myers_graph(left, right, NULL);
-
-	for (c = d; c >= -d; c -= 2) {
-		if (c < -(int)left->atoms.len || c > (int)right->atoms.len) {
-			/* This diagonal is completely outside of the Myers graph, don't calculate it. */
-			if (c < -(int)left->atoms.len)
-				debug(" %d c < -(int)left->atoms.len %d\n", c, -(int)left->atoms.len);
-			else
-				debug(" %d c > right->atoms.len %d\n", c, right->atoms.len);
-			if (c < 0) {
-				/* We are traversing negatively, and already below the entire graph, nothing will come
-				 * of this. */
-				debug(" break");
-				break;
-			}
-			debug(" continue");
-			continue;
-		}
-		debug("- c = %d\n", c);
-		if (d == 0) {
-			/* This is the initializing step. There is no prev_c yet, get the initial x from the bottom
-			 * right of the Myers graph. */
-			x = left->atoms.len;
-		}
-		/* Favoring "-" lines first means favoring moving rightwards in the Myers graph.
-		 * For this, all c should derive from c - 1, only the bottom most c derive from c + 1:
-		 *
-		 *                                  2   1   0
-		 *  ---------------------------------------------------
-		 *                                               c=
-		 *                                                3
-		 *
-		 *         from prev_c = c - 1 --> 5,2            2
-		 *                                    \
-		 *                                     5,3        1
-		 *                                        \
-		 *                                 4,3     5,4<-- 0
-		 *                                    \   /
-		 *  bottom most for d=1 from c + 1 --> 4,4       -1
-		 *                                    /
-		 *         bottom most for d=2 --> 3,4           -2
-		 *
-		 * Except when a c + 1 from a previous run already means a further advancement in the graph.
-		 * If c == d, there is no c + 1 and c - 1 is the only option.
-		 * If c < d, use c + 1 in case that yields a larger x. Also use c + 1 if c - 1 is outside the graph.
-		 */
-		else if (c > -d && (c == d
-				    || (c - 1 >= -(int)right->atoms.len
-					&& kd_backward[c - 1] <= kd_backward[c + 1]))) {
-			/* A top one.
-			 * From position prev_c, step upwards in the Myers graph: y -= 1.
-			 * Decrementing y is achieved by incrementing c while keeping the same x.
-			 * (since we're deriving y from y = x - c + delta).
-			 */
-			int prev_c = c - 1;
-			prev_x = kd_backward[prev_c];
-			prev_y = xc_to_y(prev_x, prev_c, delta);
-			x = prev_x;
-		} else {
-			/* The bottom most one.
-			 * From position prev_c, step to the left in the Myers graph: x -= 1.
-			 */
-			int prev_c = c + 1;
-			prev_x = kd_backward[prev_c];
-			prev_y = xc_to_y(prev_x, prev_c, delta);
-			x = prev_x - 1;
-		}
-
-		/* Slide up any snake that we might find here. */
-		debug("c=%d x-1=%d Yb-1=%d-1=%d\n", c, x-1, xc_to_y(x, c, delta), xc_to_y(x, c, delta)-1);
-		if (x > 0) {
-			debug("  l="); debug_dump_atom(left, right, &left->atoms.head[x-1]);
-		}
-		if (xc_to_y(x, c, delta) > 0) {
-			debug("   r="); debug_dump_atom(right, left, &right->atoms.head[xc_to_y(x, c, delta)-1]);
-		}
-		while (x > 0 && xc_to_y(x, c, delta) > 0
-		       && diff_atom_same(&left->atoms.head[x-1], &right->atoms.head[xc_to_y(x, c, delta)-1]))
-		       x--;
-		kd_backward[c] = x;
-
-		if (DEBUG) {
-			int fi;
-			for (fi = d; fi >= c; fi--) {
-				debug("kd_backward[%d] = (%d, %d)\n", fi, kd_backward[fi],
-				      kd_backward[fi] - fi + delta);
-				/*
-				if (kd_backward[fi] >= 0 && kd_backward[fi] < left->atoms.len)
-					debug_dump_atom(left, right, &left->atoms.head[kd_backward[fi]]);
-				else
-					debug("\n");
-				if (kd_backward[fi]-fi+delta >= 0 && kd_backward[fi]-fi+delta < right->atoms.len)
-					debug_dump_atom(right, left, &right->atoms.head[kd_backward[fi]-fi+delta]);
-				else
-					debug("\n");
-				*/
-			}
-		}
-
-		if (x < 0 || x > left->atoms.len
-		    || xc_to_y(x, c, delta) < 0 || xc_to_y(x, c, delta) > right->atoms.len)
-			continue;
-
-		/* Figured out a new backwards traversal, see if this has gone onto or even past a preceding forwards
-		 * traversal.
-		 *
-		 * If the delta in length is even, then d and backwards_d hit the same state indexes -- note how this is
-		 * different from in the forwards traversal, because now both d are the same:
-		 *
-		 *      | d=   0   1   2      2   1   0
-		 *  ----+----------------    --------------------
-		 *  k=  |                                  c=
-		 *   4  |
-		 *      |
-		 *   3  |                                   3
-		 *      |                same
-		 *   2  |             2,0====5,2            2
-		 *      |            /          \
-		 *   1  |         1,0            5,3        1
-		 *      |        /              /  \
-		 *   0  |  -->0,0     3,3====4,3    5,4<--  0
-		 *      |        \   /             /
-		 *  -1  |         0,1            4,4       -1
-		 *      |            \
-		 *  -2  |             0,2                  -2
-		 *      |
-		 *                                      -3
-		 * If the delta is odd, they end up off-by-one, i.e. on different diagonals.
-		 * So in the backward path, we can only match up diagonals when the delta is even.
-		 */
-		if ((delta & 1) == 0) {
-			/* Forwards was done first, now both d are the same. */
-			int forwards_d = d;
-
-			/* As soon as the lengths are not the same, the backwards traversal starts on a different diagonal, and
-			 * c = k shifted by the difference in length.
-			 */
-			int k = c_to_k(c, delta);
-
-			/* When the file sizes are very different, the traversal trees start on far distant diagonals.
-			 * They don't necessarily meet straight on. See whether this backward value is also on a valid
-			 * diagonal in kd_forward[], and match them if so. */
-			if (k >= -forwards_d && k <= forwards_d) {
-				/* Current c is on a diagonal that exists in kd_forward[]. If the two x positions have
-				 * met or passed (backward walked onto or past forward), then we've found a midpoint / a
-				 * mid-box. */
-				int forward_x = kd_forward[k];
-				int forward_y = xk_to_y(forward_x, k);
-				debug("Compare %d to %d  k=%d  (x=%d,y=%d) to (x=%d,y=%d)\n",
-				      forward_x, x, k,
-				      forward_x, xk_to_y(forward_x, k), x, xc_to_y(x, c, delta));
-				if (forward_x <= prev_x && forward_y <= prev_y
-				    && forward_x >= x) {
-					*meeting_snake = (struct diff_box){
-						.left_start = x,
-						.left_end = forward_x,
-						.right_start = xc_to_y(x, c, delta),
-						.right_end = xk_to_y(forward_x, k),
-					};
-					debug("HIT x=%u,%u - y=%u,%u\n",
-					      meeting_snake->left_start,
-					      meeting_snake->right_start,
-					      meeting_snake->left_end,
-					      meeting_snake->right_end);
-					return;
-				}
-			}
-		}
-	}
-}
-
-/* Myers "Divide et Impera": tracing forwards from the start and backwards from the end to find a midpoint that divides
- * the problem into smaller chunks. Requires only linear amounts of memory. */
-enum diff_rc diff_algo_myers_divide(const struct diff_algo_config *algo_config, struct diff_state *state)
-{
-	enum diff_rc rc = DIFF_RC_ENOMEM;
-	struct diff_data *left = &state->left;
-	struct diff_data *right = &state->right;
-
-	debug("\n** %s\n", __func__);
-	debug("left:\n");
-	debug_dump(left);
-	debug("right:\n");
-	debug_dump(right);
-	debug_dump_myers_graph(left, right, NULL);
-
-	/* Allocate two columns of a Myers graph, one for the forward and one for the backward traversal. */
-	unsigned int max = left->atoms.len + right->atoms.len;
-	size_t kd_len = max + 1;
-	size_t kd_buf_size = kd_len << 1;
-	int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int));
-	if (!kd_buf)
-		return DIFF_RC_ENOMEM;
-	int i;
-	for (i = 0; i < kd_buf_size; i++)
-		kd_buf[i] = -1;
-	int *kd_forward = kd_buf;
-	int *kd_backward = kd_buf + kd_len;
-
-	/* The 'k' axis in Myers spans positive and negative indexes, so point the kd to the middle.
-	 * It is then possible to index from -max/2 .. max/2. */
-	kd_forward += max/2;
-	kd_backward += max/2;
-
-	int d;
-	struct diff_box mid_snake = {};
-	for (d = 0; d <= (max/2); d++) {
-		debug("-- d=%d\n", d);
-		diff_divide_myers_forward(left, right, kd_forward, kd_backward, d, &mid_snake);
-		if (!diff_box_empty(&mid_snake))
-			break;
-		diff_divide_myers_backward(left, right, kd_forward, kd_backward, d, &mid_snake);
-		if (!diff_box_empty(&mid_snake))
-			break;
-	}
-
-	if (diff_box_empty(&mid_snake)) {
-		/* Divide and conquer failed to find a meeting point. Use the fallback_algo defined in the algo_config
-		 * (leave this to the caller). This is just paranoia/sanity, we normally should always find a midpoint.
-		 */
-		debug(" no midpoint \n");
-		rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;
-		goto return_rc;
-	} else {
-		debug(" mid snake L: %u to %u of %u   R: %u to %u of %u\n",
-		      mid_snake.left_start, mid_snake.left_end, left->atoms.len,
-		      mid_snake.right_start, mid_snake.right_end, right->atoms.len);
-
-		/* Section before the mid-snake.  */
-		debug("Section before the mid-snake\n");
-
-		struct diff_atom *left_atom = &left->atoms.head[0];
-		unsigned int left_section_len = mid_snake.left_start;
-		struct diff_atom *right_atom = &right->atoms.head[0];
-		unsigned int right_section_len = mid_snake.right_start;
-
-		if (left_section_len && right_section_len) {
-			/* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */
-			if (!diff_state_add_chunk(state, false,
-						  left_atom, left_section_len,
-						  right_atom, right_section_len))
-				goto return_rc;
-		} else if (left_section_len && !right_section_len) {
-			/* Only left atoms and none on the right, they form a "minus" chunk, then. */
-			if (!diff_state_add_chunk(state, true,
-						  left_atom, left_section_len,
-						  right_atom, 0))
-				goto return_rc;
-		} else if (!left_section_len && right_section_len) {
-			/* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
-			if (!diff_state_add_chunk(state, true,
-						  left_atom, 0,
-						  right_atom, right_section_len))
-				goto return_rc;
-		}
-		/* else: left_section_len == 0 and right_section_len == 0, i.e. nothing before the mid-snake. */
-
-		/* the mid-snake, identical data on both sides: */
-		debug("the mid-snake\n");
-		if (!diff_state_add_chunk(state, true,
-					  &left->atoms.head[mid_snake.left_start],
-					  mid_snake.left_end - mid_snake.left_start,
-					  &right->atoms.head[mid_snake.right_start],
-					  mid_snake.right_end - mid_snake.right_start))
-			goto return_rc;
-
-		/* Section after the mid-snake. */
-		debug("Section after the mid-snake\n");
-		debug("  left_end %u  right_end %u\n", mid_snake.left_end, mid_snake.right_end);
-		debug("  left_count %u  right_count %u\n", left->atoms.len, right->atoms.len);
-		left_atom = &left->atoms.head[mid_snake.left_end];
-		left_section_len = left->atoms.len - mid_snake.left_end;
-		right_atom = &right->atoms.head[mid_snake.right_end];
-		right_section_len = right->atoms.len - mid_snake.right_end;
-
-		if (left_section_len && right_section_len) {
-			/* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */
-			if (!diff_state_add_chunk(state, false,
-						  left_atom, left_section_len,
-						  right_atom, right_section_len))
-				goto return_rc;
-		} else if (left_section_len && !right_section_len) {
-			/* Only left atoms and none on the right, they form a "minus" chunk, then. */
-			if (!diff_state_add_chunk(state, true,
-						  left_atom, left_section_len,
-						  right_atom, 0))
-				goto return_rc;
-		} else if (!left_section_len && right_section_len) {
-			/* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
-			if (!diff_state_add_chunk(state, true,
-						  left_atom, 0,
-						  right_atom, right_section_len))
-				goto return_rc;
-		}
-		/* else: left_section_len == 0 and right_section_len == 0, i.e. nothing after the mid-snake. */
-	}
-
-	rc = DIFF_RC_OK;
-
-return_rc:
-	free(kd_buf);
-	debug("** END %s\n", __func__);
-	return rc;
-}
-
-/* Myers Diff tracing from the start all the way through to the end, requiring quadratic amounts of memory. This can
- * fail if the required space surpasses algo_config->permitted_state_size. */
-enum diff_rc diff_algo_myers(const struct diff_algo_config *algo_config, struct diff_state *state)
-{
-	/* do a diff_divide_myers_forward() without a _backward(), so that it walks forward across the entire
-	 * files to reach the end. Keep each run's state, and do a final backtrace. */
-	enum diff_rc rc = DIFF_RC_ENOMEM;
-	struct diff_data *left = &state->left;
-	struct diff_data *right = &state->right;
-
-	debug("\n** %s\n", __func__);
-	debug("left:\n");
-	debug_dump(left);
-	debug("right:\n");
-	debug_dump(right);
-	debug_dump_myers_graph(left, right, NULL);
-
-	/* Allocate two columns of a Myers graph, one for the forward and one for the backward traversal. */
-	unsigned int max = left->atoms.len + right->atoms.len;
-	size_t kd_len = max + 1 + max;
-	size_t kd_buf_size = kd_len * kd_len;
-	debug("state size: %zu\n", kd_buf_size);
-	if (kd_buf_size < kd_len /* overflow? */
-	    || kd_buf_size * sizeof(int) > algo_config->permitted_state_size) {
-		debug("state size %zu > permitted_state_size %zu, use fallback_algo\n",
-		      kd_buf_size, algo_config->permitted_state_size);
-		return DIFF_RC_USE_DIFF_ALGO_FALLBACK;
-	}
-
-	int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int));
-	if (!kd_buf)
-		return DIFF_RC_ENOMEM;
-	int i;
-	for (i = 0; i < kd_buf_size; i++)
-		kd_buf[i] = -1;
-
-	/* The 'k' axis in Myers spans positive and negative indexes, so point the kd to the middle.
-	 * It is then possible to index from -max .. max. */
-	int *kd_origin = kd_buf + max;
-	int *kd_column = kd_origin;
-
-	int d;
-	int backtrack_d = -1;
-	int backtrack_k = 0;
-	int k;
-	int x, y;
-	for (d = 0; d <= max; d++, kd_column += kd_len) {
-		debug("-- d=%d\n", d);
-
-		debug("-- %s d=%d\n", __func__, d);
-
-		for (k = d; k >= -d; k -= 2) {
-			if (k < -(int)right->atoms.len || k > (int)left->atoms.len) {
-				/* This diagonal is completely outside of the Myers graph, don't calculate it. */
-				if (k < -(int)right->atoms.len)
-					debug(" %d k < -(int)right->atoms.len %d\n", k, -(int)right->atoms.len);
-				else
-					debug(" %d k > left->atoms.len %d\n", k, left->atoms.len);
-				if (k < 0) {
-					/* We are traversing negatively, and already below the entire graph, nothing will come
-					 * of this. */
-					debug(" break");
-					break;
-				}
-				debug(" continue");
-				continue;
-			}
-
-			debug("- k = %d\n", k);
-			if (d == 0) {
-				/* This is the initializing step. There is no prev_k yet, get the initial x from the top left of
-				 * the Myers graph. */
-				x = 0;
-			} else {
-				int *kd_prev_column = kd_column - kd_len;
-
-				/* Favoring "-" lines first means favoring moving rightwards in the Myers graph.
-				 * For this, all k should derive from k - 1, only the bottom most k derive from k + 1:
-				 *
-				 *      | d=   0   1   2
-				 *  ----+----------------
-				 *  k=  |
-				 *   2  |             2,0 <-- from prev_k = 2 - 1 = 1
-				 *      |            /
-				 *   1  |         1,0
-				 *      |        /
-				 *   0  |  -->0,0     3,3
-				 *      |       \\   /
-				 *  -1  |         0,1 <-- bottom most for d=1 from prev_k = -1 + 1 = 0
-				 *      |           \\
-				 *  -2  |             0,2 <-- bottom most for d=2 from prev_k = -2 + 1 = -1
-				 *
-				 * Except when a k + 1 from a previous run already means a further advancement in the graph.
-				 * If k == d, there is no k + 1 and k - 1 is the only option.
-				 * If k < d, use k + 1 in case that yields a larger x. Also use k + 1 if k - 1 is outside the graph.
-				 */
-				if (k > -d && (k == d
-					       || (k - 1 >= -(int)right->atoms.len
-						   && kd_prev_column[k - 1] >= kd_prev_column[k + 1]))) {
-					/* Advance from k - 1.
-					 * From position prev_k, step to the right in the Myers graph: x += 1.
-					 */
-					int prev_k = k - 1;
-					int prev_x = kd_prev_column[prev_k];
-					x = prev_x + 1;
-				} else {
-					/* The bottom most one.
-					 * From position prev_k, step to the bottom in the Myers graph: y += 1.
-					 * Incrementing y is achieved by decrementing k while keeping the same x.
-					 * (since we're deriving y from y = x - k).
-					 */
-					int prev_k = k + 1;
-					int prev_x = kd_prev_column[prev_k];
-					x = prev_x;
-				}
-			}
-
-			/* Slide down any snake that we might find here. */
-			while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len
-			       && diff_atom_same(&left->atoms.head[x], &right->atoms.head[xk_to_y(x, k)]))
-			       x++;
-			kd_column[k] = x;
-
-			if (DEBUG) {
-				int fi;
-				for (fi = d; fi >= k; fi-=2) {
-					debug("kd_column[%d] = (%d, %d)\n", fi, kd_column[fi], kd_column[fi] - fi);
-#if 0
-					if (kd_column[fi] >= 0 && kd_column[fi] < left->atoms.len)
-						debug_dump_atom(left, right, &left->atoms.head[kd_column[fi]]);
-					else
-						debug("\n");
-					if (kd_column[fi]-fi >= 0 && kd_column[fi]-fi < right->atoms.len)
-						debug_dump_atom(right, left, &right->atoms.head[kd_column[fi]-fi]);
-					else
-						debug("\n");
-#endif
-				}
-			}
-
-			if (x == left->atoms.len && xk_to_y(x, k) == right->atoms.len) {
-				/* Found a path */
-				backtrack_d = d;
-				backtrack_k = k;
-				debug("Reached the end at d = %d, k = %d\n",
-				      backtrack_d, backtrack_k);
-				break;
-			}
-		}
-
-		if (backtrack_d >= 0)
-			break;
-	}
-
-	debug_dump_myers_graph(left, right, kd_origin);
-
-	/* backtrack. A matrix spanning from start to end of the file is ready:
-	 *
-	 *      | d=   0   1   2   3   4
-	 *  ----+---------------------------------
-	 *  k=  |
-	 *   3  |
-	 *      |
-	 *   2  |             2,0
-	 *      |            /
-	 *   1  |         1,0     4,3
-	 *      |        /       /   \
-	 *   0  |  -->0,0     3,3     4,4 --> backtrack_d = 4, backtrack_k = 0
-	 *      |        \   /   \
-	 *  -1  |         0,1     3,4
-	 *      |            \
-	 *  -2  |             0,2
-	 *      |
-	 *
-	 * From (4,4) backwards, find the previous position that is the largest, and remember it.
-	 *
-	 */
-	for (d = backtrack_d, k = backtrack_k; d >= 0; d--) {
-		x = kd_column[k];
-		y = xk_to_y(x, k);
-
-		/* When the best position is identified, remember it for that kd_column.
-		 * That kd_column is no longer needed otherwise, so just re-purpose kd_column[0] = x and kd_column[1] = y,
-		 * so that there is no need to allocate more memory.
-		 */
-		kd_column[0] = x;
-		kd_column[1] = y;
-		debug("Backtrack d=%d: xy=(%d, %d)\n",
-		      d, kd_column[0], kd_column[1]);
-
-		/* Don't access memory before kd_buf */
-		if (d == 0)
-			break;
-		int *kd_prev_column = kd_column - kd_len;
-
-		/* When y == 0, backtracking downwards (k-1) is the only way.
-		 * When x == 0, backtracking upwards (k+1) is the only way.
-		 *
-		 *      | d=   0   1   2   3   4
-		 *  ----+---------------------------------
-		 *  k=  |
-		 *   3  |
-		 *      |                ..y == 0
-		 *   2  |             2,0
-		 *      |            /
-		 *   1  |         1,0     4,3
-		 *      |        /       /   \
-		 *   0  |  -->0,0     3,3     4,4 --> backtrack_d = 4, backtrack_k = 0
-		 *      |        \   /   \
-		 *  -1  |         0,1     3,4
-		 *      |            \
-		 *  -2  |             0,2__
-		 *      |                  x == 0
-		 */
-		debug("prev[k-1] = %d,%d  prev[k+1] = %d,%d\n",
-		      kd_prev_column[k-1], xk_to_y(kd_prev_column[k-1],k-1),
-		      kd_prev_column[k+1], xk_to_y(kd_prev_column[k+1],k+1));
-		if (y == 0
-		    || (x > 0 && kd_prev_column[k - 1] >= kd_prev_column[k + 1])) {
-			k = k - 1;
-			debug("prev k=k-1=%d x=%d y=%d\n",
-			      k, kd_prev_column[k], xk_to_y(kd_prev_column[k], k));
-		} else {
-			k = k + 1;
-			debug("prev k=k+1=%d x=%d y=%d\n",
-			      k, kd_prev_column[k], xk_to_y(kd_prev_column[k], k));
-		}
-		kd_column = kd_prev_column;
-	}
-
-	/* Forwards again, this time recording the diff chunks.
-	 * Definitely start from 0,0. kd_column[0] may actually point to the bottom of a snake starting at 0,0 */
-	x = 0;
-	y = 0;
-
-	kd_column = kd_origin;
-	for (d = 0; d <= backtrack_d; d++, kd_column += kd_len) {
-		int next_x = kd_column[0];
-		int next_y = kd_column[1];
-		debug("Forward track from xy(%d,%d) to xy(%d,%d)\n",
-		      x, y, next_x, next_y);
-
-		struct diff_atom *left_atom = &left->atoms.head[x];
-		int left_section_len = next_x - x;
-		struct diff_atom *right_atom = &right->atoms.head[y];
-		int right_section_len = next_y - y;
-
-		rc = DIFF_RC_ENOMEM;
-		if (left_section_len && right_section_len) {
-			/* This must be a snake slide.
-			 * Snake slides have a straight line leading into them (except when starting at (0,0)). Find
-			 * out whether the lead-in is horizontal or vertical:
-			 *
-			 *     left
-			 *  ---------->
-			 *  |
-			 * r|   o-o        o
-			 * i|      \       |
-			 * g|       o      o
-			 * h|        \      \
-			 * t|         o      o
-			 *  v
-			 *
-			 * If left_section_len > right_section_len, the lead-in is horizontal, meaning first
-			 * remove one atom from the left before sliding down the snake.
-			 * If right_section_len > left_section_len, the lead-in is vetical, so add one atom from
-			 * the right before sliding down the snake. */
-			if (left_section_len == right_section_len + 1) {
-				if (!diff_state_add_chunk(state, true,
-							  left_atom, 1,
-							  right_atom, 0))
-					goto return_rc;
-				left_atom++;
-				left_section_len--;
-			} else if (right_section_len == left_section_len + 1) {
-				if (!diff_state_add_chunk(state, true,
-							  left_atom, 0,
-							  right_atom, 1))
-					goto return_rc;
-				right_atom++;
-				right_section_len--;
-			} else if (left_section_len != right_section_len) {
-				/* The numbers are making no sense. Should never happen. */
-				rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;
-				goto return_rc;
-			}
-
-			if (!diff_state_add_chunk(state, true,
-						  left_atom, left_section_len,
-						  right_atom, right_section_len))
-				goto return_rc;
-		} else if (left_section_len && !right_section_len) {
-			/* Only left atoms and none on the right, they form a "minus" chunk, then. */
-			if (!diff_state_add_chunk(state, true,
-						  left_atom, left_section_len,
-						  right_atom, 0))
-				goto return_rc;
-		} else if (!left_section_len && right_section_len) {
-			/* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
-			if (!diff_state_add_chunk(state, true,
-						  left_atom, 0,
-						  right_atom, right_section_len))
-				goto return_rc;
-		}
-
-		x = next_x;
-		y = next_y;
-	}
-
-	rc = DIFF_RC_OK;
-
-return_rc:
-	free(kd_buf);
-	debug("** END %s rc=%d\n", __func__, rc);
-	return rc;
-}
blob - 4c76a249cac3df710e57ed404bb9a918acf41870 (mode 644)
blob + /dev/null
--- lib/diff_output.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Common parts for printing diff output */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <diff/diff_output.h>
-
-void diff_output_lines(FILE *dest, const char *prefix, struct diff_atom *start_atom, unsigned int count)
-{
-	struct diff_atom *atom;
-	foreach_diff_atom(atom, start_atom, count) {
-		fprintf(dest, "%s", prefix);
-		int i;
-		unsigned int len = atom->len;
-		if (len && atom->at[len - 1] == '\n') {
-			len--;
-			if (len && atom->at[len - 1] == '\r')
-				len--;
-		}
-
-		for (i = 0; i < len; i++) {
-			char c = atom->at[i];
-			if ((c < 0x20 || c >= 0x7f) && c != '\t')
-				fprintf(dest, "\\x%02x", (unsigned char)c);
-			else
-				fprintf(dest, "%c", c);
-		}
-		fprintf(dest, "\n");
-	}
-}
-
-enum diff_rc diff_output_info(FILE *dest, const struct diff_input_info *info)
-{
-	if (info->arbitrary_info && *info->arbitrary_info)
-		fprintf(dest, "%s", info->arbitrary_info);
-	fprintf(dest, "--- %s\n+++ %s\n",
-		info->left_path ? : "a",
-		info->right_path ? : "b");
-        return DIFF_RC_OK;
-}
blob - 796c3ee69f83bf338c67c8b310b27c2dea7501f1 (mode 644)
blob + /dev/null
--- lib/diff_output_plain.c
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Output all lines of a diff_result. */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <diff/diff_output.h>
-
-enum diff_rc diff_output_plain(FILE *dest, const struct diff_input_info *info,
-			       const struct diff_result *result)
-{
-	if (!result)
-		return DIFF_RC_EINVAL;
-	if (result->rc != DIFF_RC_OK)
-		return result->rc;
-
-	diff_output_info(dest, info);
-
-	int i;
-	for (i = 0; i < result->chunks.len; i++) {
-		struct diff_chunk *c = &result->chunks.head[i];
-		if (c->left_count && c->right_count)
-			diff_output_lines(dest, c->solved ? " " : "?", c->left_start, c->left_count);
-		else if (c->left_count && !c->right_count)
-			diff_output_lines(dest, c->solved ? "-" : "?", c->left_start, c->left_count);
-		else if (c->right_count && !c->left_count)
-			diff_output_lines(dest, c->solved ? "+" : "?", c->right_start, c->right_count);
-	}
-	return DIFF_RC_OK;
-}
-
-enum diff_rc diff_plain(FILE *dest, const struct diff_config *diff_config,
-			const struct diff_input_info *info,
-			const char *left, int left_len, const char *right, int right_len)
-{
-	enum diff_rc rc;
-	left_len = left_len < 0 ? strlen(left) : left_len;
-	right_len = right_len < 0 ? strlen(right) : right_len;
-	struct diff_result *result = diff_main(diff_config, left, left_len, right, right_len);
-	rc = diff_output_plain(dest, info, result);
-	diff_result_free(result);
-	return rc;
-}
blob - 986d06cbf5ec125bca1bfbce84d7cd3378501735 (mode 644)
blob + /dev/null
--- lib/diff_output_unidiff.c
+++ /dev/null
@@ -1,207 +0,0 @@
-/* Produce a unidiff output from a diff_result. */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <diff/diff_output.h>
-
-#include "debug.h"
-
-enum chunk_type {
-	CHUNK_EMPTY,
-	CHUNK_PLUS,
-	CHUNK_MINUS,
-	CHUNK_SAME,
-	CHUNK_WEIRD,
-};
-
-static inline enum chunk_type chunk_type(const struct diff_chunk *chunk)
-{
-	if (!chunk->left_count && !chunk->right_count)
-		return CHUNK_EMPTY;
-	if (!chunk->solved)
-		return CHUNK_WEIRD;
-	if (!chunk->right_count)
-		return CHUNK_MINUS;
-	if (!chunk->left_count)
-		return CHUNK_PLUS;
-	if (chunk->left_count != chunk->right_count)
-		return CHUNK_WEIRD;
-	return CHUNK_SAME;
-}
-
-struct chunk_context {
-	struct range chunk;
-	struct range left, right;
-};
-
-static bool chunk_context_empty(const struct chunk_context *cc)
-{
-	return range_empty(&cc->chunk);
-}
-
-static void chunk_context_get(struct chunk_context *cc, const struct diff_result *r, int chunk_idx,
-		       int context_lines)
-{
-	const struct diff_chunk *c = &r->chunks.head[chunk_idx];
-	int left_start = diff_atom_root_idx(&r->left, c->left_start);
-	int right_start = diff_atom_root_idx(&r->right, c->right_start);
-
-	*cc = (struct chunk_context){
-		.chunk = {
-			.start = chunk_idx,
-			.end = chunk_idx + 1,
-		},
-		.left = {
-			.start = MAX(0, left_start - context_lines),
-			.end = MIN(r->left.atoms.len, left_start + c->left_count + context_lines),
-		},
-		.right = {
-			.start = MAX(0, right_start - context_lines),
-			.end = MIN(r->right.atoms.len, right_start + c->right_count + context_lines),
-		},
-	};
-}
-
-static bool chunk_contexts_touch(const struct chunk_context *cc, const struct chunk_context *other)
-{
-	return ranges_touch(&cc->chunk, &other->chunk)
-		|| ranges_touch(&cc->left, &other->left)
-		|| ranges_touch(&cc->right, &other->right);
-}
-
-static void chunk_contexts_merge(struct chunk_context *cc, const struct chunk_context *other)
-{
-	ranges_merge(&cc->chunk, &other->chunk);
-	ranges_merge(&cc->left, &other->left);
-	ranges_merge(&cc->right, &other->right);
-}
-
-static void diff_output_unidiff_chunk(FILE *dest, bool *info_printed, const struct diff_input_info *info,
-				      const struct diff_result *result, const struct chunk_context *cc)
-{
-	if (range_empty(&cc->left) && range_empty(&cc->right))
-		return;
-
-	if (!(*info_printed)) {
-		diff_output_info(dest, info);
-		*info_printed = true;
-	}
-
-	fprintf(dest, "@@ -%d,%d +%d,%d @@\n",
-		cc->left.start + 1, cc->left.end - cc->left.start,
-		cc->right.start + 1, cc->right.end - cc->right.start);
-
-	/* Got the absolute line numbers where to start printing, and the index of the interesting (non-context) chunk.
-	 * To print context lines above the interesting chunk, nipping on the previous chunk index may be necessary.
-	 * It is guaranteed to be only context lines where left == right, so it suffices to look on the left. */
-	const struct diff_chunk *first_chunk = &result->chunks.head[cc->chunk.start];
-	int chunk_start_line = diff_atom_root_idx(&result->left, first_chunk->left_start);
-	if (cc->left.start < chunk_start_line)
-		diff_output_lines(dest, " ", &result->left.atoms.head[cc->left.start],
-				  chunk_start_line - cc->left.start);
-
-	/* Now write out all the joined chunks and contexts between them */
-	int c_idx;
-	for (c_idx = cc->chunk.start; c_idx < cc->chunk.end; c_idx++) {
-		const struct diff_chunk *c = &result->chunks.head[c_idx];
-
-		if (c->left_count && c->right_count)
-			diff_output_lines(dest, c->solved ? " " : "?", c->left_start, c->left_count);
-		else if (c->left_count && !c->right_count)
-			diff_output_lines(dest, c->solved ? "-" : "?", c->left_start, c->left_count);
-		else if (c->right_count && !c->left_count)
-			diff_output_lines(dest, c->solved ? "+" : "?", c->right_start, c->right_count);
-	}
-
-	/* Trailing context? */
-	const struct diff_chunk *last_chunk = &result->chunks.head[cc->chunk.end - 1];
-	int chunk_end_line = diff_atom_root_idx(&result->left, last_chunk->left_start + last_chunk->left_count);
-	if (cc->left.end > chunk_end_line)
-		diff_output_lines(dest, " ", &result->left.atoms.head[chunk_end_line],
-				  cc->left.end - chunk_end_line);
-}
-
-enum diff_rc diff_output_unidiff(FILE *dest, const struct diff_input_info *info,
-				 const struct diff_result *result, unsigned int context_lines)
-{
-	if (!result)
-		return DIFF_RC_EINVAL;
-	if (result->rc != DIFF_RC_OK)
-		return result->rc;
-
-	struct chunk_context cc = {};
-	bool info_printed = false;
-
-	int i;
-	for (i = 0; i < result->chunks.len; i++) {
-		struct diff_chunk *c = &result->chunks.head[i];
-		enum chunk_type t = chunk_type(c);
-
-		if (t == CHUNK_MINUS || t == CHUNK_PLUS) {
-			if (chunk_context_empty(&cc)) {
-				/* These are the first lines being printed.
-				 * Note down the start point, any number of subsequent chunks may be joined up to this
-				 * unidiff chunk by context lines or by being directly adjacent. */
-				chunk_context_get(&cc, result, i, context_lines);
-				debug("new chunk to be printed: chunk %d-%d left %d-%d right %d-%d\n",
-				      cc.chunk.start, cc.chunk.end,
-				      cc.left.start, cc.left.end, cc.right.start, cc.right.end);
-			} else {
-				/* There already is a previous chunk noted down for being printed.
-				 * Does it join up with this one? */
-				struct chunk_context next;
-				chunk_context_get(&next, result, i, context_lines);
-				debug("new chunk to be printed: chunk %d-%d left %d-%d right %d-%d\n",
-				      next.chunk.start, next.chunk.end,
-				      next.left.start, next.left.end, next.right.start, next.right.end);
-				if (chunk_contexts_touch(&cc, &next)) {
-					/* This next context touches or overlaps the previous one, join. */
-					chunk_contexts_merge(&cc, &next);
-					debug("new chunk to be printed touches previous chunk, now: left %d-%d right %d-%d\n",
-					      cc.left.start, cc.left.end, cc.right.start, cc.right.end);
-				} else {
-					/* No touching, so the previous context is complete with a gap between it and
-					 * this next one. Print the previous one and start fresh here. */
-					debug("new chunk to be printed does not touch previous chunk; print left %d-%d right %d-%d\n",
-					      cc.left.start, cc.left.end, cc.right.start, cc.right.end);
-					diff_output_unidiff_chunk(dest, &info_printed, info, result, &cc);
-					cc = next;
-					debug("new unprinted chunk is left %d-%d right %d-%d\n",
-					      cc.left.start, cc.left.end, cc.right.start, cc.right.end);
-				}
-			}
-		}
-
-	}
-
-	if (!chunk_context_empty(&cc))
-		diff_output_unidiff_chunk(dest, &info_printed, info, result, &cc);
-	return DIFF_RC_OK;
-}
-
-enum diff_rc diff_unidiff(FILE *dest, const struct diff_config *diff_config,
-			  const struct diff_input_info *info,
-			  const char *left, int left_len, const char *right, int right_len,
-			  unsigned int context_lines)
-{
-	enum diff_rc rc;
-	left_len = left_len < 0 ? strlen(left) : left_len;
-	right_len = right_len < 0 ? strlen(right) : right_len;
-	struct diff_result *result = diff_main(diff_config, left, left_len, right, right_len);
-	rc = diff_output_unidiff(dest, info, result, context_lines);
-	diff_result_free(result);
-	return rc;
-}
blob - 17a6bffe139d685b095825de47ef10ff51e508b5 (mode 644)
blob + /dev/null
--- lib/diff_patience.c
+++ /dev/null
@@ -1,414 +0,0 @@
-/* Implementation of the Patience Diff algorithm invented by Bram Cohen:
- * Divide a diff problem into smaller chunks by an LCS of common-unique lines. */
-/*
- * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <assert.h>
-#include <diff/diff_main.h>
-
-#include "debug.h"
-
-/* Set unique_here = true for all atoms that exist exactly once in this list. */
-static void diff_atoms_mark_unique(struct diff_data *d, unsigned int *unique_count)
-{
-	struct diff_atom *i;
-	unsigned int count = 0;
-	diff_data_foreach_atom(i, d) {
-		i->patience.unique_here = true;
-		i->patience.unique_in_both = true;
-		count++;
-	}
-	diff_data_foreach_atom(i, d) {
-		struct diff_atom *j;
-
-		if (!i->patience.unique_here)
-			continue;
-
-		diff_data_foreach_atom_from(i + 1, j, d) {
-			if (diff_atom_same(i, j)) {
-				if (i->patience.unique_here) {
-					i->patience.unique_here = false;
-					i->patience.unique_in_both = false;
-					count--;
-				}
-				j->patience.unique_here = false;
-				j->patience.unique_in_both = false;
-				count--;
-			}
-		}
-	}
-	if (unique_count)
-		*unique_count = count;
-}
-
-/* Mark those lines as atom->patience.unique_in_both = true that appear exactly once in each side. */
-static void diff_atoms_mark_unique_in_both(struct diff_data *left, struct diff_data *right,
-					   unsigned int *unique_in_both_count)
-{
-	/* Derive the final unique_in_both count without needing an explicit iteration. So this is just some
-	 * optimiziation to save one iteration in the end. */
-	unsigned int unique_in_both;
-
-	diff_atoms_mark_unique(left, &unique_in_both);
-	diff_atoms_mark_unique(right, NULL);
-
-	debug("unique_in_both %u\n", unique_in_both);
-
-	struct diff_atom *i;
-	diff_data_foreach_atom(i, left) {
-		if (!i->patience.unique_here)
-			continue;
-		struct diff_atom *j;
-		int found_in_b = 0;
-		diff_data_foreach_atom(j, right) {
-			if (!diff_atom_same(i, j))
-				continue;
-			if (!j->patience.unique_here) {
-				found_in_b = 2; /* or more */
-				break;
-			} else {
-				found_in_b = 1;
-				j->patience.pos_in_other = i;
-				i->patience.pos_in_other = j;
-			}
-		}
-
-		if (found_in_b == 0 || found_in_b > 1) {
-			i->patience.unique_in_both = false;
-			unique_in_both--;
-			debug("unique_in_both %u  (%d) ", unique_in_both, found_in_b);
-			debug_dump_atom(left, NULL, i);
-		}
-	}
-
-	/* Still need to unmark right[*]->patience.unique_in_both for atoms that don't exist in left */
-	diff_data_foreach_atom(i, right) {
-		if (!i->patience.unique_here
-		    || !i->patience.unique_in_both)
-			continue;
-		struct diff_atom *j;
-		bool found_in_a = false;
-		diff_data_foreach_atom(j, left) {
-			if (!j->patience.unique_in_both)
-				continue;
-			if (!diff_atom_same(i, j))
-				continue;
-			found_in_a = true;
-			break;
-		}
-
-		if (!found_in_a)
-			i->patience.unique_in_both = false;
-	}
-
-	if (unique_in_both_count)
-		*unique_in_both_count = unique_in_both;
-}
-
-static void diff_atoms_swallow_identical_neighbors(struct diff_data *left, struct diff_data *right,
-						   unsigned int *unique_in_both_count)
-{
-	debug("trivially combine identical lines arount unique_in_both lines\n");
-
-	unsigned int l_idx;
-	unsigned int next_l_idx;
-	unsigned int l_min = 0;
-	unsigned int r_min = 0;
-	for (l_idx = 0; l_idx < left->atoms.len; l_idx = next_l_idx) {
-		next_l_idx = l_idx + 1;
-		struct diff_atom *l = &left->atoms.head[l_idx];
-
-		if (!l->patience.unique_in_both)
-			continue;
-
-		debug("check identical lines around ");
-		debug_dump_atom(left, right, l);
-
-		unsigned int r_idx = diff_atom_idx(right, l->patience.pos_in_other);
-
-		struct range identical_l;
-		struct range identical_r;
-
-		/* Swallow upwards.
-		 * Each common-unique line swallows identical lines upwards and downwards.
-		 * All common-unique lines that were part of the identical lines following below were already swallowed
-		 * in the previous iteration, so we will never hit another common-unique line above. */
-		for (identical_l.start = l_idx, identical_r.start = r_idx;
-		     identical_l.start > l_min
-		     && identical_r.start > r_min
-		     && diff_atom_same(&left->atoms.head[identical_l.start - 1],
-				       &right->atoms.head[identical_r.start - 1]);
-		     identical_l.start--, identical_r.start--);
-
-		/* Swallow downwards */
-		for (identical_l.end = l_idx + 1, identical_r.end = r_idx + 1;
-		     identical_l.end < left->atoms.len
-		     && identical_r.end < right->atoms.len
-		     && diff_atom_same(&left->atoms.head[identical_l.end],
-				       &right->atoms.head[identical_r.end]);
-		     identical_l.end++, identical_r.end++,
-		     next_l_idx++) {
-			if (left->atoms.head[identical_l.end].patience.unique_in_both) {
-				/* Part of a chunk of identical lines, remove from listing of unique_in_both lines */
-				left->atoms.head[identical_l.end].patience.unique_in_both = false;
-				right->atoms.head[identical_r.end].patience.unique_in_both = false;
-				(*unique_in_both_count)--;
-			}
-		}
-
-		l->patience.identical_lines = identical_l;
-		l->patience.pos_in_other->patience.identical_lines = identical_r;
-
-		l_min = identical_l.end;
-		r_min = identical_r.end;
-
-		if (!range_empty(&l->patience.identical_lines)) {
-			debug("common-unique line at l=%u r=%u swallowed identical lines l=%u-%u r=%u-%u\n",
-			      l_idx, r_idx,
-			      identical_l.start, identical_l.end,
-			      identical_r.start, identical_r.end);
-		}
-		debug("next_l_idx = %u\n", next_l_idx);
-	}
-}
-
-/* Among the lines that appear exactly once in each side, find the longest streak that appear in both files in the same
- * order (with other stuff allowed to interleave). Use patience sort for that, as in the Patience Diff algorithm.
- * See https://bramcohen.livejournal.com/73318.html and, for a much more detailed explanation,
- * https://blog.jcoglan.com/2017/09/19/the-patience-diff-algorithm/ */
-enum diff_rc diff_algo_patience(const struct diff_algo_config *algo_config, struct diff_state *state)
-{
-	enum diff_rc rc = DIFF_RC_ENOMEM;
-
-	struct diff_data *left = &state->left;
-	struct diff_data *right = &state->right;
-
-	unsigned int unique_in_both_count;
-
-	debug("\n** %s\n", __func__);
-
-	/* Find those lines that appear exactly once in 'left' and exactly once in 'right'. */
-	diff_atoms_mark_unique_in_both(left, right, &unique_in_both_count);
-
-	debug("unique_in_both_count %u\n", unique_in_both_count);
-	debug("left:\n");
-	debug_dump(left);
-	debug("right:\n");
-	debug_dump(right);
-
-	if (!unique_in_both_count) {
-		/* Cannot apply Patience, tell the caller to use fallback_algo instead. */
-		return DIFF_RC_USE_DIFF_ALGO_FALLBACK;
-	}
-
-	diff_atoms_swallow_identical_neighbors(left, right, &unique_in_both_count);
-	debug("After swallowing identical neighbors: unique_in_both = %u\n",
-	      unique_in_both_count);
-
-	/* An array of Longest Common Sequence is the result of the below subscope: */
-	unsigned int lcs_count = 0;
-	struct diff_atom **lcs = NULL;
-	struct diff_atom *lcs_tail = NULL;
-
-	{
-		/* This subscope marks the lifetime of the atom_pointers allocation */
-
-		/* One chunk of storage for atom pointers */
-		struct diff_atom **atom_pointers = recallocarray(NULL, 0, unique_in_both_count * 2, sizeof(struct diff_atom*));
-
-		/* Half for the list of atoms that still need to be put on stacks */
-		struct diff_atom **uniques = atom_pointers;
-
-		/* Half for the patience sort state's "card stacks" -- we remember only each stack's topmost "card" */
-		struct diff_atom **patience_stacks = atom_pointers + unique_in_both_count;
-		unsigned int patience_stacks_count = 0;
-
-		/* Take all common, unique items from 'left' ... */
-
-		struct diff_atom *atom;
-		struct diff_atom **uniques_end = uniques;
-		diff_data_foreach_atom(atom, left) {
-			if (!atom->patience.unique_in_both)
-				continue;
-			*uniques_end = atom;
-			uniques_end++;
-		}
-
-		/* ...and sort them to the order found in 'right'.
-		 * The idea is to find the leftmost stack that has a higher line number and add it to the stack's top.
-		 * If there is no such stack, open a new one on the right. The line number is derived from the atom*,
-		 * which are array items and hence reflect the relative position in the source file. So we got the
-		 * common-uniques from 'left' and sort them according to atom->patience.pos_in_other. */
-		unsigned int i;
-		for (i = 0; i < unique_in_both_count; i++) {
-			atom = uniques[i];
-			unsigned int target_stack;
-
-			if (!patience_stacks_count)
-				target_stack = 0;
-			else {
-				/* binary search to find the stack to put this atom "card" on. */
-				unsigned int lo = 0;
-				unsigned int hi = patience_stacks_count;
-				while (lo < hi) {
-					unsigned int mid = (lo + hi) >> 1;
-					if (patience_stacks[mid]->patience.pos_in_other < atom->patience.pos_in_other)
-						lo = mid + 1;
-					else
-						hi = mid;
-				}
-
-				target_stack = lo;
-			}
-
-			assert(target_stack <= patience_stacks_count);
-			patience_stacks[target_stack] = atom;
-			if (target_stack == patience_stacks_count)
-				patience_stacks_count++;
-
-			/* Record a back reference to the next stack on the left, which will form the final longest sequence
-			 * later. */
-			atom->patience.prev_stack = target_stack ? patience_stacks[target_stack - 1] : NULL;
-
-		}
-
-		/* backtrace through prev_stack references to form the final longest common sequence */
-		lcs_tail = patience_stacks[patience_stacks_count - 1];
-		lcs_count = patience_stacks_count;
-
-		/* uniques and patience_stacks are no longer needed. Backpointers are in atom->patience.prev_stack */
-		free(atom_pointers);
-	}
-
-	lcs = recallocarray(NULL, 0, lcs_count, sizeof(struct diff_atom*));
-	struct diff_atom **lcs_backtrace_pos = &lcs[lcs_count - 1];
-	struct diff_atom *atom;
-	for (atom = lcs_tail; atom; atom = atom->patience.prev_stack, lcs_backtrace_pos--) {
-		assert(lcs_backtrace_pos >= lcs);
-		*lcs_backtrace_pos = atom;
-	}
-
-	unsigned int i;
-	if (DEBUG) {
-		debug("\npatience LCS:\n");
-		for (i = 0; i < lcs_count; i++) {
-			debug_dump_atom(left, right, lcs[i]);
-		}
-	}
-
-
-	/* TODO: For each common-unique line found (now listed in lcs), swallow lines upwards and downwards that are
-	 * identical on each side. Requires a way to represent atoms being glued to adjacent atoms. */
-
-	debug("\ntraverse LCS, possibly recursing:\n");
-
-	/* Now we have pinned positions in both files at which it makes sense to divide the diff problem into smaller
-	 * chunks. Go into the next round: look at each section in turn, trying to again find common-unique lines in
-	 * those smaller sections. As soon as no more are found, the remaining smaller sections are solved by Myers. */
-	unsigned int left_pos = 0;
-	unsigned int right_pos = 0;
-	for (i = 0; i <= lcs_count; i++) {
-		struct diff_atom *atom;
-		struct diff_atom *atom_r;
-		unsigned int left_idx;
-		unsigned int right_idx;
-
-		if (i < lcs_count) {
-			atom = lcs[i];
-			atom_r = atom->patience.pos_in_other;
-			debug("lcs[%u] = left[%u] = right[%u]\n", i,
-			      diff_atom_idx(left, atom), diff_atom_idx(right, atom_r));
-			left_idx = atom->patience.identical_lines.start;
-			right_idx = atom_r->patience.identical_lines.start;
-			debug(" identical lines l %u-%u  r %u-%u\n",
-			      atom->patience.identical_lines.start, atom->patience.identical_lines.end,
-			      atom_r->patience.identical_lines.start, atom_r->patience.identical_lines.end);
-		} else {
-			atom = NULL;
-			atom_r = NULL;
-			left_idx = left->atoms.len;
-			right_idx = right->atoms.len;
-		}
-
-		/* 'atom' now marks an atom that matches on both sides according to patience-diff
-		 * (a common-unique identical atom in both files).
-		 * Handle the section before and the atom itself; the section after will be handled by the next loop
-		 * iteration -- note that i loops to last element + 1 ("i <= lcs_count"), so that there will be another
-		 * final iteration to pick up the last remaining items after the last LCS atom.
-		 * The sections before might also be empty on left and/or right.
-		 * left_pos and right_pos mark the indexes of the first atoms that have not yet been handled in the
-		 * previous loop iteration.
-		 * left_idx and right_idx mark the indexes of the matching atom on left and right, respectively. */
-
-		debug("iteration %u  left_pos %u  left_idx %u  right_pos %u  right_idx %u\n",
-		      i, left_pos, left_idx, right_pos, right_idx);
-
-		struct diff_chunk *chunk;
-
-		/* Section before the matching atom */
-		struct diff_atom *left_atom = &left->atoms.head[left_pos];
-		unsigned int left_section_len = left_idx - left_pos;
-
-		struct diff_atom *right_atom = &(right->atoms.head[right_pos]);
-		unsigned int right_section_len = right_idx - right_pos;
-
-		if (left_section_len && right_section_len) {
-			/* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */
-			if (!diff_state_add_chunk(state, false,
-						  left_atom, left_section_len,
-						  right_atom, right_section_len))
-				goto return_rc;
-		} else if (left_section_len && !right_section_len) {
-			/* Only left atoms and none on the right, they form a "minus" chunk, then. */
-			if (!diff_state_add_chunk(state, true,
-						  left_atom, left_section_len,
-						  right_atom, 0))
-				goto return_rc;
-		} else if (!left_section_len && right_section_len) {
-			/* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
-			if (!diff_state_add_chunk(state, true,
-						  left_atom, 0,
-						  right_atom, right_section_len))
-				goto return_rc;
-		}
-		/* else: left_section_len == 0 and right_section_len == 0, i.e. nothing here. */
-
-		/* The atom found to match on both sides forms a chunk of equals on each side. In the very last
-		 * iteration of this loop, there is no matching atom, we were just cleaning out the remaining lines. */
-		if (atom) {
-			if (!diff_state_add_chunk(state, true,
-						  left->atoms.head + atom->patience.identical_lines.start,
-						  range_len(&atom->patience.identical_lines),
-						  right->atoms.head + atom_r->patience.identical_lines.start,
-						  range_len(&atom_r->patience.identical_lines)))
-				goto return_rc;
-			left_pos = atom->patience.identical_lines.end;
-			right_pos = atom_r->patience.identical_lines.end;
-		} else {
-			left_pos = left_idx + 1;
-			right_pos = right_idx + 1;
-		}
-		debug("end of iteration %u  left_pos %u  left_idx %u  right_pos %u  right_idx %u\n",
-		      i, left_pos, left_idx, right_pos, right_idx);
-	}
-	debug("** END %s\n", __func__);
-
-	rc = DIFF_RC_OK;
-
-return_rc:
-	free(lcs);
-	return rc;
-}
blob - a6ff8bd1ed65c0db6c0feaf7866523a9d9f52948 (mode 644)
blob + /dev/null
--- lib/linux_Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-srcs = \
-	diff_atomize_text.c \
-	diff_main.c \
-	diff_myers.c \
-	diff_patience.c \
-	diff_output.c \
-	diff_output_plain.c \
-	diff_output_unidiff.c \
-	$(END)
-
-objs = $(srcs:.c=.o)
-
-libdiff.a: $(objs)
-	ar rcs $@ $^
-
-CFLAGS = -fsanitize=address -fsanitize=undefined -g -O3
-
-%.o: %.c ./*.h ../include/diff/*.h
-	gcc $(CFLAGS) -I../include -o $@ -c $<
-
-.PHONY: clean
-clean:
-	-rm $(objs)
-	-rm libdiff.a
blob - b4a9acb0cdc71e2d714c76d3f510873470cb8236 (mode 644)
blob + /dev/null
--- man/diff.1
+++ /dev/null
@@ -1,47 +0,0 @@
-.\"	$OpenBSD$
-.\"
-.\" Copyright (c) 2018 Martin Pieuchot
-.\" Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
-.\"
-.\" Permission to use, copy, modify, and distribute this software for any
-.\" purpose with or without fee is hereby granted, provided that the above
-.\" copyright notice and this permission notice appear in all copies.
-.\"
-.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-.\"
-.Dd $Mdocdate: August 28 2017 $
-.Dt DIFF 1
-.Os
-.Sh NAME
-.Nm diff
-.Nd compare files
-.Sh SYNOPSIS
-.Nm diff
-.Ar file1 file2
-.Sh DESCRIPTION
-The
-.Nm
-utility compares the contents of
-.Ar file1
-and
-.Ar file2
-line by line.
-.Sh EXIT STATUS
-The
-.Nm
-utility exits with one of the following values:
-.Pp
-.Bl -tag -width Ds -offset indent -compact
-.It 0
-No differences were found.
-.It 1
-Differences were found.
-.It >1
-An error occurred.
-.El
blob - /dev/null
blob + b4a9acb0cdc71e2d714c76d3f510873470cb8236 (mode 644)
--- /dev/null
+++ diff.1
@@ -0,0 +1,47 @@
+.\"	$OpenBSD$
+.\"
+.\" Copyright (c) 2018 Martin Pieuchot
+.\" Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.\"
+.Dd $Mdocdate: August 28 2017 $
+.Dt DIFF 1
+.Os
+.Sh NAME
+.Nm diff
+.Nd compare files
+.Sh SYNOPSIS
+.Nm diff
+.Ar file1 file2
+.Sh DESCRIPTION
+The
+.Nm
+utility compares the contents of
+.Ar file1
+and
+.Ar file2
+line by line.
+.Sh EXIT STATUS
+The
+.Nm
+utility exits with one of the following values:
+.Pp
+.Bl -tag -width Ds -offset indent -compact
+.It 0
+No differences were found.
+.It 1
+Differences were found.
+.It >1
+An error occurred.
+.El
blob - 17f8df091e1bb71385f342f2e783b09347df7d75
blob + 480a582f5d2848c9cb614cf98532f97a5bfb5ba2
--- test/verify_all.sh
+++ test/verify_all.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
 
-diff_prog="../diff/diff"
+diff_prog="../obj/diff"
 
 diff_type=unidiff
 
blob - /dev/null
blob + bbe26f81080d982f5d5a2678f29193e5c70df6fd (mode 644)
--- /dev/null
+++ diff.c
@@ -0,0 +1,143 @@
+/* Commandline diff utility to test diff implementations. */
+/*
+ * Copyright (c) 2018 Martin Pieuchot
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+#include <err.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "diff_main.h"
+#include "diff_output.h"
+
+#ifdef __linux__
+/* stupid shims to compile and test on linux */
+#define __dead
+
+static const char *getprogname()
+{
+	return "diff";
+}
+#endif
+
+__dead void	 usage(void);
+int		 diffreg(char *, char *, int);
+char		*mmapfile(const char *, struct stat *);
+
+__dead void
+usage(void)
+{
+	fprintf(stderr, "usage: %s file1 file2\n", getprogname());
+	exit(1);
+}
+
+int
+main(int argc, char *argv[])
+{
+	int ch;
+
+	while ((ch = getopt(argc, argv, "")) != -1) {
+		switch (ch) {
+		default:
+			usage();
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc != 2)
+		usage();
+
+	return diffreg(argv[0], argv[1], 0);
+}
+
+const struct diff_algo_config myers, patience, myers_divide;
+
+const struct diff_algo_config myers = (struct diff_algo_config){
+	.impl = diff_algo_myers,
+	.permitted_state_size = 1024 * 1024 * sizeof(int),
+	.fallback_algo = &patience,
+};
+
+const struct diff_algo_config patience = (struct diff_algo_config){
+	.impl = diff_algo_patience,
+	.inner_algo = &patience,	// After subdivision, do Patience again.
+	.fallback_algo = &myers_divide, // If subdivision failed, do Myers Divide et Impera.
+};
+
+const struct diff_algo_config myers_divide = (struct diff_algo_config){
+	.impl = diff_algo_myers_divide,
+	.inner_algo = &myers,		// When division succeeded, start from the top.
+					// (fallback_algo = NULL implies diff_algo_none).
+};
+
+const struct diff_config diff_config = {
+	.atomize_func = diff_atomize_text_by_line,
+	.algo = &myers,
+};
+
+int
+diffreg(char *file1, char *file2, int flags)
+{
+	char *str1, *str2;
+	struct stat st1, st2;
+	struct diff_input_info info = {
+		.left_path = file1,
+		.right_path = file2,
+	};
+
+	str1 = mmapfile(file1, &st1);
+	str2 = mmapfile(file2, &st2);
+
+	diff_unidiff(stdout, &diff_config, &info, str1, st1.st_size, str2, st2.st_size, 3);
+
+	munmap(str1, st1.st_size);
+	munmap(str2, st2.st_size);
+
+	return 0;
+}
+
+char *
+mmapfile(const char *path, struct stat *st)
+{
+	int			 fd;
+	char			*p;
+
+	fd = open(path, O_RDONLY);
+	if (fd == -1)
+		err(2, "%s", path);
+
+	if (fstat(fd, st) == -1)
+		err(2, "%s", path);
+
+	if ((uintmax_t)st->st_size > SIZE_MAX)
+		errx(2, "%s: file too big to fit memory", path);
+
+	p = mmap(NULL, st->st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (p == MAP_FAILED)
+		err(2, "mmap");
+
+	close(fd);
+
+	return p;
+}
blob - /dev/null
blob + 9aa20ee96f002f8a10c60b035b8e62be1dff769b (mode 644)
--- /dev/null
+++ diff_atomize_text.c
@@ -0,0 +1,75 @@
+/* Split source by line breaks, and calculate a simplistic checksum. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "diff_main.h"
+
+static int diff_data_atomize_text_lines(struct diff_data *d)
+{
+	const uint8_t *pos = d->data;
+	const uint8_t *end = pos + d->len;
+	enum diff_rc rc;
+
+	unsigned int array_size_estimate = d->len / 50;
+	unsigned int pow2 = 1;
+	while (array_size_estimate >>= 1)
+		pow2++;
+
+	ARRAYLIST_INIT(d->atoms, 1 << pow2);
+
+	while (pos < end) {
+		const uint8_t *line_end = pos;
+		unsigned int hash = 0;
+
+		while (line_end < end && *line_end != '\r' && *line_end != '\n') {
+			hash = hash * 23 + *line_end;
+			line_end++;
+		}
+
+		/* When not at the end of data, the line ending char ('\r' or '\n') must follow */
+		if (line_end < end)
+			line_end++;
+		/* If that was an '\r', also pull in any following '\n' */
+		if (line_end[0] == '\r' && line_end < end && line_end[1] == '\n')
+			line_end++;
+
+		/* Record the found line as diff atom */
+		struct diff_atom *atom;
+		ARRAYLIST_ADD(atom, d->atoms);
+		if (!atom)
+			return DIFF_RC_ENOMEM;
+
+		*atom = (struct diff_atom){
+			.at = pos,
+			.len = line_end - pos,
+			.hash = hash,
+		};
+
+		/* Starting point for next line: */
+		pos = line_end;
+	}
+
+	return DIFF_RC_OK;
+}
+
+enum diff_rc diff_atomize_text_by_line(void *func_data, struct diff_data *left, struct diff_data *right)
+{
+	enum diff_rc rc;
+	rc = diff_data_atomize_text_lines(left);
+	if (rc != DIFF_RC_OK)
+		return rc;
+	return diff_data_atomize_text_lines(right);
+}
blob - /dev/null
blob + 29fdd78115f0ea2392af08b51e5d57d450d9dd1b (mode 644)
--- /dev/null
+++ diff_main.c
@@ -0,0 +1,245 @@
+/* Generic infrastructure to implement various diff algorithms (implementation). */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+
+#include <sys/queue.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+
+#include <assert.h>
+
+#include "diff_main.h"
+#include "debug.h"
+
+/* Even if a left or right side is empty, diff output may need to know the position in that file.
+ * So left_start or right_start must never be NULL -- pass left_count or right_count as zero to indicate staying at that
+ * position without consuming any lines. */
+struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved,
+					struct diff_atom *left_start, unsigned int left_count,
+					struct diff_atom *right_start, unsigned int right_count)
+{
+	struct diff_chunk *chunk;
+	diff_chunk_arraylist_t *result;
+
+	if (solved && !state->temp_result.len)
+		result = &state->result->chunks;
+	else
+		result = &state->temp_result;
+
+	ARRAYLIST_ADD(chunk, *result);
+	if (!chunk)
+		return NULL;
+	*chunk = (struct diff_chunk){
+		.solved = solved,
+		.left_start = left_start,
+		.left_count = left_count,
+		.right_start = right_start,
+		.right_count = right_count,
+	};
+
+	debug("Add %s chunk:\n", chunk->solved ? "solved" : "UNSOLVED");
+	debug("L\n");
+	debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count);
+	debug("R\n");
+	debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count);
+	return chunk;
+}
+
+void diff_data_init_root(struct diff_data *d, const uint8_t *data, size_t len)
+{
+	*d = (struct diff_data){
+		.data = data,
+		.len = len,
+		.root = d,
+	};
+}
+
+void diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
+			       struct diff_atom *from_atom, unsigned int atoms_count)
+{
+	struct diff_atom *last_atom = from_atom + atoms_count - 1;
+	*d = (struct diff_data){
+		.data = from_atom->at,
+		.len = (last_atom->at + last_atom->len) - from_atom->at,
+		.root = parent->root,
+		.atoms.head = from_atom,
+		.atoms.len = atoms_count,
+	};
+
+	debug("subsection:\n");
+	debug_dump(d);
+}
+
+void diff_data_free(struct diff_data *diff_data)
+{
+	if (!diff_data)
+		return;
+	if (diff_data->atoms.allocated)
+		ARRAYLIST_FREE(diff_data->atoms);
+}
+
+enum diff_rc diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state)
+{
+	debug("\n** %s\n", __func__);
+	debug("left:\n");
+	debug_dump(&state->left);
+	debug("right:\n");
+	debug_dump(&state->right);
+	debug_dump_myers_graph(&state->left, &state->right, NULL);
+
+	/* Add a chunk of equal lines, if any */
+	unsigned int equal_atoms = 0;
+	while (equal_atoms < state->left.atoms.len && equal_atoms < state->right.atoms.len
+	       && diff_atom_same(&state->left.atoms.head[equal_atoms], &state->right.atoms.head[equal_atoms]))
+		equal_atoms++;
+	if (equal_atoms) {
+		if (!diff_state_add_chunk(state, true,
+					  &state->left.atoms.head[0], equal_atoms,
+					  &state->right.atoms.head[0], equal_atoms))
+			return DIFF_RC_ENOMEM;
+	}
+
+	/* Add a "minus" chunk with all lines from the left. */
+	if (equal_atoms < state->left.atoms.len) {
+		if (!diff_state_add_chunk(state, true,
+					  &state->left.atoms.head[equal_atoms],
+					  state->left.atoms.len - equal_atoms,
+					  NULL, 0))
+		    return DIFF_RC_ENOMEM;
+	}
+
+	/* Add a "plus" chunk with all lines from the right. */
+	if (equal_atoms < state->right.atoms.len) {
+		if (!diff_state_add_chunk(state, true,
+					  NULL, 0,
+					  &state->right.atoms.head[equal_atoms],
+					  state->right.atoms.len - equal_atoms))
+		return DIFF_RC_ENOMEM;
+	}
+	return DIFF_RC_OK;
+}
+
+enum diff_rc diff_run_algo(const struct diff_algo_config *algo_config, struct diff_state *state)
+{
+	enum diff_rc rc;
+	ARRAYLIST_FREE(state->temp_result);
+
+	if (!algo_config || !algo_config->impl || !state->recursion_depth_left) {
+		debug("MAX RECURSION REACHED, just dumping diff chunks\n");
+		return diff_algo_none(algo_config, state);
+	}
+
+	ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE);
+	rc = algo_config->impl(algo_config, state);
+	switch (rc) {
+	case DIFF_RC_USE_DIFF_ALGO_FALLBACK:
+		debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n", algo_config->fallback_algo);
+		rc = diff_run_algo(algo_config->fallback_algo, state);
+		goto return_rc;
+
+	case DIFF_RC_OK:
+		/* continue below */
+		break;
+
+	default:
+		/* some error happened */
+		goto return_rc;
+	}
+
+	/* Pick up any diff chunks that are still unsolved and feed to inner_algo.
+	 * inner_algo will solve unsolved chunks and append to result,
+	 * and subsequent solved chunks on this level are then appended to result afterwards. */
+	int i;
+	for (i = 0; i < state->temp_result.len; i++) {
+		struct diff_chunk *c = &state->temp_result.head[i];
+		if (c->solved) {
+			struct diff_chunk *final_c;
+			ARRAYLIST_ADD(final_c, state->result->chunks);
+			if (!final_c) {
+				rc = DIFF_RC_ENOMEM;
+				goto return_rc;
+			}
+			*final_c = *c;
+			continue;
+		}
+
+		/* c is an unsolved chunk, feed to inner_algo */
+		struct diff_state inner_state = {
+			.result = state->result,
+			.recursion_depth_left = state->recursion_depth_left - 1,
+		};
+		diff_data_init_subsection(&inner_state.left, &state->left, c->left_start, c->left_count);
+		diff_data_init_subsection(&inner_state.right, &state->right, c->right_start, c->right_count);
+
+		rc = diff_run_algo(algo_config->inner_algo, &inner_state);
+
+		if (rc != DIFF_RC_OK)
+			goto return_rc;
+	}
+
+	rc = DIFF_RC_OK;
+return_rc:
+	ARRAYLIST_FREE(state->temp_result);
+	return rc;
+}
+
+struct diff_result *diff_main(const struct diff_config *config,
+			      const uint8_t *left_data, size_t left_len,
+			      const uint8_t *right_data, size_t right_len)
+{
+	struct diff_result *result = malloc(sizeof(struct diff_result));
+	if (!result)
+		return NULL;
+
+	*result = (struct diff_result){};
+	diff_data_init_root(&result->left, left_data, left_len);
+	diff_data_init_root(&result->right, right_data, right_len);
+
+	if (!config->atomize_func) {
+		result->rc = DIFF_RC_EINVAL;
+		return result;
+	}
+
+	result->rc = config->atomize_func(config->atomize_func_data, &result->left, &result->right);
+	if (result->rc != DIFF_RC_OK)
+		return result;
+
+	struct diff_state state = {
+		.result = result,
+		.recursion_depth_left = config->max_recursion_depth ? : 1024,
+	};
+	diff_data_init_subsection(&state.left, &result->left, result->left.atoms.head, result->left.atoms.len);
+	diff_data_init_subsection(&state.right, &result->right, result->right.atoms.head, result->right.atoms.len);
+
+	result->rc = diff_run_algo(config->algo, &state);
+
+	return result;
+}
+
+void diff_result_free(struct diff_result *result)
+{
+	if (!result)
+		return;
+	diff_data_free(&result->left);
+	diff_data_free(&result->right);
+	ARRAYLIST_FREE(result->chunks);
+	free(result);
+}
blob - /dev/null
blob + 933592cf5e6a33c4c1ace4d204f3fb494ab62d34 (mode 644)
--- /dev/null
+++ diff_main.h
@@ -0,0 +1,287 @@
+/* Generic infrastructure to implement various diff algorithms. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <errno.h>
+
+#include "arraylist.h"
+
+#ifndef MAX
+#define MAX(A,B) ((A)>(B)?(A):(B))
+#endif
+#ifndef MIN
+#define MIN(A,B) ((A)<(B)?(A):(B))
+#endif
+
+struct range {
+	int start;
+	int end;
+};
+
+static inline bool range_empty(const struct range *r)
+{
+	return r->start == r->end;
+}
+
+static inline bool ranges_touch(const struct range *a, const struct range *b)
+{
+	return (a->end >= b->start) && (a->start <= b->end);
+}
+
+static inline void ranges_merge(struct range *a, const struct range *b)
+{
+	*a = (struct range){
+		.start = MIN(a->start, b->start),
+		.end = MAX(a->end, b->end),
+	};
+}
+
+static inline int range_len(const struct range *r)
+{
+	if (!r)
+		return 0;
+	return r->end - r->start;
+}
+
+/* List of all possible return codes of a diff invocation. */
+enum diff_rc {
+	DIFF_RC_USE_DIFF_ALGO_FALLBACK = -1,
+	DIFF_RC_OK = 0,
+	DIFF_RC_ENOTSUP = ENOTSUP,
+	DIFF_RC_ENOMEM = ENOMEM,
+	DIFF_RC_EINVAL = EINVAL,
+};
+
+struct diff_atom {
+	const uint8_t *at;
+	size_t len;
+
+	/* This hash is just a very cheap speed up for finding *mismatching* atoms. When hashes match, we still need to
+	 * compare entire atoms to find out whether they are indeed identical or not. */
+	unsigned int hash;
+
+	/* State for the Patience Diff algorithm */
+	/* TODO: keep a separate array for the patience state */
+	struct {
+		bool unique_here;
+		bool unique_in_both;
+		struct diff_atom *pos_in_other;
+		struct diff_atom *prev_stack;
+		struct range identical_lines;
+	} patience;
+};
+
+static inline bool diff_atom_same(const struct diff_atom *left, const struct diff_atom *right)
+{
+	return left->hash == right->hash
+		&& left->len == right->len
+		&& memcmp(left->at, right->at, left->len) == 0;
+}
+
+/* For each file, there is a "root" struct diff_data referencing the entire file, which the atoms are parsed from. In
+ * recursion of diff algorithm, there may be "child" struct diff_data only referencing a subsection of the file,
+ * re-using the atoms parsing. For "root" structs, atoms_allocated will be nonzero, indicating that the array of atoms
+ * is owned by that struct. For "child" structs, atoms_allocated == 0, to indicate that the struct is referencing a
+ * subset of atoms. */
+struct diff_data {
+	const uint8_t *data;
+	size_t len;
+	ARRAYLIST(struct diff_atom) atoms;
+	struct diff_data *root;
+};
+
+void diff_data_free(struct diff_data *diff_data);
+
+/* The atom's index in the entire file. For atoms divided by lines of text, this yields the line number (starting with
+ * 0). Also works for diff_data that reference only a subsection of a file, always reflecting the global position in
+ * the file (and not the relative position within the subsection). */
+#define diff_atom_root_idx(DIFF_DATA, ATOM) \
+	((ATOM) ? (ATOM) - ((DIFF_DATA)->root->atoms.head) : (DIFF_DATA)->root->atoms.len)
+
+/* The atom's index within DIFF_DATA. For atoms divided by lines of text, this yields the line number (starting with
+ * 0). */
+#define diff_atom_idx(DIFF_DATA, ATOM) \
+	((ATOM) ? (ATOM) - ((DIFF_DATA)->atoms.head) : (DIFF_DATA)->atoms.len)
+
+#define foreach_diff_atom(ATOM, FIRST_ATOM, COUNT) \
+	for ((ATOM) = (FIRST_ATOM); \
+	     (ATOM) && ((ATOM) >= (FIRST_ATOM)) && ((ATOM) - (FIRST_ATOM) < (COUNT)); \
+	     (ATOM)++)
+
+#define diff_data_foreach_atom(ATOM, DIFF_DATA) \
+	foreach_diff_atom(ATOM, (DIFF_DATA)->atoms.head, (DIFF_DATA)->atoms.len)
+
+#define diff_data_foreach_atom_from(FROM, ATOM, DIFF_DATA) \
+	for ((ATOM) = (FROM); \
+	     (ATOM) && ((ATOM) >= (DIFF_DATA)->atoms.head) && ((ATOM) - (DIFF_DATA)->atoms.head < (DIFF_DATA)->atoms.len); \
+	     (ATOM)++)
+
+/* A diff chunk represents a set of atoms on the left and/or a set of atoms on the right.
+ *
+ * If solved == false:
+ * The diff algorithm has divided the source file, and this is a chunk that the inner_algo should run on next.
+ * The lines on the left should be diffed against the lines on the right.
+ * (If there are no left lines or no right lines, it implies solved == true, because there is nothing to diff.)
+ *
+ * If solved == true:
+ * If there are only left atoms, it is a chunk removing atoms from the left ("a minus chunk").
+ * If there are only right atoms, it is a chunk adding atoms from the right ("a plus chunk").
+ * If there are both left and right lines, it is a chunk of equal content on both sides,
+ * and left_count == right_count:
+ *
+ * - foo  }
+ * - bar  }-- diff_chunk{ left_start = &left.atoms.head[0], left_count = 3,
+ * - baz  }               right_start = NULL, right_count = 0 }
+ *   moo    }
+ *   goo    }-- diff_chunk{ left_start = &left.atoms.head[3], left_count = 3,
+ *   zoo    }               right_start = &right.atoms.head[0], right_count = 3 }
+ *  +loo      }
+ *  +roo      }-- diff_chunk{ left_start = NULL, left_count = 0,
+ *  +too      }               right_start = &right.atoms.head[3], right_count = 3 }
+ *
+ */
+struct diff_chunk {
+	bool solved;
+	struct diff_atom *left_start;
+	unsigned int left_count;
+	struct diff_atom *right_start;
+	unsigned int right_count;
+};
+
+typedef ARRAYLIST(struct diff_chunk) diff_chunk_arraylist_t;
+#define DIFF_RESULT_ALLOC_BLOCKSIZE 128
+
+struct diff_result {
+	enum diff_rc rc;
+	struct diff_data left;
+	struct diff_data right;
+	diff_chunk_arraylist_t chunks;
+};
+
+struct diff_state {
+	/* The final result passed to the original diff caller. */
+	struct diff_result *result;
+
+	/* The root diff_data is in result->left,right, these are (possibly) subsections of the root data. */
+	struct diff_data left;
+	struct diff_data right;
+
+	unsigned int recursion_depth_left;
+
+	/* Remaining chunks from one diff algorithm pass, if any solved == false chunks came up. */
+	diff_chunk_arraylist_t temp_result;
+};
+
+struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved,
+					struct diff_atom *left_start, unsigned int left_count,
+					struct diff_atom *right_start, unsigned int right_count);
+
+/* Signature of a utility function to divide both source files into diff atoms.
+ * It is possible that a (future) algorithm requires both source files to decide on atom split points, hence this gets
+ * both left and right to atomize at the same time.
+ * An example is diff_atomize_text_by_line() in diff_atomize_text.c.
+ *
+ * func_data: context pointer (free to be used by implementation).
+ * left: struct diff_data with left->data and left->len already set up, and left->atoms to be created.
+ * right: struct diff_data with right->data and right->len already set up, and right->atoms to be created.
+ */
+typedef enum diff_rc (*diff_atomize_func_t)(void *func_data, struct diff_data *left, struct diff_data *right);
+
+extern enum diff_rc diff_atomize_text_by_line(void *func_data, struct diff_data *left, struct diff_data *right);
+
+struct diff_algo_config;
+typedef enum diff_rc (*diff_algo_impl_t)(const struct diff_algo_config *algo_config, struct diff_state *state);
+
+/* Form a result with all left-side removed and all right-side added, i.e. no actual diff algorithm involved. */
+enum diff_rc diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state);
+
+/* Myers Diff tracing from the start all the way through to the end, requiring quadratic amounts of memory. This can
+ * fail if the required space surpasses algo_config->permitted_state_size. */
+extern enum diff_rc diff_algo_myers(const struct diff_algo_config *algo_config, struct diff_state *state);
+
+/* Myers "Divide et Impera": tracing forwards from the start and backwards from the end to find a midpoint that divides
+ * the problem into smaller chunks. Requires only linear amounts of memory. */
+extern enum diff_rc diff_algo_myers_divide(const struct diff_algo_config *algo_config, struct diff_state *state);
+
+/* Patience Diff algorithm, which divides a larger diff into smaller chunks. For very specific scenarios, it may lead to
+ * a complete diff result by itself, but needs a fallback algo to solve chunks that don't have common-unique atoms. */
+extern enum diff_rc diff_algo_patience(const struct diff_algo_config *algo_config, struct diff_state *state);
+
+/* Diff algorithms to use, possibly nested. For example:
+ *
+ * struct diff_algo_config myers, patience, myers_divide;
+ *
+ * myers = (struct diff_algo_config){
+ *         .impl = diff_algo_myers,
+ *         .permitted_state_size = 32 * 1024 * 1024,
+ *         .fallback_algo = &patience, // when too large, do diff_algo_patience
+ * };
+ *
+ * patience = (struct diff_algo_config){
+ *         .impl = diff_algo_patience,
+ *         .inner_algo = &patience,        // After subdivision, do Patience again.
+ *         .fallback_algo = &myers_divide, // If subdivision failed, do Myers Divide et Impera.
+ * };
+ *
+ * myers_divide = (struct diff_algo_config){
+ *         .impl = diff_algo_myers_divide,
+ *         .inner_algo = &myers,           // When division succeeded, start from the top.
+ *                                         // (fallback_algo = NULL implies diff_algo_none).
+ * };
+ *
+ * struct diff_config config = {
+ *         .algo = &myers,
+ *         ...
+ * };
+ * diff_main(&config, ...);
+ */
+struct diff_algo_config {
+	diff_algo_impl_t impl;
+
+	/* Fail this algo if it would use more than this amount of memory, and instead use fallback_algo
+	 * (diff_algo_myers). permitted_state_size == 0 means no limitation. */
+	size_t permitted_state_size;
+
+	/* For algorithms that divide into smaller chunks, use this algorithm to solve the divided chunks. */
+	const struct diff_algo_config *inner_algo;
+
+	/* If the algorithm fails (e.g. diff_algo_myers_if_small needs too large state, or diff_algo_patience can't find
+	 * any common-unique atoms), then use this algorithm instead. */
+	const struct diff_algo_config *fallback_algo;
+};
+
+struct diff_config {
+	diff_atomize_func_t atomize_func;
+	void *atomize_func_data;
+
+	const struct diff_algo_config *algo;
+
+	/* How deep to step into subdivisions of a source file, a paranoia / safety measure to guard against infinite
+	 * loops through diff algorithms. When the maximum recursion is reached, employ diff_algo_none (i.e. remove all
+	 * left atoms and add all right atoms). */
+	unsigned int max_recursion_depth;
+};
+
+struct diff_result *diff_main(const struct diff_config *config,
+			      const uint8_t *left_data, size_t left_len,
+			      const uint8_t *right_data, size_t right_len);
+void diff_result_free(struct diff_result *result);
blob - /dev/null
blob + 1021b3be22639ea353533646e72153bb03276f0e (mode 644)
--- /dev/null
+++ diff_myers.c
@@ -0,0 +1,1042 @@
+/* Myers diff algorithm implementation, invented by Eugene W. Myers [1].
+ * Implementations of both the Myers Divide Et Impera (using linear space)
+ * and the canonical Myers algorithm (using quadratic space). */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "diff_main.h"
+#include "debug.h"
+
+/* Myers' diff algorithm [1] is nicely explained in [2].
+ * [1] http://www.xmailserver.org/diff2.pdf
+ * [2] https://blog.jcoglan.com/2017/02/12/the-myers-diff-algorithm-part-1/ ff.
+ *
+ * Myers approaches finding the smallest diff as a graph problem.
+ * The crux is that the original algorithm requires quadratic amount of memory:
+ * both sides' lengths added, and that squared. So if we're diffing lines of text, two files with 1000 lines each would
+ * blow up to a matrix of about 2000 * 2000 ints of state, about 16 Mb of RAM to figure out 2 kb of text.
+ * The solution is using Myers' "divide and conquer" extension algorithm, which does the original traversal from both
+ * ends of the files to reach a middle where these "snakes" touch, hence does not need to backtrace the traversal, and
+ * so gets away with only keeping a single column of that huge state matrix in memory.
+ *
+ * Todo: the divide and conquer requires linear *space*, not necessarily linear *time*. It recurses, apparently doing
+ * multiple Myers passes, and also it apparently favors fragmented diffs in cases where chunks of text were moved to a
+ * different place. Up to a given count of diff atoms (text lines), it might be desirable to accept the quadratic memory
+ * usage, get nicer diffs and less re-iteration of the same data?
+ */
+
+struct diff_box {
+	unsigned int left_start;
+	unsigned int left_end;
+	unsigned int right_start;
+	unsigned int right_end;
+};
+
+#define diff_box_empty(DIFF_SNAKE) ((DIFF_SNAKE)->left_end == 0)
+
+
+/* If the two contents of a file are A B C D E and X B C Y,
+ * the Myers diff graph looks like:
+ *
+ *   k0  k1
+ *    \   \
+ * k-1     0 1 2 3 4 5
+ *   \      A B C D E
+ *     0   o-o-o-o-o-o
+ *      X  | | | | | |
+ *     1   o-o-o-o-o-o
+ *      B  | |\| | | |
+ *     2   o-o-o-o-o-o
+ *      C  | | |\| | |
+ *     3   o-o-o-o-o-o
+ *      Y  | | | | | |\
+ *     4   o-o-o-o-o-o c1
+ *                  \ \
+ *                 c-1 c0
+ *
+ * Moving right means delete an atom from the left-hand-side,
+ * Moving down means add an atom from the right-hand-side.
+ * Diagonals indicate identical atoms on both sides, the challenge is to use as many diagonals as possible.
+ *
+ * The original Myers algorithm walks all the way from the top left to the bottom right, remembers all steps, and then
+ * backtraces to find the shortest path. However, that requires keeping the entire graph in memory, which needs
+ * quadratic space.
+ *
+ * Myers adds a variant that uses linear space -- note, not linear time, only linear space: walk forward and backward,
+ * find a meeting point in the middle, and recurse on the two separate sections. This is called "divide and conquer".
+ *
+ * d: the step number, starting with 0, a.k.a. the distance from the starting point.
+ * k: relative index in the state array for the forward scan, indicating on which diagonal through the diff graph we
+ *    currently are.
+ * c: relative index in the state array for the backward scan, indicating the diagonal number from the bottom up.
+ *
+ * The "divide and conquer" traversal through the Myers graph looks like this:
+ *
+ *      | d=   0   1   2   3      2   1   0
+ *  ----+--------------------------------------------
+ *  k=  |                                      c=
+ *   4  |                                       3
+ *      |
+ *   3  |                 3,0    5,2            2
+ *      |                /          \
+ *   2  |             2,0            5,3        1
+ *      |            /                 \
+ *   1  |         1,0     4,3 >= 4,3    5,4<--  0
+ *      |        /       /          \  /
+ *   0  |  -->0,0     3,3            4,4       -1
+ *      |        \   /              /
+ *  -1  |         0,1     1,2    3,4           -2
+ *      |            \   /
+ *  -2  |             0,2                      -3
+ *      |                \
+ *      |                 0,3
+ *      |  forward->                 <-backward
+ *
+ * x,y pairs here are the coordinates in the Myers graph:
+ * x = atom index in left-side source, y = atom index in the right-side source.
+ *
+ * Only one forward column and one backward column are kept in mem, each need at most left.len + 1 + right.len items.
+ * Note that each d step occupies either the even or the odd items of a column: if e.g. the previous column is in the
+ * odd items, the next column is formed in the even items, without overwriting the previous column's results.
+ *
+ * Also note that from the diagonal index k and the x coordinate, the y coordinate can be derived:
+ *    y = x - k
+ * Hence the state array only needs to keep the x coordinate, i.e. the position in the left-hand file, and the y
+ * coordinate, i.e. position in the right-hand file, is derived from the index in the state array.
+ *
+ * The two traces meet at 4,3, the first step (here found in the forward traversal) where a forward position is on or
+ * past a backward traced position on the same diagonal.
+ *
+ * This divides the problem space into:
+ *
+ *         0 1 2 3 4 5
+ *          A B C D E
+ *     0   o-o-o-o-o
+ *      X  | | | | |
+ *     1   o-o-o-o-o
+ *      B  | |\| | |
+ *     2   o-o-o-o-o
+ *      C  | | |\| |
+ *     3   o-o-o-o-*-o   *: forward and backward meet here
+ *      Y          | |
+ *     4           o-o
+ *
+ * Doing the same on each section lead to:
+ *
+ *         0 1 2 3 4 5
+ *          A B C D E
+ *     0   o-o
+ *      X  | |
+ *     1   o-b    b: backward d=1 first reaches here (sliding up the snake)
+ *      B     \   f: then forward d=2 reaches here (sliding down the snake)
+ *     2       o     As result, the box from b to f is found to be identical;
+ *      C       \    leaving a top box from 0,0 to 1,1 and a bottom trivial tail 3,3 to 4,3.
+ *     3         f-o
+ *
+ *     3           o-*
+ *      Y            |
+ *     4             o   *: forward and backward meet here
+ *
+ * and solving the last top left box gives:
+ *
+ *         0 1 2 3 4 5
+ *          A B C D E           -A
+ *     0   o-o                  +X
+ *      X    |                   B
+ *     1     o                   C
+ *      B     \                 -D
+ *     2       o                -E
+ *      C       \               +Y
+ *     3         o-o-o
+ *      Y            |
+ *     4             o
+ *
+ */
+
+#define xk_to_y(X, K) ((X) - (K))
+#define xc_to_y(X, C, DELTA) ((X) - (C) + (DELTA))
+#define k_to_c(K, DELTA) ((K) + (DELTA))
+#define c_to_k(C, DELTA) ((C) - (DELTA))
+
+/* Do one forwards step in the "divide and conquer" graph traversal.
+ * left: the left side to diff.
+ * right: the right side to diff against.
+ * kd_forward: the traversal state for forwards traversal, modified by this function.
+ *             This is carried over between invocations with increasing d.
+ *             kd_forward points at the center of the state array, allowing negative indexes.
+ * kd_backward: the traversal state for backwards traversal, to find a meeting point.
+ *              Since forwards is done first, kd_backward will be valid for d - 1, not d.
+ *              kd_backward points at the center of the state array, allowing negative indexes.
+ * d: Step or distance counter, indicating for what value of d the kd_forward should be populated.
+ *    For d == 0, kd_forward[0] is initialized, i.e. the first invocation should be for d == 0.
+ * meeting_snake: resulting meeting point, if any.
+ */
+static void diff_divide_myers_forward(struct diff_data *left, struct diff_data *right,
+				      int *kd_forward, int *kd_backward, int d,
+				      struct diff_box *meeting_snake)
+{
+	int delta = (int)right->atoms.len - (int)left->atoms.len;
+	int prev_x;
+	int prev_y;
+	int k;
+	int x;
+
+	debug("-- %s d=%d\n", __func__, d);
+	debug_dump_myers_graph(left, right, NULL);
+
+	for (k = d; k >= -d; k -= 2) {
+		if (k < -(int)right->atoms.len || k > (int)left->atoms.len) {
+			/* This diagonal is completely outside of the Myers graph, don't calculate it. */
+			if (k < -(int)right->atoms.len)
+				debug(" %d k < -(int)right->atoms.len %d\n", k, -(int)right->atoms.len);
+			else
+				debug(" %d k > left->atoms.len %d\n", k, left->atoms.len);
+			if (k < 0) {
+				/* We are traversing negatively, and already below the entire graph, nothing will come
+				 * of this. */
+				debug(" break");
+				break;
+			}
+			debug(" continue");
+			continue;
+		}
+		debug("- k = %d\n", k);
+		if (d == 0) {
+			/* This is the initializing step. There is no prev_k yet, get the initial x from the top left of
+			 * the Myers graph. */
+			x = 0;
+		}
+		/* Favoring "-" lines first means favoring moving rightwards in the Myers graph.
+		 * For this, all k should derive from k - 1, only the bottom most k derive from k + 1:
+		 *
+		 *      | d=   0   1   2
+		 *  ----+----------------
+		 *  k=  |
+		 *   2  |             2,0 <-- from prev_k = 2 - 1 = 1
+		 *      |            /
+		 *   1  |         1,0
+		 *      |        /
+		 *   0  |  -->0,0     3,3
+		 *      |       \\   /
+		 *  -1  |         0,1 <-- bottom most for d=1 from prev_k = -1 + 1 = 0
+		 *      |           \\
+		 *  -2  |             0,2 <-- bottom most for d=2 from prev_k = -2 + 1 = -1
+		 *
+		 * Except when a k + 1 from a previous run already means a further advancement in the graph.
+		 * If k == d, there is no k + 1 and k - 1 is the only option.
+		 * If k < d, use k + 1 in case that yields a larger x. Also use k + 1 if k - 1 is outside the graph.
+		 */
+		else if (k > -d && (k == d
+				    || (k - 1 >= -(int)right->atoms.len
+					&& kd_forward[k - 1] >= kd_forward[k + 1]))) {
+			/* Advance from k - 1.
+			 * From position prev_k, step to the right in the Myers graph: x += 1.
+			 */
+			int prev_k = k - 1;
+			prev_x = kd_forward[prev_k];
+			prev_y = xk_to_y(prev_x, prev_k);
+			x = prev_x + 1;
+		} else {
+			/* The bottom most one.
+			 * From position prev_k, step to the bottom in the Myers graph: y += 1.
+			 * Incrementing y is achieved by decrementing k while keeping the same x.
+			 * (since we're deriving y from y = x - k).
+			 */
+			int prev_k = k + 1;
+			prev_x = kd_forward[prev_k];
+			prev_y = xk_to_y(prev_x, prev_k);
+			x = prev_x;
+		}
+
+		/* Slide down any snake that we might find here. */
+		while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len
+		       && diff_atom_same(&left->atoms.head[x], &right->atoms.head[xk_to_y(x, k)]))
+		       x++;
+		kd_forward[k] = x;
+
+		if (DEBUG) {
+			int fi;
+			for (fi = d; fi >= k; fi--) {
+				debug("kd_forward[%d] = (%d, %d)\n", fi, kd_forward[fi], kd_forward[fi] - fi);
+				/*
+				if (kd_forward[fi] >= 0 && kd_forward[fi] < left->atoms.len)
+					debug_dump_atom(left, right, &left->atoms.head[kd_forward[fi]]);
+				else
+					debug("\n");
+				if (kd_forward[fi]-fi >= 0 && kd_forward[fi]-fi < right->atoms.len)
+					debug_dump_atom(right, left, &right->atoms.head[kd_forward[fi]-fi]);
+				else
+					debug("\n");
+				*/
+			}
+		}
+
+		if (x < 0 || x > left->atoms.len
+		    || xk_to_y(x, k) < 0 || xk_to_y(x, k) > right->atoms.len)
+			continue;
+
+		/* Figured out a new forwards traversal, see if this has gone onto or even past a preceding backwards
+		 * traversal.
+		 *
+		 * If the delta in length is odd, then d and backwards_d hit the same state indexes:
+		 *      | d=   0   1   2      1   0
+		 *  ----+----------------    ----------------
+		 *  k=  |                              c=
+		 *   4  |                               3
+		 *      |
+		 *   3  |                               2
+		 *      |                same
+		 *   2  |             2,0====5,3        1
+		 *      |            /          \
+		 *   1  |         1,0            5,4<-- 0
+		 *      |        /              /
+		 *   0  |  -->0,0     3,3====4,4       -1
+		 *      |        \   /
+		 *  -1  |         0,1                  -2
+		 *      |            \
+		 *  -2  |             0,2              -3
+		 *      |
+		 *
+		 * If the delta is even, they end up off-by-one, i.e. on different diagonals:
+		 *
+		 *      | d=   0   1   2    1   0
+		 *  ----+----------------  ----------------
+		 *      |                            c=
+		 *   3  |                             3
+		 *      |
+		 *   2  |             2,0 off         2
+		 *      |            /   \\
+		 *   1  |         1,0      4,3        1
+		 *      |        /       //   \
+		 *   0  |  -->0,0     3,3      4,4<-- 0
+		 *      |        \   /        /
+		 *  -1  |         0,1      3,4       -1
+		 *      |            \   //
+		 *  -2  |             0,2            -2
+		 *      |
+		 *
+		 * So in the forward path, we can only match up diagonals when the delta is odd.
+		 */
+		 /* Forwards is done first, so the backwards one was still at d - 1. Can't do this for d == 0. */
+		int backwards_d = d - 1;
+		if ((delta & 1) && (backwards_d >= 0)) {
+			debug("backwards_d = %d\n", backwards_d);
+
+			/* If both sides have the same length, forward and backward start on the same diagonal, meaning the
+			 * backwards state index c == k.
+			 * As soon as the lengths are not the same, the backwards traversal starts on a different diagonal, and
+			 * c = k shifted by the difference in length.
+			 */
+			int c = k_to_c(k, delta);
+
+			/* When the file sizes are very different, the traversal trees start on far distant diagonals.
+			 * They don't necessarily meet straight on. See whether this forward value is on a diagonal that
+			 * is also valid in kd_backward[], and match them if so. */
+			if (c >= -backwards_d && c <= backwards_d) {
+				/* Current k is on a diagonal that exists in kd_backward[]. If the two x positions have
+				 * met or passed (forward walked onto or past backward), then we've found a midpoint / a
+				 * mid-box.
+				 *
+				 * But we need to avoid matching a situation like this:
+				 *       0  1
+				 *        x y
+				 *   0   o-o-o
+				 *     x |\| |
+				 *   1   o-o-o
+				 *     y | |\|
+				 *   2  (B)o-o  <--(B) backwards traversal reached here
+				 *     a | | |
+				 *   3   o-o-o<-- prev_x, prev_y
+				 *     b | | |
+				 *   4   o-o(F) <--(F) forwards traversal  reached here
+				 *     x |\| |     Now both are on the same diagonal and look like they passed,
+				 *   5   o-o-o     but actually they have sneaked past each other and have not met.
+				 *     y | |\|
+				 *   6   o-o-o
+				 *
+				 * The solution is to notice that prev_x,prev_y were also already past (B).
+				 */
+				int backward_x = kd_backward[c];
+				int backward_y = xc_to_y(backward_x, c, delta);
+				debug(" prev_x,y = (%d,%d)  c%d:backward_x,y = (%d,%d)  k%d:x,y = (%d,%d)\n",
+				      prev_x, prev_y, c, backward_x, backward_y, k, x, xk_to_y(x, k));
+				if (prev_x <= backward_x && prev_y <= backward_y
+				    && x >= backward_x) {
+					*meeting_snake = (struct diff_box){
+						.left_start = backward_x,
+						.left_end = x,
+						.right_start = xc_to_y(backward_x, c, delta),
+						.right_end = xk_to_y(x, k),
+					};
+					debug("HIT x=(%u,%u) - y=(%u,%u)\n",
+					      meeting_snake->left_start,
+					      meeting_snake->right_start,
+					      meeting_snake->left_end,
+					      meeting_snake->right_end);
+					return;
+				}
+			}
+		}
+	}
+}
+
+/* Do one backwards step in the "divide and conquer" graph traversal.
+ * left: the left side to diff.
+ * right: the right side to diff against.
+ * kd_forward: the traversal state for forwards traversal, to find a meeting point.
+ *             Since forwards is done first, after this, both kd_forward and kd_backward will be valid for d.
+ *             kd_forward points at the center of the state array, allowing negative indexes.
+ * kd_backward: the traversal state for backwards traversal, to find a meeting point.
+ *              This is carried over between invocations with increasing d.
+ *              kd_backward points at the center of the state array, allowing negative indexes.
+ * d: Step or distance counter, indicating for what value of d the kd_backward should be populated.
+ *    Before the first invocation, kd_backward[0] shall point at the bottom right of the Myers graph
+ *    (left.len, right.len).
+ *    The first invocation will be for d == 1.
+ * meeting_snake: resulting meeting point, if any.
+ */
+static void diff_divide_myers_backward(struct diff_data *left, struct diff_data *right,
+				       int *kd_forward, int *kd_backward, int d,
+				       struct diff_box *meeting_snake)
+{
+	int delta = (int)right->atoms.len - (int)left->atoms.len;
+	int prev_x;
+	int prev_y;
+	int c;
+	int x;
+
+	debug("-- %s d=%d\n", __func__, d);
+	debug_dump_myers_graph(left, right, NULL);
+
+	for (c = d; c >= -d; c -= 2) {
+		if (c < -(int)left->atoms.len || c > (int)right->atoms.len) {
+			/* This diagonal is completely outside of the Myers graph, don't calculate it. */
+			if (c < -(int)left->atoms.len)
+				debug(" %d c < -(int)left->atoms.len %d\n", c, -(int)left->atoms.len);
+			else
+				debug(" %d c > right->atoms.len %d\n", c, right->atoms.len);
+			if (c < 0) {
+				/* We are traversing negatively, and already below the entire graph, nothing will come
+				 * of this. */
+				debug(" break");
+				break;
+			}
+			debug(" continue");
+			continue;
+		}
+		debug("- c = %d\n", c);
+		if (d == 0) {
+			/* This is the initializing step. There is no prev_c yet, get the initial x from the bottom
+			 * right of the Myers graph. */
+			x = left->atoms.len;
+		}
+		/* Favoring "-" lines first means favoring moving rightwards in the Myers graph.
+		 * For this, all c should derive from c - 1, only the bottom most c derive from c + 1:
+		 *
+		 *                                  2   1   0
+		 *  ---------------------------------------------------
+		 *                                               c=
+		 *                                                3
+		 *
+		 *         from prev_c = c - 1 --> 5,2            2
+		 *                                    \
+		 *                                     5,3        1
+		 *                                        \
+		 *                                 4,3     5,4<-- 0
+		 *                                    \   /
+		 *  bottom most for d=1 from c + 1 --> 4,4       -1
+		 *                                    /
+		 *         bottom most for d=2 --> 3,4           -2
+		 *
+		 * Except when a c + 1 from a previous run already means a further advancement in the graph.
+		 * If c == d, there is no c + 1 and c - 1 is the only option.
+		 * If c < d, use c + 1 in case that yields a larger x. Also use c + 1 if c - 1 is outside the graph.
+		 */
+		else if (c > -d && (c == d
+				    || (c - 1 >= -(int)right->atoms.len
+					&& kd_backward[c - 1] <= kd_backward[c + 1]))) {
+			/* A top one.
+			 * From position prev_c, step upwards in the Myers graph: y -= 1.
+			 * Decrementing y is achieved by incrementing c while keeping the same x.
+			 * (since we're deriving y from y = x - c + delta).
+			 */
+			int prev_c = c - 1;
+			prev_x = kd_backward[prev_c];
+			prev_y = xc_to_y(prev_x, prev_c, delta);
+			x = prev_x;
+		} else {
+			/* The bottom most one.
+			 * From position prev_c, step to the left in the Myers graph: x -= 1.
+			 */
+			int prev_c = c + 1;
+			prev_x = kd_backward[prev_c];
+			prev_y = xc_to_y(prev_x, prev_c, delta);
+			x = prev_x - 1;
+		}
+
+		/* Slide up any snake that we might find here. */
+		debug("c=%d x-1=%d Yb-1=%d-1=%d\n", c, x-1, xc_to_y(x, c, delta), xc_to_y(x, c, delta)-1);
+		if (x > 0) {
+			debug("  l="); debug_dump_atom(left, right, &left->atoms.head[x-1]);
+		}
+		if (xc_to_y(x, c, delta) > 0) {
+			debug("   r="); debug_dump_atom(right, left, &right->atoms.head[xc_to_y(x, c, delta)-1]);
+		}
+		while (x > 0 && xc_to_y(x, c, delta) > 0
+		       && diff_atom_same(&left->atoms.head[x-1], &right->atoms.head[xc_to_y(x, c, delta)-1]))
+		       x--;
+		kd_backward[c] = x;
+
+		if (DEBUG) {
+			int fi;
+			for (fi = d; fi >= c; fi--) {
+				debug("kd_backward[%d] = (%d, %d)\n", fi, kd_backward[fi],
+				      kd_backward[fi] - fi + delta);
+				/*
+				if (kd_backward[fi] >= 0 && kd_backward[fi] < left->atoms.len)
+					debug_dump_atom(left, right, &left->atoms.head[kd_backward[fi]]);
+				else
+					debug("\n");
+				if (kd_backward[fi]-fi+delta >= 0 && kd_backward[fi]-fi+delta < right->atoms.len)
+					debug_dump_atom(right, left, &right->atoms.head[kd_backward[fi]-fi+delta]);
+				else
+					debug("\n");
+				*/
+			}
+		}
+
+		if (x < 0 || x > left->atoms.len
+		    || xc_to_y(x, c, delta) < 0 || xc_to_y(x, c, delta) > right->atoms.len)
+			continue;
+
+		/* Figured out a new backwards traversal, see if this has gone onto or even past a preceding forwards
+		 * traversal.
+		 *
+		 * If the delta in length is even, then d and backwards_d hit the same state indexes -- note how this is
+		 * different from in the forwards traversal, because now both d are the same:
+		 *
+		 *      | d=   0   1   2      2   1   0
+		 *  ----+----------------    --------------------
+		 *  k=  |                                  c=
+		 *   4  |
+		 *      |
+		 *   3  |                                   3
+		 *      |                same
+		 *   2  |             2,0====5,2            2
+		 *      |            /          \
+		 *   1  |         1,0            5,3        1
+		 *      |        /              /  \
+		 *   0  |  -->0,0     3,3====4,3    5,4<--  0
+		 *      |        \   /             /
+		 *  -1  |         0,1            4,4       -1
+		 *      |            \
+		 *  -2  |             0,2                  -2
+		 *      |
+		 *                                      -3
+		 * If the delta is odd, they end up off-by-one, i.e. on different diagonals.
+		 * So in the backward path, we can only match up diagonals when the delta is even.
+		 */
+		if ((delta & 1) == 0) {
+			/* Forwards was done first, now both d are the same. */
+			int forwards_d = d;
+
+			/* As soon as the lengths are not the same, the backwards traversal starts on a different diagonal, and
+			 * c = k shifted by the difference in length.
+			 */
+			int k = c_to_k(c, delta);
+
+			/* When the file sizes are very different, the traversal trees start on far distant diagonals.
+			 * They don't necessarily meet straight on. See whether this backward value is also on a valid
+			 * diagonal in kd_forward[], and match them if so. */
+			if (k >= -forwards_d && k <= forwards_d) {
+				/* Current c is on a diagonal that exists in kd_forward[]. If the two x positions have
+				 * met or passed (backward walked onto or past forward), then we've found a midpoint / a
+				 * mid-box. */
+				int forward_x = kd_forward[k];
+				int forward_y = xk_to_y(forward_x, k);
+				debug("Compare %d to %d  k=%d  (x=%d,y=%d) to (x=%d,y=%d)\n",
+				      forward_x, x, k,
+				      forward_x, xk_to_y(forward_x, k), x, xc_to_y(x, c, delta));
+				if (forward_x <= prev_x && forward_y <= prev_y
+				    && forward_x >= x) {
+					*meeting_snake = (struct diff_box){
+						.left_start = x,
+						.left_end = forward_x,
+						.right_start = xc_to_y(x, c, delta),
+						.right_end = xk_to_y(forward_x, k),
+					};
+					debug("HIT x=%u,%u - y=%u,%u\n",
+					      meeting_snake->left_start,
+					      meeting_snake->right_start,
+					      meeting_snake->left_end,
+					      meeting_snake->right_end);
+					return;
+				}
+			}
+		}
+	}
+}
+
+/* Myers "Divide et Impera": tracing forwards from the start and backwards from the end to find a midpoint that divides
+ * the problem into smaller chunks. Requires only linear amounts of memory. */
+enum diff_rc diff_algo_myers_divide(const struct diff_algo_config *algo_config, struct diff_state *state)
+{
+	enum diff_rc rc = DIFF_RC_ENOMEM;
+	struct diff_data *left = &state->left;
+	struct diff_data *right = &state->right;
+
+	debug("\n** %s\n", __func__);
+	debug("left:\n");
+	debug_dump(left);
+	debug("right:\n");
+	debug_dump(right);
+	debug_dump_myers_graph(left, right, NULL);
+
+	/* Allocate two columns of a Myers graph, one for the forward and one for the backward traversal. */
+	unsigned int max = left->atoms.len + right->atoms.len;
+	size_t kd_len = max + 1;
+	size_t kd_buf_size = kd_len << 1;
+	int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int));
+	if (!kd_buf)
+		return DIFF_RC_ENOMEM;
+	int i;
+	for (i = 0; i < kd_buf_size; i++)
+		kd_buf[i] = -1;
+	int *kd_forward = kd_buf;
+	int *kd_backward = kd_buf + kd_len;
+
+	/* The 'k' axis in Myers spans positive and negative indexes, so point the kd to the middle.
+	 * It is then possible to index from -max/2 .. max/2. */
+	kd_forward += max/2;
+	kd_backward += max/2;
+
+	int d;
+	struct diff_box mid_snake = {};
+	for (d = 0; d <= (max/2); d++) {
+		debug("-- d=%d\n", d);
+		diff_divide_myers_forward(left, right, kd_forward, kd_backward, d, &mid_snake);
+		if (!diff_box_empty(&mid_snake))
+			break;
+		diff_divide_myers_backward(left, right, kd_forward, kd_backward, d, &mid_snake);
+		if (!diff_box_empty(&mid_snake))
+			break;
+	}
+
+	if (diff_box_empty(&mid_snake)) {
+		/* Divide and conquer failed to find a meeting point. Use the fallback_algo defined in the algo_config
+		 * (leave this to the caller). This is just paranoia/sanity, we normally should always find a midpoint.
+		 */
+		debug(" no midpoint \n");
+		rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;
+		goto return_rc;
+	} else {
+		debug(" mid snake L: %u to %u of %u   R: %u to %u of %u\n",
+		      mid_snake.left_start, mid_snake.left_end, left->atoms.len,
+		      mid_snake.right_start, mid_snake.right_end, right->atoms.len);
+
+		/* Section before the mid-snake.  */
+		debug("Section before the mid-snake\n");
+
+		struct diff_atom *left_atom = &left->atoms.head[0];
+		unsigned int left_section_len = mid_snake.left_start;
+		struct diff_atom *right_atom = &right->atoms.head[0];
+		unsigned int right_section_len = mid_snake.right_start;
+
+		if (left_section_len && right_section_len) {
+			/* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */
+			if (!diff_state_add_chunk(state, false,
+						  left_atom, left_section_len,
+						  right_atom, right_section_len))
+				goto return_rc;
+		} else if (left_section_len && !right_section_len) {
+			/* Only left atoms and none on the right, they form a "minus" chunk, then. */
+			if (!diff_state_add_chunk(state, true,
+						  left_atom, left_section_len,
+						  right_atom, 0))
+				goto return_rc;
+		} else if (!left_section_len && right_section_len) {
+			/* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
+			if (!diff_state_add_chunk(state, true,
+						  left_atom, 0,
+						  right_atom, right_section_len))
+				goto return_rc;
+		}
+		/* else: left_section_len == 0 and right_section_len == 0, i.e. nothing before the mid-snake. */
+
+		/* the mid-snake, identical data on both sides: */
+		debug("the mid-snake\n");
+		if (!diff_state_add_chunk(state, true,
+					  &left->atoms.head[mid_snake.left_start],
+					  mid_snake.left_end - mid_snake.left_start,
+					  &right->atoms.head[mid_snake.right_start],
+					  mid_snake.right_end - mid_snake.right_start))
+			goto return_rc;
+
+		/* Section after the mid-snake. */
+		debug("Section after the mid-snake\n");
+		debug("  left_end %u  right_end %u\n", mid_snake.left_end, mid_snake.right_end);
+		debug("  left_count %u  right_count %u\n", left->atoms.len, right->atoms.len);
+		left_atom = &left->atoms.head[mid_snake.left_end];
+		left_section_len = left->atoms.len - mid_snake.left_end;
+		right_atom = &right->atoms.head[mid_snake.right_end];
+		right_section_len = right->atoms.len - mid_snake.right_end;
+
+		if (left_section_len && right_section_len) {
+			/* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */
+			if (!diff_state_add_chunk(state, false,
+						  left_atom, left_section_len,
+						  right_atom, right_section_len))
+				goto return_rc;
+		} else if (left_section_len && !right_section_len) {
+			/* Only left atoms and none on the right, they form a "minus" chunk, then. */
+			if (!diff_state_add_chunk(state, true,
+						  left_atom, left_section_len,
+						  right_atom, 0))
+				goto return_rc;
+		} else if (!left_section_len && right_section_len) {
+			/* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
+			if (!diff_state_add_chunk(state, true,
+						  left_atom, 0,
+						  right_atom, right_section_len))
+				goto return_rc;
+		}
+		/* else: left_section_len == 0 and right_section_len == 0, i.e. nothing after the mid-snake. */
+	}
+
+	rc = DIFF_RC_OK;
+
+return_rc:
+	free(kd_buf);
+	debug("** END %s\n", __func__);
+	return rc;
+}
+
+/* Myers Diff tracing from the start all the way through to the end, requiring quadratic amounts of memory. This can
+ * fail if the required space surpasses algo_config->permitted_state_size. */
+enum diff_rc diff_algo_myers(const struct diff_algo_config *algo_config, struct diff_state *state)
+{
+	/* do a diff_divide_myers_forward() without a _backward(), so that it walks forward across the entire
+	 * files to reach the end. Keep each run's state, and do a final backtrace. */
+	enum diff_rc rc = DIFF_RC_ENOMEM;
+	struct diff_data *left = &state->left;
+	struct diff_data *right = &state->right;
+
+	debug("\n** %s\n", __func__);
+	debug("left:\n");
+	debug_dump(left);
+	debug("right:\n");
+	debug_dump(right);
+	debug_dump_myers_graph(left, right, NULL);
+
+	/* Allocate two columns of a Myers graph, one for the forward and one for the backward traversal. */
+	unsigned int max = left->atoms.len + right->atoms.len;
+	size_t kd_len = max + 1 + max;
+	size_t kd_buf_size = kd_len * kd_len;
+	debug("state size: %zu\n", kd_buf_size);
+	if (kd_buf_size < kd_len /* overflow? */
+	    || kd_buf_size * sizeof(int) > algo_config->permitted_state_size) {
+		debug("state size %zu > permitted_state_size %zu, use fallback_algo\n",
+		      kd_buf_size, algo_config->permitted_state_size);
+		return DIFF_RC_USE_DIFF_ALGO_FALLBACK;
+	}
+
+	int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int));
+	if (!kd_buf)
+		return DIFF_RC_ENOMEM;
+	int i;
+	for (i = 0; i < kd_buf_size; i++)
+		kd_buf[i] = -1;
+
+	/* The 'k' axis in Myers spans positive and negative indexes, so point the kd to the middle.
+	 * It is then possible to index from -max .. max. */
+	int *kd_origin = kd_buf + max;
+	int *kd_column = kd_origin;
+
+	int d;
+	int backtrack_d = -1;
+	int backtrack_k = 0;
+	int k;
+	int x, y;
+	for (d = 0; d <= max; d++, kd_column += kd_len) {
+		debug("-- d=%d\n", d);
+
+		debug("-- %s d=%d\n", __func__, d);
+
+		for (k = d; k >= -d; k -= 2) {
+			if (k < -(int)right->atoms.len || k > (int)left->atoms.len) {
+				/* This diagonal is completely outside of the Myers graph, don't calculate it. */
+				if (k < -(int)right->atoms.len)
+					debug(" %d k < -(int)right->atoms.len %d\n", k, -(int)right->atoms.len);
+				else
+					debug(" %d k > left->atoms.len %d\n", k, left->atoms.len);
+				if (k < 0) {
+					/* We are traversing negatively, and already below the entire graph, nothing will come
+					 * of this. */
+					debug(" break");
+					break;
+				}
+				debug(" continue");
+				continue;
+			}
+
+			debug("- k = %d\n", k);
+			if (d == 0) {
+				/* This is the initializing step. There is no prev_k yet, get the initial x from the top left of
+				 * the Myers graph. */
+				x = 0;
+			} else {
+				int *kd_prev_column = kd_column - kd_len;
+
+				/* Favoring "-" lines first means favoring moving rightwards in the Myers graph.
+				 * For this, all k should derive from k - 1, only the bottom most k derive from k + 1:
+				 *
+				 *      | d=   0   1   2
+				 *  ----+----------------
+				 *  k=  |
+				 *   2  |             2,0 <-- from prev_k = 2 - 1 = 1
+				 *      |            /
+				 *   1  |         1,0
+				 *      |        /
+				 *   0  |  -->0,0     3,3
+				 *      |       \\   /
+				 *  -1  |         0,1 <-- bottom most for d=1 from prev_k = -1 + 1 = 0
+				 *      |           \\
+				 *  -2  |             0,2 <-- bottom most for d=2 from prev_k = -2 + 1 = -1
+				 *
+				 * Except when a k + 1 from a previous run already means a further advancement in the graph.
+				 * If k == d, there is no k + 1 and k - 1 is the only option.
+				 * If k < d, use k + 1 in case that yields a larger x. Also use k + 1 if k - 1 is outside the graph.
+				 */
+				if (k > -d && (k == d
+					       || (k - 1 >= -(int)right->atoms.len
+						   && kd_prev_column[k - 1] >= kd_prev_column[k + 1]))) {
+					/* Advance from k - 1.
+					 * From position prev_k, step to the right in the Myers graph: x += 1.
+					 */
+					int prev_k = k - 1;
+					int prev_x = kd_prev_column[prev_k];
+					x = prev_x + 1;
+				} else {
+					/* The bottom most one.
+					 * From position prev_k, step to the bottom in the Myers graph: y += 1.
+					 * Incrementing y is achieved by decrementing k while keeping the same x.
+					 * (since we're deriving y from y = x - k).
+					 */
+					int prev_k = k + 1;
+					int prev_x = kd_prev_column[prev_k];
+					x = prev_x;
+				}
+			}
+
+			/* Slide down any snake that we might find here. */
+			while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len
+			       && diff_atom_same(&left->atoms.head[x], &right->atoms.head[xk_to_y(x, k)]))
+			       x++;
+			kd_column[k] = x;
+
+			if (DEBUG) {
+				int fi;
+				for (fi = d; fi >= k; fi-=2) {
+					debug("kd_column[%d] = (%d, %d)\n", fi, kd_column[fi], kd_column[fi] - fi);
+#if 0
+					if (kd_column[fi] >= 0 && kd_column[fi] < left->atoms.len)
+						debug_dump_atom(left, right, &left->atoms.head[kd_column[fi]]);
+					else
+						debug("\n");
+					if (kd_column[fi]-fi >= 0 && kd_column[fi]-fi < right->atoms.len)
+						debug_dump_atom(right, left, &right->atoms.head[kd_column[fi]-fi]);
+					else
+						debug("\n");
+#endif
+				}
+			}
+
+			if (x == left->atoms.len && xk_to_y(x, k) == right->atoms.len) {
+				/* Found a path */
+				backtrack_d = d;
+				backtrack_k = k;
+				debug("Reached the end at d = %d, k = %d\n",
+				      backtrack_d, backtrack_k);
+				break;
+			}
+		}
+
+		if (backtrack_d >= 0)
+			break;
+	}
+
+	debug_dump_myers_graph(left, right, kd_origin);
+
+	/* backtrack. A matrix spanning from start to end of the file is ready:
+	 *
+	 *      | d=   0   1   2   3   4
+	 *  ----+---------------------------------
+	 *  k=  |
+	 *   3  |
+	 *      |
+	 *   2  |             2,0
+	 *      |            /
+	 *   1  |         1,0     4,3
+	 *      |        /       /   \
+	 *   0  |  -->0,0     3,3     4,4 --> backtrack_d = 4, backtrack_k = 0
+	 *      |        \   /   \
+	 *  -1  |         0,1     3,4
+	 *      |            \
+	 *  -2  |             0,2
+	 *      |
+	 *
+	 * From (4,4) backwards, find the previous position that is the largest, and remember it.
+	 *
+	 */
+	for (d = backtrack_d, k = backtrack_k; d >= 0; d--) {
+		x = kd_column[k];
+		y = xk_to_y(x, k);
+
+		/* When the best position is identified, remember it for that kd_column.
+		 * That kd_column is no longer needed otherwise, so just re-purpose kd_column[0] = x and kd_column[1] = y,
+		 * so that there is no need to allocate more memory.
+		 */
+		kd_column[0] = x;
+		kd_column[1] = y;
+		debug("Backtrack d=%d: xy=(%d, %d)\n",
+		      d, kd_column[0], kd_column[1]);
+
+		/* Don't access memory before kd_buf */
+		if (d == 0)
+			break;
+		int *kd_prev_column = kd_column - kd_len;
+
+		/* When y == 0, backtracking downwards (k-1) is the only way.
+		 * When x == 0, backtracking upwards (k+1) is the only way.
+		 *
+		 *      | d=   0   1   2   3   4
+		 *  ----+---------------------------------
+		 *  k=  |
+		 *   3  |
+		 *      |                ..y == 0
+		 *   2  |             2,0
+		 *      |            /
+		 *   1  |         1,0     4,3
+		 *      |        /       /   \
+		 *   0  |  -->0,0     3,3     4,4 --> backtrack_d = 4, backtrack_k = 0
+		 *      |        \   /   \
+		 *  -1  |         0,1     3,4
+		 *      |            \
+		 *  -2  |             0,2__
+		 *      |                  x == 0
+		 */
+		debug("prev[k-1] = %d,%d  prev[k+1] = %d,%d\n",
+		      kd_prev_column[k-1], xk_to_y(kd_prev_column[k-1],k-1),
+		      kd_prev_column[k+1], xk_to_y(kd_prev_column[k+1],k+1));
+		if (y == 0
+		    || (x > 0 && kd_prev_column[k - 1] >= kd_prev_column[k + 1])) {
+			k = k - 1;
+			debug("prev k=k-1=%d x=%d y=%d\n",
+			      k, kd_prev_column[k], xk_to_y(kd_prev_column[k], k));
+		} else {
+			k = k + 1;
+			debug("prev k=k+1=%d x=%d y=%d\n",
+			      k, kd_prev_column[k], xk_to_y(kd_prev_column[k], k));
+		}
+		kd_column = kd_prev_column;
+	}
+
+	/* Forwards again, this time recording the diff chunks.
+	 * Definitely start from 0,0. kd_column[0] may actually point to the bottom of a snake starting at 0,0 */
+	x = 0;
+	y = 0;
+
+	kd_column = kd_origin;
+	for (d = 0; d <= backtrack_d; d++, kd_column += kd_len) {
+		int next_x = kd_column[0];
+		int next_y = kd_column[1];
+		debug("Forward track from xy(%d,%d) to xy(%d,%d)\n",
+		      x, y, next_x, next_y);
+
+		struct diff_atom *left_atom = &left->atoms.head[x];
+		int left_section_len = next_x - x;
+		struct diff_atom *right_atom = &right->atoms.head[y];
+		int right_section_len = next_y - y;
+
+		rc = DIFF_RC_ENOMEM;
+		if (left_section_len && right_section_len) {
+			/* This must be a snake slide.
+			 * Snake slides have a straight line leading into them (except when starting at (0,0)). Find
+			 * out whether the lead-in is horizontal or vertical:
+			 *
+			 *     left
+			 *  ---------->
+			 *  |
+			 * r|   o-o        o
+			 * i|      \       |
+			 * g|       o      o
+			 * h|        \      \
+			 * t|         o      o
+			 *  v
+			 *
+			 * If left_section_len > right_section_len, the lead-in is horizontal, meaning first
+			 * remove one atom from the left before sliding down the snake.
+			 * If right_section_len > left_section_len, the lead-in is vetical, so add one atom from
+			 * the right before sliding down the snake. */
+			if (left_section_len == right_section_len + 1) {
+				if (!diff_state_add_chunk(state, true,
+							  left_atom, 1,
+							  right_atom, 0))
+					goto return_rc;
+				left_atom++;
+				left_section_len--;
+			} else if (right_section_len == left_section_len + 1) {
+				if (!diff_state_add_chunk(state, true,
+							  left_atom, 0,
+							  right_atom, 1))
+					goto return_rc;
+				right_atom++;
+				right_section_len--;
+			} else if (left_section_len != right_section_len) {
+				/* The numbers are making no sense. Should never happen. */
+				rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;
+				goto return_rc;
+			}
+
+			if (!diff_state_add_chunk(state, true,
+						  left_atom, left_section_len,
+						  right_atom, right_section_len))
+				goto return_rc;
+		} else if (left_section_len && !right_section_len) {
+			/* Only left atoms and none on the right, they form a "minus" chunk, then. */
+			if (!diff_state_add_chunk(state, true,
+						  left_atom, left_section_len,
+						  right_atom, 0))
+				goto return_rc;
+		} else if (!left_section_len && right_section_len) {
+			/* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
+			if (!diff_state_add_chunk(state, true,
+						  left_atom, 0,
+						  right_atom, right_section_len))
+				goto return_rc;
+		}
+
+		x = next_x;
+		y = next_y;
+	}
+
+	rc = DIFF_RC_OK;
+
+return_rc:
+	free(kd_buf);
+	debug("** END %s rc=%d\n", __func__, rc);
+	return rc;
+}
blob - /dev/null
blob + 8c9e498eb29496228f6ff282f223b721ba1a3d03 (mode 644)
--- /dev/null
+++ diff_output.c
@@ -0,0 +1,52 @@
+/* Common parts for printing diff output */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "diff_output.h"
+
+void diff_output_lines(FILE *dest, const char *prefix, struct diff_atom *start_atom, unsigned int count)
+{
+	struct diff_atom *atom;
+	foreach_diff_atom(atom, start_atom, count) {
+		fprintf(dest, "%s", prefix);
+		int i;
+		unsigned int len = atom->len;
+		if (len && atom->at[len - 1] == '\n') {
+			len--;
+			if (len && atom->at[len - 1] == '\r')
+				len--;
+		}
+
+		for (i = 0; i < len; i++) {
+			char c = atom->at[i];
+			if ((c < 0x20 || c >= 0x7f) && c != '\t')
+				fprintf(dest, "\\x%02x", (unsigned char)c);
+			else
+				fprintf(dest, "%c", c);
+		}
+		fprintf(dest, "\n");
+	}
+}
+
+enum diff_rc diff_output_info(FILE *dest, const struct diff_input_info *info)
+{
+	if (info->arbitrary_info && *info->arbitrary_info)
+		fprintf(dest, "%s", info->arbitrary_info);
+	fprintf(dest, "--- %s\n+++ %s\n",
+		info->left_path ? : "a",
+		info->right_path ? : "b");
+        return DIFF_RC_OK;
+}
blob - /dev/null
blob + cb6854b078da31b5426b493fb1128b98361d923e (mode 644)
--- /dev/null
+++ diff_output.h
@@ -0,0 +1,43 @@
+/* Diff output generators and invocation shims. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include "diff_main.h"
+
+struct diff_input_info {
+	const char *arbitrary_info;
+	const char *left_path;
+	const char *right_path;
+};
+
+enum diff_rc diff_output_plain(FILE *dest, const struct diff_input_info *info,
+			       const struct diff_result *result);
+enum diff_rc diff_plain(FILE *dest, const struct diff_config *diff_config,
+			const struct diff_input_info *info,
+			const char *left, int left_len, const char *right, int right_len);
+
+enum diff_rc diff_output_unidiff(FILE *dest, const struct diff_input_info *info,
+				 const struct diff_result *result, unsigned int context_lines);
+enum diff_rc diff_unidiff(FILE *dest, const struct diff_config *diff_config,
+			  const struct diff_input_info *info,
+			  const char *left, int left_len, const char *right, int right_len,
+			  unsigned int context_lines);
+
+enum diff_rc diff_output_info(FILE *dest, const struct diff_input_info *info);
+void diff_output_lines(FILE *dest, const char *prefix, struct diff_atom *start_atom, unsigned int count);
blob - /dev/null
blob + fa532a4c439692797a14d9c7e9e0c5c55c161029 (mode 644)
--- /dev/null
+++ diff_output_plain.c
@@ -0,0 +1,54 @@
+/* Output all lines of a diff_result. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "diff_output.h"
+
+enum diff_rc diff_output_plain(FILE *dest, const struct diff_input_info *info,
+			       const struct diff_result *result)
+{
+	if (!result)
+		return DIFF_RC_EINVAL;
+	if (result->rc != DIFF_RC_OK)
+		return result->rc;
+
+	diff_output_info(dest, info);
+
+	int i;
+	for (i = 0; i < result->chunks.len; i++) {
+		struct diff_chunk *c = &result->chunks.head[i];
+		if (c->left_count && c->right_count)
+			diff_output_lines(dest, c->solved ? " " : "?", c->left_start, c->left_count);
+		else if (c->left_count && !c->right_count)
+			diff_output_lines(dest, c->solved ? "-" : "?", c->left_start, c->left_count);
+		else if (c->right_count && !c->left_count)
+			diff_output_lines(dest, c->solved ? "+" : "?", c->right_start, c->right_count);
+	}
+	return DIFF_RC_OK;
+}
+
+enum diff_rc diff_plain(FILE *dest, const struct diff_config *diff_config,
+			const struct diff_input_info *info,
+			const char *left, int left_len, const char *right, int right_len)
+{
+	enum diff_rc rc;
+	left_len = left_len < 0 ? strlen(left) : left_len;
+	right_len = right_len < 0 ? strlen(right) : right_len;
+	struct diff_result *result = diff_main(diff_config, left, left_len, right, right_len);
+	rc = diff_output_plain(dest, info, result);
+	diff_result_free(result);
+	return rc;
+}
blob - /dev/null
blob + 4c95e6050fb2e06f84ee46543b0ebe1f83119115 (mode 644)
--- /dev/null
+++ diff_output_unidiff.c
@@ -0,0 +1,206 @@
+/* Produce a unidiff output from a diff_result. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "diff_output.h"
+#include "debug.h"
+
+enum chunk_type {
+	CHUNK_EMPTY,
+	CHUNK_PLUS,
+	CHUNK_MINUS,
+	CHUNK_SAME,
+	CHUNK_WEIRD,
+};
+
+static inline enum chunk_type chunk_type(const struct diff_chunk *chunk)
+{
+	if (!chunk->left_count && !chunk->right_count)
+		return CHUNK_EMPTY;
+	if (!chunk->solved)
+		return CHUNK_WEIRD;
+	if (!chunk->right_count)
+		return CHUNK_MINUS;
+	if (!chunk->left_count)
+		return CHUNK_PLUS;
+	if (chunk->left_count != chunk->right_count)
+		return CHUNK_WEIRD;
+	return CHUNK_SAME;
+}
+
+struct chunk_context {
+	struct range chunk;
+	struct range left, right;
+};
+
+static bool chunk_context_empty(const struct chunk_context *cc)
+{
+	return range_empty(&cc->chunk);
+}
+
+static void chunk_context_get(struct chunk_context *cc, const struct diff_result *r, int chunk_idx,
+		       int context_lines)
+{
+	const struct diff_chunk *c = &r->chunks.head[chunk_idx];
+	int left_start = diff_atom_root_idx(&r->left, c->left_start);
+	int right_start = diff_atom_root_idx(&r->right, c->right_start);
+
+	*cc = (struct chunk_context){
+		.chunk = {
+			.start = chunk_idx,
+			.end = chunk_idx + 1,
+		},
+		.left = {
+			.start = MAX(0, left_start - context_lines),
+			.end = MIN(r->left.atoms.len, left_start + c->left_count + context_lines),
+		},
+		.right = {
+			.start = MAX(0, right_start - context_lines),
+			.end = MIN(r->right.atoms.len, right_start + c->right_count + context_lines),
+		},
+	};
+}
+
+static bool chunk_contexts_touch(const struct chunk_context *cc, const struct chunk_context *other)
+{
+	return ranges_touch(&cc->chunk, &other->chunk)
+		|| ranges_touch(&cc->left, &other->left)
+		|| ranges_touch(&cc->right, &other->right);
+}
+
+static void chunk_contexts_merge(struct chunk_context *cc, const struct chunk_context *other)
+{
+	ranges_merge(&cc->chunk, &other->chunk);
+	ranges_merge(&cc->left, &other->left);
+	ranges_merge(&cc->right, &other->right);
+}
+
+static void diff_output_unidiff_chunk(FILE *dest, bool *info_printed, const struct diff_input_info *info,
+				      const struct diff_result *result, const struct chunk_context *cc)
+{
+	if (range_empty(&cc->left) && range_empty(&cc->right))
+		return;
+
+	if (!(*info_printed)) {
+		diff_output_info(dest, info);
+		*info_printed = true;
+	}
+
+	fprintf(dest, "@@ -%d,%d +%d,%d @@\n",
+		cc->left.start + 1, cc->left.end - cc->left.start,
+		cc->right.start + 1, cc->right.end - cc->right.start);
+
+	/* Got the absolute line numbers where to start printing, and the index of the interesting (non-context) chunk.
+	 * To print context lines above the interesting chunk, nipping on the previous chunk index may be necessary.
+	 * It is guaranteed to be only context lines where left == right, so it suffices to look on the left. */
+	const struct diff_chunk *first_chunk = &result->chunks.head[cc->chunk.start];
+	int chunk_start_line = diff_atom_root_idx(&result->left, first_chunk->left_start);
+	if (cc->left.start < chunk_start_line)
+		diff_output_lines(dest, " ", &result->left.atoms.head[cc->left.start],
+				  chunk_start_line - cc->left.start);
+
+	/* Now write out all the joined chunks and contexts between them */
+	int c_idx;
+	for (c_idx = cc->chunk.start; c_idx < cc->chunk.end; c_idx++) {
+		const struct diff_chunk *c = &result->chunks.head[c_idx];
+
+		if (c->left_count && c->right_count)
+			diff_output_lines(dest, c->solved ? " " : "?", c->left_start, c->left_count);
+		else if (c->left_count && !c->right_count)
+			diff_output_lines(dest, c->solved ? "-" : "?", c->left_start, c->left_count);
+		else if (c->right_count && !c->left_count)
+			diff_output_lines(dest, c->solved ? "+" : "?", c->right_start, c->right_count);
+	}
+
+	/* Trailing context? */
+	const struct diff_chunk *last_chunk = &result->chunks.head[cc->chunk.end - 1];
+	int chunk_end_line = diff_atom_root_idx(&result->left, last_chunk->left_start + last_chunk->left_count);
+	if (cc->left.end > chunk_end_line)
+		diff_output_lines(dest, " ", &result->left.atoms.head[chunk_end_line],
+				  cc->left.end - chunk_end_line);
+}
+
+enum diff_rc diff_output_unidiff(FILE *dest, const struct diff_input_info *info,
+				 const struct diff_result *result, unsigned int context_lines)
+{
+	if (!result)
+		return DIFF_RC_EINVAL;
+	if (result->rc != DIFF_RC_OK)
+		return result->rc;
+
+	struct chunk_context cc = {};
+	bool info_printed = false;
+
+	int i;
+	for (i = 0; i < result->chunks.len; i++) {
+		struct diff_chunk *c = &result->chunks.head[i];
+		enum chunk_type t = chunk_type(c);
+
+		if (t == CHUNK_MINUS || t == CHUNK_PLUS) {
+			if (chunk_context_empty(&cc)) {
+				/* These are the first lines being printed.
+				 * Note down the start point, any number of subsequent chunks may be joined up to this
+				 * unidiff chunk by context lines or by being directly adjacent. */
+				chunk_context_get(&cc, result, i, context_lines);
+				debug("new chunk to be printed: chunk %d-%d left %d-%d right %d-%d\n",
+				      cc.chunk.start, cc.chunk.end,
+				      cc.left.start, cc.left.end, cc.right.start, cc.right.end);
+			} else {
+				/* There already is a previous chunk noted down for being printed.
+				 * Does it join up with this one? */
+				struct chunk_context next;
+				chunk_context_get(&next, result, i, context_lines);
+				debug("new chunk to be printed: chunk %d-%d left %d-%d right %d-%d\n",
+				      next.chunk.start, next.chunk.end,
+				      next.left.start, next.left.end, next.right.start, next.right.end);
+				if (chunk_contexts_touch(&cc, &next)) {
+					/* This next context touches or overlaps the previous one, join. */
+					chunk_contexts_merge(&cc, &next);
+					debug("new chunk to be printed touches previous chunk, now: left %d-%d right %d-%d\n",
+					      cc.left.start, cc.left.end, cc.right.start, cc.right.end);
+				} else {
+					/* No touching, so the previous context is complete with a gap between it and
+					 * this next one. Print the previous one and start fresh here. */
+					debug("new chunk to be printed does not touch previous chunk; print left %d-%d right %d-%d\n",
+					      cc.left.start, cc.left.end, cc.right.start, cc.right.end);
+					diff_output_unidiff_chunk(dest, &info_printed, info, result, &cc);
+					cc = next;
+					debug("new unprinted chunk is left %d-%d right %d-%d\n",
+					      cc.left.start, cc.left.end, cc.right.start, cc.right.end);
+				}
+			}
+		}
+
+	}
+
+	if (!chunk_context_empty(&cc))
+		diff_output_unidiff_chunk(dest, &info_printed, info, result, &cc);
+	return DIFF_RC_OK;
+}
+
+enum diff_rc diff_unidiff(FILE *dest, const struct diff_config *diff_config,
+			  const struct diff_input_info *info,
+			  const char *left, int left_len, const char *right, int right_len,
+			  unsigned int context_lines)
+{
+	enum diff_rc rc;
+	left_len = left_len < 0 ? strlen(left) : left_len;
+	right_len = right_len < 0 ? strlen(right) : right_len;
+	struct diff_result *result = diff_main(diff_config, left, left_len, right, right_len);
+	rc = diff_output_unidiff(dest, info, result, context_lines);
+	diff_result_free(result);
+	return rc;
+}
blob - /dev/null
blob + 9d24bfe161fabd6c99cfd6b78be90df096a9d5ae (mode 644)
--- /dev/null
+++ diff_patience.c
@@ -0,0 +1,414 @@
+/* Implementation of the Patience Diff algorithm invented by Bram Cohen:
+ * Divide a diff problem into smaller chunks by an LCS of common-unique lines. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <assert.h>
+
+#include "diff_main.h"
+#include "debug.h"
+
+/* Set unique_here = true for all atoms that exist exactly once in this list. */
+static void diff_atoms_mark_unique(struct diff_data *d, unsigned int *unique_count)
+{
+	struct diff_atom *i;
+	unsigned int count = 0;
+	diff_data_foreach_atom(i, d) {
+		i->patience.unique_here = true;
+		i->patience.unique_in_both = true;
+		count++;
+	}
+	diff_data_foreach_atom(i, d) {
+		struct diff_atom *j;
+
+		if (!i->patience.unique_here)
+			continue;
+
+		diff_data_foreach_atom_from(i + 1, j, d) {
+			if (diff_atom_same(i, j)) {
+				if (i->patience.unique_here) {
+					i->patience.unique_here = false;
+					i->patience.unique_in_both = false;
+					count--;
+				}
+				j->patience.unique_here = false;
+				j->patience.unique_in_both = false;
+				count--;
+			}
+		}
+	}
+	if (unique_count)
+		*unique_count = count;
+}
+
+/* Mark those lines as atom->patience.unique_in_both = true that appear exactly once in each side. */
+static void diff_atoms_mark_unique_in_both(struct diff_data *left, struct diff_data *right,
+					   unsigned int *unique_in_both_count)
+{
+	/* Derive the final unique_in_both count without needing an explicit iteration. So this is just some
+	 * optimiziation to save one iteration in the end. */
+	unsigned int unique_in_both;
+
+	diff_atoms_mark_unique(left, &unique_in_both);
+	diff_atoms_mark_unique(right, NULL);
+
+	debug("unique_in_both %u\n", unique_in_both);
+
+	struct diff_atom *i;
+	diff_data_foreach_atom(i, left) {
+		if (!i->patience.unique_here)
+			continue;
+		struct diff_atom *j;
+		int found_in_b = 0;
+		diff_data_foreach_atom(j, right) {
+			if (!diff_atom_same(i, j))
+				continue;
+			if (!j->patience.unique_here) {
+				found_in_b = 2; /* or more */
+				break;
+			} else {
+				found_in_b = 1;
+				j->patience.pos_in_other = i;
+				i->patience.pos_in_other = j;
+			}
+		}
+
+		if (found_in_b == 0 || found_in_b > 1) {
+			i->patience.unique_in_both = false;
+			unique_in_both--;
+			debug("unique_in_both %u  (%d) ", unique_in_both, found_in_b);
+			debug_dump_atom(left, NULL, i);
+		}
+	}
+
+	/* Still need to unmark right[*]->patience.unique_in_both for atoms that don't exist in left */
+	diff_data_foreach_atom(i, right) {
+		if (!i->patience.unique_here
+		    || !i->patience.unique_in_both)
+			continue;
+		struct diff_atom *j;
+		bool found_in_a = false;
+		diff_data_foreach_atom(j, left) {
+			if (!j->patience.unique_in_both)
+				continue;
+			if (!diff_atom_same(i, j))
+				continue;
+			found_in_a = true;
+			break;
+		}
+
+		if (!found_in_a)
+			i->patience.unique_in_both = false;
+	}
+
+	if (unique_in_both_count)
+		*unique_in_both_count = unique_in_both;
+}
+
+static void diff_atoms_swallow_identical_neighbors(struct diff_data *left, struct diff_data *right,
+						   unsigned int *unique_in_both_count)
+{
+	debug("trivially combine identical lines arount unique_in_both lines\n");
+
+	unsigned int l_idx;
+	unsigned int next_l_idx;
+	unsigned int l_min = 0;
+	unsigned int r_min = 0;
+	for (l_idx = 0; l_idx < left->atoms.len; l_idx = next_l_idx) {
+		next_l_idx = l_idx + 1;
+		struct diff_atom *l = &left->atoms.head[l_idx];
+
+		if (!l->patience.unique_in_both)
+			continue;
+
+		debug("check identical lines around ");
+		debug_dump_atom(left, right, l);
+
+		unsigned int r_idx = diff_atom_idx(right, l->patience.pos_in_other);
+
+		struct range identical_l;
+		struct range identical_r;
+
+		/* Swallow upwards.
+		 * Each common-unique line swallows identical lines upwards and downwards.
+		 * All common-unique lines that were part of the identical lines following below were already swallowed
+		 * in the previous iteration, so we will never hit another common-unique line above. */
+		for (identical_l.start = l_idx, identical_r.start = r_idx;
+		     identical_l.start > l_min
+		     && identical_r.start > r_min
+		     && diff_atom_same(&left->atoms.head[identical_l.start - 1],
+				       &right->atoms.head[identical_r.start - 1]);
+		     identical_l.start--, identical_r.start--);
+
+		/* Swallow downwards */
+		for (identical_l.end = l_idx + 1, identical_r.end = r_idx + 1;
+		     identical_l.end < left->atoms.len
+		     && identical_r.end < right->atoms.len
+		     && diff_atom_same(&left->atoms.head[identical_l.end],
+				       &right->atoms.head[identical_r.end]);
+		     identical_l.end++, identical_r.end++,
+		     next_l_idx++) {
+			if (left->atoms.head[identical_l.end].patience.unique_in_both) {
+				/* Part of a chunk of identical lines, remove from listing of unique_in_both lines */
+				left->atoms.head[identical_l.end].patience.unique_in_both = false;
+				right->atoms.head[identical_r.end].patience.unique_in_both = false;
+				(*unique_in_both_count)--;
+			}
+		}
+
+		l->patience.identical_lines = identical_l;
+		l->patience.pos_in_other->patience.identical_lines = identical_r;
+
+		l_min = identical_l.end;
+		r_min = identical_r.end;
+
+		if (!range_empty(&l->patience.identical_lines)) {
+			debug("common-unique line at l=%u r=%u swallowed identical lines l=%u-%u r=%u-%u\n",
+			      l_idx, r_idx,
+			      identical_l.start, identical_l.end,
+			      identical_r.start, identical_r.end);
+		}
+		debug("next_l_idx = %u\n", next_l_idx);
+	}
+}
+
+/* Among the lines that appear exactly once in each side, find the longest streak that appear in both files in the same
+ * order (with other stuff allowed to interleave). Use patience sort for that, as in the Patience Diff algorithm.
+ * See https://bramcohen.livejournal.com/73318.html and, for a much more detailed explanation,
+ * https://blog.jcoglan.com/2017/09/19/the-patience-diff-algorithm/ */
+enum diff_rc diff_algo_patience(const struct diff_algo_config *algo_config, struct diff_state *state)
+{
+	enum diff_rc rc = DIFF_RC_ENOMEM;
+
+	struct diff_data *left = &state->left;
+	struct diff_data *right = &state->right;
+
+	unsigned int unique_in_both_count;
+
+	debug("\n** %s\n", __func__);
+
+	/* Find those lines that appear exactly once in 'left' and exactly once in 'right'. */
+	diff_atoms_mark_unique_in_both(left, right, &unique_in_both_count);
+
+	debug("unique_in_both_count %u\n", unique_in_both_count);
+	debug("left:\n");
+	debug_dump(left);
+	debug("right:\n");
+	debug_dump(right);
+
+	if (!unique_in_both_count) {
+		/* Cannot apply Patience, tell the caller to use fallback_algo instead. */
+		return DIFF_RC_USE_DIFF_ALGO_FALLBACK;
+	}
+
+	diff_atoms_swallow_identical_neighbors(left, right, &unique_in_both_count);
+	debug("After swallowing identical neighbors: unique_in_both = %u\n",
+	      unique_in_both_count);
+
+	/* An array of Longest Common Sequence is the result of the below subscope: */
+	unsigned int lcs_count = 0;
+	struct diff_atom **lcs = NULL;
+	struct diff_atom *lcs_tail = NULL;
+
+	{
+		/* This subscope marks the lifetime of the atom_pointers allocation */
+
+		/* One chunk of storage for atom pointers */
+		struct diff_atom **atom_pointers = recallocarray(NULL, 0, unique_in_both_count * 2, sizeof(struct diff_atom*));
+
+		/* Half for the list of atoms that still need to be put on stacks */
+		struct diff_atom **uniques = atom_pointers;
+
+		/* Half for the patience sort state's "card stacks" -- we remember only each stack's topmost "card" */
+		struct diff_atom **patience_stacks = atom_pointers + unique_in_both_count;
+		unsigned int patience_stacks_count = 0;
+
+		/* Take all common, unique items from 'left' ... */
+
+		struct diff_atom *atom;
+		struct diff_atom **uniques_end = uniques;
+		diff_data_foreach_atom(atom, left) {
+			if (!atom->patience.unique_in_both)
+				continue;
+			*uniques_end = atom;
+			uniques_end++;
+		}
+
+		/* ...and sort them to the order found in 'right'.
+		 * The idea is to find the leftmost stack that has a higher line number and add it to the stack's top.
+		 * If there is no such stack, open a new one on the right. The line number is derived from the atom*,
+		 * which are array items and hence reflect the relative position in the source file. So we got the
+		 * common-uniques from 'left' and sort them according to atom->patience.pos_in_other. */
+		unsigned int i;
+		for (i = 0; i < unique_in_both_count; i++) {
+			atom = uniques[i];
+			unsigned int target_stack;
+
+			if (!patience_stacks_count)
+				target_stack = 0;
+			else {
+				/* binary search to find the stack to put this atom "card" on. */
+				unsigned int lo = 0;
+				unsigned int hi = patience_stacks_count;
+				while (lo < hi) {
+					unsigned int mid = (lo + hi) >> 1;
+					if (patience_stacks[mid]->patience.pos_in_other < atom->patience.pos_in_other)
+						lo = mid + 1;
+					else
+						hi = mid;
+				}
+
+				target_stack = lo;
+			}
+
+			assert(target_stack <= patience_stacks_count);
+			patience_stacks[target_stack] = atom;
+			if (target_stack == patience_stacks_count)
+				patience_stacks_count++;
+
+			/* Record a back reference to the next stack on the left, which will form the final longest sequence
+			 * later. */
+			atom->patience.prev_stack = target_stack ? patience_stacks[target_stack - 1] : NULL;
+
+		}
+
+		/* backtrace through prev_stack references to form the final longest common sequence */
+		lcs_tail = patience_stacks[patience_stacks_count - 1];
+		lcs_count = patience_stacks_count;
+
+		/* uniques and patience_stacks are no longer needed. Backpointers are in atom->patience.prev_stack */
+		free(atom_pointers);
+	}
+
+	lcs = recallocarray(NULL, 0, lcs_count, sizeof(struct diff_atom*));
+	struct diff_atom **lcs_backtrace_pos = &lcs[lcs_count - 1];
+	struct diff_atom *atom;
+	for (atom = lcs_tail; atom; atom = atom->patience.prev_stack, lcs_backtrace_pos--) {
+		assert(lcs_backtrace_pos >= lcs);
+		*lcs_backtrace_pos = atom;
+	}
+
+	unsigned int i;
+	if (DEBUG) {
+		debug("\npatience LCS:\n");
+		for (i = 0; i < lcs_count; i++) {
+			debug_dump_atom(left, right, lcs[i]);
+		}
+	}
+
+
+	/* TODO: For each common-unique line found (now listed in lcs), swallow lines upwards and downwards that are
+	 * identical on each side. Requires a way to represent atoms being glued to adjacent atoms. */
+
+	debug("\ntraverse LCS, possibly recursing:\n");
+
+	/* Now we have pinned positions in both files at which it makes sense to divide the diff problem into smaller
+	 * chunks. Go into the next round: look at each section in turn, trying to again find common-unique lines in
+	 * those smaller sections. As soon as no more are found, the remaining smaller sections are solved by Myers. */
+	unsigned int left_pos = 0;
+	unsigned int right_pos = 0;
+	for (i = 0; i <= lcs_count; i++) {
+		struct diff_atom *atom;
+		struct diff_atom *atom_r;
+		unsigned int left_idx;
+		unsigned int right_idx;
+
+		if (i < lcs_count) {
+			atom = lcs[i];
+			atom_r = atom->patience.pos_in_other;
+			debug("lcs[%u] = left[%u] = right[%u]\n", i,
+			      diff_atom_idx(left, atom), diff_atom_idx(right, atom_r));
+			left_idx = atom->patience.identical_lines.start;
+			right_idx = atom_r->patience.identical_lines.start;
+			debug(" identical lines l %u-%u  r %u-%u\n",
+			      atom->patience.identical_lines.start, atom->patience.identical_lines.end,
+			      atom_r->patience.identical_lines.start, atom_r->patience.identical_lines.end);
+		} else {
+			atom = NULL;
+			atom_r = NULL;
+			left_idx = left->atoms.len;
+			right_idx = right->atoms.len;
+		}
+
+		/* 'atom' now marks an atom that matches on both sides according to patience-diff
+		 * (a common-unique identical atom in both files).
+		 * Handle the section before and the atom itself; the section after will be handled by the next loop
+		 * iteration -- note that i loops to last element + 1 ("i <= lcs_count"), so that there will be another
+		 * final iteration to pick up the last remaining items after the last LCS atom.
+		 * The sections before might also be empty on left and/or right.
+		 * left_pos and right_pos mark the indexes of the first atoms that have not yet been handled in the
+		 * previous loop iteration.
+		 * left_idx and right_idx mark the indexes of the matching atom on left and right, respectively. */
+
+		debug("iteration %u  left_pos %u  left_idx %u  right_pos %u  right_idx %u\n",
+		      i, left_pos, left_idx, right_pos, right_idx);
+
+		struct diff_chunk *chunk;
+
+		/* Section before the matching atom */
+		struct diff_atom *left_atom = &left->atoms.head[left_pos];
+		unsigned int left_section_len = left_idx - left_pos;
+
+		struct diff_atom *right_atom = &(right->atoms.head[right_pos]);
+		unsigned int right_section_len = right_idx - right_pos;
+
+		if (left_section_len && right_section_len) {
+			/* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */
+			if (!diff_state_add_chunk(state, false,
+						  left_atom, left_section_len,
+						  right_atom, right_section_len))
+				goto return_rc;
+		} else if (left_section_len && !right_section_len) {
+			/* Only left atoms and none on the right, they form a "minus" chunk, then. */
+			if (!diff_state_add_chunk(state, true,
+						  left_atom, left_section_len,
+						  right_atom, 0))
+				goto return_rc;
+		} else if (!left_section_len && right_section_len) {
+			/* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
+			if (!diff_state_add_chunk(state, true,
+						  left_atom, 0,
+						  right_atom, right_section_len))
+				goto return_rc;
+		}
+		/* else: left_section_len == 0 and right_section_len == 0, i.e. nothing here. */
+
+		/* The atom found to match on both sides forms a chunk of equals on each side. In the very last
+		 * iteration of this loop, there is no matching atom, we were just cleaning out the remaining lines. */
+		if (atom) {
+			if (!diff_state_add_chunk(state, true,
+						  left->atoms.head + atom->patience.identical_lines.start,
+						  range_len(&atom->patience.identical_lines),
+						  right->atoms.head + atom_r->patience.identical_lines.start,
+						  range_len(&atom_r->patience.identical_lines)))
+				goto return_rc;
+			left_pos = atom->patience.identical_lines.end;
+			right_pos = atom_r->patience.identical_lines.end;
+		} else {
+			left_pos = left_idx + 1;
+			right_pos = right_idx + 1;
+		}
+		debug("end of iteration %u  left_pos %u  left_idx %u  right_pos %u  right_idx %u\n",
+		      i, left_pos, left_idx, right_pos, right_idx);
+	}
+	debug("** END %s\n", __func__);
+
+	rc = DIFF_RC_OK;
+
+return_rc:
+	free(lcs);
+	return rc;
+}