Commit Diff


commit - d362ea2e854f8c315f7f54d125439540e0649383
commit + c6eecea3241f36efced72dc3b7c5b023c89e1c4a
blob - b93701332fb1013c6f09d620e138075dc268925c
blob + 22f3e3fe2e77d9fac70919780ff35f258870d0dc
--- diff/Makefile
+++ diff/Makefile
@@ -16,6 +16,7 @@ SRCS= \
 MAN =	${PROG}.1
 
 CPPFLAGS = -I${.CURDIR}/../include -I${.CURDIR}/../lib
+#CPPFLAGS += -DDIFF_NO_MMAP
 
 .if defined(PROFILE)
 LDADD = -lutil_p -lz_p -lc_p
blob - 8105d6365f0c62b2042433e8267d0359a019823c
blob + 023a35d7013c43a90d6d1d7f6cc55c2b5834c492
--- diff/diff.c
+++ diff/diff.c
@@ -41,7 +41,7 @@ static const char *getprogname()
 
 __dead void	 usage(void);
 int		 diffreg(char *, char *, int);
-char		*mmapfile(const char *, struct stat *);
+int		 openfile(const char *, char **, struct stat *);
 
 __dead void
 usage(void)
@@ -127,6 +127,7 @@ int
 diffreg(char *file1, char *file2, int flags)
 {
 	char *str1, *str2;
+	int fd1, fd2;
 	struct stat st1, st2;
 	struct diff_input_info info = {
 		.left_path = file1,
@@ -138,10 +139,10 @@ diffreg(char *file1, char *file2, int flags)
 	
 	cfg = do_patience ? &diff_config_patience : &diff_config;
 
-	str1 = mmapfile(file1, &st1);
-	str2 = mmapfile(file2, &st2);
+	fd1 = openfile(file1, &str1, &st1);
+	fd2 = openfile(file2, &str2, &st2);
 
-	result = diff_main(cfg, str1, st1.st_size, str2, st2.st_size);
+	result = diff_main(cfg, fd1, str1, st1.st_size, fd2, str2, st2.st_size);
 #if 0
 	rc = diff_output_plain(stdout, &info, result);
 #else
@@ -149,17 +150,20 @@ diffreg(char *file1, char *file2, int flags)
 #endif
 	diff_result_free(result);
 
-	munmap(str1, st1.st_size);
-	munmap(str2, st2.st_size);
+	if (str1)
+		munmap(str1, st1.st_size);
+	if (str2)
+		munmap(str2, st2.st_size);
+	close(fd1);
+	close(fd2);
 
 	return rc;
 }
 
-char *
-mmapfile(const char *path, struct stat *st)
+int
+openfile(const char *path, char **p, struct stat *st)
 {
 	int			 fd;
-	char			*p;
 
 	fd = open(path, O_RDONLY);
 	if (fd == -1)
@@ -168,14 +172,11 @@ mmapfile(const char *path, struct stat *st)
 	if (fstat(fd, st) == -1)
 		err(2, "%s", path);
 
-	if ((uintmax_t)st->st_size > SIZE_MAX)
-		errx(2, "%s: file too big to fit memory", path);
+#ifndef DIFF_NO_MMAP
+	*p = mmap(NULL, st->st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (*p == MAP_FAILED)
+#endif
+		*p = NULL; /* fall back on file I/O */
 
-	p = mmap(NULL, st->st_size, PROT_READ, MAP_PRIVATE, fd, 0);
-	if (p == MAP_FAILED)
-		err(2, "mmap");
-
-	close(fd);
-
-	return p;
+	return fd;
 }
blob - 51927b8761a00249104a282f106a3653fb5b046b
blob + ae582411bdb59d5b3c89163d9a81c5b48fab369d
--- include/diff/diff_main.h
+++ include/diff/diff_main.h
@@ -75,10 +75,15 @@ enum diff_rc {
 	DIFF_RC_EINVAL = EINVAL,
 };
 
+struct diff_data;
+
 struct diff_atom {
-	const uint8_t *at;
-	size_t len;
+	struct diff_data *d; /* back pointer to associated diff data */
 
+	off_t pos;		/* if not memory-mapped */
+	const uint8_t *at;	/* if memory-mapped */
+	off_t len;
+
 	/* This hash is just a very cheap speed up for finding *mismatching*
 	 * atoms. When hashes match, we still need to compare entire atoms to
 	 * find out whether they are indeed identical or not. */
@@ -94,14 +99,6 @@ struct diff_atom {
 		struct diff_range identical_lines;
 	} patience;
 };
-
-static inline bool
-diff_atom_same(const struct diff_atom *left, const struct diff_atom *right)
-{
-	return left->hash == right->hash
-		&& left->len == right->len
-		&& memcmp(left->at, right->at, left->len) == 0;
-}
 
 /* For each file, there is a "root" struct diff_data referencing the entire
  * file, which the atoms are parsed from. In recursion of diff algorithm, there
@@ -111,14 +108,21 @@ diff_atom_same(const struct diff_atom *left, const str
  * "child" structs, atoms_allocated == 0, to indicate that the struct is
  * referencing a subset of atoms. */
 struct diff_data {
-	const uint8_t *data;
-	size_t len;
+	int fd;			/* if root diff_data and not memory-mapped */
+	off_t pos;		/* if not memory-mapped */
+	const uint8_t *data;	/* if memory-mapped */
+	off_t len;
+
 	ARRAYLIST(struct diff_atom) atoms;
 	struct diff_data *root;
 };
 
 void diff_data_free(struct diff_data *diff_data);
 
+/* Indicate whether two given diff atoms match. */
+bool diff_atom_same(const struct diff_atom *left,
+		    const struct diff_atom *right);
+
 /* The atom's index in the entire file. For atoms divided by lines of text, this
  * yields the line number (starting with 0). Also works for diff_data that
  * reference only a subsection of a file, always reflecting the global position
@@ -332,6 +336,8 @@ struct diff_config {
 };
 
 struct diff_result *diff_main(const struct diff_config *config,
-			      const uint8_t *left_data, size_t left_len,
-			      const uint8_t *right_data, size_t right_len);
+			      int left_fd, const uint8_t *left_data,
+			      off_t left_len,
+			      int right_fd, const uint8_t *right_data,
+			      off_t right_len);
 void diff_result_free(struct diff_result *result);
blob - 6bdd58965fe55ca3ffcf79a6ba3e2ca4568ecfed
blob + 3af44d4f53fe470fb58794596298080da09ca302
--- lib/debug.h
+++ lib/debug.h
@@ -20,6 +20,7 @@
 
 #if DEBUG
 #include <stdio.h>
+#include <unistd.h>
 #define print(args...) fprintf(stderr, ##args)
 #define debug print
 #define debug_dump dump
@@ -27,6 +28,18 @@
 #define debug_dump_atoms dump_atoms
 
 static inline void
+print_atom_byte(unsigned char c) {
+	if (c == '\r')
+		print("\\r");
+	else if (c == '\n')
+		print("\\n");
+	else if ((c < 32 || c >= 127) && (c != '\t'))
+		print("\\x%02x", c);
+	else
+		print("%c", c);
+}
+
+static inline void
 dump_atom(const struct diff_data *left, const struct diff_data *right,
 	  const struct diff_atom *atom)
 {
@@ -43,16 +56,29 @@ dump_atom(const struct diff_data *left, const struct d
 	print(" %s%s '",
 	      atom->patience.unique_here ? "u" : " ",
 	      atom->patience.unique_in_both ? "c" : " ");
-	const char *s;
-	for (s = atom->at; s < (const char*)(atom->at + atom->len); s++) {
-		if (*s == '\r')
-			print("\\r");
-		else if (*s == '\n')
-			print("\\n");
-		else if ((*s < 32 || *s >= 127) && (*s != '\t'))
-			print("\\x%02x", *s);
-		else
-			print("%c", *s);
+	if (atom->at == NULL) {
+		unsigned long long total = 0;
+		off_t remain = atom->len;
+		if (lseek(atom->d->root->fd, atom->pos, SEEK_SET) == -1)
+			abort(); /* cannot return error */
+		while (remain > 0) {
+			char buf[16];
+			ssize_t r;
+			int i;
+			r = read(atom->d->root->fd, buf,
+			    MIN(remain, sizeof(buf)));
+			if (r == -1)
+				abort(); /* cannot return error */
+			if (r == 0)
+				break;
+			remain -= r;
+			for (i = 0; i < r; i++)
+				print_atom_byte(buf[i]);
+		}
+	} else {
+		const char *s;
+		for (s = atom->at; s < (const char*)(atom->at + atom->len); s++)
+			print_atom_byte(*s);
 	}
 	print("'\n");
 }
blob - f6b9cbeca28fdc250d60823064f09fad0aba2fe4
blob + 47d5d29a32cf5deb5d6838c66924649c60606e24
--- lib/diff_atomize_text.c
+++ lib/diff_atomize_text.c
@@ -15,11 +15,88 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
+#include <unistd.h>
+#include <stdio.h>
+
 #include <diff/diff_main.h>
+#include "debug.h"
 
 static int
-diff_data_atomize_text_lines(struct diff_data *d)
+diff_data_atomize_text_lines_fd(struct diff_data *d)
 {
+	off_t pos = lseek(d->root->fd, 0, SEEK_SET);
+	const off_t end = pos + d->len;
+
+	unsigned int array_size_estimate = d->len / 50;
+	unsigned int pow2 = 1;
+	while (array_size_estimate >>= 1)
+		pow2++;
+
+	ARRAYLIST_INIT(d->atoms, 1 << pow2);
+
+	while (pos < end) {
+		off_t line_end = pos;
+		unsigned int hash = 0;
+		unsigned char buf[512];
+		ssize_t r, i;
+		struct diff_atom *atom;
+		int eol = 0;
+
+		while (eol == 0 && line_end < end) {
+			r = read(d->root->fd, buf, sizeof(buf));
+			if (r == -1)
+				return errno;
+			i = 0;
+			while (eol == 0 && i < r) {
+				if (buf[i] != '\r' && buf[i] != '\n') {
+					hash = hash * 23 + buf[i];
+					line_end++;
+				} else
+					eol = buf[i];
+				i++;
+			}
+		}
+
+		/* When not at the end of data, the line ending char ('\r' or
+		 * '\n') must follow */
+		if (line_end < end)
+			line_end++;
+		/* If that was an '\r', also pull in any following '\n' */
+		if (line_end < end && eol == '\r') {
+			if (lseek(d->root->fd, line_end, SEEK_SET) == -1)
+				return errno;
+			r = read(d->root->fd, buf, 1);
+			if (r == -1)
+				return errno;
+			if (r == 1 && buf[0] == '\n' )
+				line_end++;
+		}
+
+		/* Record the found line as diff atom */
+		ARRAYLIST_ADD(atom, d->atoms);
+		if (!atom)
+			return DIFF_RC_ENOMEM;
+
+		*atom = (struct diff_atom){
+			.d = d,
+			.pos = pos,
+			.at = NULL,	/* atom data is not memory-mapped */
+			.len = line_end - pos,
+			.hash = hash,
+		};
+
+		/* Starting point for next line: */
+		pos = line_end;
+		if (lseek(d->root->fd, pos, SEEK_SET) == -1)
+			abort();
+	}
+
+	return DIFF_RC_OK;
+}
+
+static int
+diff_data_atomize_text_lines_mmap(struct diff_data *d)
+{
 	const uint8_t *pos = d->data;
 	const uint8_t *end = pos + d->len;
 
@@ -55,6 +132,8 @@ diff_data_atomize_text_lines(struct diff_data *d)
 			return DIFF_RC_ENOMEM;
 
 		*atom = (struct diff_atom){
+			.d = d,
+			.pos = (off_t)(pos - d->data),
 			.at = pos,
 			.len = line_end - pos,
 			.hash = hash,
@@ -67,6 +146,15 @@ diff_data_atomize_text_lines(struct diff_data *d)
 	return DIFF_RC_OK;
 }
 
+static int
+diff_data_atomize_text_lines(struct diff_data *d)
+{
+	if (d->data == NULL)
+		return diff_data_atomize_text_lines_fd(d);
+	else
+		return diff_data_atomize_text_lines_mmap(d);
+}
+
 enum diff_rc
 diff_atomize_text_by_line(void *func_data, struct diff_data *left,
 			  struct diff_data *right)
blob - 541789d3a2848a9f50d3a6ffba47c91a61124717
blob + 3e7d4492a6ef544e6df8c1c3ec43b42fa2f0d3e8
--- lib/diff_main.c
+++ lib/diff_main.c
@@ -23,13 +23,94 @@
 #include <stdio.h>
 #include <string.h>
 #include <limits.h>
+#include <unistd.h>
 
 #include <assert.h>
 
 #include <diff/diff_main.h>
 
 #include "debug.h"
+
+bool
+diff_atom_same(const struct diff_atom *left, const struct diff_atom *right)
+{
+	off_t remain_left, remain_right;
+	bool same = true;
+
+	if (left->hash != right->hash || left->len != right->len)
+		return false;
+
+	if (left->at != NULL && right->at != NULL)
+		return (memcmp(left->at, right->at, left->len) == 0);
+
+	remain_left = left->len;
+	remain_right = right->len;
+	while (remain_left > 0 || remain_right > 0) {
+		const size_t chunksz = 8192;
+		unsigned char buf_left[chunksz], buf_right[chunksz];
+		const uint8_t *p_left, *p_right;
+		off_t n_left, n_right;
+		ssize_t r;
+
+		n_left = MIN(chunksz, remain_left);
+		n_right = MIN(chunksz, remain_right);
+
+		if (left->at == NULL) {
+			if (lseek(left->d->root->fd,
+			    left->pos + (left->len - remain_left),
+			    SEEK_SET) == -1)
+				abort(); /* XXX cannot return error */
+			r = read(left->d->root->fd, buf_left, n_left);
+			if (r == -1)
+				abort(); /* XXX cannot return error */
+			if (r != n_left)
+				abort(); /* XXX cannot return error */
+			p_left = buf_left;
+		} else {
+			p_left = left->at + (left->len - remain_left);
+		}
+
+		if (right->at == NULL) {
+			if (lseek(right->d->root->fd,
+			    right->pos + (right->len - remain_right),
+			    SEEK_SET) == -1)
+				abort(); /* XXX cannot return error */
+			r = read(right->d->root->fd, buf_right, n_right);
+			if (r == -1)
+				abort(); /* XXX cannot return error */
+			if (r != n_right)
+				abort(); /* XXX cannot return error */
+			p_right = buf_right;
+		} else {
+			p_right = right->at + (right->len - remain_right);
+			n_right = MIN(chunksz, remain_right);
+		}
+	
+		if (n_left == 0) {
+			if (n_right != 0)
+				same = false;
+			break;
+		} else if (n_right == 0) {
+			if (n_left != 0)
+				same = false;
+			break;
+		} else if (n_left == n_right) {
+			if (memcmp(p_left, p_right, n_left) != 0) {
+				same = false;
+				break;
+			}
+		} else {
+			same = false;
+			break;
+		}
+
+		remain_left -= n_left;
+		remain_right -= n_right;
+	}
 
+	return same;
+}
+
 /* Even if a left or right side is empty, diff output may need to know the
  * position in that file.
  * So left_start or right_start must never be NULL -- pass left_count or
@@ -68,9 +149,12 @@ diff_state_add_chunk(struct diff_state *state, bool so
 }
 
 void
-diff_data_init_root(struct diff_data *d, const uint8_t *data, size_t len)
+diff_data_init_root(struct diff_data *d, int fd, const uint8_t *data,
+    unsigned long long len)
 {
 	*d = (struct diff_data){
+		.fd = fd,
+		.pos = 0,
 		.data = data,
 		.len = len,
 		.root = d,
@@ -83,8 +167,10 @@ diff_data_init_subsection(struct diff_data *d, struct 
 {
 	struct diff_atom *last_atom = from_atom + atoms_count - 1;
 	*d = (struct diff_data){
+		.fd = -1,
+		.pos = from_atom->pos,
 		.data = from_atom->at,
-		.len = (last_atom->at + last_atom->len) - from_atom->at,
+		.len = (last_atom->pos + last_atom->len) - from_atom->pos,
 		.root = parent->root,
 		.atoms.head = from_atom,
 		.atoms.len = atoms_count,
@@ -224,16 +310,16 @@ return_rc:
 
 struct diff_result *
 diff_main(const struct diff_config *config,
-	  const uint8_t *left_data, size_t left_len,
-	  const uint8_t *right_data, size_t right_len)
+	  int left_fd, const uint8_t *left_data, off_t left_len,
+	  int right_fd, const uint8_t *right_data, off_t right_len)
 {
 	struct diff_result *result = malloc(sizeof(struct diff_result));
 	if (!result)
 		return NULL;
 
 	*result = (struct diff_result){};
-	diff_data_init_root(&result->left, left_data, left_len);
-	diff_data_init_root(&result->right, right_data, right_len);
+	diff_data_init_root(&result->left, left_fd, left_data, left_len);
+	diff_data_init_root(&result->right, right_fd, right_data, right_len);
 
 	if (!config->atomize_func) {
 		result->rc = DIFF_RC_EINVAL;
blob - 8d2e931317d282485fa79c4c3fabaf475ea31f6e
blob + 07799698e67e76ec7421983f7bee626ae2aeb3c3
--- lib/diff_output.c
+++ lib/diff_output.c
@@ -15,8 +15,33 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
+#include <unistd.h>
+
 #include <diff/diff_output.h>
 
+static char
+get_atom_byte(struct diff_atom *atom, off_t off)
+{
+	char ch;
+	off_t cur;
+
+	if (atom->at != NULL)
+		return atom->at[off];
+
+	cur = lseek(atom->d->root->fd, 0, SEEK_CUR);
+	if (cur == -1)
+		abort(); /* XXX cannot return error */
+
+	if (cur != atom->pos + off &&
+	    lseek(atom->d->root->fd, atom->pos + off, SEEK_SET) == -1)
+		abort(); /* XXX cannot return error */
+
+	if (read(atom->d->root->fd, &ch, sizeof(ch)) == -1)
+		abort(); /* XXX cannot return error */
+
+	return ch;
+}
+
 void
 diff_output_lines(FILE *dest, const char *prefix, struct diff_atom *start_atom,
 		  unsigned int count)
@@ -26,14 +51,20 @@ diff_output_lines(FILE *dest, const char *prefix, stru
 		fprintf(dest, "%s", prefix);
 		int i;
 		unsigned int len = atom->len;
-		if (len && atom->at[len - 1] == '\n') {
-			len--;
-			if (len && atom->at[len - 1] == '\r')
+		if (len) {
+			char ch;
+			ch = get_atom_byte(atom, len - 1);
+			if (ch == '\n')
 				len--;
+			if (len) {
+				ch = get_atom_byte(atom, len - 1);
+				if (ch == '\r')
+					len--;
+			}
 		}
 
 		for (i = 0; i < len; i++) {
-			char c = atom->at[i];
+			char c = get_atom_byte(atom, i);
 			if ((c < 0x20 || c >= 0x7f) && c != '\t')
 				fprintf(dest, "\\x%02x", (unsigned char)c);
 			else