commit - d362ea2e854f8c315f7f54d125439540e0649383
commit + c6eecea3241f36efced72dc3b7c5b023c89e1c4a
blob - b93701332fb1013c6f09d620e138075dc268925c
blob + 22f3e3fe2e77d9fac70919780ff35f258870d0dc
--- diff/Makefile
+++ diff/Makefile
MAN = ${PROG}.1
CPPFLAGS = -I${.CURDIR}/../include -I${.CURDIR}/../lib
+#CPPFLAGS += -DDIFF_NO_MMAP
.if defined(PROFILE)
LDADD = -lutil_p -lz_p -lc_p
blob - 8105d6365f0c62b2042433e8267d0359a019823c
blob + 023a35d7013c43a90d6d1d7f6cc55c2b5834c492
--- diff/diff.c
+++ diff/diff.c
__dead void usage(void);
int diffreg(char *, char *, int);
-char *mmapfile(const char *, struct stat *);
+int openfile(const char *, char **, struct stat *);
__dead void
usage(void)
diffreg(char *file1, char *file2, int flags)
{
char *str1, *str2;
+ int fd1, fd2;
struct stat st1, st2;
struct diff_input_info info = {
.left_path = file1,
cfg = do_patience ? &diff_config_patience : &diff_config;
- str1 = mmapfile(file1, &st1);
- str2 = mmapfile(file2, &st2);
+ fd1 = openfile(file1, &str1, &st1);
+ fd2 = openfile(file2, &str2, &st2);
- result = diff_main(cfg, str1, st1.st_size, str2, st2.st_size);
+ result = diff_main(cfg, fd1, str1, st1.st_size, fd2, str2, st2.st_size);
#if 0
rc = diff_output_plain(stdout, &info, result);
#else
#endif
diff_result_free(result);
- munmap(str1, st1.st_size);
- munmap(str2, st2.st_size);
+ if (str1)
+ munmap(str1, st1.st_size);
+ if (str2)
+ munmap(str2, st2.st_size);
+ close(fd1);
+ close(fd2);
return rc;
}
-char *
-mmapfile(const char *path, struct stat *st)
+int
+openfile(const char *path, char **p, struct stat *st)
{
int fd;
- char *p;
fd = open(path, O_RDONLY);
if (fd == -1)
if (fstat(fd, st) == -1)
err(2, "%s", path);
- if ((uintmax_t)st->st_size > SIZE_MAX)
- errx(2, "%s: file too big to fit memory", path);
+#ifndef DIFF_NO_MMAP
+ *p = mmap(NULL, st->st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+ if (*p == MAP_FAILED)
+#endif
+ *p = NULL; /* fall back on file I/O */
- p = mmap(NULL, st->st_size, PROT_READ, MAP_PRIVATE, fd, 0);
- if (p == MAP_FAILED)
- err(2, "mmap");
-
- close(fd);
-
- return p;
+ return fd;
}
blob - 51927b8761a00249104a282f106a3653fb5b046b
blob + ae582411bdb59d5b3c89163d9a81c5b48fab369d
--- include/diff/diff_main.h
+++ include/diff/diff_main.h
DIFF_RC_EINVAL = EINVAL,
};
+struct diff_data;
+
struct diff_atom {
- const uint8_t *at;
- size_t len;
+ struct diff_data *d; /* back pointer to associated diff data */
+ off_t pos; /* if not memory-mapped */
+ const uint8_t *at; /* if memory-mapped */
+ off_t len;
+
/* This hash is just a very cheap speed up for finding *mismatching*
* atoms. When hashes match, we still need to compare entire atoms to
* find out whether they are indeed identical or not. */
struct diff_range identical_lines;
} patience;
};
-
-static inline bool
-diff_atom_same(const struct diff_atom *left, const struct diff_atom *right)
-{
- return left->hash == right->hash
- && left->len == right->len
- && memcmp(left->at, right->at, left->len) == 0;
-}
/* For each file, there is a "root" struct diff_data referencing the entire
* file, which the atoms are parsed from. In recursion of diff algorithm, there
* "child" structs, atoms_allocated == 0, to indicate that the struct is
* referencing a subset of atoms. */
struct diff_data {
- const uint8_t *data;
- size_t len;
+ int fd; /* if root diff_data and not memory-mapped */
+ off_t pos; /* if not memory-mapped */
+ const uint8_t *data; /* if memory-mapped */
+ off_t len;
+
ARRAYLIST(struct diff_atom) atoms;
struct diff_data *root;
};
void diff_data_free(struct diff_data *diff_data);
+/* Indicate whether two given diff atoms match. */
+bool diff_atom_same(const struct diff_atom *left,
+ const struct diff_atom *right);
+
/* The atom's index in the entire file. For atoms divided by lines of text, this
* yields the line number (starting with 0). Also works for diff_data that
* reference only a subsection of a file, always reflecting the global position
};
struct diff_result *diff_main(const struct diff_config *config,
- const uint8_t *left_data, size_t left_len,
- const uint8_t *right_data, size_t right_len);
+ int left_fd, const uint8_t *left_data,
+ off_t left_len,
+ int right_fd, const uint8_t *right_data,
+ off_t right_len);
void diff_result_free(struct diff_result *result);
blob - 6bdd58965fe55ca3ffcf79a6ba3e2ca4568ecfed
blob + 3af44d4f53fe470fb58794596298080da09ca302
--- lib/debug.h
+++ lib/debug.h
#if DEBUG
#include <stdio.h>
+#include <unistd.h>
#define print(args...) fprintf(stderr, ##args)
#define debug print
#define debug_dump dump
#define debug_dump_atoms dump_atoms
static inline void
+print_atom_byte(unsigned char c) {
+ if (c == '\r')
+ print("\\r");
+ else if (c == '\n')
+ print("\\n");
+ else if ((c < 32 || c >= 127) && (c != '\t'))
+ print("\\x%02x", c);
+ else
+ print("%c", c);
+}
+
+static inline void
dump_atom(const struct diff_data *left, const struct diff_data *right,
const struct diff_atom *atom)
{
print(" %s%s '",
atom->patience.unique_here ? "u" : " ",
atom->patience.unique_in_both ? "c" : " ");
- const char *s;
- for (s = atom->at; s < (const char*)(atom->at + atom->len); s++) {
- if (*s == '\r')
- print("\\r");
- else if (*s == '\n')
- print("\\n");
- else if ((*s < 32 || *s >= 127) && (*s != '\t'))
- print("\\x%02x", *s);
- else
- print("%c", *s);
+ if (atom->at == NULL) {
+ unsigned long long total = 0;
+ off_t remain = atom->len;
+ if (lseek(atom->d->root->fd, atom->pos, SEEK_SET) == -1)
+ abort(); /* cannot return error */
+ while (remain > 0) {
+ char buf[16];
+ ssize_t r;
+ int i;
+ r = read(atom->d->root->fd, buf,
+ MIN(remain, sizeof(buf)));
+ if (r == -1)
+ abort(); /* cannot return error */
+ if (r == 0)
+ break;
+ remain -= r;
+ for (i = 0; i < r; i++)
+ print_atom_byte(buf[i]);
+ }
+ } else {
+ const char *s;
+ for (s = atom->at; s < (const char*)(atom->at + atom->len); s++)
+ print_atom_byte(*s);
}
print("'\n");
}
blob - f6b9cbeca28fdc250d60823064f09fad0aba2fe4
blob + 47d5d29a32cf5deb5d6838c66924649c60606e24
--- lib/diff_atomize_text.c
+++ lib/diff_atomize_text.c
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
+#include <unistd.h>
+#include <stdio.h>
+
#include <diff/diff_main.h>
+#include "debug.h"
static int
-diff_data_atomize_text_lines(struct diff_data *d)
+diff_data_atomize_text_lines_fd(struct diff_data *d)
{
+ off_t pos = lseek(d->root->fd, 0, SEEK_SET);
+ const off_t end = pos + d->len;
+
+ unsigned int array_size_estimate = d->len / 50;
+ unsigned int pow2 = 1;
+ while (array_size_estimate >>= 1)
+ pow2++;
+
+ ARRAYLIST_INIT(d->atoms, 1 << pow2);
+
+ while (pos < end) {
+ off_t line_end = pos;
+ unsigned int hash = 0;
+ unsigned char buf[512];
+ ssize_t r, i;
+ struct diff_atom *atom;
+ int eol = 0;
+
+ while (eol == 0 && line_end < end) {
+ r = read(d->root->fd, buf, sizeof(buf));
+ if (r == -1)
+ return errno;
+ i = 0;
+ while (eol == 0 && i < r) {
+ if (buf[i] != '\r' && buf[i] != '\n') {
+ hash = hash * 23 + buf[i];
+ line_end++;
+ } else
+ eol = buf[i];
+ i++;
+ }
+ }
+
+ /* When not at the end of data, the line ending char ('\r' or
+ * '\n') must follow */
+ if (line_end < end)
+ line_end++;
+ /* If that was an '\r', also pull in any following '\n' */
+ if (line_end < end && eol == '\r') {
+ if (lseek(d->root->fd, line_end, SEEK_SET) == -1)
+ return errno;
+ r = read(d->root->fd, buf, 1);
+ if (r == -1)
+ return errno;
+ if (r == 1 && buf[0] == '\n' )
+ line_end++;
+ }
+
+ /* Record the found line as diff atom */
+ ARRAYLIST_ADD(atom, d->atoms);
+ if (!atom)
+ return DIFF_RC_ENOMEM;
+
+ *atom = (struct diff_atom){
+ .d = d,
+ .pos = pos,
+ .at = NULL, /* atom data is not memory-mapped */
+ .len = line_end - pos,
+ .hash = hash,
+ };
+
+ /* Starting point for next line: */
+ pos = line_end;
+ if (lseek(d->root->fd, pos, SEEK_SET) == -1)
+ abort();
+ }
+
+ return DIFF_RC_OK;
+}
+
+static int
+diff_data_atomize_text_lines_mmap(struct diff_data *d)
+{
const uint8_t *pos = d->data;
const uint8_t *end = pos + d->len;
return DIFF_RC_ENOMEM;
*atom = (struct diff_atom){
+ .d = d,
+ .pos = (off_t)(pos - d->data),
.at = pos,
.len = line_end - pos,
.hash = hash,
return DIFF_RC_OK;
}
+static int
+diff_data_atomize_text_lines(struct diff_data *d)
+{
+ if (d->data == NULL)
+ return diff_data_atomize_text_lines_fd(d);
+ else
+ return diff_data_atomize_text_lines_mmap(d);
+}
+
enum diff_rc
diff_atomize_text_by_line(void *func_data, struct diff_data *left,
struct diff_data *right)
blob - 541789d3a2848a9f50d3a6ffba47c91a61124717
blob + 3e7d4492a6ef544e6df8c1c3ec43b42fa2f0d3e8
--- lib/diff_main.c
+++ lib/diff_main.c
#include <stdio.h>
#include <string.h>
#include <limits.h>
+#include <unistd.h>
#include <assert.h>
#include <diff/diff_main.h>
#include "debug.h"
+
+bool
+diff_atom_same(const struct diff_atom *left, const struct diff_atom *right)
+{
+ off_t remain_left, remain_right;
+ bool same = true;
+
+ if (left->hash != right->hash || left->len != right->len)
+ return false;
+
+ if (left->at != NULL && right->at != NULL)
+ return (memcmp(left->at, right->at, left->len) == 0);
+
+ remain_left = left->len;
+ remain_right = right->len;
+ while (remain_left > 0 || remain_right > 0) {
+ const size_t chunksz = 8192;
+ unsigned char buf_left[chunksz], buf_right[chunksz];
+ const uint8_t *p_left, *p_right;
+ off_t n_left, n_right;
+ ssize_t r;
+
+ n_left = MIN(chunksz, remain_left);
+ n_right = MIN(chunksz, remain_right);
+
+ if (left->at == NULL) {
+ if (lseek(left->d->root->fd,
+ left->pos + (left->len - remain_left),
+ SEEK_SET) == -1)
+ abort(); /* XXX cannot return error */
+ r = read(left->d->root->fd, buf_left, n_left);
+ if (r == -1)
+ abort(); /* XXX cannot return error */
+ if (r != n_left)
+ abort(); /* XXX cannot return error */
+ p_left = buf_left;
+ } else {
+ p_left = left->at + (left->len - remain_left);
+ }
+
+ if (right->at == NULL) {
+ if (lseek(right->d->root->fd,
+ right->pos + (right->len - remain_right),
+ SEEK_SET) == -1)
+ abort(); /* XXX cannot return error */
+ r = read(right->d->root->fd, buf_right, n_right);
+ if (r == -1)
+ abort(); /* XXX cannot return error */
+ if (r != n_right)
+ abort(); /* XXX cannot return error */
+ p_right = buf_right;
+ } else {
+ p_right = right->at + (right->len - remain_right);
+ n_right = MIN(chunksz, remain_right);
+ }
+
+ if (n_left == 0) {
+ if (n_right != 0)
+ same = false;
+ break;
+ } else if (n_right == 0) {
+ if (n_left != 0)
+ same = false;
+ break;
+ } else if (n_left == n_right) {
+ if (memcmp(p_left, p_right, n_left) != 0) {
+ same = false;
+ break;
+ }
+ } else {
+ same = false;
+ break;
+ }
+
+ remain_left -= n_left;
+ remain_right -= n_right;
+ }
+ return same;
+}
+
/* Even if a left or right side is empty, diff output may need to know the
* position in that file.
* So left_start or right_start must never be NULL -- pass left_count or
}
void
-diff_data_init_root(struct diff_data *d, const uint8_t *data, size_t len)
+diff_data_init_root(struct diff_data *d, int fd, const uint8_t *data,
+ unsigned long long len)
{
*d = (struct diff_data){
+ .fd = fd,
+ .pos = 0,
.data = data,
.len = len,
.root = d,
{
struct diff_atom *last_atom = from_atom + atoms_count - 1;
*d = (struct diff_data){
+ .fd = -1,
+ .pos = from_atom->pos,
.data = from_atom->at,
- .len = (last_atom->at + last_atom->len) - from_atom->at,
+ .len = (last_atom->pos + last_atom->len) - from_atom->pos,
.root = parent->root,
.atoms.head = from_atom,
.atoms.len = atoms_count,
struct diff_result *
diff_main(const struct diff_config *config,
- const uint8_t *left_data, size_t left_len,
- const uint8_t *right_data, size_t right_len)
+ int left_fd, const uint8_t *left_data, off_t left_len,
+ int right_fd, const uint8_t *right_data, off_t right_len)
{
struct diff_result *result = malloc(sizeof(struct diff_result));
if (!result)
return NULL;
*result = (struct diff_result){};
- diff_data_init_root(&result->left, left_data, left_len);
- diff_data_init_root(&result->right, right_data, right_len);
+ diff_data_init_root(&result->left, left_fd, left_data, left_len);
+ diff_data_init_root(&result->right, right_fd, right_data, right_len);
if (!config->atomize_func) {
result->rc = DIFF_RC_EINVAL;
blob - 8d2e931317d282485fa79c4c3fabaf475ea31f6e
blob + 07799698e67e76ec7421983f7bee626ae2aeb3c3
--- lib/diff_output.c
+++ lib/diff_output.c
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
+#include <unistd.h>
+
#include <diff/diff_output.h>
+static char
+get_atom_byte(struct diff_atom *atom, off_t off)
+{
+ char ch;
+ off_t cur;
+
+ if (atom->at != NULL)
+ return atom->at[off];
+
+ cur = lseek(atom->d->root->fd, 0, SEEK_CUR);
+ if (cur == -1)
+ abort(); /* XXX cannot return error */
+
+ if (cur != atom->pos + off &&
+ lseek(atom->d->root->fd, atom->pos + off, SEEK_SET) == -1)
+ abort(); /* XXX cannot return error */
+
+ if (read(atom->d->root->fd, &ch, sizeof(ch)) == -1)
+ abort(); /* XXX cannot return error */
+
+ return ch;
+}
+
void
diff_output_lines(FILE *dest, const char *prefix, struct diff_atom *start_atom,
unsigned int count)
fprintf(dest, "%s", prefix);
int i;
unsigned int len = atom->len;
- if (len && atom->at[len - 1] == '\n') {
- len--;
- if (len && atom->at[len - 1] == '\r')
+ if (len) {
+ char ch;
+ ch = get_atom_byte(atom, len - 1);
+ if (ch == '\n')
len--;
+ if (len) {
+ ch = get_atom_byte(atom, len - 1);
+ if (ch == '\r')
+ len--;
+ }
}
for (i = 0; i < len; i++) {
- char c = atom->at[i];
+ char c = get_atom_byte(atom, i);
if ((c < 0x20 || c >= 0x7f) && c != '\t')
fprintf(dest, "\\x%02x", (unsigned char)c);
else