commit c6eecea3241f36efced72dc3b7c5b023c89e1c4a from: Stefan Sperling date: Sun Jul 26 19:20:53 2020 UTC fall back on file i/o in case an input file cannot be memory-mapped commit - d362ea2e854f8c315f7f54d125439540e0649383 commit + c6eecea3241f36efced72dc3b7c5b023c89e1c4a blob - b93701332fb1013c6f09d620e138075dc268925c blob + 22f3e3fe2e77d9fac70919780ff35f258870d0dc --- diff/Makefile +++ diff/Makefile @@ -16,6 +16,7 @@ SRCS= \ MAN = ${PROG}.1 CPPFLAGS = -I${.CURDIR}/../include -I${.CURDIR}/../lib +#CPPFLAGS += -DDIFF_NO_MMAP .if defined(PROFILE) LDADD = -lutil_p -lz_p -lc_p blob - 8105d6365f0c62b2042433e8267d0359a019823c blob + 023a35d7013c43a90d6d1d7f6cc55c2b5834c492 --- diff/diff.c +++ diff/diff.c @@ -41,7 +41,7 @@ static const char *getprogname() __dead void usage(void); int diffreg(char *, char *, int); -char *mmapfile(const char *, struct stat *); +int openfile(const char *, char **, struct stat *); __dead void usage(void) @@ -127,6 +127,7 @@ int diffreg(char *file1, char *file2, int flags) { char *str1, *str2; + int fd1, fd2; struct stat st1, st2; struct diff_input_info info = { .left_path = file1, @@ -138,10 +139,10 @@ diffreg(char *file1, char *file2, int flags) cfg = do_patience ? &diff_config_patience : &diff_config; - str1 = mmapfile(file1, &st1); - str2 = mmapfile(file2, &st2); + fd1 = openfile(file1, &str1, &st1); + fd2 = openfile(file2, &str2, &st2); - result = diff_main(cfg, str1, st1.st_size, str2, st2.st_size); + result = diff_main(cfg, fd1, str1, st1.st_size, fd2, str2, st2.st_size); #if 0 rc = diff_output_plain(stdout, &info, result); #else @@ -149,17 +150,20 @@ diffreg(char *file1, char *file2, int flags) #endif diff_result_free(result); - munmap(str1, st1.st_size); - munmap(str2, st2.st_size); + if (str1) + munmap(str1, st1.st_size); + if (str2) + munmap(str2, st2.st_size); + close(fd1); + close(fd2); return rc; } -char * -mmapfile(const char *path, struct stat *st) +int +openfile(const char *path, char **p, struct stat *st) { int fd; - char *p; fd = open(path, O_RDONLY); if (fd == -1) @@ -168,14 +172,11 @@ mmapfile(const char *path, struct stat *st) if (fstat(fd, st) == -1) err(2, "%s", path); - if ((uintmax_t)st->st_size > SIZE_MAX) - errx(2, "%s: file too big to fit memory", path); +#ifndef DIFF_NO_MMAP + *p = mmap(NULL, st->st_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (*p == MAP_FAILED) +#endif + *p = NULL; /* fall back on file I/O */ - p = mmap(NULL, st->st_size, PROT_READ, MAP_PRIVATE, fd, 0); - if (p == MAP_FAILED) - err(2, "mmap"); - - close(fd); - - return p; + return fd; } blob - 51927b8761a00249104a282f106a3653fb5b046b blob + ae582411bdb59d5b3c89163d9a81c5b48fab369d --- include/diff/diff_main.h +++ include/diff/diff_main.h @@ -75,10 +75,15 @@ enum diff_rc { DIFF_RC_EINVAL = EINVAL, }; +struct diff_data; + struct diff_atom { - const uint8_t *at; - size_t len; + struct diff_data *d; /* back pointer to associated diff data */ + off_t pos; /* if not memory-mapped */ + const uint8_t *at; /* if memory-mapped */ + off_t len; + /* This hash is just a very cheap speed up for finding *mismatching* * atoms. When hashes match, we still need to compare entire atoms to * find out whether they are indeed identical or not. */ @@ -94,14 +99,6 @@ struct diff_atom { struct diff_range identical_lines; } patience; }; - -static inline bool -diff_atom_same(const struct diff_atom *left, const struct diff_atom *right) -{ - return left->hash == right->hash - && left->len == right->len - && memcmp(left->at, right->at, left->len) == 0; -} /* For each file, there is a "root" struct diff_data referencing the entire * file, which the atoms are parsed from. In recursion of diff algorithm, there @@ -111,14 +108,21 @@ diff_atom_same(const struct diff_atom *left, const str * "child" structs, atoms_allocated == 0, to indicate that the struct is * referencing a subset of atoms. */ struct diff_data { - const uint8_t *data; - size_t len; + int fd; /* if root diff_data and not memory-mapped */ + off_t pos; /* if not memory-mapped */ + const uint8_t *data; /* if memory-mapped */ + off_t len; + ARRAYLIST(struct diff_atom) atoms; struct diff_data *root; }; void diff_data_free(struct diff_data *diff_data); +/* Indicate whether two given diff atoms match. */ +bool diff_atom_same(const struct diff_atom *left, + const struct diff_atom *right); + /* The atom's index in the entire file. For atoms divided by lines of text, this * yields the line number (starting with 0). Also works for diff_data that * reference only a subsection of a file, always reflecting the global position @@ -332,6 +336,8 @@ struct diff_config { }; struct diff_result *diff_main(const struct diff_config *config, - const uint8_t *left_data, size_t left_len, - const uint8_t *right_data, size_t right_len); + int left_fd, const uint8_t *left_data, + off_t left_len, + int right_fd, const uint8_t *right_data, + off_t right_len); void diff_result_free(struct diff_result *result); blob - 6bdd58965fe55ca3ffcf79a6ba3e2ca4568ecfed blob + 3af44d4f53fe470fb58794596298080da09ca302 --- lib/debug.h +++ lib/debug.h @@ -20,6 +20,7 @@ #if DEBUG #include +#include #define print(args...) fprintf(stderr, ##args) #define debug print #define debug_dump dump @@ -27,6 +28,18 @@ #define debug_dump_atoms dump_atoms static inline void +print_atom_byte(unsigned char c) { + if (c == '\r') + print("\\r"); + else if (c == '\n') + print("\\n"); + else if ((c < 32 || c >= 127) && (c != '\t')) + print("\\x%02x", c); + else + print("%c", c); +} + +static inline void dump_atom(const struct diff_data *left, const struct diff_data *right, const struct diff_atom *atom) { @@ -43,16 +56,29 @@ dump_atom(const struct diff_data *left, const struct d print(" %s%s '", atom->patience.unique_here ? "u" : " ", atom->patience.unique_in_both ? "c" : " "); - const char *s; - for (s = atom->at; s < (const char*)(atom->at + atom->len); s++) { - if (*s == '\r') - print("\\r"); - else if (*s == '\n') - print("\\n"); - else if ((*s < 32 || *s >= 127) && (*s != '\t')) - print("\\x%02x", *s); - else - print("%c", *s); + if (atom->at == NULL) { + unsigned long long total = 0; + off_t remain = atom->len; + if (lseek(atom->d->root->fd, atom->pos, SEEK_SET) == -1) + abort(); /* cannot return error */ + while (remain > 0) { + char buf[16]; + ssize_t r; + int i; + r = read(atom->d->root->fd, buf, + MIN(remain, sizeof(buf))); + if (r == -1) + abort(); /* cannot return error */ + if (r == 0) + break; + remain -= r; + for (i = 0; i < r; i++) + print_atom_byte(buf[i]); + } + } else { + const char *s; + for (s = atom->at; s < (const char*)(atom->at + atom->len); s++) + print_atom_byte(*s); } print("'\n"); } blob - f6b9cbeca28fdc250d60823064f09fad0aba2fe4 blob + 47d5d29a32cf5deb5d6838c66924649c60606e24 --- lib/diff_atomize_text.c +++ lib/diff_atomize_text.c @@ -15,11 +15,88 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#include +#include + #include +#include "debug.h" static int -diff_data_atomize_text_lines(struct diff_data *d) +diff_data_atomize_text_lines_fd(struct diff_data *d) { + off_t pos = lseek(d->root->fd, 0, SEEK_SET); + const off_t end = pos + d->len; + + unsigned int array_size_estimate = d->len / 50; + unsigned int pow2 = 1; + while (array_size_estimate >>= 1) + pow2++; + + ARRAYLIST_INIT(d->atoms, 1 << pow2); + + while (pos < end) { + off_t line_end = pos; + unsigned int hash = 0; + unsigned char buf[512]; + ssize_t r, i; + struct diff_atom *atom; + int eol = 0; + + while (eol == 0 && line_end < end) { + r = read(d->root->fd, buf, sizeof(buf)); + if (r == -1) + return errno; + i = 0; + while (eol == 0 && i < r) { + if (buf[i] != '\r' && buf[i] != '\n') { + hash = hash * 23 + buf[i]; + line_end++; + } else + eol = buf[i]; + i++; + } + } + + /* When not at the end of data, the line ending char ('\r' or + * '\n') must follow */ + if (line_end < end) + line_end++; + /* If that was an '\r', also pull in any following '\n' */ + if (line_end < end && eol == '\r') { + if (lseek(d->root->fd, line_end, SEEK_SET) == -1) + return errno; + r = read(d->root->fd, buf, 1); + if (r == -1) + return errno; + if (r == 1 && buf[0] == '\n' ) + line_end++; + } + + /* Record the found line as diff atom */ + ARRAYLIST_ADD(atom, d->atoms); + if (!atom) + return DIFF_RC_ENOMEM; + + *atom = (struct diff_atom){ + .d = d, + .pos = pos, + .at = NULL, /* atom data is not memory-mapped */ + .len = line_end - pos, + .hash = hash, + }; + + /* Starting point for next line: */ + pos = line_end; + if (lseek(d->root->fd, pos, SEEK_SET) == -1) + abort(); + } + + return DIFF_RC_OK; +} + +static int +diff_data_atomize_text_lines_mmap(struct diff_data *d) +{ const uint8_t *pos = d->data; const uint8_t *end = pos + d->len; @@ -55,6 +132,8 @@ diff_data_atomize_text_lines(struct diff_data *d) return DIFF_RC_ENOMEM; *atom = (struct diff_atom){ + .d = d, + .pos = (off_t)(pos - d->data), .at = pos, .len = line_end - pos, .hash = hash, @@ -67,6 +146,15 @@ diff_data_atomize_text_lines(struct diff_data *d) return DIFF_RC_OK; } +static int +diff_data_atomize_text_lines(struct diff_data *d) +{ + if (d->data == NULL) + return diff_data_atomize_text_lines_fd(d); + else + return diff_data_atomize_text_lines_mmap(d); +} + enum diff_rc diff_atomize_text_by_line(void *func_data, struct diff_data *left, struct diff_data *right) blob - 541789d3a2848a9f50d3a6ffba47c91a61124717 blob + 3e7d4492a6ef544e6df8c1c3ec43b42fa2f0d3e8 --- lib/diff_main.c +++ lib/diff_main.c @@ -23,13 +23,94 @@ #include #include #include +#include #include #include #include "debug.h" + +bool +diff_atom_same(const struct diff_atom *left, const struct diff_atom *right) +{ + off_t remain_left, remain_right; + bool same = true; + + if (left->hash != right->hash || left->len != right->len) + return false; + + if (left->at != NULL && right->at != NULL) + return (memcmp(left->at, right->at, left->len) == 0); + + remain_left = left->len; + remain_right = right->len; + while (remain_left > 0 || remain_right > 0) { + const size_t chunksz = 8192; + unsigned char buf_left[chunksz], buf_right[chunksz]; + const uint8_t *p_left, *p_right; + off_t n_left, n_right; + ssize_t r; + + n_left = MIN(chunksz, remain_left); + n_right = MIN(chunksz, remain_right); + + if (left->at == NULL) { + if (lseek(left->d->root->fd, + left->pos + (left->len - remain_left), + SEEK_SET) == -1) + abort(); /* XXX cannot return error */ + r = read(left->d->root->fd, buf_left, n_left); + if (r == -1) + abort(); /* XXX cannot return error */ + if (r != n_left) + abort(); /* XXX cannot return error */ + p_left = buf_left; + } else { + p_left = left->at + (left->len - remain_left); + } + + if (right->at == NULL) { + if (lseek(right->d->root->fd, + right->pos + (right->len - remain_right), + SEEK_SET) == -1) + abort(); /* XXX cannot return error */ + r = read(right->d->root->fd, buf_right, n_right); + if (r == -1) + abort(); /* XXX cannot return error */ + if (r != n_right) + abort(); /* XXX cannot return error */ + p_right = buf_right; + } else { + p_right = right->at + (right->len - remain_right); + n_right = MIN(chunksz, remain_right); + } + + if (n_left == 0) { + if (n_right != 0) + same = false; + break; + } else if (n_right == 0) { + if (n_left != 0) + same = false; + break; + } else if (n_left == n_right) { + if (memcmp(p_left, p_right, n_left) != 0) { + same = false; + break; + } + } else { + same = false; + break; + } + + remain_left -= n_left; + remain_right -= n_right; + } + return same; +} + /* Even if a left or right side is empty, diff output may need to know the * position in that file. * So left_start or right_start must never be NULL -- pass left_count or @@ -68,9 +149,12 @@ diff_state_add_chunk(struct diff_state *state, bool so } void -diff_data_init_root(struct diff_data *d, const uint8_t *data, size_t len) +diff_data_init_root(struct diff_data *d, int fd, const uint8_t *data, + unsigned long long len) { *d = (struct diff_data){ + .fd = fd, + .pos = 0, .data = data, .len = len, .root = d, @@ -83,8 +167,10 @@ diff_data_init_subsection(struct diff_data *d, struct { struct diff_atom *last_atom = from_atom + atoms_count - 1; *d = (struct diff_data){ + .fd = -1, + .pos = from_atom->pos, .data = from_atom->at, - .len = (last_atom->at + last_atom->len) - from_atom->at, + .len = (last_atom->pos + last_atom->len) - from_atom->pos, .root = parent->root, .atoms.head = from_atom, .atoms.len = atoms_count, @@ -224,16 +310,16 @@ return_rc: struct diff_result * diff_main(const struct diff_config *config, - const uint8_t *left_data, size_t left_len, - const uint8_t *right_data, size_t right_len) + int left_fd, const uint8_t *left_data, off_t left_len, + int right_fd, const uint8_t *right_data, off_t right_len) { struct diff_result *result = malloc(sizeof(struct diff_result)); if (!result) return NULL; *result = (struct diff_result){}; - diff_data_init_root(&result->left, left_data, left_len); - diff_data_init_root(&result->right, right_data, right_len); + diff_data_init_root(&result->left, left_fd, left_data, left_len); + diff_data_init_root(&result->right, right_fd, right_data, right_len); if (!config->atomize_func) { result->rc = DIFF_RC_EINVAL; blob - 8d2e931317d282485fa79c4c3fabaf475ea31f6e blob + 07799698e67e76ec7421983f7bee626ae2aeb3c3 --- lib/diff_output.c +++ lib/diff_output.c @@ -15,8 +15,33 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#include + #include +static char +get_atom_byte(struct diff_atom *atom, off_t off) +{ + char ch; + off_t cur; + + if (atom->at != NULL) + return atom->at[off]; + + cur = lseek(atom->d->root->fd, 0, SEEK_CUR); + if (cur == -1) + abort(); /* XXX cannot return error */ + + if (cur != atom->pos + off && + lseek(atom->d->root->fd, atom->pos + off, SEEK_SET) == -1) + abort(); /* XXX cannot return error */ + + if (read(atom->d->root->fd, &ch, sizeof(ch)) == -1) + abort(); /* XXX cannot return error */ + + return ch; +} + void diff_output_lines(FILE *dest, const char *prefix, struct diff_atom *start_atom, unsigned int count) @@ -26,14 +51,20 @@ diff_output_lines(FILE *dest, const char *prefix, stru fprintf(dest, "%s", prefix); int i; unsigned int len = atom->len; - if (len && atom->at[len - 1] == '\n') { - len--; - if (len && atom->at[len - 1] == '\r') + if (len) { + char ch; + ch = get_atom_byte(atom, len - 1); + if (ch == '\n') len--; + if (len) { + ch = get_atom_byte(atom, len - 1); + if (ch == '\r') + len--; + } } for (i = 0; i < len; i++) { - char c = atom->at[i]; + char c = get_atom_byte(atom, i); if ((c < 0x20 || c >= 0x7f) && c != '\t') fprintf(dest, "\\x%02x", (unsigned char)c); else