Blob


/* Split source by line breaks, and calculate a simplistic checksum. */
/*
 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
18 #include <errno.h>
19 #include <stdbool.h>
20 #include <stdint.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <unistd.h>
24 #include <ctype.h>
26 #include <arraylist.h>
27 #include <diff_main.h>
29 #include "diff_internal.h"
30 #include "diff_debug.h"
/*
 * Mix another atom_byte into the provided hash value and return the result.
 * The hash value passed in for the first byte of the atom must be zero.
 *
 * The arithmetic is done in unsigned int, so the result wraps around on
 * overflow instead of invoking undefined behaviour.
 */
static unsigned int
diff_atom_hash_update(unsigned int hash, unsigned char atom_byte)
{
	return hash * 23 + atom_byte;
}
42 static int
43 diff_data_atomize_text_lines_fd(struct diff_data *d)
44 {
45 off_t pos = 0;
46 const off_t end = pos + d->len;
47 unsigned int array_size_estimate = d->len / 50;
48 unsigned int pow2 = 1;
49 bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
50 bool embedded_nul = false;
52 while (array_size_estimate >>= 1)
53 pow2++;
55 ARRAYLIST_INIT(d->atoms, 1 << pow2);
57 if (fseek(d->root->f, 0L, SEEK_SET) == -1)
58 return errno;
60 while (pos < end) {
61 off_t line_end = pos;
62 unsigned int hash = 0;
63 unsigned char buf[512];
64 size_t r, i;
65 struct diff_atom *atom;
66 int eol = 0;
68 while (eol == 0 && line_end < end) {
69 r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
70 if (r == 0 && ferror(d->root->f))
71 return errno;
72 i = 0;
73 while (eol == 0 && i < r) {
74 if (buf[i] != '\r' && buf[i] != '\n') {
75 if (!ignore_whitespace
76 || !isspace(buf[i]))
77 hash = diff_atom_hash_update(
78 hash, buf[i]);
79 if (buf[i] == '\0')
80 embedded_nul = true;
81 line_end++;
82 } else
83 eol = buf[i];
84 i++;
85 }
86 }
88 /* When not at the end of data, the line ending char ('\r' or
89 * '\n') must follow */
90 if (line_end < end)
91 line_end++;
92 /* If that was an '\r', also pull in any following '\n' */
93 if (line_end < end && eol == '\r') {
94 if (fseeko(d->root->f, line_end, SEEK_SET) == -1)
95 return errno;
96 r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
97 if (r == 0 && ferror(d->root->f))
98 return errno;
99 if (r > 0 && buf[0] == '\n')
100 line_end++;
103 /* Record the found line as diff atom */
104 ARRAYLIST_ADD(atom, d->atoms);
105 if (!atom)
106 return ENOMEM;
108 *atom = (struct diff_atom){
109 .root = d,
110 .pos = pos,
111 .at = NULL, /* atom data is not memory-mapped */
112 .len = line_end - pos,
113 .hash = hash,
114 };
116 /* Starting point for next line: */
117 pos = line_end;
118 if (fseeko(d->root->f, pos, SEEK_SET) == -1)
119 return errno;
122 /* File are considered binary if they contain embedded '\0' bytes. */
123 if (embedded_nul)
124 d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
126 return DIFF_RC_OK;
129 static int
130 diff_data_atomize_text_lines_mmap(struct diff_data *d)
132 const uint8_t *pos = d->data;
133 const uint8_t *end = pos + d->len;
134 bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
135 bool embedded_nul = false;
136 unsigned int array_size_estimate = d->len / 50;
137 unsigned int pow2 = 1;
138 while (array_size_estimate >>= 1)
139 pow2++;
141 ARRAYLIST_INIT(d->atoms, 1 << pow2);
143 while (pos < end) {
144 const uint8_t *line_end = pos;
145 unsigned int hash = 0;
147 while (line_end < end && *line_end != '\r' && *line_end != '\n') {
148 if (!ignore_whitespace
149 || !isspace(*line_end))
150 hash = diff_atom_hash_update(hash, *line_end);
151 if (*line_end == '\0')
152 embedded_nul = true;
153 line_end++;
156 /* When not at the end of data, the line ending char ('\r' or
157 * '\n') must follow */
158 if (line_end < end && *line_end == '\r')
159 line_end++;
160 if (line_end < end && *line_end == '\n')
161 line_end++;
163 /* Record the found line as diff atom */
164 struct diff_atom *atom;
165 ARRAYLIST_ADD(atom, d->atoms);
166 if (!atom)
167 return ENOMEM;
169 *atom = (struct diff_atom){
170 .root = d,
171 .pos = (off_t)(pos - d->data),
172 .at = pos,
173 .len = line_end - pos,
174 .hash = hash,
175 };
177 /* Starting point for next line: */
178 pos = line_end;
181 /* File are considered binary if they contain embedded '\0' bytes. */
182 if (embedded_nul)
183 d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
185 return DIFF_RC_OK;
188 static int
189 diff_data_atomize_text_lines(struct diff_data *d)
191 if (d->data == NULL)
192 return diff_data_atomize_text_lines_fd(d);
193 else
194 return diff_data_atomize_text_lines_mmap(d);
/*
 * Public entry point: split d into one atom per text line.
 *
 * func_data is not used by this function. The return value is passed through
 * from diff_data_atomize_text_lines().
 */
int
diff_atomize_text_by_line(void *func_data, struct diff_data *d)
{
	return diff_data_atomize_text_lines(d);
}