Blame


1 fe621944 2020-11-10 stsp /* Split source by line breaks, and calculate a simplistic checksum. */
2 fe621944 2020-11-10 stsp /*
3 fe621944 2020-11-10 stsp * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4 fe621944 2020-11-10 stsp *
5 fe621944 2020-11-10 stsp * Permission to use, copy, modify, and distribute this software for any
6 fe621944 2020-11-10 stsp * purpose with or without fee is hereby granted, provided that the above
7 fe621944 2020-11-10 stsp * copyright notice and this permission notice appear in all copies.
8 fe621944 2020-11-10 stsp *
9 fe621944 2020-11-10 stsp * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 fe621944 2020-11-10 stsp * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 fe621944 2020-11-10 stsp * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 fe621944 2020-11-10 stsp * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 fe621944 2020-11-10 stsp * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 fe621944 2020-11-10 stsp * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 fe621944 2020-11-10 stsp * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 fe621944 2020-11-10 stsp */
17 fe621944 2020-11-10 stsp
18 fe621944 2020-11-10 stsp #include <errno.h>
19 fe621944 2020-11-10 stsp #include <stdbool.h>
20 f3c44083 2020-11-14 naddy #include <stdint.h>
21 fe621944 2020-11-10 stsp #include <stdio.h>
22 fe621944 2020-11-10 stsp #include <stdlib.h>
23 fe621944 2020-11-10 stsp #include <unistd.h>
24 fe621944 2020-11-10 stsp #include <ctype.h>
25 fe621944 2020-11-10 stsp
26 fe621944 2020-11-10 stsp #include <arraylist.h>
27 fe621944 2020-11-10 stsp #include <diff_main.h>
28 fe621944 2020-11-10 stsp
29 fe621944 2020-11-10 stsp #include "diff_internal.h"
30 fe621944 2020-11-10 stsp #include "diff_debug.h"
31 fe621944 2020-11-10 stsp
/*
 * Fold one more byte of an atom into a running hash value and return the
 * updated hash. The first byte of every atom must be mixed into a hash of
 * zero. The arithmetic is unsigned, so it wraps around on overflow.
 */
static unsigned int
diff_atom_hash_update(unsigned int hash, unsigned char atom_byte)
{
	const unsigned int multiplier = 23;
	unsigned int mixed = hash * multiplier;

	mixed += atom_byte;
	return mixed;
}
41 dea26038 2020-11-18 stsp
42 fe621944 2020-11-10 stsp static int
43 fe621944 2020-11-10 stsp diff_data_atomize_text_lines_fd(struct diff_data *d)
44 fe621944 2020-11-10 stsp {
45 fe621944 2020-11-10 stsp off_t pos = 0;
46 fe621944 2020-11-10 stsp const off_t end = pos + d->len;
47 fe621944 2020-11-10 stsp unsigned int array_size_estimate = d->len / 50;
48 fe621944 2020-11-10 stsp unsigned int pow2 = 1;
49 fe621944 2020-11-10 stsp bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
50 b67f3bcb 2020-11-21 stsp bool embedded_nul = false;
51 fe621944 2020-11-10 stsp
52 fe621944 2020-11-10 stsp while (array_size_estimate >>= 1)
53 fe621944 2020-11-10 stsp pow2++;
54 fe621944 2020-11-10 stsp
55 fe621944 2020-11-10 stsp ARRAYLIST_INIT(d->atoms, 1 << pow2);
56 fe621944 2020-11-10 stsp
57 fe621944 2020-11-10 stsp if (fseek(d->root->f, 0L, SEEK_SET) == -1)
58 fe621944 2020-11-10 stsp return errno;
59 fe621944 2020-11-10 stsp
60 fe621944 2020-11-10 stsp while (pos < end) {
61 fe621944 2020-11-10 stsp off_t line_end = pos;
62 fe621944 2020-11-10 stsp unsigned int hash = 0;
63 fe621944 2020-11-10 stsp unsigned char buf[512];
64 fe621944 2020-11-10 stsp size_t r, i;
65 fe621944 2020-11-10 stsp struct diff_atom *atom;
66 fe621944 2020-11-10 stsp int eol = 0;
67 fe621944 2020-11-10 stsp
68 fe621944 2020-11-10 stsp while (eol == 0 && line_end < end) {
69 fe621944 2020-11-10 stsp r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
70 fe621944 2020-11-10 stsp if (r == 0 && ferror(d->root->f))
71 fe621944 2020-11-10 stsp return errno;
72 fe621944 2020-11-10 stsp i = 0;
73 fe621944 2020-11-10 stsp while (eol == 0 && i < r) {
74 fe621944 2020-11-10 stsp if (buf[i] != '\r' && buf[i] != '\n') {
75 fe621944 2020-11-10 stsp if (!ignore_whitespace
76 fe621944 2020-11-10 stsp || !isspace(buf[i]))
77 dea26038 2020-11-18 stsp hash = diff_atom_hash_update(
78 dea26038 2020-11-18 stsp hash, buf[i]);
79 b67f3bcb 2020-11-21 stsp if (buf[i] == '\0')
80 b67f3bcb 2020-11-21 stsp embedded_nul = true;
81 fe621944 2020-11-10 stsp line_end++;
82 fe621944 2020-11-10 stsp } else
83 fe621944 2020-11-10 stsp eol = buf[i];
84 fe621944 2020-11-10 stsp i++;
85 fe621944 2020-11-10 stsp }
86 fe621944 2020-11-10 stsp }
87 fe621944 2020-11-10 stsp
88 fe621944 2020-11-10 stsp /* When not at the end of data, the line ending char ('\r' or
89 fe621944 2020-11-10 stsp * '\n') must follow */
90 fe621944 2020-11-10 stsp if (line_end < end)
91 fe621944 2020-11-10 stsp line_end++;
92 fe621944 2020-11-10 stsp /* If that was an '\r', also pull in any following '\n' */
93 fe621944 2020-11-10 stsp if (line_end < end && eol == '\r') {
94 fe621944 2020-11-10 stsp if (fseeko(d->root->f, line_end, SEEK_SET) == -1)
95 fe621944 2020-11-10 stsp return errno;
96 fe621944 2020-11-10 stsp r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
97 fe621944 2020-11-10 stsp if (r == 0 && ferror(d->root->f))
98 fe621944 2020-11-10 stsp return errno;
99 8d504b53 2022-07-26 op if (r > 0 && buf[0] == '\n')
100 fe621944 2020-11-10 stsp line_end++;
101 fe621944 2020-11-10 stsp }
102 fe621944 2020-11-10 stsp
103 fe621944 2020-11-10 stsp /* Record the found line as diff atom */
104 fe621944 2020-11-10 stsp ARRAYLIST_ADD(atom, d->atoms);
105 fe621944 2020-11-10 stsp if (!atom)
106 fe621944 2020-11-10 stsp return ENOMEM;
107 fe621944 2020-11-10 stsp
108 fe621944 2020-11-10 stsp *atom = (struct diff_atom){
109 fe621944 2020-11-10 stsp .root = d,
110 fe621944 2020-11-10 stsp .pos = pos,
111 fe621944 2020-11-10 stsp .at = NULL, /* atom data is not memory-mapped */
112 fe621944 2020-11-10 stsp .len = line_end - pos,
113 fe621944 2020-11-10 stsp .hash = hash,
114 fe621944 2020-11-10 stsp };
115 fe621944 2020-11-10 stsp
116 fe621944 2020-11-10 stsp /* Starting point for next line: */
117 fe621944 2020-11-10 stsp pos = line_end;
118 fe621944 2020-11-10 stsp if (fseeko(d->root->f, pos, SEEK_SET) == -1)
119 fe621944 2020-11-10 stsp return errno;
120 fe621944 2020-11-10 stsp }
121 fe621944 2020-11-10 stsp
122 b67f3bcb 2020-11-21 stsp /* File are considered binary if they contain embedded '\0' bytes. */
123 b67f3bcb 2020-11-21 stsp if (embedded_nul)
124 b67f3bcb 2020-11-21 stsp d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
125 b67f3bcb 2020-11-21 stsp
126 fe621944 2020-11-10 stsp return DIFF_RC_OK;
127 fe621944 2020-11-10 stsp }
128 fe621944 2020-11-10 stsp
129 fe621944 2020-11-10 stsp static int
130 fe621944 2020-11-10 stsp diff_data_atomize_text_lines_mmap(struct diff_data *d)
131 fe621944 2020-11-10 stsp {
132 fe621944 2020-11-10 stsp const uint8_t *pos = d->data;
133 fe621944 2020-11-10 stsp const uint8_t *end = pos + d->len;
134 fe621944 2020-11-10 stsp bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
135 b67f3bcb 2020-11-21 stsp bool embedded_nul = false;
136 fe621944 2020-11-10 stsp unsigned int array_size_estimate = d->len / 50;
137 fe621944 2020-11-10 stsp unsigned int pow2 = 1;
138 fe621944 2020-11-10 stsp while (array_size_estimate >>= 1)
139 fe621944 2020-11-10 stsp pow2++;
140 fe621944 2020-11-10 stsp
141 fe621944 2020-11-10 stsp ARRAYLIST_INIT(d->atoms, 1 << pow2);
142 fe621944 2020-11-10 stsp
143 fe621944 2020-11-10 stsp while (pos < end) {
144 fe621944 2020-11-10 stsp const uint8_t *line_end = pos;
145 fe621944 2020-11-10 stsp unsigned int hash = 0;
146 fe621944 2020-11-10 stsp
147 fe621944 2020-11-10 stsp while (line_end < end && *line_end != '\r' && *line_end != '\n') {
148 fe621944 2020-11-10 stsp if (!ignore_whitespace
149 fe621944 2020-11-10 stsp || !isspace(*line_end))
150 1c5fbba3 2022-08-03 op hash = diff_atom_hash_update(hash, *line_end);
151 b67f3bcb 2020-11-21 stsp if (*line_end == '\0')
152 b67f3bcb 2020-11-21 stsp embedded_nul = true;
153 fe621944 2020-11-10 stsp line_end++;
154 fe621944 2020-11-10 stsp }
155 fe621944 2020-11-10 stsp
156 fe621944 2020-11-10 stsp /* When not at the end of data, the line ending char ('\r' or
157 fe621944 2020-11-10 stsp * '\n') must follow */
158 8d504b53 2022-07-26 op if (line_end < end && *line_end == '\r')
159 fe621944 2020-11-10 stsp line_end++;
160 8d504b53 2022-07-26 op if (line_end < end && *line_end == '\n')
161 fe621944 2020-11-10 stsp line_end++;
162 fe621944 2020-11-10 stsp
163 fe621944 2020-11-10 stsp /* Record the found line as diff atom */
164 fe621944 2020-11-10 stsp struct diff_atom *atom;
165 fe621944 2020-11-10 stsp ARRAYLIST_ADD(atom, d->atoms);
166 fe621944 2020-11-10 stsp if (!atom)
167 fe621944 2020-11-10 stsp return ENOMEM;
168 fe621944 2020-11-10 stsp
169 fe621944 2020-11-10 stsp *atom = (struct diff_atom){
170 fe621944 2020-11-10 stsp .root = d,
171 fe621944 2020-11-10 stsp .pos = (off_t)(pos - d->data),
172 fe621944 2020-11-10 stsp .at = pos,
173 fe621944 2020-11-10 stsp .len = line_end - pos,
174 fe621944 2020-11-10 stsp .hash = hash,
175 fe621944 2020-11-10 stsp };
176 fe621944 2020-11-10 stsp
177 fe621944 2020-11-10 stsp /* Starting point for next line: */
178 fe621944 2020-11-10 stsp pos = line_end;
179 fe621944 2020-11-10 stsp }
180 fe621944 2020-11-10 stsp
181 b67f3bcb 2020-11-21 stsp /* File are considered binary if they contain embedded '\0' bytes. */
182 b67f3bcb 2020-11-21 stsp if (embedded_nul)
183 b67f3bcb 2020-11-21 stsp d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
184 b67f3bcb 2020-11-21 stsp
185 fe621944 2020-11-10 stsp return DIFF_RC_OK;
186 fe621944 2020-11-10 stsp }
187 fe621944 2020-11-10 stsp
188 fe621944 2020-11-10 stsp static int
189 fe621944 2020-11-10 stsp diff_data_atomize_text_lines(struct diff_data *d)
190 fe621944 2020-11-10 stsp {
191 fe621944 2020-11-10 stsp if (d->data == NULL)
192 fe621944 2020-11-10 stsp return diff_data_atomize_text_lines_fd(d);
193 fe621944 2020-11-10 stsp else
194 fe621944 2020-11-10 stsp return diff_data_atomize_text_lines_mmap(d);
195 fe621944 2020-11-10 stsp }
196 fe621944 2020-11-10 stsp
/*
 * Atomizer entry point: split the data of d into one atom per line of
 * text. func_data is not used. Returns the result of
 * diff_data_atomize_text_lines().
 */
int
diff_atomize_text_by_line(void *func_data, struct diff_data *d)
{
	int rc;

	(void)func_data;	/* unused */
	rc = diff_data_atomize_text_lines(d);
	return rc;
}