Blob


/* Generic infrastructure to implement various diff algorithms. */
/*
 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
/* NOTE: classic double-evaluation macros -- each argument may be evaluated
 * twice, so do not pass expressions with side effects (e.g. MAX(i++, j)). */
#ifndef MAX
#define MAX(A,B) ((A)>(B)?(A):(B))
#endif
#ifndef MIN
#define MIN(A,B) ((A)<(B)?(A):(B))
#endif
/* A range of atom indices, [start, end) (end is exclusive: an empty range has
 * start == end, see diff_range_empty()). */
struct diff_range {
	int start;
	int end;
};
30 static inline bool
31 diff_range_empty(const struct diff_range *r)
32 {
33 return r->start == r->end;
34 }
36 static inline bool
37 diff_ranges_touch(const struct diff_range *a, const struct diff_range *b)
38 {
39 return (a->end >= b->start) && (a->start <= b->end);
40 }
42 static inline void
43 diff_ranges_merge(struct diff_range *a, const struct diff_range *b)
44 {
45 *a = (struct diff_range){
46 .start = MIN(a->start, b->start),
47 .end = MAX(a->end, b->end),
48 };
49 }
51 static inline int
52 diff_range_len(const struct diff_range *r)
53 {
54 if (!r)
55 return 0;
56 return r->end - r->start;
57 }
/* List of all possible return codes of a diff invocation. */
#define DIFF_RC_USE_DIFF_ALGO_FALLBACK	-1
#define DIFF_RC_OK			0
/* Any positive return values are errno values from sys/errno.h */
64 struct diff_data;
66 struct diff_atom {
67 struct diff_data *d; /* back pointer to associated diff data */
69 off_t pos; /* if not memory-mapped */
70 const uint8_t *at; /* if memory-mapped */
71 off_t len;
73 /* This hash is just a very cheap speed up for finding *mismatching*
74 * atoms. When hashes match, we still need to compare entire atoms to
75 * find out whether they are indeed identical or not. */
76 unsigned int hash;
78 /* State for the Patience Diff algorithm */
79 /* TODO: keep a separate array for the patience state */
80 struct {
81 bool unique_here;
82 bool unique_in_both;
83 struct diff_atom *pos_in_other;
84 struct diff_atom *prev_stack;
85 struct diff_range identical_lines;
86 } patience;
87 };
89 /* For each file, there is a "root" struct diff_data referencing the entire
90 * file, which the atoms are parsed from. In recursion of diff algorithm, there
91 * may be "child" struct diff_data only referencing a subsection of the file,
92 * re-using the atoms parsing. For "root" structs, atoms_allocated will be
93 * nonzero, indicating that the array of atoms is owned by that struct. For
94 * "child" structs, atoms_allocated == 0, to indicate that the struct is
95 * referencing a subset of atoms. */
96 struct diff_data {
97 int fd; /* if root diff_data and not memory-mapped */
98 off_t pos; /* if not memory-mapped */
99 const uint8_t *data; /* if memory-mapped */
100 off_t len;
102 ARRAYLIST(struct diff_atom) atoms;
103 struct diff_data *root;
105 bool ignore_whitespace;
106 };
/* Release all resources held by diff_data (owned atoms, mappings, ...). */
void diff_data_free(struct diff_data *diff_data);

/* Compare the contents of two atoms, writing the comparison result to *cmp
 * (presumably strcmp()-style ordering -- confirm in the implementation).
 * Returns DIFF_RC_OK on success; positive return values are errno codes. */
int
diff_atom_cmp(int *cmp,
	      const struct diff_atom *left,
	      const struct diff_atom *right);

/* Indicate whether two given diff atoms match. */
int
diff_atom_same(bool *same,
	       const struct diff_atom *left,
	       const struct diff_atom *right);
/* The atom's index in the entire file. For atoms divided by lines of text, this
 * yields the line number (starting with 0). Also works for diff_data that
 * reference only a subsection of a file, always reflecting the global position
 * in the file (and not the relative position within the subsection).
 * A NULL or out-of-range ATOM yields the one-past-the-end index
 * (root->atoms.len). */
#define diff_atom_root_idx(DIFF_DATA, ATOM) \
	((ATOM) && ((ATOM) >= (DIFF_DATA)->root->atoms.head) \
	 ? (unsigned int)((ATOM) - ((DIFF_DATA)->root->atoms.head)) \
	 : (DIFF_DATA)->root->atoms.len)

/* The atom's index within DIFF_DATA. For atoms divided by lines of text, this
 * yields the line number (starting with 0). A NULL or out-of-range ATOM yields
 * the one-past-the-end index (atoms.len). */
#define diff_atom_idx(DIFF_DATA, ATOM) \
	((ATOM) && ((ATOM) >= (DIFF_DATA)->atoms.head) \
	 ? (unsigned int)((ATOM) - ((DIFF_DATA)->atoms.head)) \
	 : (DIFF_DATA)->atoms.len)
/* Iterate ATOM over the COUNT atoms starting at FIRST_ATOM. Safe against a
 * NULL FIRST_ATOM: the loop body then never runs. */
#define foreach_diff_atom(ATOM, FIRST_ATOM, COUNT) \
	for ((ATOM) = (FIRST_ATOM); \
	     (ATOM) \
	     && ((ATOM) >= (FIRST_ATOM)) \
	     && ((ATOM) - (FIRST_ATOM) < (COUNT)); \
	     (ATOM)++)

/* Iterate ATOM over all atoms of DIFF_DATA. */
#define diff_data_foreach_atom(ATOM, DIFF_DATA) \
	foreach_diff_atom(ATOM, (DIFF_DATA)->atoms.head, (DIFF_DATA)->atoms.len)

/* Like diff_data_foreach_atom(), but start iterating at FROM instead of at
 * DIFF_DATA's first atom. */
#define diff_data_foreach_atom_from(FROM, ATOM, DIFF_DATA) \
	for ((ATOM) = (FROM); \
	     (ATOM) \
	     && ((ATOM) >= (DIFF_DATA)->atoms.head) \
	     && ((ATOM) - (DIFF_DATA)->atoms.head < (DIFF_DATA)->atoms.len); \
	     (ATOM)++)
/* A diff chunk represents a set of atoms on the left and/or a set of atoms on
 * the right.
 *
 * If solved == false:
 * The diff algorithm has divided the source file, and this is a chunk that the
 * inner_algo should run on next.
 * The lines on the left should be diffed against the lines on the right.
 * (If there are no left lines or no right lines, it implies solved == true,
 * because there is nothing to diff.)
 *
 * If solved == true:
 * If there are only left atoms, it is a chunk removing atoms from the left ("a
 * minus chunk").
 * If there are only right atoms, it is a chunk adding atoms from the right ("a
 * plus chunk").
 * If there are both left and right lines, it is a chunk of equal content on
 * both sides, and left_count == right_count:
 *
 * - foo  }
 * - bar  }-- diff_chunk{ left_start = &left.atoms.head[0], left_count = 3,
 * - baz  }              right_start = NULL, right_count = 0 }
 *   moo    }
 *   goo    }-- diff_chunk{ left_start = &left.atoms.head[3], left_count = 3,
 *   zoo    }             right_start = &right.atoms.head[0], right_count = 3 }
 * +loo      }
 * +roo      }-- diff_chunk{ left_start = NULL, left_count = 0,
 * +too      }             right_start = &right.atoms.head[3], right_count = 3 }
 *
 */
struct diff_chunk {
	bool solved;
	struct diff_atom *left_start;
	unsigned int left_count;
	struct diff_atom *right_start;
	unsigned int right_count;
};
191 typedef ARRAYLIST(struct diff_chunk) diff_chunk_arraylist_t;
192 #define DIFF_RESULT_ALLOC_BLOCKSIZE 128
/* Classification of a diff_chunk, as computed by diff_chunk_type(). */
enum diff_chunk_type {
	CHUNK_EMPTY,	/* no atoms on either side */
	CHUNK_PLUS,	/* atoms added from the right */
	CHUNK_MINUS,	/* atoms removed from the left */
	CHUNK_SAME,	/* identical content on both sides */
	CHUNK_ERROR,	/* unsolved chunk or inconsistent counts */
};
202 static inline enum diff_chunk_type
203 diff_chunk_type(const struct diff_chunk *chunk)
205 if (!chunk->left_count && !chunk->right_count)
206 return CHUNK_EMPTY;
207 if (!chunk->solved)
208 return CHUNK_ERROR;
209 if (!chunk->right_count)
210 return CHUNK_MINUS;
211 if (!chunk->left_count)
212 return CHUNK_PLUS;
213 if (chunk->left_count != chunk->right_count)
214 return CHUNK_ERROR;
215 return CHUNK_SAME;
218 struct diff_result {
219 int rc;
220 struct diff_data left;
221 struct diff_data right;
222 diff_chunk_arraylist_t chunks;
223 };
225 struct diff_state {
226 /* The final result passed to the original diff caller. */
227 struct diff_result *result;
229 /* The root diff_data is in result->left,right, these are (possibly)
230 * subsections of the root data. */
231 struct diff_data left;
232 struct diff_data right;
234 unsigned int recursion_depth_left;
236 /* Remaining chunks from one diff algorithm pass, if any solved == false
237 * chunks came up. */
238 diff_chunk_arraylist_t temp_result;
239 };
/* Append a chunk spanning the given left/right atoms to state's result.
 * Returns the newly added chunk, or NULL -- presumably on allocation failure;
 * confirm in the implementation. */
struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved,
					struct diff_atom *left_start,
					unsigned int left_count,
					struct diff_atom *right_start,
					unsigned int right_count);
/* Signature of a utility function to divide both source files into diff atoms.
 * It is possible that a (future) algorithm requires both source files to decide
 * on atom split points, hence this gets both left and right to atomize at the
 * same time.
 * An example is diff_atomize_text_by_line() in diff_atomize_text.c.
 *
 * func_data: context pointer (free to be used by implementation).
 * left: struct diff_data with left->data and left->len already set up, and
 *       left->atoms to be created.
 * right: struct diff_data with right->data and right->len already set up, and
 *        right->atoms to be created.
 */
typedef int (*diff_atomize_func_t)(void *func_data,
				   struct diff_data *left,
				   struct diff_data *right);

/* Atomizer that splits both inputs into one atom per line of text. */
extern int diff_atomize_text_by_line(void *func_data,
				     struct diff_data *left,
				     struct diff_data *right);
struct diff_algo_config;

/* One diff algorithm step: operate on state, honoring the limits and nested
 * algorithms given in algo_config. */
typedef int (*diff_algo_impl_t)(
	const struct diff_algo_config *algo_config, struct diff_state *state);

/* Form a result with all left-side removed and all right-side added, i.e. no
 * actual diff algorithm involved. */
extern int diff_algo_none(const struct diff_algo_config *algo_config,
			  struct diff_state *state);

/* Myers Diff tracing from the start all the way through to the end, requiring
 * quadratic amounts of memory. This can fail if the required space surpasses
 * algo_config->permitted_state_size. */
extern int diff_algo_myers(const struct diff_algo_config *algo_config,
			   struct diff_state *state);

/* Myers "Divide et Impera": tracing forwards from the start and backwards from
 * the end to find a midpoint that divides the problem into smaller chunks.
 * Requires only linear amounts of memory. */
extern int diff_algo_myers_divide(
	const struct diff_algo_config *algo_config, struct diff_state *state);

/* Patience Diff algorithm, which divides a larger diff into smaller chunks. For
 * very specific scenarios, it may lead to a complete diff result by itself, but
 * needs a fallback algo to solve chunks that don't have common-unique atoms. */
extern int diff_algo_patience(
	const struct diff_algo_config *algo_config, struct diff_state *state);
294 /* Diff algorithms to use, possibly nested. For example:
296 * struct diff_algo_config myers, patience, myers_divide;
298 * myers = (struct diff_algo_config){
299 * .impl = diff_algo_myers,
300 * .permitted_state_size = 32 * 1024 * 1024,
301 * // When too large, do diff_algo_patience:
302 * .fallback_algo = &patience,
303 * };
305 * const struct diff_algo_config patience = (struct diff_algo_config){
306 * .impl = diff_algo_patience,
307 * // After subdivision, do Patience again:
308 * .inner_algo = &patience,
309 * // If subdivision failed, do Myers Divide et Impera:
310 * .fallback_algo = &myers_then_myers_divide,
311 * };
313 * const struct diff_algo_config myers_divide = (struct diff_algo_config){
314 * .impl = diff_algo_myers_divide,
315 * // When division succeeded, start from the top:
316 * .inner_algo = &myers_then_myers_divide,
317 * // (fallback_algo = NULL implies diff_algo_none).
318 * };
319 * struct diff_config config = {
320 * .algo = &myers,
321 * ...
322 * };
323 * diff_main(&config, ...);
324 */
325 struct diff_algo_config {
326 diff_algo_impl_t impl;
328 /* Fail this algo if it would use more than this amount of memory, and
329 * instead use fallback_algo (diff_algo_myers). permitted_state_size ==
330 * 0 means no limitation. */
331 size_t permitted_state_size;
333 /* For algorithms that divide into smaller chunks, use this algorithm to
334 * solve the divided chunks. */
335 const struct diff_algo_config *inner_algo;
337 /* If the algorithm fails (e.g. diff_algo_myers_if_small needs too large
338 * state, or diff_algo_patience can't find any common-unique atoms),
339 * then use this algorithm instead. */
340 const struct diff_algo_config *fallback_algo;
341 };
343 struct diff_config {
344 diff_atomize_func_t atomize_func;
345 void *atomize_func_data;
347 const struct diff_algo_config *algo;
349 /* How deep to step into subdivisions of a source file, a paranoia /
350 * safety measure to guard against infinite loops through diff
351 * algorithms. When the maximum recursion is reached, employ
352 * diff_algo_none (i.e. remove all left atoms and add all right atoms).
353 */
354 unsigned int max_recursion_depth;
355 };
/* Run the configured diff on the two inputs, each given either as a file
 * descriptor or as already-loaded memory (data/len). The caller owns the
 * returned result and must release it with diff_result_free(). */
struct diff_result *diff_main(const struct diff_config *config,
			      int left_fd, const uint8_t *left_data,
			      off_t left_len,
			      int right_fd, const uint8_t *right_data,
			      off_t right_len, bool ignore_whitespace);

/* Release all memory held by a result obtained from diff_main(). */
void diff_result_free(struct diff_result *result);