1 3b0f3d61 2020-01-22 neels /* Generic infrastructure to implement various diff algorithms. */
3 3b0f3d61 2020-01-22 neels * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
5 3b0f3d61 2020-01-22 neels * Permission to use, copy, modify, and distribute this software for any
6 3b0f3d61 2020-01-22 neels * purpose with or without fee is hereby granted, provided that the above
7 3b0f3d61 2020-01-22 neels * copyright notice and this permission notice appear in all copies.
9 3b0f3d61 2020-01-22 neels * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 3b0f3d61 2020-01-22 neels * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 3b0f3d61 2020-01-22 neels * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 3b0f3d61 2020-01-22 neels * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 3b0f3d61 2020-01-22 neels * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 3b0f3d61 2020-01-22 neels * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 3b0f3d61 2020-01-22 neels * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 54fa8228 2020-01-27 neels #ifndef MAX
19 54fa8228 2020-01-27 neels #define MAX(A,B) ((A)>(B)?(A):(B))
21 54fa8228 2020-01-27 neels #ifndef MIN
22 54fa8228 2020-01-27 neels #define MIN(A,B) ((A)<(B)?(A):(B))
25 d362ea2e 2020-07-25 stsp struct diff_range {
30 61a7b578 2020-05-06 neels static inline bool
31 d362ea2e 2020-07-25 stsp diff_range_empty(const struct diff_range *r)
33 54fa8228 2020-01-27 neels return r->start == r->end;
36 61a7b578 2020-05-06 neels static inline bool
37 d362ea2e 2020-07-25 stsp diff_ranges_touch(const struct diff_range *a, const struct diff_range *b)
39 54fa8228 2020-01-27 neels return (a->end >= b->start) && (a->start <= b->end);
42 61a7b578 2020-05-06 neels static inline void
43 d362ea2e 2020-07-25 stsp diff_ranges_merge(struct diff_range *a, const struct diff_range *b)
45 d362ea2e 2020-07-25 stsp *a = (struct diff_range){
46 54fa8228 2020-01-27 neels .start = MIN(a->start, b->start),
47 54fa8228 2020-01-27 neels .end = MAX(a->end, b->end),
51 61a7b578 2020-05-06 neels static inline int
52 d362ea2e 2020-07-25 stsp diff_range_len(const struct diff_range *r)
56 54fa8228 2020-01-27 neels return r->end - r->start;
59 3b0f3d61 2020-01-22 neels /* List of all possible return codes of a diff invocation. */
60 3e6cba3a 2020-08-13 stsp #define DIFF_RC_USE_DIFF_ALGO_FALLBACK -1
61 3e6cba3a 2020-08-13 stsp #define DIFF_RC_OK 0
62 3e6cba3a 2020-08-13 stsp /* Any positive return values are errno values from sys/errno.h */
64 c6eecea3 2020-07-26 stsp struct diff_data;
66 3b0f3d61 2020-01-22 neels struct diff_atom {
67 c6eecea3 2020-07-26 stsp struct diff_data *d; /* back pointer to associated diff data */
69 c6eecea3 2020-07-26 stsp off_t pos; /* if not memory-mapped */
70 c6eecea3 2020-07-26 stsp const uint8_t *at; /* if memory-mapped */
73 0d27172a 2020-05-06 neels /* This hash is just a very cheap speed up for finding *mismatching*
74 0d27172a 2020-05-06 neels * atoms. When hashes match, we still need to compare entire atoms to
75 0d27172a 2020-05-06 neels * find out whether they are indeed identical or not. */
76 3b0f3d61 2020-01-22 neels unsigned int hash;
78 3b0f3d61 2020-01-22 neels /* State for the Patience Diff algorithm */
79 3b0f3d61 2020-01-22 neels /* TODO: keep a separate array for the patience state */
81 3b0f3d61 2020-01-22 neels bool unique_here;
82 3b0f3d61 2020-01-22 neels bool unique_in_both;
83 3b0f3d61 2020-01-22 neels struct diff_atom *pos_in_other;
84 3b0f3d61 2020-01-22 neels struct diff_atom *prev_stack;
85 d362ea2e 2020-07-25 stsp struct diff_range identical_lines;
86 3b0f3d61 2020-01-22 neels } patience;
89 0d27172a 2020-05-06 neels /* For each file, there is a "root" struct diff_data referencing the entire
90 0d27172a 2020-05-06 neels * file, which the atoms are parsed from. In recursion of diff algorithm, there
91 0d27172a 2020-05-06 neels * may be "child" struct diff_data only referencing a subsection of the file,
92 0d27172a 2020-05-06 neels * re-using the atoms parsing. For "root" structs, atoms_allocated will be
93 0d27172a 2020-05-06 neels * nonzero, indicating that the array of atoms is owned by that struct. For
94 0d27172a 2020-05-06 neels * "child" structs, atoms_allocated == 0, to indicate that the struct is
95 0d27172a 2020-05-06 neels * referencing a subset of atoms. */
96 3b0f3d61 2020-01-22 neels struct diff_data {
97 c6eecea3 2020-07-26 stsp int fd; /* if root diff_data and not memory-mapped */
98 c6eecea3 2020-07-26 stsp off_t pos; /* if not memory-mapped */
99 c6eecea3 2020-07-26 stsp const uint8_t *data; /* if memory-mapped */
102 3b0f3d61 2020-01-22 neels ARRAYLIST(struct diff_atom) atoms;
103 3b0f3d61 2020-01-22 neels struct diff_data *root;
105 732e8ee0 2020-09-20 stsp bool ignore_whitespace;
108 3b0f3d61 2020-01-22 neels void diff_data_free(struct diff_data *diff_data);
111 b3fb4686 2020-09-20 neels diff_atom_cmp(int *cmp,
112 b3fb4686 2020-09-20 neels const struct diff_atom *left,
113 b3fb4686 2020-09-20 neels const struct diff_atom *right);
115 c6eecea3 2020-07-26 stsp /* Indicate whether two given diff atoms match. */
117 b3fb4686 2020-09-20 neels diff_atom_same(bool *same,
118 b3fb4686 2020-09-20 neels const struct diff_atom *left,
119 b3fb4686 2020-09-20 neels const struct diff_atom *right);
121 0d27172a 2020-05-06 neels /* The atom's index in the entire file. For atoms divided by lines of text, this
122 0d27172a 2020-05-06 neels * yields the line number (starting with 0). Also works for diff_data that
123 0d27172a 2020-05-06 neels * reference only a subsection of a file, always reflecting the global position
124 0d27172a 2020-05-06 neels * in the file (and not the relative position within the subsection). */
125 3b0f3d61 2020-01-22 neels #define diff_atom_root_idx(DIFF_DATA, ATOM) \
126 15ac50e2 2020-05-05 neels ((ATOM) && ((ATOM) >= (DIFF_DATA)->root->atoms.head) \
127 15ac50e2 2020-05-05 neels ? (unsigned int)((ATOM) - ((DIFF_DATA)->root->atoms.head)) \
128 15ac50e2 2020-05-05 neels : (DIFF_DATA)->root->atoms.len)
130 0d27172a 2020-05-06 neels /* The atom's index within DIFF_DATA. For atoms divided by lines of text, this
131 0d27172a 2020-05-06 neels * yields the line number (starting with 0). */
132 3b0f3d61 2020-01-22 neels #define diff_atom_idx(DIFF_DATA, ATOM) \
133 15ac50e2 2020-05-05 neels ((ATOM) && ((ATOM) >= (DIFF_DATA)->atoms.head) \
134 15ac50e2 2020-05-05 neels ? (unsigned int)((ATOM) - ((DIFF_DATA)->atoms.head)) \
135 15ac50e2 2020-05-05 neels : (DIFF_DATA)->atoms.len)
137 3b0f3d61 2020-01-22 neels #define foreach_diff_atom(ATOM, FIRST_ATOM, COUNT) \
138 3b0f3d61 2020-01-22 neels for ((ATOM) = (FIRST_ATOM); \
140 0d27172a 2020-05-06 neels && ((ATOM) >= (FIRST_ATOM)) \
141 0d27172a 2020-05-06 neels && ((ATOM) - (FIRST_ATOM) < (COUNT)); \
144 3b0f3d61 2020-01-22 neels #define diff_data_foreach_atom(ATOM, DIFF_DATA) \
145 3b0f3d61 2020-01-22 neels foreach_diff_atom(ATOM, (DIFF_DATA)->atoms.head, (DIFF_DATA)->atoms.len)
147 3b0f3d61 2020-01-22 neels #define diff_data_foreach_atom_from(FROM, ATOM, DIFF_DATA) \
148 3b0f3d61 2020-01-22 neels for ((ATOM) = (FROM); \
150 0d27172a 2020-05-06 neels && ((ATOM) >= (DIFF_DATA)->atoms.head) \
151 0d27172a 2020-05-06 neels && ((ATOM) - (DIFF_DATA)->atoms.head < (DIFF_DATA)->atoms.len); \
154 0d27172a 2020-05-06 neels /* A diff chunk represents a set of atoms on the left and/or a set of atoms on
155 0d27172a 2020-05-06 neels * the right.
157 3b0f3d61 2020-01-22 neels * If solved == false:
158 0d27172a 2020-05-06 neels * The diff algorithm has divided the source file, and this is a chunk that the
159 0d27172a 2020-05-06 neels * inner_algo should run on next.
160 3b0f3d61 2020-01-22 neels * The lines on the left should be diffed against the lines on the right.
161 0d27172a 2020-05-06 neels * (If there are no left lines or no right lines, it implies solved == true,
162 0d27172a 2020-05-06 neels * because there is nothing to diff.)
164 3b0f3d61 2020-01-22 neels * If solved == true:
165 0d27172a 2020-05-06 neels * If there are only left atoms, it is a chunk removing atoms from the left ("a
166 0d27172a 2020-05-06 neels * minus chunk").
167 0d27172a 2020-05-06 neels * If there are only right atoms, it is a chunk adding atoms from the right ("a
168 0d27172a 2020-05-06 neels * plus chunk").
169 0d27172a 2020-05-06 neels * If there are both left and right lines, it is a chunk of equal content on
170 0d27172a 2020-05-06 neels * both sides, and left_count == right_count:
173 3b0f3d61 2020-01-22 neels * - bar }-- diff_chunk{ left_start = &left.atoms.head[0], left_count = 3,
174 3b0f3d61 2020-01-22 neels * - baz } right_start = NULL, right_count = 0 }
176 3b0f3d61 2020-01-22 neels * goo }-- diff_chunk{ left_start = &left.atoms.head[3], left_count = 3,
177 0d27172a 2020-05-06 neels * zoo } right_start = &right.atoms.head[0], right_count = 3 }
179 3b0f3d61 2020-01-22 neels * +roo }-- diff_chunk{ left_start = NULL, left_count = 0,
180 0d27172a 2020-05-06 neels * +too } right_start = &right.atoms.head[3], right_count = 3 }
183 3b0f3d61 2020-01-22 neels struct diff_chunk {
184 3b0f3d61 2020-01-22 neels bool solved;
185 3b0f3d61 2020-01-22 neels struct diff_atom *left_start;
186 3b0f3d61 2020-01-22 neels unsigned int left_count;
187 3b0f3d61 2020-01-22 neels struct diff_atom *right_start;
188 3b0f3d61 2020-01-22 neels unsigned int right_count;
191 3b0f3d61 2020-01-22 neels typedef ARRAYLIST(struct diff_chunk) diff_chunk_arraylist_t;
192 3b0f3d61 2020-01-22 neels #define DIFF_RESULT_ALLOC_BLOCKSIZE 128
194 8546b045 2020-09-20 neels enum diff_chunk_type {
195 8546b045 2020-09-20 neels CHUNK_EMPTY,
196 8546b045 2020-09-20 neels CHUNK_PLUS,
197 8546b045 2020-09-20 neels CHUNK_MINUS,
198 8546b045 2020-09-20 neels CHUNK_SAME,
199 8546b045 2020-09-20 neels CHUNK_ERROR,
202 8546b045 2020-09-20 neels static inline enum diff_chunk_type
203 8546b045 2020-09-20 neels diff_chunk_type(const struct diff_chunk *chunk)
205 8546b045 2020-09-20 neels if (!chunk->left_count && !chunk->right_count)
206 8546b045 2020-09-20 neels return CHUNK_EMPTY;
207 8546b045 2020-09-20 neels if (!chunk->solved)
208 8546b045 2020-09-20 neels return CHUNK_ERROR;
209 8546b045 2020-09-20 neels if (!chunk->right_count)
210 8546b045 2020-09-20 neels return CHUNK_MINUS;
211 8546b045 2020-09-20 neels if (!chunk->left_count)
212 8546b045 2020-09-20 neels return CHUNK_PLUS;
213 8546b045 2020-09-20 neels if (chunk->left_count != chunk->right_count)
214 8546b045 2020-09-20 neels return CHUNK_ERROR;
215 8546b045 2020-09-20 neels return CHUNK_SAME;
218 3b0f3d61 2020-01-22 neels struct diff_result {
220 3b0f3d61 2020-01-22 neels struct diff_data left;
221 3b0f3d61 2020-01-22 neels struct diff_data right;
222 3b0f3d61 2020-01-22 neels diff_chunk_arraylist_t chunks;
225 3b0f3d61 2020-01-22 neels struct diff_state {
226 3b0f3d61 2020-01-22 neels /* The final result passed to the original diff caller. */
227 3b0f3d61 2020-01-22 neels struct diff_result *result;
229 0d27172a 2020-05-06 neels /* The root diff_data is in result->left,right, these are (possibly)
230 0d27172a 2020-05-06 neels * subsections of the root data. */
231 3b0f3d61 2020-01-22 neels struct diff_data left;
232 3b0f3d61 2020-01-22 neels struct diff_data right;
234 3b0f3d61 2020-01-22 neels unsigned int recursion_depth_left;
236 0d27172a 2020-05-06 neels /* Remaining chunks from one diff algorithm pass, if any solved == false
237 0d27172a 2020-05-06 neels * chunks came up. */
238 3b0f3d61 2020-01-22 neels diff_chunk_arraylist_t temp_result;
241 3b0f3d61 2020-01-22 neels struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved,
242 0d27172a 2020-05-06 neels struct diff_atom *left_start,
243 0d27172a 2020-05-06 neels unsigned int left_count,
244 0d27172a 2020-05-06 neels struct diff_atom *right_start,
245 0d27172a 2020-05-06 neels unsigned int right_count);
247 3b0f3d61 2020-01-22 neels /* Signature of a utility function to divide both source files into diff atoms.
248 0d27172a 2020-05-06 neels * It is possible that a (future) algorithm requires both source files to decide
249 0d27172a 2020-05-06 neels * on atom split points, hence this gets both left and right to atomize at the
250 0d27172a 2020-05-06 neels * same time.
251 3b0f3d61 2020-01-22 neels * An example is diff_atomize_text_by_line() in diff_atomize_text.c.
253 3b0f3d61 2020-01-22 neels * func_data: context pointer (free to be used by implementation).
254 0d27172a 2020-05-06 neels * left: struct diff_data with left->data and left->len already set up, and
255 0d27172a 2020-05-06 neels * left->atoms to be created.
256 0d27172a 2020-05-06 neels * right: struct diff_data with right->data and right->len already set up, and
257 0d27172a 2020-05-06 neels * right->atoms to be created.
259 3e6cba3a 2020-08-13 stsp typedef int (*diff_atomize_func_t)(void *func_data,
260 0d27172a 2020-05-06 neels struct diff_data *left,
261 0d27172a 2020-05-06 neels struct diff_data *right);
263 3e6cba3a 2020-08-13 stsp extern int diff_atomize_text_by_line(void *func_data,
264 0d27172a 2020-05-06 neels struct diff_data *left,
265 0d27172a 2020-05-06 neels struct diff_data *right);
267 3b0f3d61 2020-01-22 neels struct diff_algo_config;
268 3e6cba3a 2020-08-13 stsp typedef int (*diff_algo_impl_t)(
269 0d27172a 2020-05-06 neels const struct diff_algo_config *algo_config, struct diff_state *state);
271 0d27172a 2020-05-06 neels /* Form a result with all left-side removed and all right-side added, i.e. no
272 0d27172a 2020-05-06 neels * actual diff algorithm involved. */
273 3e6cba3a 2020-08-13 stsp int diff_algo_none(const struct diff_algo_config *algo_config,
274 0d27172a 2020-05-06 neels struct diff_state *state);
276 0d27172a 2020-05-06 neels /* Myers Diff tracing from the start all the way through to the end, requiring
277 0d27172a 2020-05-06 neels * quadratic amounts of memory. This can fail if the required space surpasses
278 0d27172a 2020-05-06 neels * algo_config->permitted_state_size. */
279 3e6cba3a 2020-08-13 stsp extern int diff_algo_myers(const struct diff_algo_config *algo_config,
280 0d27172a 2020-05-06 neels struct diff_state *state);
282 0d27172a 2020-05-06 neels /* Myers "Divide et Impera": tracing forwards from the start and backwards from
283 0d27172a 2020-05-06 neels * the end to find a midpoint that divides the problem into smaller chunks.
284 0d27172a 2020-05-06 neels * Requires only linear amounts of memory. */
285 3e6cba3a 2020-08-13 stsp extern int diff_algo_myers_divide(
286 0d27172a 2020-05-06 neels const struct diff_algo_config *algo_config, struct diff_state *state);
288 0d27172a 2020-05-06 neels /* Patience Diff algorithm, which divides a larger diff into smaller chunks. For
289 0d27172a 2020-05-06 neels * very specific scenarios, it may lead to a complete diff result by itself, but
290 0d27172a 2020-05-06 neels * needs a fallback algo to solve chunks that don't have common-unique atoms. */
291 3e6cba3a 2020-08-13 stsp extern int diff_algo_patience(
292 0d27172a 2020-05-06 neels const struct diff_algo_config *algo_config, struct diff_state *state);
294 3b0f3d61 2020-01-22 neels /* Diff algorithms to use, possibly nested. For example:
296 3b0f3d61 2020-01-22 neels * struct diff_algo_config myers, patience, myers_divide;
298 3b0f3d61 2020-01-22 neels * myers = (struct diff_algo_config){
299 3b0f3d61 2020-01-22 neels * .impl = diff_algo_myers,
300 3b0f3d61 2020-01-22 neels * .permitted_state_size = 32 * 1024 * 1024,
301 0d27172a 2020-05-06 neels * // When too large, do diff_algo_patience:
302 0d27172a 2020-05-06 neels * .fallback_algo = &patience,
305 0d27172a 2020-05-06 neels * const struct diff_algo_config patience = (struct diff_algo_config){
306 0d27172a 2020-05-06 neels * .impl = diff_algo_patience,
307 0d27172a 2020-05-06 neels * // After subdivision, do Patience again:
308 0d27172a 2020-05-06 neels * .inner_algo = &patience,
309 0d27172a 2020-05-06 neels * // If subdivision failed, do Myers Divide et Impera:
310 0d27172a 2020-05-06 neels * .fallback_algo = &myers_then_myers_divide,
313 0d27172a 2020-05-06 neels * const struct diff_algo_config myers_divide = (struct diff_algo_config){
314 0d27172a 2020-05-06 neels * .impl = diff_algo_myers_divide,
315 0d27172a 2020-05-06 neels * // When division succeeded, start from the top:
316 0d27172a 2020-05-06 neels * .inner_algo = &myers_then_myers_divide,
317 0d27172a 2020-05-06 neels * // (fallback_algo = NULL implies diff_algo_none).
319 3b0f3d61 2020-01-22 neels * struct diff_config config = {
320 3b0f3d61 2020-01-22 neels * .algo = &myers,
323 3b0f3d61 2020-01-22 neels * diff_main(&config, ...);
325 3b0f3d61 2020-01-22 neels struct diff_algo_config {
326 3b0f3d61 2020-01-22 neels diff_algo_impl_t impl;
328 0d27172a 2020-05-06 neels /* Fail this algo if it would use more than this amount of memory, and
329 0d27172a 2020-05-06 neels * instead use fallback_algo (diff_algo_myers). permitted_state_size ==
330 0d27172a 2020-05-06 neels * 0 means no limitation. */
331 3b0f3d61 2020-01-22 neels size_t permitted_state_size;
333 0d27172a 2020-05-06 neels /* For algorithms that divide into smaller chunks, use this algorithm to
334 0d27172a 2020-05-06 neels * solve the divided chunks. */
335 3b0f3d61 2020-01-22 neels const struct diff_algo_config *inner_algo;
337 0d27172a 2020-05-06 neels /* If the algorithm fails (e.g. diff_algo_myers_if_small needs too large
338 0d27172a 2020-05-06 neels * state, or diff_algo_patience can't find any common-unique atoms),
339 0d27172a 2020-05-06 neels * then use this algorithm instead. */
340 3b0f3d61 2020-01-22 neels const struct diff_algo_config *fallback_algo;
343 3b0f3d61 2020-01-22 neels struct diff_config {
344 3b0f3d61 2020-01-22 neels diff_atomize_func_t atomize_func;
345 3b0f3d61 2020-01-22 neels void *atomize_func_data;
347 3b0f3d61 2020-01-22 neels const struct diff_algo_config *algo;
349 0d27172a 2020-05-06 neels /* How deep to step into subdivisions of a source file, a paranoia /
350 0d27172a 2020-05-06 neels * safety measure to guard against infinite loops through diff
351 0d27172a 2020-05-06 neels * algorithms. When the maximum recursion is reached, employ
352 0d27172a 2020-05-06 neels * diff_algo_none (i.e. remove all left atoms and add all right atoms).
354 3b0f3d61 2020-01-22 neels unsigned int max_recursion_depth;
357 3b0f3d61 2020-01-22 neels struct diff_result *diff_main(const struct diff_config *config,
358 c6eecea3 2020-07-26 stsp int left_fd, const uint8_t *left_data,
359 c6eecea3 2020-07-26 stsp off_t left_len,
360 c6eecea3 2020-07-26 stsp int right_fd, const uint8_t *right_data,
361 732e8ee0 2020-09-20 stsp off_t right_len, bool ignore_whitespace);
362 3b0f3d61 2020-01-22 neels void diff_result_free(struct diff_result *result);