Blame


1 3b0f3d61 2020-01-22 neels /* Generic infrastructure to implement various diff algorithms. */
2 3b0f3d61 2020-01-22 neels /*
3 3b0f3d61 2020-01-22 neels * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4 3b0f3d61 2020-01-22 neels *
5 3b0f3d61 2020-01-22 neels * Permission to use, copy, modify, and distribute this software for any
6 3b0f3d61 2020-01-22 neels * purpose with or without fee is hereby granted, provided that the above
7 3b0f3d61 2020-01-22 neels * copyright notice and this permission notice appear in all copies.
8 3b0f3d61 2020-01-22 neels *
9 3b0f3d61 2020-01-22 neels * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 3b0f3d61 2020-01-22 neels * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 3b0f3d61 2020-01-22 neels * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 3b0f3d61 2020-01-22 neels * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 3b0f3d61 2020-01-22 neels * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 3b0f3d61 2020-01-22 neels * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 3b0f3d61 2020-01-22 neels * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 3b0f3d61 2020-01-22 neels */
17 3b0f3d61 2020-01-22 neels
18 3b0f3d61 2020-01-22 neels #pragma once
19 3b0f3d61 2020-01-22 neels
20 3b0f3d61 2020-01-22 neels #include <stdint.h>
21 3b0f3d61 2020-01-22 neels #include <stdlib.h>
22 3b0f3d61 2020-01-22 neels #include <stdbool.h>
23 3b0f3d61 2020-01-22 neels #include <string.h>
24 3b0f3d61 2020-01-22 neels #include <errno.h>
25 3b0f3d61 2020-01-22 neels
26 3b0f3d61 2020-01-22 neels #include <diff/arraylist.h>
27 3b0f3d61 2020-01-22 neels
28 3b0f3d61 2020-01-22 neels /* List of all possible return codes of a diff invocation. */
29 3b0f3d61 2020-01-22 neels enum diff_rc {
30 3b0f3d61 2020-01-22 neels DIFF_RC_USE_DIFF_ALGO_FALLBACK = -1,
31 3b0f3d61 2020-01-22 neels DIFF_RC_OK = 0,
32 3b0f3d61 2020-01-22 neels DIFF_RC_ENOTSUP = ENOTSUP,
33 3b0f3d61 2020-01-22 neels DIFF_RC_ENOMEM = ENOMEM,
34 3b0f3d61 2020-01-22 neels DIFF_RC_EINVAL = EINVAL,
35 3b0f3d61 2020-01-22 neels };
36 3b0f3d61 2020-01-22 neels
37 3b0f3d61 2020-01-22 neels struct diff_atom {
38 3b0f3d61 2020-01-22 neels const uint8_t *at;
39 3b0f3d61 2020-01-22 neels size_t len;
40 3b0f3d61 2020-01-22 neels
41 3b0f3d61 2020-01-22 neels /* This hash is just a very cheap speed up for finding *mismatching* atoms. When hashes match, we still need to
42 3b0f3d61 2020-01-22 neels * compare entire atoms to find out whether they are indeed identical or not. */
43 3b0f3d61 2020-01-22 neels unsigned int hash;
44 3b0f3d61 2020-01-22 neels
45 3b0f3d61 2020-01-22 neels /* State for the Patience Diff algorithm */
46 3b0f3d61 2020-01-22 neels /* TODO: keep a separate array for the patience state */
47 3b0f3d61 2020-01-22 neels struct {
48 3b0f3d61 2020-01-22 neels bool unique_here;
49 3b0f3d61 2020-01-22 neels bool unique_in_both;
50 3b0f3d61 2020-01-22 neels struct diff_atom *pos_in_other;
51 3b0f3d61 2020-01-22 neels struct diff_atom *prev_stack;
52 3b0f3d61 2020-01-22 neels } patience;
53 3b0f3d61 2020-01-22 neels };
54 3b0f3d61 2020-01-22 neels
55 3b0f3d61 2020-01-22 neels static inline bool diff_atom_same(const struct diff_atom *left, const struct diff_atom *right)
56 3b0f3d61 2020-01-22 neels {
57 3b0f3d61 2020-01-22 neels return left->hash == right->hash
58 3b0f3d61 2020-01-22 neels && left->len == right->len
59 3b0f3d61 2020-01-22 neels && memcmp(left->at, right->at, left->len) == 0;
60 3b0f3d61 2020-01-22 neels }
61 3b0f3d61 2020-01-22 neels
62 3b0f3d61 2020-01-22 neels /* For each file, there is a "root" struct diff_data referencing the entire file, which the atoms are parsed from. In
63 3b0f3d61 2020-01-22 neels * recursion of diff algorithm, there may be "child" struct diff_data only referencing a subsection of the file,
64 3b0f3d61 2020-01-22 neels * re-using the atoms parsing. For "root" structs, atoms_allocated will be nonzero, indicating that the array of atoms
65 3b0f3d61 2020-01-22 neels * is owned by that struct. For "child" structs, atoms_allocated == 0, to indicate that the struct is referencing a
66 3b0f3d61 2020-01-22 neels * subset of atoms. */
67 3b0f3d61 2020-01-22 neels struct diff_data {
68 3b0f3d61 2020-01-22 neels const uint8_t *data;
69 3b0f3d61 2020-01-22 neels size_t len;
70 3b0f3d61 2020-01-22 neels ARRAYLIST(struct diff_atom) atoms;
71 3b0f3d61 2020-01-22 neels struct diff_data *root;
72 3b0f3d61 2020-01-22 neels };
73 3b0f3d61 2020-01-22 neels
74 3b0f3d61 2020-01-22 neels void diff_data_free(struct diff_data *diff_data);
75 3b0f3d61 2020-01-22 neels
76 3b0f3d61 2020-01-22 neels /* The atom's index in the entire file. For atoms divided by lines of text, this yields the line number (starting with
77 3b0f3d61 2020-01-22 neels * 0). Also works for diff_data that reference only a subsection of a file, always reflecting the global position in
78 3b0f3d61 2020-01-22 neels * the file (and not the relative position within the subsection). */
79 3b0f3d61 2020-01-22 neels #define diff_atom_root_idx(DIFF_DATA, ATOM) \
80 3b0f3d61 2020-01-22 neels ((ATOM) ? (ATOM) - ((DIFF_DATA)->root->atoms.head) : (DIFF_DATA)->root->atoms.len)
81 3b0f3d61 2020-01-22 neels
82 3b0f3d61 2020-01-22 neels /* The atom's index within DIFF_DATA. For atoms divided by lines of text, this yields the line number (starting with
83 3b0f3d61 2020-01-22 neels * 0). */
84 3b0f3d61 2020-01-22 neels #define diff_atom_idx(DIFF_DATA, ATOM) \
85 3b0f3d61 2020-01-22 neels ((ATOM) ? (ATOM) - ((DIFF_DATA)->atoms.head) : (DIFF_DATA)->atoms.len)
86 3b0f3d61 2020-01-22 neels
87 3b0f3d61 2020-01-22 neels #define foreach_diff_atom(ATOM, FIRST_ATOM, COUNT) \
88 3b0f3d61 2020-01-22 neels for ((ATOM) = (FIRST_ATOM); \
89 3b0f3d61 2020-01-22 neels (ATOM) && ((ATOM) >= (FIRST_ATOM)) && ((ATOM) - (FIRST_ATOM) < (COUNT)); \
90 3b0f3d61 2020-01-22 neels (ATOM)++)
91 3b0f3d61 2020-01-22 neels
92 3b0f3d61 2020-01-22 neels #define diff_data_foreach_atom(ATOM, DIFF_DATA) \
93 3b0f3d61 2020-01-22 neels foreach_diff_atom(ATOM, (DIFF_DATA)->atoms.head, (DIFF_DATA)->atoms.len)
94 3b0f3d61 2020-01-22 neels
95 3b0f3d61 2020-01-22 neels #define diff_data_foreach_atom_from(FROM, ATOM, DIFF_DATA) \
96 3b0f3d61 2020-01-22 neels for ((ATOM) = (FROM); \
97 3b0f3d61 2020-01-22 neels (ATOM) && ((ATOM) >= (DIFF_DATA)->atoms.head) && ((ATOM) - (DIFF_DATA)->atoms.head < (DIFF_DATA)->atoms.len); \
98 3b0f3d61 2020-01-22 neels (ATOM)++)
99 3b0f3d61 2020-01-22 neels
100 3b0f3d61 2020-01-22 neels /* A diff chunk represents a set of atoms on the left and/or a set of atoms on the right.
101 3b0f3d61 2020-01-22 neels *
102 3b0f3d61 2020-01-22 neels * If solved == false:
103 3b0f3d61 2020-01-22 neels * The diff algorithm has divided the source file, and this is a chunk that the inner_algo should run on next.
104 3b0f3d61 2020-01-22 neels * The lines on the left should be diffed against the lines on the right.
105 3b0f3d61 2020-01-22 neels * (If there are no left lines or no right lines, it implies solved == true, because there is nothing to diff.)
106 3b0f3d61 2020-01-22 neels *
107 3b0f3d61 2020-01-22 neels * If solved == true:
108 3b0f3d61 2020-01-22 neels * If there are only left atoms, it is a chunk removing atoms from the left ("a minus chunk").
109 3b0f3d61 2020-01-22 neels * If there are only right atoms, it is a chunk adding atoms from the right ("a plus chunk").
110 3b0f3d61 2020-01-22 neels * If there are both left and right lines, it is a chunk of equal content on both sides,
111 3b0f3d61 2020-01-22 neels * and left_count == right_count:
112 3b0f3d61 2020-01-22 neels *
113 3b0f3d61 2020-01-22 neels * - foo }
114 3b0f3d61 2020-01-22 neels * - bar }-- diff_chunk{ left_start = &left.atoms.head[0], left_count = 3,
115 3b0f3d61 2020-01-22 neels * - baz } right_start = NULL, right_count = 0 }
116 3b0f3d61 2020-01-22 neels * moo }
117 3b0f3d61 2020-01-22 neels * goo }-- diff_chunk{ left_start = &left.atoms.head[3], left_count = 3,
118 3b0f3d61 2020-01-22 neels * zoo } right_start = &right.atoms.head[0], right_count = 3 }
119 3b0f3d61 2020-01-22 neels * +loo }
120 3b0f3d61 2020-01-22 neels * +roo }-- diff_chunk{ left_start = NULL, left_count = 0,
121 3b0f3d61 2020-01-22 neels * +too } right_start = &right.atoms.head[3], right_count = 3 }
122 3b0f3d61 2020-01-22 neels *
123 3b0f3d61 2020-01-22 neels */
124 3b0f3d61 2020-01-22 neels struct diff_chunk {
125 3b0f3d61 2020-01-22 neels bool solved;
126 3b0f3d61 2020-01-22 neels struct diff_atom *left_start;
127 3b0f3d61 2020-01-22 neels unsigned int left_count;
128 3b0f3d61 2020-01-22 neels struct diff_atom *right_start;
129 3b0f3d61 2020-01-22 neels unsigned int right_count;
130 3b0f3d61 2020-01-22 neels };
131 3b0f3d61 2020-01-22 neels
132 3b0f3d61 2020-01-22 neels typedef ARRAYLIST(struct diff_chunk) diff_chunk_arraylist_t;
133 3b0f3d61 2020-01-22 neels #define DIFF_RESULT_ALLOC_BLOCKSIZE 128
134 3b0f3d61 2020-01-22 neels
135 3b0f3d61 2020-01-22 neels struct diff_result {
136 3b0f3d61 2020-01-22 neels enum diff_rc rc;
137 3b0f3d61 2020-01-22 neels struct diff_data left;
138 3b0f3d61 2020-01-22 neels struct diff_data right;
139 3b0f3d61 2020-01-22 neels diff_chunk_arraylist_t chunks;
140 3b0f3d61 2020-01-22 neels };
141 3b0f3d61 2020-01-22 neels
142 3b0f3d61 2020-01-22 neels struct diff_state {
143 3b0f3d61 2020-01-22 neels /* The final result passed to the original diff caller. */
144 3b0f3d61 2020-01-22 neels struct diff_result *result;
145 3b0f3d61 2020-01-22 neels
146 3b0f3d61 2020-01-22 neels /* The root diff_data is in result->left,right, these are (possibly) subsections of the root data. */
147 3b0f3d61 2020-01-22 neels struct diff_data left;
148 3b0f3d61 2020-01-22 neels struct diff_data right;
149 3b0f3d61 2020-01-22 neels
150 3b0f3d61 2020-01-22 neels unsigned int recursion_depth_left;
151 3b0f3d61 2020-01-22 neels
152 3b0f3d61 2020-01-22 neels /* Remaining chunks from one diff algorithm pass, if any solved == false chunks came up. */
153 3b0f3d61 2020-01-22 neels diff_chunk_arraylist_t temp_result;
154 3b0f3d61 2020-01-22 neels };
155 3b0f3d61 2020-01-22 neels
156 3b0f3d61 2020-01-22 neels struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved,
157 3b0f3d61 2020-01-22 neels struct diff_atom *left_start, unsigned int left_count,
158 3b0f3d61 2020-01-22 neels struct diff_atom *right_start, unsigned int right_count);
159 3b0f3d61 2020-01-22 neels
160 3b0f3d61 2020-01-22 neels /* Signature of a utility function to divide both source files into diff atoms.
161 3b0f3d61 2020-01-22 neels * It is possible that a (future) algorithm requires both source files to decide on atom split points, hence this gets
162 3b0f3d61 2020-01-22 neels * both left and right to atomize at the same time.
163 3b0f3d61 2020-01-22 neels * An example is diff_atomize_text_by_line() in diff_atomize_text.c.
164 3b0f3d61 2020-01-22 neels *
165 3b0f3d61 2020-01-22 neels * func_data: context pointer (free to be used by implementation).
166 3b0f3d61 2020-01-22 neels * left: struct diff_data with left->data and left->len already set up, and left->atoms to be created.
167 3b0f3d61 2020-01-22 neels * right: struct diff_data with right->data and right->len already set up, and right->atoms to be created.
168 3b0f3d61 2020-01-22 neels */
169 3b0f3d61 2020-01-22 neels typedef enum diff_rc (*diff_atomize_func_t)(void *func_data, struct diff_data *left, struct diff_data *right);
170 3b0f3d61 2020-01-22 neels
171 3b0f3d61 2020-01-22 neels extern enum diff_rc diff_atomize_text_by_line(void *func_data, struct diff_data *left, struct diff_data *right);
172 3b0f3d61 2020-01-22 neels
173 3b0f3d61 2020-01-22 neels struct diff_algo_config;
174 3b0f3d61 2020-01-22 neels typedef enum diff_rc (*diff_algo_impl_t)(const struct diff_algo_config *algo_config, struct diff_state *state);
175 3b0f3d61 2020-01-22 neels
176 3b0f3d61 2020-01-22 neels /* Form a result with all left-side removed and all right-side added, i.e. no actual diff algorithm involved. */
177 3b0f3d61 2020-01-22 neels enum diff_rc diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state);
178 3b0f3d61 2020-01-22 neels
179 3b0f3d61 2020-01-22 neels /* Myers Diff tracing from the start all the way through to the end, requiring quadratic amounts of memory. This can
180 3b0f3d61 2020-01-22 neels * fail if the required space surpasses algo_config->permitted_state_size. */
181 3b0f3d61 2020-01-22 neels extern enum diff_rc diff_algo_myers(const struct diff_algo_config *algo_config, struct diff_state *state);
182 3b0f3d61 2020-01-22 neels
183 3b0f3d61 2020-01-22 neels /* Myers "Divide et Impera": tracing forwards from the start and backwards from the end to find a midpoint that divides
184 3b0f3d61 2020-01-22 neels * the problem into smaller chunks. Requires only linear amounts of memory. */
185 3b0f3d61 2020-01-22 neels extern enum diff_rc diff_algo_myers_divide(const struct diff_algo_config *algo_config, struct diff_state *state);
186 3b0f3d61 2020-01-22 neels
187 3b0f3d61 2020-01-22 neels /* Patience Diff algorithm, which divides a larger diff into smaller chunks. For very specific scenarios, it may lead to
188 3b0f3d61 2020-01-22 neels * a complete diff result by itself, but needs a fallback algo to solve chunks that don't have common-unique atoms. */
189 3b0f3d61 2020-01-22 neels extern enum diff_rc diff_algo_patience(const struct diff_algo_config *algo_config, struct diff_state *state);
190 3b0f3d61 2020-01-22 neels
191 3b0f3d61 2020-01-22 neels /* Diff algorithms to use, possibly nested. For example:
192 3b0f3d61 2020-01-22 neels *
193 3b0f3d61 2020-01-22 neels * struct diff_algo_config myers, patience, myers_divide;
194 3b0f3d61 2020-01-22 neels *
195 3b0f3d61 2020-01-22 neels * myers = (struct diff_algo_config){
196 3b0f3d61 2020-01-22 neels * .impl = diff_algo_myers,
197 3b0f3d61 2020-01-22 neels * .permitted_state_size = 32 * 1024 * 1024,
198 3b0f3d61 2020-01-22 neels * .fallback_algo = &patience, // when too large, do diff_algo_patience
199 3b0f3d61 2020-01-22 neels * };
200 3b0f3d61 2020-01-22 neels *
201 3b0f3d61 2020-01-22 neels * patience = (struct diff_algo_config){
202 3b0f3d61 2020-01-22 neels * .impl = diff_algo_patience,
203 3b0f3d61 2020-01-22 neels * .inner_algo = &patience, // After subdivision, do Patience again.
204 3b0f3d61 2020-01-22 neels * .fallback_algo = &myers_divide, // If subdivision failed, do Myers Divide et Impera.
205 3b0f3d61 2020-01-22 neels * };
206 3b0f3d61 2020-01-22 neels *
207 3b0f3d61 2020-01-22 neels * myers_divide = (struct diff_algo_config){
208 3b0f3d61 2020-01-22 neels * .impl = diff_algo_myers_divide,
209 3b0f3d61 2020-01-22 neels * .inner_algo = &myers, // When division succeeded, start from the top.
210 3b0f3d61 2020-01-22 neels * // (fallback_algo = NULL implies diff_algo_none).
211 3b0f3d61 2020-01-22 neels * };
212 3b0f3d61 2020-01-22 neels *
213 3b0f3d61 2020-01-22 neels * struct diff_config config = {
214 3b0f3d61 2020-01-22 neels * .algo = &myers,
215 3b0f3d61 2020-01-22 neels * ...
216 3b0f3d61 2020-01-22 neels * };
217 3b0f3d61 2020-01-22 neels * diff_main(&config, ...);
218 3b0f3d61 2020-01-22 neels */
219 3b0f3d61 2020-01-22 neels struct diff_algo_config {
220 3b0f3d61 2020-01-22 neels diff_algo_impl_t impl;
221 3b0f3d61 2020-01-22 neels
222 3b0f3d61 2020-01-22 neels /* Fail this algo if it would use more than this amount of memory, and instead use fallback_algo
223 3b0f3d61 2020-01-22 neels * (diff_algo_myers). permitted_state_size == 0 means no limitation. */
224 3b0f3d61 2020-01-22 neels size_t permitted_state_size;
225 3b0f3d61 2020-01-22 neels
226 3b0f3d61 2020-01-22 neels /* For algorithms that divide into smaller chunks, use this algorithm to solve the divided chunks. */
227 3b0f3d61 2020-01-22 neels const struct diff_algo_config *inner_algo;
228 3b0f3d61 2020-01-22 neels
229 3b0f3d61 2020-01-22 neels /* If the algorithm fails (e.g. diff_algo_myers_if_small needs too large state, or diff_algo_patience can't find
230 3b0f3d61 2020-01-22 neels * any common-unique atoms), then use this algorithm instead. */
231 3b0f3d61 2020-01-22 neels const struct diff_algo_config *fallback_algo;
232 3b0f3d61 2020-01-22 neels };
233 3b0f3d61 2020-01-22 neels
234 3b0f3d61 2020-01-22 neels struct diff_config {
235 3b0f3d61 2020-01-22 neels diff_atomize_func_t atomize_func;
236 3b0f3d61 2020-01-22 neels void *atomize_func_data;
237 3b0f3d61 2020-01-22 neels
238 3b0f3d61 2020-01-22 neels const struct diff_algo_config *algo;
239 3b0f3d61 2020-01-22 neels
240 3b0f3d61 2020-01-22 neels /* How deep to step into subdivisions of a source file, a paranoia / safety measure to guard against infinite
241 3b0f3d61 2020-01-22 neels * loops through diff algorithms. When the maximum recursion is reached, employ diff_algo_none (i.e. remove all
242 3b0f3d61 2020-01-22 neels * left atoms and add all right atoms). */
243 3b0f3d61 2020-01-22 neels unsigned int max_recursion_depth;
244 3b0f3d61 2020-01-22 neels };
245 3b0f3d61 2020-01-22 neels
246 3b0f3d61 2020-01-22 neels struct diff_result *diff_main(const struct diff_config *config,
247 3b0f3d61 2020-01-22 neels const uint8_t *left_data, size_t left_len,
248 3b0f3d61 2020-01-22 neels const uint8_t *right_data, size_t right_len);
249 3b0f3d61 2020-01-22 neels void diff_result_free(struct diff_result *result);