Blob


1 /* Split source by line breaks, and calculate a simplistic checksum. */
2 /*
3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
18 #include <errno.h>
19 #include <stdbool.h>
20 #include <stdint.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <unistd.h>
24 #include <ctype.h>
26 #include <arraylist.h>
27 #include <diff_main.h>
29 #include "diff_internal.h"
30 #include "diff_debug.h"
/*
 * Fold one more byte into a running line hash: the previous hash is
 * multiplied by 23 and the byte value is added. The arithmetic is unsigned,
 * so it wraps around on overflow.
 */
unsigned int
diff_atom_hash_update(unsigned int hash, unsigned char atom_byte)
{
	unsigned int next = hash;

	next *= 23;
	next += atom_byte;
	return next;
}
38 static int
39 diff_data_atomize_text_lines_fd(struct diff_data *d)
40 {
41 off_t pos = 0;
42 const off_t end = pos + d->len;
43 unsigned int array_size_estimate = d->len / 50;
44 unsigned int pow2 = 1;
45 bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
47 while (array_size_estimate >>= 1)
48 pow2++;
50 ARRAYLIST_INIT(d->atoms, 1 << pow2);
52 if (fseek(d->root->f, 0L, SEEK_SET) == -1)
53 return errno;
55 while (pos < end) {
56 off_t line_end = pos;
57 unsigned int hash = 0;
58 unsigned char buf[512];
59 size_t r, i;
60 struct diff_atom *atom;
61 int eol = 0;
63 while (eol == 0 && line_end < end) {
64 r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
65 if (r == 0 && ferror(d->root->f))
66 return errno;
67 i = 0;
68 while (eol == 0 && i < r) {
69 if (buf[i] != '\r' && buf[i] != '\n') {
70 if (!ignore_whitespace
71 || !isspace(buf[i]))
72 hash = diff_atom_hash_update(
73 hash, buf[i]);
74 line_end++;
75 } else
76 eol = buf[i];
77 i++;
78 }
79 }
81 /* When not at the end of data, the line ending char ('\r' or
82 * '\n') must follow */
83 if (line_end < end)
84 line_end++;
85 /* If that was an '\r', also pull in any following '\n' */
86 if (line_end < end && eol == '\r') {
87 if (fseeko(d->root->f, line_end, SEEK_SET) == -1)
88 return errno;
89 r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
90 if (r == 0 && ferror(d->root->f))
91 return errno;
92 if (r == 1 && buf[0] == '\n' )
93 line_end++;
94 }
96 /* Record the found line as diff atom */
97 ARRAYLIST_ADD(atom, d->atoms);
98 if (!atom)
99 return ENOMEM;
101 *atom = (struct diff_atom){
102 .root = d,
103 .pos = pos,
104 .at = NULL, /* atom data is not memory-mapped */
105 .len = line_end - pos,
106 .hash = hash,
107 };
109 /* Starting point for next line: */
110 pos = line_end;
111 if (fseeko(d->root->f, pos, SEEK_SET) == -1)
112 return errno;
115 return DIFF_RC_OK;
118 static int
119 diff_data_atomize_text_lines_mmap(struct diff_data *d)
121 const uint8_t *pos = d->data;
122 const uint8_t *end = pos + d->len;
123 bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
125 unsigned int array_size_estimate = d->len / 50;
126 unsigned int pow2 = 1;
127 while (array_size_estimate >>= 1)
128 pow2++;
130 ARRAYLIST_INIT(d->atoms, 1 << pow2);
132 while (pos < end) {
133 const uint8_t *line_end = pos;
134 unsigned int hash = 0;
136 while (line_end < end && *line_end != '\r' && *line_end != '\n') {
137 if (!ignore_whitespace
138 || !isspace(*line_end))
139 hash = hash * 23 + *line_end;
140 line_end++;
143 /* When not at the end of data, the line ending char ('\r' or
144 * '\n') must follow */
145 if (line_end < end)
146 line_end++;
147 /* If that was an '\r', also pull in any following '\n' */
148 if (line_end < end - 1 && line_end[0] == '\r' &&
149 line_end[1] == '\n')
150 line_end++;
152 /* Record the found line as diff atom */
153 struct diff_atom *atom;
154 ARRAYLIST_ADD(atom, d->atoms);
155 if (!atom)
156 return ENOMEM;
158 *atom = (struct diff_atom){
159 .root = d,
160 .pos = (off_t)(pos - d->data),
161 .at = pos,
162 .len = line_end - pos,
163 .hash = hash,
164 };
166 /* Starting point for next line: */
167 pos = line_end;
170 return DIFF_RC_OK;
173 static int
174 diff_data_atomize_text_lines(struct diff_data *d)
176 if (d->data == NULL)
177 return diff_data_atomize_text_lines_fd(d);
178 else
179 return diff_data_atomize_text_lines_mmap(d);
/*
 * Split the data of d into one atom per text line and return the result
 * code of diff_data_atomize_text_lines(). func_data is not used.
 */
int
diff_atomize_text_by_line(void *func_data, struct diff_data *d)
{
	(void)func_data;

	return diff_data_atomize_text_lines(d);
}