Blob


1 /* Split source by line breaks, and calculate a simplistic checksum. */
2 /*
3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
18 #include <errno.h>
19 #include <inttypes.h>
20 #include <stdbool.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <unistd.h>
25 #include <diff/arraylist.h>
26 #include <diff/diff_main.h>
27 #include "diff_debug.h"
29 static int
30 diff_data_atomize_text_lines_fd(struct diff_data *d)
31 {
32 off_t pos = lseek(d->root->fd, 0, SEEK_SET);
33 const off_t end = pos + d->len;
35 unsigned int array_size_estimate = d->len / 50;
36 unsigned int pow2 = 1;
37 while (array_size_estimate >>= 1)
38 pow2++;
40 ARRAYLIST_INIT(d->atoms, 1 << pow2);
42 while (pos < end) {
43 off_t line_end = pos;
44 unsigned int hash = 0;
45 unsigned char buf[512];
46 ssize_t r, i;
47 struct diff_atom *atom;
48 int eol = 0;
50 while (eol == 0 && line_end < end) {
51 r = read(d->root->fd, buf, sizeof(buf));
52 if (r == -1)
53 return errno;
54 i = 0;
55 while (eol == 0 && i < r) {
56 if (buf[i] != '\r' && buf[i] != '\n') {
57 hash = hash * 23 + buf[i];
58 line_end++;
59 } else
60 eol = buf[i];
61 i++;
62 }
63 }
65 /* When not at the end of data, the line ending char ('\r' or
66 * '\n') must follow */
67 if (line_end < end)
68 line_end++;
69 /* If that was an '\r', also pull in any following '\n' */
70 if (line_end < end && eol == '\r') {
71 if (lseek(d->root->fd, line_end, SEEK_SET) == -1)
72 return errno;
73 r = read(d->root->fd, buf, 1);
74 if (r == -1)
75 return errno;
76 if (r == 1 && buf[0] == '\n' )
77 line_end++;
78 }
80 /* Record the found line as diff atom */
81 ARRAYLIST_ADD(atom, d->atoms);
82 if (!atom)
83 return ENOMEM;
85 *atom = (struct diff_atom){
86 .d = d,
87 .pos = pos,
88 .at = NULL, /* atom data is not memory-mapped */
89 .len = line_end - pos,
90 .hash = hash,
91 };
93 /* Starting point for next line: */
94 pos = line_end;
95 if (lseek(d->root->fd, pos, SEEK_SET) == -1)
96 abort();
97 }
99 return DIFF_RC_OK;
102 static int
103 diff_data_atomize_text_lines_mmap(struct diff_data *d)
105 const uint8_t *pos = d->data;
106 const uint8_t *end = pos + d->len;
108 unsigned int array_size_estimate = d->len / 50;
109 unsigned int pow2 = 1;
110 while (array_size_estimate >>= 1)
111 pow2++;
113 ARRAYLIST_INIT(d->atoms, 1 << pow2);
115 while (pos < end) {
116 const uint8_t *line_end = pos;
117 unsigned int hash = 0;
119 while (line_end < end && *line_end != '\r' && *line_end != '\n') {
120 hash = hash * 23 + *line_end;
121 line_end++;
124 /* When not at the end of data, the line ending char ('\r' or
125 * '\n') must follow */
126 if (line_end < end)
127 line_end++;
128 /* If that was an '\r', also pull in any following '\n' */
129 if (line_end[0] == '\r'
130 && line_end < end && line_end[1] == '\n')
131 line_end++;
133 /* Record the found line as diff atom */
134 struct diff_atom *atom;
135 ARRAYLIST_ADD(atom, d->atoms);
136 if (!atom)
137 return ENOMEM;
139 *atom = (struct diff_atom){
140 .d = d,
141 .pos = (off_t)(pos - d->data),
142 .at = pos,
143 .len = line_end - pos,
144 .hash = hash,
145 };
147 /* Starting point for next line: */
148 pos = line_end;
151 return DIFF_RC_OK;
154 static int
155 diff_data_atomize_text_lines(struct diff_data *d)
157 if (d->data == NULL)
158 return diff_data_atomize_text_lines_fd(d);
159 else
160 return diff_data_atomize_text_lines_mmap(d);
163 int
164 diff_atomize_text_by_line(void *func_data, struct diff_data *left,
165 struct diff_data *right)
167 int rc;
168 rc = diff_data_atomize_text_lines(left);
169 if (rc != DIFF_RC_OK)
170 return rc;
171 return diff_data_atomize_text_lines(right);