commit c1b420179b3673008ca7ac2af3156849b9d2059f from: Omar Polo date: Thu Jul 31 17:09:03 2025 UTC sync files from diff.git 543aed44ef221a67c386d967ec45907fb129194b improve binary files detection instead of 'just' looking for NUL, consider a file binary if it has any control characters (except our beloved tab characters). joint work with jtt@, ok stsp@ commit - e5cbb211a342397e8ac59234046e2bbbe017c7c1 commit + c1b420179b3673008ca7ac2af3156849b9d2059f blob - 32023105af9438217a65a1bf6b821e4d225516d1 blob + c75f8a9c0c4af1024f35db50aa0d6b33ebe78829 --- lib/diff_atomize_text.c +++ lib/diff_atomize_text.c @@ -43,7 +43,7 @@ diff_data_atomize_text_lines_fd(struct diff_data *d) unsigned int array_size_estimate = d->len / 50; unsigned int pow2 = 1; bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); - bool embedded_nul = false; + bool isbinary = false; while (array_size_estimate >>= 1) pow2++; @@ -72,8 +72,9 @@ diff_data_atomize_text_lines_fd(struct diff_data *d) || !isspace((unsigned char)buf[i])) hash = diff_atom_hash_update( hash, buf[i]); - if (buf[i] == '\0') - embedded_nul = true; + if (iscntrl((unsigned char)buf[i]) && + !isspace((unsigned char)buf[i])) + isbinary = true; line_end++; } else eol = buf[i]; @@ -115,8 +116,8 @@ diff_data_atomize_text_lines_fd(struct diff_data *d) return errno; } - /* File are considered binary if they contain embedded '\0' bytes. */ - if (embedded_nul) + /* File are considered binary if they contain control bytes. 
*/ + if (isbinary) d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; return DIFF_RC_OK; @@ -128,7 +129,7 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d) const uint8_t *pos = d->data; const uint8_t *end = pos + d->len; bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); - bool embedded_nul = false; + bool isbinary = false; unsigned int array_size_estimate = d->len / 50; unsigned int pow2 = 1; while (array_size_estimate >>= 1) @@ -144,8 +145,9 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d) if (!ignore_whitespace || !isspace((unsigned char)*line_end)) hash = diff_atom_hash_update(hash, *line_end); - if (*line_end == '\0') - embedded_nul = true; + if (iscntrl((unsigned char)*line_end) && + !isspace((unsigned char)*line_end)) + isbinary = true; line_end++; } @@ -174,8 +176,8 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d) pos = line_end; } - /* File are considered binary if they contain embedded '\0' bytes. */ - if (embedded_nul) + /* File are considered binary if they contain embedded control bytes. */ + if (isbinary) d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; return DIFF_RC_OK;