commit ea5e974da9b1047689411a00ecc0a9c1fb101d73 from: Omar Polo date: Thu Mar 28 09:41:18 2024 UTC got-notify-http: fix unicode handling JSON strings are made of UNICODE codepoints, of which only \, " and control characters have to be escaped, and the whole document MUST be encoded in UTF-8. The current code generates invalid strings for non-ASCII characters, so it has to be made UTF-8 aware. tedu' isu8cont() can't be used since it allows surrogate pairs and overlong sequences which will cause decoding errors on the receiving side. Similarly, mbtowc() depends on the current locale and could cause issues in -portable. Instead, bundle Björn Höhrmann's "Flexible and Economical UTF-8 Decoder" and use it to parse the text. Decoding errors results in the replacement character U+FFFD being emitted and the bytes considered so far to be discarded; the decoder is then restarted with the next byte. Git commit messages don't carry the notion of the encoding, but it's reasonable to expect UTF-8 (which is a superset of ASCII). For other more esotic encodings, the commit id can be used to manually extract the data. ok stsp@ commit - 87890bc26c1c6958bd64bb9d46fbc29ba6a92d95 commit + ea5e974da9b1047689411a00ecc0a9c1fb101d73 blob - 80cfc6f803bbf046beb7285ddb64fd7ef3169391 blob + e1dd226bd532d7adaf1e4712e8c13bf3cc29ffba --- gotd/libexec/got-notify-http/got-notify-http.c +++ gotd/libexec/got-notify-http/got-notify-http.c @@ -33,6 +33,7 @@ #include "got_version.h" #include "bufio.h" +#include "utf8d.h" #define USERAGENT "got-notify-http/" GOT_VERSION_STR @@ -91,39 +92,50 @@ dial(const char *host, const char *port) static void escape(FILE *fp, const uint8_t *s) { - for (; *s; ++s) { - /* - * XXX: this is broken for UNICODE: we should leave - * the multibyte characters as-is. - */ + uint32_t codepoint, state; + const uint8_t *start = s; - if (*s >= ' ' && *s <= '~') { - fputc(*s, fp); - continue; - } - - switch (*s) { - case '"': - case '\\': - fprintf(fp, "\\%c", *s); - break; - case '\b': - fprintf(fp, "\\b"); - break; - case '\f': - fprintf(fp, "\\f"); - break; - case '\n': - fprintf(fp, "\\n"); - break; - case '\r': - fprintf(fp, "\\r"); - break; - case '\t': - fprintf(fp, "\\t"); + state = 0; + for (; *s; ++s) { + switch (decode(&state, &codepoint, *s)) { + case UTF8_ACCEPT: + switch (codepoint) { + case '"': + case '\\': + fprintf(fp, "\\%c", *s); + break; + case '\b': + fprintf(fp, "\\b"); + break; + case '\f': + fprintf(fp, "\\f"); + break; + case '\n': + fprintf(fp, "\\n"); + break; + case '\r': + fprintf(fp, "\\r"); + break; + case '\t': + fprintf(fp, "\\t"); + break; + default: + /* other control characters */ + if (codepoint < ' ' || codepoint == 0x7F) { + fprintf(fp, "\\u%04x", codepoint); + break; + } + fwrite(start, 1, s - start + 1, fp); + break; + } + start = s + 1; break; - default: - fprintf(fp, "\\u%04X", *s); + + case UTF8_REJECT: + /* bad UTF-8 sequence; try to recover */ + fputs("\\uFFFD", fp); + state = UTF8_ACCEPT; + start = s + 1; break; } } blob - /dev/null blob + 480e9c55ac4a6aa58414ec65e0053dd620b5b01d (mode 644) --- /dev/null +++ gotd/libexec/got-notify-http/utf8d.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2008-2009 Bjoern Hoehrmann + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. + +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 1 + +static const uint8_t utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 +}; + +static uint32_t inline +decode(uint32_t* state, uint32_t* codep, uint32_t byte) { + uint32_t type = utf8d[byte]; + + *codep = (*state != UTF8_ACCEPT) ? + (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + *state = utf8d[256 + *state*16 + type]; + return *state; +} blob - 6892df7536c1a19134db4827c42f870d903c65a7 blob + 3255fa55f2841d50b5170e35303b4e7622843e04 --- regress/gotd/http_notification.sh +++ regress/gotd/http_notification.sh @@ -65,6 +65,75 @@ test_file_changed() { "author":"$GOT_AUTHOR", "date":"$d", "message":"make changes\n", + "diffstat":{}, + "changes":{} + }]} + . + ,j + w + EOF + + cmp -s $testroot/stdout.expected $testroot/stdout + ret=$? + if [ $ret -ne 0 ]; then + diff -u $testroot/stdout.expected $testroot/stdout + test_done "$testroot" "$ret" + return 1 + fi + + test_done "$testroot" "$ret" +} + +test_bad_utf8() { + local testroot=`test_init bad_utf8 1` + + got clone -a -q ${GOTD_TEST_REPO_URL} $testroot/repo-clone + ret=$? + if [ $ret -ne 0 ]; then + echo "got clone failed unexpectedly" >&2 + test_done "$testroot" 1 + return 1 + fi + + got checkout -q $testroot/repo-clone $testroot/wt >/dev/null + ret=$? + if [ $ret -ne 0 ]; then + echo "got checkout failed unexpectedly" >&2 + test_done "$testroot" 1 + fi + + # invalid utf8 sequenc + commit_msg="make$(printf '\xED\xA0\x80')changes" + + echo "changed" > $testroot/wt/alpha + (cd $testroot/wt && got commit -m "$commit_msg" > /dev/null) + local commit_id=`git_show_head $testroot/repo-clone` + local author_time=`git_show_author_time $testroot/repo-clone` + + timeout 5 ./http-server -p $GOTD_TEST_HTTP_PORT \ + > $testroot/stdout & + + got send -b main -q -r $testroot/repo-clone + ret=$? + if [ $ret -ne 0 ]; then + echo "got send failed unexpectedly" >&2 + test_done "$testroot" "1" + return 1 + fi + + wait %1 # wait for the http "server" + + d=`date -u -r $author_time +"%a %b %e %X %Y UTC"` + + touch "$testroot/stdout.expected" + ed -s "$testroot/stdout.expected" <<-EOF + a + {"notifications":[{ + "short":false, + "id":"$commit_id", + "author":"$GOT_AUTHOR", + "date":"$d", + "message":"make\uFFFD\uFFFDchanges\n", "diffstat":{}, "changes":{} }]} @@ -245,5 +314,6 @@ test_many_commits_summarized() { test_parseargs "$@" run_test test_file_changed +run_test test_bad_utf8 run_test test_many_commits_not_summarized run_test test_many_commits_summarized