summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Felix von Leitner <felix-libowfat@fefe.de> 2018-02-02 18:16:09 +0000
committer Felix von Leitner <felix-libowfat@fefe.de> 2018-02-02 18:16:09 +0000
commit 39a431f4d87c564cd6c01ab493971d14f5ebb3fc (patch)
tree c1f1a396e6ba344f8f759791d39e89dfacfb47e9
parent 707f0cca3b0906240758c42ba3f49bb1836d4989 (diff)
download libowfat-39a431f4d87c564cd6c01ab493971d14f5ebb3fc.tar.gz
libowfat-39a431f4d87c564cd6c01ab493971d14f5ebb3fc.zip
don't escape poop emoji (emit as utf8 instead)
if input is not valid utf8, encode each invalid byte as utf8 in output
-rw-r--r-- textcode.h 2
-rw-r--r-- textcode/fmt_jsonescape.c 18
2 files changed, 15 insertions, 5 deletions
diff --git a/textcode.h b/textcode.h
index 71829fe..b10cf3a 100644
--- a/textcode.h
+++ b/textcode.h
@@ -49,7 +49,7 @@ size_t fmt_foldwhitespace(char* dest,const char* src,size_t len);
size_t fmt_ldapescape(char* dest,const char* src,size_t len);
size_t fmt_ldapescape2(char* dest,const char* src,size_t len,const char* escapeme);
/* Encode JSON string from UTF-8; will backslash-escape the bare minimum.
- * Will not verify that the input is valid UTF-8!
+ * Invalid UTF-8 in input will output as valid UTF-8 for each byte
* Worst case: len*6 */
size_t fmt_jsonescape(char* dest,const char* src,size_t len);
diff --git a/textcode/fmt_jsonescape.c b/textcode/fmt_jsonescape.c
index eb97b10..b86a471 100644
--- a/textcode/fmt_jsonescape.c
+++ b/textcode/fmt_jsonescape.c
@@ -42,8 +42,13 @@ escape:
/* UTF-8! Convert to surrogate pair if needed. */
uint32_t u;
size_t j=scan_utf8_sem((const char*)s+i,len-i,&u);
- if (j==0) /* Invalid UTF-8! Abort! */
- return written;
+ if (j==0) { /* Invalid UTF-8! Try to limp on! */
+ written+=fmt_utf8(dest?dest+written:0,s[i]);
+ break;
+ }
+ /* It turns out we are not required to escape these.
+ * So we won't. */
+#if 0
if (u>0xffff) {
if (dest) {
dest[written ]='\\';
@@ -54,7 +59,9 @@ escape:
fmt_xlong(dest+written+8,0xdc00 + (u & 0x3ff));
}
written+=12;
- } else {
+ } else
+#endif
+ {
if (dest) memcpy(dest+written,s+i,j);
written+=j;
}
@@ -83,6 +90,9 @@ int main() {
/* test escaping of unprintable characters */
assert(fmt_jsonescape(buf,"\001x",2)==7 && !memcmp(buf,"\\u0001x",7));
/* test conversion of large UTF-8 chars to UTF-16 surrogate pairs (poop emoji) */
- assert(fmt_jsonescape(buf,"\xf0\x9f\x92\xa9x",5)==13 && !memcmp(buf,"\\ud83d\\udca9x",13));
+ /* EDIT: this escaping is not actually needed, so we aren't doing it
+ * anymore. This test will fail now:
+ assert(fmt_jsonescape(buf,"\xf0\x9f\x92\xa9x",5)==13 && !memcmp(buf,"\\ud83d\\udca9x",13)); */
+ assert(fmt_jsonescape(buf,"a\x81x",3)==4 && !memcmp(buf,"a\xc2\x81x",4));
}
#endif