Browse Source

fmt_jsonescape: convert utf-8 >ffff to \u escaped surrogate pair

master
Felix von Leitner 4 years ago
parent
commit
bb2f5ef433
  1. 37
      textcode/fmt_jsonescape.c
  2. 35
      textcode/scan_html.c
  3. 1
      textcode/scan_jsonescape.c

37
textcode/fmt_jsonescape.c

@ -1,6 +1,7 @@
#include "fmt.h"
#include "textcode.h"
#include "str.h"
#include "scan.h"
#include "haveinline.h"
/* src is UTF-8 encoded */
@ -36,6 +37,28 @@ escape:
dest[written+5]=fmt_tohex(s[i]&0xf);
}
written+=6;
} else if (s[i]>0x7f) {
/* UTF-8! Convert to surrogate pair if needed. */
uint32_t u;
size_t j=scan_utf8_sem((const char*)s+i,len-i,&u);
if (j==0) /* Invalid UTF-8! Abort! */
return written;
if (u>0xffff) {
/* Code point beyond the BMP: emit it as two \uXXXX escapes holding a
 * UTF-16 surrogate pair. The pair encodes the 20-bit offset above
 * 0x10000, not the code point itself, so subtract 0x10000 before
 * splitting it into two 10-bit halves. */
uint32_t v=u-0x10000;
if (dest) {
dest[written ]='\\';
dest[written+1]='u';
fmt_xlong(dest+written+2,0xd800 + ((v>>10) & 0x3ff)); /* high surrogate: upper 10 bits */
dest[written+6]='\\';
dest[written+7]='u';
fmt_xlong(dest+written+8,0xdc00 + (v & 0x3ff)); /* low surrogate: lower 10 bits */
}
written+=12; /* two escapes of 6 bytes each */
} else {
/* Code point fits in the BMP: copy the j UTF-8 bytes through unchanged */
if (dest) memcpy(dest+written,s+i,j);
written+=j;
}
i+=j-1; /* -1 because the for loop will also add 1 */
break;
} else {
if (dest) dest[written]=s[i];
++written;
@ -48,3 +71,17 @@ escape:
return written;
}
#ifdef UNITTEST
#include <assert.h>
#include <string.h>
int main() {
char buf[100];
/* test utf-8 pass-through and correct encoding of \t */
assert(fmt_jsonescape(buf,"\tfnörd",7)==8 && !memcmp(buf,"\\tfnörd",8));
/* test escaping of unprintable characters */
assert(fmt_jsonescape(buf,"\001x",2)==7 && !memcmp(buf,"\\u0001x",7));
/* test conversion of large UTF-8 chars to UTF-16 surrogate pairs (poop emoji):
 * U+1F4A9 - 0x10000 = 0xF4A9, high = 0xd800 + 0x3d, low = 0xdc00 + 0xa9 */
assert(fmt_jsonescape(buf,"\xf0\x9f\x92\xa9x",5)==13 && !memcmp(buf,"\\ud83d\\udca9x",13));
}
#endif

35
textcode/scan_html.c

@ -25,9 +25,15 @@ static const char* lookup(size_t ofs,const char* t) {
return NULL;
}
static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int flag) {
enum htmlmode { /* <a href="http://example.com/&quot;foo">libowfat&lt;home</a> */
OUTSIDE, /* ^^^^^^^^^^^^^^^^ -> `libowfat<home` */
TAGARG, /* ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -> `http://example.com/"foo´ */
};
static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,enum htmlmode mode) {
register const unsigned char* s=(const unsigned char*) src;
size_t written=0,i;
int dq=0;
for (i=0; s[i]; ++i) {
if (s[i]=='&') {
const char* utf8;
@ -58,7 +64,8 @@ static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int fla
continue;
} else
dest[written]='&';
} else if (flag && s[i]=='<') {
} else if (s[i]=='<') {
if (mode == OUTSIDE) break;
if (case_starts((const char*)s+i+1,"br>")) {
dest[written]='\n';
i+=3;
@ -68,7 +75,12 @@ static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int fla
i+=3;
} else
dest[written]=s[i];
} else
} else if (s[i]=='"' && mode==TAGARG) {
if (i==0) { dq=1; continue; }
break;
} else if (mode==TAGARG && !dq && (s[i]==' ' || s[i]=='\t' || s[i]=='\n'))
break;
else
dest[written]=s[i];
++written;
}
@ -77,9 +89,22 @@ static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int fla
}
/* Scan a tag argument from src into dest by delegating to
 * scan_html_inner in TAGARG mode; that call's result is returned
 * unchanged.
 * NOTE(review): this listing appears to be a rendered diff, so the old
 * and the new return line are both shown. Only the first can execute;
 * its literal 1 has the same value as TAGARG, the second enumerator of
 * enum htmlmode. */
size_t scan_html_tagarg(const char *src,char *dest,size_t *destlen) {
return scan_html_inner(src,dest,destlen,1);
return scan_html_inner(src,dest,destlen,TAGARG);
}
/* Scan text outside of tags from src into dest by delegating to
 * scan_html_inner in OUTSIDE mode; that call's result is returned
 * unchanged.
 * NOTE(review): this listing appears to be a rendered diff, so the old
 * and the new return line are both shown. Only the first can execute;
 * its literal 0 has the same value as OUTSIDE, the first enumerator of
 * enum htmlmode. */
size_t scan_html(const char *src,char *dest,size_t *destlen) {
return scan_html_inner(src,dest,destlen,0);
return scan_html_inner(src,dest,destlen,OUTSIDE);
}
#ifdef UNITTEST
#include <assert.h>
#include <string.h> /* strchr, memcmp */
/* Unit test for scan_html and scan_html_tagarg: runs both over one anchor
 * element and checks the number of input bytes consumed (return value),
 * the number of output bytes (destlen) and the decoded output. */
int main() {
const char* html="<a href=\"http://example.com/&quot;foo\">libowfat&lt;home</a>";
char buf[100];
size_t destlen;
/* input starting with '<': nothing is consumed and nothing is written */
assert(scan_html(html,buf,&destlen)==0 && destlen==0);
/* text after the opening tag: &lt; is decoded, scanning stops at "</a>" */
assert(scan_html(strchr(html,'>')+1,buf,&destlen)==16 && destlen==13 && !memcmp(buf,"libowfat<home",13));
/* href value after its opening quote: &quot; is decoded, scanning stops at the closing quote */
assert(scan_html_tagarg(strchr(html,'"')+1,buf,&destlen)==28 && destlen==23 && !memcmp(buf,"http://example.com/\"foo",23));
return 0;
}
#endif

1
textcode/scan_jsonescape.c

@ -99,6 +99,7 @@ int main() {
assert(scan_jsonescape("a\\udafd0",buf,&l)==1);
/* correct surrogate pair */
assert(scan_jsonescape("a\\ud834\\udd1eb",buf,&l)==14 && l==6 && !memcmp(buf,"a\xf0\x9d\x84\x9e""b",6));
/* surrogate pair for U+1F4A9 (poop emoji): 0x10000 + (0x3d<<10) + 0xa9;
 * \ud87d\udca9 would be U+2F4A9 instead */
assert(scan_jsonescape("\\ud83d\\udca9x",buf,&l)==13 && l==5 && !memcmp(buf,"\xf0\x9f\x92\xa9x",5));
/* how about some incorrect UTF-8? */
assert(scan_jsonescape("a\xc0\xaf",buf,&l)==1 && l==1 && !memcmp(buf,"a",1));
return 0;

Loading…
Cancel
Save