summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFelix von Leitner <felix-libowfat@fefe.de>2017-07-30 12:31:12 +0000
committerFelix von Leitner <felix-libowfat@fefe.de>2017-07-30 12:31:12 +0000
commit4ba71a754011dad0990cec867f6eac6dac2f8a81 (patch)
treec30b475c553bcb3771e58969777d222a603f8ea4
parent13896db585740487aa26fa0c9d062fba4d0fb430 (diff)
downloadlibowfat-4ba71a754011dad0990cec867f6eac6dac2f8a81.tar.gz
libowfat-4ba71a754011dad0990cec867f6eac6dac2f8a81.zip
bring scan_cescape up to speed
-rw-r--r--textcode/scan_cescape.335
-rw-r--r--textcode/scan_cescape.c73
-rw-r--r--textcode/scan_jsonescape.331
-rw-r--r--textcode/scan_jsonescape.c6
4 files changed, 138 insertions, 7 deletions
diff --git a/textcode/scan_cescape.3 b/textcode/scan_cescape.3
new file mode 100644
index 0000000..69dd293
--- /dev/null
+++ b/textcode/scan_cescape.3
@@ -0,0 +1,35 @@
+.TH scan_cescape 3
+.SH NAME
+scan_cescape \- parse C escaped string
+.SH SYNTAX
+.B #include <libowfat/textcode.h>
+
+size_t \fBscan_cescape\fP(const char *\fIsrc\fR,char *\fIdest\fR,size_t* \fIdestlen\fR);
+
+.SH DESCRIPTION
+scan_cescape parses C escaped text from src into dest.
+Parsing stops at the 0 terminator, invalid input characters or a double
+quote that was not escaped.
+
+C escape sequences like \\n and \\x0a and \\012 are translated into
+their binary counterparts. The C99 escape sequences \\u and \\U are
+supported and lead to UTF-8 sequences being output.
+
+scan_cescape will then write the number of bytes in dest into *destlen,
+and return the number of bytes decoded from src.
+
+dest can be NULL. destlen can be NULL.
+
+To make sure dest is large enough, either allocate strlen(src)+1 bytes
+or call scan_cescape twice, the first time with dest == NULL (*destlen
+will still be written).
+
+.SH "RETURN VALUE"
+scan_cescape returns the number of bytes successfully parsed
+from src.
+
+.SH EXAMPLES
+scan_cescape("test\\n\");",buf,&i) -> return 6, i=5, buf="test\n".
+
+.SH "SEE ALSO"
+fmt_jsonescape(3), fmt_cescape(3), scan_ldapescape(3)
diff --git a/textcode/scan_cescape.c b/textcode/scan_cescape.c
index 314ba3a..00afb90 100644
--- a/textcode/scan_cescape.c
+++ b/textcode/scan_cescape.c
@@ -18,25 +18,84 @@ size_t scan_cescape(const char *src,char *dest,size_t *destlen) {
case 't': c='\t'; break;
case 'v': c='\v';
case '\\': break;
- case 'x':
+ case 'x': // hex escape; \x0a -> 10
{
unsigned char a,b;
a=scan_fromhex(s[i+2]);
b=scan_fromhex(s[i+3]);
- if (a<16 && b<16) {
- c=(a<<4)+b;
- i+=2;
+ if (a<16) {
+ if (b<16) {
+ c=(a<<4)+b;
+ i+=2;
+ } else {
+ c=a;
+ i+=1;
+ }
}
}
break;
+ case 'u': // C99 unicode escape: \u000a -> 10
+ case 'U': // C99 unicode escape: \U0000000a -> 10
+ {
+ unsigned int j,k=0,l=(s[i+1]=='U'?10:6);
+ for (j=2; j<l; ++j) {
+ unsigned char c=scan_fromhex(s[i+j]);
+ if (c>=16) // error
+ goto error; // don't allow short sequences
+ k=k*16+c;
+ }
+ written+=fmt_utf8(dest?dest+written:0,k);
+ i+=j-1;
+ continue;
+ }
default:
+ if (s[i+1]>='0' && s[i+1]<='7') { // octal escape; \012 -> 10
+ unsigned int j,k;
+ for (k=0,j=1; j<4; ++j) {
+ unsigned int l=s[i+j]-'0';
+ if (l<8)
+ k=k*8+l;
+ else
+ break;
+ }
+ if (dest) dest[written++]=k;
+ i+=j-1;
+ continue;
+ }
--i;
}
++i;
- }
- dest[written]=c;
+ } else if (c=='"')
+ break;
+ if (dest) dest[written]=c;
++written;
}
- *destlen=written;
+error:
+ if (destlen) *destlen=written;
return i;
}
+
+#ifdef UNITTEST
+#include <assert.h>
+#undef UNITTEST
+#include <scan/scan_fromhex.c>
+#include <fmt/fmt_utf8.c>
+
+int main() {
+ size_t dl;
+ char buf[100];
+ assert(scan_cescape("test\\n\");",buf,&dl)==6 && dl==5 && !memcmp(buf,"test\n",5));
+ /* check hex and octal escaping */
+ assert(scan_cescape("test\\x0a\");",buf,&dl)==8 && dl==5 && !memcmp(buf,"test\n",5));
+ assert(scan_cescape("test\\012\");",buf,&dl)==8 && dl==5 && !memcmp(buf,"test\n",5));
+ /* check short escape sequences */
+ assert(scan_cescape("test\\xa\");",buf,&dl)==7 && dl==5 && !memcmp(buf,"test\n",5));
+ assert(scan_cescape("test\\12\");",buf,&dl)==7 && dl==5 && !memcmp(buf,"test\n",5));
+ assert(scan_cescape("test\\1\");",buf,&dl)==6 && dl==5 && !memcmp(buf,"test\1",5));
+ /* check unicode */
+ assert(scan_cescape("test\\u000a\");",buf,&dl)==10 && dl==5 && !memcmp(buf,"test\n",5));
+ assert(scan_cescape("test\\U0000000a\");",buf,&dl)==14 && dl==5 && !memcmp(buf,"test\n",5));
+ /* check that short sequences are rejected */
+ assert(scan_cescape("test\\Ua\");",buf,&dl)==4 && dl==4 && !memcmp(buf,"test",4));
+}
+#endif
diff --git a/textcode/scan_jsonescape.3 b/textcode/scan_jsonescape.3
new file mode 100644
index 0000000..d7716b4
--- /dev/null
+++ b/textcode/scan_jsonescape.3
@@ -0,0 +1,31 @@
+.TH scan_jsonescape 3
+.SH NAME
+scan_jsonescape \- parse JSON escaped string
+.SH SYNTAX
+.B #include <libowfat/textcode.h>
+
+size_t \fBscan_jsonescape\fP(const char *\fIsrc\fR,char *\fIdest\fR,size_t* \fIdestlen\fR);
+
+.SH DESCRIPTION
+scan_jsonescape parses JSON escaped text from src into dest, leaving a
+UTF-8 string in dest. Parsing stops at the 0 terminator, invalid input
+characters or a double quote that was not escaped.
+
+It will then write the number of bytes in dest into *destlen,
+and return the number of bytes decoded from src.
+
+dest can be NULL. destlen can be NULL.
+
+To make sure dest is large enough, either allocate strlen(src)+1 bytes
+or call scan_jsonescape twice, the first time with dest == NULL (*destlen
+will still be written).
+
+.SH "RETURN VALUE"
+scan_jsonescape returns the number of bytes successfully parsed
+from src.
+
+.SH EXAMPLES
+scan_jsonescape("test\n\");",buf,&i) -> return 6, i=5, buf="test\n".
+
+.SH "SEE ALSO"
+fmt_jsonescape(3), scan_cescape(3), scan_ldapescape(3)
diff --git a/textcode/scan_jsonescape.c b/textcode/scan_jsonescape.c
index 30ac6fb..585e87d 100644
--- a/textcode/scan_jsonescape.c
+++ b/textcode/scan_jsonescape.c
@@ -73,6 +73,10 @@ abort:
#ifdef UNITTEST
#include <assert.h>
#include <string.h>
+#undef UNITTEST
+#include <scan/scan_fromhex.c>
+#include <scan/scan_utf8.c>
+#include <fmt/fmt_utf8.c>
int main() {
char buf[100];
@@ -105,3 +109,5 @@ int main() {
return 0;
}
#endif
+
+