summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Felix von Leitner <felix-libowfat@fefe.de> 2017-07-30 13:41:43 +0000
committer Felix von Leitner <felix-libowfat@fefe.de> 2017-07-30 13:41:43 +0000
commit efafd510b85cff9ad2db303ec4baf1241825f941 (patch)
tree 64ae0ca35f5fbed970c52bc8ce088d84f2682b73
parent 4310a56aeb492d748da93bca17171fdba9c3887a (diff)
download libowfat-efafd510b85cff9ad2db303ec4baf1241825f941.tar.gz
libowfat-efafd510b85cff9ad2db303ec4baf1241825f941.zip
add man page and unit tests for scan_base64url
-rw-r--r-- textcode/scan_base64.3 | 8
-rw-r--r-- textcode/scan_base64.c | 4
-rw-r--r-- textcode/scan_base64url.3 | 32
-rw-r--r-- textcode/scan_base64url.c | 31
-rw-r--r-- textcode/scan_html.c | 63
5 files changed, 103 insertions, 35 deletions
diff --git a/textcode/scan_base64.3 b/textcode/scan_base64.3
index b7deb43..64e0abc 100644
--- a/textcode/scan_base64.3
+++ b/textcode/scan_base64.3
@@ -16,10 +16,6 @@ Note that real world base64 encoded data is sometimes permitted to
contain whitespace characters or new lines. This function will not allow
those and return the decoded data until then.
-base64 works by taking 3 bytes of binary input and converting them into
-4 bytes of printable ASCII. If the input ends in the middle of a base64
-4-byte-tuple, scan_base64 will disregard the whole tuple.
-
Many base64 variants demand padding in the last block. Some don't. This
implementation will consume padding if it is there, but will not
complain if it is not.
@@ -30,7 +26,7 @@ dest can be NULL. destlen can be NULL.
scan_base64 returns the number of bytes successfully scanned and
processed from src.
.SH EXAMPLES
-scan_base64("%9FYO<F0`",buf,&i) -> return 8, i=5, buf="fnord"
+scan_base64("Zm5vcmQ=",buf,&i) -> return 8, i=5, buf="fnord"
.SH "SEE ALSO"
-scan_xlong(3), scan_8long(3), fmt_ulong(3)
+scan_base64url(3), scan_xlong(3), scan_8long(3), fmt_ulong(3)
diff --git a/textcode/scan_base64.c b/textcode/scan_base64.c
index 6fdc5c7..41aa03e 100644
--- a/textcode/scan_base64.c
+++ b/textcode/scan_base64.c
@@ -48,8 +48,10 @@ int main() {
char buf[100];
size_t i,l;
memset(buf,0,10); assert(scan_base64("Zm5vcmQ=",buf,&l)==8 && l==5 && !memcmp(buf,"fnord",6));
+ /* check that we don't insist on the padding */
memset(buf,0,10); assert(scan_base64("Zm5vcmQ",buf,&l)==7 && l==5 && !memcmp(buf,"fnord",6));
- memset(buf,0,10); assert(scan_base64("//8=",buf,&l)==4 && l==2 && !memcmp(buf,"\xff\xff",3));
+ /* check the special non-isalnum chars :) */
+ memset(buf,0,10); assert(scan_base64("/+8=",buf,&l)==4 && l==2 && !memcmp(buf,"\xff\xef",3));
return 0;
}
#endif
diff --git a/textcode/scan_base64url.3 b/textcode/scan_base64url.3
new file mode 100644
index 0000000..c0ccd0a
--- /dev/null
+++ b/textcode/scan_base64url.3
@@ -0,0 +1,32 @@
+.TH scan_base64url 3
+.SH NAME
+scan_base64url \- decode base64url encoded data
+.SH SYNTAX
+.B #include <libowfat/textcode.h>
+
+size_t \fBscan_base64url\fP(const char *\fIsrc\fR,char *\fIdest\fR,size_t* \fIdestlen\fR);
+
+.SH DESCRIPTION
+base64url is a variant of base64 for use in URLs (standard base64 uses /
+and +, which can cause problems in URLs, so base64url uses - and _
+instead; also base64url does not use = padding at the end).
+
+scan_base64url decodes base64url encoded data from src into dest.
+It will stop when it encounters any invalid input character.
+It will then write the number of decoded bytes in dest into *destlen,
+and return the number of bytes decoded from src.
+
+Many base64 variants demand padding in the last block. base64url does
+not. This implementation stops at a = padding character and does not
+consume it.
+
+dest can be NULL. destlen can be NULL.
+
+.SH "RETURN VALUE"
+scan_base64url returns the number of bytes successfully scanned and
+processed from src.
+.SH EXAMPLES
+scan_base64url("Zm5vcmQ",buf,&i) -> return 7, i=5, buf="fnord"
+
+.SH "SEE ALSO"
+scan_base64(3), scan_xlong(3), scan_8long(3), fmt_ulong(3)
diff --git a/textcode/scan_base64url.c b/textcode/scan_base64url.c
index f1c1c5f..93a1ea7 100644
--- a/textcode/scan_base64url.c
+++ b/textcode/scan_base64url.c
@@ -15,17 +15,36 @@ static inline int dec(unsigned char x) {
size_t scan_base64url(const char *src,char *dest,size_t *destlen) {
unsigned short tmp=0,bits=0;
register const unsigned char* s=(const unsigned char*) src;
- const char* orig=dest;
- for (;;) {
+ size_t i,j=0;
+ for (i=0;;) {
int a=dec(*s);
- if (a<0) break;
+ if (a<0) break; /* base64url does not have padding */
tmp=(tmp<<6)|a; bits+=6;
++s;
if (bits>=8) {
- *dest=(tmp>>(bits-=8));
- ++dest;
+ bits-=8;
+ if (dest) dest[i]=(tmp>>bits);
+ ++i;
}
}
- *destlen=dest-orig;
+ if (destlen) *destlen=i;
return (const char*)s-src;
}
+
+#ifdef UNITTEST
+#include <assert.h>
+#include <string.h>
+#include <stdio.h>
+
+int main() {
+ char buf[100];
+ size_t i,l;
+ /* check that we don't consume padding */
+ memset(buf,0,10); assert(scan_base64url("Zm5vcmQ=",buf,&l)==7 && l==5 && !memcmp(buf,"fnord",6));
+ /* check that we don't insist on the padding */
+ memset(buf,0,10); assert(scan_base64url("Zm5vcmQ",buf,&l)==7 && l==5 && !memcmp(buf,"fnord",6));
+ /* check the special non-isalnum chars :) */
+ memset(buf,0,10); assert(scan_base64url("_-8=",buf,&l)==3 && l==2 && !memcmp(buf,"\xff\xef",3));
+ return 0;
+}
+#endif
diff --git a/textcode/scan_html.c b/textcode/scan_html.c
index 5ddb50a..f3fdad0 100644
--- a/textcode/scan_html.c
+++ b/textcode/scan_html.c
@@ -26,8 +26,8 @@ static const char* lookup(size_t ofs,const char* t) {
}
enum htmlmode { /* <a href="http://example.com/&quot;foo">libowfat&lt;home</a> */
- OUTSIDE, /* ^^^^^^^^^^^^^^^^ -> `libowfat<home` */
- TAGARG, /* ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -> `http://example.com/"foo´ */
+ OUTSIDE, /* ^^^^^^^^^^^^^^^^ -> libowfat<home */
+ TAGARG, /* ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -> http://example.com/"foo */
};
static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,enum htmlmode mode) {
@@ -42,49 +42,41 @@ static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,enum ht
size_t j;
if ((s[i+2]&~32)=='X') {
j=scan_xlong(src+i+3,&l);
- if (!j) j+=3;
+ if (j) j+=3;
} else {
j=scan_ulong(src+i+2,&l);
- if (!j) j+=3;
+ if (j) j+=2;
}
if (s[i+j]==';') {
i+=j;
- written+=fmt_utf8(dest+written,l);
+ written+=fmt_utf8(dest?dest+written:0,l);
} else {
- dest[written++]='&';
+ if (dest) dest[written]='&';
+ ++written;
}
continue;
}
utf8=lookup(1,src+i+1);
if (utf8) {
size_t l=strlen(utf8);
- memcpy(dest+written,utf8,l);
+ if (dest) memcpy(dest+written,utf8,l);
written+=l;
i+=2+str_chr(src+i+2,';');
continue;
} else
- dest[written]='&';
+ if (dest) dest[written]='&';
} else if (s[i]=='<') {
- if (mode == OUTSIDE) break;
- if (case_starts((const char*)s+i+1,"br>")) {
- dest[written]='\n';
- i+=3;
- } else if (case_starts((const char*)s+i+1,"p>")) {
- dest[written]='\n'; ++written;
- dest[written]='\n';
- i+=3;
- } else
- dest[written]=s[i];
+ break;
} else if (s[i]=='"' && mode==TAGARG) {
if (i==0) { dq=1; continue; }
break;
} else if (mode==TAGARG && !dq && (s[i]==' ' || s[i]=='\t' || s[i]=='\n'))
break;
else
- dest[written]=s[i];
+ if (dest) dest[written]=s[i];
++written;
}
- *destlen=written;
+ if (destlen) *destlen=written;
return i;
}
@@ -98,13 +90,40 @@ size_t scan_html(const char *src,char *dest,size_t *destlen) {
#ifdef UNITTEST
#include <assert.h>
+#undef UNITTEST
+#include <scan/scan_fromhex.c>
+#include <scan/scan_xlongn.c>
+#include <scan/scan_xlong.c>
+#include <scan/scan_ulongn.c>
+#include <scan/scan_ulong.c>
+#include <str/str_chr.c>
+#include <fmt/fmt_utf8.c>
+#include <stdio.h>
int main() {
char* html="<a href=\"http://example.com/&quot;foo\">libowfat&lt;home</a>";
char buf[100];
size_t destlen;
+ /* check that we stop at < */
assert(scan_html(html,buf,&destlen)==0 && destlen==0);
- assert(scan_html(strchr(html,'>')+1,buf,&destlen)==16 && destlen==13 && !memcmp(buf,"libowfat<home",13));
- assert(scan_html_tagarg(strchr(html,'"')+1,buf,&destlen)==28 && destlen==23 && !memcmp(buf,"http://example.com/\"foo",23));
+ /* check that we properly decode &lt; */
+ memset(buf,'?',sizeof(buf));
+ assert(scan_html(strchr(html,'>')+1,buf,&destlen)==16 && destlen==13 && !memcmp(buf,"libowfat<home?",14));
+ /* check that we stop at " and properly decode &quot; */
+ memset(buf,'?',sizeof(buf));
+ assert(scan_html_tagarg(strchr(html,'"')+1,buf,&destlen)==28 && destlen==23 && !memcmp(buf,"http://example.com/\"foo?",24));
+ /* check that we pass through invalid escapes */
+ memset(buf,'?',sizeof(buf));
+ assert(scan_html("&fnord;",buf,&destlen)==7 && destlen==7 && !memcmp(buf,"&fnord;?",8));
+ memset(buf,'?',sizeof(buf));
+ assert(scan_html("&#x;",buf,&destlen)==4 && destlen==4 && !memcmp(buf,"&#x;?",5));
+ memset(buf,'?',sizeof(buf));
+ assert(scan_html("&#;",buf,&destlen)==3 && destlen==3 && !memcmp(buf,"&#;?",4));
+ /* check that &#x[hex]; is decoded properly */
+ memset(buf,'?',sizeof(buf));
+ assert(scan_html("&#x1;",buf,&destlen)==5 && destlen==1 && buf[0]==1 && buf[1]=='?');
+ /* check that &#[decimal]; is decoded properly */
+ memset(buf,'?',sizeof(buf));
+ assert(scan_html("&#1;",buf,&destlen)==4 && destlen==1 && buf[0]==1 && buf[1]=='?');
}
#endif