Mirror of :pserver:cvs@cvs.fefe.de:/cvs libowfat https://www.fefe.de/libowfat/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

124 lines
5.4 KiB

  1. #include "fmt.h"
  2. size_t scan_utf8(const char* in,size_t len,uint32_t* num) {
  3. uint32_t i,k,m;
  4. const char* orig=in;
  5. if (len==0) return 0;
  6. i=(*(unsigned char*)in++); /* grab first byte */
  7. if (i>=0xfe || /* 0xfe and 0xff are invalid encodings in utf-8 for the first byte */
  8. (i&0xc0)==0x80) return 0; /* first bits being 10 marks continuation chars, invalid sequence for first byte */
  9. for (k=0; i&0x80; i<<=1, ++k); /* count leading 1 bits */
  10. if (!k) {
  11. if (num) *num=i;
  12. return 1;
  13. }
  14. if (k>len) return 0;
  15. i=(i&0xff)>>k; /* mask the leading 1 bits */
  16. /* The next part is a little tricky.
  17. * UTF-8 says that the encoder has to choose the most efficient
  18. * encoding, and the decoder has to reject other encodings. The
  19. * background is that attackers encoded '/' not as 0x2f but as 0xc0
  20. * 0xaf, and that evaded bad security checks just scan for the '/'
  21. * byte in pathnames.
  22. * At this point k contains the number of bytes, so k-1 is the number
  23. * of continuation bytes. For each additional continuation byte, we
  24. * gain 6 bits of storage space, but we lose one in the signalling in
  25. * the initial byte. So we have 6 + (k-1) * 5 bits total storage
  26. * space for this encoding. The minimum value for k bytes is the
  27. * maximum number for k-1 bytes plus 1. If the previous encoding has
  28. * 11 bits, its maximum value is 11 1-bits or 0x7ff, and the minimum
  29. * value we are looking for is 0x800 or 1<<11. For 2 bytes, UTF-8 can
  30. * encode 11 bits, after that each additional byte gains 5 more bits.
  31. * So for k>2, we want
  32. * 1 << (11+(k-3)*5)
  33. * or optimized to get rid of the -3
  34. * 1 << (k*5-4)
  35. * but for k==2 the delta is 4 bits (not 5), so we want
  36. * 1 << 7
  37. * abusing the fact that a boolean expression evaluates to 0 or 1, the
  38. * expression can be written as
  39. * 1 << (k*5-4+(k==2))
  40. */
  41. m=((uint32_t)1<<(k*5-4+(k==2)));
  42. while (k>1) {
  43. if ((*in&0xc0)!=0x80) return 0;
  44. i=(i<<6) | ((*in++)&0x3f);
  45. --k;
  46. }
  47. if (i<m) return 0; /* if the encoded value was less than m, reject */
  48. if (num) *num=i;
  49. return (size_t)(in-orig);
  50. }
  51. #ifdef UNITTEST
  52. #include <assert.h>
  53. #include "fmt/fmt_utf8.c"
  54. int main() {
  55. char buf[100];
  56. uint32_t l;
  57. unsigned int i;
  58. /* first positive testing for the various lengths */
  59. l=fmt_utf8(buf,0); assert(l == 1 && scan_utf8(buf,l+1,&l)==1 && l==0);
  60. l=fmt_utf8(buf,0x80); assert(l == 2 && scan_utf8(buf,l+1,&l)==2 && l==0x80);
  61. l=fmt_utf8(buf,0x800); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0x800);
  62. l=fmt_utf8(buf,0x10000); assert(l == 4 && scan_utf8(buf,l+1,&l)==4 && l==0x10000);
  63. l=fmt_utf8(buf,0x200000); assert(l == 5 && scan_utf8(buf,l+1,&l)==5 && l==0x200000);
  64. l=fmt_utf8(buf,0x4000000); assert(l == 6 && scan_utf8(buf,l+1,&l)==6 && l==0x4000000);
  65. /* corner cases */
  66. l=fmt_utf8(buf,0x7f); assert(l == 1 && scan_utf8(buf,l+1,&l)==1 && l==0x7f);
  67. l=fmt_utf8(buf,0x7ff); assert(l == 2 && scan_utf8(buf,l+1,&l)==2 && l==0x7ff);
  68. l=fmt_utf8(buf,0xffff); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0xffff);
  69. l=fmt_utf8(buf,0x1fffff); assert(l == 4 && scan_utf8(buf,l+1,&l)==4 && l==0x1fffff);
  70. l=fmt_utf8(buf,0x3ffffff); assert(l == 5 && scan_utf8(buf,l+1,&l)==5 && l==0x3ffffff);
  71. l=fmt_utf8(buf,0x7fffffff); assert(l == 6 && scan_utf8(buf,l+1,&l)==6 && l==0x7fffffff);
  72. /* more corner cases */
  73. l=fmt_utf8(buf,0xd7ff); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0xd7ff);
  74. l=fmt_utf8(buf,0xe000); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0xe000);
  75. l=fmt_utf8(buf,0xfffd); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0xfffd);
  76. l=fmt_utf8(buf,0x10ffff); assert(l == 4 && scan_utf8(buf,l+1,&l)==4 && l==0x10ffff);
  77. l=fmt_utf8(buf,0x110000); assert(l == 4 && scan_utf8(buf,l+1,&l)==4 && l==0x110000);
  78. /* now negative testing */
  79. /* start off with some continuation bytes outside a sequence */
  80. for (i=0x80; i<=0xbf; ++i) {
  81. buf[0]=i;
  82. assert(scan_utf8(buf,2,&l)==0);
  83. }
  84. /* now check lonely sequence start characters */
  85. buf[1]=' ';
  86. for (i=0xc0; i<=0xfd; ++i) {
  87. buf[0]=i;
  88. assert(scan_utf8(buf,2,&l)==0);
  89. }
  90. /* FE and FF are reserved for UTF-16 endianness detection*/
  91. assert(scan_utf8("\xfe\xff",3,&l)==0);
  92. assert(scan_utf8("\xff\xfe",3,&l)==0);
  93. /* now check some truncated sequences */
  94. l=fmt_utf8(buf,0); assert(l == 1 && scan_utf8(buf,l-1,&l)==0);
  95. l=fmt_utf8(buf,0x80); assert(l == 2 && scan_utf8(buf,l-1,&l)==0);
  96. l=fmt_utf8(buf,0x800); assert(l == 3 && scan_utf8(buf,l-1,&l)==0);
  97. l=fmt_utf8(buf,0x10000); assert(l == 4 && scan_utf8(buf,l-1,&l)==0);
  98. l=fmt_utf8(buf,0x200000); assert(l == 5 && scan_utf8(buf,l-1,&l)==0);
  99. l=fmt_utf8(buf,0x4000000); assert(l == 6 && scan_utf8(buf,l-1,&l)==0);
  100. /* now truncate in another way */
  101. l=fmt_utf8(buf,0x80); buf[l-1]=' '; assert(l == 2 && scan_utf8(buf,l+1,&l)==0);
  102. l=fmt_utf8(buf,0x800); buf[l-1]=' '; assert(l == 3 && scan_utf8(buf,l+1,&l)==0);
  103. l=fmt_utf8(buf,0x10000); buf[l-1]=' '; assert(l == 4 && scan_utf8(buf,l+1,&l)==0);
  104. l=fmt_utf8(buf,0x200000); buf[l-1]=' '; assert(l == 5 && scan_utf8(buf,l+1,&l)==0);
  105. l=fmt_utf8(buf,0x4000000); buf[l-1]=' '; assert(l == 6 && scan_utf8(buf,l+1,&l)==0);
  106. /* now some not minimally encoded utf-8 sequences */
  107. assert(scan_utf8("\xc0\x80",3,&l)==0);
  108. assert(scan_utf8("\xe0\x80\x80",4,&l)==0);
  109. assert(scan_utf8("\xf0\x80\x80\x80",5,&l)==0);
  110. assert(scan_utf8("\xf8\x80\x80\x80\x80",6,&l)==0);
  111. assert(scan_utf8("\xfc\x80\x80\x80\x80\x80",7,&l)==0);
  112. return 0;
  113. }
  114. #endif