Mirror of :pserver:cvs@cvs.fefe.de:/cvs libowfat https://www.fefe.de/libowfat/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

230 lines
5.1 KiB

  1. #include <stdio.h>
  2. #include <ctype.h>
  3. #include <string.h>
  4. #include <stdlib.h>
  5. #include "scan.h"
  6. #include <assert.h>
  7. #include "scan/scan_ulong.c"
  8. #include "scan/scan_ulongn.c"
  9. #include "fmt/fmt_utf8.c"
  10. #include "fmt/fmt_tohex.c"
  11. #include "fmt/fmt_escapecharc.c"
  /* Scratch state shared with main(). */
  char tmp[20];     /* receives the fmt_utf8() output for one code point */
  char tmp2[20];    /* only referenced from the disabled escape-printing code */
  size_t n;         /* byte count returned by fmt_utf8() */
  size_t m;         /* character count returned by scan_ulong() */
  unsigned long l;  /* code point value filled in by scan_ulong() */
  /*
   * One entity read from the input file.
   *   entity - heap copy of the entity name (main() strips the leading '&'
   *            and the trailing ';' before duplicating it)
   *   utf8   - UTF-8 bytes of the entity's code point(s)
   *   next   - link to another entity record
   */
  struct entity {
    const char *entity;
    char utf8[10];
    struct entity *next;
  };

  /* Head of the entity list. */
  struct entity *root;

  /* Link slot main() allocates the next record into; starts at the head. */
  struct entity **cur = &root;
  struct letters;

  /*
   * One slot of a trie node.  Slot i of a node is in use when c == i
   * (addword() stores the character in the slot indexed by that character).
   */
  struct letter {
    char c;                   /* character this slot stands for */
    struct letters *weiter;   /* child node; for the NUL slot addword() stores the payload pointer here */
    uint32_t marshaled;       /* lower 8 bits: char. rest: ofs from start of marshaled blob */
  };

  /* A trie node: one slot per possible byte value. */
  struct letters {
    size_t n;                   /* number of slots in use */
    struct letter liste[256];   /* slots, indexed by (unsigned char) character */
  };
  /* Root of the trie that main() fills through addword(). */
  struct letters *d;

  /* Number of trie slots taken so far (incremented by addword()). */
  size_t nodes;

  /* Sum of payload lengths including their NUL bytes (accumulated by addword()). */
  size_t datasize;
  32. void addword(struct letters** s,const char* t, void* pointer) {
  33. size_t i;
  34. if (!*s) {
  35. *s=malloc(sizeof(**s));
  36. memset(*s,0,sizeof(**s));
  37. (*s)->liste[0].c='?';
  38. }
  39. i=(unsigned char)*t;
  40. if ((*s)->liste[i].c==*t) {
  41. if (!*t) {
  42. datasize+=strlen((char*)pointer)+1;
  43. (*s)->liste[i].weiter=pointer;
  44. } else
  45. addword(&(*s)->liste[i].weiter,t+1,pointer);
  46. return;
  47. }
  48. ++nodes;
  49. (*s)->n++;
  50. (*s)->liste[i].c=*t;
  51. if (!*t) {
  52. datasize+=strlen((char*)pointer)+1;
  53. (*s)->liste[i].weiter=pointer;
  54. } else {
  55. (*s)->liste[i].weiter=0;
  56. addword(&(*s)->liste[i].weiter,t+1,pointer);
  57. }
  58. }
  59. void dump(struct letters* s,size_t depth) {
  60. size_t i,j;
  61. if (!s) return;
  62. for (i=0; i<256; ++i) {
  63. if (s->liste[i].c!=i) continue;
  64. for (j=0; j<depth; ++j) printf(" ");
  65. printf("'%c' -> {\n",s->liste[i].c);
  66. if (s->liste[i].c)
  67. dump(s->liste[i].weiter,depth+1);
  68. for (j=0; j<depth; ++j) printf(" ");
  69. printf("}\n");
  70. }
  71. }
  /* Output state written by marshal() and marshalhelper(). */
  size_t used;          /* table entries handed out so far */
  size_t useddata;      /* payload bytes copied so far */
  char *heap;           /* the single allocation holding table and payload bytes */
  uint32_t *marshaled;  /* table view of heap; entry 0 holds the entry count */
  char *data;           /* payload area located right behind the table */
  77. void marshalhelper(struct letters* s) {
  78. size_t i;
  79. uint32_t myindex=used;
  80. if (!s) return;
  81. used+=s->n;
  82. assert(used<nodes+2);
  83. for (i=1; i!=0; ++i) { // start at 1, go to 256, then access modulo 256; effect: sort but put 0 last
  84. uint32_t x;
  85. i&=0xff;
  86. // printf("%c ",i);
  87. if (s->liste[i].c!=i) {
  88. if (i==0) return;
  89. continue;
  90. }
  91. // printf("marshalhelper: %c\n",i);
  92. x=(unsigned char)s->liste[i].c;
  93. if (!x) {
  94. size_t l=strlen((char*)s->liste[i].weiter)+1;
  95. // puts((char*)s->liste[i].weiter);
  96. x|=useddata<<8;
  97. assert(useddata+l<=datasize);
  98. memcpy(data+useddata,s->liste[i].weiter,l);
  99. useddata+=l;
  100. marshaled[++myindex]=x;
  101. return;
  102. } else {
  103. x|=(used+1)<<8;
  104. marshalhelper(s->liste[i].weiter);
  105. }
  106. marshaled[++myindex]=x;
  107. }
  108. // printf("return\n");
  109. }
  110. void marshal(struct letters* s) {
  111. fprintf(stderr,"nodes=%lu, datasize=%lu\n",nodes,datasize);
  112. heap=malloc((nodes+1)*sizeof(uint32_t)+datasize);
  113. if (!heap) return;
  114. marshaled=(uint32_t*)heap;
  115. marshaled[0]=nodes+1;
  116. data=heap+(nodes+1)*sizeof(uint32_t);
  117. marshalhelper(s);
  118. fprintf(stderr,"actually used: %lu nodes, %lu bytes data\n",used,useddata);
  119. }
  /*
   * Search a marshaled blob for the word t.
   *
   *   ds  - start of the blob: a uint32_t table whose entry 0 is the number of
   *         table entries, directly followed by the payload bytes
   *   ofs - table index to start scanning at
   *   t   - NUL-terminated word to look for
   *
   * Each table entry carries a character in its low 8 bits and an offset in
   * the upper bits.  Scanning moves forward entry by entry; a matching
   * non-NUL character continues at the entry's offset with the next character
   * of t, a matching NUL yields the payload at that offset behind the table.
   * Scanning stops after an entry whose character is NUL or at the table end.
   *
   * Returns a pointer into ds on success, NULL otherwise.
   */
  char* lookup(char* ds,size_t ofs,const char* t) {
    const uint32_t* table=(const uint32_t*)ds;
    const uint32_t count=table[0];

    if (ofs>count)
      return 0;

    while (ofs<count) {
      const uint32_t entry=table[ofs];
      const unsigned char key=entry&0xff;

      if (key!=(unsigned char)*t) {
        ++ofs;
        if (key==0)
          break;
        continue;
      }

      if (key==0)
        return ds+count*sizeof(uint32_t)+(entry>>8);

      /* matched a character: follow the link and advance in the word */
      ofs=entry>>8;
      ++t;
    }
    return NULL;
  }
  136. int main() {
  137. FILE* f=fopen("entities.json","r");
  138. char buf[256];
  139. if (!f) return 1;
  140. #if 0
  141. puts("struct { const char* entity; const char* utf8; } codepoints[] = {");
  142. #endif
  143. while (fgets(buf,sizeof(buf),f)) {
  144. char* s,* entity;
  145. size_t ul;
  146. if (!isspace(buf[0])) continue;
  147. for (s=buf; *s && *s!='"'; ++s) ; // skip whitespace
  148. if (!(*s=='"')) continue;
  149. ++s;
  150. entity=s;
  151. if (*entity!='&') continue; ++entity; ++s;
  152. for (; *s && *s!='"'; ++s) ; // skip to end of entity
  153. if (!(*s=='"')) continue;
  154. if (s[-1]!=';') continue;
  155. s[-1]=0; ++s;
  156. s=strchr(s,'[');
  157. if (!s) continue;
  158. n=0;
  159. #if 0
  160. printf(" { \"%s\", \"",entity);
  161. #endif
  162. ++s;
  163. *cur=malloc(sizeof(**cur));
  164. (*cur)->next=0;
  165. if (!((*cur)->entity=strdup(entity))) return 1;
  166. ul=0;
  167. do {
  168. while (isspace(*s)) ++s;
  169. m=scan_ulong(s,&l);
  170. if (!m) return 2;
  171. s+=n;
  172. n=fmt_utf8(tmp,l);
  173. if (ul+n>sizeof((*cur)->utf8)) return 3;
  174. memcpy((*cur)->utf8+ul,tmp,n);
  175. ul+=n;
  176. #if 0
  177. {
  178. size_t i;
  179. for (i=0; i<n; ++i) {
  180. fwrite(tmp2,fmt_escapecharc(tmp2,(unsigned char)tmp[i]),1,stdout);
  181. }
  182. }
  183. #endif
  184. if (*s==']') break;
  185. } while (*s==',');
  186. #if 0
  187. puts("\" },");
  188. #endif
  189. addword(&d,(*cur)->entity,(*cur)->utf8);
  190. }
  191. fclose(f);
  192. // dump(d,0);
  193. marshal(d);
  194. {
  195. FILE* f=fopen("entities.h","w");
  196. size_t i;
  197. fprintf(f,"struct {\n uint32_t tab[%u];\n char data[%lu];\n} entities = {\n {",marshaled[0],datasize);
  198. for (i=0; i<marshaled[0]; ++i) {
  199. if (i%8 == 0) fprintf(f,"\n ");
  200. fprintf(f,"0x%x,",marshaled[i]);
  201. }
  202. fprintf(f,"\n } , {");
  203. for (i=0; i<datasize; ++i) {
  204. if (i%16 == 0) fprintf(f,"\n ");
  205. fprintf(f,"0x%x,",data[i]&0xff);
  206. }
  207. fprintf(f,"\n }\n};");
  208. fclose(f);
  209. }
  210. // puts(lookup(heap,1,"zwnj"));
  211. #if 0
  212. puts("};");
  213. #endif
  214. return 0;
  215. }