Mirror of :pserver:cvs@cvs.fefe.de:/cvs libowfat https://www.fefe.de/libowfat/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

244 lines
5.3 KiB

  1. #include <stdio.h>
  2. #include <ctype.h>
  3. #include <string.h>
  4. #include <stdlib.h>
  5. #include "scan.h"
  6. #include <assert.h>
  7. #define INTERNAL
  8. #include "scan/scan_ulong.c"
  9. #include "scan/scan_ulongn.c"
  10. #include "fmt/fmt_utf8.c"
  11. #include "fmt/fmt_tohex.c"
  12. #include "fmt/fmt_escapecharc.c"
  /* File-scope scratch state used while parsing the input in main(). */
  char tmp[20];      /* receives the UTF-8 bytes produced by fmt_utf8() */
  char tmp2[20];     /* second scratch buffer; no active code in this chunk writes to it */
  size_t n;          /* byte count returned by fmt_utf8() */
  size_t m;          /* character count returned by scan_ulong() */
  unsigned long l;   /* numeric value parsed by scan_ulong() */
  /*
   * One parsed record: the entity name and its replacement text.
   * main() allocates one of these per accepted input line.
   */
  struct entity {
    const char *entity;   /* heap copy of the name (made with strdup in main) */
    char utf8[10];        /* NUL-terminated UTF-8 replacement text */
    struct entity *next;  /* link to the following record, or 0 */
  };

  /* Head of the record list. */
  struct entity *root;
  /* Address of the pointer that main() fills with the next allocated record. */
  struct entity **cur = &root;
  struct letters;

  /*
   * One slot of a trie node. addword() treats slot i of a node as occupied
   * exactly when its `c` field equals i.
   */
  struct letter {
    /* The key byte this slot stands for. */
    unsigned char c;
    /*
     * "weiter" is German for "onward": the child node reached through this
     * byte. For the terminator slot (byte 0) addword() stores the payload
     * string pointer here instead of a child node.
     */
    struct letters *weiter;
    /* lower 8 bits: char. rest: ofs from start of marshaled blob */
    uint32_t marshaled;
  };
  27. struct letters {
  28. size_t n;
  29. struct letter liste[256];
  30. };
  /* Root node of the trie; stays null until addword() allocates it. */
  struct letters *d;
  /* Running total of slots claimed by addword(), summed over all nodes. */
  size_t nodes;
  /* Running total of payload bytes (including terminators) counted by addword(). */
  size_t datasize;
  /* Report an allocation failure on stderr and terminate with exit status 1. */
  void nomem() {
    fputs("memory allocation failure!\n", stderr);
    exit(1);
  }
  37. void addword(struct letters** s,const char* t, void* pointer) {
  38. size_t i;
  39. if (!*s) {
  40. *s=malloc(sizeof(**s));
  41. if (!*s) nomem();
  42. memset(*s,0,sizeof(**s));
  43. (*s)->liste[0].c='?';
  44. }
  45. i=(unsigned char)*t;
  46. if ((*s)->liste[i].c==*t) {
  47. if (!*t) {
  48. datasize+=strlen((char*)pointer)+1;
  49. (*s)->liste[i].weiter=pointer;
  50. } else
  51. addword(&(*s)->liste[i].weiter,t+1,pointer);
  52. return;
  53. }
  54. ++nodes;
  55. (*s)->n++;
  56. (*s)->liste[i].c=*t;
  57. if (!*t) {
  58. datasize+=strlen((char*)pointer)+1;
  59. (*s)->liste[i].weiter=pointer;
  60. } else {
  61. (*s)->liste[i].weiter=0;
  62. addword(&(*s)->liste[i].weiter,t+1,pointer);
  63. }
  64. }
  65. void dump(struct letters* s,size_t depth) {
  66. size_t i,j;
  67. if (!s) return;
  68. for (i=0; i<256; ++i) {
  69. if (s->liste[i].c!=i) continue;
  70. for (j=0; j<depth; ++j) printf(" ");
  71. printf("'%c' -> {\n",s->liste[i].c);
  72. if (s->liste[i].c)
  73. dump(s->liste[i].weiter,depth+1);
  74. for (j=0; j<depth; ++j) printf(" ");
  75. printf("}\n");
  76. }
  77. }
  /* Output state filled in by marshal() / marshalhelper(). */
  size_t used;          /* table entries reserved so far by marshalhelper() */
  size_t useddata;      /* payload bytes copied into `data` so far */
  char *heap;           /* the single allocation holding table and payload */
  uint32_t *marshaled;  /* `heap` viewed as the uint32_t table; [0] holds nodes+1 */
  char *data;           /* start of the payload area inside `heap` */
  83. void marshalhelper(struct letters* s) {
  84. size_t i;
  85. uint32_t myindex=used;
  86. if (!s) return;
  87. used+=s->n;
  88. assert(used<nodes+2);
  89. for (i=1; i!=0; ++i) { // start at 1, go to 256, then access modulo 256; effect: sort but put 0 last
  90. uint32_t x;
  91. i&=0xff;
  92. // printf("%c ",i);
  93. if (s->liste[i].c!=i) {
  94. if (i==0) return;
  95. continue;
  96. }
  97. // printf("marshalhelper: %c\n",i);
  98. x=(unsigned char)s->liste[i].c;
  99. if (!x) {
  100. size_t l=strlen((char*)s->liste[i].weiter)+1;
  101. // puts((char*)s->liste[i].weiter);
  102. x|=useddata<<8;
  103. assert(useddata+l<=datasize);
  104. memcpy(data+useddata,s->liste[i].weiter,l);
  105. useddata+=l;
  106. marshaled[++myindex]=x;
  107. return;
  108. } else {
  109. x|=(used+1)<<8;
  110. marshalhelper(s->liste[i].weiter);
  111. }
  112. marshaled[++myindex]=x;
  113. }
  114. // printf("return\n");
  115. }
  116. void marshal(struct letters* s) {
  117. fprintf(stderr,"nodes=%zu, datasize=%zu\n",nodes,datasize);
  118. {
  119. size_t l;
  120. heap=malloc(l=(nodes+1)*sizeof(uint32_t)+datasize);
  121. if (!heap) nomem();
  122. memset(heap,0,l);
  123. }
  124. marshaled=(uint32_t*)heap;
  125. marshaled[0]=nodes+1;
  126. data=heap+(nodes+1)*sizeof(uint32_t);
  127. marshalhelper(s);
  128. fprintf(stderr,"actually used: %zu nodes, %zu bytes data\n",used,useddata);
  129. }
  /*
   * Look up the NUL-terminated key `t` in a marshaled blob.
   *
   * ds   start of the blob: a uint32_t table whose entry 0 is the table
   *      length in entries, followed directly by the payload bytes.
   * ofs  table index at which to start scanning (callers start at 1).
   * t    key to match, including its terminating NUL.
   *
   * Entries are scanned linearly from `ofs`. An entry whose low byte equals
   * the current key byte is followed: a non-zero byte jumps to the index in
   * the entry's upper bits and advances the key, a zero byte yields the
   * payload at that offset. A non-matching entry with a zero low byte ends
   * the scan. Returns the payload pointer, or a null pointer if nothing
   * matched or `ofs` lies beyond the table.
   */
  char* lookup(char* ds,size_t ofs,const char* t) {
    uint32_t *table = (uint32_t *)ds;

    if (ofs > table[0]) return 0;
    while (ofs < table[0]) {
      uint32_t entry = table[ofs];
      unsigned char key = entry & 0xff;

      if (key != (unsigned char)*t) {
        ++ofs;
        if (key == 0) break;   /* the zero-keyed entry is the last one scanned */
        continue;
      }
      if (key == 0)
        return ds + table[0]*sizeof(uint32_t) + (entry >> 8);
      /* descend: continue the scan at the child's first entry */
      ofs = entry >> 8;
      ++t;
    }
    return NULL;
  }
  146. int main() {
  147. FILE* f=fopen("entities.json","r");
  148. char buf[256];
  149. if (!f) return 1;
  150. #if 0
  151. puts("struct { const char* entity; const char* utf8; } codepoints[] = {");
  152. #endif
  153. while (fgets(buf,sizeof(buf),f)) {
  154. char* s,* entity;
  155. size_t ul;
  156. if (!isspace(buf[0])) continue;
  157. for (s=buf; *s && *s!='"'; ++s) ; // skip whitespace
  158. if (!(*s=='"')) continue;
  159. ++s;
  160. entity=s;
  161. if (*entity!='&') continue;
  162. ++entity; ++s;
  163. for (; *s && *s!='"'; ++s) ; // skip to end of entity
  164. if (!(*s=='"')) continue;
  165. if (s[-1]!=';') continue;
  166. s[-1]=0; ++s;
  167. s=strchr(s,'[');
  168. if (!s) continue;
  169. n=0;
  170. #if 0
  171. printf(" { \"%s\", \"",entity);
  172. #endif
  173. ++s;
  174. *cur=malloc(sizeof(**cur));
  175. if (!*cur) nomem();
  176. (*cur)->next=0;
  177. if (!((*cur)->entity=strdup(entity))) return 1;
  178. ul=0;
  179. do {
  180. while (isspace(*s)) ++s;
  181. m=scan_ulong(s,&l);
  182. if (!m) return 2;
  183. s+=n;
  184. n=fmt_utf8(tmp,l);
  185. if (ul+n>sizeof((*cur)->utf8)) return 3;
  186. memcpy((*cur)->utf8+ul,tmp,n);
  187. ul+=n;
  188. #if 0
  189. {
  190. size_t i;
  191. for (i=0; i<n; ++i) {
  192. fwrite(tmp2,fmt_escapecharc(tmp2,(unsigned char)tmp[i]),1,stdout);
  193. }
  194. }
  195. #endif
  196. if (*s==']') break;
  197. } while (*s==',');
  198. (*cur)->utf8[ul]=0;
  199. #if 0
  200. puts("\" },");
  201. #endif
  202. addword(&d,(*cur)->entity,(*cur)->utf8);
  203. }
  204. fclose(f);
  205. // dump(d,0);
  206. marshal(d);
  207. {
  208. FILE* f=fopen("entities.h","w");
  209. size_t i;
  210. fprintf(f,"struct {\n uint32_t tab[%u];\n char data[%zu];\n} entities = {\n {",marshaled[0],datasize);
  211. for (i=0; i<marshaled[0]; ++i) {
  212. if (i%8 == 0) fprintf(f,"\n ");
  213. fprintf(f,"0x%x,",marshaled[i]);
  214. }
  215. fprintf(f,"\n } , {");
  216. for (i=0; i<datasize; ++i) {
  217. if (i%16 == 0) fprintf(f,"\n ");
  218. fprintf(f,"0x%x,",data[i]&0xff);
  219. }
  220. fprintf(f,"\n }\n};");
  221. fclose(f);
  222. }
  223. // puts(lookup(heap,1,"zwnj"));
  224. #if 0
  225. puts("};");
  226. #endif
  227. return 0;
  228. }