1 | #!/usr/bin/env python3
|
---|
2 | #
|
---|
3 | # Original script modified in November 2003 to take advantage of
|
---|
4 | # the character-validation range routines, and updated to the
|
---|
5 | # current Unicode information (Version 4.0.1)
|
---|
6 | #
|
---|
7 | # NOTE: there is an 'alias' facility for blocks which are not present in
|
---|
8 | # the current release, but are needed for ABI compatibility. This
|
---|
9 | # must be accomplished MANUALLY! Please see the comments below under
|
---|
10 | # 'blockAliases'
|
---|
11 | #
|
---|
12 | import sys
|
---|
13 | import string
|
---|
14 | import time
|
---|
15 |
|
---|
# Page describing the Unicode Character Database release in use, and the
# two data files read by this script (quoted in the generated banners).
webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"

#
# blockAliases is a small hack - it maps block names which were used in
# the 3.1 release, but are missing or changed in the current release.
# Each entry reads "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
#
blockAliases = [
    "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
    "Greek:GreekandCoptic",
    "PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A,"
    "SupplementaryPrivateUseArea-B",
]

# A range table is only produced for a category holding more than
# minTableSize ranges; smaller categories get inline comparisons.
minTableSize = 8

# The first source is the blocks file, the second the per-character data.
blockfile, catfile = sources.split()
35 |
|
---|
36 |
|
---|
#
# Process the "blocks" file, reducing it to a dictionary indexed by
# block name (with spaces removed).  Each entry is a list of
# (start, end) tuples whose bounds are kept as "0x..." strings, ready
# to be pasted into the generated C code.
#
BlockNames = {}
try:
    blocks = open(blockfile, "r")
except OSError:
    print("Missing %s, aborting ..." % blockfile)
    sys.exit(1)

# The context manager closes the file even if parsing raises.
with blocks:
    for line in blocks:
        # Comment lines are recognised only by a '#' in the first column.
        if line[0] == '#':
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            fields = line.split(';')
            (start, end) = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
        except (IndexError, ValueError):
            # Missing ';' separated name, or the range is not "start..end".
            print("Failed to process line: %s" % (line))
            continue
        BlockNames.setdefault(name, []).append(("0x" + start, "0x" + end))
print("Parsed %d blocks descriptions" % (len(BlockNames)))
72 |
|
---|
# Give every legacy (alias) block name the ranges of each current block
# it stands for; components missing from the blocks file are reported.
for spec in blockAliases:
    parts = spec.split(':')
    old_name = parts[0]
    for new_name in parts[1].split(','):
        if new_name not in BlockNames:
            print("Alias %s: %s not in Blocks" % (old_name, new_name))
            continue
        BlockNames.setdefault(old_name, []).extend(BlockNames[new_name])
85 |
|
---|
#
# Next process the Categories file.  The file is in code point order and
# has to be inverted: the dictionary is indexed by category name and each
# entry collects every code point of that category.  A category name has
# two parts - the general category (its first character) and the
# "subclass" - so each code point is recorded under both the full name
# and the one-character general category.
#
try:
    data = open(catfile, "r")
except OSError:
    print("Missing %s, aborting ..." % catfile)
    sys.exit(1)

nbchar = 0
Categories = {}
# The context manager closes the data file (the handle this loop reads
# from) even if parsing raises.
with data:
    for line in data:
        # Comment lines are recognised only by a '#' in the first column.
        if line[0] == '#':
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            fields = line.split(';')
            # int() rejects a malformed code point, so the line is
            # reported instead of being silently mis-parsed.
            value = int(fields[0].strip(), 16)
            name = fields[2]
            # Raises IndexError on an empty category name, which would
            # otherwise produce a category without a name.
            general = name[0]
        except (IndexError, ValueError):
            print("Failed to process line: %s" % (line))
            continue

        nbchar = nbchar + 1
        # update entry for "full name"
        Categories.setdefault(name, []).append(value)
        # update "general category" name
        Categories.setdefault(general, []).append(value)

print("Parsed %d char generating %d categories" % (nbchar, len(Categories)))
147 | print("Parsed %d char generating %d categories" % (nbchar, len(Categories.keys())))
|
---|
148 |
|
---|
#
# The data is now all read.  Time to process it into a more useful form.
#
def _collapse_ranges(values):
    """Collapse a sequence of code points into (first, last) ranges.

    A value equal to the previous value plus one extends the current
    range; any other value closes it and opens a new one.  A lone value
    gives a range whose two bounds are equal.  An empty sequence gives an
    empty list.
    """
    ranges = []
    first = last = None
    for val in values:
        if first is None:
            first = last = val
        elif val == last + 1:
            last = val
        else:
            ranges.append((first, last))
            first = last = val
    if first is not None:
        ranges.append((first, last))
    return ranges


# reduce the number list of every category into ranges
for cat in Categories:
    Categories[cat] = _collapse_ranges(Categories[cat])
182 |
|
---|
#
# Both name tables are emitted in sorted order, since the generated
# lookup routine does a binary search on them.
#
bkeys = sorted(BlockNames)
ckeys = sorted(Categories)
190 |
|
---|
#
# Generate the resulting files
#
try:
    header = open("include/libxml/xmlunicode.h", "w")
except OSError:
    print("Failed to open include/libxml/xmlunicode.h")
    sys.exit(1)

try:
    output = open("xmlunicode.c", "w")
except OSError:
    # Do not leave the already opened header behind on the way out.
    header.close()
    print("Failed to open xmlunicode.c")
    sys.exit(1)

# Timestamp quoted in the banner comment of both generated files.
date = time.asctime(time.localtime(time.time()))
207 |
|
---|
# Banner comment, include guard and extern "C" opening of the generated
# header; the three %s slots take the UCD page, the date and the sources.
header_preamble = """/*
* Summary: Unicode character APIs
* Description: API for the Unicode character APIs
*
* This file is automatically generated from the
* UCS description files of the Unicode Character Database
* %s
* using the genUnicode.py Python script.
*
* Generation date: %s
* Sources: %s
* Author: Daniel Veillard
*/

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef LIBXML_UNICODE_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

""" % (webpage, date, sources)
header.write(header_preamble)
235 |
|
---|
# Banner comment, includes and type declarations of the generated C
# file.  The text deliberately stops right after opening the
# xmlUnicodeBlocks initializer, which the following code fills in.
source_preamble = """/*
* xmlunicode.c: this module implements the Unicode character APIs
*
* This file is automatically generated from the
* UCS description files of the Unicode Character Database
* %s
* using the genUnicode.py Python script.
*
* Generation date: %s
* Sources: %s
* Daniel Veillard <[email protected]>
*/

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */

typedef struct {
const char *rangename;
xmlIntFunc *func;
} xmlUnicodeRange;

typedef struct {
const xmlUnicodeRange *table;
int numentries;
} xmlUnicodeNameTable;


static xmlIntFunc *xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname);

static const xmlUnicodeRange xmlUnicodeBlocks[] = {
""" % (webpage, date, sources)
output.write(source_preamble)
277 |
|
---|
# One {"BlockName", function} initializer per block, comma separated.
# The function name is the block name with its dashes removed.
block_rows = [' {"%s", xmlUCSIs%s}' % (key, key.replace('-', ''))
              for key in bkeys]
output.write(',\n'.join(block_rows))
output.write('};\n\n')
287 |
|
---|
# Same layout for the categories: one {"Cat", function} initializer per
# category name, comma separated.
output.write('static const xmlUnicodeRange xmlUnicodeCats[] = {\n')
cat_rows = [' {"%s", xmlUCSIsCat%s}' % (key, key) for key in ckeys]
output.write(',\n'.join(cat_rows))
output.write('};\n\n')
297 |
|
---|
#
# Every category holding more than minTableSize ranges gets a range
# table suitable for xmlCharInRange: ranges ending below 0x10000 go to
# an xmlChSRange array, the others to an xmlChLRange array, and an
# xmlChRangeGroup ties both together.
#
for cat_name in ckeys:
    cat_ranges = Categories[cat_name]
    if len(cat_ranges) <= minTableSize:
        continue
    short_count = 0
    long_count = 0
    short_ref = "NULL"
    long_ref = "NULL"
    for (low, high) in cat_ranges:
        if high < 0x10000:
            if short_count:
                pending += ","
            else:
                pending = "static const xmlChSRange xml%sS[] = {" % cat_name
                short_ref = "xml%sS" % cat_name
            short_count += 1
        else:
            if long_count:
                pending += ","
            else:
                # First long range: terminate the short array, if any,
                # before starting the long one.
                if short_count > 0:
                    output.write(pending + " };\n")
                pending = "static const xmlChLRange xml%sL[] = {" % cat_name
                long_ref = "xml%sL" % cat_name
            long_count += 1
        # Flush the pending text once it grows past 60 characters,
        # otherwise put a space after the separating comma.
        if len(pending) > 60:
            output.write(pending + "\n")
            pending = " "
        elif pending.endswith(","):
            pending += " "
        pending += "{%s, %s}" % (hex(low), hex(high))
    # Terminate the last array and emit the group describing both arrays.
    output.write(pending + " };\nstatic const xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
                 % (cat_name, short_count, long_count, short_ref, long_ref))
335 |
|
---|
336 |
|
---|
# The two name table descriptors (sized from the dictionaries) followed
# by the C binary search routine shared by both lookups.
lookup_source = """static const xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
static const xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
* xmlUnicodeLookup:
* @tptr: pointer to the name table
* @name: name to be found
*
* binary table lookup for user-supplied name
*
* Returns pointer to range function if found, otherwise NULL
*/
static xmlIntFunc
*xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname) {
int low, high, mid, cmp;
const xmlUnicodeRange *sptr;

if ((tptr == NULL) || (tname == NULL)) return(NULL);

low = 0;
high = tptr->numentries - 1;
sptr = tptr->table;
while (low <= high) {
mid = (low + high) / 2;
if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
return (sptr[mid].func);
if (cmp < 0)
high = mid - 1;
else
low = mid + 1;
}
return (NULL);
}

""" % (len(BlockNames), len(Categories))
output.write(lookup_source)
373 |
|
---|
# For every block: a prototype in the header and, in the C file, a
# documented function testing the code point against each of the
# block's ranges.
for block_name in bkeys:
    func_suffix = block_name.replace('-', '')
    header.write("XMLPUBFUN int xmlUCSIs%s\t(int code);\n" % func_suffix)
    output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % func_suffix)
    output.write(" *\n * Check whether the character is part of %s UCS Block\n"
                 % block_name)
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIs%s(int code) {\n return(" % func_suffix)
    tests = ["((code >= %s) && (code <= %s))" % (low, high)
             for (low, high) in BlockNames[block_name]]
    output.write(" ||\n ".join(tests))
    output.write(");\n}\n\n")
390 |
|
---|
# Prototype and body of the by-name block test, which dispatches through
# the block name table.
header.write("\nXMLPUBFUN int xmlUCSIsBlock\t(int code, const char *block);\n\n")
is_block_source = """/**
* xmlUCSIsBlock:
* @code: UCS code point
* @block: UCS block name
*
* Check whether the character is part of the UCS Block
*
* Returns 1 if true, 0 if false and -1 on unknown block
*/
int
xmlUCSIsBlock(int code, const char *block) {
xmlIntFunc *func;

func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
if (func == NULL)
return (-1);
return (func(code));
}

"""
output.write(is_block_source)
413 |
|
---|
# For every category: a prototype in the header and a documented C
# function.  Categories with more than minTableSize ranges call
# xmlCharInRange on their range group, the others compare inline.
for cat_name in ckeys:
    cat_ranges = Categories[cat_name]
    header.write("XMLPUBFUN int xmlUCSIsCat%s\t(int code);\n" % cat_name)
    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % cat_name)
    output.write(" *\n * Check whether the character is part of %s UCS Category\n"
                 % cat_name)
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIsCat%s(int code) {\n" % cat_name)
    if len(cat_ranges) > minTableSize:
        output.write(" return(xmlCharInRange((unsigned int)code, &xml%sG)"
                     % cat_name)
    elif cat_ranges:
        # A single code point gets an equality test, a real range a
        # pair of bound checks.
        tests = [
            "(code == %s)" % hex(first) if first == last
            else "((code >= %s) && (code <= %s))" % (hex(first), hex(last))
            for (first, last) in cat_ranges
        ]
        output.write(" return(" + " ||\n ".join(tests))
    output.write(");\n}\n\n")
440 |
|
---|
# Prototype and body of the by-name category test, then the #endif that
# closes the LIBXML_UNICODE_ENABLED section of the C file.
header.write("\nXMLPUBFUN int xmlUCSIsCat\t(int code, const char *cat);\n")
is_cat_source = """/**
* xmlUCSIsCat:
* @code: UCS code point
* @cat: UCS Category name
*
* Check whether the character is part of the UCS Category
*
* Returns 1 if true, 0 if false and -1 on unknown category
*/
int
xmlUCSIsCat(int code, const char *cat) {
xmlIntFunc *func;

func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
if (func == NULL)
return (-1);
return (func(code));
}

#endif /* LIBXML_UNICODE_ENABLED */
"""
output.write(is_cat_source)
464 |
|
---|
# Close the extern "C" block and both preprocessor guards of the header,
# then release the two generated files.
header_trailer = (
    "\n"
    "#ifdef __cplusplus\n"
    "}\n"
    "#endif\n"
    "\n"
    "#endif /* LIBXML_UNICODE_ENABLED */\n"
    "\n"
    "#endif /* __XML_UNICODE_H__ */\n"
)
header.write(header_trailer)

for handle in (header, output):
    handle.close()