479 lines
13 KiB
Python
Executable File
479 lines
13 KiB
Python
Executable File
#!/usr/bin/python -u
|
|
#
|
|
# Original script modified in November 2003 to take advantage of
|
|
# the character-validation range routines, and updated to the
|
|
# current Unicode information (Version 4.0.1)
|
|
#
|
|
# NOTE: there is an 'alias' facility for blocks which are not present in
|
|
# the current release, but are needed for ABI compatibility. This
|
|
# must be accomplished MANUALLY! Please see the comments below under
|
|
# 'blockAliases'
|
|
#
|
|
import sys
|
|
import string
|
|
import time
|
|
|
|
# Page describing the Unicode Character Database release in use, and the
# two data files (block ranges first, per-character data second) read below.
webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"

#
# blockAliases is a small hack - it is used for mapping block names which
# were used in the 3.1 release, but are missing or changed in the current
# release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
blockAliases = [
    "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
    "Greek:GreekandCoptic",
    "PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A,"
    "SupplementaryPrivateUseArea-B",
]

# minTableSize gives the minimum number of ranges which must be present
# before a range table is produced.  If there are fewer than this
# number, inline comparisons are generated
minTableSize = 8

# "sources" names exactly two files, separated by whitespace
blockfile, catfile = sources.split()
|
|
|
|
|
|
#
# Now process the "blocks" file, reducing it to a dictionary
# indexed by blockname.  Each entry is a list of (start, end) tuples
# giving the applicable block ranges as "0x..." strings.
#
BlockNames = {}
try:
    blocks = open(blockfile, "r")
except IOError:
    print("Missing %s, aborting ..." % blockfile)
    sys.exit(1)

for line in blocks:
    # comment lines start with '#'; blank lines carry no data
    if line[0] == '#':
        continue
    line = line.strip()
    if line == '':
        continue
    try:
        # expected shape: "<start>..<end>; <Block Name>"
        fields = line.split(';')
        (start, end) = fields[0].strip().split("..")
        # spaces are dropped so the name can be used in a C identifier
        name = fields[1].strip().replace(' ', '')
    except (IndexError, ValueError):
        # missing ';' field, or a range that is not exactly "a..b"
        print("Failed to process line: %s" % (line))
        continue
    BlockNames.setdefault(name, []).append(("0x" + start, "0x" + end))
blocks.close()
print("Parsed %d blocks descriptions" % (len(BlockNames)))
|
|
|
|
# Give every legacy block name the combined ranges of the current blocks it
# maps to.  Each alias has the form "OldName:New1[,New2[,...]]"; components
# that are not present in BlockNames are reported and skipped.
for block in blockAliases:
    alias = block.split(':')
    oldname = alias[0]
    for comp in alias[1].split(','):
        if comp not in BlockNames:
            print("Alias %s: %s not in Blocks" % (oldname, comp))
            continue
        # the alias entry is only created once a component is found
        BlockNames.setdefault(oldname, []).extend(BlockNames[comp])
|
|
|
|
#
# Next process the Categories file.  This is more complex, since
# the file is in code sequence, and we need to invert it.  We use
# a dictionary with index category-name, with each entry containing
# all the ranges (codepoints) of that category.  Note that category
# names comprise two parts - the general category, and the "subclass"
# within that category.  Therefore, both "general category" (which is
# the first character of the 2-character category-name) and the full
# (2-character) name are entered into this dictionary.
#
try:
    data = open(catfile, "r")
except IOError:
    print("Missing %s, aborting ..." % catfile)
    sys.exit(1)

nbchar = 0
Categories = {}
for line in data:
    if line[0] == '#':
        continue
    line = line.strip()
    if line == '':
        continue
    try:
        fields = line.split(';')
        # field 0 is the code point in hexadecimal; anything that is not a
        # valid hex number rejects the line instead of being ignored
        value = int(fields[0].strip(), 16)
        # field 2 is the category name
        name = fields[2]
        if name == '':
            raise ValueError("empty category name")
    except (IndexError, ValueError):
        print("Failed to process line: %s" % (line))
        continue

    nbchar = nbchar + 1
    # update entry for "full name"
    Categories.setdefault(name, []).append(value)
    # update "general category" name
    Categories.setdefault(name[0], []).append(value)

# close the category file opened above
data.close()
print("Parsed %d char generating %d categories" % (nbchar, len(Categories)))
|
|
|
|
#
# The data is now all read.  Time to process it into a more useful form.
#
# reduce the number list into ranges
def _collapse_ranges(values):
    """Fold a sequence of code points into inclusive (low, high) tuples.

    A run of values that each exceed the previous one by exactly one becomes
    a single tuple.  Any other step (a gap, a repeat, a decrease) closes the
    current tuple and starts a new one.  A lone value gives (v, v); an empty
    sequence gives an empty list.
    """
    ranges = []
    start = prev = None
    for val in values:
        if start is None:
            start = prev = val
        elif val == prev + 1:
            prev = val
        else:
            ranges.append((start, prev))
            start = prev = val
    if start is not None:
        ranges.append((start, prev))
    return ranges


# replace each category's list of code points by its list of ranges
for cat in list(Categories.keys()):
    Categories[cat] = _collapse_ranges(Categories[cat])
|
|
|
|
#
# Assure all data is in alphabetic order, since we will be doing binary
# searches on the tables.
#
bkeys = sorted(BlockNames.keys())
ckeys = sorted(Categories.keys())
|
|
|
|
#
# Generate the resulting files
#
# Both files are opened for writing relative to the current directory; a
# failure to open either one ends the run with exit status 1.
try:
    header = open("include/libxml/xmlunicode.h", "w")
except IOError:
    print("Failed to open include/libxml/xmlunicode.h")
    sys.exit(1)

try:
    output = open("xmlunicode.c", "w")
except IOError:
    print("Failed to open xmlunicode.c")
    sys.exit(1)

# local time stamp embedded in the banner of both generated files
date = time.asctime(time.localtime(time.time()))
|
|
|
|
# Start of the generated header: banner comment (data source page,
# generation date, source file names), include guard, feature guard and
# the C++ linkage prologue.
_header_preamble = """/*
 * Summary: Unicode character APIs
 * Description: API for the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Author: Daniel Veillard
 */

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef LIBXML_UNICODE_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

"""
header.write(_header_preamble % (webpage, date, sources))
|
|
|
|
# Start of the generated C file: banner comment, includes, the types used
# by the name tables, the forward declaration of the lookup routine and
# the opening line of the block table (its rows are written afterwards).
_output_preamble = """/*
 * xmlunicode.c: this module implements the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard <veillard@redhat.com>
 */

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */

typedef struct {
    const char *rangename;
    xmlIntFunc *func;
} xmlUnicodeRange;

typedef struct {
    xmlUnicodeRange *table;
    int numentries;
} xmlUnicodeNameTable;


static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);

static xmlUnicodeRange xmlUnicodeBlocks[] = {
"""
output.write(_output_preamble % (webpage, date, sources))
|
|
|
|
# Rows of the block table: the lookup string is the block name as is, the
# function name is the block name with its '-' characters removed.  Rows
# are separated by ",\n" and the table is closed after the last one.
_block_rows = [' {"%s", xmlUCSIs%s}' % (block, block.replace('-', ''))
               for block in bkeys]
output.write(',\n'.join(_block_rows))
output.write('};\n\n')

# Same layout for the category table, keyed by category name.
output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
_cat_rows = [' {"%s", xmlUCSIsCat%s}' % (name, name) for name in ckeys]
output.write(',\n'.join(_cat_rows))
output.write('};\n\n')
|
|
|
|
#
# For any categories with more than minTableSize ranges we generate
# a range table suitable for xmlCharInRange
#
for name in ckeys:
    ranges = Categories[name]
    if len(ranges) <= minTableSize:
        # small categories get inline comparisons instead of a table
        continue
    numshort = 0
    numlong = 0
    sptr = "NULL"
    lptr = "NULL"
    for (low, high) in ranges:
        if high < 0x10000:
            # range ends below 0x10000: goes into the xmlChSRange array
            if numshort == 0:
                pline = "static const xmlChSRange xml%sS[] = {" % name
                sptr = "xml%sS" % name
            else:
                pline += ", "
            numshort += 1
        else:
            # anything else goes into the xmlChLRange array
            if numlong == 0:
                # terminate the short array first, if one was started
                if numshort > 0:
                    output.write(pline + " };\n")
                pline = "static const xmlChLRange xml%sL[] = {" % name
                lptr = "xml%sL" % name
            else:
                pline += ", "
            numlong += 1
        # flush the pending text once it has grown past 60 characters
        if len(pline) > 60:
            output.write(pline + "\n")
            pline = " "
        pline += "{%s, %s}" % (hex(low), hex(high))
    # close the last array and emit the group tying both arrays together
    output.write(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
                 % (name, numshort, numlong, sptr, lptr))
|
|
|
|
|
|
# Table descriptors (each filled with the number of entries of its table)
# followed by the C routine that searches a table by name.
_lookup_template = """static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
 * xmlUnicodeLookup:
 * @tptr: pointer to the name table
 * @name: name to be found
 *
 * binary table lookup for user-supplied name
 *
 * Returns pointer to range function if found, otherwise NULL
 */
static xmlIntFunc
*xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
    int low, high, mid, cmp;
    xmlUnicodeRange *sptr;

    if ((tptr == NULL) || (tname == NULL)) return(NULL);

    low = 0;
    high = tptr->numentries - 1;
    sptr = tptr->table;
    while (low <= high) {
        mid = (low + high) / 2;
        if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
            return (sptr[mid].func);
        if (cmp < 0)
            high = mid - 1;
        else
            low = mid + 1;
    }
    return (NULL);
}

"""
output.write(_lookup_template % (len(BlockNames), len(Categories)))
|
|
|
|
# One predicate per block: a prototype in the header and, in the C file, a
# documented function that tests the code point against every range
# recorded for that block.
for block in bkeys:
    # '-' is removed from the block name to form the function name
    name = block.replace('-', '')
    header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Block\n"
                 % (block))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIs%s(int code) {\n return(" % name)
    # range tests are chained with "||", one per line
    _tests = ["((code >= %s) && (code <= %s))" % (start, end)
              for (start, end) in BlockNames[block]]
    output.write(" ||\n ".join(_tests))
    output.write(");\n}\n\n")
|
|
|
|
# Name-based entry point for blocks: prototype in the header, and a C
# function that looks the name up in the block table and calls the
# predicate it finds.
header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
_is_block_source = """/**
 * xmlUCSIsBlock:
 * @code: UCS code point
 * @block: UCS block name
 *
 * Check whether the character is part of the UCS Block
 *
 * Returns 1 if true, 0 if false and -1 on unknown block
 */
int
xmlUCSIsBlock(int code, const char *block) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
    if (func == NULL)
        return (-1);
    return (func(code));
}

"""
output.write(_is_block_source)
|
|
|
|
# One predicate per category: a prototype in the header and a documented
# C function.  Categories with more than minTableSize ranges delegate to
# xmlCharInRange on their range group; the others compare inline.
for name in ckeys:
    ranges = Categories[name]
    header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Category\n"
                 % (name))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
    if len(ranges) > minTableSize:
        output.write(" return(xmlCharInRange((unsigned int)code, &xml%sG)"
                     % name)
    else:
        _tests = []
        for (begin, end) in ranges:
            if begin == end:
                # single code point: equality test
                _tests.append("(code == %s)" % (hex(begin)))
            else:
                _tests.append("((code >= %s) && (code <= %s))" % (
                    hex(begin), hex(end)))
        # the "return(" opener is only written when there is a test to emit
        if _tests:
            output.write(" return(" + " ||\n ".join(_tests))
    output.write(");\n}\n\n")
|
|
|
|
# Name-based entry point for categories, followed by the closing lines of
# the generated C file.
header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
_is_cat_source = """/**
 * xmlUCSIsCat:
 * @code: UCS code point
 * @cat: UCS Category name
 *
 * Check whether the character is part of the UCS Category
 *
 * Returns 1 if true, 0 if false and -1 on unknown category
 */
int
xmlUCSIsCat(int code, const char *cat) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
    if (func == NULL)
        return (-1);
    return (func(code));
}

#define bottom_xmlunicode
#include "elfgcchack.h"
#endif /* LIBXML_UNICODE_ENABLED */
"""
output.write(_is_cat_source)
|
|
|
|
# Closing lines of the generated header: end of the C++ linkage block,
# then of the feature guard, then of the include guard.
header.write(
    "\n"
    "#ifdef __cplusplus\n"
    "}\n"
    "#endif\n"
    "\n"
    "#endif /* LIBXML_UNICODE_ENABLED */\n"
    "\n"
    "#endif /* __XML_UNICODE_H__ */\n"
)

# both generated files are complete
header.close()
output.close()
|