Skip to content
Snippets Groups Projects
genUnicode.py 12.6 KiB
Newer Older
  • Learn to ignore specific revisions
  • #!/usr/bin/env python
    
    #
    # Original script modified in November 2003 to take advantage of
    # the character-validation range routines, and updated to the
    # current Unicode information (Version 4.0.1)
    #
    # NOTE: there is an 'alias' facility for blocks which are not present in
    #	the current release, but are needed for ABI compatibility.  This
    
    #	must be accomplished MANUALLY!  Please see the comments below under
    #     'blockAliases'
    
    webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
    sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"
    
    #
    # blockAliases is a small hack - it is used for mapping block names which
    # were used in the 3.1 release, but are missing or changed in the current
    # release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
    # Each entry maps a legacy (3.1) block name onto one or more current
    # block names, using the "Old:New1[,New2...]" format described above.
    blockAliases = [
        "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
        "Greek:GreekandCoptic",
        "PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A,"
        "SupplementaryPrivateUseArea-B",
    ]
    
    
    # minTableSize is the threshold on the number of ranges in a category:
    # a range table is produced only when there are more than this many
    # ranges; otherwise inline comparisons are generated
    minTableSize = 8
    
    
    (blockfile, catfile) = sources.split()
    
    
    #
    # Now process the "blocks" file, reducing it to a dictionary
    # indexed by blockname, containing a tuple with the applicable
    # block range
    #
    
        print("Missing %s, aborting ..." % blockfile)
    
        sys.exit(1)
    
    for line in blocks.readlines():
        if line[0] == '#':
            continue
    
        line = line.strip()
    
            fields = line.split(';')
            range = fields[0].strip()
            (start, end) = range.split("..")
            name = fields[1].strip()
            name = name.replace(' ', '')
    
            print("Failed to process line: %s" % (line))
    
        start = "0x" + start
        end = "0x" + end
        try:
            BlockNames[name].append((start, end))
        except:
            BlockNames[name] = [(start, end)]
    
    print("Parsed %d blocks descriptions" % (len(BlockNames.keys())))
    
        alias = block.split(':')
        alist = alias[1].split(',')
    
            if comp in BlockNames:
    
                if alias[0] not in BlockNames:
                    BlockNames[alias[0]] = []
                for r in BlockNames[comp]:
                    BlockNames[alias[0]].append(r)
            else:
    
                print("Alias %s: %s not in Blocks" % (alias[0], comp))
    
    #
    # Next process the Categories file. This is more complex, since
    # the file is in code sequence, and we need to invert it.  We use
    # a dictionary with index category-name, with each entry containing
    # all the ranges (codepoints) of that category.  Note that category
    # names comprise two parts - the general category, and the "subclass"
    # within that category.  Therefore, both "general category" (which is
    # the first character of the 2-character category-name) and the full
    # (2-character) name are entered into this dictionary.
    #
    
        print("Missing %s, aborting ..." % catfile)
    
        sys.exit(1)
    
    nbchar = 0;
    Categories = {}
    for line in data.readlines():
        if line[0] == '#':
            continue
    
        line = line.strip()
    
            fields = line.split(';')
            point = fields[0].strip()
    
            value = 0
            while point != '':
                value = value * 16
                if point[0] >= '0' and point[0] <= '9':
                    value = value + ord(point[0]) - ord('0')
                elif point[0] >= 'A' and point[0] <= 'F':
                    value = value + 10 + ord(point[0]) - ord('A')
                elif point[0] >= 'a' and point[0] <= 'f':
                    value = value + 10 + ord(point[0]) - ord('a')
                point = point[1:]
            name = fields[2]
    
            print("Failed to process line: %s" % (line))
    
                print("Failed to process line: %s" % (line))
    
                print("Failed to process line: %s" % (line))
    
    print("Parsed %d char generating %d categories" % (nbchar, len(Categories.keys())))
    
    
    #
    # The data is now all read.  Time to process it into a more useful form.
    #
    # reduce the number list into ranges
    
    # Collapse each category's list of code points into a list of inclusive
    # (first, last) tuples, merging runs of consecutive values.  The values
    # are taken in the order they were appended; the categories file is
    # described above as being in code sequence, so runs are adjacent.
    for cat in Categories:
        ranges = []
        # start/prev delimit the run currently being accumulated; None means
        # no run has been opened yet.
        start = prev = None
        for val in Categories[cat]:
            if start is None:
                start = prev = val
            elif val == prev + 1:
                # Still inside the same consecutive run: just extend it.
                prev = val
            else:
                # Gap found: close the current run (a single code point is
                # stored as (n, n)) and open a new one at val.
                ranges.append((start, prev))
                start = prev = val

        # Flush the final run; an empty input list yields no ranges at all.
        if start is not None:
            ranges.append((start, prev))
        Categories[cat] = ranges
    
    
    #
    # Assure all data is in alphabetic order, since we will be doing binary
    # searches on the tables.
    #
    
    # Sorted name lists drive the order of the generated tables, which the
    # generated C code searches with a binary search.
    bkeys = list(BlockNames.keys())
    bkeys.sort()

    ckeys = list(Categories.keys())
    ckeys.sort()
    
        header = open("include/libxml/xmlunicode.h", "w")
    
        print("Failed to open include/libxml/xmlunicode.h")
    
        sys.exit(1)
    
    # Open the generated C source file for writing.  Only I/O failures are
    # treated as "cannot open"; anything else (e.g. KeyboardInterrupt) is
    # allowed to propagate instead of being reported as an open failure.
    try:
        output = open("xmlunicode.c", "w")
    except (IOError, OSError):
        print("Failed to open xmlunicode.c")
        sys.exit(1)
    
    date = time.asctime(time.localtime(time.time()))
    
    header.write(
    """/*
    
     * Summary: Unicode character APIs
     * Description: API for the Unicode character APIs
    
     *
     * This file is automatically generated from the
     * UCS description files of the Unicode Character Database
    
     * using the genUnicode.py Python script.
     *
     * Generation date: %s
     * Sources: %s
    
     */
    
    #ifndef __XML_UNICODE_H__
    #define __XML_UNICODE_H__
    
    
    #include <libxml/xmlversion.h>
    
    
    output.write(
    """/*
     * xmlunicode.c: this module implements the Unicode character APIs
     *
     * This file is automatically generated from the
     * UCS description files of the Unicode Character Database
    
     * using the genUnicode.py Python script.
     *
     * Generation date: %s
     * Sources: %s
     * Daniel Veillard <veillard@redhat.com>
     */
    
    #define IN_LIBXML
    #include "libxml.h"
    
    #ifdef LIBXML_UNICODE_ENABLED
    
    #include <string.h>
    #include <libxml/xmlversion.h>
    #include <libxml/xmlunicode.h>
    
    typedef int (xmlIntFunc)(int);	/* just to keep one's mind untwisted */
    
    typedef struct {
        const char *rangename;
        xmlIntFunc *func;
    } xmlUnicodeRange;
    
    typedef struct {
    
        const xmlUnicodeRange *table;
    
    static xmlIntFunc *xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname);
    
    static const xmlUnicodeRange xmlUnicodeBlocks[] = {
    
    """ % (webpage, date, sources));
    
    flag = 0
    for block in bkeys:
    
        name = block.replace('-', '')
    
        output.write('  {"%s", xmlUCSIs%s}' % (block, name))
    
    # Emit the category name -> predicate function table, one entry per
    # category in sorted order, entries separated by ",\n".
    output.write('static const xmlUnicodeRange xmlUnicodeCats[] = {\n')
    cat_entries = ['  {"%s", xmlUCSIsCat%s}' % (cname, cname) for cname in ckeys]
    output.write(',\n'.join(cat_entries))
    output.write('};\n\n')
    
    #
    # For any categories with more than minTableSize ranges we generate
    # a range table suitable for xmlCharInRange
    #
    for name in ckeys:
      if len(Categories[name]) > minTableSize:
        numshort = 0
        numlong = 0
        ranges = Categories[name]
        sptr = "NULL"
        lptr = "NULL"
        for range in ranges:
          (low, high) = range
          if high < 0x10000:
            if numshort == 0:
    
              pline = "static const xmlChSRange xml%sS[] = {" % name
    
              pline += ","
    
            numshort += 1
          else:
            if numlong == 0:
              if numshort > 0:
                output.write(pline + " };\n")
    
              pline = "static const xmlChLRange xml%sL[] = {" % name
    
              pline += ","
    
            numlong += 1
          if len(pline) > 60:
            output.write(pline + "\n")
            pline = "    "
    
          elif pline[-1:] == ",":
            pline += " "
    
          pline += "{%s, %s}" % (hex(low), hex(high))
    
        output.write(pline + " };\nstatic const xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
    
             % (name, numshort, numlong, sptr, lptr))
    
    
    output.write(
    
    """static const xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
    static const xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};
    
    
    /**
     * xmlUnicodeLookup:
     * @tptr: pointer to the name table
     * @name: name to be found
     *
     * binary table lookup for user-supplied name
     *
     * Returns pointer to range function if found, otherwise NULL
     */
    static xmlIntFunc
    
    *xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname) {
    
        const xmlUnicodeRange *sptr;
    
        if ((tptr == NULL) || (tname == NULL)) return(NULL);
    
    
        low = 0;
        high = tptr->numentries - 1;
        sptr = tptr->table;
        while (low <= high) {
    	mid = (low + high) / 2;
    	if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
    	    return (sptr[mid].func);
    	if (cmp < 0)
    	    high = mid - 1;
    	else
    	    low = mid + 1;
        }
    
        return (NULL);
    
    }
    
    """ % (len(BlockNames), len(Categories)) )
    
    # For every block emit a prototype into the header and a documented
    # xmlUCSIs<Name>() definition into the C file.  The function body is a
    # single return statement OR-ing together one range test per
    # (start, end) pair recorded for the block.
    for block in bkeys:
        # C identifiers cannot contain '-', so strip it from the block name.
        name = block.replace('-', '')
        header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)

        range_tests = " ||\n           ".join(
            "((code >= %s) && (code <= %s))" % (lo, hi)
            for (lo, hi) in BlockNames[block])

        output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % name)
        output.write(" *\n * Check whether the character is part of %s UCS Block\n" % block)
        output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
        output.write("int\nxmlUCSIs%s(int code) {\n    return(" % name)
        output.write(range_tests)
        output.write(");\n}\n\n")
    
    header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
    output.write(
    """/**
     * xmlUCSIsBlock:
     * @code: UCS code point
     * @block: UCS block name
     *
     * Check whether the character is part of the UCS Block
     *
     * Returns 1 if true, 0 if false and -1 on unknown block
     */
    int
    xmlUCSIsBlock(int code, const char *block) {
        xmlIntFunc *func;
    
        func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
        if (func == NULL)
    	return (-1);
        return (func(code));
    }
    
        header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
    
        output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
        output.write(" *\n * Check whether the character is part of %s UCS Category\n"%
                     (name))
        output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
        output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
    
        if len(Categories[name]) > minTableSize:
            output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
                % name)
        else:
            start = 1
            for range in ranges:
                (begin, end) = range;
                if start:
                    output.write("    return(");
                    start = 0
                else:
                    output.write(" ||\n           ");
                if (begin == end):
                    output.write("(code == %s)" % (hex(begin)))
                else:
                    output.write("((code >= %s) && (code <= %s))" % (
                             hex(begin), hex(end)))
    
    header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
    output.write(
    """/**
     * xmlUCSIsCat:
     * @code: UCS code point
     * @cat: UCS Category name
     *
     * Check whether the character is part of the UCS Category
     *
     * Returns 1 if true, 0 if false and -1 on unknown category
     */
    int
    xmlUCSIsCat(int code, const char *cat) {
        xmlIntFunc *func;
    
        func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
        if (func == NULL)
    	return (-1);
        return (func(code));
    }
    
    #endif /* LIBXML_UNICODE_ENABLED */
    """)
    
    #endif /* __XML_UNICODE_H__ */
    """);