Skip to content
Snippets Groups Projects
genChRanges.py 17.7 KiB
Newer Older
  • Learn to ignore specific revisions
  • #!/usr/bin/env python
    
    #
    # Portions of this script have been (shamelessly) stolen from the
    # prior work of Daniel Veillard (genUnicode.py)
    #
    # I, however, take full credit for any bugs, errors or difficulties :-)
    #
    # William Brack
    # October 2003
    #
    
    # 18 October 2003
    # Modified to maintain binary compatibility with previous library versions
    # by adding a suffix 'Q' ('quick') to the macro generated for the original,
    # function, and adding generation of a function (with the original name) which
    # instantiates the macro.
    #
    
    
    import sys
    import time
    
    #
    # A routine to take a list of yes/no (1, 0) values and turn it
    # into a list of ranges.  This will later be used to determine whether
    # to generate single-byte lookup tables, or inline comparisons
    #
    def makeRange(lst):
        ret = []
        pos = 0
        while pos < len(lst):
    
            try:            # index generates exception if not present
                s = lst[pos:].index(1)      # look for start of next range
            except:
                break                       # if no more, finished
            pos += s                        # pointer to start of possible range
            try:
                e = lst[pos:].index(0)      # look for end of range
                e += pos
            except:                         # if no end, set to end of list
                e = len(lst)
            ret.append((pos, e-1))          # append range tuple to list
            pos = e + 1                     # ready to check for next range
    
    sources = "chvalid.def"                 # input filename
    
    
    # minTableSize gives the minimum number of ranges which must be present
    # before a 256-byte lookup table is produced.  If there are less than this
    # number, a macro with inline comparisons is generated
    minTableSize = 6
    
    # dictionary of functions, key=name, element contains char-map and range-list
    Functs = {}
    
    state = 0
    
    try:
        defines = open("chvalid.def", "r")
    except:
    
        print("Missing chvalid.def, aborting ...")
    
        sys.exit(1)
    
    #
    # The lines in the .def file have three types:-
    #   name:   Defines a new function block
    
    #   ur:     Defines individual or ranges of unicode values
    
    #   end:    Indicates the end of the function block
    #
    # These lines are processed below.
    #
    for line in defines.readlines():
        # ignore blank lines, or lines beginning with '#'
        if line[0] == '#':
            continue
    
        line = line.strip()
    
        if line == '':
            continue
        # split line into space-separated fields, then split on type
        try:
    
            fields = line.split(' ')
            #
            # name line:
            #   validate any previous function block already ended
            #   validate this function not already defined
            #   initialize an entry in the function dicitonary
            #       including a mask table with no values yet defined
            #
            if fields[0] == 'name':
                name = fields[1]
                if state != 0:
                    print("'name' %s found before previous name" \
                          "completed" % (fields[1]))
                    continue
                state = 1
                if name in Functs:
                    print("name '%s' already present - may give" \
                          " wrong results" % (name))
                else:
                    # dict entry with two list elements (chdata, rangedata)
                    Functs[name] = [ [], [] ]
                    for v in range(256):
                        Functs[name][0].append(0)
            #
            # end line:
            #   validate there was a preceding function name line
            #   set state to show no current function active
            #
            elif fields[0] == 'end':
                if state == 0:
                    print("'end' found outside of function block")
                    continue
                state = 0
    
            #
            # ur line:
            #   validate function has been defined
            #   process remaining fields on the line, which may be either
            #       individual unicode values or ranges of values
            #
            elif fields[0] == 'ur':
                if state != 1:
                    raise Exception("'ur' found outside of 'name' block")
                for el in fields[1:]:
                    pos = el.find('..')
                    # pos <=0 means not a range, so must be individual value
                    if pos <= 0:
                        # cheap handling of hex or decimal values
                        if el[0:2] == '0x':
                            value = int(el[2:],16)
                        elif el[0] == "'":
                            value = ord(el[1])
                        else:
                            value = int(el)
                        if ((value < 0) | (value > 0x1fffff)):
                            raise Exception('Illegal value (%s) in ch for'\
                                    ' name %s' % (el,name))
                        # for ur we have only ranges (makes things simpler),
                        # so convert val to range
                        currange = (value, value)
                    # pos > 0 means this is a range, so isolate/validate
                    # the interval
                    else:
                        # split the range into it's first-val, last-val
                        (first, last) = el.split("..")
                        # convert values from text into binary
                        if first[0:2] == '0x':
                            start = int(first[2:],16)
                        elif first[0] == "'":
                            start = ord(first[1])
                        else:
                            start = int(first)
                        if last[0:2] == '0x':
                            end = int(last[2:],16)
                        elif last[0] == "'":
                            end = ord(last[1])
                        else:
                            end = int(last)
                        if (start < 0) | (end > 0x1fffff) | (start > end):
                            raise Exception("Invalid range '%s'" % el)
                        currange = (start, end)
                    # common path - 'currange' has the range, now take care of it
                    # We split on single-byte values vs. multibyte
                    if currange[1] < 0x100: # single-byte
                        for ch in range(currange[0],currange[1]+1):
                            # validate that value not previously defined
                            if Functs[name][0][ch]:
                                msg = "Duplicate ch value '%s' for name '%s'" % (el, name)
                                raise Exception(msg)
                            Functs[name][0][ch] = 1
                    else:                   # multi-byte
                        if currange in Functs[name][1]:
                            raise Exception("range already defined in" \
                                    " function")
                        else:
                            Functs[name][1].append(currange)
    
            print("Failed to process line: %s" % (line))
            raise
    
    #
    # At this point, the entire definition file has been processed.  Now we
    # enter the output phase, where we generate the two files chvalid.c and'
    # chvalid.h
    #
    # To do this, we first output the 'static' data (heading, fixed
    # definitions, etc.), then output the 'dynamic' data (the results
    # of the above processing), and finally output closing 'static' data
    # (e.g. the subroutine to process the ranges)
    #
    
    #
    # Generate the headings:
    #
    try:
    
        header = open("include/libxml/chvalid.h", "w")
    
        print("Failed to open include/libxml/chvalid.h")
    
        sys.exit(1)
    
    try:
        output = open("chvalid.c", "w")
    except:
    
        print("Failed to open chvalid.c")
    
        sys.exit(1)
    
    date = time.asctime(time.localtime(time.time()))
    
    header.write(
    """/*
    
     * Summary: Unicode character range checking
     * Description: this module exports interfaces for the character
     *               range validation APIs
    
     *
     * This file is automatically generated from the cvs source
     * definition files using the genChRanges.py Python script
     *
     * Generation date: %s
     * Sources: %s
    
     * Author: William Brack <wbrack@mmm.com.hk>
    
     */
    
    #ifndef __XML_CHVALID_H__
    #define __XML_CHVALID_H__
    
    
    #ifdef __cplusplus
    extern "C" {
    #endif
    
    /*
     * Define our typedefs and structures
     *
     */
    typedef struct _xmlChSRange xmlChSRange;
    typedef xmlChSRange *xmlChSRangePtr;
    struct _xmlChSRange {
    
        unsigned short\tlow;
        unsigned short\thigh;
    
    };
    
    typedef struct _xmlChLRange xmlChLRange;
    typedef xmlChLRange *xmlChLRangePtr;
    struct _xmlChLRange {
    
        unsigned int\tlow;
        unsigned int\thigh;
    
    };
    
    typedef struct _xmlChRangeGroup xmlChRangeGroup;
    typedef xmlChRangeGroup *xmlChRangeGroupPtr;
    struct _xmlChRangeGroup {
    
        int\t\t\tnbShortRange;
        int\t\t\tnbLongRange;
        const xmlChSRange\t*shortRange;\t/* points to an array of ranges */
        const xmlChLRange\t*longRange;
    
    \t\txmlCharInRange(unsigned int val, const xmlChRangeGroup *group);
    
     * chvalid.c:\tthis module implements the character range
     *\t\tvalidation APIs
    
     *
     * This file is automatically generated from the cvs source
     * definition files using the genChRanges.py Python script
     *
     * Generation date: %s
     * Sources: %s
     * William Brack <wbrack@mmm.com.hk>
     */
    
    
    #define IN_LIBXML
    #include "libxml.h"
    
    #include <libxml/chvalid.h>
    
    
    /*
     * The initial tables ({func_name}_tab) are used to validate whether a
     * single-byte character is within the specified group.  Each table
     * contains 256 bytes, with each byte representing one of the 256
     * possible characters.  If the table byte is set, the character is
     * allowed.
     *
     */
    """ % (date, sources));
    
    #
    # Now output the generated data.
    # We try to produce the best execution times.  Tests have shown that validation
    # with direct table lookup is, when there are a "small" number of valid items,
    # still not as fast as a sequence of inline compares.  So, if the single-byte
    # portion of a range has a "small" number of ranges, we output a macro for inline
    # compares, otherwise we output a 256-byte table and a macro to use it.
    #
    
    
    fkeys = sorted(Functs.keys())
    
    
    for f in fkeys:
    
    # First we convert the specified single-byte values into a group of ranges.
    # If the total number of such ranges is less than minTableSize, we generate
    # an inline macro for direct comparisons; if greater, we generate a lookup
    # table.
    
        if max(Functs[f][0]) > 0:   # only check if at least one entry
    
            rangeTable = makeRange(Functs[f][0])
    
            numRanges = len(rangeTable)
            if numRanges >= minTableSize:   # table is worthwhile
                header.write("XMLPUBVAR const unsigned char %s_tab[256];\n" % f)
                header.write("""
    
    /**
     * %s_ch:
     * @c: char to validate
     *
     * Automatically generated by genChRanges.py
     */
    """ % f)
    
                header.write("#define %s_ch(c)\t(%s_tab[(c)])\n" % (f, f))
    
                # write the constant data to the code file
                output.write("const unsigned char %s_tab[256] = {\n" % f)
                pline = "   "
                for n in range(255):
                    pline += " 0x%02x," % Functs[f][0][n]
                    if len(pline) > 72:
                        output.write(pline + "\n")
                        pline = "   "
                output.write(pline + " 0x%02x };\n\n" % Functs[f][0][255])
    
            else:           # inline check is used
                # first another little optimisation - if space is present,
                # put it at the front of the list so it is checked first
                try:
                    ix = rangeTable.remove((0x20, 0x20))
                    rangeTable.insert(0, (0x20, 0x20))
                except:
                    pass
                firstFlag = 1
    
                header.write("""
    
    /**
     * %s_ch:
     * @c: char to validate
     *
     * Automatically generated by genChRanges.py
     */
    """ % f)
    
                # okay, I'm tired of the messy lineup - let's automate it!
                pline = "#define %s_ch(c)" % f
                # 'ntab' is number of tabs needed to position to col. 33 from name end
                ntab = 4 - (len(pline)) // 8
                if ntab < 0:
                    ntab = 0
                just = ""
                for i in range(ntab):
                    just += "\t"
                pline = pline + just + "("
                for rg in rangeTable:
                    if not firstFlag:
                        pline += " || \\\n\t\t\t\t "
                    else:
                        firstFlag = 0
                    if rg[0] == rg[1]:              # single value - check equal
                        pline += "((c) == 0x%x)" % rg[0]
                    else:                           # value range
                    # since we are doing char, also change range ending in 0xff
                        if rg[1] != 0xff:
                            pline += "((0x%x <= (c)) &&" % rg[0]
                            pline += " ((c) <= 0x%x))" % rg[1]
                        else:
                            pline += " (0x%x <= (c))" % rg[0]
                pline += ")\n"
                header.write(pline)
    
        header.write("""
    /**
     * %sQ:
     * @c: char to validate
     *
     * Automatically generated by genChRanges.py
     */
    """ % f)
    
        ntab = 4 - (len(pline)) // 8
    
            just += "\t"
    
        header.write(pline + just + "(((c) < 0x100) ? \\\n\t\t\t\t ")
    
            header.write("%s_ch((c)) :" % f)
    
            header.write("0 :")
    
    
        # if no ranges defined, value invalid if >= 0x100
    
        numRanges = len(Functs[f][1])
        if numRanges == 0:
    
            header.write(" 0)\n\n")
    
            if numRanges >= minTableSize:
                header.write(" \\\n\t\t\t\t xmlCharInRange((c), &%sGroup))\n\n"  % f)
            else:           # if < minTableSize, generate inline code
                firstFlag = 1
                for rg in Functs[f][1]:
                    if not firstFlag:
                        pline += " || \\\n\t\t\t\t "
                    else:
                        firstFlag = 0
                        pline = "\\\n\t\t\t\t("
                    if rg[0] == rg[1]:              # single value - check equal
                        pline += "((c) == 0x%x)" % rg[0]
                    else:                           # value range
                        pline += "((0x%x <= (c)) &&" % rg[0]
                        pline += " ((c) <= 0x%x))" % rg[1]
                pline += "))\n\n"
                header.write(pline)
    
            header.write("XMLPUBVAR const xmlChRangeGroup %sGroup;\n" % f)
    
        if len(Functs[f][1]) > 0:   # only generate if unicode ranges present
            rangeTable = Functs[f][1]
            rangeTable.sort()       # ascending tuple sequence
            numShort = 0
            numLong  = 0
            for rg in rangeTable:
                if rg[1] < 0x10000: # if short value
                    if numShort == 0:       # first occurrence
                        pline = "static const xmlChSRange %s_srng[] = {" % f
                    else:
                        pline += ","
                    numShort += 1
                    if len(pline) > 60:
                        output.write(pline + "\n")
                        pline = "    "
    
    Nick Wellnhofer's avatar
    Nick Wellnhofer committed
                    else:
                        pline += " "
    
                    pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
                else:               # if long value
                    if numLong == 0:        # first occurrence
                        if numShort > 0:    # if there were shorts, finish them off
                            output.write(pline + "};\n")
                        pline = "static const xmlChLRange %s_lrng[] = { " % f
                    else:
                        pline += ", "
                    numLong += 1
                    if len(pline) > 60:
                        output.write(pline + "\n")
                        pline = "    "
                    pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
            output.write(pline + "};\n")    # finish off last group
    
            pline = "const xmlChRangeGroup %sGroup =\n\t{%d, %d, " % (f, numShort, numLong)
            if numShort > 0:
                pline += "%s_srng" % f
            else:
                pline += "(xmlChSRangePtr)0"
            if numLong > 0:
                pline += ", %s_lrng" % f
            else:
                pline += ", (xmlChLRangePtr)0"
    
            output.write(pline + "};\n\n")
    
    /**
     * xmlCharInRange:
     * @val: character to be validated
     * @rptr: pointer to range to be used to validate
     *
     * Does a binary search of the range table to determine if char
     * is valid
     *
     * Returns: true if character valid, false otherwise
     */
    
    xmlCharInRange (unsigned int val, const xmlChRangeGroup *rptr) {
    
        const xmlChSRange *sptr;
        const xmlChLRange *lptr;
    
        if (val < 0x10000) {\t/* is val in 'short' or 'long'  array? */
    \tif (rptr->nbShortRange == 0)
    \t    return 0;
    \tlow = 0;
    \thigh = rptr->nbShortRange - 1;
    \tsptr = rptr->shortRange;
    \twhile (low <= high) {
    \t    mid = (low + high) / 2;
    \t    if ((unsigned short) val < sptr[mid].low) {
    \t\thigh = mid - 1;
    \t    } else {
    \t\tif ((unsigned short) val > sptr[mid].high) {
    \t\t    low = mid + 1;
    \t\t} else {
    \t\t    return 1;
    \t\t}
    \t    }
    \t}
    
    \tif (rptr->nbLongRange == 0) {
    \t    return 0;
    \t}
    \tlow = 0;
    \thigh = rptr->nbLongRange - 1;
    \tlptr = rptr->longRange;
    \twhile (low <= high) {
    \t    mid = (low + high) / 2;
    \t    if (val < lptr[mid].low) {
    \t\thigh = mid - 1;
    \t    } else {
    \t\tif (val > lptr[mid].high) {
    \t\t    low = mid + 1;
    \t\t} else {
    \t\t    return 1;
    \t\t}
    \t    }
    \t}
    
    #
    # finally, generate the ABI compatibility functions
    #
    for f in fkeys:
    
        output.write("""
    /**
     * %s:
     * @ch:  character to validate
     *
    
     * This function is DEPRECATED.
    """ % f);
        if max(Functs[f][0]) > 0:
            output.write(" * Use %s_ch or %sQ instead" % (f, f))
        else:
            output.write(" * Use %sQ instead" % f)
        output.write("""
    
     *
     * Returns true if argument valid, false otherwise
     */
    
        output.write("int\n%s(unsigned int ch) {\n    return(%sQ(ch));\n}\n\n" % (f,f))
        header.write("XMLPUBFUN int XMLCALL\n\t\t%s(unsigned int ch);\n" % f);
    #
    # Run complete - write trailers and close the output files
    #
    
    header.write("""
    #ifdef __cplusplus
    }
    #endif
    #endif /* __XML_CHVALID_H__ */