Newer
Older
/*
* HTMLparser.c : an HTML 4.0 non-verifying parser
*
* See Copyright for the status of this software.
*/
#define IN_LIBXML
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <libxml/xmlmemory.h>
#include <libxml/tree.h>
#include <libxml/parser.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlerror.h>
#include <libxml/HTMLparser.h>
#include <libxml/entities.h>
#include <libxml/encoding.h>
#include <libxml/valid.h>
#include <libxml/xmlIO.h>
#include <libxml/globals.h>
#include "buf.h"
#include "enc.h"
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100
/* #define DEBUG */
/* #define DEBUG_PUSH */
static int htmlOmittedDefaultValue = 1;
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
xmlChar end, xmlChar end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
/************************************************************************
* *
* Some factorized error routines *
* *
************************************************************************/
/**
 * htmlErrMemory:
 * @ctxt:  an HTML parser context (may be NULL)
 * @extra:  optional detail appended to the message (may be NULL)
 *
 * Report a memory allocation failure. Nothing is reported when the
 * context is already stopped (SAX disabled and state XML_PARSER_EOF).
 * Otherwise a non-NULL context is flagged with XML_ERR_NO_MEMORY, moved
 * to the XML_PARSER_EOF state and has its SAX callbacks disabled before
 * the fatal error is raised.
 */
static void
htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
{
    if (ctxt != NULL) {
        /* Already stopped: stay silent. */
        if ((ctxt->disableSAX != 0) && (ctxt->instate == XML_PARSER_EOF))
            return;

        ctxt->errNo = XML_ERR_NO_MEMORY;
        ctxt->instate = XML_PARSER_EOF;
        ctxt->disableSAX = 1;
    }

    if (extra == NULL) {
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
                        NULL, NULL, 0, 0, "Memory allocation failed\n");
        return;
    }

    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
                    XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
                    NULL, NULL, 0, 0,
                    "Memory allocation failed : %s\n", extra);
}
/**
 * htmlParseErr:
 * @ctxt:  an HTML parser context (may be NULL)
 * @error:  the error number
 * @msg:  the error message, used as the format string
 * @str1:  first string argument for @msg
 * @str2:  second string argument for @msg
 *
 * Raise an error of level XML_ERR_ERROR in the XML_FROM_HTML domain and
 * mark the context as not well formed. Nothing is reported when the
 * context is already stopped (SAX disabled and state XML_PARSER_EOF).
 */
static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
             const char *msg, const xmlChar *str1, const xmlChar *str2)
{
    if (ctxt != NULL) {
        /* Already stopped: stay silent. */
        if ((ctxt->instate == XML_PARSER_EOF) && (ctxt->disableSAX != 0))
            return;
        ctxt->errNo = error;
    }

    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
                    XML_ERR_ERROR, NULL, 0,
                    (const char *) str1, (const char *) str2,
                    NULL, 0, 0,
                    msg, str1, str2);

    if (ctxt != NULL)
        ctxt->wellFormed = 0;
}
/**
 * htmlParseErrInt:
 * @ctxt:  an HTML parser context (may be NULL)
 * @error:  the error number
 * @msg:  the error message, used as the format string
 * @val:  integer argument for @msg
 *
 * Integer-argument counterpart of the string error reporter: raise an
 * error of level XML_ERR_ERROR in the XML_FROM_HTML domain and mark the
 * context as not well formed. Nothing is reported when the context is
 * already stopped (SAX disabled and state XML_PARSER_EOF).
 */
static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
                const char *msg, int val)
{
    if (ctxt != NULL) {
        /* Already stopped: stay silent. */
        if ((ctxt->instate == XML_PARSER_EOF) && (ctxt->disableSAX != 0))
            return;
        ctxt->errNo = error;
    }

    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
                    XML_ERR_ERROR, NULL, 0, NULL, NULL,
                    NULL, val, 0, msg, val);

    if (ctxt != NULL)
        ctxt->wellFormed = 0;
}
/************************************************************************
* *
* Parser stacks related functions and macros *
* *
************************************************************************/
/**
* htmlnamePush:
* @ctxt: an HTML parser context
* @value: the element name
*
* Pushes a new element name on top of the name stack
*
* Returns 0 in case of error, the index in the stack otherwise
htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
ctxt->html = 3;
if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
ctxt->html = 10;
if (ctxt->nameNr >= ctxt->nameMax) {
ctxt->nameMax *= 2;
ctxt->nameTab = (const xmlChar * *)
xmlRealloc((xmlChar * *)ctxt->nameTab,
ctxt->nameMax *
sizeof(ctxt->nameTab[0]));
if (ctxt->nameTab == NULL) {
htmlErrMemory(ctxt, NULL);
return (0);
}
}
ctxt->nameTab[ctxt->nameNr] = value;
ctxt->name = value;
return (ctxt->nameNr++);
}
/**
* htmlnamePop:
* @ctxt: an HTML parser context
*
* Pops the top element name from the name stack
*
* Returns the name just removed
*/
static const xmlChar *
const xmlChar *ret;
return (NULL);
return (NULL);
if (ctxt->nameNr > 0)
ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
else
ctxt->name = NULL;
ret = ctxt->nameTab[ctxt->nameNr];
ctxt->nameTab[ctxt->nameNr] = NULL;
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
/**
 * htmlNodeInfoPush:
 * @ctxt:  an HTML parser context
 * @value:  the node info
 *
 * Pushes a copy of @value on top of the node info stack, growing the
 * stack storage when it is full. If growing fails, the existing stack
 * and its recorded capacity are left untouched and a memory error is
 * reported through htmlErrMemory().
 *
 * Returns 0 in case of error, the index in the stack otherwise. Note
 * that the first successful push also returns index 0.
 */
static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
{
    if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
        htmlParserNodeInfo *grown;
        int newMax;

        /* An empty stack starts from a base of 5, doubled like any other. */
        newMax = (ctxt->nodeInfoMax == 0) ? 5 : ctxt->nodeInfoMax;
        newMax *= 2;

        /*
         * Reallocate through a temporary so that a failure neither leaks
         * the old table nor leaves a NULL table with a non-zero capacity.
         */
        grown = (htmlParserNodeInfo *)
                xmlRealloc(ctxt->nodeInfoTab,
                           newMax * sizeof(ctxt->nodeInfoTab[0]));
        if (grown == NULL) {
            htmlErrMemory(ctxt, NULL);
            return (0);
        }
        ctxt->nodeInfoTab = grown;
        ctxt->nodeInfoMax = newMax;
    }

    ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
    ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
    return (ctxt->nodeInfoNr++);
}
/**
 * htmlNodeInfoPop:
 * @ctxt:  an HTML parser context
 *
 * Pops the top entry from the node info stack and makes the entry below
 * it (or NULL when the stack becomes empty) the current node info.
 *
 * Returns NULL if the stack is empty, otherwise a pointer to the popped
 * entry, which is still located inside the stack's storage.
 */
static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
{
    if (ctxt->nodeInfoNr <= 0)
        return (NULL);

    /* nodeInfoNr was > 0, so it cannot become negative here. */
    ctxt->nodeInfoNr--;

    if (ctxt->nodeInfoNr > 0)
        ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
    else
        ctxt->nodeInfo = NULL;

    return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
}
/*
* Macros for accessing the content. Those should be used only by the parser,
* and not exported.
*
* Dirty macros, i.e. one need to make assumption on the context to use them
*
* CUR_PTR return the current pointer to the xmlChar to be parsed.
* CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
* in ISO-Latin or UTF-8, and the current 16 bit value if compiled
* in UNICODE mode. This should be used internally by the parser
* only to compare to ASCII values otherwise it would break when
* running with UTF-8 encoding.
* NXT(n) returns the n'th next xmlChar. Same as CUR it should be used only
* to compare on ASCII based substring.
* UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
* it should be used only to compare on ASCII based substring.
* SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
* strings without newlines within the parser.
*
* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
*
* CURRENT Returns the current char value, with the full decoding of
* UTF-8 if we are using this mode. It returns an int.
* NEXT Skip to the next character, this does the proper decoding
* in UTF-8 mode. It also pop-up unfinished entities on the fly.
* NEXTL(l) Skip the current unicode character of l xmlChars long.
* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
*/
/*
 * All of the macros below expand to expressions or statements that
 * reference a variable named ctxt, which must be in scope at the call site.
 */
/* toupper() applied to the byte at the current input position. */
#define UPPER (toupper(*ctxt->input->cur))
/* Advance both the input cursor and the column counter by val. */
#define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)
/* Byte located val positions after the current input position. */
#define NXT(val) ctxt->input->cur[(val)]
/* Same as NXT(val), passed through toupper(). */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))
/* The current input cursor. */
#define CUR_PTR ctxt->input->cur
/* The base pointer of the current input. */
#define BASE_PTR ctxt->input->base
/*
 * Call xmlParserInputShrink() once the cursor is more than 2 * INPUT_CHUNK
 * past the base while fewer than 2 * INPUT_CHUNK bytes remain before end.
 * NOTE(review): expands to a bare "if" statement, so an "else" following
 * the macro at a call site would bind to it.
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
(ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
xmlParserInputShrink(ctxt->input)
/*
 * When ctxt->progressive is 0 and fewer than INPUT_CHUNK bytes remain
 * before end, call xmlParserInputGrow(). Also expands to a bare "if".
 */
#define GROW if ((ctxt->progressive == 0) && \
(ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
/* The byte at the current input position, as an int. */
#define CURRENT ((int) (*ctxt->input->cur))
/* Shorthand for htmlSkipBlankChars(ctxt). */
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
/* The byte at the current input position, as an int (ctxt->token is not consulted). */
#define CUR ((int) (*ctxt->input->cur))
/* Shorthand for xmlNextChar(ctxt). */
#define NEXT xmlNextChar(ctxt)
/* -1 when ctxt->token is non-zero, otherwise the byte at the cursor. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
#define NEXTL(l) do { \
if (*(ctxt->input->cur) == '\n') { \
ctxt->input->line++; ctxt->input->col = 1; \
} else ctxt->input->col++; \
ctxt->token = 0; ctxt->input->cur += l; \
/************
\
if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
************/
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
#define COPY_BUF(l,b,i,v) \
if (l == 1) b[i++] = (xmlChar) v; \
else i += xmlCopyChar(l,&b[i],v)
/**
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
* htmlFindEncoding:
* @ctxt: the HTML parser context
*
* Try to find an encoding in the current data available in the input
* buffer. This is needed to try to switch to the proper encoding when
* one faces a character error.
* That's a heuristic: since it operates outside of parsing it could
* try to use a meta which had been commented out; that's the reason it
* should only be used in case of error, not as a default.
*
* Returns an encoding string or NULL if not found; the string needs to
* be freed
*/
static xmlChar *
htmlFindEncoding(xmlParserCtxtPtr ctxt) {
    /* Markers looked up one after another, each search resuming at the previous hit. */
    static const char *const markers[] = {
        "HTTP-EQUIV", "CONTENT", "CHARSET="
    };
    const xmlChar *pos, *name, *limit;
    int i;

    /* Only guess when no encoding is known and no encoder is installed. */
    if (ctxt == NULL)
        return(NULL);
    if (ctxt->input == NULL)
        return(NULL);
    if (ctxt->input->encoding != NULL)
        return(NULL);
    if (ctxt->input->buf == NULL)
        return(NULL);
    if (ctxt->input->buf->encoder != NULL)
        return(NULL);
    if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
        return(NULL);

    pos = ctxt->input->cur;
    limit = ctxt->input->end;

    /* The string searches below rely on the buffer being zero terminated. */
    if (*limit != 0)
        return(NULL);

    for (i = 0; i < 3; i++) {
        pos = xmlStrcasestr(pos, BAD_CAST markers[i]);
        if (pos == NULL)
            return(NULL);
    }

    /* Step over "CHARSET=" (8 characters) to reach the encoding name. */
    pos += 8;
    name = pos;

    /* The name is the longest run of letters, digits, '-', '_', ':' and '/'. */
    for (;;) {
        xmlChar c = *pos;
        int isLetter = ((c >= 'A') && (c <= 'Z')) ||
                       ((c >= 'a') && (c <= 'z'));
        int isDigit = (c >= '0') && (c <= '9');
        int isExtra = (c == '-') || (c == '_') || (c == ':') || (c == '/');

        if (!isLetter && !isDigit && !isExtra)
            break;
        pos++;
    }

    if (pos == name)
        return(NULL);
    return(xmlStrndup(name, pos - name));
}
/**
* htmlCurrentChar:
* @ctxt: the HTML parser context
* @len: pointer to the length of the char read
*
* The current char value, if using UTF-8 this may actually span multiple
* bytes in the input buffer. Implement the end of line normalization:
* 2.11 End-of-Line Handling
* If the encoding is unspecified, in the case we find an ISO-Latin-1
* char, then the encoding converter is plugged in automatically.
*
* Returns the current char value and its length
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
const unsigned char *cur;
unsigned char c;
unsigned int val;
if (ctxt->instate == XML_PARSER_EOF)
return(0);
if (ctxt->token != 0) {
*len = 0;
return(ctxt->token);
if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
xmlChar * guess;
xmlCharEncodingHandlerPtr handler;
/*
* Assume it's a fixed length encoding (1) with
* a compatible encoding for the ASCII set, since
* HTML constructs only use < 128 chars
*/
if ((int) *ctxt->input->cur < 0x80) {
*len = 1;
if ((*ctxt->input->cur == 0) &&
(ctxt->input->cur < ctxt->input->end)) {
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
"Char 0x%X out of allowed range\n", 0);
return(' ');
}
return((int) *ctxt->input->cur);
}
/*
* Humm this is bad, do an automatic flow conversion
*/
guess = htmlFindEncoding(ctxt);
if (guess == NULL) {
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
} else {
if (ctxt->input->encoding != NULL)
xmlFree((xmlChar *) ctxt->input->encoding);
ctxt->input->encoding = guess;
handler = xmlFindCharEncodingHandler((const char *) guess);
if (handler != NULL) {
/*
* Don't use UTF-8 encoder which isn't required and
* can produce invalid UTF-8.
*/
if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
xmlSwitchToEncoding(ctxt, handler);
} else {
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
"Unsupported encoding %s", guess, NULL);
}
}
ctxt->charset = XML_CHAR_ENCODING_UTF8;
}
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
/*
* We are supposed to handle UTF8, check it's valid
* From rfc2044: encoding of the Unicode values on UTF-8:
*
* UCS-4 range (hex.) UTF-8 octet sequence (binary)
* 0000 0000-0000 007F 0xxxxxxx
* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
*
* Check for the 0x110000 limit too
*/
cur = ctxt->input->cur;
c = *cur;
if (c & 0x80) {
if ((c & 0x40) == 0)
goto encoding_error;
if (cur[1] == 0) {
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
cur = ctxt->input->cur;
}
if ((cur[1] & 0xc0) != 0x80)
goto encoding_error;
if ((c & 0xe0) == 0xe0) {
if (cur[2] == 0) {
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
cur = ctxt->input->cur;
}
if ((cur[2] & 0xc0) != 0x80)
goto encoding_error;
if ((c & 0xf0) == 0xf0) {
if (cur[3] == 0) {
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
cur = ctxt->input->cur;
}
if (((c & 0xf8) != 0xf0) ||
((cur[3] & 0xc0) != 0x80))
goto encoding_error;
/* 4-byte code */
*len = 4;
val = (cur[0] & 0x7) << 18;
val |= (cur[1] & 0x3f) << 12;
val |= (cur[2] & 0x3f) << 6;
val |= cur[3] & 0x3f;
if (val < 0x10000)
goto encoding_error;
} else {
/* 3-byte code */
*len = 3;
val = (cur[0] & 0xf) << 12;
val |= (cur[1] & 0x3f) << 6;
val |= cur[2] & 0x3f;
if (val < 0x800)
goto encoding_error;
}
} else {
/* 2-byte code */
*len = 2;
val = (cur[0] & 0x1f) << 6;
val |= cur[1] & 0x3f;
if (val < 0x80)
goto encoding_error;
}
if (!IS_CHAR(val)) {
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
"Char 0x%X out of allowed range\n", val);
}
return(val);
} else {
if ((*ctxt->input->cur == 0) &&
(ctxt->input->cur < ctxt->input->end)) {
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
"Char 0x%X out of allowed range\n", 0);
*len = 1;
return(' ');
}
/* 1-byte code */
*len = 1;
return((int) *ctxt->input->cur);
}
encoding_error:
/*
* If we detect an UTF8 error that probably mean that the
* input encoding didn't get properly advertised in the
* declaration header. Report the error and switch the encoding
* to ISO-Latin-1 (if you don't like this policy, just declare the
* encoding !)
*/
{
char buffer[150];
Daniel Veillard
committed
if (ctxt->input->end - ctxt->input->cur >= 4) {
snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
ctxt->input->cur[0], ctxt->input->cur[1],
ctxt->input->cur[2], ctxt->input->cur[3]);
} else {
snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
}
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
"Input is not proper UTF-8, indicate encoding !\n",
BAD_CAST buffer, NULL);
/*
* Don't switch encodings twice. Note that if there's an encoder, we
* shouldn't receive invalid UTF-8 anyway.
*
* Note that if ctxt->input->buf == NULL, switching encodings is
* impossible, see Gitlab issue #34.
*/
if ((ctxt->input->buf != NULL) &&
(ctxt->input->buf->encoder == NULL))
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
*len = 1;
return((int) *ctxt->input->cur);
}
/**
* htmlSkipBlankChars:
* @ctxt: the HTML parser context
*
* skip all blanks character found at that point in the input streams.
*
* Returns the number of space chars skipped
*/
htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
/* Count of loop iterations so far; this is the value returned. */
int res = 0;
/* Loop for as long as the byte under the cursor satisfies IS_BLANK_CH. */
while (IS_BLANK_CH(*(ctxt->input->cur))) {
/*
 * Zero byte under the cursor and xmlParserInputGrow() returned <= 0:
 * pop the current input.
 */
if ((*ctxt->input->cur == 0) &&
(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
xmlPopInput(ctxt);
} else {
/* Update line/column bookkeeping for the byte about to be skipped. */
if (*(ctxt->input->cur) == '\n') {
ctxt->input->line++; ctxt->input->col = 1;
} else ctxt->input->col++;
ctxt->input->cur++;
/* A zero byte after the step: ask the input to grow. */
if (*ctxt->input->cur == 0)
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
}
/* Saturate the counter at INT_MAX instead of overflowing. */
if (res < INT_MAX)
res++;
}
return(res);
}
/************************************************************************
* *
* The list of HTML elements and their properties *
* *
************************************************************************/
/*
* Start Tag: 1 means the start tag can be omitted
* End Tag: 1 means the end tag can be omitted
* 2 means it's forbidden (empty elements)
* 3 means the tag is stylistic and should be closed easily
* Depr: this element is deprecated
* DTD: 1 means that this element is valid only in the Loose DTD
* 2 means that this element is valid only in the Frameset DTD
*
* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
* , subElements , impliedsubelt , Attributes, userdata
*/
/* Definitions and a couple of vars for HTML Elements */
/*
 * Element-name groups used to build the content lists below. Each group
 * macro expands to a comma-separated list of string literals (or to
 * nothing), and the matching NB_ macro gives the number of names in it.
 * The derived NB_ sums are parenthesized so that they can be used safely
 * inside larger expressions.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
/* BLOCK lists 14 names of its own in addition to HEADING and LIST. */
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
/* PCDATA and MODIFIER contribute no element names. */
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;
/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
/* ... and for HTML Attributes */
/*
 * Attribute-name groups used to build the attribute lists below. Each
 * group macro expands to a comma-separated list of string literals and
 * the matching NB_ macro gives the number of names in it.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
/* Sum of the three groups making up ATTRS, parenthesized for safe reuse. */
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
/* Other declarations that should go inline ... */
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
"tabindex", "onfocus", "onblur", NULL } ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
static const char* const inline_p[] = { INLINE, "p", NULL } ;
static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = { COREATTRS , "codebase",
"archive", "alt", "name", "height", "width", "align",
"hspace", "vspace", NULL } ;
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const basefont_attrs[] =
{ "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = { "background", "bgcolor", "text",
"link", "vlink", "alink", NULL } ;
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
static const char* const fieldset_contents[] = { FLOW, "legend" } ;
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* const tr_contents[] = { "th", "td", NULL } ;
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* const li_elt[] = { "li", NULL } ;
static const char* const ul_depr[] = { "type", "compact", NULL} ;
static const char* const dir_attr[] = { "dir", NULL} ;
#define DECL (const char**)
static const htmlElemDesc
html40ElementTable[] = {
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
},
{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
DECL inline_p , NULL , DECL html_attrs, NULL, NULL
},
{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
},
{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
},
{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
},
{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
EMPTY , NULL , NULL, DECL basefont_attrs, NULL
},
{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
},
{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
},
{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
},
{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
},
{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
},
{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
DECL html_flow , NULL , NULL, DECL html_attrs, NULL
},
{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
EMPTY , NULL , DECL col_attrs , NULL, NULL
},
{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
DECL col_elt , "col" , DECL col_attrs , NULL, NULL
},
{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
DECL html_flow , NULL , DECL html_attrs, NULL, NULL
},
{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
},
{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
},
{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
},
{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
EMPTY, NULL, DECL embed_attrs, NULL, NULL
},
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
},
{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
DECL html_inline, NULL, NULL, DECL font_attrs, NULL
},
{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
},
{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
EMPTY, NULL, NULL, DECL frame_attrs, NULL
},
{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
},
{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
DECL head_contents, NULL, DECL head_attrs, NULL, NULL
},
{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
},
{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
},
{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
},
{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
},
{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
},
{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
},
{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
EMPTY, NULL, NULL, DECL prompt_attrs, NULL
},
{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
},
{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
},
{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
DECL html_flow, NULL, DECL html_attrs, NULL, NULL
},
{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
},
{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
},
{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
},
{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
},
{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
},
{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
DECL html_flow, "div", DECL html_attrs, NULL, NULL
},
{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
},
{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
},
{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
},
{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
},
{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
},
{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
},
{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
},
{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
},
{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
DECL select_content, NULL, DECL select_attrs, NULL, NULL
},
{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},