HTMLparser.c@ 106165

最後變更在這個檔案從106165是 105420,由 vboxsync 提交於 4 月前
libxml2-2.12.6: Applied and adjusted our libxml2 changes to 2.12.6. bugref:10730
屬性 svn:eol-style 設為 `native`
檔案大小: 192.3 KB

行
1	/*
2	* HTMLparser.c : an HTML 4.0 non-verifying parser
3	*
4	* See Copyright for the status of this software.
5	*
6	* [email protected]
7	*/
8
9	#define IN_LIBXML
10	#include "libxml.h"
11	#ifdef LIBXML_HTML_ENABLED
12
13	#include <string.h>
14	#include <ctype.h>
15	#include <stdlib.h>
16
17	#include <libxml/HTMLparser.h>
18	#include <libxml/xmlmemory.h>
19	#include <libxml/tree.h>
20	#include <libxml/parser.h>
21	#include <libxml/parserInternals.h>
22	#include <libxml/xmlerror.h>
23	#include <libxml/HTMLtree.h>
24	#include <libxml/entities.h>
25	#include <libxml/encoding.h>
26	#include <libxml/xmlIO.h>
27	#include <libxml/uri.h>
28
29	#include "private/buf.h"
30	#include "private/enc.h"
31	#include "private/error.h"
32	#include "private/html.h"
33	#include "private/io.h"
34	#include "private/parser.h"
35	#include "private/tree.h"
36
37	#define HTML_MAX_NAMELEN 1000 /* NOTE(review): name suggests a cap on name length; use sites are outside this chunk - confirm */
38	#define HTML_PARSER_BIG_BUFFER_SIZE 1000 /* NOTE(review): presumably a scratch-buffer size; use sites not visible here */
39	#define HTML_PARSER_BUFFER_SIZE 100 /* NOTE(review): presumably a smaller scratch-buffer size; use sites not visible here */
40
41	static int htmlOmittedDefaultValue = 1; /* file-local flag, initialised to 1; its readers are outside this chunk */
42
43	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, /* non-static prototype; definition is outside this chunk */
44	xmlChar end, xmlChar end2, xmlChar end3);
45	static void htmlParseComment(htmlParserCtxtPtr ctxt); /* forward declaration; definition is outside this chunk */
46
47	/************************************************************************
48	* *
49	* Some factorized error routines *
50	* *
51	************************************************************************/
52
53	/**
54	* htmlErrMemory:
55	* @ctxt: an HTML parser context
56	* @extra: extra information
57	*
58	* Handle a redefinition of attribute error
59	*/
60	static void
61	htmlErrMemory(xmlParserCtxtPtr ctxt)
62	{
63	xmlCtxtErrMemory(ctxt);
64	}
65
66	/**
67	* htmlParseErr:
68	* @ctxt: an HTML parser context
69	* @error: the error number
70	* @msg: the error message
71	* @str1: string infor
72	* @str2: string infor
73	*
74	* Handle a fatal parser error, i.e. violating Well-Formedness constraints
75	*/
76	static void LIBXML_ATTR_FORMAT(3,0)
77	htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
78	const char msg, const xmlChar str1, const xmlChar *str2)
79	{
80	xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
81	str1, str2, NULL, 0, msg, str1, str2);
82	}
83
84	/**
85	* htmlParseErrInt:
86	* @ctxt: an HTML parser context
87	* @error: the error number
88	* @msg: the error message
89	* @val: integer info
90	*
91	* Handle a fatal parser error, i.e. violating Well-Formedness constraints
92	*/
93	static void LIBXML_ATTR_FORMAT(3,0)
94	htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
95	const char *msg, int val)
96	{
97	xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
98	NULL, NULL, NULL, val, msg, val);
99	}
100
101	/************************************************************************
102	* *
103	* Parser stacks related functions and macros *
104	* *
105	************************************************************************/
106
107	/**
108	* htmlnamePush:
109	* @ctxt: an HTML parser context
110	* @value: the element name
111	*
112	* Pushes a new element name on top of the name stack
113	*
114	* Returns -1 in case of error, the index in the stack otherwise
115	*/
116	static int
117	htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
118	{
119	if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
120	ctxt->html = 3;
121	if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
122	ctxt->html = 10;
123	if (ctxt->nameNr >= ctxt->nameMax) {
124	size_t newSize = ctxt->nameMax * 2;
125	const xmlChar **tmp;
126
127	tmp = xmlRealloc((xmlChar **) ctxt->nameTab,
128	newSize * sizeof(ctxt->nameTab[0]));
129	if (tmp == NULL) {
130	htmlErrMemory(ctxt);
131	return (-1);
132	}
133	ctxt->nameTab = tmp;
134	ctxt->nameMax = newSize;
135	}
136	ctxt->nameTab[ctxt->nameNr] = value;
137	ctxt->name = value;
138	return (ctxt->nameNr++);
139	}
140	/**
141	* htmlnamePop:
142	* @ctxt: an HTML parser context
143	*
144	* Pops the top element name from the name stack
145	*
146	* Returns the name just removed
147	*/
148	static const xmlChar *
149	htmlnamePop(htmlParserCtxtPtr ctxt)
150	{
151	const xmlChar *ret;
152
153	if (ctxt->nameNr <= 0)
154	return (NULL);
155	ctxt->nameNr--;
156	if (ctxt->nameNr < 0)
157	return (NULL);
158	if (ctxt->nameNr > 0)
159	ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
160	else
161	ctxt->name = NULL;
162	ret = ctxt->nameTab[ctxt->nameNr];
163	ctxt->nameTab[ctxt->nameNr] = NULL;
164	return (ret);
165	}
166
167	/**
168	* htmlNodeInfoPush:
169	* @ctxt: an HTML parser context
170	* @value: the node info
171	*
172	* Pushes a new element name on top of the node info stack
173	*
174	* Returns 0 in case of error, the index in the stack otherwise
175	*/
176	static int
177	htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
178	{
179	if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
180	if (ctxt->nodeInfoMax == 0)
181	ctxt->nodeInfoMax = 5;
182	ctxt->nodeInfoMax *= 2;
183	ctxt->nodeInfoTab = (htmlParserNodeInfo *)
184	xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
185	ctxt->nodeInfoMax *
186	sizeof(ctxt->nodeInfoTab[0]));
187	if (ctxt->nodeInfoTab == NULL) {
188	htmlErrMemory(ctxt);
189	return (0);
190	}
191	}
192	ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
193	ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
194	return (ctxt->nodeInfoNr++);
195	}
196
197	/**
198	* htmlNodeInfoPop:
199	* @ctxt: an HTML parser context
200	*
201	* Pops the top element name from the node info stack
202	*
203	* Returns 0 in case of error, the pointer to NodeInfo otherwise
204	*/
205	static htmlParserNodeInfo *
206	htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
207	{
208	if (ctxt->nodeInfoNr <= 0)
209	return (NULL);
210	ctxt->nodeInfoNr--;
211	if (ctxt->nodeInfoNr < 0)
212	return (NULL);
213	if (ctxt->nodeInfoNr > 0)
214	ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
215	else
216	ctxt->nodeInfo = NULL;
217	return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
218	}
219
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named `ctxt` in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR  the current pointer to the xmlChar to be parsed.
 *   CUR      the xmlChar at the current position. It should only be
 *            compared to ASCII values, otherwise it would break when
 *            running with UTF-8 encoding.
 *   NXT(n)   the n'th next xmlChar. Same as CUR, it should be used only
 *            to compare on ASCII based substrings.
 *   UPP(n)   the n'th next xmlChar converted to uppercase. Same as CUR,
 *            it should be used only to compare on ASCII based substrings.
 *   SKIP(n)  skip n xmlChars (advances both the cursor and the column);
 *            must only be used to skip ASCII strings without newlines.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   NEXT     skip to the next character via xmlNextChar().
 *   NEXTL(l) skip the current character, which is l xmlChars long,
 *            keeping the line/column counters up to date.
 *   COPY_BUF(l,b,i,v) append character v (l xmlChars long) to b at index i
 *            and advance i.
 */

#define UPPER (toupper(*ctxt->input->cur))

/* Parenthesised so that it behaves as a single expression at any call site. */
#define SKIP(val) (ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

/*
 * SHRINK and GROW expand to a bare `if` statement that already ends in a
 * semicolon. Do not use them as the body of an unbraced if/else.
 */
#define SHRINK \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserShrink(ctxt);

#define GROW \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserGrow(ctxt);

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR (*ctxt->input->cur)
#define NEXT xmlNextChar(ctxt)

#define RAW (*ctxt->input->cur)


#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->input->cur += (l); \
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))

/* Expands to an if/else statement; the caller supplies the final semicolon. */
#define COPY_BUF(l,b,i,v) \
    if ((l) == 1) (b)[(i)++] = (v); \
    else (i) += xmlCopyChar((l),&(b)[(i)],(v))
297
298	/**
299	* htmlFindEncoding:
300	* @the HTML parser context
301	*
302	* Ty to find and encoding in the current data available in the input
303	* buffer this is needed to try to switch to the proper encoding when
304	* one face a character error.
305	* That's an heuristic, since it's operating outside of parsing it could
306	* try to use a meta which had been commented out, that's the reason it
307	* should only be used in case of error, not as a default.
308	*
309	* Returns an encoding string or NULL if not found, the string need to
310	* be freed
311	*/
312	static xmlChar *
313	htmlFindEncoding(xmlParserCtxtPtr ctxt) {
314	const xmlChar start, cur, *end;
315	xmlChar *ret;
316
317	if ((ctxt == NULL) \|\| (ctxt->input == NULL) \|\|
318	(ctxt->input->flags & XML_INPUT_HAS_ENCODING))
319	return(NULL);
320	if ((ctxt->input->cur == NULL) \|\| (ctxt->input->end == NULL))
321	return(NULL);
322
323	start = ctxt->input->cur;
324	end = ctxt->input->end;
325	/* we also expect the input buffer to be zero terminated */
326	if (*end != 0)
327	return(NULL);
328
329	cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
330	if (cur == NULL)
331	return(NULL);
332	cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
333	if (cur == NULL)
334	return(NULL);
335	cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
336	if (cur == NULL)
337	return(NULL);
338	cur += 8;
339	start = cur;
340	while (((cur >= 'A') && (cur <= 'Z')) \|\|
341	((cur >= 'a') && (cur <= 'z')) \|\|
342	((cur >= '0') && (cur <= '9')) \|\|
343	(cur == '-') \|\| (cur == '_') \|\| (cur == ':') \|\| (cur == '/'))
344	cur++;
345	if (cur == start)
346	return(NULL);
347	ret = xmlStrndup(start, cur - start);
348	if (ret == NULL)
349	htmlErrMemory(ctxt);
350	return(ret);
351	}
352
353	/**
354	* htmlCurrentChar:
355	* @ctxt: the HTML parser context
356	* @len: pointer to the length of the char read
357	*
358	* The current char value, if using UTF-8 this may actually span multiple
359	* bytes in the input buffer. Implement the end of line normalization:
360	* 2.11 End-of-Line Handling
361	* If the encoding is unspecified, in the case we find an ISO-Latin-1
362	* char, then the encoding converter is plugged in automatically.
363	*
364	* Returns the current char value and its length
365	*/
366
367	static int
368	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
369	const unsigned char *cur;
370	unsigned char c;
371	unsigned int val;
372
373	if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)
374	xmlParserGrow(ctxt);
375
376	if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
377	xmlChar * guess;
378
379	/*
380	* Assume it's a fixed length encoding (1) with
381	* a compatible encoding for the ASCII set, since
382	* HTML constructs only use < 128 chars
383	*/
384	if (*ctxt->input->cur < 0x80) {
385	if (*ctxt->input->cur == 0) {
386	if (ctxt->input->cur < ctxt->input->end) {
387	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
388	"Char 0x%X out of allowed range\n", 0);
389	*len = 1;
390	return(' ');
391	} else {
392	*len = 0;
393	return(0);
394	}
395	}
396	*len = 1;
397	return(*ctxt->input->cur);
398	}
399
400	/*
401	* Humm this is bad, do an automatic flow conversion
402	*/
403	guess = htmlFindEncoding(ctxt);
404	if (guess == NULL) {
405	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
406	} else {
407	xmlSwitchEncodingName(ctxt, (const char *) guess);
408	xmlFree(guess);
409	}
410	ctxt->input->flags \|= XML_INPUT_HAS_ENCODING;
411	}
412
413	/*
414	* We are supposed to handle UTF8, check it's valid
415	* From rfc2044: encoding of the Unicode values on UTF-8:
416	*
417	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
418	* 0000 0000-0000 007F 0xxxxxxx
419	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
420	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
421	*
422	* Check for the 0x110000 limit too
423	*/
424	cur = ctxt->input->cur;
425	c = *cur;
426	if (c & 0x80) {
427	size_t avail;
428
429	if ((c & 0x40) == 0)
430	goto encoding_error;
431
432	avail = ctxt->input->end - ctxt->input->cur;
433
434	if ((avail < 2) \|\| ((cur[1] & 0xc0) != 0x80))
435	goto encoding_error;
436	if ((c & 0xe0) == 0xe0) {
437	if ((avail < 3) \|\| ((cur[2] & 0xc0) != 0x80))
438	goto encoding_error;
439	if ((c & 0xf0) == 0xf0) {
440	if (((c & 0xf8) != 0xf0) \|\|
441	(avail < 4) \|\| ((cur[3] & 0xc0) != 0x80))
442	goto encoding_error;
443	/* 4-byte code */
444	*len = 4;
445	val = (cur[0] & 0x7) << 18;
446	val \|= (cur[1] & 0x3f) << 12;
447	val \|= (cur[2] & 0x3f) << 6;
448	val \|= cur[3] & 0x3f;
449	if (val < 0x10000)
450	goto encoding_error;
451	} else {
452	/* 3-byte code */
453	*len = 3;
454	val = (cur[0] & 0xf) << 12;
455	val \|= (cur[1] & 0x3f) << 6;
456	val \|= cur[2] & 0x3f;
457	if (val < 0x800)
458	goto encoding_error;
459	}
460	} else {
461	/* 2-byte code */
462	*len = 2;
463	val = (cur[0] & 0x1f) << 6;
464	val \|= cur[1] & 0x3f;
465	if (val < 0x80)
466	goto encoding_error;
467	}
468	if (!IS_CHAR(val)) {
469	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
470	"Char 0x%X out of allowed range\n", val);
471	}
472	return(val);
473	} else {
474	if (*ctxt->input->cur == 0) {
475	if (ctxt->input->cur < ctxt->input->end) {
476	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
477	"Char 0x%X out of allowed range\n", 0);
478	*len = 1;
479	return(' ');
480	} else {
481	*len = 0;
482	return(0);
483	}
484	}
485	/* 1-byte code */
486	*len = 1;
487	return(*ctxt->input->cur);
488	}
489
490	encoding_error:
491	xmlCtxtErrIO(ctxt, XML_ERR_INVALID_ENCODING, NULL);
492
493	if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0)
494	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
495	*len = 1;
496	return(*ctxt->input->cur);
497	}
498
499	/**
500	* htmlSkipBlankChars:
501	* @ctxt: the HTML parser context
502	*
503	* skip all blanks character found at that point in the input streams.
504	*
505	* Returns the number of space chars skipped
506	*/
507
508	static int
509	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
510	int res = 0;
511
512	while (IS_BLANK_CH(*(ctxt->input->cur))) {
513	if (*(ctxt->input->cur) == '\n') {
514	ctxt->input->line++; ctxt->input->col = 1;
515	} else ctxt->input->col++;
516	ctxt->input->cur++;
517	if (*ctxt->input->cur == 0)
518	xmlParserGrow(ctxt);
519	if (res < INT_MAX)
520	res++;
521	}
522	return(res);
523	}
524
525
526
527	/************************************************************************
528	* *
529	* The list of HTML elements and their properties *
530	* *
531	************************************************************************/
532
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *      , subElements , impliedsubelt , Attributes, userdata
 */

/* Definitions and a couple of vars for HTML Elements */

/*
 * Each group macro expands to a comma separated list of element names;
 * the matching NB_* macro is the number of names in that list. Compound
 * counts are parenthesised so they can be used inside larger expressions.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated name lists */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
571
572
573	static const char* const html_flow[] = { FLOW, NULL } ;
574	static const char* const html_inline[] = { INLINE, NULL } ;
575
576	/* placeholders: elts with content but no subelements */
577	static const char* const html_pcdata[] = { NULL } ;
578	#define html_cdata html_pcdata
579
580
/* ... and for HTML Attributes */

/*
 * Each group macro expands to a comma separated list of attribute names;
 * the matching NB_* macro is the number of names in that list.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated attribute name lists */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
600
601
602	/* Other declarations that should go inline ... */
603	static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
604	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
605	"tabindex", "onfocus", "onblur", NULL } ;
606	static const char* const target_attr[] = { "target", NULL } ;
607	static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
608	static const char* const alt_attr[] = { "alt", NULL } ;
609	static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
610	static const char* const href_attrs[] = { "href", NULL } ;
611	static const char* const clear_attrs[] = { "clear", NULL } ;
612	static const char* const inline_p[] = { INLINE, "p", NULL } ;
613
614	static const char* const flow_param[] = { FLOW, "param", NULL } ;
615	static const char* const applet_attrs[] = { COREATTRS , "codebase",
616	"archive", "alt", "name", "height", "width", "align",
617	"hspace", "vspace", NULL } ;
618	static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
619	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
620	static const char* const basefont_attrs[] =
621	{ "id", "size", "color", "face", NULL } ;
622	static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
623	static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
624	static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
625	static const char* const body_depr[] = { "background", "bgcolor", "text",
626	"link", "vlink", "alink", NULL } ;
627	static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
628	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
629
630
631	static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
632	static const char* const col_elt[] = { "col", NULL } ;
633	static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
634	static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
635	static const char* const dl_contents[] = { "dt", "dd", NULL } ;
636	static const char* const compact_attr[] = { "compact", NULL } ;
637	static const char* const label_attr[] = { "label", NULL } ;
638	static const char* const fieldset_contents[] = { FLOW, "legend" } ;
639	static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
640	static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
641	static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
642	static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
643	static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
644	static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
645	static const char* const head_attrs[] = { I18N, "profile", NULL } ;
646	static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
647	static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
648	static const char* const version_attr[] = { "version", NULL } ;
649	static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
650	static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
651	static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
652	static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
653	static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
654	static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
655	static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
656	static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
657	static const char* const align_attr[] = { "align", NULL } ;
658	static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
659	static const char* const map_contents[] = { BLOCK, "area", NULL } ;
660	static const char* const name_attr[] = { "name", NULL } ;
661	static const char* const action_attr[] = { "action", NULL } ;
662	static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
663	static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
664	static const char* const content_attr[] = { "content", NULL } ;
665	static const char* const type_attr[] = { "type", NULL } ;
666	static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
667	static const char* const object_contents[] = { FLOW, "param", NULL } ;
668	static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
669	static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
670	static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
671	static const char* const option_elt[] = { "option", NULL } ;
672	static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
673	static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
674	static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
675	static const char* const width_attr[] = { "width", NULL } ;
676	static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
677	static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
678	static const char* const language_attr[] = { "language", NULL } ;
679	static const char* const select_content[] = { "optgroup", "option", NULL } ;
680	static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
681	static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
682	static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
683	static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
684	static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
685	static const char* const tr_elt[] = { "tr", NULL } ;
686	static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
687	static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
688	static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
689	static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
690	static const char* const tr_contents[] = { "th", "td", NULL } ;
691	static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
692	static const char* const li_elt[] = { "li", NULL } ;
693	static const char* const ul_depr[] = { "type", "compact", NULL} ;
694	static const char* const dir_attr[] = { "dir", NULL} ;
695
696	#define DECL (const char**)
697
698	static const htmlElemDesc
699	html40ElementTable[] = {
700	{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
701	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
702	},
703	{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
704	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
705	},
706	{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
707	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
708	},
709	{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
710	DECL inline_p , NULL , DECL html_attrs, NULL, NULL
711	},
712	{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
713	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
714	},
715	{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
716	EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
717	},
718	{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
719	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
720	},
721	{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
722	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
723	},
724	{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
725	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
726	},
727	{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
728	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
729	},
730	{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
731	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
732	},
733	{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
734	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
735	},
736	{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
737	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
738	},
739	{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
740	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
741	},
742	{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
743	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
744	},
745	{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
746	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
747	},
748	{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
749	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
750	},
751	{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
752	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
753	},
754	{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
755	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
756	},
757	{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
758	EMPTY , NULL , DECL col_attrs , NULL, NULL
759	},
760	{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
761	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
762	},
763	{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
764	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
765	},
766	{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
767	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
768	},
769	{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
770	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
771	},
772	{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
773	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
774	},
775	{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
776	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
777	},
778	{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
779	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
780	},
781	{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
782	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
783	},
784	{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
785	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
786	},
787	{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
788	EMPTY, NULL, DECL embed_attrs, NULL, NULL
789	},
790	{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
791	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
792	},
793	{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
794	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
795	},
796	{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
797	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
798	},
799	{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
800	EMPTY, NULL, NULL, DECL frame_attrs, NULL
801	},
802	{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
803	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
804	},
805	{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
806	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
807	},
808	{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
809	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
810	},
811	{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
812	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
813	},
814	{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
815	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
816	},
817	{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
818	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
819	},
820	{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
821	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
822	},
823	{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
824	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
825	},
826	{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
827	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
828	},
829	{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
830	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
831	},
832	{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
833	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
834	},
835	{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
836	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
837	},
838	{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
839	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
840	},
841	{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
842	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
843	},
844	{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
845	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
846	},
847	{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
848	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
849	},
850	{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
851	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
852	},
853	{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
854	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
855	},
856	{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
857	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
858	},
859	{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
860	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
861	},
862	{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
863	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
864	},
865	{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
866	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
867	},
868	{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
869	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
870	},
871	{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
872	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
873	},
874	{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
875	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
876	},
877	{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
878	DECL html_flow, "div", DECL html_attrs, NULL, NULL
879	},
880	{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
881	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
882	},
883	{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
884	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
885	},
886	{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
887	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
888	},
889	{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
890	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
891	},
892	{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
893	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
894	},
895	{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
896	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
897	},
898	{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
899	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
900	},
901	{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
902	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
903	},
904	{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
905	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
906	},
907	{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
908	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
909	},
910	{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
911	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
912	},
913	{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
914	DECL select_content, NULL, DECL select_attrs, NULL, NULL
915	},
916	{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
917	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
918	},
919	{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
920	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
921	},
922	{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
923	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
924	},
925	{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
926	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
927	},
928	{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
929	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
930	},
931	{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
932	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
933	},
934	{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
935	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
936	},
937	{ "table", 0, 0, 0, 0, 0, 0, 0, "",
938	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
939	},
940	{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
941	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
942	},
943	{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
944	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
945	},
946	{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
947	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
948	},
949	{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
950	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
951	},
952	{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
953	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
954	},
955	{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
956	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
957	},
958	{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
959	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
960	},
961	{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
962	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
963	},
964	{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
965	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
966	},
967	{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
968	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
969	},
970	{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
971	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
972	},
973	{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
974	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
975	}
976	};
977
/*
 * One auto-close rule: while the element `oldTag` is open, meeting the
 * start tag `newTag` implies the end of `oldTag`.
 */
typedef struct {
    const char *oldTag;     /* element that is currently open */
    const char *newTag;     /* start tag that has just been met */
} htmlStartCloseEntry;

/*
 * Start tags that imply the end of the current element.
 *
 * htmlCheckAutoClose() searches this table with bsearch(), so the entries
 * must stay sorted by oldTag first and newTag second, in strcmp() order.
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    /* a */
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    /* address */
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    /* b, big */
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    { "big", "p" },
    /* caption */
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    /* col */
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    /* colgroup */
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    /* dd, dir, dl, dt */
    { "dd", "dt" },
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    { "dl", "form" }, { "dl", "li" },
    { "dt", "dd" }, { "dt", "dl" },
    /* font, form */
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    { "form", "form" },
    /* h1 .. h6 */
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    /* head */
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" }, { "head", "h5" }, { "head", "h6" },
    { "head", "hr" }, { "head", "i" }, { "head", "iframe" },
    { "head", "img" }, { "head", "kbd" }, { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" }, { "head", "p" }, { "head", "pre" },
    { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" },
    { "head", "ul" }, { "head", "var" }, { "head", "xmp" },
    /* hr, i */
    { "hr", "form" },
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    /* legend, li, link */
    { "legend", "fieldset" },
    { "li", "li" },
    { "link", "body" }, { "link", "frameset" },
    /* listing */
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    /* menu, ol, option */
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" }, { "menu", "form" },
    { "menu", "ul" },
    { "ol", "form" },
    { "option", "optgroup" }, { "option", "option" },
    /* p */
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" },
    { "p", "div" }, { "p", "dl" }, { "p", "dt" },
    { "p", "fieldset" }, { "p", "form" }, { "p", "frameset" },
    { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" },
    { "p", "head" }, { "p", "hr" }, { "p", "li" },
    { "p", "listing" }, { "p", "menu" }, { "p", "ol" },
    { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" },
    { "p", "th" }, { "p", "title" }, { "p", "tr" },
    { "p", "ul" }, { "p", "xmp" },
    /* pre */
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" }, { "pre", "fieldset" },
    { "pre", "form" }, { "pre", "li" }, { "pre", "table" }, { "pre", "ul" },
    /* s, script, small, span, strike, style */
    { "s", "p" },
    { "script", "noscript" },
    { "small", "p" },
    { "span", "td" }, { "span", "th" },
    { "strike", "p" },
    { "style", "body" }, { "style", "frameset" },
    /* table parts */
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    { "tfoot", "tbody" },
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    { "thead", "tbody" }, { "thead", "tfoot" },
    /* title, tr, tt */
    { "title", "body" }, { "title", "frameset" },
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    { "tt", "p" },
    /* u, ul */
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" }, { "ul", "pre" },
    /* xmp */
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" }, { "xmp", "fieldset" },
    { "xmp", "form" }, { "xmp", "li" }, { "xmp", "table" }, { "xmp", "ul" }
};
1237
/*
 * Elements that are not supposed to hold character data directly: when
 * text shows up inside one of them, htmlCheckParagraph() implies a <p>
 * element first.  The list is terminated by a NULL entry.
 *
 * TODO: extend this list from the implied-paragraph rules of the
 *       HTML SGML DTD.
 */
static const char *const htmlNoContentElements[] = {
    "html", "head",
    NULL    /* end-of-list marker */
};
1250
/*
 * HTML attributes whose content is of type %Script;.
 *
 * NOTE: htmlIsScriptAttribute() rejects any name that does not start
 * with "on" before it consults this list, so check that function when
 * adding an entry.
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse events */
    "onclick", "ondblclick",
    "onmousedown", "onmouseup", "onmouseover", "onmousemove", "onmouseout",
    /* keyboard events */
    "onkeypress", "onkeydown", "onkeyup",
    /* document, focus and form events */
    "onload", "onunload",
    "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1276
/*
 * Priorities used by the HTML parser to cope with broken pages: an end
 * tag may only close elements whose priority is lower than or equal to
 * its own, which lets the parser decide what to do with stray end tags.
 */
typedef struct {
    const char *name;   /* element name, NULL on the terminating entry */
    int priority;       /* end-tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* terminator; doubles as the default priority */
};
1304
1305	/************************************************************************
1306	* *
1307	* functions to handle HTML specific data *
1308	* *
1309	************************************************************************/
1310
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This function does nothing; it has an empty body.
 */
void
htmlInitAutoClose(void)
{
    /* intentionally empty */
}
1319
1320	static int
1321	htmlCompareTags(const void key, const void member) {
1322	const xmlChar tag = (const xmlChar ) key;
1323	const htmlElemDesc desc = (const htmlElemDesc ) member;
1324
1325	return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1326	}
1327
1328	/**
1329	* htmlTagLookup:
1330	* @tag: The tag name in lowercase
1331	*
1332	* Lookup the HTML tag in the ElementTable
1333	*
1334	* Returns the related htmlElemDescPtr or NULL if not found.
1335	*/
1336	const htmlElemDesc *
1337	htmlTagLookup(const xmlChar *tag) {
1338	if (tag == NULL)
1339	return(NULL);
1340
1341	return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1342	sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1343	sizeof(htmlElemDesc), htmlCompareTags));
1344	}
1345
1346	/**
1347	* htmlGetEndPriority:
1348	* @name: The name of the element to look up the priority for.
1349	*
1350	* Return value: The "endtag" priority.
1351	**/
1352	static int
1353	htmlGetEndPriority (const xmlChar *name) {
1354	int i = 0;
1355
1356	while ((htmlEndPriority[i].name != NULL) &&
1357	(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1358	i++;
1359
1360	return(htmlEndPriority[i].priority);
1361	}
1362
1363
1364	static int
1365	htmlCompareStartClose(const void vkey, const void member) {
1366	const htmlStartCloseEntry key = (const htmlStartCloseEntry ) vkey;
1367	const htmlStartCloseEntry entry = (const htmlStartCloseEntry ) member;
1368	int ret;
1369
1370	ret = strcmp(key->oldTag, entry->oldTag);
1371	if (ret == 0)
1372	ret = strcmp(key->newTag, entry->newTag);
1373
1374	return(ret);
1375	}
1376
1377	/**
1378	* htmlCheckAutoClose:
1379	* @newtag: The new tag name
1380	* @oldtag: The old tag name
1381	*
1382	* Checks whether the new tag is one of the registered valid tags for
1383	* closing old.
1384	*
1385	* Returns 0 if no, 1 if yes.
1386	*/
1387	static int
1388	htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1389	{
1390	htmlStartCloseEntry key;
1391	void *res;
1392
1393	key.oldTag = (const char *) oldtag;
1394	key.newTag = (const char *) newtag;
1395	res = bsearch(&key, htmlStartClose,
1396	sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1397	sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1398	return(res != NULL);
1399	}
1400
1401	/**
1402	* htmlAutoCloseOnClose:
1403	* @ctxt: an HTML parser context
1404	* @newtag: The new tag name
1405	* @force: force the tag closure
1406	*
1407	* The HTML DTD allows an ending tag to implicitly close other tags.
1408	*/
1409	static void
1410	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1411	{
1412	const htmlElemDesc *info;
1413	int i, priority;
1414
1415	priority = htmlGetEndPriority(newtag);
1416
1417	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1418
1419	if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1420	break;
1421	/*
1422	* A misplaced endtag can only close elements with lower
1423	* or equal priority, so if we find an element with higher
1424	* priority before we find an element with
1425	* matching name, we just ignore this endtag
1426	*/
1427	if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1428	return;
1429	}
1430	if (i < 0)
1431	return;
1432
1433	while (!xmlStrEqual(newtag, ctxt->name)) {
1434	info = htmlTagLookup(ctxt->name);
1435	if ((info != NULL) && (info->endTag == 3)) {
1436	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1437	"Opening and ending tag mismatch: %s and %s\n",
1438	newtag, ctxt->name);
1439	}
1440	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1441	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1442	htmlnamePop(ctxt);
1443	}
1444	}
1445
1446	/**
1447	* htmlAutoCloseOnEnd:
1448	* @ctxt: an HTML parser context
1449	*
1450	* Close all remaining tags at the end of the stream
1451	*/
1452	static void
1453	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1454	{
1455	int i;
1456
1457	if (ctxt->nameNr == 0)
1458	return;
1459	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1460	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1461	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1462	htmlnamePop(ctxt);
1463	}
1464	}
1465
1466	/**
1467	* htmlAutoClose:
1468	* @ctxt: an HTML parser context
1469	* @newtag: The new tag name or NULL
1470	*
1471	* The HTML DTD allows a tag to implicitly close other tags.
1472	* The list is kept in htmlStartClose array. This function is
1473	* called when a new tag has been detected and generates the
1474	* appropriates closes if possible/needed.
1475	* If newtag is NULL this mean we are at the end of the resource
1476	* and we should check
1477	*/
1478	static void
1479	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1480	{
1481	if (newtag == NULL)
1482	return;
1483
1484	while ((ctxt->name != NULL) &&
1485	(htmlCheckAutoClose(newtag, ctxt->name))) {
1486	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1487	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1488	htmlnamePop(ctxt);
1489	}
1490	}
1491
1492	/**
1493	* htmlAutoCloseTag:
1494	* @doc: the HTML document
1495	* @name: The tag name
1496	* @elem: the HTML element
1497	*
1498	* The HTML DTD allows a tag to implicitly close other tags.
1499	* The list is kept in htmlStartClose array. This function checks
1500	* if the element or one of it's children would autoclose the
1501	* given tag.
1502	*
1503	* Returns 1 if autoclose, 0 otherwise
1504	*/
1505	int
1506	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1507	htmlNodePtr child;
1508
1509	if (elem == NULL) return(1);
1510	if (xmlStrEqual(name, elem->name)) return(0);
1511	if (htmlCheckAutoClose(elem->name, name)) return(1);
1512	child = elem->children;
1513	while (child != NULL) {
1514	if (htmlAutoCloseTag(doc, name, child)) return(1);
1515	child = child->next;
1516	}
1517	return(0);
1518	}
1519
1520	/**
1521	* htmlIsAutoClosed:
1522	* @doc: the HTML document
1523	* @elem: the HTML element
1524	*
1525	* The HTML DTD allows a tag to implicitly close other tags.
1526	* The list is kept in htmlStartClose array. This function checks
1527	* if a tag is autoclosed by one of it's child
1528	*
1529	* Returns 1 if autoclosed, 0 otherwise
1530	*/
1531	int
1532	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1533	htmlNodePtr child;
1534
1535	if (elem == NULL) return(1);
1536	child = elem->children;
1537	while (child != NULL) {
1538	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1539	child = child->next;
1540	}
1541	return(0);
1542	}
1543
1544	/**
1545	* htmlCheckImplied:
1546	* @ctxt: an HTML parser context
1547	* @newtag: The new tag name
1548	*
1549	* The HTML DTD allows a tag to exists only implicitly
1550	* called when a new tag has been detected and generates the
1551	* appropriates implicit tags if missing
1552	*/
1553	static void
1554	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1555	int i;
1556
1557	if (ctxt->options & HTML_PARSE_NOIMPLIED)
1558	return;
1559	if (!htmlOmittedDefaultValue)
1560	return;
1561	if (xmlStrEqual(newtag, BAD_CAST"html"))
1562	return;
1563	if (ctxt->nameNr <= 0) {
1564	htmlnamePush(ctxt, BAD_CAST"html");
1565	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1566	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1567	}
1568	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
1569	return;
1570	if ((ctxt->nameNr <= 1) &&
1571	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
1572	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
1573	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
1574	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
1575	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
1576	(xmlStrEqual(newtag, BAD_CAST"base")))) {
1577	if (ctxt->html >= 3) {
1578	/* we already saw or generated an <head> before */
1579	return;
1580	}
1581	/*
1582	* dropped OBJECT ... i you put it first BODY will be
1583	* assumed !
1584	*/
1585	htmlnamePush(ctxt, BAD_CAST"head");
1586	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1587	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1588	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1589	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1590	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1591	if (ctxt->html >= 10) {
1592	/* we already saw or generated a <body> before */
1593	return;
1594	}
1595	for (i = 0;i < ctxt->nameNr;i++) {
1596	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1597	return;
1598	}
1599	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1600	return;
1601	}
1602	}
1603
1604	htmlnamePush(ctxt, BAD_CAST"body");
1605	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1606	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1607	}
1608	}
1609
1610	/**
1611	* htmlCheckParagraph
1612	* @ctxt: an HTML parser context
1613	*
1614	* Check whether a p element need to be implied before inserting
1615	* characters in the current element.
1616	*
1617	* Returns 1 if a paragraph has been inserted, 0 if not and -1
1618	* in case of error.
1619	*/
1620
1621	static int
1622	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1623	const xmlChar *tag;
1624	int i;
1625
1626	if (ctxt == NULL)
1627	return(-1);
1628	tag = ctxt->name;
1629	if (tag == NULL) {
1630	htmlAutoClose(ctxt, BAD_CAST"p");
1631	htmlCheckImplied(ctxt, BAD_CAST"p");
1632	htmlnamePush(ctxt, BAD_CAST"p");
1633	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1634	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1635	return(1);
1636	}
1637	if (!htmlOmittedDefaultValue)
1638	return(0);
1639	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1640	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1641	htmlAutoClose(ctxt, BAD_CAST"p");
1642	htmlCheckImplied(ctxt, BAD_CAST"p");
1643	htmlnamePush(ctxt, BAD_CAST"p");
1644	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1645	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1646	return(1);
1647	}
1648	}
1649	return(0);
1650	}
1651
1652	/**
1653	* htmlIsScriptAttribute:
1654	* @name: an attribute name
1655	*
1656	* Check if an attribute is of content type Script
1657	*
1658	* Returns 1 is the attribute is a script 0 otherwise
1659	*/
1660	int
1661	htmlIsScriptAttribute(const xmlChar *name) {
1662	unsigned int i;
1663
1664	if (name == NULL)
1665	return(0);
1666	/*
1667	* all script attributes start with 'on'
1668	*/
1669	if ((name[0] != 'o') \|\| (name[1] != 'n'))
1670	return(0);
1671	for (i = 0;
1672	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1673	i++) {
1674	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1675	return(1);
1676	}
1677	return(0);
1678	}
1679
1680	/************************************************************************
1681	* *
1682	* The list of HTML predefined entities *
1683	* *
1684	************************************************************************/
1685
1686
1687	static const htmlEntityDesc html40EntitiesTable[] = {
1688	/*
1689	* the 4 absolute ones, plus apostrophe.
1690	*/
1691	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1692	{ 38, "amp", "ampersand, U+0026 ISOnum" },
1693	{ 39, "apos", "single quote" },
1694	{ 60, "lt", "less-than sign, U+003C ISOnum" },
1695	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1696
1697	/*
1698	* A bunch still in the 128-255 range
1699	* Replacing them depend really on the charset used.
1700	*/
1701	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1702	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1703	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1704	{ 163, "pound","pound sign, U+00A3 ISOnum" },
1705	{ 164, "curren","currency sign, U+00A4 ISOnum" },
1706	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1707	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1708	{ 167, "sect", "section sign, U+00A7 ISOnum" },
1709	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1710	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1711	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1712	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1713	{ 172, "not", "not sign, U+00AC ISOnum" },
1714	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1715	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1716	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1717	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1718	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1719	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1720	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1721	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1722	{ 181, "micro","micro sign, U+00B5 ISOnum" },
1723	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1724	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1725	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1726	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1727	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1728	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1729	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1730	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1731	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1732	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1733	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1734	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1735	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1736	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1737	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1738	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1739	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1740	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1741	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1742	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1743	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1744	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1745	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1746	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1747	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1748	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1749	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1750	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1751	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1752	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1753	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1754	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1755	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1756	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1757	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1758	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1759	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1760	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1761	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1762	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1763	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1764	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1765	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1766	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1767	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1768	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1769	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1770	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1771	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1772	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1773	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1774	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1775	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1776	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1777	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1778	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1779	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1780	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1781	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1782	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1783	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1784	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1785	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1786	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1787	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1788	{ 247, "divide","division sign, U+00F7 ISOnum" },
1789	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1790	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1791	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1792	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1793	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1794	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1795	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1796	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1797
1798	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1799	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1800	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1801	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1802	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1803
1804	/*
1805	* Anything below should really be kept as entities references
1806	*/
1807	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1808
1809	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1810	{ 732, "tilde","small tilde, U+02DC ISOdia" },
1811
1812	{ 913, "Alpha","greek capital letter alpha, U+0391" },
1813	{ 914, "Beta", "greek capital letter beta, U+0392" },
1814	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1815	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1816	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1817	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1818	{ 919, "Eta", "greek capital letter eta, U+0397" },
1819	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1820	{ 921, "Iota", "greek capital letter iota, U+0399" },
1821	{ 922, "Kappa","greek capital letter kappa, U+039A" },
1822	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1823	{ 924, "Mu", "greek capital letter mu, U+039C" },
1824	{ 925, "Nu", "greek capital letter nu, U+039D" },
1825	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1826	{ 927, "Omicron","greek capital letter omicron, U+039F" },
1827	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1828	{ 929, "Rho", "greek capital letter rho, U+03A1" },
1829	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1830	{ 932, "Tau", "greek capital letter tau, U+03A4" },
1831	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1832	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1833	{ 935, "Chi", "greek capital letter chi, U+03A7" },
1834	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1835	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1836
1837	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1838	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1839	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1840	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1841	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1842	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1843	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1844	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1845	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1846	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1847	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1848	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1849	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1850	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1851	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1852	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1853	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1854	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1855	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1856	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1857	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1858	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1859	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1860	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1861	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1862	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1863	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1864	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1865
1866	{ 8194, "ensp", "en space, U+2002 ISOpub" },
1867	{ 8195, "emsp", "em space, U+2003 ISOpub" },
1868	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1869	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1870	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1871	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1872	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1873	{ 8211, "ndash","en dash, U+2013 ISOpub" },
1874	{ 8212, "mdash","em dash, U+2014 ISOpub" },
1875	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1876	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1877	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1878	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1879	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1880	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1881	{ 8224, "dagger","dagger, U+2020 ISOpub" },
1882	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1883
1884	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1885	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1886
1887	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1888
1889	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1890	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1891
1892	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1893	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1894
1895	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1896	{ 8260, "frasl","fraction slash, U+2044 NEW" },
1897
1898	{ 8364, "euro", "euro sign, U+20AC NEW" },
1899
1900	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1901	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1902	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1903	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1904	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1905	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1906	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1907	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1908	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1909	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1910	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1911	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1912	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1913	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1914	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1915	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1916
1917	{ 8704, "forall","for all, U+2200 ISOtech" },
1918	{ 8706, "part", "partial differential, U+2202 ISOtech" },
1919	{ 8707, "exist","there exists, U+2203 ISOtech" },
1920	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1921	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1922	{ 8712, "isin", "element of, U+2208 ISOtech" },
1923	{ 8713, "notin","not an element of, U+2209 ISOtech" },
1924	{ 8715, "ni", "contains as member, U+220B ISOtech" },
1925	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1926	{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1927	{ 8722, "minus","minus sign, U+2212 ISOtech" },
1928	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1929	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1930	{ 8733, "prop", "proportional to, U+221D ISOtech" },
1931	{ 8734, "infin","infinity, U+221E ISOtech" },
1932	{ 8736, "ang", "angle, U+2220 ISOamso" },
1933	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1934	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1935	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1936	{ 8746, "cup", "union = cup, U+222A ISOtech" },
1937	{ 8747, "int", "integral, U+222B ISOtech" },
1938	{ 8756, "there4","therefore, U+2234 ISOtech" },
1939	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1940	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1941	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1942	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1943	{ 8801, "equiv","identical to, U+2261 ISOtech" },
1944	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1945	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1946	{ 8834, "sub", "subset of, U+2282 ISOtech" },
1947	{ 8835, "sup", "superset of, U+2283 ISOtech" },
1948	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1949	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1950	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1951	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1952	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1953	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1954	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1955	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1956	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1957	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1958	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1959	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1960	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1961	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1962
1963	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1964	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1965	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1966	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1967
1968	};
1969
1970	/************************************************************************
1971	* *
1972	* Commodity functions to handle entities *
1973	* *
1974	************************************************************************/
1975
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and reallocates <buffer> to the new size.
 * Requirements on the expansion site:
 *   - a variable named <buffer>_size holding the current capacity,
 *   - a parser context named `ctxt` in scope,
 *   - an enclosing function that returns a pointer: if the reallocation
 *     fails the macro reports the error with htmlErrMemory(), frees the
 *     old buffer and makes that function return NULL.
 * The block may move, so any pointer into the old buffer must be
 * recomputed by the caller after the macro ran.
 *
 * The body is wrapped in do { } while (0) so that `growBuffer(buf);`
 * is exactly one statement, also inside an unbraced if/else.
 */
#define growBuffer(buffer) do {                                         \
    xmlChar *tmp;                                                       \
    buffer##_size *= 2;                                                 \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size);                \
    if (tmp == NULL) {                                                  \
        htmlErrMemory(ctxt);                                            \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = tmp;                                                       \
} while (0)
1990
1991	/**
1992	* htmlEntityLookup:
1993	* @name: the entity name
1994	*
1995	* Lookup the given entity in EntitiesTable
1996	*
1997	* TODO: the linear scan is really ugly, an hash table is really needed.
1998	*
1999	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2000	*/
2001	const htmlEntityDesc *
2002	htmlEntityLookup(const xmlChar *name) {
2003	unsigned int i;
2004
2005	for (i = 0;i < (sizeof(html40EntitiesTable)/
2006	sizeof(html40EntitiesTable[0]));i++) {
2007	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2008	return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2009	}
2010	}
2011	return(NULL);
2012	}
2013
2014	static int
2015	htmlCompareEntityDesc(const void vkey, const void vdesc) {
2016	const unsigned *key = vkey;
2017	const htmlEntityDesc *desc = vdesc;
2018
2019	return((int) *key - (int) desc->value);
2020	}
2021
2022	/**
2023	* htmlEntityValueLookup:
2024	* @value: the entity's unicode value
2025	*
2026	* Lookup the given entity in EntitiesTable
2027	*
2028	* TODO: the linear scan is really ugly, an hash table is really needed.
2029	*
2030	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2031	*/
2032	const htmlEntityDesc *
2033	htmlEntityValueLookup(unsigned int value) {
2034	const htmlEntityDesc *desc;
2035	size_t nmemb;
2036
2037	nmemb = sizeof(html40EntitiesTable) / sizeof(html40EntitiesTable[0]);
2038	desc = bsearch(&value, html40EntitiesTable, nmemb, sizeof(htmlEntityDesc),
2039	htmlCompareEntityDesc);
2040
2041	return(desc);
2042	}
2043
2044	/**
2045	* UTF8ToHtml:
2046	* @out: a pointer to an array of bytes to store the result
2047	* @outlen: the length of @out
2048	* @in: a pointer to an array of UTF-8 chars
2049	* @inlen: the length of @in
2050	*
2051	* Take a block of UTF-8 chars in and try to convert it to an ASCII
2052	* plus HTML entities block of chars out.
2053	*
2054	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2055	* The value of @inlen after return is the number of octets consumed
2056	* as the return value is positive, else unpredictable.
2057	* The value of @outlen after return is the number of octets consumed.
2058	*/
2059	int
2060	UTF8ToHtml(unsigned char* out, int *outlen,
2061	const unsigned char* in, int *inlen) {
2062	const unsigned char* processed = in;
2063	const unsigned char* outend;
2064	const unsigned char* outstart = out;
2065	const unsigned char* instart = in;
2066	const unsigned char* inend;
2067	unsigned int c, d;
2068	int trailing;
2069
2070	if ((out == NULL) \|\| (outlen == NULL) \|\| (inlen == NULL)) return(-1);
2071	if (in == NULL) {
2072	/*
2073	* initialization nothing to do
2074	*/
2075	*outlen = 0;
2076	*inlen = 0;
2077	return(0);
2078	}
2079	inend = in + (*inlen);
2080	outend = out + (*outlen);
2081	while (in < inend) {
2082	d = *in++;
2083	if (d < 0x80) { c= d; trailing= 0; }
2084	else if (d < 0xC0) {
2085	/* trailing byte in leading position */
2086	*outlen = out - outstart;
2087	*inlen = processed - instart;
2088	return(-2);
2089	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2090	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2091	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2092	else {
2093	/* no chance for this in Ascii */
2094	*outlen = out - outstart;
2095	*inlen = processed - instart;
2096	return(-2);
2097	}
2098
2099	if (inend - in < trailing) {
2100	break;
2101	}
2102
2103	for ( ; trailing; trailing--) {
2104	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
2105	break;
2106	c <<= 6;
2107	c \|= d & 0x3F;
2108	}
2109
2110	/* assertion: c is a single UTF-4 value */
2111	if (c < 0x80) {
2112	if (out + 1 >= outend)
2113	break;
2114	*out++ = c;
2115	} else {
2116	int len;
2117	const htmlEntityDesc * ent;
2118	const char *cp;
2119	char nbuf[16];
2120
2121	/*
2122	* Try to lookup a predefined HTML entity for it
2123	*/
2124
2125	ent = htmlEntityValueLookup(c);
2126	if (ent == NULL) {
2127	snprintf(nbuf, sizeof(nbuf), "#%u", c);
2128	cp = nbuf;
2129	}
2130	else
2131	cp = ent->name;
2132	len = strlen(cp);
2133	if (out + 2 + len >= outend)
2134	break;
2135	*out++ = '&';
2136	memcpy(out, cp, len);
2137	out += len;
2138	*out++ = ';';
2139	}
2140	processed = in;
2141	}
2142	*outlen = out - outstart;
2143	*inlen = processed - instart;
2144	return(0);
2145	}
2146
2147	/**
2148	* htmlEncodeEntities:
2149	* @out: a pointer to an array of bytes to store the result
2150	* @outlen: the length of @out
2151	* @in: a pointer to an array of UTF-8 chars
2152	* @inlen: the length of @in
2153	* @quoteChar: the quote character to escape (' or ") or zero.
2154	*
2155	* Take a block of UTF-8 chars in and try to convert it to an ASCII
2156	* plus HTML entities block of chars out.
2157	*
2158	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2159	* The value of @inlen after return is the number of octets consumed
2160	* as the return value is positive, else unpredictable.
2161	* The value of @outlen after return is the number of octets consumed.
2162	*/
2163	int
2164	htmlEncodeEntities(unsigned char* out, int *outlen,
2165	const unsigned char* in, int *inlen, int quoteChar) {
2166	const unsigned char* processed = in;
2167	const unsigned char* outend;
2168	const unsigned char* outstart = out;
2169	const unsigned char* instart = in;
2170	const unsigned char* inend;
2171	unsigned int c, d;
2172	int trailing;
2173
2174	if ((out == NULL) \|\| (outlen == NULL) \|\| (inlen == NULL) \|\| (in == NULL))
2175	return(-1);
2176	outend = out + (*outlen);
2177	inend = in + (*inlen);
2178	while (in < inend) {
2179	d = *in++;
2180	if (d < 0x80) { c= d; trailing= 0; }
2181	else if (d < 0xC0) {
2182	/* trailing byte in leading position */
2183	*outlen = out - outstart;
2184	*inlen = processed - instart;
2185	return(-2);
2186	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2187	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2188	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2189	else {
2190	/* no chance for this in Ascii */
2191	*outlen = out - outstart;
2192	*inlen = processed - instart;
2193	return(-2);
2194	}
2195
2196	if (inend - in < trailing)
2197	break;
2198
2199	while (trailing--) {
2200	if (((d= *in++) & 0xC0) != 0x80) {
2201	*outlen = out - outstart;
2202	*inlen = processed - instart;
2203	return(-2);
2204	}
2205	c <<= 6;
2206	c \|= d & 0x3F;
2207	}
2208
2209	/* assertion: c is a single UTF-4 value */
2210	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2211	(c != '&') && (c != '<') && (c != '>')) {
2212	if (out >= outend)
2213	break;
2214	*out++ = c;
2215	} else {
2216	const htmlEntityDesc * ent;
2217	const char *cp;
2218	char nbuf[16];
2219	int len;
2220
2221	/*
2222	* Try to lookup a predefined HTML entity for it
2223	*/
2224	ent = htmlEntityValueLookup(c);
2225	if (ent == NULL) {
2226	snprintf(nbuf, sizeof(nbuf), "#%u", c);
2227	cp = nbuf;
2228	}
2229	else
2230	cp = ent->name;
2231	len = strlen(cp);
2232	if (outend - out < len + 2)
2233	break;
2234	*out++ = '&';
2235	memcpy(out, cp, len);
2236	out += len;
2237	*out++ = ';';
2238	}
2239	processed = in;
2240	}
2241	*outlen = out - outstart;
2242	*inlen = processed - instart;
2243	return(0);
2244	}
2245
2246	/************************************************************************
2247	* *
2248	* Commodity functions, cleanup needed ? *
2249	* *
2250	************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 *
 * The array itself is const as well as the strings: it is a read-only
 * lookup table.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2265
2266	/**
2267	* areBlanks:
2268	* @ctxt: an HTML parser context
2269	* @str: a xmlChar *
2270	* @len: the size of @str
2271	*
2272	* Is this a sequence of blank chars that one can ignore ?
2273	*
2274	* Returns 1 if ignorable 0 otherwise.
2275	*/
2276
2277	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2278	unsigned int i;
2279	int j;
2280	xmlNodePtr lastChild;
2281	xmlDtdPtr dtd;
2282
2283	for (j = 0;j < len;j++)
2284	if (!(IS_BLANK_CH(str[j]))) return(0);
2285
2286	if (CUR == 0) return(1);
2287	if (CUR != '<') return(0);
2288	if (ctxt->name == NULL)
2289	return(1);
2290	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2291	return(1);
2292	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2293	return(1);
2294
2295	/* Only strip CDATA children of the body tag for strict HTML DTDs */
2296	if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2297	dtd = xmlGetIntSubset(ctxt->myDoc);
2298	if (dtd != NULL && dtd->ExternalID != NULL) {
2299	if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") \|\|
2300	!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2301	return(1);
2302	}
2303	}
2304
2305	if (ctxt->node == NULL) return(0);
2306	lastChild = xmlGetLastChild(ctxt->node);
2307	while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2308	lastChild = lastChild->prev;
2309	if (lastChild == NULL) {
2310	if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2311	(ctxt->node->content != NULL)) return(0);
2312	/* keep ws in constructs like ...<b> </b>...
2313	for all tags "b" allowing PCDATA */
2314	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2315	if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2316	return(0);
2317	}
2318	}
2319	} else if (xmlNodeIsText(lastChild)) {
2320	return(0);
2321	} else {
2322	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2323	for all tags "p" allowing PCDATA */
2324	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2325	if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2326	return(0);
2327	}
2328	}
2329	}
2330	return(1);
2331	}
2332
2333	/**
2334	* htmlNewDocNoDtD:
2335	* @URI: URI for the dtd, or NULL
2336	* @ExternalID: the external ID of the DTD, or NULL
2337	*
2338	* Creates a new HTML document without a DTD node if @URI and @ExternalID
2339	* are NULL
2340	*
2341	* Returns a new document, do not initialize the DTD if not provided
2342	*/
2343	htmlDocPtr
2344	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
2345	xmlDocPtr cur;
2346
2347	/*
2348	* Allocate a new document and fill the fields.
2349	*/
2350	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2351	if (cur == NULL)
2352	return(NULL);
2353	memset(cur, 0, sizeof(xmlDoc));
2354
2355	cur->type = XML_HTML_DOCUMENT_NODE;
2356	cur->version = NULL;
2357	cur->intSubset = NULL;
2358	cur->doc = cur;
2359	cur->name = NULL;
2360	cur->children = NULL;
2361	cur->extSubset = NULL;
2362	cur->oldNs = NULL;
2363	cur->encoding = NULL;
2364	cur->standalone = 1;
2365	cur->compression = 0;
2366	cur->ids = NULL;
2367	cur->refs = NULL;
2368	cur->_private = NULL;
2369	cur->charset = XML_CHAR_ENCODING_UTF8;
2370	cur->properties = XML_DOC_HTML \| XML_DOC_USERBUILT;
2371	if ((ExternalID != NULL) \|\|
2372	(URI != NULL)) {
2373	xmlDtdPtr intSubset;
2374
2375	intSubset = xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2376	if (intSubset == NULL) {
2377	xmlFree(cur);
2378	return(NULL);
2379	}
2380	}
2381	if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2382	xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2383	return(cur);
2384	}
2385
2386	/**
2387	* htmlNewDoc:
2388	* @URI: URI for the dtd, or NULL
2389	* @ExternalID: the external ID of the DTD, or NULL
2390	*
2391	* Creates a new HTML document
2392	*
2393	* Returns a new document
2394	*/
2395	htmlDocPtr
2396	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
2397	if ((URI == NULL) && (ExternalID == NULL))
2398	return(htmlNewDocNoDtD(
2399	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2400	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2401
2402	return(htmlNewDocNoDtD(URI, ExternalID));
2403	}
2404
2405
2406	/************************************************************************
2407	* *
2408	* The parser itself *
2409	* Relates to http://www.w3.org/TR/html40 *
2410	* *
2411	************************************************************************/
2412
2413	/************************************************************************
2414	* *
2415	* The parser itself *
2416	* *
2417	************************************************************************/
2418
2419	static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2420
2421	static void
2422	htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
2423	int c;
2424
2425	htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
2426	"Incorrectly opened comment\n", NULL, NULL);
2427
2428	while (PARSER_STOPPED(ctxt) == 0) {
2429	c = CUR;
2430	if (c == 0)
2431	break;
2432	NEXT;
2433	if (c == '>')
2434	break;
2435	}
2436	}
2437
2438	/**
2439	* htmlParseHTMLName:
2440	* @ctxt: an HTML parser context
2441	*
2442	* parse an HTML tag or attribute name, note that we convert it to lowercase
2443	* since HTML names are not case-sensitive.
2444	*
2445	* Returns the Tag Name parsed or NULL
2446	*/
2447
2448	static const xmlChar *
2449	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2450	const xmlChar *ret;
2451	int i = 0;
2452	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2453
2454	if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2455	(CUR != ':') && (CUR != '.')) return(NULL);
2456
2457	while ((i < HTML_PARSER_BUFFER_SIZE) &&
2458	((IS_ASCII_LETTER(CUR)) \|\| (IS_ASCII_DIGIT(CUR)) \|\|
2459	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_') \|\|
2460	(CUR == '.'))) {
2461	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2462	else loc[i] = CUR;
2463	i++;
2464
2465	NEXT;
2466	}
2467
2468	ret = xmlDictLookup(ctxt->dict, loc, i);
2469	if (ret == NULL)
2470	htmlErrMemory(ctxt);
2471
2472	return(ret);
2473	}
2474
2475
2476	/**
2477	* htmlParseHTMLName_nonInvasive:
2478	* @ctxt: an HTML parser context
2479	*
2480	* parse an HTML tag or attribute name, note that we convert it to lowercase
2481	* since HTML names are not case-sensitive, this doesn't consume the data
2482	* from the stream, it's a look-ahead
2483	*
2484	* Returns the Tag Name parsed or NULL
2485	*/
2486
2487	static const xmlChar *
2488	htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2489	int i = 0;
2490	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2491	const xmlChar *ret;
2492
2493	if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2494	(NXT(1) != ':')) return(NULL);
2495
2496	while ((i < HTML_PARSER_BUFFER_SIZE) &&
2497	((IS_ASCII_LETTER(NXT(1+i))) \|\| (IS_ASCII_DIGIT(NXT(1+i))) \|\|
2498	(NXT(1+i) == ':') \|\| (NXT(1+i) == '-') \|\| (NXT(1+i) == '_'))) {
2499	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2500	else loc[i] = NXT(1+i);
2501	i++;
2502	}
2503
2504	ret = xmlDictLookup(ctxt->dict, loc, i);
2505	if (ret == NULL)
2506	htmlErrMemory(ctxt);
2507
2508	return(ret);
2509	}
2510
2511
2512	/**
2513	* htmlParseName:
2514	* @ctxt: an HTML parser context
2515	*
2516	* parse an HTML name, this routine is case sensitive.
2517	*
2518	* Returns the Name parsed or NULL
2519	*/
2520
2521	static const xmlChar *
2522	htmlParseName(htmlParserCtxtPtr ctxt) {
2523	const xmlChar *in;
2524	const xmlChar *ret;
2525	int count = 0;
2526
2527	GROW;
2528
2529	/*
2530	* Accelerator for simple ASCII names
2531	*/
2532	in = ctxt->input->cur;
2533	if (((in >= 0x61) && (in <= 0x7A)) \|\|
2534	((in >= 0x41) && (in <= 0x5A)) \|\|
2535	(in == '_') \|\| (in == ':')) {
2536	in++;
2537	while (((in >= 0x61) && (in <= 0x7A)) \|\|
2538	((in >= 0x41) && (in <= 0x5A)) \|\|
2539	((in >= 0x30) && (in <= 0x39)) \|\|
2540	(in == '_') \|\| (in == '-') \|\|
2541	(in == ':') \|\| (in == '.'))
2542	in++;
2543
2544	if (in == ctxt->input->end)
2545	return(NULL);
2546
2547	if ((in > 0) && (in < 0x80)) {
2548	count = in - ctxt->input->cur;
2549	ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2550	if (ret == NULL)
2551	htmlErrMemory(ctxt);
2552	ctxt->input->cur = in;
2553	ctxt->input->col += count;
2554	return(ret);
2555	}
2556	}
2557	return(htmlParseNameComplex(ctxt));
2558	}
2559
2560	static const xmlChar *
2561	htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2562	int len = 0, l;
2563	int c;
2564	int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2565	XML_MAX_TEXT_LENGTH :
2566	XML_MAX_NAME_LENGTH;
2567	const xmlChar *base = ctxt->input->base;
2568	const xmlChar *ret;
2569
2570	/*
2571	* Handler for more complex cases
2572	*/
2573	c = CUR_CHAR(l);
2574	if ((c == ' ') \|\| (c == '>') \|\| (c == '/') \|\| /* accelerators */
2575	(!IS_LETTER(c) && (c != '_') &&
2576	(c != ':'))) {
2577	return(NULL);
2578	}
2579
2580	while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2581	((IS_LETTER(c)) \|\| (IS_DIGIT(c)) \|\|
2582	(c == '.') \|\| (c == '-') \|\|
2583	(c == '_') \|\| (c == ':') \|\|
2584	(IS_COMBINING(c)) \|\|
2585	(IS_EXTENDER(c)))) {
2586	len += l;
2587	if (len > maxLength) {
2588	htmlParseErr(ctxt, XML_ERR_NAME_TOO_LONG, "name too long", NULL, NULL);
2589	return(NULL);
2590	}
2591	NEXTL(l);
2592	c = CUR_CHAR(l);
2593	if (ctxt->input->base != base) {
2594	/*
2595	* We changed encoding from an unknown encoding
2596	* Input buffer changed location, so we better start again
2597	*/
2598	return(htmlParseNameComplex(ctxt));
2599	}
2600	}
2601
2602	if (ctxt->input->cur - ctxt->input->base < len) {
2603	/* Sanity check */
2604	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2605	"unexpected change of input buffer", NULL, NULL);
2606	return (NULL);
2607	}
2608
2609	ret = xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len);
2610	if (ret == NULL)
2611	htmlErrMemory(ctxt);
2612
2613	return(ret);
2614	}
2615
2616
2617	/**
2618	* htmlParseHTMLAttribute:
2619	* @ctxt: an HTML parser context
2620	* @stop: a char stop value
2621	*
2622	* parse an HTML attribute value till the stop (quote), if
2623	* stop is 0 then it stops at the first space
2624	*
2625	* Returns the attribute parsed or NULL
2626	*/
2627
2628	static xmlChar *
2629	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2630	xmlChar *buffer = NULL;
2631	int buffer_size = 0;
2632	int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2633	XML_MAX_HUGE_LENGTH :
2634	XML_MAX_TEXT_LENGTH;
2635	xmlChar *out = NULL;
2636	const xmlChar *name = NULL;
2637	const xmlChar *cur = NULL;
2638	const htmlEntityDesc * ent;
2639
2640	/*
2641	* allocate a translation buffer.
2642	*/
2643	buffer_size = HTML_PARSER_BUFFER_SIZE;
2644	buffer = (xmlChar *) xmlMallocAtomic(buffer_size);
2645	if (buffer == NULL) {
2646	htmlErrMemory(ctxt);
2647	return(NULL);
2648	}
2649	out = buffer;
2650
2651	/*
2652	* Ok loop until we reach one of the ending chars
2653	*/
2654	while ((PARSER_STOPPED(ctxt) == 0) &&
2655	(CUR != 0) && (CUR != stop)) {
2656	if ((stop == 0) && (CUR == '>')) break;
2657	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2658	if (CUR == '&') {
2659	if (NXT(1) == '#') {
2660	unsigned int c;
2661	int bits;
2662
2663	c = htmlParseCharRef(ctxt);
2664	if (c < 0x80)
2665	{ *out++ = c; bits= -6; }
2666	else if (c < 0x800)
2667	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
2668	else if (c < 0x10000)
2669	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
2670	else
2671	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
2672
2673	for ( ; bits >= 0; bits-= 6) {
2674	*out++ = ((c >> bits) & 0x3F) \| 0x80;
2675	}
2676
2677	if (out - buffer > buffer_size - 100) {
2678	int indx = out - buffer;
2679
2680	growBuffer(buffer);
2681	out = &buffer[indx];
2682	}
2683	} else {
2684	ent = htmlParseEntityRef(ctxt, &name);
2685	if (name == NULL) {
2686	*out++ = '&';
2687	if (out - buffer > buffer_size - 100) {
2688	int indx = out - buffer;
2689
2690	growBuffer(buffer);
2691	out = &buffer[indx];
2692	}
2693	} else if (ent == NULL) {
2694	*out++ = '&';
2695	cur = name;
2696	while (*cur != 0) {
2697	if (out - buffer > buffer_size - 100) {
2698	int indx = out - buffer;
2699
2700	growBuffer(buffer);
2701	out = &buffer[indx];
2702	}
2703	out++ = cur++;
2704	}
2705	} else {
2706	unsigned int c;
2707	int bits;
2708
2709	if (out - buffer > buffer_size - 100) {
2710	int indx = out - buffer;
2711
2712	growBuffer(buffer);
2713	out = &buffer[indx];
2714	}
2715	c = ent->value;
2716	if (c < 0x80)
2717	{ *out++ = c; bits= -6; }
2718	else if (c < 0x800)
2719	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
2720	else if (c < 0x10000)
2721	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
2722	else
2723	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
2724
2725	for ( ; bits >= 0; bits-= 6) {
2726	*out++ = ((c >> bits) & 0x3F) \| 0x80;
2727	}
2728	}
2729	}
2730	} else {
2731	unsigned int c;
2732	int bits, l;
2733
2734	if (out - buffer > buffer_size - 100) {
2735	int indx = out - buffer;
2736
2737	growBuffer(buffer);
2738	out = &buffer[indx];
2739	}
2740	c = CUR_CHAR(l);
2741	if (c < 0x80)
2742	{ *out++ = c; bits= -6; }
2743	else if (c < 0x800)
2744	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
2745	else if (c < 0x10000)
2746	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
2747	else
2748	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
2749
2750	for ( ; bits >= 0; bits-= 6) {
2751	*out++ = ((c >> bits) & 0x3F) \| 0x80;
2752	}
2753	NEXTL(l);
2754	}
2755	if (out - buffer > maxLength) {
2756	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2757	"attribute value too long\n", NULL, NULL);
2758	xmlFree(buffer);
2759	return(NULL);
2760	}
2761	}
2762	*out = 0;
2763	return(buffer);
2764	}
2765
2766	/**
2767	* htmlParseEntityRef:
2768	* @ctxt: an HTML parser context
2769	* @str: location to store the entity name
2770	*
2771	* DEPRECATED: Internal function, don't use.
2772	*
2773	* parse an HTML ENTITY references
2774	*
2775	* [68] EntityRef ::= '&' Name ';'
2776	*
2777	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2778	* if non-NULL *str will have to be freed by the caller.
2779	*/
2780	const htmlEntityDesc *
2781	htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2782	const xmlChar *name;
2783	const htmlEntityDesc * ent = NULL;
2784
2785	if (str != NULL) *str = NULL;
2786	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) return(NULL);
2787
2788	if (CUR == '&') {
2789	NEXT;
2790	name = htmlParseName(ctxt);
2791	if (name == NULL) {
2792	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2793	"htmlParseEntityRef: no name\n", NULL, NULL);
2794	} else {
2795	GROW;
2796	if (CUR == ';') {
2797	if (str != NULL)
2798	*str = name;
2799
2800	/*
2801	* Lookup the entity in the table.
2802	*/
2803	ent = htmlEntityLookup(name);
2804	if (ent != NULL) /* OK that's ugly !!! */
2805	NEXT;
2806	} else {
2807	htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2808	"htmlParseEntityRef: expecting ';'\n",
2809	NULL, NULL);
2810	if (str != NULL)
2811	*str = name;
2812	}
2813	}
2814	}
2815	return(ent);
2816	}
2817
2818	/**
2819	* htmlParseAttValue:
2820	* @ctxt: an HTML parser context
2821	*
2822	* parse a value for an attribute
2823	* Note: the parser won't do substitution of entities here, this
2824	* will be handled later in xmlStringGetNodeList, unless it was
2825	* asked for ctxt->replaceEntities != 0
2826	*
2827	* Returns the AttValue parsed or NULL.
2828	*/
2829
2830	static xmlChar *
2831	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2832	xmlChar *ret = NULL;
2833
2834	if (CUR == '"') {
2835	NEXT;
2836	ret = htmlParseHTMLAttribute(ctxt, '"');
2837	if (CUR != '"') {
2838	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2839	"AttValue: \" expected\n", NULL, NULL);
2840	} else
2841	NEXT;
2842	} else if (CUR == '\'') {
2843	NEXT;
2844	ret = htmlParseHTMLAttribute(ctxt, '\'');
2845	if (CUR != '\'') {
2846	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2847	"AttValue: ' expected\n", NULL, NULL);
2848	} else
2849	NEXT;
2850	} else {
2851	/*
2852	* That's an HTMLism, the attribute value may not be quoted
2853	*/
2854	ret = htmlParseHTMLAttribute(ctxt, 0);
2855	if (ret == NULL) {
2856	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2857	"AttValue: no value found\n", NULL, NULL);
2858	}
2859	}
2860	return(ret);
2861	}
2862
2863	/**
2864	* htmlParseSystemLiteral:
2865	* @ctxt: an HTML parser context
2866	*
2867	* parse an HTML Literal
2868	*
2869	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
2870	*
2871	* Returns the SystemLiteral parsed or NULL
2872	*/
2873
2874	static xmlChar *
2875	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2876	size_t len = 0, startPosition = 0;
2877	int err = 0;
2878	int quote;
2879	xmlChar *ret = NULL;
2880
2881	if ((CUR != '"') && (CUR != '\'')) {
2882	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2883	"SystemLiteral \" or ' expected\n", NULL, NULL);
2884	return(NULL);
2885	}
2886	quote = CUR;
2887	NEXT;
2888
2889	if (CUR_PTR < BASE_PTR)
2890	return(ret);
2891	startPosition = CUR_PTR - BASE_PTR;
2892
2893	while ((PARSER_STOPPED(ctxt) == 0) &&
2894	(CUR != 0) && (CUR != quote)) {
2895	/* TODO: Handle UTF-8 */
2896	if (!IS_CHAR_CH(CUR)) {
2897	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2898	"Invalid char in SystemLiteral 0x%X\n", CUR);
2899	err = 1;
2900	}
2901	NEXT;
2902	len++;
2903	}
2904	if (CUR != quote) {
2905	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2906	"Unfinished SystemLiteral\n", NULL, NULL);
2907	} else {
2908	if (err == 0) {
2909	ret = xmlStrndup((BASE_PTR+startPosition), len);
2910	if (ret == NULL) {
2911	htmlErrMemory(ctxt);
2912	return(NULL);
2913	}
2914	}
2915	NEXT;
2916	}
2917
2918	return(ret);
2919	}
2920
2921	/**
2922	* htmlParsePubidLiteral:
2923	* @ctxt: an HTML parser context
2924	*
2925	* parse an HTML public literal
2926	*
2927	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
2928	*
2929	* Returns the PubidLiteral parsed or NULL.
2930	*/
2931
2932	static xmlChar *
2933	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2934	size_t len = 0, startPosition = 0;
2935	int err = 0;
2936	int quote;
2937	xmlChar *ret = NULL;
2938
2939	if ((CUR != '"') && (CUR != '\'')) {
2940	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2941	"PubidLiteral \" or ' expected\n", NULL, NULL);
2942	return(NULL);
2943	}
2944	quote = CUR;
2945	NEXT;
2946
2947	/*
2948	* Name ::= (Letter \| '_') (NameChar)*
2949	*/
2950	if (CUR_PTR < BASE_PTR)
2951	return(ret);
2952	startPosition = CUR_PTR - BASE_PTR;
2953
2954	while ((PARSER_STOPPED(ctxt) == 0) &&
2955	(CUR != 0) && (CUR != quote)) {
2956	if (!IS_PUBIDCHAR_CH(CUR)) {
2957	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2958	"Invalid char in PubidLiteral 0x%X\n", CUR);
2959	err = 1;
2960	}
2961	len++;
2962	NEXT;
2963	}
2964
2965	if (CUR != quote) {
2966	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2967	"Unfinished PubidLiteral\n", NULL, NULL);
2968	} else {
2969	if (err == 0) {
2970	ret = xmlStrndup((BASE_PTR + startPosition), len);
2971	if (ret == NULL) {
2972	htmlErrMemory(ctxt);
2973	return(NULL);
2974	}
2975	}
2976	NEXT;
2977	}
2978
2979	return(ret);
2980	}
2981
2982	/**
2983	* htmlParseScript:
2984	* @ctxt: an HTML parser context
2985	*
2986	* parse the content of an HTML SCRIPT or STYLE element
2987	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
2988	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2989	* http://www.w3.org/TR/html4/types.html#type-script
2990	* http://www.w3.org/TR/html4/types.html#h-6.15
2991	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2992	*
2993	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
2994	* element and the value of intrinsic event attributes. User agents must
2995	* not evaluate script data as HTML markup but instead must pass it on as
2996	* data to a script engine.
2997	* NOTES:
2998	* - The content is passed like CDATA
2999	* - the attributes for style and scripting "onXXX" are also described
3000	* as CDATA but SGML allows entities references in attributes so their
3001	* processing is identical as other attributes
3002	*/
3003	static void
3004	htmlParseScript(htmlParserCtxtPtr ctxt) {
3005	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3006	int nbchar = 0;
3007	int cur,l;
3008
3009	cur = CUR_CHAR(l);
3010	while (cur != 0) {
3011	if ((cur == '<') && (NXT(1) == '/')) {
3012	/*
3013	* One should break here, the specification is clear:
3014	* Authors should therefore escape "</" within the content.
3015	* Escape mechanisms are specific to each scripting or
3016	* style sheet language.
3017	*
3018	* In recovery mode, only break if end tag match the
3019	* current tag, effectively ignoring all tags inside the
3020	* script/style block and treating the entire block as
3021	* CDATA.
3022	*/
3023	if (ctxt->recovery) {
3024	if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3025	xmlStrlen(ctxt->name)) == 0)
3026	{
3027	break; /* while */
3028	} else {
3029	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3030	"Element %s embeds close tag\n",
3031	ctxt->name, NULL);
3032	}
3033	} else {
3034	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
3035	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3036	{
3037	break; /* while */
3038	}
3039	}
3040	}
3041	if (IS_CHAR(cur)) {
3042	COPY_BUF(l,buf,nbchar,cur);
3043	} else {
3044	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3045	"Invalid char in CDATA 0x%X\n", cur);
3046	}
3047	NEXTL(l);
3048	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3049	buf[nbchar] = 0;
3050	if (ctxt->sax->cdataBlock!= NULL) {
3051	/*
3052	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3053	*/
3054	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3055	} else if (ctxt->sax->characters != NULL) {
3056	ctxt->sax->characters(ctxt->userData, buf, nbchar);
3057	}
3058	nbchar = 0;
3059	SHRINK;
3060	}
3061	cur = CUR_CHAR(l);
3062	}
3063
3064	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3065	buf[nbchar] = 0;
3066	if (ctxt->sax->cdataBlock!= NULL) {
3067	/*
3068	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3069	*/
3070	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3071	} else if (ctxt->sax->characters != NULL) {
3072	ctxt->sax->characters(ctxt->userData, buf, nbchar);
3073	}
3074	}
3075	}
3076
3077
3078	/**
3079	* htmlParseCharDataInternal:
3080	* @ctxt: an HTML parser context
3081	* @readahead: optional read ahead character in ascii range
3082	*
3083	* parse a CharData section.
3084	* if we are within a CDATA section ']]>' marks an end of section.
3085	*
3086	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3087	*/
3088
3089	static void
3090	htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3091	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3092	int nbchar = 0;
3093	int cur, l;
3094
3095	if (readahead)
3096	buf[nbchar++] = readahead;
3097
3098	cur = CUR_CHAR(l);
3099	while ((cur != '<') &&
3100	(cur != '&') &&
3101	(cur != 0) &&
3102	(!PARSER_STOPPED(ctxt))) {
3103	if (!(IS_CHAR(cur))) {
3104	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3105	"Invalid char in CDATA 0x%X\n", cur);
3106	} else {
3107	COPY_BUF(l,buf,nbchar,cur);
3108	}
3109	NEXTL(l);
3110	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3111	buf[nbchar] = 0;
3112
3113	/*
3114	* Ok the segment is to be consumed as chars.
3115	*/
3116	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3117	if (areBlanks(ctxt, buf, nbchar)) {
3118	if (ctxt->keepBlanks) {
3119	if (ctxt->sax->characters != NULL)
3120	ctxt->sax->characters(ctxt->userData, buf, nbchar);
3121	} else {
3122	if (ctxt->sax->ignorableWhitespace != NULL)
3123	ctxt->sax->ignorableWhitespace(ctxt->userData,
3124	buf, nbchar);
3125	}
3126	} else {
3127	htmlCheckParagraph(ctxt);
3128	if (ctxt->sax->characters != NULL)
3129	ctxt->sax->characters(ctxt->userData, buf, nbchar);
3130	}
3131	}
3132	nbchar = 0;
3133	SHRINK;
3134	}
3135	cur = CUR_CHAR(l);
3136	}
3137	if (nbchar != 0) {
3138	buf[nbchar] = 0;
3139
3140	/*
3141	* Ok the segment is to be consumed as chars.
3142	*/
3143	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3144	if (areBlanks(ctxt, buf, nbchar)) {
3145	if (ctxt->keepBlanks) {
3146	if (ctxt->sax->characters != NULL)
3147	ctxt->sax->characters(ctxt->userData, buf, nbchar);
3148	} else {
3149	if (ctxt->sax->ignorableWhitespace != NULL)
3150	ctxt->sax->ignorableWhitespace(ctxt->userData,
3151	buf, nbchar);
3152	}
3153	} else {
3154	htmlCheckParagraph(ctxt);
3155	if (ctxt->sax->characters != NULL)
3156	ctxt->sax->characters(ctxt->userData, buf, nbchar);
3157	}
3158	}
3159	}
3160	}
3161
3162	/**
3163	* htmlParseCharData:
3164	* @ctxt: an HTML parser context
3165	*
3166	* parse a CharData section.
3167	* if we are within a CDATA section ']]>' marks an end of section.
3168	*
3169	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3170	*/
3171
3172	static void
3173	htmlParseCharData(htmlParserCtxtPtr ctxt) {
3174	htmlParseCharDataInternal(ctxt, 0);
3175	}
3176
3177	/**
3178	* htmlParseExternalID:
3179	* @ctxt: an HTML parser context
3180	* @publicID: a xmlChar** receiving PubidLiteral
3181	*
3182	* Parse an External ID or a Public ID
3183	*
3184	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3185	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
3186	*
3187	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
3188	*
3189	* Returns the function returns SystemLiteral and in the second
3190	* case publicID receives PubidLiteral, is strict is off
3191	* it is possible to return NULL and have publicID set.
3192	*/
3193
3194	static xmlChar *
3195	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3196	xmlChar *URI = NULL;
3197
3198	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3199	(UPP(2) == 'S') && (UPP(3) == 'T') &&
3200	(UPP(4) == 'E') && (UPP(5) == 'M')) {
3201	SKIP(6);
3202	if (!IS_BLANK_CH(CUR)) {
3203	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3204	"Space required after 'SYSTEM'\n", NULL, NULL);
3205	}
3206	SKIP_BLANKS;
3207	URI = htmlParseSystemLiteral(ctxt);
3208	if (URI == NULL) {
3209	htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3210	"htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3211	}
3212	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3213	(UPP(2) == 'B') && (UPP(3) == 'L') &&
3214	(UPP(4) == 'I') && (UPP(5) == 'C')) {
3215	SKIP(6);
3216	if (!IS_BLANK_CH(CUR)) {
3217	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3218	"Space required after 'PUBLIC'\n", NULL, NULL);
3219	}
3220	SKIP_BLANKS;
3221	*publicID = htmlParsePubidLiteral(ctxt);
3222	if (*publicID == NULL) {
3223	htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3224	"htmlParseExternalID: PUBLIC, no Public Identifier\n",
3225	NULL, NULL);
3226	}
3227	SKIP_BLANKS;
3228	if ((CUR == '"') \|\| (CUR == '\'')) {
3229	URI = htmlParseSystemLiteral(ctxt);
3230	}
3231	}
3232	return(URI);
3233	}
3234
3235	/**
3236	* htmlParsePI:
3237	* @ctxt: an HTML parser context
3238	*
3239	* Parse an XML Processing Instruction. HTML5 doesn't allow processing
3240	* instructions, so this will be removed at some point.
3241	*/
3242	static void
3243	htmlParsePI(htmlParserCtxtPtr ctxt) {
3244	xmlChar *buf = NULL;
3245	int len = 0;
3246	int size = HTML_PARSER_BUFFER_SIZE;
3247	int cur, l;
3248	int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3249	XML_MAX_HUGE_LENGTH :
3250	XML_MAX_TEXT_LENGTH;
3251	const xmlChar *target;
3252	xmlParserInputState state;
3253
3254	if ((RAW == '<') && (NXT(1) == '?')) {
3255	state = ctxt->instate;
3256	ctxt->instate = XML_PARSER_PI;
3257	/*
3258	* this is a Processing Instruction.
3259	*/
3260	SKIP(2);
3261
3262	/*
3263	* Parse the target name and check for special support like
3264	* namespace.
3265	*/
3266	target = htmlParseName(ctxt);
3267	if (target != NULL) {
3268	if (RAW == '>') {
3269	SKIP(1);
3270
3271	/*
3272	* SAX: PI detected.
3273	*/
3274	if ((ctxt->sax) && (!ctxt->disableSAX) &&
3275	(ctxt->sax->processingInstruction != NULL))
3276	ctxt->sax->processingInstruction(ctxt->userData,
3277	target, NULL);
3278	goto done;
3279	}
3280	buf = (xmlChar *) xmlMallocAtomic(size);
3281	if (buf == NULL) {
3282	htmlErrMemory(ctxt);
3283	return;
3284	}
3285	cur = CUR;
3286	if (!IS_BLANK(cur)) {
3287	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3288	"ParsePI: PI %s space expected\n", target, NULL);
3289	}
3290	SKIP_BLANKS;
3291	cur = CUR_CHAR(l);
3292	while ((cur != 0) && (cur != '>')) {
3293	if (len + 5 >= size) {
3294	xmlChar *tmp;
3295
3296	size *= 2;
3297	tmp = (xmlChar *) xmlRealloc(buf, size);
3298	if (tmp == NULL) {
3299	htmlErrMemory(ctxt);
3300	xmlFree(buf);
3301	return;
3302	}
3303	buf = tmp;
3304	}
3305	if (IS_CHAR(cur)) {
3306	COPY_BUF(l,buf,len,cur);
3307	} else {
3308	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3309	"Invalid char in processing instruction "
3310	"0x%X\n", cur);
3311	}
3312	if (len > maxLength) {
3313	htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3314	"PI %s too long", target, NULL);
3315	xmlFree(buf);
3316	goto done;
3317	}
3318	NEXTL(l);
3319	cur = CUR_CHAR(l);
3320	}
3321	buf[len] = 0;
3322	if (cur != '>') {
3323	htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3324	"ParsePI: PI %s never end ...\n", target, NULL);
3325	} else {
3326	SKIP(1);
3327
3328	/*
3329	* SAX: PI detected.
3330	*/
3331	if ((ctxt->sax) && (!ctxt->disableSAX) &&
3332	(ctxt->sax->processingInstruction != NULL))
3333	ctxt->sax->processingInstruction(ctxt->userData,
3334	target, buf);
3335	}
3336	xmlFree(buf);
3337	} else {
3338	htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3339	"PI is not started correctly", NULL, NULL);
3340	}
3341
3342	done:
3343	ctxt->instate = state;
3344	}
3345	}
3346
3347	/**
3348	* htmlParseComment:
3349	* @ctxt: an HTML parser context
3350	*
3351	* Parse an HTML comment
3352	*/
3353	static void
3354	htmlParseComment(htmlParserCtxtPtr ctxt) {
3355	xmlChar *buf = NULL;
3356	int len;
3357	int size = HTML_PARSER_BUFFER_SIZE;
3358	int q, ql;
3359	int r, rl;
3360	int cur, l;
3361	int next, nl;
3362	int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3363	XML_MAX_HUGE_LENGTH :
3364	XML_MAX_TEXT_LENGTH;
3365	xmlParserInputState state;
3366
3367	/*
3368	* Check that there is a comment right here.
3369	*/
3370	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
3371	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
3372
3373	state = ctxt->instate;
3374	ctxt->instate = XML_PARSER_COMMENT;
3375	SKIP(4);
3376	buf = (xmlChar *) xmlMallocAtomic(size);
3377	if (buf == NULL) {
3378	htmlErrMemory(ctxt);
3379	return;
3380	}
3381	len = 0;
3382	buf[len] = 0;
3383	q = CUR_CHAR(ql);
3384	if (q == 0)
3385	goto unfinished;
3386	if (q == '>') {
3387	htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3388	cur = '>';
3389	goto finished;
3390	}
3391	NEXTL(ql);
3392	r = CUR_CHAR(rl);
3393	if (r == 0)
3394	goto unfinished;
3395	if (q == '-' && r == '>') {
3396	htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3397	cur = '>';
3398	goto finished;
3399	}
3400	NEXTL(rl);
3401	cur = CUR_CHAR(l);
3402	while ((cur != 0) &&
3403	((cur != '>') \|\|
3404	(r != '-') \|\| (q != '-'))) {
3405	NEXTL(l);
3406	next = CUR_CHAR(nl);
3407
3408	if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3409	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3410	"Comment incorrectly closed by '--!>'", NULL, NULL);
3411	cur = '>';
3412	break;
3413	}
3414
3415	if (len + 5 >= size) {
3416	xmlChar *tmp;
3417
3418	size *= 2;
3419	tmp = (xmlChar *) xmlRealloc(buf, size);
3420	if (tmp == NULL) {
3421	xmlFree(buf);
3422	htmlErrMemory(ctxt);
3423	return;
3424	}
3425	buf = tmp;
3426	}
3427	if (IS_CHAR(q)) {
3428	COPY_BUF(ql,buf,len,q);
3429	} else {
3430	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3431	"Invalid char in comment 0x%X\n", q);
3432	}
3433	if (len > maxLength) {
3434	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3435	"comment too long", NULL, NULL);
3436	xmlFree(buf);
3437	ctxt->instate = state;
3438	return;
3439	}
3440
3441	q = r;
3442	ql = rl;
3443	r = cur;
3444	rl = l;
3445	cur = next;
3446	l = nl;
3447	}
3448	finished:
3449	buf[len] = 0;
3450	if (cur == '>') {
3451	NEXT;
3452	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3453	(!ctxt->disableSAX))
3454	ctxt->sax->comment(ctxt->userData, buf);
3455	xmlFree(buf);
3456	ctxt->instate = state;
3457	return;
3458	}
3459
3460	unfinished:
3461	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3462	"Comment not terminated \n<!--%.50s\n", buf, NULL);
3463	xmlFree(buf);
3464	}
3465
3466	/**
3467	* htmlParseCharRef:
3468	* @ctxt: an HTML parser context
3469	*
3470	* DEPRECATED: Internal function, don't use.
3471	*
3472	* parse Reference declarations
3473	*
3474	* [66] CharRef ::= '&#' [0-9]+ ';' \|
3475	* '&#x' [0-9a-fA-F]+ ';'
3476	*
3477	* Returns the value parsed (as an int)
3478	*/
3479	int
3480	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3481	int val = 0;
3482
3483	if ((ctxt == NULL) \|\| (ctxt->input == NULL))
3484	return(0);
3485	if ((CUR == '&') && (NXT(1) == '#') &&
3486	((NXT(2) == 'x') \|\| NXT(2) == 'X')) {
3487	SKIP(3);
3488	while (CUR != ';') {
3489	if ((CUR >= '0') && (CUR <= '9')) {
3490	if (val < 0x110000)
3491	val = val * 16 + (CUR - '0');
3492	} else if ((CUR >= 'a') && (CUR <= 'f')) {
3493	if (val < 0x110000)
3494	val = val * 16 + (CUR - 'a') + 10;
3495	} else if ((CUR >= 'A') && (CUR <= 'F')) {
3496	if (val < 0x110000)
3497	val = val * 16 + (CUR - 'A') + 10;
3498	} else {
3499	htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3500	"htmlParseCharRef: missing semicolon\n",
3501	NULL, NULL);
3502	break;
3503	}
3504	NEXT;
3505	}
3506	if (CUR == ';')
3507	NEXT;
3508	} else if ((CUR == '&') && (NXT(1) == '#')) {
3509	SKIP(2);
3510	while (CUR != ';') {
3511	if ((CUR >= '0') && (CUR <= '9')) {
3512	if (val < 0x110000)
3513	val = val * 10 + (CUR - '0');
3514	} else {
3515	htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3516	"htmlParseCharRef: missing semicolon\n",
3517	NULL, NULL);
3518	break;
3519	}
3520	NEXT;
3521	}
3522	if (CUR == ';')
3523	NEXT;
3524	} else {
3525	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3526	"htmlParseCharRef: invalid value\n", NULL, NULL);
3527	}
3528	/*
3529	* Check the value IS_CHAR ...
3530	*/
3531	if (IS_CHAR(val)) {
3532	return(val);
3533	} else if (val >= 0x110000) {
3534	htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3535	"htmlParseCharRef: value too large\n", NULL, NULL);
3536	} else {
3537	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3538	"htmlParseCharRef: invalid xmlChar value %d\n",
3539	val);
3540	}
3541	return(0);
3542	}
3543
3544
3545	/**
3546	* htmlParseDocTypeDecl:
3547	* @ctxt: an HTML parser context
3548	*
3549	* parse a DOCTYPE declaration
3550	*
3551	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3552	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
3553	*/
3554
3555	static void
3556	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3557	const xmlChar *name;
3558	xmlChar *ExternalID = NULL;
3559	xmlChar *URI = NULL;
3560
3561	/*
3562	* We know that '<!DOCTYPE' has been detected.
3563	*/
3564	SKIP(9);
3565
3566	SKIP_BLANKS;
3567
3568	/*
3569	* Parse the DOCTYPE name.
3570	*/
3571	name = htmlParseName(ctxt);
3572	if (name == NULL) {
3573	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3574	"htmlParseDocTypeDecl : no DOCTYPE name !\n",
3575	NULL, NULL);
3576	}
3577	/*
3578	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
3579	*/
3580
3581	SKIP_BLANKS;
3582
3583	/*
3584	* Check for SystemID and ExternalID
3585	*/
3586	URI = htmlParseExternalID(ctxt, &ExternalID);
3587	SKIP_BLANKS;
3588
3589	/*
3590	* We should be at the end of the DOCTYPE declaration.
3591	*/
3592	if (CUR != '>') {
3593	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3594	"DOCTYPE improperly terminated\n", NULL, NULL);
3595	/* Ignore bogus content */
3596	while ((CUR != 0) && (CUR != '>') &&
3597	(PARSER_STOPPED(ctxt) == 0))
3598	NEXT;
3599	}
3600	if (CUR == '>')
3601	NEXT;
3602
3603	/*
3604	* Create or update the document accordingly to the DOCTYPE
3605	*/
3606	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3607	(!ctxt->disableSAX))
3608	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3609
3610	/*
3611	* Cleanup, since we don't use all those identifiers
3612	*/
3613	if (URI != NULL) xmlFree(URI);
3614	if (ExternalID != NULL) xmlFree(ExternalID);
3615	}
3616
3617	/**
3618	* htmlParseAttribute:
3619	* @ctxt: an HTML parser context
3620	* @value: a xmlChar ** used to store the value of the attribute
3621	*
3622	* parse an attribute
3623	*
3624	* [41] Attribute ::= Name Eq AttValue
3625	*
3626	* [25] Eq ::= S? '=' S?
3627	*
3628	* With namespace:
3629	*
3630	* [NS 11] Attribute ::= QName Eq AttValue
3631	*
3632	* Also the case QName == xmlns:??? is handled independently as a namespace
3633	* definition.
3634	*
3635	* Returns the attribute name, and the value in *value.
3636	*/
3637
3638	static const xmlChar *
3639	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3640	const xmlChar *name;
3641	xmlChar *val = NULL;
3642
3643	*value = NULL;
3644	name = htmlParseHTMLName(ctxt);
3645	if (name == NULL) {
3646	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3647	"error parsing attribute name\n", NULL, NULL);
3648	return(NULL);
3649	}
3650
3651	/*
3652	* read the value
3653	*/
3654	SKIP_BLANKS;
3655	if (CUR == '=') {
3656	NEXT;
3657	SKIP_BLANKS;
3658	val = htmlParseAttValue(ctxt);
3659	}
3660
3661	*value = val;
3662	return(name);
3663	}
3664
3665	/**
3666	* htmlCheckEncoding:
3667	* @ctxt: an HTML parser context
3668	* @attvalue: the attribute value
3669	*
3670	* Checks an http-equiv attribute from a Meta tag to detect
3671	* the encoding
3672	* If a new encoding is detected the parser is switched to decode
3673	* it and pass UTF8
3674	*/
3675	static void
3676	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3677	const xmlChar *encoding;
3678	xmlChar *copy;
3679
3680	if (!attvalue)
3681	return;
3682
3683	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3684	if (encoding != NULL) {
3685	encoding += 7;
3686	}
3687	/*
3688	* skip blank
3689	*/
3690	if (encoding && IS_BLANK_CH(*encoding))
3691	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3692	if (encoding && *encoding == '=') {
3693	encoding ++;
3694	copy = xmlStrdup(encoding);
3695	if (copy == NULL)
3696	htmlErrMemory(ctxt);
3697	xmlSetDeclaredEncoding(ctxt, copy);
3698	}
3699	}
3700
3701	/**
3702	* htmlCheckMeta:
3703	* @ctxt: an HTML parser context
3704	* @atts: the attributes values
3705	*
3706	* Checks an attributes from a Meta tag
3707	*/
3708	static void
3709	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3710	int i;
3711	const xmlChar att, value;
3712	int http = 0;
3713	const xmlChar *content = NULL;
3714
3715	if ((ctxt == NULL) \|\| (atts == NULL))
3716	return;
3717
3718	i = 0;
3719	att = atts[i++];
3720	while (att != NULL) {
3721	value = atts[i++];
3722	if (value != NULL) {
3723	if ((!xmlStrcasecmp(att, BAD_CAST "http-equiv")) &&
3724	(!xmlStrcasecmp(value, BAD_CAST "Content-Type"))) {
3725	http = 1;
3726	} else if (!xmlStrcasecmp(att, BAD_CAST "charset")) {
3727	xmlChar *copy;
3728
3729	copy = xmlStrdup(value);
3730	if (copy == NULL)
3731	htmlErrMemory(ctxt);
3732	xmlSetDeclaredEncoding(ctxt, copy);
3733	} else if (!xmlStrcasecmp(att, BAD_CAST "content")) {
3734	content = value;
3735	}
3736	}
3737	att = atts[i++];
3738	}
3739	if ((http) && (content != NULL))
3740	htmlCheckEncoding(ctxt, content);
3741
3742	}
3743
3744	/**
3745	* htmlParseStartTag:
3746	* @ctxt: an HTML parser context
3747	*
3748	* parse a start of tag either for rule element or
3749	* EmptyElement. In both case we don't parse the tag closing chars.
3750	*
3751	* [40] STag ::= '<' Name (S Attribute)* S? '>'
3752	*
3753	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3754	*
3755	* With namespace:
3756	*
3757	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3758	*
3759	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3760	*
3761	* Returns 0 in case of success, -1 in case of error and 1 if discarded
3762	*/
3763
3764	static int
3765	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3766	const xmlChar *name;
3767	const xmlChar *attname;
3768	xmlChar *attvalue;
3769	const xmlChar **atts;
3770	int nbatts = 0;
3771	int maxatts;
3772	int meta = 0;
3773	int i;
3774	int discardtag = 0;
3775
3776	if ((ctxt == NULL) \|\| (ctxt->input == NULL))
3777	return -1;
3778	if (CUR != '<') return -1;
3779	NEXT;
3780
3781	atts = ctxt->atts;
3782	maxatts = ctxt->maxatts;
3783
3784	GROW;
3785	name = htmlParseHTMLName(ctxt);
3786	if (name == NULL) {
3787	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3788	"htmlParseStartTag: invalid element name\n",
3789	NULL, NULL);
3790	/* Dump the bogus tag like browsers do */
3791	while ((CUR != 0) && (CUR != '>') &&
3792	(PARSER_STOPPED(ctxt) == 0))
3793	NEXT;
3794	return -1;
3795	}
3796	if (xmlStrEqual(name, BAD_CAST"meta"))
3797	meta = 1;
3798
3799	/*
3800	* Check for auto-closure of HTML elements.
3801	*/
3802	htmlAutoClose(ctxt, name);
3803
3804	/*
3805	* Check for implied HTML elements.
3806	*/
3807	htmlCheckImplied(ctxt, name);
3808
3809	/*
3810	* Avoid html at any level > 0, head at any level != 1
3811	* or any attempt to recurse body
3812	*/
3813	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3814	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3815	"htmlParseStartTag: misplaced <html> tag\n",
3816	name, NULL);
3817	discardtag = 1;
3818	ctxt->depth++;
3819	}
3820	if ((ctxt->nameNr != 1) &&
3821	(xmlStrEqual(name, BAD_CAST"head"))) {
3822	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3823	"htmlParseStartTag: misplaced <head> tag\n",
3824	name, NULL);
3825	discardtag = 1;
3826	ctxt->depth++;
3827	}
3828	if (xmlStrEqual(name, BAD_CAST"body")) {
3829	int indx;
3830	for (indx = 0;indx < ctxt->nameNr;indx++) {
3831	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3832	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3833	"htmlParseStartTag: misplaced <body> tag\n",
3834	name, NULL);
3835	discardtag = 1;
3836	ctxt->depth++;
3837	}
3838	}
3839	}
3840
3841	/*
3842	* Now parse the attributes, it ends up with the ending
3843	*
3844	* (S Attribute)* S?
3845	*/
3846	SKIP_BLANKS;
3847	while ((CUR != 0) &&
3848	(CUR != '>') &&
3849	((CUR != '/') \|\| (NXT(1) != '>')) &&
3850	(PARSER_STOPPED(ctxt) == 0)) {
3851	GROW;
3852	attname = htmlParseAttribute(ctxt, &attvalue);
3853	if (attname != NULL) {
3854
3855	/*
3856	* Well formedness requires at most one declaration of an attribute
3857	*/
3858	for (i = 0; i < nbatts;i += 2) {
3859	if (xmlStrEqual(atts[i], attname)) {
3860	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3861	"Attribute %s redefined\n", attname, NULL);
3862	if (attvalue != NULL)
3863	xmlFree(attvalue);
3864	goto failed;
3865	}
3866	}
3867
3868	/*
3869	* Add the pair to atts
3870	*/
3871	if (atts == NULL) {
3872	maxatts = 22; /* allow for 10 attrs by default */
3873	atts = (const xmlChar **)
3874	xmlMalloc(maxatts * sizeof(xmlChar *));
3875	if (atts == NULL) {
3876	htmlErrMemory(ctxt);
3877	if (attvalue != NULL)
3878	xmlFree(attvalue);
3879	goto failed;
3880	}
3881	ctxt->atts = atts;
3882	ctxt->maxatts = maxatts;
3883	} else if (nbatts + 4 > maxatts) {
3884	const xmlChar **n;
3885
3886	maxatts *= 2;
3887	n = (const xmlChar *) xmlRealloc((void ) atts,
3888	maxatts * sizeof(const xmlChar *));
3889	if (n == NULL) {
3890	htmlErrMemory(ctxt);
3891	if (attvalue != NULL)
3892	xmlFree(attvalue);
3893	goto failed;
3894	}
3895	atts = n;
3896	ctxt->atts = atts;
3897	ctxt->maxatts = maxatts;
3898	}
3899	atts[nbatts++] = attname;
3900	atts[nbatts++] = attvalue;
3901	atts[nbatts] = NULL;
3902	atts[nbatts + 1] = NULL;
3903	}
3904	else {
3905	if (attvalue != NULL)
3906	xmlFree(attvalue);
3907	/* Dump the bogus attribute string up to the next blank or
3908	* the end of the tag. */
3909	while ((CUR != 0) &&
3910	!(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3911	((CUR != '/') \|\| (NXT(1) != '>')) &&
3912	(PARSER_STOPPED(ctxt) == 0))
3913	NEXT;
3914	}
3915
3916	failed:
3917	SKIP_BLANKS;
3918	}
3919
3920	/*
3921	* Handle specific association to the META tag
3922	*/
3923	if (meta && (nbatts != 0))
3924	htmlCheckMeta(ctxt, atts);
3925
3926	/*
3927	* SAX: Start of Element !
3928	*/
3929	if (!discardtag) {
3930	htmlnamePush(ctxt, name);
3931	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3932	if (nbatts != 0)
3933	ctxt->sax->startElement(ctxt->userData, name, atts);
3934	else
3935	ctxt->sax->startElement(ctxt->userData, name, NULL);
3936	}
3937	}
3938
3939	if (atts != NULL) {
3940	for (i = 1;i < nbatts;i += 2) {
3941	if (atts[i] != NULL)
3942	xmlFree((xmlChar *) atts[i]);
3943	}
3944	}
3945
3946	return(discardtag);
3947	}
3948
3949	/**
3950	* htmlParseEndTag:
3951	* @ctxt: an HTML parser context
3952	*
3953	* parse an end of tag
3954	*
3955	* [42] ETag ::= '</' Name S? '>'
3956	*
3957	* With namespace
3958	*
3959	* [NS 9] ETag ::= '</' QName S? '>'
3960	*
3961	* Returns 1 if the current level should be closed.
3962	*/
3963
3964	static int
3965	htmlParseEndTag(htmlParserCtxtPtr ctxt)
3966	{
3967	const xmlChar *name;
3968	const xmlChar *oldname;
3969	int i, ret;
3970
3971	if ((CUR != '<') \|\| (NXT(1) != '/')) {
3972	htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3973	"htmlParseEndTag: '</' not found\n", NULL, NULL);
3974	return (0);
3975	}
3976	SKIP(2);
3977
3978	name = htmlParseHTMLName(ctxt);
3979	if (name == NULL)
3980	return (0);
3981	/*
3982	* We should definitely be at the ending "S? '>'" part
3983	*/
3984	SKIP_BLANKS;
3985	if (CUR != '>') {
3986	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3987	"End tag : expected '>'\n", NULL, NULL);
3988	/* Skip to next '>' */
3989	while ((PARSER_STOPPED(ctxt) == 0) &&
3990	(CUR != 0) && (CUR != '>'))
3991	NEXT;
3992	}
3993	if (CUR == '>')
3994	NEXT;
3995
3996	/*
3997	* if we ignored misplaced tags in htmlParseStartTag don't pop them
3998	* out now.
3999	*/
4000	if ((ctxt->depth > 0) &&
4001	(xmlStrEqual(name, BAD_CAST "html") \|\|
4002	xmlStrEqual(name, BAD_CAST "body") \|\|
4003	xmlStrEqual(name, BAD_CAST "head"))) {
4004	ctxt->depth--;
4005	return (0);
4006	}
4007
4008	/*
4009	* If the name read is not one of the element in the parsing stack
4010	* then return, it's just an error.
4011	*/
4012	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4013	if (xmlStrEqual(name, ctxt->nameTab[i]))
4014	break;
4015	}
4016	if (i < 0) {
4017	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4018	"Unexpected end tag : %s\n", name, NULL);
4019	return (0);
4020	}
4021
4022
4023	/*
4024	* Check for auto-closure of HTML elements.
4025	*/
4026
4027	htmlAutoCloseOnClose(ctxt, name);
4028
4029	/*
4030	* Well formedness constraints, opening and closing must match.
4031	* With the exception that the autoclose may have popped stuff out
4032	* of the stack.
4033	*/
4034	if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4035	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4036	"Opening and ending tag mismatch: %s and %s\n",
4037	name, ctxt->name);
4038	}
4039
4040	/*
4041	* SAX: End of Tag
4042	*/
4043	oldname = ctxt->name;
4044	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4045	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4046	ctxt->sax->endElement(ctxt->userData, name);
4047	htmlNodeInfoPop(ctxt);
4048	htmlnamePop(ctxt);
4049	ret = 1;
4050	} else {
4051	ret = 0;
4052	}
4053
4054	return (ret);
4055	}
4056
4057
4058	/**
4059	* htmlParseReference:
4060	* @ctxt: an HTML parser context
4061	*
4062	* parse and handle entity references in content,
4063	* this will end-up in a call to character() since this is either a
4064	* CharRef, or a predefined entity.
4065	*/
4066	static void
4067	htmlParseReference(htmlParserCtxtPtr ctxt) {
4068	const htmlEntityDesc * ent;
4069	xmlChar out[6];
4070	const xmlChar *name;
4071	if (CUR != '&') return;
4072
4073	if (NXT(1) == '#') {
4074	unsigned int c;
4075	int bits, i = 0;
4076
4077	c = htmlParseCharRef(ctxt);
4078	if (c == 0)
4079	return;
4080
4081	if (c < 0x80) { out[i++]= c; bits= -6; }
4082	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
4083	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
4084	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
4085
4086	for ( ; bits >= 0; bits-= 6) {
4087	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
4088	}
4089	out[i] = 0;
4090
4091	htmlCheckParagraph(ctxt);
4092	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4093	ctxt->sax->characters(ctxt->userData, out, i);
4094	} else {
4095	ent = htmlParseEntityRef(ctxt, &name);
4096	if (name == NULL) {
4097	htmlCheckParagraph(ctxt);
4098	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4099	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4100	return;
4101	}
4102	if ((ent == NULL) \|\| !(ent->value > 0)) {
4103	htmlCheckParagraph(ctxt);
4104	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4105	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4106	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4107	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4108	}
4109	} else {
4110	unsigned int c;
4111	int bits, i = 0;
4112
4113	c = ent->value;
4114	if (c < 0x80)
4115	{ out[i++]= c; bits= -6; }
4116	else if (c < 0x800)
4117	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
4118	else if (c < 0x10000)
4119	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
4120	else
4121	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
4122
4123	for ( ; bits >= 0; bits-= 6) {
4124	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
4125	}
4126	out[i] = 0;
4127
4128	htmlCheckParagraph(ctxt);
4129	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4130	ctxt->sax->characters(ctxt->userData, out, i);
4131	}
4132	}
4133	}
4134
4135	/**
4136	* htmlParseContent:
4137	* @ctxt: an HTML parser context
4138	*
4139	* Parse a content: comment, sub-element, reference or text.
4140	* Kept for compatibility with old code
4141	*/
4142
4143	static void
4144	htmlParseContent(htmlParserCtxtPtr ctxt) {
4145	xmlChar *currentNode;
4146	int depth;
4147	const xmlChar *name;
4148
4149	currentNode = xmlStrdup(ctxt->name);
4150	depth = ctxt->nameNr;
4151	while (!PARSER_STOPPED(ctxt)) {
4152	GROW;
4153
4154	/*
4155	* Our tag or one of it's parent or children is ending.
4156	*/
4157	if ((CUR == '<') && (NXT(1) == '/')) {
4158	if (htmlParseEndTag(ctxt) &&
4159	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
4160	if (currentNode != NULL)
4161	xmlFree(currentNode);
4162	return;
4163	}
4164	continue; /* while */
4165	}
4166
4167	else if ((CUR == '<') &&
4168	((IS_ASCII_LETTER(NXT(1))) \|\|
4169	(NXT(1) == '_') \|\| (NXT(1) == ':'))) {
4170	name = htmlParseHTMLName_nonInvasive(ctxt);
4171	if (name == NULL) {
4172	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4173	"htmlParseStartTag: invalid element name\n",
4174	NULL, NULL);
4175	/* Dump the bogus tag like browsers do */
4176	while ((CUR != 0) && (CUR != '>'))
4177	NEXT;
4178
4179	if (currentNode != NULL)
4180	xmlFree(currentNode);
4181	return;
4182	}
4183
4184	if (ctxt->name != NULL) {
4185	if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4186	htmlAutoClose(ctxt, name);
4187	continue;
4188	}
4189	}
4190	}
4191
4192	/*
4193	* Has this node been popped out during parsing of
4194	* the next element
4195	*/
4196	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4197	(!xmlStrEqual(currentNode, ctxt->name)))
4198	{
4199	if (currentNode != NULL) xmlFree(currentNode);
4200	return;
4201	}
4202
4203	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
4204	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
4205	/*
4206	* Handle SCRIPT/STYLE separately
4207	*/
4208	htmlParseScript(ctxt);
4209	}
4210
4211	else if ((CUR == '<') && (NXT(1) == '!')) {
4212	/*
4213	* Sometimes DOCTYPE arrives in the middle of the document
4214	*/
4215	if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4216	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4217	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4218	(UPP(8) == 'E')) {
4219	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4220	"Misplaced DOCTYPE declaration\n",
4221	BAD_CAST "DOCTYPE" , NULL);
4222	htmlParseDocTypeDecl(ctxt);
4223	}
4224	/*
4225	* First case : a comment
4226	*/
4227	else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4228	htmlParseComment(ctxt);
4229	}
4230	else {
4231	htmlSkipBogusComment(ctxt);
4232	}
4233	}
4234
4235	/*
4236	* Second case : a Processing Instruction.
4237	*/
4238	else if ((CUR == '<') && (NXT(1) == '?')) {
4239	htmlParsePI(ctxt);
4240	}
4241
4242	/*
4243	* Third case : a sub-element.
4244	*/
4245	else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4246	htmlParseElement(ctxt);
4247	}
4248	else if (CUR == '<') {
4249	if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4250	(ctxt->sax->characters != NULL))
4251	ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4252	NEXT;
4253	}
4254
4255	/*
4256	* Fourth case : a reference. If if has not been resolved,
4257	* parsing returns it's Name, create the node
4258	*/
4259	else if (CUR == '&') {
4260	htmlParseReference(ctxt);
4261	}
4262
4263	/*
4264	* Fifth case : end of the resource
4265	*/
4266	else if (CUR == 0) {
4267	htmlAutoCloseOnEnd(ctxt);
4268	break;
4269	}
4270
4271	/*
4272	* Last case, text. Note that References are handled directly.
4273	*/
4274	else {
4275	htmlParseCharData(ctxt);
4276	}
4277
4278	SHRINK;
4279	GROW;
4280	}
4281	if (currentNode != NULL) xmlFree(currentNode);
4282	}
4283
4284	/**
4285	* htmlParseElement:
4286	* @ctxt: an HTML parser context
4287	*
4288	* DEPRECATED: Internal function, don't use.
4289	*
4290	* parse an HTML element, this is highly recursive
4291	* this is kept for compatibility with previous code versions
4292	*
4293	* [39] element ::= EmptyElemTag \| STag content ETag
4294	*
4295	* [41] Attribute ::= Name Eq AttValue
4296	*/
4297
4298	void
4299	htmlParseElement(htmlParserCtxtPtr ctxt) {
4300	const xmlChar *name;
4301	xmlChar *currentNode = NULL;
4302	const htmlElemDesc * info;
4303	htmlParserNodeInfo node_info;
4304	int failed;
4305	int depth;
4306	const xmlChar *oldptr;
4307
4308	if ((ctxt == NULL) \|\| (ctxt->input == NULL))
4309	return;
4310
4311	/* Capture start position */
4312	if (ctxt->record_info) {
4313	node_info.begin_pos = ctxt->input->consumed +
4314	(CUR_PTR - ctxt->input->base);
4315	node_info.begin_line = ctxt->input->line;
4316	}
4317
4318	failed = htmlParseStartTag(ctxt);
4319	name = ctxt->name;
4320	if ((failed == -1) \|\| (name == NULL)) {
4321	if (CUR == '>')
4322	NEXT;
4323	return;
4324	}
4325
4326	/*
4327	* Lookup the info for that element.
4328	*/
4329	info = htmlTagLookup(name);
4330	if (info == NULL) {
4331	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4332	"Tag %s invalid\n", name, NULL);
4333	}
4334
4335	/*
4336	* Check for an Empty Element labeled the XML/SGML way
4337	*/
4338	if ((CUR == '/') && (NXT(1) == '>')) {
4339	SKIP(2);
4340	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4341	ctxt->sax->endElement(ctxt->userData, name);
4342	htmlnamePop(ctxt);
4343	return;
4344	}
4345
4346	if (CUR == '>') {
4347	NEXT;
4348	} else {
4349	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4350	"Couldn't find end of Start Tag %s\n", name, NULL);
4351
4352	/*
4353	* end of parsing of this node.
4354	*/
4355	if (xmlStrEqual(name, ctxt->name)) {
4356	nodePop(ctxt);
4357	htmlnamePop(ctxt);
4358	}
4359
4360	/*
4361	* Capture end position and add node
4362	*/
4363	if (ctxt->record_info) {
4364	node_info.end_pos = ctxt->input->consumed +
4365	(CUR_PTR - ctxt->input->base);
4366	node_info.end_line = ctxt->input->line;
4367	node_info.node = ctxt->node;
4368	xmlParserAddNodeInfo(ctxt, &node_info);
4369	}
4370	return;
4371	}
4372
4373	/*
4374	* Check for an Empty Element from DTD definition
4375	*/
4376	if ((info != NULL) && (info->empty)) {
4377	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4378	ctxt->sax->endElement(ctxt->userData, name);
4379	htmlnamePop(ctxt);
4380	return;
4381	}
4382
4383	/*
4384	* Parse the content of the element:
4385	*/
4386	currentNode = xmlStrdup(ctxt->name);
4387	depth = ctxt->nameNr;
4388	while (CUR != 0) {
4389	oldptr = ctxt->input->cur;
4390	htmlParseContent(ctxt);
4391	if (oldptr==ctxt->input->cur) break;
4392	if (ctxt->nameNr < depth) break;
4393	}
4394
4395	/*
4396	* Capture end position and add node
4397	*/
4398	if ( currentNode != NULL && ctxt->record_info ) {
4399	node_info.end_pos = ctxt->input->consumed +
4400	(CUR_PTR - ctxt->input->base);
4401	node_info.end_line = ctxt->input->line;
4402	node_info.node = ctxt->node;
4403	xmlParserAddNodeInfo(ctxt, &node_info);
4404	}
4405	if (CUR == 0) {
4406	htmlAutoCloseOnEnd(ctxt);
4407	}
4408
4409	if (currentNode != NULL)
4410	xmlFree(currentNode);
4411	}
4412
4413	static void
4414	htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4415	/*
4416	* Capture end position and add node
4417	*/
4418	if ( ctxt->node != NULL && ctxt->record_info ) {
4419	ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4420	(CUR_PTR - ctxt->input->base);
4421	ctxt->nodeInfo->end_line = ctxt->input->line;
4422	ctxt->nodeInfo->node = ctxt->node;
4423	xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4424	htmlNodeInfoPop(ctxt);
4425	}
4426	if (CUR == 0) {
4427	htmlAutoCloseOnEnd(ctxt);
4428	}
4429	}
4430
4431	/**
4432	* htmlParseElementInternal:
4433	* @ctxt: an HTML parser context
4434	*
4435	* parse an HTML element, new version, non recursive
4436	*
4437	* [39] element ::= EmptyElemTag \| STag content ETag
4438	*
4439	* [41] Attribute ::= Name Eq AttValue
4440	*/
4441
4442	static void
4443	htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4444	const xmlChar *name;
4445	const htmlElemDesc * info;
4446	htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4447	int failed;
4448
4449	if ((ctxt == NULL) \|\| (ctxt->input == NULL))
4450	return;
4451
4452	/* Capture start position */
4453	if (ctxt->record_info) {
4454	node_info.begin_pos = ctxt->input->consumed +
4455	(CUR_PTR - ctxt->input->base);
4456	node_info.begin_line = ctxt->input->line;
4457	}
4458
4459	failed = htmlParseStartTag(ctxt);
4460	name = ctxt->name;
4461	if ((failed == -1) \|\| (name == NULL)) {
4462	if (CUR == '>')
4463	NEXT;
4464	return;
4465	}
4466
4467	/*
4468	* Lookup the info for that element.
4469	*/
4470	info = htmlTagLookup(name);
4471	if (info == NULL) {
4472	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4473	"Tag %s invalid\n", name, NULL);
4474	}
4475
4476	/*
4477	* Check for an Empty Element labeled the XML/SGML way
4478	*/
4479	if ((CUR == '/') && (NXT(1) == '>')) {
4480	SKIP(2);
4481	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4482	ctxt->sax->endElement(ctxt->userData, name);
4483	htmlnamePop(ctxt);
4484	return;
4485	}
4486
4487	if (CUR == '>') {
4488	NEXT;
4489	} else {
4490	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4491	"Couldn't find end of Start Tag %s\n", name, NULL);
4492
4493	/*
4494	* end of parsing of this node.
4495	*/
4496	if (xmlStrEqual(name, ctxt->name)) {
4497	nodePop(ctxt);
4498	htmlnamePop(ctxt);
4499	}
4500
4501	if (ctxt->record_info)
4502	htmlNodeInfoPush(ctxt, &node_info);
4503	htmlParserFinishElementParsing(ctxt);
4504	return;
4505	}
4506
4507	/*
4508	* Check for an Empty Element from DTD definition
4509	*/
4510	if ((info != NULL) && (info->empty)) {
4511	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4512	ctxt->sax->endElement(ctxt->userData, name);
4513	htmlnamePop(ctxt);
4514	return;
4515	}
4516
4517	if (ctxt->record_info)
4518	htmlNodeInfoPush(ctxt, &node_info);
4519	}
4520
4521	/**
4522	* htmlParseContentInternal:
4523	* @ctxt: an HTML parser context
4524	*
4525	* Parse a content: comment, sub-element, reference or text.
4526	* New version for non recursive htmlParseElementInternal
4527	*/
4528
4529	static void
4530	htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4531	xmlChar *currentNode;
4532	int depth;
4533	const xmlChar *name;
4534
4535	depth = ctxt->nameNr;
4536	if (depth <= 0) {
4537	currentNode = NULL;
4538	} else {
4539	currentNode = xmlStrdup(ctxt->name);
4540	if (currentNode == NULL) {
4541	htmlErrMemory(ctxt);
4542	return;
4543	}
4544	}
4545	while (PARSER_STOPPED(ctxt) == 0) {
4546	GROW;
4547
4548	/*
4549	* Our tag or one of it's parent or children is ending.
4550	*/
4551	if ((CUR == '<') && (NXT(1) == '/')) {
4552	if (htmlParseEndTag(ctxt) &&
4553	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
4554	if (currentNode != NULL)
4555	xmlFree(currentNode);
4556
4557	depth = ctxt->nameNr;
4558	if (depth <= 0) {
4559	currentNode = NULL;
4560	} else {
4561	currentNode = xmlStrdup(ctxt->name);
4562	if (currentNode == NULL) {
4563	htmlErrMemory(ctxt);
4564	break;
4565	}
4566	}
4567	}
4568	continue; /* while */
4569	}
4570
4571	else if ((CUR == '<') &&
4572	((IS_ASCII_LETTER(NXT(1))) \|\|
4573	(NXT(1) == '_') \|\| (NXT(1) == ':'))) {
4574	name = htmlParseHTMLName_nonInvasive(ctxt);
4575	if (name == NULL) {
4576	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4577	"htmlParseStartTag: invalid element name\n",
4578	NULL, NULL);
4579	/* Dump the bogus tag like browsers do */
4580	while ((CUR == 0) && (CUR != '>'))
4581	NEXT;
4582
4583	htmlParserFinishElementParsing(ctxt);
4584	if (currentNode != NULL)
4585	xmlFree(currentNode);
4586
4587	if (ctxt->name == NULL) {
4588	currentNode = NULL;
4589	} else {
4590	currentNode = xmlStrdup(ctxt->name);
4591	if (currentNode == NULL) {
4592	htmlErrMemory(ctxt);
4593	break;
4594	}
4595	}
4596	depth = ctxt->nameNr;
4597	continue;
4598	}
4599
4600	if (ctxt->name != NULL) {
4601	if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4602	htmlAutoClose(ctxt, name);
4603	continue;
4604	}
4605	}
4606	}
4607
4608	/*
4609	* Has this node been popped out during parsing of
4610	* the next element
4611	*/
4612	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4613	(!xmlStrEqual(currentNode, ctxt->name)))
4614	{
4615	htmlParserFinishElementParsing(ctxt);
4616	if (currentNode != NULL) xmlFree(currentNode);
4617
4618	if (ctxt->name == NULL) {
4619	currentNode = NULL;
4620	} else {
4621	currentNode = xmlStrdup(ctxt->name);
4622	if (currentNode == NULL) {
4623	htmlErrMemory(ctxt);
4624	break;
4625	}
4626	}
4627	depth = ctxt->nameNr;
4628	continue;
4629	}
4630
4631	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
4632	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
4633	/*
4634	* Handle SCRIPT/STYLE separately
4635	*/
4636	htmlParseScript(ctxt);
4637	}
4638
4639	else if ((CUR == '<') && (NXT(1) == '!')) {
4640	/*
4641	* Sometimes DOCTYPE arrives in the middle of the document
4642	*/
4643	if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4644	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4645	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4646	(UPP(8) == 'E')) {
4647	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4648	"Misplaced DOCTYPE declaration\n",
4649	BAD_CAST "DOCTYPE" , NULL);
4650	htmlParseDocTypeDecl(ctxt);
4651	}
4652	/*
4653	* First case : a comment
4654	*/
4655	else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4656	htmlParseComment(ctxt);
4657	}
4658	else {
4659	htmlSkipBogusComment(ctxt);
4660	}
4661	}
4662
4663	/*
4664	* Second case : a Processing Instruction.
4665	*/
4666	else if ((CUR == '<') && (NXT(1) == '?')) {
4667	htmlParsePI(ctxt);
4668	}
4669
4670	/*
4671	* Third case : a sub-element.
4672	*/
4673	else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4674	htmlParseElementInternal(ctxt);
4675	if (currentNode != NULL) xmlFree(currentNode);
4676
4677	if (ctxt->name == NULL) {
4678	currentNode = NULL;
4679	} else {
4680	currentNode = xmlStrdup(ctxt->name);
4681	if (currentNode == NULL) {
4682	htmlErrMemory(ctxt);
4683	break;
4684	}
4685	}
4686	depth = ctxt->nameNr;
4687	}
4688	else if (CUR == '<') {
4689	if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4690	(ctxt->sax->characters != NULL))
4691	ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4692	NEXT;
4693	}
4694
4695	/*
4696	* Fourth case : a reference. If if has not been resolved,
4697	* parsing returns it's Name, create the node
4698	*/
4699	else if (CUR == '&') {
4700	htmlParseReference(ctxt);
4701	}
4702
4703	/*
4704	* Fifth case : end of the resource
4705	*/
4706	else if (CUR == 0) {
4707	htmlAutoCloseOnEnd(ctxt);
4708	break;
4709	}
4710
4711	/*
4712	* Last case, text. Note that References are handled directly.
4713	*/
4714	else {
4715	htmlParseCharData(ctxt);
4716	}
4717
4718	SHRINK;
4719	GROW;
4720	}
4721	if (currentNode != NULL) xmlFree(currentNode);
4722	}
4723
4724	/**
4725	* htmlParseContent:
4726	* @ctxt: an HTML parser context
4727	*
4728	* Parse a content: comment, sub-element, reference or text.
4729	* This is the entry point when called from parser.c
4730	*/
4731
4732	void
4733	__htmlParseContent(void *ctxt) {
4734	if (ctxt != NULL)
4735	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4736	}
4737
4738	/**
4739	* htmlParseDocument:
4740	* @ctxt: an HTML parser context
4741	*
4742	* Parse an HTML document and invoke the SAX handlers. This is useful
4743	* if you're only interested in custom SAX callbacks. If you want a
4744	* document tree, use htmlCtxtParseDocument.
4745	*
4746	* Returns 0, -1 in case of error.
4747	*/
4748
4749	int
4750	htmlParseDocument(htmlParserCtxtPtr ctxt) {
4751	xmlDtdPtr dtd;
4752
4753	if ((ctxt == NULL) \|\| (ctxt->input == NULL))
4754	return(-1);
4755
4756	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
4757	ctxt->sax->setDocumentLocator(ctxt->userData,
4758	(xmlSAXLocator *) &xmlDefaultSAXLocator);
4759	}
4760
4761	xmlDetectEncoding(ctxt);
4762
4763	/*
4764	* This is wrong but matches long-standing behavior. In most cases,
4765	* a document starting with an XML declaration will specify UTF-8.
4766	*/
4767	if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4768	(xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
4769	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4770
4771	/*
4772	* Wipe out everything which is before the first '<'
4773	*/
4774	SKIP_BLANKS;
4775	if (CUR == 0) {
4776	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4777	"Document is empty\n", NULL, NULL);
4778	}
4779
4780	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4781	ctxt->sax->startDocument(ctxt->userData);
4782
4783	/*
4784	* Parse possible comments and PIs before any content
4785	*/
4786	while (((CUR == '<') && (NXT(1) == '!') &&
4787	(NXT(2) == '-') && (NXT(3) == '-')) \|\|
4788	((CUR == '<') && (NXT(1) == '?'))) {
4789	htmlParseComment(ctxt);
4790	htmlParsePI(ctxt);
4791	SKIP_BLANKS;
4792	}
4793
4794
4795	/*
4796	* Then possibly doc type declaration(s) and more Misc
4797	* (doctypedecl Misc*)?
4798	*/
4799	if ((CUR == '<') && (NXT(1) == '!') &&
4800	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4801	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4802	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4803	(UPP(8) == 'E')) {
4804	htmlParseDocTypeDecl(ctxt);
4805	}
4806	SKIP_BLANKS;
4807
4808	/*
4809	* Parse possible comments and PIs before any content
4810	*/
4811	while ((PARSER_STOPPED(ctxt) == 0) &&
4812	(((CUR == '<') && (NXT(1) == '!') &&
4813	(NXT(2) == '-') && (NXT(3) == '-')) \|\|
4814	((CUR == '<') && (NXT(1) == '?')))) {
4815	htmlParseComment(ctxt);
4816	htmlParsePI(ctxt);
4817	SKIP_BLANKS;
4818	}
4819
4820	/*
4821	* Time to start parsing the tree itself
4822	*/
4823	htmlParseContentInternal(ctxt);
4824
4825	/*
4826	* autoclose
4827	*/
4828	if (CUR == 0)
4829	htmlAutoCloseOnEnd(ctxt);
4830
4831
4832	/*
4833	* SAX: end of the document processing.
4834	*/
4835	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4836	ctxt->sax->endDocument(ctxt->userData);
4837
4838	if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4839	dtd = xmlGetIntSubset(ctxt->myDoc);
4840	if (dtd == NULL) {
4841	ctxt->myDoc->intSubset =
4842	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4843	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4844	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4845	if (ctxt->myDoc->intSubset == NULL)
4846	htmlErrMemory(ctxt);
4847	}
4848	}
4849	if (! ctxt->wellFormed) return(-1);
4850	return(0);
4851	}
4852
4853
4854	/************************************************************************
4855	* *
4856	* Parser contexts handling *
4857	* *
4858	************************************************************************/
4859
4860	/**
4861	* htmlInitParserCtxt:
4862	* @ctxt: an HTML parser context
4863	* @sax: SAX handler
4864	* @userData: user data
4865	*
4866	* Initialize a parser context
4867	*
4868	* Returns 0 in case of success and -1 in case of error
4869	*/
4870
4871	static int
4872	htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
4873	void *userData)
4874	{
4875	if (ctxt == NULL) return(-1);
4876	memset(ctxt, 0, sizeof(htmlParserCtxt));
4877
4878	ctxt->dict = xmlDictCreate();
4879	if (ctxt->dict == NULL)
4880	return(-1);
4881
4882	if (ctxt->sax == NULL)
4883	ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4884	if (ctxt->sax == NULL)
4885	return(-1);
4886	if (sax == NULL) {
4887	memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
4888	xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
4889	ctxt->userData = ctxt;
4890	} else {
4891	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4892	ctxt->userData = userData ? userData : ctxt;
4893	}
4894
4895	/* Allocate the Input stack */
4896	ctxt->inputTab = (htmlParserInputPtr *)
4897	xmlMalloc(5 * sizeof(htmlParserInputPtr));
4898	if (ctxt->inputTab == NULL)
4899	return(-1);
4900	ctxt->inputNr = 0;
4901	ctxt->inputMax = 5;
4902	ctxt->input = NULL;
4903	ctxt->version = NULL;
4904	ctxt->encoding = NULL;
4905	ctxt->standalone = -1;
4906	ctxt->instate = XML_PARSER_START;
4907
4908	/* Allocate the Node stack */
4909	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
4910	if (ctxt->nodeTab == NULL)
4911	return(-1);
4912	ctxt->nodeNr = 0;
4913	ctxt->nodeMax = 10;
4914	ctxt->node = NULL;
4915
4916	/* Allocate the Name stack */
4917	ctxt->nameTab = (const xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
4918	if (ctxt->nameTab == NULL)
4919	return(-1);
4920	ctxt->nameNr = 0;
4921	ctxt->nameMax = 10;
4922	ctxt->name = NULL;
4923
4924	ctxt->nodeInfoTab = NULL;
4925	ctxt->nodeInfoNr = 0;
4926	ctxt->nodeInfoMax = 0;
4927
4928	ctxt->myDoc = NULL;
4929	ctxt->wellFormed = 1;
4930	ctxt->replaceEntities = 0;
4931	ctxt->linenumbers = xmlLineNumbersDefaultValue;
4932	ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4933	ctxt->html = 1;
4934	ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
4935	ctxt->vctxt.userData = ctxt;
4936	ctxt->vctxt.error = xmlParserValidityError;
4937	ctxt->vctxt.warning = xmlParserValidityWarning;
4938	ctxt->record_info = 0;
4939	ctxt->validate = 0;
4940	ctxt->checkIndex = 0;
4941	ctxt->catalogs = NULL;
4942	xmlInitNodeInfoSeq(&ctxt->node_seq);
4943	return(0);
4944	}
4945
4946	/**
4947	* htmlFreeParserCtxt:
4948	* @ctxt: an HTML parser context
4949	*
4950	* Free all the memory used by a parser context. However the parsed
4951	* document in ctxt->myDoc is not freed.
4952	*/
4953
4954	void
4955	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4956	{
4957	xmlFreeParserCtxt(ctxt);
4958	}
4959
4960	/**
4961	* htmlNewParserCtxt:
4962	*
4963	* Allocate and initialize a new HTML parser context.
4964	*
4965	* This can be used to parse HTML documents into DOM trees with
4966	* functions like xmlCtxtReadFile or xmlCtxtReadMemory.
4967	*
4968	* See htmlCtxtUseOptions for parser options.
4969	*
4970	* See xmlCtxtSetErrorHandler for advanced error handling.
4971	*
4972	* See xmlNewInputURL, xmlNewInputMemory, xmlNewInputIO and similar
4973	* functions for advanced input control.
4974	*
4975	* See htmlNewSAXParserCtxt for custom SAX parsers.
4976	*
4977	* Returns the htmlParserCtxtPtr or NULL in case of allocation error
4978	*/
4979
4980	htmlParserCtxtPtr
4981	htmlNewParserCtxt(void)
4982	{
4983	return(htmlNewSAXParserCtxt(NULL, NULL));
4984	}
4985
4986	/**
4987	* htmlNewSAXParserCtxt:
4988	* @sax: SAX handler
4989	* @userData: user data
4990	*
4991	* Allocate and initialize a new HTML SAX parser context. If userData
4992	* is NULL, the parser context will be passed as user data.
4993	*
4994	* Available since 2.11.0. If you want support older versions,
4995	* it's best to invoke htmlNewParserCtxt and set ctxt->sax with
4996	* struct assignment.
4997	*
4998	* Also see htmlNewParserCtxt.
4999	*
5000	* Returns the htmlParserCtxtPtr or NULL in case of allocation error
5001	*/
5002
5003	htmlParserCtxtPtr
5004	htmlNewSAXParserCtxt(const htmlSAXHandler sax, void userData)
5005	{
5006	xmlParserCtxtPtr ctxt;
5007
5008	xmlInitParser();
5009
5010	ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5011	if (ctxt == NULL)
5012	return(NULL);
5013	memset(ctxt, 0, sizeof(xmlParserCtxt));
5014	if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
5015	htmlFreeParserCtxt(ctxt);
5016	return(NULL);
5017	}
5018	return(ctxt);
5019	}
5020
5021	static htmlParserCtxtPtr
5022	htmlCreateMemoryParserCtxtInternal(const char *url,
5023	const char *buffer, size_t size,
5024	const char *encoding) {
5025	xmlParserCtxtPtr ctxt;
5026	xmlParserInputPtr input;
5027
5028	if (buffer == NULL)
5029	return(NULL);
5030
5031	ctxt = htmlNewParserCtxt();
5032	if (ctxt == NULL)
5033	return(NULL);
5034
5035	input = xmlNewInputMemory(ctxt, url, buffer, size, encoding, 0);
5036	if (input == NULL) {
5037	xmlFreeParserCtxt(ctxt);
5038	return(NULL);
5039	}
5040
5041	inputPush(ctxt, input);
5042
5043	return(ctxt);
5044	}
5045
5046	/**
5047	* htmlCreateMemoryParserCtxt:
5048	* @buffer: a pointer to a char array
5049	* @size: the size of the array
5050	*
5051	* DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadMemory.
5052	*
5053	* Create a parser context for an HTML in-memory document. The input
5054	* buffer must not contain any terminating null bytes.
5055	*
5056	* Returns the new parser context or NULL
5057	*/
5058	htmlParserCtxtPtr
5059	htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5060	if (size <= 0)
5061	return(NULL);
5062
5063	return(htmlCreateMemoryParserCtxtInternal(NULL, buffer, size, NULL));
5064	}
5065
5066	/**
5067	* htmlCreateDocParserCtxt:
5068	* @str: a pointer to an array of xmlChar
5069	* @encoding: encoding (optional)
5070	*
5071	* Create a parser context for a null-terminated string.
5072	*
5073	* Returns the new parser context or NULL if a memory allocation failed.
5074	*/
5075	static htmlParserCtxtPtr
5076	htmlCreateDocParserCtxt(const xmlChar str, const char url,
5077	const char *encoding) {
5078	xmlParserCtxtPtr ctxt;
5079	xmlParserInputPtr input;
5080
5081	if (str == NULL)
5082	return(NULL);
5083
5084	ctxt = htmlNewParserCtxt();
5085	if (ctxt == NULL)
5086	return(NULL);
5087
5088	input = xmlNewInputString(ctxt, url, (const char *) str, encoding, 0);
5089	if (input == NULL) {
5090	xmlFreeParserCtxt(ctxt);
5091	return(NULL);
5092	}
5093
5094	inputPush(ctxt, input);
5095
5096	return(ctxt);
5097	}
5098
5099	#ifdef LIBXML_PUSH_ENABLED
5100	/************************************************************************
5101	* *
5102	* Progressive parsing interfaces *
5103	* *
5104	************************************************************************/
5105
5106	/**
5107	* htmlParseLookupSequence:
5108	* @ctxt: an HTML parser context
5109	* @first: the first char to lookup
5110	* @next: the next char to lookup or zero
5111	* @third: the next char to lookup or zero
5112	* @ignoreattrval: skip over attribute values
5113	*
5114	* Try to find if a sequence (first, next, third) or just (first next) or
5115	* (first) is available in the input stream.
5116	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
5117	* to avoid rescanning sequences of bytes, it DOES change the state of the
5118	* parser, do not use liberally.
5119	* This is basically similar to xmlParseLookupSequence()
5120	*
5121	* Returns the index to the current parsing point if the full sequence
5122	* is available, -1 otherwise.
5123	*/
5124	static int
5125	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5126	xmlChar next, xmlChar third, int ignoreattrval)
5127	{
5128	size_t base, len;
5129	htmlParserInputPtr in;
5130	const xmlChar *buf;
5131	int quote;
5132
5133	in = ctxt->input;
5134	if (in == NULL)
5135	return (-1);
5136
5137	base = ctxt->checkIndex;
5138	quote = ctxt->endCheckState;
5139
5140	buf = in->cur;
5141	len = in->end - in->cur;
5142
5143	/* take into account the sequence length */
5144	if (third)
5145	len -= 2;
5146	else if (next)
5147	len--;
5148	for (; base < len; base++) {
5149	if (base >= INT_MAX / 2) {
5150	ctxt->checkIndex = 0;
5151	ctxt->endCheckState = 0;
5152	return (base - 2);
5153	}
5154	if (ignoreattrval) {
5155	if (quote) {
5156	if (buf[base] == quote)
5157	quote = 0;
5158	continue;
5159	}
5160	if (buf[base] == '"' \|\| buf[base] == '\'') {
5161	quote = buf[base];
5162	continue;
5163	}
5164	}
5165	if (buf[base] == first) {
5166	if (third != 0) {
5167	if ((buf[base + 1] != next) \|\| (buf[base + 2] != third))
5168	continue;
5169	} else if (next != 0) {
5170	if (buf[base + 1] != next)
5171	continue;
5172	}
5173	ctxt->checkIndex = 0;
5174	ctxt->endCheckState = 0;
5175	return (base);
5176	}
5177	}
5178	ctxt->checkIndex = base;
5179	ctxt->endCheckState = quote;
5180	return (-1);
5181	}
5182
5183	/**
5184	* htmlParseLookupCommentEnd:
5185	* @ctxt: an HTML parser context
5186	*
5187	* Try to find a comment end tag in the input stream
5188	* The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5189	* (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5190	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
5191	* to avoid rescanning sequences of bytes, it DOES change the state of the
5192	* parser, do not use liberally.
5193	* This wraps to htmlParseLookupSequence()
5194	*
5195	* Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5196	*/
5197	static int
5198	htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5199	{
5200	int mark = 0;
5201	int offset;
5202
5203	while (1) {
5204	mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5205	if (mark < 0)
5206	break;
5207	if ((NXT(mark+2) == '>') \|\|
5208	((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5209	ctxt->checkIndex = 0;
5210	break;
5211	}
5212	offset = (NXT(mark+2) == '!') ? 3 : 2;
5213	if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
5214	ctxt->checkIndex = mark;
5215	return(-1);
5216	}
5217	ctxt->checkIndex = mark + 1;
5218	}
5219	return mark;
5220	}
5221
5222
5223	/**
5224	* htmlParseTryOrFinish:
5225	* @ctxt: an HTML parser context
5226	* @terminate: last chunk indicator
5227	*
5228	* Try to progress on parsing
5229	*
5230	* Returns zero if no parsing was possible
5231	*/
5232	static int
5233	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5234	int ret = 0;
5235	htmlParserInputPtr in;
5236	ptrdiff_t avail = 0;
5237	xmlChar cur, next;
5238
5239	htmlParserNodeInfo node_info;
5240
5241	while (PARSER_STOPPED(ctxt) == 0) {
5242
5243	in = ctxt->input;
5244	if (in == NULL) break;
5245	avail = in->end - in->cur;
5246	if ((avail == 0) && (terminate)) {
5247	htmlAutoCloseOnEnd(ctxt);
5248	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5249	/*
5250	* SAX: end of the document processing.
5251	*/
5252	ctxt->instate = XML_PARSER_EOF;
5253	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5254	ctxt->sax->endDocument(ctxt->userData);
5255	}
5256	}
5257	if (avail < 1)
5258	goto done;
5259	/*
5260	* This is done to make progress and avoid an infinite loop
5261	* if a parsing attempt was aborted by hitting a NUL byte. After
5262	* changing htmlCurrentChar, this probably isn't necessary anymore.
5263	* We should consider removing this check.
5264	*/
5265	cur = in->cur[0];
5266	if (cur == 0) {
5267	SKIP(1);
5268	continue;
5269	}
5270
5271	switch (ctxt->instate) {
5272	case XML_PARSER_EOF:
5273	/*
5274	* Document parsing is done !
5275	*/
5276	goto done;
5277	case XML_PARSER_START:
5278	/*
5279	* This is wrong but matches long-standing behavior. In most
5280	* cases, a document starting with an XML declaration will
5281	* specify UTF-8.
5282	*/
5283	if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
5284	(xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
5285	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
5286	}
5287
5288	/*
5289	* Very first chars read from the document flow.
5290	*/
5291	cur = in->cur[0];
5292	if (IS_BLANK_CH(cur)) {
5293	SKIP_BLANKS;
5294	avail = in->end - in->cur;
5295	}
5296	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
5297	ctxt->sax->setDocumentLocator(ctxt->userData,
5298	(xmlSAXLocator *) &xmlDefaultSAXLocator);
5299	}
5300	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5301	(!ctxt->disableSAX))
5302	ctxt->sax->startDocument(ctxt->userData);
5303
5304	cur = in->cur[0];
5305	next = in->cur[1];
5306	if ((cur == '<') && (next == '!') &&
5307	(UPP(2) == 'D') && (UPP(3) == 'O') &&
5308	(UPP(4) == 'C') && (UPP(5) == 'T') &&
5309	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5310	(UPP(8) == 'E')) {
5311	if ((!terminate) &&
5312	(htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5313	goto done;
5314	htmlParseDocTypeDecl(ctxt);
5315	ctxt->instate = XML_PARSER_PROLOG;
5316	} else {
5317	ctxt->instate = XML_PARSER_MISC;
5318	}
5319	break;
5320	case XML_PARSER_MISC:
5321	SKIP_BLANKS;
5322	avail = in->end - in->cur;
5323	/*
5324	* no chars in buffer
5325	*/
5326	if (avail < 1)
5327	goto done;
5328	/*
5329	* not enough chars in buffer
5330	*/
5331	if (avail < 2) {
5332	if (!terminate)
5333	goto done;
5334	else
5335	next = ' ';
5336	} else {
5337	next = in->cur[1];
5338	}
5339	cur = in->cur[0];
5340	if ((cur == '<') && (next == '!') &&
5341	(in->cur[2] == '-') && (in->cur[3] == '-')) {
5342	if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5343	goto done;
5344	htmlParseComment(ctxt);
5345	ctxt->instate = XML_PARSER_MISC;
5346	} else if ((cur == '<') && (next == '?')) {
5347	if ((!terminate) &&
5348	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5349	goto done;
5350	htmlParsePI(ctxt);
5351	ctxt->instate = XML_PARSER_MISC;
5352	} else if ((cur == '<') && (next == '!') &&
5353	(UPP(2) == 'D') && (UPP(3) == 'O') &&
5354	(UPP(4) == 'C') && (UPP(5) == 'T') &&
5355	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5356	(UPP(8) == 'E')) {
5357	if ((!terminate) &&
5358	(htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5359	goto done;
5360	htmlParseDocTypeDecl(ctxt);
5361	ctxt->instate = XML_PARSER_PROLOG;
5362	} else if ((cur == '<') && (next == '!') &&
5363	(avail < 9)) {
5364	goto done;
5365	} else {
5366	ctxt->instate = XML_PARSER_CONTENT;
5367	}
5368	break;
5369	case XML_PARSER_PROLOG:
5370	SKIP_BLANKS;
5371	avail = in->end - in->cur;
5372	if (avail < 2)
5373	goto done;
5374	cur = in->cur[0];
5375	next = in->cur[1];
5376	if ((cur == '<') && (next == '!') &&
5377	(in->cur[2] == '-') && (in->cur[3] == '-')) {
5378	if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5379	goto done;
5380	htmlParseComment(ctxt);
5381	ctxt->instate = XML_PARSER_PROLOG;
5382	} else if ((cur == '<') && (next == '?')) {
5383	if ((!terminate) &&
5384	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5385	goto done;
5386	htmlParsePI(ctxt);
5387	ctxt->instate = XML_PARSER_PROLOG;
5388	} else if ((cur == '<') && (next == '!') &&
5389	(avail < 4)) {
5390	goto done;
5391	} else {
5392	ctxt->instate = XML_PARSER_CONTENT;
5393	}
5394	break;
5395	case XML_PARSER_EPILOG:
5396	avail = in->end - in->cur;
5397	if (avail < 1)
5398	goto done;
5399	cur = in->cur[0];
5400	if (IS_BLANK_CH(cur)) {
5401	htmlParseCharData(ctxt);
5402	goto done;
5403	}
5404	if (avail < 2)
5405	goto done;
5406	next = in->cur[1];
5407	if ((cur == '<') && (next == '!') &&
5408	(in->cur[2] == '-') && (in->cur[3] == '-')) {
5409	if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5410	goto done;
5411	htmlParseComment(ctxt);
5412	ctxt->instate = XML_PARSER_EPILOG;
5413	} else if ((cur == '<') && (next == '?')) {
5414	if ((!terminate) &&
5415	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5416	goto done;
5417	htmlParsePI(ctxt);
5418	ctxt->instate = XML_PARSER_EPILOG;
5419	} else if ((cur == '<') && (next == '!') &&
5420	(avail < 4)) {
5421	goto done;
5422	} else {
5423	ctxt->errNo = XML_ERR_DOCUMENT_END;
5424	ctxt->wellFormed = 0;
5425	ctxt->instate = XML_PARSER_EOF;
5426	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5427	ctxt->sax->endDocument(ctxt->userData);
5428	goto done;
5429	}
5430	break;
5431	case XML_PARSER_START_TAG: {
5432	const xmlChar *name;
5433	int failed;
5434	const htmlElemDesc * info;
5435
5436	/*
5437	* no chars in buffer
5438	*/
5439	if (avail < 1)
5440	goto done;
5441	/*
5442	* not enough chars in buffer
5443	*/
5444	if (avail < 2) {
5445	if (!terminate)
5446	goto done;
5447	else
5448	next = ' ';
5449	} else {
5450	next = in->cur[1];
5451	}
5452	cur = in->cur[0];
5453	if (cur != '<') {
5454	ctxt->instate = XML_PARSER_CONTENT;
5455	break;
5456	}
5457	if (next == '/') {
5458	ctxt->instate = XML_PARSER_END_TAG;
5459	ctxt->checkIndex = 0;
5460	break;
5461	}
5462	if ((!terminate) &&
5463	(htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5464	goto done;
5465
5466	/* Capture start position */
5467	if (ctxt->record_info) {
5468	node_info.begin_pos = ctxt->input->consumed +
5469	(CUR_PTR - ctxt->input->base);
5470	node_info.begin_line = ctxt->input->line;
5471	}
5472
5473
5474	failed = htmlParseStartTag(ctxt);
5475	name = ctxt->name;
5476	if ((failed == -1) \|\|
5477	(name == NULL)) {
5478	if (CUR == '>')
5479	NEXT;
5480	break;
5481	}
5482
5483	/*
5484	* Lookup the info for that element.
5485	*/
5486	info = htmlTagLookup(name);
5487	if (info == NULL) {
5488	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5489	"Tag %s invalid\n", name, NULL);
5490	}
5491
5492	/*
5493	* Check for an Empty Element labeled the XML/SGML way
5494	*/
5495	if ((CUR == '/') && (NXT(1) == '>')) {
5496	SKIP(2);
5497	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5498	ctxt->sax->endElement(ctxt->userData, name);
5499	htmlnamePop(ctxt);
5500	ctxt->instate = XML_PARSER_CONTENT;
5501	break;
5502	}
5503
5504	if (CUR == '>') {
5505	NEXT;
5506	} else {
5507	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5508	"Couldn't find end of Start Tag %s\n",
5509	name, NULL);
5510
5511	/*
5512	* end of parsing of this node.
5513	*/
5514	if (xmlStrEqual(name, ctxt->name)) {
5515	nodePop(ctxt);
5516	htmlnamePop(ctxt);
5517	}
5518
5519	if (ctxt->record_info)
5520	htmlNodeInfoPush(ctxt, &node_info);
5521
5522	ctxt->instate = XML_PARSER_CONTENT;
5523	break;
5524	}
5525
5526	/*
5527	* Check for an Empty Element from DTD definition
5528	*/
5529	if ((info != NULL) && (info->empty)) {
5530	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5531	ctxt->sax->endElement(ctxt->userData, name);
5532	htmlnamePop(ctxt);
5533	}
5534
5535	if (ctxt->record_info)
5536	htmlNodeInfoPush(ctxt, &node_info);
5537
5538	ctxt->instate = XML_PARSER_CONTENT;
5539	break;
5540	}
5541	case XML_PARSER_CONTENT: {
5542	xmlChar chr[2] = { 0, 0 };
5543
5544	/*
5545	* Handle preparsed entities and charRef
5546	*/
5547	if ((avail == 1) && (terminate)) {
5548	cur = in->cur[0];
5549	if ((cur != '<') && (cur != '&')) {
5550	if (ctxt->sax != NULL) {
5551	chr[0] = cur;
5552	if (IS_BLANK_CH(cur)) {
5553	if (ctxt->keepBlanks) {
5554	if (ctxt->sax->characters != NULL)
5555	ctxt->sax->characters(
5556	ctxt->userData, chr, 1);
5557	} else {
5558	if (ctxt->sax->ignorableWhitespace != NULL)
5559	ctxt->sax->ignorableWhitespace(
5560	ctxt->userData, chr, 1);
5561	}
5562	} else {
5563	htmlCheckParagraph(ctxt);
5564	if (ctxt->sax->characters != NULL)
5565	ctxt->sax->characters(
5566	ctxt->userData, chr, 1);
5567	}
5568	}
5569	ctxt->checkIndex = 0;
5570	in->cur++;
5571	break;
5572	}
5573	}
5574	if (avail < 2)
5575	goto done;
5576	cur = in->cur[0];
5577	next = in->cur[1];
5578	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
5579	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5580	/*
5581	* Handle SCRIPT/STYLE separately
5582	*/
5583	if (!terminate) {
5584	int idx;
5585	xmlChar val;
5586
5587	idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5588	if (idx < 0)
5589	goto done;
5590	val = in->cur[idx + 2];
5591	if (val == 0) { /* bad cut of input */
5592	/*
5593	* FIXME: htmlParseScript checks for additional
5594	* characters after '</'.
5595	*/
5596	ctxt->checkIndex = idx;
5597	goto done;
5598	}
5599	}
5600	htmlParseScript(ctxt);
5601	if ((cur == '<') && (next == '/')) {
5602	ctxt->instate = XML_PARSER_END_TAG;
5603	ctxt->checkIndex = 0;
5604	break;
5605	}
5606	} else if ((cur == '<') && (next == '!')) {
5607	if (avail < 4)
5608	goto done;
5609	/*
5610	* Sometimes DOCTYPE arrives in the middle of the document
5611	*/
5612	if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5613	(UPP(4) == 'C') && (UPP(5) == 'T') &&
5614	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5615	(UPP(8) == 'E')) {
5616	if ((!terminate) &&
5617	(htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5618	goto done;
5619	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5620	"Misplaced DOCTYPE declaration\n",
5621	BAD_CAST "DOCTYPE" , NULL);
5622	htmlParseDocTypeDecl(ctxt);
5623	} else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
5624	if ((!terminate) &&
5625	(htmlParseLookupCommentEnd(ctxt) < 0))
5626	goto done;
5627	htmlParseComment(ctxt);
5628	ctxt->instate = XML_PARSER_CONTENT;
5629	} else {
5630	if ((!terminate) &&
5631	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5632	goto done;
5633	htmlSkipBogusComment(ctxt);
5634	}
5635	} else if ((cur == '<') && (next == '?')) {
5636	if ((!terminate) &&
5637	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5638	goto done;
5639	htmlParsePI(ctxt);
5640	ctxt->instate = XML_PARSER_CONTENT;
5641	} else if ((cur == '<') && (next == '/')) {
5642	ctxt->instate = XML_PARSER_END_TAG;
5643	ctxt->checkIndex = 0;
5644	break;
5645	} else if ((cur == '<') && IS_ASCII_LETTER(next)) {
5646	if ((!terminate) && (next == 0))
5647	goto done;
5648	ctxt->instate = XML_PARSER_START_TAG;
5649	ctxt->checkIndex = 0;
5650	break;
5651	} else if (cur == '<') {
5652	if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
5653	(ctxt->sax->characters != NULL))
5654	ctxt->sax->characters(ctxt->userData,
5655	BAD_CAST "<", 1);
5656	NEXT;
5657	} else {
5658	/*
5659	* check that the text sequence is complete
5660	* before handing out the data to the parser
5661	* to avoid problems with erroneous end of
5662	* data detection.
5663	*/
5664	if ((!terminate) &&
5665	(htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5666	goto done;
5667	ctxt->checkIndex = 0;
5668	while ((PARSER_STOPPED(ctxt) == 0) &&
5669	(cur != '<') && (in->cur < in->end)) {
5670	if (cur == '&') {
5671	htmlParseReference(ctxt);
5672	} else {
5673	htmlParseCharData(ctxt);
5674	}
5675	cur = in->cur[0];
5676	}
5677	}
5678
5679	break;
5680	}
5681	case XML_PARSER_END_TAG:
5682	if (avail < 2)
5683	goto done;
5684	if ((!terminate) &&
5685	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5686	goto done;
5687	htmlParseEndTag(ctxt);
5688	if (ctxt->nameNr == 0) {
5689	ctxt->instate = XML_PARSER_EPILOG;
5690	} else {
5691	ctxt->instate = XML_PARSER_CONTENT;
5692	}
5693	ctxt->checkIndex = 0;
5694	break;
5695	default:
5696	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5697	"HPP: internal error\n", NULL, NULL);
5698	ctxt->instate = XML_PARSER_EOF;
5699	break;
5700	}
5701	}
5702	done:
5703	if ((avail == 0) && (terminate)) {
5704	htmlAutoCloseOnEnd(ctxt);
5705	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5706	/*
5707	* SAX: end of the document processing.
5708	*/
5709	ctxt->instate = XML_PARSER_EOF;
5710	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5711	ctxt->sax->endDocument(ctxt->userData);
5712	}
5713	}
5714	if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
5715	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
5716	(ctxt->instate == XML_PARSER_EPILOG))) {
5717	xmlDtdPtr dtd;
5718	dtd = xmlGetIntSubset(ctxt->myDoc);
5719	if (dtd == NULL) {
5720	ctxt->myDoc->intSubset =
5721	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5722	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5723	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5724	if (ctxt->myDoc->intSubset == NULL)
5725	htmlErrMemory(ctxt);
5726	}
5727	}
5728	return(ret);
5729	}
5730
5731	/**
5732	* htmlParseChunk:
5733	* @ctxt: an HTML parser context
5734	* @chunk: chunk of memory
5735	* @size: size of chunk in bytes
5736	* @terminate: last chunk indicator
5737	*
5738	* Parse a chunk of memory in push parser mode.
5739	*
5740	* Assumes that the parser context was initialized with
5741	* htmlCreatePushParserCtxt.
5742	*
5743	* The last chunk, which will often be empty, must be marked with
5744	* the @terminate flag. With the default SAX callbacks, the resulting
5745	* document will be available in ctxt->myDoc. This pointer will not
5746	* be freed by the library.
5747	*
5748	* If the document isn't well-formed, ctxt->myDoc is set to NULL.
5749	*
5750	* Returns an xmlParserErrors code (0 on success).
5751	*/
5752	int
5753	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5754	int terminate) {
5755	if ((ctxt == NULL) \|\| (ctxt->input == NULL))
5756	return(XML_ERR_ARGUMENT);
5757	if (PARSER_STOPPED(ctxt) != 0)
5758	return(ctxt->errNo);
5759	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5760	(ctxt->input->buf != NULL)) {
5761	size_t pos = ctxt->input->cur - ctxt->input->base;
5762	int res;
5763
5764	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5765	xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
5766	if (res < 0) {
5767	htmlParseErr(ctxt, ctxt->input->buf->error,
5768	"xmlParserInputBufferPush failed", NULL, NULL);
5769	xmlHaltParser(ctxt);
5770	return (ctxt->errNo);
5771	}
5772	}
5773	htmlParseTryOrFinish(ctxt, terminate);
5774	if (terminate) {
5775	if (ctxt->instate != XML_PARSER_EOF) {
5776	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5777	ctxt->sax->endDocument(ctxt->userData);
5778	}
5779	ctxt->instate = XML_PARSER_EOF;
5780	}
5781	return((xmlParserErrors) ctxt->errNo);
5782	}
5783
5784	/************************************************************************
5785	* *
5786	* User entry points *
5787	* *
5788	************************************************************************/
5789
5790	/**
5791	* htmlCreatePushParserCtxt:
5792	* @sax: a SAX handler (optional)
5793	* @user_data: The user data returned on SAX callbacks (optional)
5794	* @chunk: a pointer to an array of chars (optional)
5795	* @size: number of chars in the array
5796	* @filename: only used for error reporting (optional)
5797	* @enc: encoding (deprecated, pass XML_CHAR_ENCODING_NONE)
5798	*
5799	* Create a parser context for using the HTML parser in push mode.
5800	*
5801	* Returns the new parser context or NULL if a memory allocation
5802	* failed.
5803	*/
5804	htmlParserCtxtPtr
5805	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5806	const char chunk, int size, const char filename,
5807	xmlCharEncoding enc) {
5808	htmlParserCtxtPtr ctxt;
5809	htmlParserInputPtr input;
5810	const char *encoding;
5811
5812	ctxt = htmlNewSAXParserCtxt(sax, user_data);
5813	if (ctxt == NULL)
5814	return(NULL);
5815
5816	encoding = xmlGetCharEncodingName(enc);
5817	input = xmlNewInputPush(ctxt, filename, chunk, size, encoding);
5818	if (input == NULL) {
5819	htmlFreeParserCtxt(ctxt);
5820	return(NULL);
5821	}
5822	inputPush(ctxt, input);
5823
5824	return(ctxt);
5825	}
5826	#endif /* LIBXML_PUSH_ENABLED */
5827
5828	/**
5829	* htmlSAXParseDoc:
5830	* @cur: a pointer to an array of xmlChar
5831	* @encoding: a free form C string describing the HTML document encoding, or NULL
5832	* @sax: the SAX handler block
5833	* @userData: if using SAX, this pointer will be provided on callbacks.
5834	*
5835	* DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc.
5836	*
5837	* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5838	* to handle parse events. If sax is NULL, fallback to the default DOM
5839	* behavior and return a tree.
5840	*
5841	* Returns the resulting document tree unless SAX is NULL or the document is
5842	* not well formed.
5843	*/
5844
5845	htmlDocPtr
5846	htmlSAXParseDoc(const xmlChar cur, const char encoding,
5847	htmlSAXHandlerPtr sax, void *userData) {
5848	htmlDocPtr ret;
5849	htmlParserCtxtPtr ctxt;
5850
5851	if (cur == NULL)
5852	return(NULL);
5853
5854	ctxt = htmlCreateDocParserCtxt(cur, NULL, encoding);
5855	if (ctxt == NULL)
5856	return(NULL);
5857
5858	if (sax != NULL) {
5859	ctxt->sax = sax;
5860	ctxt->userData = userData;
5861	}
5862
5863	htmlParseDocument(ctxt);
5864	ret = ctxt->myDoc;
5865	htmlFreeParserCtxt(ctxt);
5866
5867	return(ret);
5868	}
5869
5870	/**
5871	* htmlParseDoc:
5872	* @cur: a pointer to an array of xmlChar
5873	* @encoding: the encoding (optional)
5874	*
5875	* DEPRECATED: Use htmlReadDoc.
5876	*
5877	* Parse an HTML in-memory document and build a tree.
5878	*
5879	* This function uses deprecated global parser options.
5880	*
5881	* Returns the resulting document tree
5882	*/
5883
5884	htmlDocPtr
5885	htmlParseDoc(const xmlChar cur, const char encoding) {
5886	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5887	}
5888
5889
5890	/**
5891	* htmlCreateFileParserCtxt:
5892	* @filename: the filename
5893	* @encoding: optional encoding
5894	*
5895	* DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadFile.
5896	*
5897	* Create a parser context to read from a file.
5898	*
5899	* A non-NULL encoding overrides encoding declarations in the document.
5900	*
5901	* Automatic support for ZLIB/Compress compressed document is provided
5902	* by default if found at compile-time.
5903	*
5904	* Returns the new parser context or NULL if a memory allocation failed.
5905	*/
5906	htmlParserCtxtPtr
5907	htmlCreateFileParserCtxt(const char filename, const char encoding)
5908	{
5909	htmlParserCtxtPtr ctxt;
5910	htmlParserInputPtr input;
5911
5912	if (filename == NULL)
5913	return(NULL);
5914
5915	ctxt = htmlNewParserCtxt();
5916	if (ctxt == NULL) {
5917	return(NULL);
5918	}
5919
5920	input = xmlNewInputURL(ctxt, filename, NULL, encoding, 0);
5921	if (input == NULL) {
5922	xmlFreeParserCtxt(ctxt);
5923	return(NULL);
5924	}
5925	inputPush(ctxt, input);
5926
5927	return(ctxt);
5928	}
5929
5930	/**
5931	* htmlSAXParseFile:
5932	* @filename: the filename
5933	* @encoding: encoding (optional)
5934	* @sax: the SAX handler block
5935	* @userData: if using SAX, this pointer will be provided on callbacks.
5936	*
5937	* DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile.
5938	*
5939	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5940	* compressed document is provided by default if found at compile-time.
5941	* It use the given SAX function block to handle the parsing callback.
5942	* If sax is NULL, fallback to the default DOM tree building routines.
5943	*
5944	* Returns the resulting document tree unless SAX is NULL or the document is
5945	* not well formed.
5946	*/
5947
5948	htmlDocPtr
5949	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
5950	void *userData) {
5951	htmlDocPtr ret;
5952	htmlParserCtxtPtr ctxt;
5953	htmlSAXHandlerPtr oldsax = NULL;
5954
5955	ctxt = htmlCreateFileParserCtxt(filename, encoding);
5956	if (ctxt == NULL) return(NULL);
5957	if (sax != NULL) {
5958	oldsax = ctxt->sax;
5959	ctxt->sax = sax;
5960	ctxt->userData = userData;
5961	}
5962
5963	htmlParseDocument(ctxt);
5964
5965	ret = ctxt->myDoc;
5966	if (sax != NULL) {
5967	ctxt->sax = oldsax;
5968	ctxt->userData = NULL;
5969	}
5970	htmlFreeParserCtxt(ctxt);
5971
5972	return(ret);
5973	}
5974
5975	/**
5976	* htmlParseFile:
5977	* @filename: the filename
5978	* @encoding: encoding (optional)
5979	*
5980	* Parse an HTML file and build a tree.
5981	*
5982	* See xmlNewInputURL for details.
5983	*
5984	* Returns the resulting document tree
5985	*/
5986
5987	htmlDocPtr
5988	htmlParseFile(const char filename, const char encoding) {
5989	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5990	}
5991
5992	/**
5993	* htmlHandleOmittedElem:
5994	* @val: int 0 or 1
5995	*
5996	* DEPRECATED: Use HTML_PARSE_NOIMPLIED
5997	*
5998	* Set and return the previous value for handling HTML omitted tags.
5999	*
6000	* Returns the last value for 0 for no handling, 1 for auto insertion.
6001	*/
6002
6003	int
6004	htmlHandleOmittedElem(int val) {
6005	int old = htmlOmittedDefaultValue;
6006
6007	htmlOmittedDefaultValue = val;
6008	return(old);
6009	}
6010
6011	/**
6012	* htmlElementAllowedHere:
6013	* @parent: HTML parent element
6014	* @elt: HTML element
6015	*
6016	* Checks whether an HTML element may be a direct child of a parent element.
6017	* Note - doesn't check for deprecated elements
6018	*
6019	* Returns 1 if allowed; 0 otherwise.
6020	*/
6021	int
6022	htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6023	const char** p ;
6024
6025	if ( ! elt \|\| ! parent \|\| ! parent->subelts )
6026	return 0 ;
6027
6028	for ( p = parent->subelts; *p; ++p )
6029	if ( !xmlStrcmp((const xmlChar )p, elt) )
6030	return 1 ;
6031
6032	return 0 ;
6033	}
6034	/**
6035	* htmlElementStatusHere:
6036	* @parent: HTML parent element
6037	* @elt: HTML element
6038	*
6039	* Checks whether an HTML element may be a direct child of a parent element.
6040	* and if so whether it is valid or deprecated.
6041	*
6042	* Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6043	*/
6044	htmlStatus
6045	htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6046	if ( ! parent \|\| ! elt )
6047	return HTML_INVALID ;
6048	if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6049	return HTML_INVALID ;
6050
6051	return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6052	}
6053	/**
6054	* htmlAttrAllowed:
6055	* @elt: HTML element
6056	* @attr: HTML attribute
6057	* @legacy: whether to allow deprecated attributes
6058	*
6059	* Checks whether an attribute is valid for an element
6060	* Has full knowledge of Required and Deprecated attributes
6061	*
6062	* Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6063	*/
6064	htmlStatus
6065	htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6066	const char** p ;
6067
6068	if ( !elt \|\| ! attr )
6069	return HTML_INVALID ;
6070
6071	if ( elt->attrs_req )
6072	for ( p = elt->attrs_req; *p; ++p)
6073	if ( !xmlStrcmp((const xmlChar)p, attr) )
6074	return HTML_REQUIRED ;
6075
6076	if ( elt->attrs_opt )
6077	for ( p = elt->attrs_opt; *p; ++p)
6078	if ( !xmlStrcmp((const xmlChar)p, attr) )
6079	return HTML_VALID ;
6080
6081	if ( legacy && elt->attrs_depr )
6082	for ( p = elt->attrs_depr; *p; ++p)
6083	if ( !xmlStrcmp((const xmlChar)p, attr) )
6084	return HTML_DEPRECATED ;
6085
6086	return HTML_INVALID ;
6087	}
6088	/**
6089	* htmlNodeStatus:
6090	* @node: an htmlNodePtr in a tree
6091	* @legacy: whether to allow deprecated elements (YES is faster here
6092	* for Element nodes)
6093	*
6094	* Checks whether the tree node is valid. Experimental (the author
6095	* only uses the HTML enhancements in a SAX parser)
6096	*
6097	* Return: for Element nodes, a return from htmlElementAllowedHere (if
6098	* legacy allowed) or htmlElementStatusHere (otherwise).
6099	* for Attribute nodes, a return from htmlAttrAllowed
6100	* for other nodes, HTML_NA (no checks performed)
6101	*/
6102	htmlStatus
6103	htmlNodeStatus(htmlNodePtr node, int legacy) {
6104	if ( ! node )
6105	return HTML_INVALID ;
6106
6107	switch ( node->type ) {
6108	case XML_ELEMENT_NODE:
6109	return legacy
6110	? ( htmlElementAllowedHere (
6111	htmlTagLookup(node->parent->name) , node->name
6112	) ? HTML_VALID : HTML_INVALID )
6113	: htmlElementStatusHere(
6114	htmlTagLookup(node->parent->name) ,
6115	htmlTagLookup(node->name) )
6116	;
6117	case XML_ATTRIBUTE_NODE:
6118	return htmlAttrAllowed(
6119	htmlTagLookup(node->parent->name) , node->name, legacy) ;
6120	default: return HTML_NA ;
6121	}
6122	}
6123	/************************************************************************
6124	* *
6125	* New set (2.6.0) of simpler and more flexible APIs *
6126	* *
6127	************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.
 *
 * Wrapped in do { } while (0) so that the macro expands to exactly one
 * statement and can be used safely as the body of an if/else. Invoke it
 * with a trailing semicolon.
 */
#define DICT_FREE(str)                                                  \
    do {                                                                \
        if ((str) && ((!dict) ||                                        \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))          \
            xmlFree((char *)(str));                                     \
    } while (0)
6139
6140	/**
6141	* htmlCtxtReset:
6142	* @ctxt: an HTML parser context
6143	*
6144	* Reset a parser context
6145	*/
6146	void
6147	htmlCtxtReset(htmlParserCtxtPtr ctxt)
6148	{
6149	xmlParserInputPtr input;
6150	xmlDictPtr dict;
6151
6152	if (ctxt == NULL)
6153	return;
6154
6155	dict = ctxt->dict;
6156
6157	while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6158	xmlFreeInputStream(input);
6159	}
6160	ctxt->inputNr = 0;
6161	ctxt->input = NULL;
6162
6163	ctxt->spaceNr = 0;
6164	if (ctxt->spaceTab != NULL) {
6165	ctxt->spaceTab[0] = -1;
6166	ctxt->space = &ctxt->spaceTab[0];
6167	} else {
6168	ctxt->space = NULL;
6169	}
6170
6171
6172	ctxt->nodeNr = 0;
6173	ctxt->node = NULL;
6174
6175	ctxt->nameNr = 0;
6176	ctxt->name = NULL;
6177
6178	ctxt->nsNr = 0;
6179
6180	DICT_FREE(ctxt->version);
6181	ctxt->version = NULL;
6182	DICT_FREE(ctxt->encoding);
6183	ctxt->encoding = NULL;
6184	DICT_FREE(ctxt->extSubURI);
6185	ctxt->extSubURI = NULL;
6186	DICT_FREE(ctxt->extSubSystem);
6187	ctxt->extSubSystem = NULL;
6188
6189	if (ctxt->directory != NULL) {
6190	xmlFree(ctxt->directory);
6191	ctxt->directory = NULL;
6192	}
6193
6194	if (ctxt->myDoc != NULL)
6195	xmlFreeDoc(ctxt->myDoc);
6196	ctxt->myDoc = NULL;
6197
6198	ctxt->standalone = -1;
6199	ctxt->hasExternalSubset = 0;
6200	ctxt->hasPErefs = 0;
6201	ctxt->html = 1;
6202	ctxt->instate = XML_PARSER_START;
6203
6204	ctxt->wellFormed = 1;
6205	ctxt->nsWellFormed = 1;
6206	ctxt->disableSAX = 0;
6207	ctxt->valid = 1;
6208	ctxt->vctxt.userData = ctxt;
6209	ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
6210	ctxt->vctxt.error = xmlParserValidityError;
6211	ctxt->vctxt.warning = xmlParserValidityWarning;
6212	ctxt->record_info = 0;
6213	ctxt->checkIndex = 0;
6214	ctxt->endCheckState = 0;
6215	ctxt->inSubset = 0;
6216	ctxt->errNo = XML_ERR_OK;
6217	ctxt->depth = 0;
6218	ctxt->catalogs = NULL;
6219	xmlInitNodeInfoSeq(&ctxt->node_seq);
6220
6221	if (ctxt->attsDefault != NULL) {
6222	xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6223	ctxt->attsDefault = NULL;
6224	}
6225	if (ctxt->attsSpecial != NULL) {
6226	xmlHashFree(ctxt->attsSpecial, NULL);
6227	ctxt->attsSpecial = NULL;
6228	}
6229
6230	ctxt->nbErrors = 0;
6231	ctxt->nbWarnings = 0;
6232	if (ctxt->lastError.code != XML_ERR_OK)
6233	xmlResetError(&ctxt->lastError);
6234	}
6235
6236	/**
6237	* htmlCtxtUseOptions:
6238	* @ctxt: an HTML parser context
6239	* @options: a combination of htmlParserOption(s)
6240	*
6241	* Applies the options to the parser context
6242	*
6243	* Returns 0 in case of success, the set of unknown or unimplemented options
6244	* in case of error.
6245	*/
6246	int
6247	htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6248	{
6249	if (ctxt == NULL)
6250	return(-1);
6251
6252	if (options & HTML_PARSE_NOWARNING) {
6253	ctxt->sax->warning = NULL;
6254	ctxt->vctxt.warning = NULL;
6255	options -= XML_PARSE_NOWARNING;
6256	ctxt->options \|= XML_PARSE_NOWARNING;
6257	}
6258	if (options & HTML_PARSE_NOERROR) {
6259	ctxt->sax->error = NULL;
6260	ctxt->vctxt.error = NULL;
6261	ctxt->sax->fatalError = NULL;
6262	options -= XML_PARSE_NOERROR;
6263	ctxt->options \|= XML_PARSE_NOERROR;
6264	}
6265	if (options & HTML_PARSE_PEDANTIC) {
6266	ctxt->pedantic = 1;
6267	options -= XML_PARSE_PEDANTIC;
6268	ctxt->options \|= XML_PARSE_PEDANTIC;
6269	} else
6270	ctxt->pedantic = 0;
6271	if (options & XML_PARSE_NOBLANKS) {
6272	ctxt->keepBlanks = 0;
6273	ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6274	options -= XML_PARSE_NOBLANKS;
6275	ctxt->options \|= XML_PARSE_NOBLANKS;
6276	} else
6277	ctxt->keepBlanks = 1;
6278	if (options & HTML_PARSE_RECOVER) {
6279	ctxt->recovery = 1;
6280	options -= HTML_PARSE_RECOVER;
6281	} else
6282	ctxt->recovery = 0;
6283	if (options & HTML_PARSE_COMPACT) {
6284	ctxt->options \|= HTML_PARSE_COMPACT;
6285	options -= HTML_PARSE_COMPACT;
6286	}
6287	if (options & XML_PARSE_HUGE) {
6288	ctxt->options \|= XML_PARSE_HUGE;
6289	options -= XML_PARSE_HUGE;
6290	}
6291	if (options & HTML_PARSE_NODEFDTD) {
6292	ctxt->options \|= HTML_PARSE_NODEFDTD;
6293	options -= HTML_PARSE_NODEFDTD;
6294	}
6295	if (options & HTML_PARSE_IGNORE_ENC) {
6296	ctxt->options \|= HTML_PARSE_IGNORE_ENC;
6297	options -= HTML_PARSE_IGNORE_ENC;
6298	}
6299	if (options & HTML_PARSE_NOIMPLIED) {
6300	ctxt->options \|= HTML_PARSE_NOIMPLIED;
6301	options -= HTML_PARSE_NOIMPLIED;
6302	}
6303	ctxt->dictNames = 0;
6304	ctxt->linenumbers = 1;
6305	return (options);
6306	}
6307
6308	/**
6309	* htmlCtxtParseDocument:
6310	* @ctxt: an HTML parser context
6311	* @input: parser input
6312	*
6313	* Parse an HTML document and return the resulting document tree.
6314	*
6315	* Available since 2.13.0.
6316	*
6317	* Returns the resulting document tree or NULL
6318	*/
6319	htmlDocPtr
6320	htmlCtxtParseDocument(htmlParserCtxtPtr ctxt, xmlParserInputPtr input)
6321	{
6322	htmlDocPtr ret;
6323
6324	if ((ctxt == NULL) \|\| (input == NULL))
6325	return(NULL);
6326
6327	/* assert(ctxt->inputNr == 0); */
6328	while (ctxt->inputNr > 0)
6329	xmlFreeInputStream(inputPop(ctxt));
6330
6331	if (inputPush(ctxt, input) < 0) {
6332	xmlFreeInputStream(input);
6333	return(NULL);
6334	}
6335
6336	ctxt->html = 1;
6337	htmlParseDocument(ctxt);
6338
6339	if (ctxt->errNo != XML_ERR_NO_MEMORY) {
6340	ret = ctxt->myDoc;
6341	} else {
6342	ret = NULL;
6343	xmlFreeDoc(ctxt->myDoc);
6344	}
6345	ctxt->myDoc = NULL;
6346
6347	/* assert(ctxt->inputNr == 1); */
6348	while (ctxt->inputNr > 0)
6349	xmlFreeInputStream(inputPop(ctxt));
6350
6351	return(ret);
6352	}
6353
6354	/**
6355	* htmlReadDoc:
6356	* @str: a pointer to a zero terminated string
6357	* @url: only used for error reporting (optoinal)
6358	* @encoding: the document encoding (optional)
6359	* @options: a combination of htmlParserOptions
6360	*
6361	* Convenience function to parse an HTML document from a zero-terminated
6362	* string.
6363	*
6364	* See htmlCtxtReadDoc for details.
6365	*
6366	* Returns the resulting document tree.
6367	*/
6368	htmlDocPtr
6369	htmlReadDoc(const xmlChar str, const char url, const char *encoding,
6370	int options)
6371	{
6372	htmlParserCtxtPtr ctxt;
6373	xmlParserInputPtr input;
6374	htmlDocPtr doc;
6375
6376	ctxt = htmlNewParserCtxt();
6377	if (ctxt == NULL)
6378	return(NULL);
6379
6380	htmlCtxtUseOptions(ctxt, options);
6381
6382	input = xmlNewInputString(ctxt, url, (const char *) str, encoding,
6383	XML_INPUT_BUF_STATIC);
6384
6385	doc = htmlCtxtParseDocument(ctxt, input);
6386
6387	htmlFreeParserCtxt(ctxt);
6388	return(doc);
6389	}
6390
6391	/**
6392	* htmlReadFile:
6393	* @filename: a file or URL
6394	* @encoding: the document encoding (optional)
6395	* @options: a combination of htmlParserOptions
6396	*
6397	* Convenience function to parse an HTML file from the filesystem,
6398	* the network or a global user-defined resource loader.
6399	*
6400	* See htmlCtxtReadFile for details.
6401	*
6402	* Returns the resulting document tree.
6403	*/
6404	htmlDocPtr
6405	htmlReadFile(const char filename, const char encoding, int options)
6406	{
6407	htmlParserCtxtPtr ctxt;
6408	xmlParserInputPtr input;
6409	htmlDocPtr doc;
6410
6411	ctxt = htmlNewParserCtxt();
6412	if (ctxt == NULL)
6413	return(NULL);
6414
6415	htmlCtxtUseOptions(ctxt, options);
6416
6417	input = xmlNewInputURL(ctxt, filename, NULL, encoding, 0);
6418
6419	doc = htmlCtxtParseDocument(ctxt, input);
6420
6421	htmlFreeParserCtxt(ctxt);
6422	return(doc);
6423	}
6424
6425	/**
6426	* htmlReadMemory:
6427	* @buffer: a pointer to a char array
6428	* @size: the size of the array
6429	* @url: only used for error reporting (optional)
6430	* @encoding: the document encoding, or NULL
6431	* @options: a combination of htmlParserOption(s)
6432	*
6433	* Convenience function to parse an HTML document from memory.
6434	* The input buffer must not contain any terminating null bytes.
6435	*
6436	* See htmlCtxtReadMemory for details.
6437	*
6438	* Returns the resulting document tree
6439	*/
6440	htmlDocPtr
6441	htmlReadMemory(const char buffer, int size, const char url,
6442	const char *encoding, int options)
6443	{
6444	htmlParserCtxtPtr ctxt;
6445	xmlParserInputPtr input;
6446	htmlDocPtr doc;
6447
6448	if (size < 0)
6449	return(NULL);
6450
6451	ctxt = htmlNewParserCtxt();
6452	if (ctxt == NULL)
6453	return(NULL);
6454
6455	htmlCtxtUseOptions(ctxt, options);
6456
6457	input = xmlNewInputMemory(ctxt, url, buffer, size, encoding,
6458	XML_INPUT_BUF_STATIC);
6459
6460	doc = htmlCtxtParseDocument(ctxt, input);
6461
6462	htmlFreeParserCtxt(ctxt);
6463	return(doc);
6464	}
6465
6466	/**
6467	* htmlReadFd:
6468	* @fd: an open file descriptor
6469	* @url: only used for error reporting (optional)
6470	* @encoding: the document encoding, or NULL
6471	* @options: a combination of htmlParserOptions
6472	*
6473	* Convenience function to parse an HTML document from a
6474	* file descriptor.
6475	*
6476	* NOTE that the file descriptor will not be closed when the
6477	* context is freed or reset.
6478	*
6479	* See htmlCtxtReadFd for details.
6480	*
6481	* Returns the resulting document tree
6482	*/
6483	htmlDocPtr
6484	htmlReadFd(int fd, const char url, const char encoding, int options)
6485	{
6486	htmlParserCtxtPtr ctxt;
6487	xmlParserInputPtr input;
6488	htmlDocPtr doc;
6489
6490	ctxt = htmlNewParserCtxt();
6491	if (ctxt == NULL)
6492	return(NULL);
6493
6494	htmlCtxtUseOptions(ctxt, options);
6495
6496	input = xmlNewInputFd(ctxt, url, fd, encoding, 0);
6497
6498	doc = htmlCtxtParseDocument(ctxt, input);
6499
6500	htmlFreeParserCtxt(ctxt);
6501	return(doc);
6502	}
6503
6504	/**
6505	* htmlReadIO:
6506	* @ioread: an I/O read function
6507	* @ioclose: an I/O close function (optional)
6508	* @ioctx: an I/O handler
6509	* @url: only used for error reporting (optional)
6510	* @encoding: the document encoding (optional)
6511	* @options: a combination of htmlParserOption(s)
6512	*
6513	* Convenience function to parse an HTML document from I/O functions
6514	* and context.
6515	*
6516	* See htmlCtxtReadIO for details.
6517	*
6518	* Returns the resulting document tree
6519	*/
6520	htmlDocPtr
6521	htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6522	void ioctx, const char url, const char *encoding, int options)
6523	{
6524	htmlParserCtxtPtr ctxt;
6525	xmlParserInputPtr input;
6526	htmlDocPtr doc;
6527
6528	ctxt = htmlNewParserCtxt();
6529	if (ctxt == NULL)
6530	return (NULL);
6531
6532	htmlCtxtUseOptions(ctxt, options);
6533
6534	input = xmlNewInputIO(ctxt, url, ioread, ioclose, ioctx, encoding, 0);
6535
6536	doc = htmlCtxtParseDocument(ctxt, input);
6537
6538	htmlFreeParserCtxt(ctxt);
6539	return(doc);
6540	}
6541
6542	/**
6543	* htmlCtxtReadDoc:
6544	* @ctxt: an HTML parser context
6545	* @str: a pointer to a zero terminated string
6546	* @URL: only used for error reporting (optional)
6547	* @encoding: the document encoding (optional)
6548	* @options: a combination of htmlParserOptions
6549	*
6550	* Parse an HTML in-memory document and build a tree.
6551	*
6552	* See htmlCtxtUseOptions for details.
6553	*
6554	* Returns the resulting document tree
6555	*/
6556	htmlDocPtr
6557	htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar *str,
6558	const char URL, const char encoding, int options)
6559	{
6560	xmlParserInputPtr input;
6561
6562	if (ctxt == NULL)
6563	return (NULL);
6564
6565	htmlCtxtReset(ctxt);
6566	htmlCtxtUseOptions(ctxt, options);
6567
6568	input = xmlNewInputString(ctxt, URL, (const char *) str, encoding, 0);
6569
6570	return(htmlCtxtParseDocument(ctxt, input));
6571	}
6572
6573	/**
6574	* htmlCtxtReadFile:
6575	* @ctxt: an HTML parser context
6576	* @filename: a file or URL
6577	* @encoding: the document encoding (optional)
6578	* @options: a combination of htmlParserOptions
6579	*
6580	* Parse an HTML file from the filesystem, the network or a
6581	* user-defined resource loader.
6582	*
6583	* See xmlNewInputURL and htmlCtxtUseOptions for details.
6584	*
6585	* Returns the resulting document tree
6586	*/
6587	htmlDocPtr
6588	htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6589	const char *encoding, int options)
6590	{
6591	xmlParserInputPtr input;
6592
6593	if (ctxt == NULL)
6594	return (NULL);
6595
6596	htmlCtxtReset(ctxt);
6597	htmlCtxtUseOptions(ctxt, options);
6598
6599	input = xmlNewInputURL(ctxt, filename, NULL, encoding, 0);
6600
6601	return(htmlCtxtParseDocument(ctxt, input));
6602	}
6603
6604	/**
6605	* htmlCtxtReadMemory:
6606	* @ctxt: an HTML parser context
6607	* @buffer: a pointer to a char array
6608	* @size: the size of the array
6609	* @URL: only used for error reporting (optional)
6610	* @encoding: the document encoding (optinal)
6611	* @options: a combination of htmlParserOptions
6612	*
6613	* Parse an HTML in-memory document and build a tree. The input buffer must
6614	* not contain any terminating null bytes.
6615	*
6616	* See htmlCtxtUseOptions for details.
6617	*
6618	* Returns the resulting document tree
6619	*/
6620	htmlDocPtr
6621	htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6622	const char URL, const char encoding, int options)
6623	{
6624	xmlParserInputPtr input;
6625
6626	if ((ctxt == NULL) \|\| (size < 0))
6627	return (NULL);
6628
6629	htmlCtxtReset(ctxt);
6630	htmlCtxtUseOptions(ctxt, options);
6631
6632	input = xmlNewInputMemory(ctxt, URL, buffer, size, encoding,
6633	XML_INPUT_BUF_STATIC);
6634
6635	return(htmlCtxtParseDocument(ctxt, input));
6636	}
6637
6638	/**
6639	* htmlCtxtReadFd:
6640	* @ctxt: an HTML parser context
6641	* @fd: an open file descriptor
6642	* @URL: only used for error reporting (optional)
6643	* @encoding: the document encoding (optinal)
6644	* @options: a combination of htmlParserOptions
6645	*
6646	* Parse an HTML from a file descriptor and build a tree.
6647	*
6648	* See htmlCtxtUseOptions for details.
6649	*
6650	* NOTE that the file descriptor will not be closed when the
6651	* context is freed or reset.
6652	*
6653	* Returns the resulting document tree
6654	*/
6655	htmlDocPtr
6656	htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6657	const char URL, const char encoding, int options)
6658	{
6659	xmlParserInputPtr input;
6660
6661	if (ctxt == NULL)
6662	return(NULL);
6663
6664	htmlCtxtReset(ctxt);
6665	htmlCtxtUseOptions(ctxt, options);
6666
6667	input = xmlNewInputFd(ctxt, URL, fd, encoding, 0);
6668
6669	return(htmlCtxtParseDocument(ctxt, input));
6670	}
6671
6672	/**
6673	* htmlCtxtReadIO:
6674	* @ctxt: an HTML parser context
6675	* @ioread: an I/O read function
6676	* @ioclose: an I/O close function
6677	* @ioctx: an I/O handler
6678	* @URL: the base URL to use for the document
6679	* @encoding: the document encoding, or NULL
6680	* @options: a combination of htmlParserOption(s)
6681	*
6682	* Parse an HTML document from I/O functions and source and build a tree.
6683	*
6684	* See xmlNewInputIO and htmlCtxtUseOptions for details.
6685	*
6686	* Returns the resulting document tree
6687	*/
6688	htmlDocPtr
6689	htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6690	xmlInputCloseCallback ioclose, void *ioctx,
6691	const char *URL,
6692	const char *encoding, int options)
6693	{
6694	xmlParserInputPtr input;
6695
6696	if (ctxt == NULL)
6697	return (NULL);
6698
6699	htmlCtxtReset(ctxt);
6700	htmlCtxtUseOptions(ctxt, options);
6701
6702	input = xmlNewInputIO(ctxt, URL, ioread, ioclose, ioctx, encoding, 0);
6703
6704	return(htmlCtxtParseDocument(ctxt, input));
6705	}
6706
6707	#endif /* LIBXML_HTML_ENABLED */

注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

source: vbox/trunk/src/libs/libxml2-2.13.2/HTMLparser.c@ 106165

以其他格式下載: