HTMLparser.c@ 66550

最後變更在這個檔案從66550是 65950,由 vboxsync 提交於 8 年前
libxml 2.9.4: fix export
屬性 svn:eol-style 設為 `native`
檔案大小: 204.8 KB

行
1	/*
2	* HTMLparser.c : an HTML 4.0 non-verifying parser
3	*
4	* See Copyright for the status of this software.
5	*
6	* [email protected]
7	*/
8
9	#define IN_LIBXML
10	#include "libxml.h"
11	#ifdef LIBXML_HTML_ENABLED
12
13	#include <string.h>
14	#ifdef HAVE_CTYPE_H
15	#include <ctype.h>
16	#endif
17	#ifdef HAVE_STDLIB_H
18	#include <stdlib.h>
19	#endif
20	#ifdef HAVE_SYS_STAT_H
21	#include <sys/stat.h>
22	#endif
23	#ifdef HAVE_FCNTL_H
24	#include <fcntl.h>
25	#endif
26	#ifdef HAVE_UNISTD_H
27	#include <unistd.h>
28	#endif
29	#ifdef HAVE_ZLIB_H
30	#include <zlib.h>
31	#endif
32
33	#include <libxml/xmlmemory.h>
34	#include <libxml/tree.h>
35	#include <libxml/parser.h>
36	#include <libxml/parserInternals.h>
37	#include <libxml/xmlerror.h>
38	#include <libxml/HTMLparser.h>
39	#include <libxml/HTMLtree.h>
40	#include <libxml/entities.h>
41	#include <libxml/encoding.h>
42	#include <libxml/valid.h>
43	#include <libxml/xmlIO.h>
44	#include <libxml/globals.h>
45	#include <libxml/uri.h>
46
47	#include "buf.h"
48	#include "enc.h"
49
50	#define HTML_MAX_NAMELEN 1000
51	#define HTML_PARSER_BIG_BUFFER_SIZE 1000
52	#define HTML_PARSER_BUFFER_SIZE 100
53
54	/* #define DEBUG */
55	/* #define DEBUG_PUSH */
56
57	static int htmlOmittedDefaultValue = 1;
58
59	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60	xmlChar end, xmlChar end2, xmlChar end3);
61	static void htmlParseComment(htmlParserCtxtPtr ctxt);
62
63	/************************************************************************
64	* *
65	* Some factorized error routines *
66	* *
67	************************************************************************/
68
69	/**
70	* htmlErrMemory:
71	* @ctxt: an HTML parser context
72	* @extra: extra informations
73	*
74	* Handle a redefinition of attribute error
75	*/
76	static void
77	htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78	{
79	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80	(ctxt->instate == XML_PARSER_EOF))
81	return;
82	if (ctxt != NULL) {
83	ctxt->errNo = XML_ERR_NO_MEMORY;
84	ctxt->instate = XML_PARSER_EOF;
85	ctxt->disableSAX = 1;
86	}
87	if (extra)
88	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
89	XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90	NULL, NULL, 0, 0,
91	"Memory allocation failed : %s\n", extra);
92	else
93	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
94	XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95	NULL, NULL, 0, 0, "Memory allocation failed\n");
96	}
97
98	/**
99	* htmlParseErr:
100	* @ctxt: an HTML parser context
101	* @error: the error number
102	* @msg: the error message
103	* @str1: string infor
104	* @str2: string infor
105	*
106	* Handle a fatal parser error, i.e. violating Well-Formedness constraints
107	*/
108	static void LIBXML_ATTR_FORMAT(3,0)
109	htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110	const char msg, const xmlChar str1, const xmlChar *str2)
111	{
112	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113	(ctxt->instate == XML_PARSER_EOF))
114	return;
115	if (ctxt != NULL)
116	ctxt->errNo = error;
117	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
118	XML_ERR_ERROR, NULL, 0,
119	(const char ) str1, (const char ) str2,
120	NULL, 0, 0,
121	msg, str1, str2);
122	if (ctxt != NULL)
123	ctxt->wellFormed = 0;
124	}
125
126	/**
127	* htmlParseErrInt:
128	* @ctxt: an HTML parser context
129	* @error: the error number
130	* @msg: the error message
131	* @val: integer info
132	*
133	* Handle a fatal parser error, i.e. violating Well-Formedness constraints
134	*/
135	static void LIBXML_ATTR_FORMAT(3,0)
136	htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137	const char *msg, int val)
138	{
139	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140	(ctxt->instate == XML_PARSER_EOF))
141	return;
142	if (ctxt != NULL)
143	ctxt->errNo = error;
144	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
145	XML_ERR_ERROR, NULL, 0, NULL, NULL,
146	NULL, val, 0, msg, val);
147	if (ctxt != NULL)
148	ctxt->wellFormed = 0;
149	}
150
151	/************************************************************************
152	* *
153	* Parser stacks related functions and macros *
154	* *
155	************************************************************************/
156
157	/**
158	* htmlnamePush:
159	* @ctxt: an HTML parser context
160	* @value: the element name
161	*
162	* Pushes a new element name on top of the name stack
163	*
164	* Returns 0 in case of error, the index in the stack otherwise
165	*/
166	static int
167	htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
168	{
169	if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170	ctxt->html = 3;
171	if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172	ctxt->html = 10;
173	if (ctxt->nameNr >= ctxt->nameMax) {
174	ctxt->nameMax *= 2;
175	ctxt->nameTab = (const xmlChar * *)
176	xmlRealloc((xmlChar * *)ctxt->nameTab,
177	ctxt->nameMax *
178	sizeof(ctxt->nameTab[0]));
179	if (ctxt->nameTab == NULL) {
180	htmlErrMemory(ctxt, NULL);
181	return (0);
182	}
183	}
184	ctxt->nameTab[ctxt->nameNr] = value;
185	ctxt->name = value;
186	return (ctxt->nameNr++);
187	}
188	/**
189	* htmlnamePop:
190	* @ctxt: an HTML parser context
191	*
192	* Pops the top element name from the name stack
193	*
194	* Returns the name just removed
195	*/
196	static const xmlChar *
197	htmlnamePop(htmlParserCtxtPtr ctxt)
198	{
199	const xmlChar *ret;
200
201	if (ctxt->nameNr <= 0)
202	return (NULL);
203	ctxt->nameNr--;
204	if (ctxt->nameNr < 0)
205	return (NULL);
206	if (ctxt->nameNr > 0)
207	ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208	else
209	ctxt->name = NULL;
210	ret = ctxt->nameTab[ctxt->nameNr];
211	ctxt->nameTab[ctxt->nameNr] = NULL;
212	return (ret);
213	}
214
215	/**
216	* htmlNodeInfoPush:
217	* @ctxt: an HTML parser context
218	* @value: the node info
219	*
220	* Pushes a new element name on top of the node info stack
221	*
222	* Returns 0 in case of error, the index in the stack otherwise
223	*/
224	static int
225	htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226	{
227	if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228	if (ctxt->nodeInfoMax == 0)
229	ctxt->nodeInfoMax = 5;
230	ctxt->nodeInfoMax *= 2;
231	ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232	xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233	ctxt->nodeInfoMax *
234	sizeof(ctxt->nodeInfoTab[0]));
235	if (ctxt->nodeInfoTab == NULL) {
236	htmlErrMemory(ctxt, NULL);
237	return (0);
238	}
239	}
240	ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241	ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242	return (ctxt->nodeInfoNr++);
243	}
244
245	/**
246	* htmlNodeInfoPop:
247	* @ctxt: an HTML parser context
248	*
249	* Pops the top element name from the node info stack
250	*
251	* Returns 0 in case of error, the pointer to NodeInfo otherwise
252	*/
253	static htmlParserNodeInfo *
254	htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255	{
256	if (ctxt->nodeInfoNr <= 0)
257	return (NULL);
258	ctxt->nodeInfoNr--;
259	if (ctxt->nodeInfoNr < 0)
260	return (NULL);
261	if (ctxt->nodeInfoNr > 0)
262	ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263	else
264	ctxt->nodeInfo = NULL;
265	return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266	}
267
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.  Every macro below expects a parser context variable
 * named `ctxt` to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to use
 * them:
 *
 *   CUR_PTR returns the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substrings.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare on ASCII based substrings.
 *   SKIP(n) skips n xmlChars, and must also be used only to skip ASCII
 *           defined strings without newlines within the parser (it advances
 *           the column counter but never the line counter).
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding:
 *
 *   CURRENT  returns the current char value, with the full decoding of
 *            UTF-8 if we are using this mode. It returns an int.
 *   NEXT     skips to the next character, this does the proper decoding
 *            in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   NEXTL(l) skips the current unicode character of l xmlChars long.
 *   COPY(to) copies one char to *to, incrementing CUR_PTR and to accordingly.
 *
 * SHRINK, GROW and COPY_BUF expand to a bare `if` statement: use them only
 * as a complete statement, never as the unbraced body of an if/else.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* -1 while a token is pending in ctxt->token, the current byte otherwise. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/* Line/column bookkeeping looks only at the first byte being skipped. */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v)
348
349	/**
350	* htmlFindEncoding:
351	* @the HTML parser context
352	*
353	* Ty to find and encoding in the current data available in the input
354	* buffer this is needed to try to switch to the proper encoding when
355	* one face a character error.
356	* That's an heuristic, since it's operating outside of parsing it could
357	* try to use a meta which had been commented out, that's the reason it
358	* should only be used in case of error, not as a default.
359	*
360	* Returns an encoding string or NULL if not found, the string need to
361	* be freed
362	*/
363	static xmlChar *
364	htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365	const xmlChar start, cur, *end;
366
367	if ((ctxt == NULL) \|\| (ctxt->input == NULL) \|\|
368	(ctxt->input->encoding != NULL) \|\| (ctxt->input->buf == NULL) \|\|
369	(ctxt->input->buf->encoder != NULL))
370	return(NULL);
371	if ((ctxt->input->cur == NULL) \|\| (ctxt->input->end == NULL))
372	return(NULL);
373
374	start = ctxt->input->cur;
375	end = ctxt->input->end;
376	/* we also expect the input buffer to be zero terminated */
377	if (*end != 0)
378	return(NULL);
379
380	cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
381	if (cur == NULL)
382	return(NULL);
383	cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
384	if (cur == NULL)
385	return(NULL);
386	cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
387	if (cur == NULL)
388	return(NULL);
389	cur += 8;
390	start = cur;
391	while (((cur >= 'A') && (cur <= 'Z')) \|\|
392	((cur >= 'a') && (cur <= 'z')) \|\|
393	((cur >= '0') && (cur <= '9')) \|\|
394	(cur == '-') \|\| (cur == '_') \|\| (cur == ':') \|\| (cur == '/'))
395	cur++;
396	if (cur == start)
397	return(NULL);
398	return(xmlStrndup(start, cur - start));
399	}
400
401	/**
402	* htmlCurrentChar:
403	* @ctxt: the HTML parser context
404	* @len: pointer to the length of the char read
405	*
406	* The current char value, if using UTF-8 this may actually span multiple
407	* bytes in the input buffer. Implement the end of line normalization:
408	* 2.11 End-of-Line Handling
409	* If the encoding is unspecified, in the case we find an ISO-Latin-1
410	* char, then the encoding converter is plugged in automatically.
411	*
412	* Returns the current char value and its length
413	*/
414
415	static int
416	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
417	if (ctxt->instate == XML_PARSER_EOF)
418	return(0);
419
420	if (ctxt->token != 0) {
421	*len = 0;
422	return(ctxt->token);
423	}
424	if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
425	/*
426	* We are supposed to handle UTF8, check it's valid
427	* From rfc2044: encoding of the Unicode values on UTF-8:
428	*
429	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
430	* 0000 0000-0000 007F 0xxxxxxx
431	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
432	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
433	*
434	* Check for the 0x110000 limit too
435	*/
436	const unsigned char *cur = ctxt->input->cur;
437	unsigned char c;
438	unsigned int val;
439
440	c = *cur;
441	if (c & 0x80) {
442	if (cur[1] == 0) {
443	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
444	cur = ctxt->input->cur;
445	}
446	if ((cur[1] & 0xc0) != 0x80)
447	goto encoding_error;
448	if ((c & 0xe0) == 0xe0) {
449
450	if (cur[2] == 0) {
451	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
452	cur = ctxt->input->cur;
453	}
454	if ((cur[2] & 0xc0) != 0x80)
455	goto encoding_error;
456	if ((c & 0xf0) == 0xf0) {
457	if (cur[3] == 0) {
458	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
459	cur = ctxt->input->cur;
460	}
461	if (((c & 0xf8) != 0xf0) \|\|
462	((cur[3] & 0xc0) != 0x80))
463	goto encoding_error;
464	/* 4-byte code */
465	*len = 4;
466	val = (cur[0] & 0x7) << 18;
467	val \|= (cur[1] & 0x3f) << 12;
468	val \|= (cur[2] & 0x3f) << 6;
469	val \|= cur[3] & 0x3f;
470	} else {
471	/* 3-byte code */
472	*len = 3;
473	val = (cur[0] & 0xf) << 12;
474	val \|= (cur[1] & 0x3f) << 6;
475	val \|= cur[2] & 0x3f;
476	}
477	} else {
478	/* 2-byte code */
479	*len = 2;
480	val = (cur[0] & 0x1f) << 6;
481	val \|= cur[1] & 0x3f;
482	}
483	if (!IS_CHAR(val)) {
484	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
485	"Char 0x%X out of allowed range\n", val);
486	}
487	return(val);
488	} else {
489	if ((*ctxt->input->cur == 0) &&
490	(ctxt->input->cur < ctxt->input->end)) {
491	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
492	"Char 0x%X out of allowed range\n", 0);
493	*len = 1;
494	return(' ');
495	}
496	/* 1-byte code */
497	*len = 1;
498	return((int) *ctxt->input->cur);
499	}
500	}
501	/*
502	* Assume it's a fixed length encoding (1) with
503	* a compatible encoding for the ASCII set, since
504	* XML constructs only use < 128 chars
505	*/
506	*len = 1;
507	if ((int) *ctxt->input->cur < 0x80)
508	return((int) *ctxt->input->cur);
509
510	/*
511	* Humm this is bad, do an automatic flow conversion
512	*/
513	{
514	xmlChar * guess;
515	xmlCharEncodingHandlerPtr handler;
516
517	guess = htmlFindEncoding(ctxt);
518	if (guess == NULL) {
519	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
520	} else {
521	if (ctxt->input->encoding != NULL)
522	xmlFree((xmlChar *) ctxt->input->encoding);
523	ctxt->input->encoding = guess;
524	handler = xmlFindCharEncodingHandler((const char *) guess);
525	if (handler != NULL) {
526	xmlSwitchToEncoding(ctxt, handler);
527	} else {
528	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
529	"Unsupported encoding %s", guess, NULL);
530	}
531	}
532	ctxt->charset = XML_CHAR_ENCODING_UTF8;
533	}
534
535	return(xmlCurrentChar(ctxt, len));
536
537	encoding_error:
538	/*
539	* If we detect an UTF8 error that probably mean that the
540	* input encoding didn't get properly advertized in the
541	* declaration header. Report the error and switch the encoding
542	* to ISO-Latin-1 (if you don't like this policy, just declare the
543	* encoding !)
544	*/
545	{
546	char buffer[150];
547
548	if (ctxt->input->end - ctxt->input->cur >= 4) {
549	snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
550	ctxt->input->cur[0], ctxt->input->cur[1],
551	ctxt->input->cur[2], ctxt->input->cur[3]);
552	} else {
553	snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
554	}
555	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
556	"Input is not proper UTF-8, indicate encoding !\n",
557	BAD_CAST buffer, NULL);
558	}
559
560	ctxt->charset = XML_CHAR_ENCODING_8859_1;
561	*len = 1;
562	return((int) *ctxt->input->cur);
563	}
564
565	/**
566	* htmlSkipBlankChars:
567	* @ctxt: the HTML parser context
568	*
569	* skip all blanks character found at that point in the input streams.
570	*
571	* Returns the number of space chars skipped
572	*/
573
574	static int
575	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
576	int res = 0;
577
578	while (IS_BLANK_CH(*(ctxt->input->cur))) {
579	if ((*ctxt->input->cur == 0) &&
580	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
581	xmlPopInput(ctxt);
582	} else {
583	if (*(ctxt->input->cur) == '\n') {
584	ctxt->input->line++; ctxt->input->col = 1;
585	} else ctxt->input->col++;
586	ctxt->input->cur++;
587	ctxt->nbChars++;
588	if (*ctxt->input->cur == 0)
589	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
590	}
591	res++;
592	}
593	return(res);
594	}
595
596
597
598	/************************************************************************
599	* *
600	* The list of HTML elements and their properties *
601	* *
602	************************************************************************/
603
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *	, subElements , impliedsubelt , Attributes, userdata
 */

/* Definitions and a couple of vars for HTML Elements */

/*
 * Each list macro expands to a comma separated run of string literals and
 * is paired with an NB_* macro giving the number of entries.  The NB_*
 * sums are parenthesised so they stay correct inside larger expressions.
 * PCDATA and MODIFIER deliberately expand to nothing.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL-terminated element name lists */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
650
651
/* ... and for HTML Attributes */

/*
 * Same scheme as for the element lists: each list macro expands to a run
 * of string literals and is paired with an NB_* entry count.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated attribute name lists */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
671
672
673	/* Other declarations that should go inline ... */
674	static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
675	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
676	"tabindex", "onfocus", "onblur", NULL } ;
677	static const char* const target_attr[] = { "target", NULL } ;
678	static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
679	static const char* const alt_attr[] = { "alt", NULL } ;
680	static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
681	static const char* const href_attrs[] = { "href", NULL } ;
682	static const char* const clear_attrs[] = { "clear", NULL } ;
683	static const char* const inline_p[] = { INLINE, "p", NULL } ;
684
685	static const char* const flow_param[] = { FLOW, "param", NULL } ;
686	static const char* const applet_attrs[] = { COREATTRS , "codebase",
687	"archive", "alt", "name", "height", "width", "align",
688	"hspace", "vspace", NULL } ;
689	static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
690	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
691	static const char* const basefont_attrs[] =
692	{ "id", "size", "color", "face", NULL } ;
693	static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
694	static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
695	static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
696	static const char* const body_depr[] = { "background", "bgcolor", "text",
697	"link", "vlink", "alink", NULL } ;
698	static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
699	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
700
701
702	static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
703	static const char* const col_elt[] = { "col", NULL } ;
704	static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
705	static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
706	static const char* const dl_contents[] = { "dt", "dd", NULL } ;
707	static const char* const compact_attr[] = { "compact", NULL } ;
708	static const char* const label_attr[] = { "label", NULL } ;
709	static const char* const fieldset_contents[] = { FLOW, "legend" } ;
710	static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
711	static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
712	static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
713	static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
714	static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
715	static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
716	static const char* const head_attrs[] = { I18N, "profile", NULL } ;
717	static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
718	static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
719	static const char* const version_attr[] = { "version", NULL } ;
720	static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
721	static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
722	static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
723	static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
724	static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
725	static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
726	static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
727	static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
728	static const char* const align_attr[] = { "align", NULL } ;
729	static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
730	static const char* const map_contents[] = { BLOCK, "area", NULL } ;
731	static const char* const name_attr[] = { "name", NULL } ;
732	static const char* const action_attr[] = { "action", NULL } ;
733	static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
734	static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
735	static const char* const content_attr[] = { "content", NULL } ;
736	static const char* const type_attr[] = { "type", NULL } ;
737	static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
738	static const char* const object_contents[] = { FLOW, "param", NULL } ;
739	static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
740	static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
741	static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
742	static const char* const option_elt[] = { "option", NULL } ;
743	static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
744	static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
745	static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
746	static const char* const width_attr[] = { "width", NULL } ;
747	static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
748	static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
749	static const char* const language_attr[] = { "language", NULL } ;
750	static const char* const select_content[] = { "optgroup", "option", NULL } ;
751	static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
752	static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
753	static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
754	static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
755	static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
756	static const char* const tr_elt[] = { "tr", NULL } ;
757	static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
758	static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
759	static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
760	static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
761	static const char* const tr_contents[] = { "th", "td", NULL } ;
762	static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
763	static const char* const li_elt[] = { "li", NULL } ;
764	static const char* const ul_depr[] = { "type", "compact", NULL} ;
765	static const char* const dir_attr[] = { "dir", NULL} ;
766
767	#define DECL (const char**)
768
769	static const htmlElemDesc
770	html40ElementTable[] = {
771	{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
772	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
773	},
774	{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
775	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
776	},
777	{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
778	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
779	},
780	{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
781	DECL inline_p , NULL , DECL html_attrs, NULL, NULL
782	},
783	{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
784	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
785	},
786	{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
787	EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
788	},
789	{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
790	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
791	},
792	{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
793	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
794	},
795	{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
796	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
797	},
798	{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
799	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
800	},
801	{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
802	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
803	},
804	{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
805	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
806	},
807	{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
808	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
809	},
810	{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
811	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
812	},
813	{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
814	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
815	},
816	{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
817	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
818	},
819	{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
820	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
821	},
822	{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
823	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
824	},
825	{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
826	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
827	},
828	{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
829	EMPTY , NULL , DECL col_attrs , NULL, NULL
830	},
831	{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
832	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
833	},
834	{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
835	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
836	},
837	{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
838	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
839	},
840	{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
841	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
842	},
843	{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
844	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
845	},
846	{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
847	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
848	},
849	{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
850	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
851	},
852	{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
853	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
854	},
855	{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
856	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
857	},
858	{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
859	EMPTY, NULL, DECL embed_attrs, NULL, NULL
860	},
861	{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
862	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
863	},
864	{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
865	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
866	},
867	{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
868	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
869	},
870	{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
871	EMPTY, NULL, NULL, DECL frame_attrs, NULL
872	},
873	{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
874	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
875	},
876	{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
877	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
878	},
879	{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
880	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
881	},
882	{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
883	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
884	},
885	{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
886	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
887	},
888	{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
889	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
890	},
891	{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
892	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
893	},
894	{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
895	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
896	},
897	{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
898	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
899	},
900	{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
901	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
902	},
903	{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
904	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
905	},
906	{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
907	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
908	},
909	{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
910	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
911	},
912	{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
913	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
914	},
915	{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
916	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
917	},
918	{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
919	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
920	},
921	{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
922	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
923	},
924	{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
925	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
926	},
927	{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
928	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
929	},
930	{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
931	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
932	},
933	{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
934	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
935	},
936	{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
937	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
938	},
939	{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
940	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
941	},
942	{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
943	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
944	},
945	{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
946	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
947	},
948	{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
949	DECL html_flow, "div", DECL html_attrs, NULL, NULL
950	},
951	{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
952	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
953	},
954	{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
955	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
956	},
957	{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
958	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
959	},
960	{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
961	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
962	},
963	{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
964	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
965	},
966	{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
967	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
968	},
969	{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
970	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
971	},
972	{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
973	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
974	},
975	{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
976	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
977	},
978	{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
979	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
980	},
981	{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
982	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
983	},
984	{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
985	DECL select_content, NULL, DECL select_attrs, NULL, NULL
986	},
987	{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
988	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
989	},
990	{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
991	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
992	},
993	{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
994	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
995	},
996	{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
997	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
998	},
999	{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
1000	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1001	},
1002	{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
1003	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1004	},
1005	{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1006	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1007	},
1008	{ "table", 0, 0, 0, 0, 0, 0, 0, "",
1009	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1010	},
1011	{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1012	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1013	},
1014	{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1015	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1016	},
1017	{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1018	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1019	},
1020	{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1021	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1022	},
1023	{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1024	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1025	},
1026	{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1027	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1028	},
1029	{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1030	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1031	},
1032	{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1033	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1034	},
1035	{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1036	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1037	},
1038	{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1039	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1040	},
1041	{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1042	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1043	},
1044	{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1045	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1046	}
1047	};
1048
1049	/*
1050	* start tags that imply the end of current element
1051	*/
1052	static const char * const htmlStartClose[] = {
1053	"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
1054	"dl", "ul", "ol", "menu", "dir", "address", "pre",
1055	"listing", "xmp", "head", NULL,
1056	"head", "p", NULL,
1057	"title", "p", NULL,
1058	"body", "head", "style", "link", "title", "p", NULL,
1059	"frameset", "head", "style", "link", "title", "p", NULL,
1060	"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1061	"pre", "listing", "xmp", "head", "li", NULL,
1062	"hr", "p", "head", NULL,
1063	"h1", "p", "head", NULL,
1064	"h2", "p", "head", NULL,
1065	"h3", "p", "head", NULL,
1066	"h4", "p", "head", NULL,
1067	"h5", "p", "head", NULL,
1068	"h6", "p", "head", NULL,
1069	"dir", "p", "head", NULL,
1070	"address", "p", "head", "ul", NULL,
1071	"pre", "p", "head", "ul", NULL,
1072	"listing", "p", "head", NULL,
1073	"xmp", "p", "head", NULL,
1074	"blockquote", "p", "head", NULL,
1075	"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1076	"xmp", "head", NULL,
1077	"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1078	"head", "dd", NULL,
1079	"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1080	"head", "dt", NULL,
1081	"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1082	"listing", "xmp", NULL,
1083	"ol", "p", "head", "ul", NULL,
1084	"menu", "p", "head", "ul", NULL,
1085	"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
1086	"div", "p", "head", NULL,
1087	"noscript", "p", NULL,
1088	"center", "font", "b", "i", "p", "head", NULL,
1089	"a", "a", "head", NULL,
1090	"caption", "p", NULL,
1091	"colgroup", "caption", "colgroup", "col", "p", NULL,
1092	"col", "caption", "col", "p", NULL,
1093	"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1094	"listing", "xmp", "a", NULL,
1095	"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1096	"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1097	"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1098	"thead", "caption", "col", "colgroup", NULL,
1099	"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1100	"tbody", "p", NULL,
1101	"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1102	"tfoot", "tbody", "p", NULL,
1103	"optgroup", "option", NULL,
1104	"option", "option", NULL,
1105	"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1106	"pre", "listing", "xmp", "a", NULL,
1107	/* most tags in in FONTSTYLE, PHRASE and SPECIAL should close <head> */
1108	"tt", "head", NULL,
1109	"i", "head", NULL,
1110	"b", "head", NULL,
1111	"u", "head", NULL,
1112	"s", "head", NULL,
1113	"strike", "head", NULL,
1114	"big", "head", NULL,
1115	"small", "head", NULL,
1116
1117	"em", "head", NULL,
1118	"strong", "head", NULL,
1119	"dfn", "head", NULL,
1120	"code", "head", NULL,
1121	"samp", "head", NULL,
1122	"kbd", "head", NULL,
1123	"var", "head", NULL,
1124	"cite", "head", NULL,
1125	"abbr", "head", NULL,
1126	"acronym", "head", NULL,
1127
1128	/* "a" */
1129	"img", "head", NULL,
1130	/* "applet" */
1131	/* "embed" */
1132	/* "object" */
1133	"font", "head", NULL,
1134	/* "basefont" */
1135	"br", "head", NULL,
1136	/* "script" */
1137	"map", "head", NULL,
1138	"q", "head", NULL,
1139	"sub", "head", NULL,
1140	"sup", "head", NULL,
1141	"span", "head", NULL,
1142	"bdo", "head", NULL,
1143	"iframe", "head", NULL,
1144	NULL
1145	};
1146
/*
 * NULL-terminated list of HTML elements which are supposed not to have
 * CDATA content; when character data shows up directly inside one of
 * them a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html", "head",
    NULL
};
1159
/*
 * The list of HTML attributes which are of content %Script;
 *
 * NOTE: when adding new names, check htmlIsScriptAttribute(): it rejects
 *       any name that does not start with "on" before consulting this
 *       table.  The table has no terminator; its size is taken with
 *       sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse */
    "onclick", "ondblclick",
    "onmousedown", "onmouseup", "onmouseover", "onmousemove", "onmouseout",
    /* keyboard */
    "onkeypress", "onkeydown", "onkeyup",
    /* document */
    "onload", "onunload",
    /* focus and forms */
    "onfocus", "onblur", "onsubmit", "onreset", "onchange", "onselect"
};
1185
/*
 * This table is used by the HTML parser to know what to do with
 * broken HTML pages.  Each element gets a priority, and the parser uses
 * those priorities to decide how to handle extra end tags: an end tag
 * is only allowed to close elements with lower or equal priority.
 *
 * The entry whose name is NULL terminates the table and carries the
 * priority used for every element that is not listed.
 */

typedef struct {
    const char *name;       /* element name, NULL in the final entry */
    int priority;           /* end tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }        /* Default priority */
};
1213
/*
 * Lookup index over htmlStartClose, filled in by htmlInitAutoClose():
 * each used slot points at the first string of one group, and unused
 * slots are NULL.
 */
static const char **htmlStartCloseIndex[100];
/* set to 1 once htmlInitAutoClose() has filled the index */
static int htmlStartCloseIndexinitialized = 0;
1216
1217	/************************************************************************
1218	* *
1219	* functions to handle HTML specific data *
1220	* *
1221	************************************************************************/
1222
1223	/**
1224	* htmlInitAutoClose:
1225	*
1226	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1227	* This is not reentrant. Call xmlInitParser() once before processing in
1228	* case of use in multithreaded programs.
1229	*/
1230	void
1231	htmlInitAutoClose(void) {
1232	int indx, i = 0;
1233
1234	if (htmlStartCloseIndexinitialized) return;
1235
1236	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1237	indx = 0;
1238	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1239	htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1240	while (htmlStartClose[i] != NULL) i++;
1241	i++;
1242	}
1243	htmlStartCloseIndexinitialized = 1;
1244	}
1245
1246	/**
1247	* htmlTagLookup:
1248	* @tag: The tag name in lowercase
1249	*
1250	* Lookup the HTML tag in the ElementTable
1251	*
1252	* Returns the related htmlElemDescPtr or NULL if not found.
1253	*/
1254	const htmlElemDesc *
1255	htmlTagLookup(const xmlChar *tag) {
1256	unsigned int i;
1257
1258	for (i = 0; i < (sizeof(html40ElementTable) /
1259	sizeof(html40ElementTable[0]));i++) {
1260	if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1261	return((htmlElemDescPtr) &html40ElementTable[i]);
1262	}
1263	return(NULL);
1264	}
1265
1266	/**
1267	* htmlGetEndPriority:
1268	* @name: The name of the element to look up the priority for.
1269	*
1270	* Return value: The "endtag" priority.
1271	**/
1272	static int
1273	htmlGetEndPriority (const xmlChar *name) {
1274	int i = 0;
1275
1276	while ((htmlEndPriority[i].name != NULL) &&
1277	(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1278	i++;
1279
1280	return(htmlEndPriority[i].priority);
1281	}
1282
1283
1284	/**
1285	* htmlCheckAutoClose:
1286	* @newtag: The new tag name
1287	* @oldtag: The old tag name
1288	*
1289	* Checks whether the new tag is one of the registered valid tags for
1290	* closing old.
1291	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1292	*
1293	* Returns 0 if no, 1 if yes.
1294	*/
1295	static int
1296	htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1297	{
1298	int i, indx;
1299	const char **closed = NULL;
1300
1301	if (htmlStartCloseIndexinitialized == 0)
1302	htmlInitAutoClose();
1303
1304	/* inefficient, but not a big deal */
1305	for (indx = 0; indx < 100; indx++) {
1306	closed = htmlStartCloseIndex[indx];
1307	if (closed == NULL)
1308	return (0);
1309	if (xmlStrEqual(BAD_CAST * closed, newtag))
1310	break;
1311	}
1312
1313	i = closed - htmlStartClose;
1314	i++;
1315	while (htmlStartClose[i] != NULL) {
1316	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1317	return (1);
1318	}
1319	i++;
1320	}
1321	return (0);
1322	}
1323
1324	/**
1325	* htmlAutoCloseOnClose:
1326	* @ctxt: an HTML parser context
1327	* @newtag: The new tag name
1328	* @force: force the tag closure
1329	*
1330	* The HTML DTD allows an ending tag to implicitly close other tags.
1331	*/
1332	static void
1333	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1334	{
1335	const htmlElemDesc *info;
1336	int i, priority;
1337
1338	priority = htmlGetEndPriority(newtag);
1339
1340	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1341
1342	if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1343	break;
1344	/*
1345	* A missplaced endtag can only close elements with lower
1346	* or equal priority, so if we find an element with higher
1347	* priority before we find an element with
1348	* matching name, we just ignore this endtag
1349	*/
1350	if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1351	return;
1352	}
1353	if (i < 0)
1354	return;
1355
1356	while (!xmlStrEqual(newtag, ctxt->name)) {
1357	info = htmlTagLookup(ctxt->name);
1358	if ((info != NULL) && (info->endTag == 3)) {
1359	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1360	"Opening and ending tag mismatch: %s and %s\n",
1361	newtag, ctxt->name);
1362	}
1363	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1364	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1365	htmlnamePop(ctxt);
1366	}
1367	}
1368
1369	/**
1370	* htmlAutoCloseOnEnd:
1371	* @ctxt: an HTML parser context
1372	*
1373	* Close all remaining tags at the end of the stream
1374	*/
1375	static void
1376	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1377	{
1378	int i;
1379
1380	if (ctxt->nameNr == 0)
1381	return;
1382	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1383	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1384	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1385	htmlnamePop(ctxt);
1386	}
1387	}
1388
1389	/**
1390	* htmlAutoClose:
1391	* @ctxt: an HTML parser context
1392	* @newtag: The new tag name or NULL
1393	*
1394	* The HTML DTD allows a tag to implicitly close other tags.
1395	* The list is kept in htmlStartClose array. This function is
1396	* called when a new tag has been detected and generates the
1397	* appropriates closes if possible/needed.
1398	* If newtag is NULL this mean we are at the end of the resource
1399	* and we should check
1400	*/
1401	static void
1402	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1403	{
1404	while ((newtag != NULL) && (ctxt->name != NULL) &&
1405	(htmlCheckAutoClose(newtag, ctxt->name))) {
1406	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1407	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1408	htmlnamePop(ctxt);
1409	}
1410	if (newtag == NULL) {
1411	htmlAutoCloseOnEnd(ctxt);
1412	return;
1413	}
1414	while ((newtag == NULL) && (ctxt->name != NULL) &&
1415	((xmlStrEqual(ctxt->name, BAD_CAST "head")) \|\|
1416	(xmlStrEqual(ctxt->name, BAD_CAST "body")) \|\|
1417	(xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1418	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1419	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1420	htmlnamePop(ctxt);
1421	}
1422	}
1423
1424	/**
1425	* htmlAutoCloseTag:
1426	* @doc: the HTML document
1427	* @name: The tag name
1428	* @elem: the HTML element
1429	*
1430	* The HTML DTD allows a tag to implicitly close other tags.
1431	* The list is kept in htmlStartClose array. This function checks
1432	* if the element or one of it's children would autoclose the
1433	* given tag.
1434	*
1435	* Returns 1 if autoclose, 0 otherwise
1436	*/
1437	int
1438	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1439	htmlNodePtr child;
1440
1441	if (elem == NULL) return(1);
1442	if (xmlStrEqual(name, elem->name)) return(0);
1443	if (htmlCheckAutoClose(elem->name, name)) return(1);
1444	child = elem->children;
1445	while (child != NULL) {
1446	if (htmlAutoCloseTag(doc, name, child)) return(1);
1447	child = child->next;
1448	}
1449	return(0);
1450	}
1451
1452	/**
1453	* htmlIsAutoClosed:
1454	* @doc: the HTML document
1455	* @elem: the HTML element
1456	*
1457	* The HTML DTD allows a tag to implicitly close other tags.
1458	* The list is kept in htmlStartClose array. This function checks
1459	* if a tag is autoclosed by one of it's child
1460	*
1461	* Returns 1 if autoclosed, 0 otherwise
1462	*/
1463	int
1464	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1465	htmlNodePtr child;
1466
1467	if (elem == NULL) return(1);
1468	child = elem->children;
1469	while (child != NULL) {
1470	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1471	child = child->next;
1472	}
1473	return(0);
1474	}
1475
1476	/**
1477	* htmlCheckImplied:
1478	* @ctxt: an HTML parser context
1479	* @newtag: The new tag name
1480	*
1481	* The HTML DTD allows a tag to exists only implicitly
1482	* called when a new tag has been detected and generates the
1483	* appropriates implicit tags if missing
1484	*/
1485	static void
1486	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1487	int i;
1488
1489	if (ctxt->options & HTML_PARSE_NOIMPLIED)
1490	return;
1491	if (!htmlOmittedDefaultValue)
1492	return;
1493	if (xmlStrEqual(newtag, BAD_CAST"html"))
1494	return;
1495	if (ctxt->nameNr <= 0) {
1496	htmlnamePush(ctxt, BAD_CAST"html");
1497	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1498	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1499	}
1500	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
1501	return;
1502	if ((ctxt->nameNr <= 1) &&
1503	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
1504	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
1505	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
1506	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
1507	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
1508	(xmlStrEqual(newtag, BAD_CAST"base")))) {
1509	if (ctxt->html >= 3) {
1510	/* we already saw or generated an <head> before */
1511	return;
1512	}
1513	/*
1514	* dropped OBJECT ... i you put it first BODY will be
1515	* assumed !
1516	*/
1517	htmlnamePush(ctxt, BAD_CAST"head");
1518	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1519	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1520	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1521	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1522	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1523	if (ctxt->html >= 10) {
1524	/* we already saw or generated a <body> before */
1525	return;
1526	}
1527	for (i = 0;i < ctxt->nameNr;i++) {
1528	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1529	return;
1530	}
1531	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1532	return;
1533	}
1534	}
1535
1536	htmlnamePush(ctxt, BAD_CAST"body");
1537	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1538	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1539	}
1540	}
1541
1542	/**
1543	* htmlCheckParagraph
1544	* @ctxt: an HTML parser context
1545	*
1546	* Check whether a p element need to be implied before inserting
1547	* characters in the current element.
1548	*
1549	* Returns 1 if a paragraph has been inserted, 0 if not and -1
1550	* in case of error.
1551	*/
1552
1553	static int
1554	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1555	const xmlChar *tag;
1556	int i;
1557
1558	if (ctxt == NULL)
1559	return(-1);
1560	tag = ctxt->name;
1561	if (tag == NULL) {
1562	htmlAutoClose(ctxt, BAD_CAST"p");
1563	htmlCheckImplied(ctxt, BAD_CAST"p");
1564	htmlnamePush(ctxt, BAD_CAST"p");
1565	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1566	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1567	return(1);
1568	}
1569	if (!htmlOmittedDefaultValue)
1570	return(0);
1571	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1572	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1573	htmlAutoClose(ctxt, BAD_CAST"p");
1574	htmlCheckImplied(ctxt, BAD_CAST"p");
1575	htmlnamePush(ctxt, BAD_CAST"p");
1576	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1577	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1578	return(1);
1579	}
1580	}
1581	return(0);
1582	}
1583
1584	/**
1585	* htmlIsScriptAttribute:
1586	* @name: an attribute name
1587	*
1588	* Check if an attribute is of content type Script
1589	*
1590	* Returns 1 is the attribute is a script 0 otherwise
1591	*/
1592	int
1593	htmlIsScriptAttribute(const xmlChar *name) {
1594	unsigned int i;
1595
1596	if (name == NULL)
1597	return(0);
1598	/*
1599	* all script attributes start with 'on'
1600	*/
1601	if ((name[0] != 'o') \|\| (name[1] != 'n'))
1602	return(0);
1603	for (i = 0;
1604	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1605	i++) {
1606	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1607	return(1);
1608	}
1609	return(0);
1610	}
1611
1612	/************************************************************************
1613	* *
1614	* The list of HTML predefined entities *
1615	* *
1616	************************************************************************/
1617
1618
1619	static const htmlEntityDesc html40EntitiesTable[] = {
1620	/*
1621	* the 4 absolute ones, plus apostrophe.
1622	*/
1623	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1624	{ 38, "amp", "ampersand, U+0026 ISOnum" },
1625	{ 39, "apos", "single quote" },
1626	{ 60, "lt", "less-than sign, U+003C ISOnum" },
1627	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1628
1629	/*
1630	* A bunch still in the 128-255 range
1631	* Replacing them depend really on the charset used.
1632	*/
1633	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1634	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1635	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1636	{ 163, "pound","pound sign, U+00A3 ISOnum" },
1637	{ 164, "curren","currency sign, U+00A4 ISOnum" },
1638	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1639	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1640	{ 167, "sect", "section sign, U+00A7 ISOnum" },
1641	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1642	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1643	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1644	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1645	{ 172, "not", "not sign, U+00AC ISOnum" },
1646	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1647	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1648	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1649	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1650	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1651	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1652	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1653	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1654	{ 181, "micro","micro sign, U+00B5 ISOnum" },
1655	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1656	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1657	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1658	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1659	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1660	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1661	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1662	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1663	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1664	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1665	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1666	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1667	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1668	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1669	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1670	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1671	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1672	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1673	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1674	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1675	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1676	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1677	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1678	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1679	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1680	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1681	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1682	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1683	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1684	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1685	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1686	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1687	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1688	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1689	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1690	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1691	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1692	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1693	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1694	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1695	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1696	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1697	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1698	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1699	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1700	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1701	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1702	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1703	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1704	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1705	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1706	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1707	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1708	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1709	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1710	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1711	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1712	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1713	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1714	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1715	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1716	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1717	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1718	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1719	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1720	{ 247, "divide","division sign, U+00F7 ISOnum" },
1721	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1722	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1723	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1724	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1725	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1726	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1727	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1728	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1729
1730	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1731	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1732	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1733	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1734	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1735
1736	/*
1737	* Anything below should really be kept as entities references
1738	*/
1739	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1740
1741	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1742	{ 732, "tilde","small tilde, U+02DC ISOdia" },
1743
1744	{ 913, "Alpha","greek capital letter alpha, U+0391" },
1745	{ 914, "Beta", "greek capital letter beta, U+0392" },
1746	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1747	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1748	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1749	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1750	{ 919, "Eta", "greek capital letter eta, U+0397" },
1751	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1752	{ 921, "Iota", "greek capital letter iota, U+0399" },
1753	{ 922, "Kappa","greek capital letter kappa, U+039A" },
1754	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1755	{ 924, "Mu", "greek capital letter mu, U+039C" },
1756	{ 925, "Nu", "greek capital letter nu, U+039D" },
1757	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1758	{ 927, "Omicron","greek capital letter omicron, U+039F" },
1759	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1760	{ 929, "Rho", "greek capital letter rho, U+03A1" },
1761	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1762	{ 932, "Tau", "greek capital letter tau, U+03A4" },
1763	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1764	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1765	{ 935, "Chi", "greek capital letter chi, U+03A7" },
1766	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1767	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1768
1769	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1770	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1771	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1772	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1773	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1774	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1775	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1776	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1777	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1778	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1779	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1780	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1781	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1782	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1783	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1784	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1785	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1786	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1787	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1788	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1789	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1790	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1791	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1792	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1793	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1794	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1795	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1796	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1797
1798	{ 8194, "ensp", "en space, U+2002 ISOpub" },
1799	{ 8195, "emsp", "em space, U+2003 ISOpub" },
1800	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1801	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1802	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1803	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1804	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1805	{ 8211, "ndash","en dash, U+2013 ISOpub" },
1806	{ 8212, "mdash","em dash, U+2014 ISOpub" },
1807	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1808	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1809	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1810	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1811	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1812	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1813	{ 8224, "dagger","dagger, U+2020 ISOpub" },
1814	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1815
1816	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1817	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1818
1819	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1820
1821	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1822	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1823
1824	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1825	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1826
1827	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1828	{ 8260, "frasl","fraction slash, U+2044 NEW" },
1829
1830	{ 8364, "euro", "euro sign, U+20AC NEW" },
1831
1832	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1833	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1834	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1835	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1836	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1837	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1838	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1839	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1840	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1841	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1842	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1843	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1844	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1845	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1846	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1847	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1848
1849	{ 8704, "forall","for all, U+2200 ISOtech" },
1850	{ 8706, "part", "partial differential, U+2202 ISOtech" },
1851	{ 8707, "exist","there exists, U+2203 ISOtech" },
1852	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1853	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1854	{ 8712, "isin", "element of, U+2208 ISOtech" },
1855	{ 8713, "notin","not an element of, U+2209 ISOtech" },
1856	{ 8715, "ni", "contains as member, U+220B ISOtech" },
1857	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1858	{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1859	{ 8722, "minus","minus sign, U+2212 ISOtech" },
1860	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1861	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1862	{ 8733, "prop", "proportional to, U+221D ISOtech" },
1863	{ 8734, "infin","infinity, U+221E ISOtech" },
1864	{ 8736, "ang", "angle, U+2220 ISOamso" },
1865	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1866	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1867	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1868	{ 8746, "cup", "union = cup, U+222A ISOtech" },
1869	{ 8747, "int", "integral, U+222B ISOtech" },
1870	{ 8756, "there4","therefore, U+2234 ISOtech" },
1871	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1872	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1873	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1874	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1875	{ 8801, "equiv","identical to, U+2261 ISOtech" },
1876	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1877	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1878	{ 8834, "sub", "subset of, U+2282 ISOtech" },
1879	{ 8835, "sup", "superset of, U+2283 ISOtech" },
1880	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1881	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1882	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1883	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1884	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1885	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1886	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1887	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1888	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1889	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1890	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1891	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1892	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1893	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1894
1895	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1896	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1897	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1898	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1899
1900	};
1901
1902	/************************************************************************
1903	* *
1904	* Commodity functions to handle entities *
1905	* *
1906	************************************************************************/
1907
/*
 * growBuffer:
 * Macro used to grow the current buffer.
 *
 * Doubles the capacity of the xmlChar buffer named `buffer`; the capacity
 * is tracked in the companion variable `buffer_size` (built by token
 * pasting, so the two names must match).  The macro also expects a
 * variable `ctxt` to be in scope for error reporting.
 *
 * On reallocation failure it reports the error, frees the old buffer and
 * executes `return(NULL)` in the ENCLOSING function, so it may only be
 * used inside functions returning a pointer.  Any pointer into the old
 * buffer must be recomputed by the caller after the macro runs.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {							\
        htmlErrMemory(ctxt, "growing buffer\n");			\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
    buffer = tmp;							\
}
1922
1923	/**
1924	* htmlEntityLookup:
1925	* @name: the entity name
1926	*
1927	* Lookup the given entity in EntitiesTable
1928	*
1929	* TODO: the linear scan is really ugly, an hash table is really needed.
1930	*
1931	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1932	*/
1933	const htmlEntityDesc *
1934	htmlEntityLookup(const xmlChar *name) {
1935	unsigned int i;
1936
1937	for (i = 0;i < (sizeof(html40EntitiesTable)/
1938	sizeof(html40EntitiesTable[0]));i++) {
1939	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1940	return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1941	}
1942	}
1943	return(NULL);
1944	}
1945
1946	/**
1947	* htmlEntityValueLookup:
1948	* @value: the entity's unicode value
1949	*
1950	* Lookup the given entity in EntitiesTable
1951	*
1952	* TODO: the linear scan is really ugly, an hash table is really needed.
1953	*
1954	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1955	*/
1956	const htmlEntityDesc *
1957	htmlEntityValueLookup(unsigned int value) {
1958	unsigned int i;
1959
1960	for (i = 0;i < (sizeof(html40EntitiesTable)/
1961	sizeof(html40EntitiesTable[0]));i++) {
1962	if (html40EntitiesTable[i].value >= value) {
1963	if (html40EntitiesTable[i].value > value)
1964	break;
1965	return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1966	}
1967	}
1968	return(NULL);
1969	}
1970
1971	/**
1972	* UTF8ToHtml:
1973	* @out: a pointer to an array of bytes to store the result
1974	* @outlen: the length of @out
1975	* @in: a pointer to an array of UTF-8 chars
1976	* @inlen: the length of @in
1977	*
1978	* Take a block of UTF-8 chars in and try to convert it to an ASCII
1979	* plus HTML entities block of chars out.
1980	*
1981	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1982	* The value of @inlen after return is the number of octets consumed
1983	* as the return value is positive, else unpredictable.
1984	* The value of @outlen after return is the number of octets consumed.
1985	*/
1986	int
1987	UTF8ToHtml(unsigned char* out, int *outlen,
1988	const unsigned char* in, int *inlen) {
1989	const unsigned char* processed = in;
1990	const unsigned char* outend;
1991	const unsigned char* outstart = out;
1992	const unsigned char* instart = in;
1993	const unsigned char* inend;
1994	unsigned int c, d;
1995	int trailing;
1996
1997	if ((out == NULL) \|\| (outlen == NULL) \|\| (inlen == NULL)) return(-1);
1998	if (in == NULL) {
1999	/*
2000	* initialization nothing to do
2001	*/
2002	*outlen = 0;
2003	*inlen = 0;
2004	return(0);
2005	}
2006	inend = in + (*inlen);
2007	outend = out + (*outlen);
2008	while (in < inend) {
2009	d = *in++;
2010	if (d < 0x80) { c= d; trailing= 0; }
2011	else if (d < 0xC0) {
2012	/* trailing byte in leading position */
2013	*outlen = out - outstart;
2014	*inlen = processed - instart;
2015	return(-2);
2016	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2017	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2018	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2019	else {
2020	/* no chance for this in Ascii */
2021	*outlen = out - outstart;
2022	*inlen = processed - instart;
2023	return(-2);
2024	}
2025
2026	if (inend - in < trailing) {
2027	break;
2028	}
2029
2030	for ( ; trailing; trailing--) {
2031	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
2032	break;
2033	c <<= 6;
2034	c \|= d & 0x3F;
2035	}
2036
2037	/* assertion: c is a single UTF-4 value */
2038	if (c < 0x80) {
2039	if (out + 1 >= outend)
2040	break;
2041	*out++ = c;
2042	} else {
2043	int len;
2044	const htmlEntityDesc * ent;
2045	const char *cp;
2046	char nbuf[16];
2047
2048	/*
2049	* Try to lookup a predefined HTML entity for it
2050	*/
2051
2052	ent = htmlEntityValueLookup(c);
2053	if (ent == NULL) {
2054	snprintf(nbuf, sizeof(nbuf), "#%u", c);
2055	cp = nbuf;
2056	}
2057	else
2058	cp = ent->name;
2059	len = strlen(cp);
2060	if (out + 2 + len >= outend)
2061	break;
2062	*out++ = '&';
2063	memcpy(out, cp, len);
2064	out += len;
2065	*out++ = ';';
2066	}
2067	processed = in;
2068	}
2069	*outlen = out - outstart;
2070	*inlen = processed - instart;
2071	return(0);
2072	}
2073
2074	/**
2075	* htmlEncodeEntities:
2076	* @out: a pointer to an array of bytes to store the result
2077	* @outlen: the length of @out
2078	* @in: a pointer to an array of UTF-8 chars
2079	* @inlen: the length of @in
2080	* @quoteChar: the quote character to escape (' or ") or zero.
2081	*
2082	* Take a block of UTF-8 chars in and try to convert it to an ASCII
2083	* plus HTML entities block of chars out.
2084	*
2085	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2086	* The value of @inlen after return is the number of octets consumed
2087	* as the return value is positive, else unpredictable.
2088	* The value of @outlen after return is the number of octets consumed.
2089	*/
2090	int
2091	htmlEncodeEntities(unsigned char* out, int *outlen,
2092	const unsigned char* in, int *inlen, int quoteChar) {
2093	const unsigned char* processed = in;
2094	const unsigned char* outend;
2095	const unsigned char* outstart = out;
2096	const unsigned char* instart = in;
2097	const unsigned char* inend;
2098	unsigned int c, d;
2099	int trailing;
2100
2101	if ((out == NULL) \|\| (outlen == NULL) \|\| (inlen == NULL) \|\| (in == NULL))
2102	return(-1);
2103	outend = out + (*outlen);
2104	inend = in + (*inlen);
2105	while (in < inend) {
2106	d = *in++;
2107	if (d < 0x80) { c= d; trailing= 0; }
2108	else if (d < 0xC0) {
2109	/* trailing byte in leading position */
2110	*outlen = out - outstart;
2111	*inlen = processed - instart;
2112	return(-2);
2113	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2114	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2115	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2116	else {
2117	/* no chance for this in Ascii */
2118	*outlen = out - outstart;
2119	*inlen = processed - instart;
2120	return(-2);
2121	}
2122
2123	if (inend - in < trailing)
2124	break;
2125
2126	while (trailing--) {
2127	if (((d= *in++) & 0xC0) != 0x80) {
2128	*outlen = out - outstart;
2129	*inlen = processed - instart;
2130	return(-2);
2131	}
2132	c <<= 6;
2133	c \|= d & 0x3F;
2134	}
2135
2136	/* assertion: c is a single UTF-4 value */
2137	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2138	(c != '&') && (c != '<') && (c != '>')) {
2139	if (out >= outend)
2140	break;
2141	*out++ = c;
2142	} else {
2143	const htmlEntityDesc * ent;
2144	const char *cp;
2145	char nbuf[16];
2146	int len;
2147
2148	/*
2149	* Try to lookup a predefined HTML entity for it
2150	*/
2151	ent = htmlEntityValueLookup(c);
2152	if (ent == NULL) {
2153	snprintf(nbuf, sizeof(nbuf), "#%u", c);
2154	cp = nbuf;
2155	}
2156	else
2157	cp = ent->name;
2158	len = strlen(cp);
2159	if (out + 2 + len > outend)
2160	break;
2161	*out++ = '&';
2162	memcpy(out, cp, len);
2163	out += len;
2164	*out++ = ';';
2165	}
2166	processed = in;
2167	}
2168	*outlen = out - outstart;
2169	*inlen = processed - instart;
2170	return(0);
2171	}
2172
2173	/************************************************************************
2174	* *
2175	* Commodity functions to handle streams *
2176	* *
2177	************************************************************************/
2178
2179	/**
2180	* htmlNewInputStream:
2181	* @ctxt: an HTML parser context
2182	*
2183	* Create a new input stream structure
2184	* Returns the new input stream or NULL
2185	*/
2186	static htmlParserInputPtr
2187	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2188	htmlParserInputPtr input;
2189
2190	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2191	if (input == NULL) {
2192	htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2193	return(NULL);
2194	}
2195	memset(input, 0, sizeof(htmlParserInput));
2196	input->filename = NULL;
2197	input->directory = NULL;
2198	input->base = NULL;
2199	input->cur = NULL;
2200	input->buf = NULL;
2201	input->line = 1;
2202	input->col = 1;
2203	input->buf = NULL;
2204	input->free = NULL;
2205	input->version = NULL;
2206	input->consumed = 0;
2207	input->length = 0;
2208	return(input);
2209	}
2210
2211
2212	/************************************************************************
2213	* *
2214	* Commodity functions, cleanup needed ? *
2215	* *
2216	************************************************************************/
/*
 * All tags allowing PCDATA, taken from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but I don't want to risk any
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2231
2232	/**
2233	* areBlanks:
2234	* @ctxt: an HTML parser context
2235	* @str: a xmlChar *
2236	* @len: the size of @str
2237	*
2238	* Is this a sequence of blank chars that one can ignore ?
2239	*
2240	* Returns 1 if ignorable 0 otherwise.
2241	*/
2242
2243	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2244	unsigned int i;
2245	int j;
2246	xmlNodePtr lastChild;
2247	xmlDtdPtr dtd;
2248
2249	for (j = 0;j < len;j++)
2250	if (!(IS_BLANK_CH(str[j]))) return(0);
2251
2252	if (CUR == 0) return(1);
2253	if (CUR != '<') return(0);
2254	if (ctxt->name == NULL)
2255	return(1);
2256	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2257	return(1);
2258	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2259	return(1);
2260
2261	/* Only strip CDATA children of the body tag for strict HTML DTDs */
2262	if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2263	dtd = xmlGetIntSubset(ctxt->myDoc);
2264	if (dtd != NULL && dtd->ExternalID != NULL) {
2265	if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") \|\|
2266	!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2267	return(1);
2268	}
2269	}
2270
2271	if (ctxt->node == NULL) return(0);
2272	lastChild = xmlGetLastChild(ctxt->node);
2273	while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2274	lastChild = lastChild->prev;
2275	if (lastChild == NULL) {
2276	if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2277	(ctxt->node->content != NULL)) return(0);
2278	/* keep ws in constructs like ...<b> </b>...
2279	for all tags "b" allowing PCDATA */
2280	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2281	if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2282	return(0);
2283	}
2284	}
2285	} else if (xmlNodeIsText(lastChild)) {
2286	return(0);
2287	} else {
2288	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2289	for all tags "p" allowing PCDATA */
2290	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2291	if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2292	return(0);
2293	}
2294	}
2295	}
2296	return(1);
2297	}
2298
2299	/**
2300	* htmlNewDocNoDtD:
2301	* @URI: URI for the dtd, or NULL
2302	* @ExternalID: the external ID of the DTD, or NULL
2303	*
2304	* Creates a new HTML document without a DTD node if @URI and @ExternalID
2305	* are NULL
2306	*
2307	* Returns a new document, do not initialize the DTD if not provided
2308	*/
2309	htmlDocPtr
2310	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
2311	xmlDocPtr cur;
2312
2313	/*
2314	* Allocate a new document and fill the fields.
2315	*/
2316	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2317	if (cur == NULL) {
2318	htmlErrMemory(NULL, "HTML document creation failed\n");
2319	return(NULL);
2320	}
2321	memset(cur, 0, sizeof(xmlDoc));
2322
2323	cur->type = XML_HTML_DOCUMENT_NODE;
2324	cur->version = NULL;
2325	cur->intSubset = NULL;
2326	cur->doc = cur;
2327	cur->name = NULL;
2328	cur->children = NULL;
2329	cur->extSubset = NULL;
2330	cur->oldNs = NULL;
2331	cur->encoding = NULL;
2332	cur->standalone = 1;
2333	cur->compression = 0;
2334	cur->ids = NULL;
2335	cur->refs = NULL;
2336	cur->_private = NULL;
2337	cur->charset = XML_CHAR_ENCODING_UTF8;
2338	cur->properties = XML_DOC_HTML \| XML_DOC_USERBUILT;
2339	if ((ExternalID != NULL) \|\|
2340	(URI != NULL))
2341	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2342	return(cur);
2343	}
2344
2345	/**
2346	* htmlNewDoc:
2347	* @URI: URI for the dtd, or NULL
2348	* @ExternalID: the external ID of the DTD, or NULL
2349	*
2350	* Creates a new HTML document
2351	*
2352	* Returns a new document
2353	*/
2354	htmlDocPtr
2355	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
2356	if ((URI == NULL) && (ExternalID == NULL))
2357	return(htmlNewDocNoDtD(
2358	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2359	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2360
2361	return(htmlNewDocNoDtD(URI, ExternalID));
2362	}
2363
2364
2365	/************************************************************************
2366	* *
2367	* The parser itself *
2368	* Relates to http://www.w3.org/TR/html40 *
2369	* *
2370	************************************************************************/
2371
2372	/************************************************************************
2373	* *
2374	* The parser itself *
2375	* *
2376	************************************************************************/
2377
2378	static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2379
2380	/**
2381	* htmlParseHTMLName:
2382	* @ctxt: an HTML parser context
2383	*
2384	* parse an HTML tag or attribute name, note that we convert it to lowercase
2385	* since HTML names are not case-sensitive.
2386	*
2387	* Returns the Tag Name parsed or NULL
2388	*/
2389
2390	static const xmlChar *
2391	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2392	int i = 0;
2393	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2394
2395	if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2396	(CUR != ':') && (CUR != '.')) return(NULL);
2397
2398	while ((i < HTML_PARSER_BUFFER_SIZE) &&
2399	((IS_ASCII_LETTER(CUR)) \|\| (IS_ASCII_DIGIT(CUR)) \|\|
2400	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_') \|\|
2401	(CUR == '.'))) {
2402	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2403	else loc[i] = CUR;
2404	i++;
2405
2406	NEXT;
2407	}
2408
2409	return(xmlDictLookup(ctxt->dict, loc, i));
2410	}
2411
2412
2413	/**
2414	* htmlParseHTMLName_nonInvasive:
2415	* @ctxt: an HTML parser context
2416	*
2417	* parse an HTML tag or attribute name, note that we convert it to lowercase
2418	* since HTML names are not case-sensitive, this doesn't consume the data
2419	* from the stream, it's a look-ahead
2420	*
2421	* Returns the Tag Name parsed or NULL
2422	*/
2423
2424	static const xmlChar *
2425	htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2426	int i = 0;
2427	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2428
2429	if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2430	(NXT(1) != ':')) return(NULL);
2431
2432	while ((i < HTML_PARSER_BUFFER_SIZE) &&
2433	((IS_ASCII_LETTER(NXT(1+i))) \|\| (IS_ASCII_DIGIT(NXT(1+i))) \|\|
2434	(NXT(1+i) == ':') \|\| (NXT(1+i) == '-') \|\| (NXT(1+i) == '_'))) {
2435	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2436	else loc[i] = NXT(1+i);
2437	i++;
2438	}
2439
2440	return(xmlDictLookup(ctxt->dict, loc, i));
2441	}
2442
2443
2444	/**
2445	* htmlParseName:
2446	* @ctxt: an HTML parser context
2447	*
2448	* parse an HTML name, this routine is case sensitive.
2449	*
2450	* Returns the Name parsed or NULL
2451	*/
2452
2453	static const xmlChar *
2454	htmlParseName(htmlParserCtxtPtr ctxt) {
2455	const xmlChar *in;
2456	const xmlChar *ret;
2457	int count = 0;
2458
2459	GROW;
2460
2461	/*
2462	* Accelerator for simple ASCII names
2463	*/
2464	in = ctxt->input->cur;
2465	if (((in >= 0x61) && (in <= 0x7A)) \|\|
2466	((in >= 0x41) && (in <= 0x5A)) \|\|
2467	(in == '_') \|\| (in == ':')) {
2468	in++;
2469	while (((in >= 0x61) && (in <= 0x7A)) \|\|
2470	((in >= 0x41) && (in <= 0x5A)) \|\|
2471	((in >= 0x30) && (in <= 0x39)) \|\|
2472	(in == '_') \|\| (in == '-') \|\|
2473	(in == ':') \|\| (in == '.'))
2474	in++;
2475
2476	if (in == ctxt->input->end)
2477	return(NULL);
2478
2479	if ((in > 0) && (in < 0x80)) {
2480	count = in - ctxt->input->cur;
2481	ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2482	ctxt->input->cur = in;
2483	ctxt->nbChars += count;
2484	ctxt->input->col += count;
2485	return(ret);
2486	}
2487	}
2488	return(htmlParseNameComplex(ctxt));
2489	}
2490
2491	static const xmlChar *
2492	htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2493	int len = 0, l;
2494	int c;
2495	int count = 0;
2496	const xmlChar *base = ctxt->input->base;
2497
2498	/*
2499	* Handler for more complex cases
2500	*/
2501	GROW;
2502	c = CUR_CHAR(l);
2503	if ((c == ' ') \|\| (c == '>') \|\| (c == '/') \|\| /* accelerators */
2504	(!IS_LETTER(c) && (c != '_') &&
2505	(c != ':'))) {
2506	return(NULL);
2507	}
2508
2509	while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2510	((IS_LETTER(c)) \|\| (IS_DIGIT(c)) \|\|
2511	(c == '.') \|\| (c == '-') \|\|
2512	(c == '_') \|\| (c == ':') \|\|
2513	(IS_COMBINING(c)) \|\|
2514	(IS_EXTENDER(c)))) {
2515	if (count++ > 100) {
2516	count = 0;
2517	GROW;
2518	}
2519	len += l;
2520	NEXTL(l);
2521	c = CUR_CHAR(l);
2522	if (ctxt->input->base != base) {
2523	/*
2524	* We changed encoding from an unknown encoding
2525	* Input buffer changed location, so we better start again
2526	*/
2527	return(htmlParseNameComplex(ctxt));
2528	}
2529	}
2530
2531	if (ctxt->input->base > ctxt->input->cur - len)
2532	return(NULL);
2533
2534	return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2535	}
2536
2537
2538	/**
2539	* htmlParseHTMLAttribute:
2540	* @ctxt: an HTML parser context
2541	* @stop: a char stop value
2542	*
2543	* parse an HTML attribute value till the stop (quote), if
2544	* stop is 0 then it stops at the first space
2545	*
2546	* Returns the attribute parsed or NULL
2547	*/
2548
2549	static xmlChar *
2550	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2551	xmlChar *buffer = NULL;
2552	int buffer_size = 0;
2553	xmlChar *out = NULL;
2554	const xmlChar *name = NULL;
2555	const xmlChar *cur = NULL;
2556	const htmlEntityDesc * ent;
2557
2558	/*
2559	* allocate a translation buffer.
2560	*/
2561	buffer_size = HTML_PARSER_BUFFER_SIZE;
2562	buffer = (xmlChar ) xmlMallocAtomic(buffer_size sizeof(xmlChar));
2563	if (buffer == NULL) {
2564	htmlErrMemory(ctxt, "buffer allocation failed\n");
2565	return(NULL);
2566	}
2567	out = buffer;
2568
2569	/*
2570	* Ok loop until we reach one of the ending chars
2571	*/
2572	while ((CUR != 0) && (CUR != stop)) {
2573	if ((stop == 0) && (CUR == '>')) break;
2574	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2575	if (CUR == '&') {
2576	if (NXT(1) == '#') {
2577	unsigned int c;
2578	int bits;
2579
2580	c = htmlParseCharRef(ctxt);
2581	if (c < 0x80)
2582	{ *out++ = c; bits= -6; }
2583	else if (c < 0x800)
2584	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
2585	else if (c < 0x10000)
2586	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
2587	else
2588	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
2589
2590	for ( ; bits >= 0; bits-= 6) {
2591	*out++ = ((c >> bits) & 0x3F) \| 0x80;
2592	}
2593
2594	if (out - buffer > buffer_size - 100) {
2595	int indx = out - buffer;
2596
2597	growBuffer(buffer);
2598	out = &buffer[indx];
2599	}
2600	} else {
2601	ent = htmlParseEntityRef(ctxt, &name);
2602	if (name == NULL) {
2603	*out++ = '&';
2604	if (out - buffer > buffer_size - 100) {
2605	int indx = out - buffer;
2606
2607	growBuffer(buffer);
2608	out = &buffer[indx];
2609	}
2610	} else if (ent == NULL) {
2611	*out++ = '&';
2612	cur = name;
2613	while (*cur != 0) {
2614	if (out - buffer > buffer_size - 100) {
2615	int indx = out - buffer;
2616
2617	growBuffer(buffer);
2618	out = &buffer[indx];
2619	}
2620	out++ = cur++;
2621	}
2622	} else {
2623	unsigned int c;
2624	int bits;
2625
2626	if (out - buffer > buffer_size - 100) {
2627	int indx = out - buffer;
2628
2629	growBuffer(buffer);
2630	out = &buffer[indx];
2631	}
2632	c = ent->value;
2633	if (c < 0x80)
2634	{ *out++ = c; bits= -6; }
2635	else if (c < 0x800)
2636	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
2637	else if (c < 0x10000)
2638	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
2639	else
2640	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
2641
2642	for ( ; bits >= 0; bits-= 6) {
2643	*out++ = ((c >> bits) & 0x3F) \| 0x80;
2644	}
2645	}
2646	}
2647	} else {
2648	unsigned int c;
2649	int bits, l;
2650
2651	if (out - buffer > buffer_size - 100) {
2652	int indx = out - buffer;
2653
2654	growBuffer(buffer);
2655	out = &buffer[indx];
2656	}
2657	c = CUR_CHAR(l);
2658	if (c < 0x80)
2659	{ *out++ = c; bits= -6; }
2660	else if (c < 0x800)
2661	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
2662	else if (c < 0x10000)
2663	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
2664	else
2665	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
2666
2667	for ( ; bits >= 0; bits-= 6) {
2668	*out++ = ((c >> bits) & 0x3F) \| 0x80;
2669	}
2670	NEXT;
2671	}
2672	}
2673	*out = 0;
2674	return(buffer);
2675	}
2676
2677	/**
2678	* htmlParseEntityRef:
2679	* @ctxt: an HTML parser context
2680	* @str: location to store the entity name
2681	*
2682	* parse an HTML ENTITY references
2683	*
2684	* [68] EntityRef ::= '&' Name ';'
2685	*
2686	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2687	* if non-NULL *str will have to be freed by the caller.
2688	*/
2689	const htmlEntityDesc *
2690	htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2691	const xmlChar *name;
2692	const htmlEntityDesc * ent = NULL;
2693
2694	if (str != NULL) *str = NULL;
2695	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) return(NULL);
2696
2697	if (CUR == '&') {
2698	NEXT;
2699	name = htmlParseName(ctxt);
2700	if (name == NULL) {
2701	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2702	"htmlParseEntityRef: no name\n", NULL, NULL);
2703	} else {
2704	GROW;
2705	if (CUR == ';') {
2706	if (str != NULL)
2707	*str = name;
2708
2709	/*
2710	* Lookup the entity in the table.
2711	*/
2712	ent = htmlEntityLookup(name);
2713	if (ent != NULL) /* OK that's ugly !!! */
2714	NEXT;
2715	} else {
2716	htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2717	"htmlParseEntityRef: expecting ';'\n",
2718	NULL, NULL);
2719	if (str != NULL)
2720	*str = name;
2721	}
2722	}
2723	}
2724	return(ent);
2725	}
2726
2727	/**
2728	* htmlParseAttValue:
2729	* @ctxt: an HTML parser context
2730	*
2731	* parse a value for an attribute
2732	* Note: the parser won't do substitution of entities here, this
2733	* will be handled later in xmlStringGetNodeList, unless it was
2734	* asked for ctxt->replaceEntities != 0
2735	*
2736	* Returns the AttValue parsed or NULL.
2737	*/
2738
2739	static xmlChar *
2740	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2741	xmlChar *ret = NULL;
2742
2743	if (CUR == '"') {
2744	NEXT;
2745	ret = htmlParseHTMLAttribute(ctxt, '"');
2746	if (CUR != '"') {
2747	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2748	"AttValue: \" expected\n", NULL, NULL);
2749	} else
2750	NEXT;
2751	} else if (CUR == '\'') {
2752	NEXT;
2753	ret = htmlParseHTMLAttribute(ctxt, '\'');
2754	if (CUR != '\'') {
2755	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2756	"AttValue: ' expected\n", NULL, NULL);
2757	} else
2758	NEXT;
2759	} else {
2760	/*
2761	* That's an HTMLism, the attribute value may not be quoted
2762	*/
2763	ret = htmlParseHTMLAttribute(ctxt, 0);
2764	if (ret == NULL) {
2765	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2766	"AttValue: no value found\n", NULL, NULL);
2767	}
2768	}
2769	return(ret);
2770	}
2771
2772	/**
2773	* htmlParseSystemLiteral:
2774	* @ctxt: an HTML parser context
2775	*
2776	* parse an HTML Literal
2777	*
2778	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
2779	*
2780	* Returns the SystemLiteral parsed or NULL
2781	*/
2782
2783	static xmlChar *
2784	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2785	size_t len = 0, startPosition = 0;
2786	xmlChar *ret = NULL;
2787
2788	if (CUR == '"') {
2789	NEXT;
2790
2791	if (CUR_PTR < BASE_PTR)
2792	return(ret);
2793	startPosition = CUR_PTR - BASE_PTR;
2794
2795	while ((IS_CHAR_CH(CUR)) && (CUR != '"')) {
2796	NEXT;
2797	len++;
2798	}
2799	if (!IS_CHAR_CH(CUR)) {
2800	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2801	"Unfinished SystemLiteral\n", NULL, NULL);
2802	} else {
2803	ret = xmlStrndup((BASE_PTR+startPosition), len);
2804	NEXT;
2805	}
2806	} else if (CUR == '\'') {
2807	NEXT;
2808
2809	if (CUR_PTR < BASE_PTR)
2810	return(ret);
2811	startPosition = CUR_PTR - BASE_PTR;
2812
2813	while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) {
2814	NEXT;
2815	len++;
2816	}
2817	if (!IS_CHAR_CH(CUR)) {
2818	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2819	"Unfinished SystemLiteral\n", NULL, NULL);
2820	} else {
2821	ret = xmlStrndup((BASE_PTR+startPosition), len);
2822	NEXT;
2823	}
2824	} else {
2825	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2826	" or ' expected\n", NULL, NULL);
2827	}
2828
2829	return(ret);
2830	}
2831
2832	/**
2833	* htmlParsePubidLiteral:
2834	* @ctxt: an HTML parser context
2835	*
2836	* parse an HTML public literal
2837	*
2838	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
2839	*
2840	* Returns the PubidLiteral parsed or NULL.
2841	*/
2842
2843	static xmlChar *
2844	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2845	size_t len = 0, startPosition = 0;
2846	xmlChar *ret = NULL;
2847	/*
2848	* Name ::= (Letter \| '_') (NameChar)*
2849	*/
2850	if (CUR == '"') {
2851	NEXT;
2852
2853	if (CUR_PTR < BASE_PTR)
2854	return(ret);
2855	startPosition = CUR_PTR - BASE_PTR;
2856
2857	while (IS_PUBIDCHAR_CH(CUR)) {
2858	len++;
2859	NEXT;
2860	}
2861
2862	if (CUR != '"') {
2863	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2864	"Unfinished PubidLiteral\n", NULL, NULL);
2865	} else {
2866	ret = xmlStrndup((BASE_PTR + startPosition), len);
2867	NEXT;
2868	}
2869	} else if (CUR == '\'') {
2870	NEXT;
2871
2872	if (CUR_PTR < BASE_PTR)
2873	return(ret);
2874	startPosition = CUR_PTR - BASE_PTR;
2875
2876	while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){
2877	len++;
2878	NEXT;
2879	}
2880
2881	if (CUR != '\'') {
2882	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2883	"Unfinished PubidLiteral\n", NULL, NULL);
2884	} else {
2885	ret = xmlStrndup((BASE_PTR + startPosition), len);
2886	NEXT;
2887	}
2888	} else {
2889	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2890	"PubidLiteral \" or ' expected\n", NULL, NULL);
2891	}
2892
2893	return(ret);
2894	}
2895
2896	/**
2897	* htmlParseScript:
2898	* @ctxt: an HTML parser context
2899	*
2900	* parse the content of an HTML SCRIPT or STYLE element
2901	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
2902	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2903	* http://www.w3.org/TR/html4/types.html#type-script
2904	* http://www.w3.org/TR/html4/types.html#h-6.15
2905	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2906	*
2907	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
2908	* element and the value of intrinsic event attributes. User agents must
2909	* not evaluate script data as HTML markup but instead must pass it on as
2910	* data to a script engine.
2911	* NOTES:
2912	* - The content is passed like CDATA
2913	* - the attributes for style and scripting "onXXX" are also described
2914	* as CDATA but SGML allows entities references in attributes so their
2915	* processing is identical as other attributes
2916	*/
2917	static void
2918	htmlParseScript(htmlParserCtxtPtr ctxt) {
2919	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2920	int nbchar = 0;
2921	int cur,l;
2922
2923	SHRINK;
2924	cur = CUR_CHAR(l);
2925	while (IS_CHAR_CH(cur)) {
2926	if ((cur == '<') && (NXT(1) == '/')) {
2927	/*
2928	* One should break here, the specification is clear:
2929	* Authors should therefore escape "</" within the content.
2930	* Escape mechanisms are specific to each scripting or
2931	* style sheet language.
2932	*
2933	* In recovery mode, only break if end tag match the
2934	* current tag, effectively ignoring all tags inside the
2935	* script/style block and treating the entire block as
2936	* CDATA.
2937	*/
2938	if (ctxt->recovery) {
2939	if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2940	xmlStrlen(ctxt->name)) == 0)
2941	{
2942	break; /* while */
2943	} else {
2944	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2945	"Element %s embeds close tag\n",
2946	ctxt->name, NULL);
2947	}
2948	} else {
2949	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
2950	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2951	{
2952	break; /* while */
2953	}
2954	}
2955	}
2956	COPY_BUF(l,buf,nbchar,cur);
2957	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2958	if (ctxt->sax->cdataBlock!= NULL) {
2959	/*
2960	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2961	*/
2962	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2963	} else if (ctxt->sax->characters != NULL) {
2964	ctxt->sax->characters(ctxt->userData, buf, nbchar);
2965	}
2966	nbchar = 0;
2967	}
2968	GROW;
2969	NEXTL(l);
2970	cur = CUR_CHAR(l);
2971	}
2972
2973	if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2974	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2975	"Invalid char in CDATA 0x%X\n", cur);
2976	if (ctxt->input->cur < ctxt->input->end) {
2977	NEXT;
2978	}
2979	}
2980
2981	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2982	if (ctxt->sax->cdataBlock!= NULL) {
2983	/*
2984	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2985	*/
2986	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2987	} else if (ctxt->sax->characters != NULL) {
2988	ctxt->sax->characters(ctxt->userData, buf, nbchar);
2989	}
2990	}
2991	}
2992
2993
2994	/**
2995	* htmlParseCharDataInternal:
2996	* @ctxt: an HTML parser context
2997	* @readahead: optional read ahead character in ascii range
2998	*
2999	* parse a CharData section.
3000	* if we are within a CDATA section ']]>' marks an end of section.
3001	*
3002	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3003	*/
3004
3005	static void
3006	htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3007	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3008	int nbchar = 0;
3009	int cur, l;
3010	int chunk = 0;
3011
3012	if (readahead)
3013	buf[nbchar++] = readahead;
3014
3015	SHRINK;
3016	cur = CUR_CHAR(l);
3017	while (((cur != '<') \|\| (ctxt->token == '<')) &&
3018	((cur != '&') \|\| (ctxt->token == '&')) &&
3019	(cur != 0)) {
3020	if (!(IS_CHAR(cur))) {
3021	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3022	"Invalid char in CDATA 0x%X\n", cur);
3023	} else {
3024	COPY_BUF(l,buf,nbchar,cur);
3025	}
3026	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3027	/*
3028	* Ok the segment is to be consumed as chars.
3029	*/
3030	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3031	if (areBlanks(ctxt, buf, nbchar)) {
3032	if (ctxt->keepBlanks) {
3033	if (ctxt->sax->characters != NULL)
3034	ctxt->sax->characters(ctxt->userData, buf, nbchar);
3035	} else {
3036	if (ctxt->sax->ignorableWhitespace != NULL)
3037	ctxt->sax->ignorableWhitespace(ctxt->userData,
3038	buf, nbchar);
3039	}
3040	} else {
3041	htmlCheckParagraph(ctxt);
3042	if (ctxt->sax->characters != NULL)
3043	ctxt->sax->characters(ctxt->userData, buf, nbchar);
3044	}
3045	}
3046	nbchar = 0;
3047	}
3048	NEXTL(l);
3049	chunk++;
3050	if (chunk > HTML_PARSER_BUFFER_SIZE) {
3051	chunk = 0;
3052	SHRINK;
3053	GROW;
3054	}
3055	cur = CUR_CHAR(l);
3056	if (cur == 0) {
3057	SHRINK;
3058	GROW;
3059	cur = CUR_CHAR(l);
3060	}
3061	}
3062	if (nbchar != 0) {
3063	buf[nbchar] = 0;
3064
3065	/*
3066	* Ok the segment is to be consumed as chars.
3067	*/
3068	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3069	if (areBlanks(ctxt, buf, nbchar)) {
3070	if (ctxt->keepBlanks) {
3071	if (ctxt->sax->characters != NULL)
3072	ctxt->sax->characters(ctxt->userData, buf, nbchar);
3073	} else {
3074	if (ctxt->sax->ignorableWhitespace != NULL)
3075	ctxt->sax->ignorableWhitespace(ctxt->userData,
3076	buf, nbchar);
3077	}
3078	} else {
3079	htmlCheckParagraph(ctxt);
3080	if (ctxt->sax->characters != NULL)
3081	ctxt->sax->characters(ctxt->userData, buf, nbchar);
3082	}
3083	}
3084	} else {
3085	/*
3086	* Loop detection
3087	*/
3088	if (cur == 0)
3089	ctxt->instate = XML_PARSER_EOF;
3090	}
3091	}
3092
3093	/**
3094	* htmlParseCharData:
3095	* @ctxt: an HTML parser context
3096	*
3097	* parse a CharData section.
3098	* if we are within a CDATA section ']]>' marks an end of section.
3099	*
3100	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3101	*/
3102
3103	static void
3104	htmlParseCharData(htmlParserCtxtPtr ctxt) {
3105	htmlParseCharDataInternal(ctxt, 0);
3106	}
3107
3108	/**
3109	* htmlParseExternalID:
3110	* @ctxt: an HTML parser context
3111	* @publicID: a xmlChar** receiving PubidLiteral
3112	*
3113	* Parse an External ID or a Public ID
3114	*
3115	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3116	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
3117	*
3118	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
3119	*
3120	* Returns the function returns SystemLiteral and in the second
3121	* case publicID receives PubidLiteral, is strict is off
3122	* it is possible to return NULL and have publicID set.
3123	*/
3124
3125	static xmlChar *
3126	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3127	xmlChar *URI = NULL;
3128
3129	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3130	(UPP(2) == 'S') && (UPP(3) == 'T') &&
3131	(UPP(4) == 'E') && (UPP(5) == 'M')) {
3132	SKIP(6);
3133	if (!IS_BLANK_CH(CUR)) {
3134	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3135	"Space required after 'SYSTEM'\n", NULL, NULL);
3136	}
3137	SKIP_BLANKS;
3138	URI = htmlParseSystemLiteral(ctxt);
3139	if (URI == NULL) {
3140	htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3141	"htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3142	}
3143	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3144	(UPP(2) == 'B') && (UPP(3) == 'L') &&
3145	(UPP(4) == 'I') && (UPP(5) == 'C')) {
3146	SKIP(6);
3147	if (!IS_BLANK_CH(CUR)) {
3148	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3149	"Space required after 'PUBLIC'\n", NULL, NULL);
3150	}
3151	SKIP_BLANKS;
3152	*publicID = htmlParsePubidLiteral(ctxt);
3153	if (*publicID == NULL) {
3154	htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3155	"htmlParseExternalID: PUBLIC, no Public Identifier\n",
3156	NULL, NULL);
3157	}
3158	SKIP_BLANKS;
3159	if ((CUR == '"') \|\| (CUR == '\'')) {
3160	URI = htmlParseSystemLiteral(ctxt);
3161	}
3162	}
3163	return(URI);
3164	}
3165
3166	/**
3167	* xmlParsePI:
3168	* @ctxt: an XML parser context
3169	*
3170	* parse an XML Processing Instruction.
3171	*
3172	* [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3173	*/
3174	static void
3175	htmlParsePI(htmlParserCtxtPtr ctxt) {
3176	xmlChar *buf = NULL;
3177	int len = 0;
3178	int size = HTML_PARSER_BUFFER_SIZE;
3179	int cur, l;
3180	const xmlChar *target;
3181	xmlParserInputState state;
3182	int count = 0;
3183
3184	if ((RAW == '<') && (NXT(1) == '?')) {
3185	state = ctxt->instate;
3186	ctxt->instate = XML_PARSER_PI;
3187	/*
3188	* this is a Processing Instruction.
3189	*/
3190	SKIP(2);
3191	SHRINK;
3192
3193	/*
3194	* Parse the target name and check for special support like
3195	* namespace.
3196	*/
3197	target = htmlParseName(ctxt);
3198	if (target != NULL) {
3199	if (RAW == '>') {
3200	SKIP(1);
3201
3202	/*
3203	* SAX: PI detected.
3204	*/
3205	if ((ctxt->sax) && (!ctxt->disableSAX) &&
3206	(ctxt->sax->processingInstruction != NULL))
3207	ctxt->sax->processingInstruction(ctxt->userData,
3208	target, NULL);
3209	ctxt->instate = state;
3210	return;
3211	}
3212	buf = (xmlChar ) xmlMallocAtomic(size sizeof(xmlChar));
3213	if (buf == NULL) {
3214	htmlErrMemory(ctxt, NULL);
3215	ctxt->instate = state;
3216	return;
3217	}
3218	cur = CUR;
3219	if (!IS_BLANK(cur)) {
3220	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3221	"ParsePI: PI %s space expected\n", target, NULL);
3222	}
3223	SKIP_BLANKS;
3224	cur = CUR_CHAR(l);
3225	while (IS_CHAR(cur) && (cur != '>')) {
3226	if (len + 5 >= size) {
3227	xmlChar *tmp;
3228
3229	size *= 2;
3230	tmp = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
3231	if (tmp == NULL) {
3232	htmlErrMemory(ctxt, NULL);
3233	xmlFree(buf);
3234	ctxt->instate = state;
3235	return;
3236	}
3237	buf = tmp;
3238	}
3239	count++;
3240	if (count > 50) {
3241	GROW;
3242	count = 0;
3243	}
3244	COPY_BUF(l,buf,len,cur);
3245	NEXTL(l);
3246	cur = CUR_CHAR(l);
3247	if (cur == 0) {
3248	SHRINK;
3249	GROW;
3250	cur = CUR_CHAR(l);
3251	}
3252	}
3253	buf[len] = 0;
3254	if (cur != '>') {
3255	htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3256	"ParsePI: PI %s never end ...\n", target, NULL);
3257	} else {
3258	SKIP(1);
3259
3260	/*
3261	* SAX: PI detected.
3262	*/
3263	if ((ctxt->sax) && (!ctxt->disableSAX) &&
3264	(ctxt->sax->processingInstruction != NULL))
3265	ctxt->sax->processingInstruction(ctxt->userData,
3266	target, buf);
3267	}
3268	xmlFree(buf);
3269	} else {
3270	htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3271	"PI is not started correctly", NULL, NULL);
3272	}
3273	ctxt->instate = state;
3274	}
3275	}
3276
3277	/**
3278	* htmlParseComment:
3279	* @ctxt: an HTML parser context
3280	*
3281	* Parse an XML (SGML) comment <!-- .... -->
3282	*
3283	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
3284	*/
3285	static void
3286	htmlParseComment(htmlParserCtxtPtr ctxt) {
3287	xmlChar *buf = NULL;
3288	int len;
3289	int size = HTML_PARSER_BUFFER_SIZE;
3290	int q, ql;
3291	int r, rl;
3292	int cur, l;
3293	xmlParserInputState state;
3294
3295	/*
3296	* Check that there is a comment right here.
3297	*/
3298	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
3299	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
3300
3301	state = ctxt->instate;
3302	ctxt->instate = XML_PARSER_COMMENT;
3303	SHRINK;
3304	SKIP(4);
3305	buf = (xmlChar ) xmlMallocAtomic(size sizeof(xmlChar));
3306	if (buf == NULL) {
3307	htmlErrMemory(ctxt, "buffer allocation failed\n");
3308	ctxt->instate = state;
3309	return;
3310	}
3311	len = 0;
3312	buf[len] = 0;
3313	q = CUR_CHAR(ql);
3314	if (!IS_CHAR(q))
3315	goto unfinished;
3316	NEXTL(ql);
3317	r = CUR_CHAR(rl);
3318	if (!IS_CHAR(r))
3319	goto unfinished;
3320	NEXTL(rl);
3321	cur = CUR_CHAR(l);
3322	while (IS_CHAR(cur) &&
3323	((cur != '>') \|\|
3324	(r != '-') \|\| (q != '-'))) {
3325	if (len + 5 >= size) {
3326	xmlChar *tmp;
3327
3328	size *= 2;
3329	tmp = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
3330	if (tmp == NULL) {
3331	xmlFree(buf);
3332	htmlErrMemory(ctxt, "growing buffer failed\n");
3333	ctxt->instate = state;
3334	return;
3335	}
3336	buf = tmp;
3337	}
3338	COPY_BUF(ql,buf,len,q);
3339	q = r;
3340	ql = rl;
3341	r = cur;
3342	rl = l;
3343	NEXTL(l);
3344	cur = CUR_CHAR(l);
3345	if (cur == 0) {
3346	SHRINK;
3347	GROW;
3348	cur = CUR_CHAR(l);
3349	}
3350	}
3351	buf[len] = 0;
3352	if (IS_CHAR(cur)) {
3353	NEXT;
3354	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3355	(!ctxt->disableSAX))
3356	ctxt->sax->comment(ctxt->userData, buf);
3357	xmlFree(buf);
3358	ctxt->instate = state;
3359	return;
3360	}
3361
3362	unfinished:
3363	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3364	"Comment not terminated \n<!--%.50s\n", buf, NULL);
3365	xmlFree(buf);
3366	}
3367
3368	/**
3369	* htmlParseCharRef:
3370	* @ctxt: an HTML parser context
3371	*
3372	* parse Reference declarations
3373	*
3374	* [66] CharRef ::= '&#' [0-9]+ ';' \|
3375	* '&#x' [0-9a-fA-F]+ ';'
3376	*
3377	* Returns the value parsed (as an int)
3378	*/
3379	int
3380	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3381	int val = 0;
3382
3383	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
3384	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3385	"htmlParseCharRef: context error\n",
3386	NULL, NULL);
3387	return(0);
3388	}
3389	if ((CUR == '&') && (NXT(1) == '#') &&
3390	((NXT(2) == 'x') \|\| NXT(2) == 'X')) {
3391	SKIP(3);
3392	while (CUR != ';') {
3393	if ((CUR >= '0') && (CUR <= '9'))
3394	val = val * 16 + (CUR - '0');
3395	else if ((CUR >= 'a') && (CUR <= 'f'))
3396	val = val * 16 + (CUR - 'a') + 10;
3397	else if ((CUR >= 'A') && (CUR <= 'F'))
3398	val = val * 16 + (CUR - 'A') + 10;
3399	else {
3400	htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3401	"htmlParseCharRef: missing semicolon\n",
3402	NULL, NULL);
3403	break;
3404	}
3405	NEXT;
3406	}
3407	if (CUR == ';')
3408	NEXT;
3409	} else if ((CUR == '&') && (NXT(1) == '#')) {
3410	SKIP(2);
3411	while (CUR != ';') {
3412	if ((CUR >= '0') && (CUR <= '9'))
3413	val = val * 10 + (CUR - '0');
3414	else {
3415	htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3416	"htmlParseCharRef: missing semicolon\n",
3417	NULL, NULL);
3418	break;
3419	}
3420	NEXT;
3421	}
3422	if (CUR == ';')
3423	NEXT;
3424	} else {
3425	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3426	"htmlParseCharRef: invalid value\n", NULL, NULL);
3427	}
3428	/*
3429	* Check the value IS_CHAR ...
3430	*/
3431	if (IS_CHAR(val)) {
3432	return(val);
3433	} else {
3434	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3435	"htmlParseCharRef: invalid xmlChar value %d\n",
3436	val);
3437	}
3438	return(0);
3439	}
3440
3441
3442	/**
3443	* htmlParseDocTypeDecl:
3444	* @ctxt: an HTML parser context
3445	*
3446	* parse a DOCTYPE declaration
3447	*
3448	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3449	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
3450	*/
3451
3452	static void
3453	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3454	const xmlChar *name;
3455	xmlChar *ExternalID = NULL;
3456	xmlChar *URI = NULL;
3457
3458	/*
3459	* We know that '<!DOCTYPE' has been detected.
3460	*/
3461	SKIP(9);
3462
3463	SKIP_BLANKS;
3464
3465	/*
3466	* Parse the DOCTYPE name.
3467	*/
3468	name = htmlParseName(ctxt);
3469	if (name == NULL) {
3470	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3471	"htmlParseDocTypeDecl : no DOCTYPE name !\n",
3472	NULL, NULL);
3473	}
3474	/*
3475	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
3476	*/
3477
3478	SKIP_BLANKS;
3479
3480	/*
3481	* Check for SystemID and ExternalID
3482	*/
3483	URI = htmlParseExternalID(ctxt, &ExternalID);
3484	SKIP_BLANKS;
3485
3486	/*
3487	* We should be at the end of the DOCTYPE declaration.
3488	*/
3489	if (CUR != '>') {
3490	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3491	"DOCTYPE improperly terminated\n", NULL, NULL);
3492	/* We shouldn't try to resynchronize ... */
3493	}
3494	NEXT;
3495
3496	/*
3497	* Create or update the document accordingly to the DOCTYPE
3498	*/
3499	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3500	(!ctxt->disableSAX))
3501	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3502
3503	/*
3504	* Cleanup, since we don't use all those identifiers
3505	*/
3506	if (URI != NULL) xmlFree(URI);
3507	if (ExternalID != NULL) xmlFree(ExternalID);
3508	}
3509
3510	/**
3511	* htmlParseAttribute:
3512	* @ctxt: an HTML parser context
3513	* @value: a xmlChar ** used to store the value of the attribute
3514	*
3515	* parse an attribute
3516	*
3517	* [41] Attribute ::= Name Eq AttValue
3518	*
3519	* [25] Eq ::= S? '=' S?
3520	*
3521	* With namespace:
3522	*
3523	* [NS 11] Attribute ::= QName Eq AttValue
3524	*
3525	* Also the case QName == xmlns:??? is handled independently as a namespace
3526	* definition.
3527	*
3528	* Returns the attribute name, and the value in *value.
3529	*/
3530
3531	static const xmlChar *
3532	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3533	const xmlChar *name;
3534	xmlChar *val = NULL;
3535
3536	*value = NULL;
3537	name = htmlParseHTMLName(ctxt);
3538	if (name == NULL) {
3539	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3540	"error parsing attribute name\n", NULL, NULL);
3541	return(NULL);
3542	}
3543
3544	/*
3545	* read the value
3546	*/
3547	SKIP_BLANKS;
3548	if (CUR == '=') {
3549	NEXT;
3550	SKIP_BLANKS;
3551	val = htmlParseAttValue(ctxt);
3552	}
3553
3554	*value = val;
3555	return(name);
3556	}
3557
3558	/**
3559	* htmlCheckEncodingDirect:
3560	* @ctxt: an HTML parser context
3561	* @attvalue: the attribute value
3562	*
3563	* Checks an attribute value to detect
3564	* the encoding
3565	* If a new encoding is detected the parser is switched to decode
3566	* it and pass UTF8
3567	*/
3568	static void
3569	htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3570
3571	if ((ctxt == NULL) \|\| (encoding == NULL) \|\|
3572	(ctxt->options & HTML_PARSE_IGNORE_ENC))
3573	return;
3574
3575	/* do not change encoding */
3576	if (ctxt->input->encoding != NULL)
3577	return;
3578
3579	if (encoding != NULL) {
3580	xmlCharEncoding enc;
3581	xmlCharEncodingHandlerPtr handler;
3582
3583	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
3584
3585	if (ctxt->input->encoding != NULL)
3586	xmlFree((xmlChar *) ctxt->input->encoding);
3587	ctxt->input->encoding = xmlStrdup(encoding);
3588
3589	enc = xmlParseCharEncoding((const char *) encoding);
3590	/*
3591	* registered set of known encodings
3592	*/
3593	if (enc != XML_CHAR_ENCODING_ERROR) {
3594	if (((enc == XML_CHAR_ENCODING_UTF16LE) \|\|
3595	(enc == XML_CHAR_ENCODING_UTF16BE) \|\|
3596	(enc == XML_CHAR_ENCODING_UCS4LE) \|\|
3597	(enc == XML_CHAR_ENCODING_UCS4BE)) &&
3598	(ctxt->input->buf != NULL) &&
3599	(ctxt->input->buf->encoder == NULL)) {
3600	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3601	"htmlCheckEncoding: wrong encoding meta\n",
3602	NULL, NULL);
3603	} else {
3604	xmlSwitchEncoding(ctxt, enc);
3605	}
3606	ctxt->charset = XML_CHAR_ENCODING_UTF8;
3607	} else {
3608	/*
3609	* fallback for unknown encodings
3610	*/
3611	handler = xmlFindCharEncodingHandler((const char *) encoding);
3612	if (handler != NULL) {
3613	xmlSwitchToEncoding(ctxt, handler);
3614	ctxt->charset = XML_CHAR_ENCODING_UTF8;
3615	} else {
3616	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3617	"htmlCheckEncoding: unknown encoding %s\n",
3618	encoding, NULL);
3619	}
3620	}
3621
3622	if ((ctxt->input->buf != NULL) &&
3623	(ctxt->input->buf->encoder != NULL) &&
3624	(ctxt->input->buf->raw != NULL) &&
3625	(ctxt->input->buf->buffer != NULL)) {
3626	int nbchars;
3627	int processed;
3628
3629	/*
3630	* convert as much as possible to the parser reading buffer.
3631	*/
3632	processed = ctxt->input->cur - ctxt->input->base;
3633	xmlBufShrink(ctxt->input->buf->buffer, processed);
3634	nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3635	if (nbchars < 0) {
3636	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3637	"htmlCheckEncoding: encoder error\n",
3638	NULL, NULL);
3639	}
3640	xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3641	}
3642	}
3643	}
3644
3645	/**
3646	* htmlCheckEncoding:
3647	* @ctxt: an HTML parser context
3648	* @attvalue: the attribute value
3649	*
3650	* Checks an http-equiv attribute from a Meta tag to detect
3651	* the encoding
3652	* If a new encoding is detected the parser is switched to decode
3653	* it and pass UTF8
3654	*/
3655	static void
3656	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3657	const xmlChar *encoding;
3658
3659	if (!attvalue)
3660	return;
3661
3662	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3663	if (encoding != NULL) {
3664	encoding += 7;
3665	}
3666	/*
3667	* skip blank
3668	*/
3669	if (encoding && IS_BLANK_CH(*encoding))
3670	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3671	if (encoding && *encoding == '=') {
3672	encoding ++;
3673	htmlCheckEncodingDirect(ctxt, encoding);
3674	}
3675	}
3676
3677	/**
3678	* htmlCheckMeta:
3679	* @ctxt: an HTML parser context
3680	* @atts: the attributes values
3681	*
3682	* Checks an attributes from a Meta tag
3683	*/
3684	static void
3685	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3686	int i;
3687	const xmlChar att, value;
3688	int http = 0;
3689	const xmlChar *content = NULL;
3690
3691	if ((ctxt == NULL) \|\| (atts == NULL))
3692	return;
3693
3694	i = 0;
3695	att = atts[i++];
3696	while (att != NULL) {
3697	value = atts[i++];
3698	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3699	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3700	http = 1;
3701	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3702	htmlCheckEncodingDirect(ctxt, value);
3703	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3704	content = value;
3705	att = atts[i++];
3706	}
3707	if ((http) && (content != NULL))
3708	htmlCheckEncoding(ctxt, content);
3709
3710	}
3711
3712	/**
3713	* htmlParseStartTag:
3714	* @ctxt: an HTML parser context
3715	*
3716	* parse a start of tag either for rule element or
3717	* EmptyElement. In both case we don't parse the tag closing chars.
3718	*
3719	* [40] STag ::= '<' Name (S Attribute)* S? '>'
3720	*
3721	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3722	*
3723	* With namespace:
3724	*
3725	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3726	*
3727	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3728	*
3729	* Returns 0 in case of success, -1 in case of error and 1 if discarded
3730	*/
3731
3732	static int
3733	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3734	const xmlChar *name;
3735	const xmlChar *attname;
3736	xmlChar *attvalue;
3737	const xmlChar **atts;
3738	int nbatts = 0;
3739	int maxatts;
3740	int meta = 0;
3741	int i;
3742	int discardtag = 0;
3743
3744	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
3745	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3746	"htmlParseStartTag: context error\n", NULL, NULL);
3747	return -1;
3748	}
3749	if (ctxt->instate == XML_PARSER_EOF)
3750	return(-1);
3751	if (CUR != '<') return -1;
3752	NEXT;
3753
3754	atts = ctxt->atts;
3755	maxatts = ctxt->maxatts;
3756
3757	GROW;
3758	name = htmlParseHTMLName(ctxt);
3759	if (name == NULL) {
3760	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3761	"htmlParseStartTag: invalid element name\n",
3762	NULL, NULL);
3763	/* if recover preserve text on classic misconstructs */
3764	if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) \|\| (CUR == '<') \|\|
3765	(CUR == '=') \|\| (CUR == '>') \|\| (((CUR >= '0') && (CUR <= '9'))))) {
3766	htmlParseCharDataInternal(ctxt, '<');
3767	return(-1);
3768	}
3769
3770
3771	/* Dump the bogus tag like browsers do */
3772	while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3773	(ctxt->instate != XML_PARSER_EOF))
3774	NEXT;
3775	return -1;
3776	}
3777	if (xmlStrEqual(name, BAD_CAST"meta"))
3778	meta = 1;
3779
3780	/*
3781	* Check for auto-closure of HTML elements.
3782	*/
3783	htmlAutoClose(ctxt, name);
3784
3785	/*
3786	* Check for implied HTML elements.
3787	*/
3788	htmlCheckImplied(ctxt, name);
3789
3790	/*
3791	* Avoid html at any level > 0, head at any level != 1
3792	* or any attempt to recurse body
3793	*/
3794	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3795	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3796	"htmlParseStartTag: misplaced <html> tag\n",
3797	name, NULL);
3798	discardtag = 1;
3799	ctxt->depth++;
3800	}
3801	if ((ctxt->nameNr != 1) &&
3802	(xmlStrEqual(name, BAD_CAST"head"))) {
3803	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3804	"htmlParseStartTag: misplaced <head> tag\n",
3805	name, NULL);
3806	discardtag = 1;
3807	ctxt->depth++;
3808	}
3809	if (xmlStrEqual(name, BAD_CAST"body")) {
3810	int indx;
3811	for (indx = 0;indx < ctxt->nameNr;indx++) {
3812	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3813	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3814	"htmlParseStartTag: misplaced <body> tag\n",
3815	name, NULL);
3816	discardtag = 1;
3817	ctxt->depth++;
3818	}
3819	}
3820	}
3821
3822	/*
3823	* Now parse the attributes, it ends up with the ending
3824	*
3825	* (S Attribute)* S?
3826	*/
3827	SKIP_BLANKS;
3828	while ((IS_CHAR_CH(CUR)) &&
3829	(CUR != '>') &&
3830	((CUR != '/') \|\| (NXT(1) != '>'))) {
3831	long cons = ctxt->nbChars;
3832
3833	GROW;
3834	attname = htmlParseAttribute(ctxt, &attvalue);
3835	if (attname != NULL) {
3836
3837	/*
3838	* Well formedness requires at most one declaration of an attribute
3839	*/
3840	for (i = 0; i < nbatts;i += 2) {
3841	if (xmlStrEqual(atts[i], attname)) {
3842	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3843	"Attribute %s redefined\n", attname, NULL);
3844	if (attvalue != NULL)
3845	xmlFree(attvalue);
3846	goto failed;
3847	}
3848	}
3849
3850	/*
3851	* Add the pair to atts
3852	*/
3853	if (atts == NULL) {
3854	maxatts = 22; /* allow for 10 attrs by default */
3855	atts = (const xmlChar **)
3856	xmlMalloc(maxatts * sizeof(xmlChar *));
3857	if (atts == NULL) {
3858	htmlErrMemory(ctxt, NULL);
3859	if (attvalue != NULL)
3860	xmlFree(attvalue);
3861	goto failed;
3862	}
3863	ctxt->atts = atts;
3864	ctxt->maxatts = maxatts;
3865	} else if (nbatts + 4 > maxatts) {
3866	const xmlChar **n;
3867
3868	maxatts *= 2;
3869	n = (const xmlChar *) xmlRealloc((void ) atts,
3870	maxatts * sizeof(const xmlChar *));
3871	if (n == NULL) {
3872	htmlErrMemory(ctxt, NULL);
3873	if (attvalue != NULL)
3874	xmlFree(attvalue);
3875	goto failed;
3876	}
3877	atts = n;
3878	ctxt->atts = atts;
3879	ctxt->maxatts = maxatts;
3880	}
3881	atts[nbatts++] = attname;
3882	atts[nbatts++] = attvalue;
3883	atts[nbatts] = NULL;
3884	atts[nbatts + 1] = NULL;
3885	}
3886	else {
3887	if (attvalue != NULL)
3888	xmlFree(attvalue);
3889	/* Dump the bogus attribute string up to the next blank or
3890	* the end of the tag. */
3891	while ((IS_CHAR_CH(CUR)) &&
3892	!(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3893	((CUR != '/') \|\| (NXT(1) != '>')))
3894	NEXT;
3895	}
3896
3897	failed:
3898	SKIP_BLANKS;
3899	if (cons == ctxt->nbChars) {
3900	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3901	"htmlParseStartTag: problem parsing attributes\n",
3902	NULL, NULL);
3903	break;
3904	}
3905	}
3906
3907	/*
3908	* Handle specific association to the META tag
3909	*/
3910	if (meta && (nbatts != 0))
3911	htmlCheckMeta(ctxt, atts);
3912
3913	/*
3914	* SAX: Start of Element !
3915	*/
3916	if (!discardtag) {
3917	htmlnamePush(ctxt, name);
3918	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3919	if (nbatts != 0)
3920	ctxt->sax->startElement(ctxt->userData, name, atts);
3921	else
3922	ctxt->sax->startElement(ctxt->userData, name, NULL);
3923	}
3924	}
3925
3926	if (atts != NULL) {
3927	for (i = 1;i < nbatts;i += 2) {
3928	if (atts[i] != NULL)
3929	xmlFree((xmlChar *) atts[i]);
3930	}
3931	}
3932
3933	return(discardtag);
3934	}
3935
3936	/**
3937	* htmlParseEndTag:
3938	* @ctxt: an HTML parser context
3939	*
3940	* parse an end of tag
3941	*
3942	* [42] ETag ::= '</' Name S? '>'
3943	*
3944	* With namespace
3945	*
3946	* [NS 9] ETag ::= '</' QName S? '>'
3947	*
3948	* Returns 1 if the current level should be closed.
3949	*/
3950
3951	static int
3952	htmlParseEndTag(htmlParserCtxtPtr ctxt)
3953	{
3954	const xmlChar *name;
3955	const xmlChar *oldname;
3956	int i, ret;
3957
3958	if ((CUR != '<') \|\| (NXT(1) != '/')) {
3959	htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3960	"htmlParseEndTag: '</' not found\n", NULL, NULL);
3961	return (0);
3962	}
3963	SKIP(2);
3964
3965	name = htmlParseHTMLName(ctxt);
3966	if (name == NULL)
3967	return (0);
3968	/*
3969	* We should definitely be at the ending "S? '>'" part
3970	*/
3971	SKIP_BLANKS;
3972	if ((!IS_CHAR_CH(CUR)) \|\| (CUR != '>')) {
3973	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3974	"End tag : expected '>'\n", NULL, NULL);
3975	if (ctxt->recovery) {
3976	/*
3977	* We're not at the ending > !!
3978	* Error, unless in recover mode where we search forwards
3979	* until we find a >
3980	*/
3981	while (CUR != '\0' && CUR != '>') NEXT;
3982	NEXT;
3983	}
3984	} else
3985	NEXT;
3986
3987	/*
3988	* if we ignored misplaced tags in htmlParseStartTag don't pop them
3989	* out now.
3990	*/
3991	if ((ctxt->depth > 0) &&
3992	(xmlStrEqual(name, BAD_CAST "html") \|\|
3993	xmlStrEqual(name, BAD_CAST "body") \|\|
3994	xmlStrEqual(name, BAD_CAST "head"))) {
3995	ctxt->depth--;
3996	return (0);
3997	}
3998
3999	/*
4000	* If the name read is not one of the element in the parsing stack
4001	* then return, it's just an error.
4002	*/
4003	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4004	if (xmlStrEqual(name, ctxt->nameTab[i]))
4005	break;
4006	}
4007	if (i < 0) {
4008	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4009	"Unexpected end tag : %s\n", name, NULL);
4010	return (0);
4011	}
4012
4013
4014	/*
4015	* Check for auto-closure of HTML elements.
4016	*/
4017
4018	htmlAutoCloseOnClose(ctxt, name);
4019
4020	/*
4021	* Well formedness constraints, opening and closing must match.
4022	* With the exception that the autoclose may have popped stuff out
4023	* of the stack.
4024	*/
4025	if (!xmlStrEqual(name, ctxt->name)) {
4026	if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4027	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4028	"Opening and ending tag mismatch: %s and %s\n",
4029	name, ctxt->name);
4030	}
4031	}
4032
4033	/*
4034	* SAX: End of Tag
4035	*/
4036	oldname = ctxt->name;
4037	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4038	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4039	ctxt->sax->endElement(ctxt->userData, name);
4040	htmlNodeInfoPop(ctxt);
4041	htmlnamePop(ctxt);
4042	ret = 1;
4043	} else {
4044	ret = 0;
4045	}
4046
4047	return (ret);
4048	}
4049
4050
4051	/**
4052	* htmlParseReference:
4053	* @ctxt: an HTML parser context
4054	*
4055	* parse and handle entity references in content,
4056	* this will end-up in a call to character() since this is either a
4057	* CharRef, or a predefined entity.
4058	*/
4059	static void
4060	htmlParseReference(htmlParserCtxtPtr ctxt) {
4061	const htmlEntityDesc * ent;
4062	xmlChar out[6];
4063	const xmlChar *name;
4064	if (CUR != '&') return;
4065
4066	if (NXT(1) == '#') {
4067	unsigned int c;
4068	int bits, i = 0;
4069
4070	c = htmlParseCharRef(ctxt);
4071	if (c == 0)
4072	return;
4073
4074	if (c < 0x80) { out[i++]= c; bits= -6; }
4075	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
4076	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
4077	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
4078
4079	for ( ; bits >= 0; bits-= 6) {
4080	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
4081	}
4082	out[i] = 0;
4083
4084	htmlCheckParagraph(ctxt);
4085	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4086	ctxt->sax->characters(ctxt->userData, out, i);
4087	} else {
4088	ent = htmlParseEntityRef(ctxt, &name);
4089	if (name == NULL) {
4090	htmlCheckParagraph(ctxt);
4091	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4092	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4093	return;
4094	}
4095	if ((ent == NULL) \|\| !(ent->value > 0)) {
4096	htmlCheckParagraph(ctxt);
4097	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4098	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4099	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4100	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4101	}
4102	} else {
4103	unsigned int c;
4104	int bits, i = 0;
4105
4106	c = ent->value;
4107	if (c < 0x80)
4108	{ out[i++]= c; bits= -6; }
4109	else if (c < 0x800)
4110	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
4111	else if (c < 0x10000)
4112	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
4113	else
4114	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
4115
4116	for ( ; bits >= 0; bits-= 6) {
4117	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
4118	}
4119	out[i] = 0;
4120
4121	htmlCheckParagraph(ctxt);
4122	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4123	ctxt->sax->characters(ctxt->userData, out, i);
4124	}
4125	}
4126	}
4127
4128	/**
4129	* htmlParseContent:
4130	* @ctxt: an HTML parser context
4131	*
4132	* Parse a content: comment, sub-element, reference or text.
4133	* Kept for compatibility with old code
4134	*/
4135
4136	static void
4137	htmlParseContent(htmlParserCtxtPtr ctxt) {
4138	xmlChar *currentNode;
4139	int depth;
4140	const xmlChar *name;
4141
4142	currentNode = xmlStrdup(ctxt->name);
4143	depth = ctxt->nameNr;
4144	while (1) {
4145	long cons = ctxt->nbChars;
4146
4147	GROW;
4148
4149	if (ctxt->instate == XML_PARSER_EOF)
4150	break;
4151
4152	/*
4153	* Our tag or one of it's parent or children is ending.
4154	*/
4155	if ((CUR == '<') && (NXT(1) == '/')) {
4156	if (htmlParseEndTag(ctxt) &&
4157	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
4158	if (currentNode != NULL)
4159	xmlFree(currentNode);
4160	return;
4161	}
4162	continue; /* while */
4163	}
4164
4165	else if ((CUR == '<') &&
4166	((IS_ASCII_LETTER(NXT(1))) \|\|
4167	(NXT(1) == '_') \|\| (NXT(1) == ':'))) {
4168	name = htmlParseHTMLName_nonInvasive(ctxt);
4169	if (name == NULL) {
4170	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4171	"htmlParseStartTag: invalid element name\n",
4172	NULL, NULL);
4173	/* Dump the bogus tag like browsers do */
4174	while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4175	NEXT;
4176
4177	if (currentNode != NULL)
4178	xmlFree(currentNode);
4179	return;
4180	}
4181
4182	if (ctxt->name != NULL) {
4183	if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4184	htmlAutoClose(ctxt, name);
4185	continue;
4186	}
4187	}
4188	}
4189
4190	/*
4191	* Has this node been popped out during parsing of
4192	* the next element
4193	*/
4194	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4195	(!xmlStrEqual(currentNode, ctxt->name)))
4196	{
4197	if (currentNode != NULL) xmlFree(currentNode);
4198	return;
4199	}
4200
4201	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
4202	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
4203	/*
4204	* Handle SCRIPT/STYLE separately
4205	*/
4206	htmlParseScript(ctxt);
4207	} else {
4208	/*
4209	* Sometimes DOCTYPE arrives in the middle of the document
4210	*/
4211	if ((CUR == '<') && (NXT(1) == '!') &&
4212	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4213	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4214	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4215	(UPP(8) == 'E')) {
4216	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4217	"Misplaced DOCTYPE declaration\n",
4218	BAD_CAST "DOCTYPE" , NULL);
4219	htmlParseDocTypeDecl(ctxt);
4220	}
4221
4222	/*
4223	* First case : a comment
4224	*/
4225	if ((CUR == '<') && (NXT(1) == '!') &&
4226	(NXT(2) == '-') && (NXT(3) == '-')) {
4227	htmlParseComment(ctxt);
4228	}
4229
4230	/*
4231	* Second case : a Processing Instruction.
4232	*/
4233	else if ((CUR == '<') && (NXT(1) == '?')) {
4234	htmlParsePI(ctxt);
4235	}
4236
4237	/*
4238	* Third case : a sub-element.
4239	*/
4240	else if (CUR == '<') {
4241	htmlParseElement(ctxt);
4242	}
4243
4244	/*
4245	* Fourth case : a reference. If if has not been resolved,
4246	* parsing returns it's Name, create the node
4247	*/
4248	else if (CUR == '&') {
4249	htmlParseReference(ctxt);
4250	}
4251
4252	/*
4253	* Fifth case : end of the resource
4254	*/
4255	else if (CUR == 0) {
4256	htmlAutoCloseOnEnd(ctxt);
4257	break;
4258	}
4259
4260	/*
4261	* Last case, text. Note that References are handled directly.
4262	*/
4263	else {
4264	htmlParseCharData(ctxt);
4265	}
4266
4267	if (cons == ctxt->nbChars) {
4268	if (ctxt->node != NULL) {
4269	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4270	"detected an error in element content\n",
4271	NULL, NULL);
4272	}
4273	break;
4274	}
4275	}
4276	GROW;
4277	}
4278	if (currentNode != NULL) xmlFree(currentNode);
4279	}
4280
4281	/**
4282	* htmlParseElement:
4283	* @ctxt: an HTML parser context
4284	*
4285	* parse an HTML element, this is highly recursive
4286	* this is kept for compatibility with previous code versions
4287	*
4288	* [39] element ::= EmptyElemTag \| STag content ETag
4289	*
4290	* [41] Attribute ::= Name Eq AttValue
4291	*/
4292
4293	void
4294	htmlParseElement(htmlParserCtxtPtr ctxt) {
4295	const xmlChar *name;
4296	xmlChar *currentNode = NULL;
4297	const htmlElemDesc * info;
4298	htmlParserNodeInfo node_info;
4299	int failed;
4300	int depth;
4301	const xmlChar *oldptr;
4302
4303	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
4304	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4305	"htmlParseElement: context error\n", NULL, NULL);
4306	return;
4307	}
4308
4309	if (ctxt->instate == XML_PARSER_EOF)
4310	return;
4311
4312	/* Capture start position */
4313	if (ctxt->record_info) {
4314	node_info.begin_pos = ctxt->input->consumed +
4315	(CUR_PTR - ctxt->input->base);
4316	node_info.begin_line = ctxt->input->line;
4317	}
4318
4319	failed = htmlParseStartTag(ctxt);
4320	name = ctxt->name;
4321	if ((failed == -1) \|\| (name == NULL)) {
4322	if (CUR == '>')
4323	NEXT;
4324	return;
4325	}
4326
4327	/*
4328	* Lookup the info for that element.
4329	*/
4330	info = htmlTagLookup(name);
4331	if (info == NULL) {
4332	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4333	"Tag %s invalid\n", name, NULL);
4334	}
4335
4336	/*
4337	* Check for an Empty Element labeled the XML/SGML way
4338	*/
4339	if ((CUR == '/') && (NXT(1) == '>')) {
4340	SKIP(2);
4341	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4342	ctxt->sax->endElement(ctxt->userData, name);
4343	htmlnamePop(ctxt);
4344	return;
4345	}
4346
4347	if (CUR == '>') {
4348	NEXT;
4349	} else {
4350	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4351	"Couldn't find end of Start Tag %s\n", name, NULL);
4352
4353	/*
4354	* end of parsing of this node.
4355	*/
4356	if (xmlStrEqual(name, ctxt->name)) {
4357	nodePop(ctxt);
4358	htmlnamePop(ctxt);
4359	}
4360
4361	/*
4362	* Capture end position and add node
4363	*/
4364	if (ctxt->record_info) {
4365	node_info.end_pos = ctxt->input->consumed +
4366	(CUR_PTR - ctxt->input->base);
4367	node_info.end_line = ctxt->input->line;
4368	node_info.node = ctxt->node;
4369	xmlParserAddNodeInfo(ctxt, &node_info);
4370	}
4371	return;
4372	}
4373
4374	/*
4375	* Check for an Empty Element from DTD definition
4376	*/
4377	if ((info != NULL) && (info->empty)) {
4378	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4379	ctxt->sax->endElement(ctxt->userData, name);
4380	htmlnamePop(ctxt);
4381	return;
4382	}
4383
4384	/*
4385	* Parse the content of the element:
4386	*/
4387	currentNode = xmlStrdup(ctxt->name);
4388	depth = ctxt->nameNr;
4389	while (IS_CHAR_CH(CUR)) {
4390	oldptr = ctxt->input->cur;
4391	htmlParseContent(ctxt);
4392	if (oldptr==ctxt->input->cur) break;
4393	if (ctxt->nameNr < depth) break;
4394	}
4395
4396	/*
4397	* Capture end position and add node
4398	*/
4399	if ( currentNode != NULL && ctxt->record_info ) {
4400	node_info.end_pos = ctxt->input->consumed +
4401	(CUR_PTR - ctxt->input->base);
4402	node_info.end_line = ctxt->input->line;
4403	node_info.node = ctxt->node;
4404	xmlParserAddNodeInfo(ctxt, &node_info);
4405	}
4406	if (!IS_CHAR_CH(CUR)) {
4407	htmlAutoCloseOnEnd(ctxt);
4408	}
4409
4410	if (currentNode != NULL)
4411	xmlFree(currentNode);
4412	}
4413
4414	static void
4415	htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4416	/*
4417	* Capture end position and add node
4418	*/
4419	if ( ctxt->node != NULL && ctxt->record_info ) {
4420	ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4421	(CUR_PTR - ctxt->input->base);
4422	ctxt->nodeInfo->end_line = ctxt->input->line;
4423	ctxt->nodeInfo->node = ctxt->node;
4424	xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4425	htmlNodeInfoPop(ctxt);
4426	}
4427	if (!IS_CHAR_CH(CUR)) {
4428	htmlAutoCloseOnEnd(ctxt);
4429	}
4430	}
4431
4432	/**
4433	* htmlParseElementInternal:
4434	* @ctxt: an HTML parser context
4435	*
4436	* parse an HTML element, new version, non recursive
4437	*
4438	* [39] element ::= EmptyElemTag \| STag content ETag
4439	*
4440	* [41] Attribute ::= Name Eq AttValue
4441	*/
4442
4443	static void
4444	htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4445	const xmlChar *name;
4446	const htmlElemDesc * info;
4447	htmlParserNodeInfo node_info = { 0, };
4448	int failed;
4449
4450	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
4451	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4452	"htmlParseElementInternal: context error\n", NULL, NULL);
4453	return;
4454	}
4455
4456	if (ctxt->instate == XML_PARSER_EOF)
4457	return;
4458
4459	/* Capture start position */
4460	if (ctxt->record_info) {
4461	node_info.begin_pos = ctxt->input->consumed +
4462	(CUR_PTR - ctxt->input->base);
4463	node_info.begin_line = ctxt->input->line;
4464	}
4465
4466	failed = htmlParseStartTag(ctxt);
4467	name = ctxt->name;
4468	if ((failed == -1) \|\| (name == NULL)) {
4469	if (CUR == '>')
4470	NEXT;
4471	return;
4472	}
4473
4474	/*
4475	* Lookup the info for that element.
4476	*/
4477	info = htmlTagLookup(name);
4478	if (info == NULL) {
4479	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4480	"Tag %s invalid\n", name, NULL);
4481	}
4482
4483	/*
4484	* Check for an Empty Element labeled the XML/SGML way
4485	*/
4486	if ((CUR == '/') && (NXT(1) == '>')) {
4487	SKIP(2);
4488	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4489	ctxt->sax->endElement(ctxt->userData, name);
4490	htmlnamePop(ctxt);
4491	return;
4492	}
4493
4494	if (CUR == '>') {
4495	NEXT;
4496	} else {
4497	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4498	"Couldn't find end of Start Tag %s\n", name, NULL);
4499
4500	/*
4501	* end of parsing of this node.
4502	*/
4503	if (xmlStrEqual(name, ctxt->name)) {
4504	nodePop(ctxt);
4505	htmlnamePop(ctxt);
4506	}
4507
4508	if (ctxt->record_info)
4509	htmlNodeInfoPush(ctxt, &node_info);
4510	htmlParserFinishElementParsing(ctxt);
4511	return;
4512	}
4513
4514	/*
4515	* Check for an Empty Element from DTD definition
4516	*/
4517	if ((info != NULL) && (info->empty)) {
4518	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4519	ctxt->sax->endElement(ctxt->userData, name);
4520	htmlnamePop(ctxt);
4521	return;
4522	}
4523
4524	if (ctxt->record_info)
4525	htmlNodeInfoPush(ctxt, &node_info);
4526	}
4527
4528	/**
4529	* htmlParseContentInternal:
4530	* @ctxt: an HTML parser context
4531	*
4532	* Parse a content: comment, sub-element, reference or text.
4533	* New version for non recursive htmlParseElementInternal
4534	*/
4535
4536	static void
4537	htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4538	xmlChar *currentNode;
4539	int depth;
4540	const xmlChar *name;
4541
4542	currentNode = xmlStrdup(ctxt->name);
4543	depth = ctxt->nameNr;
4544	while (1) {
4545	long cons = ctxt->nbChars;
4546
4547	GROW;
4548
4549	if (ctxt->instate == XML_PARSER_EOF)
4550	break;
4551
4552	/*
4553	* Our tag or one of it's parent or children is ending.
4554	*/
4555	if ((CUR == '<') && (NXT(1) == '/')) {
4556	if (htmlParseEndTag(ctxt) &&
4557	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
4558	if (currentNode != NULL)
4559	xmlFree(currentNode);
4560
4561	currentNode = xmlStrdup(ctxt->name);
4562	depth = ctxt->nameNr;
4563	}
4564	continue; /* while */
4565	}
4566
4567	else if ((CUR == '<') &&
4568	((IS_ASCII_LETTER(NXT(1))) \|\|
4569	(NXT(1) == '_') \|\| (NXT(1) == ':'))) {
4570	name = htmlParseHTMLName_nonInvasive(ctxt);
4571	if (name == NULL) {
4572	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4573	"htmlParseStartTag: invalid element name\n",
4574	NULL, NULL);
4575	/* Dump the bogus tag like browsers do */
4576	while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4577	NEXT;
4578
4579	htmlParserFinishElementParsing(ctxt);
4580	if (currentNode != NULL)
4581	xmlFree(currentNode);
4582
4583	currentNode = xmlStrdup(ctxt->name);
4584	depth = ctxt->nameNr;
4585	continue;
4586	}
4587
4588	if (ctxt->name != NULL) {
4589	if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4590	htmlAutoClose(ctxt, name);
4591	continue;
4592	}
4593	}
4594	}
4595
4596	/*
4597	* Has this node been popped out during parsing of
4598	* the next element
4599	*/
4600	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4601	(!xmlStrEqual(currentNode, ctxt->name)))
4602	{
4603	htmlParserFinishElementParsing(ctxt);
4604	if (currentNode != NULL) xmlFree(currentNode);
4605
4606	currentNode = xmlStrdup(ctxt->name);
4607	depth = ctxt->nameNr;
4608	continue;
4609	}
4610
4611	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
4612	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
4613	/*
4614	* Handle SCRIPT/STYLE separately
4615	*/
4616	htmlParseScript(ctxt);
4617	} else {
4618	/*
4619	* Sometimes DOCTYPE arrives in the middle of the document
4620	*/
4621	if ((CUR == '<') && (NXT(1) == '!') &&
4622	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4623	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4624	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4625	(UPP(8) == 'E')) {
4626	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4627	"Misplaced DOCTYPE declaration\n",
4628	BAD_CAST "DOCTYPE" , NULL);
4629	htmlParseDocTypeDecl(ctxt);
4630	}
4631
4632	/*
4633	* First case : a comment
4634	*/
4635	if ((CUR == '<') && (NXT(1) == '!') &&
4636	(NXT(2) == '-') && (NXT(3) == '-')) {
4637	htmlParseComment(ctxt);
4638	}
4639
4640	/*
4641	* Second case : a Processing Instruction.
4642	*/
4643	else if ((CUR == '<') && (NXT(1) == '?')) {
4644	htmlParsePI(ctxt);
4645	}
4646
4647	/*
4648	* Third case : a sub-element.
4649	*/
4650	else if (CUR == '<') {
4651	htmlParseElementInternal(ctxt);
4652	if (currentNode != NULL) xmlFree(currentNode);
4653
4654	currentNode = xmlStrdup(ctxt->name);
4655	depth = ctxt->nameNr;
4656	}
4657
4658	/*
4659	* Fourth case : a reference. If if has not been resolved,
4660	* parsing returns it's Name, create the node
4661	*/
4662	else if (CUR == '&') {
4663	htmlParseReference(ctxt);
4664	}
4665
4666	/*
4667	* Fifth case : end of the resource
4668	*/
4669	else if (CUR == 0) {
4670	htmlAutoCloseOnEnd(ctxt);
4671	break;
4672	}
4673
4674	/*
4675	* Last case, text. Note that References are handled directly.
4676	*/
4677	else {
4678	htmlParseCharData(ctxt);
4679	}
4680
4681	if (cons == ctxt->nbChars) {
4682	if (ctxt->node != NULL) {
4683	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4684	"detected an error in element content\n",
4685	NULL, NULL);
4686	}
4687	break;
4688	}
4689	}
4690	GROW;
4691	}
4692	if (currentNode != NULL) xmlFree(currentNode);
4693	}
4694
4695	/**
4696	* htmlParseContent:
4697	* @ctxt: an HTML parser context
4698	*
4699	* Parse a content: comment, sub-element, reference or text.
4700	* This is the entry point when called from parser.c
4701	*/
4702
4703	void
4704	__htmlParseContent(void *ctxt) {
4705	if (ctxt != NULL)
4706	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4707	}
4708
4709	/**
4710	* htmlParseDocument:
4711	* @ctxt: an HTML parser context
4712	*
4713	* parse an HTML document (and build a tree if using the standard SAX
4714	* interface).
4715	*
4716	* Returns 0, -1 in case of error. the parser context is augmented
4717	* as a result of the parsing.
4718	*/
4719
4720	int
4721	htmlParseDocument(htmlParserCtxtPtr ctxt) {
4722	xmlChar start[4];
4723	xmlCharEncoding enc;
4724	xmlDtdPtr dtd;
4725
4726	xmlInitParser();
4727
4728	htmlDefaultSAXHandlerInit();
4729
4730	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
4731	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4732	"htmlParseDocument: context error\n", NULL, NULL);
4733	return(XML_ERR_INTERNAL_ERROR);
4734	}
4735	ctxt->html = 1;
4736	ctxt->linenumbers = 1;
4737	GROW;
4738	/*
4739	* SAX: beginning of the document processing.
4740	*/
4741	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4742	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4743
4744	if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4745	((ctxt->input->end - ctxt->input->cur) >= 4)) {
4746	/*
4747	* Get the 4 first bytes and decode the charset
4748	* if enc != XML_CHAR_ENCODING_NONE
4749	* plug some encoding conversion routines.
4750	*/
4751	start[0] = RAW;
4752	start[1] = NXT(1);
4753	start[2] = NXT(2);
4754	start[3] = NXT(3);
4755	enc = xmlDetectCharEncoding(&start[0], 4);
4756	if (enc != XML_CHAR_ENCODING_NONE) {
4757	xmlSwitchEncoding(ctxt, enc);
4758	}
4759	}
4760
4761	/*
4762	* Wipe out everything which is before the first '<'
4763	*/
4764	SKIP_BLANKS;
4765	if (CUR == 0) {
4766	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4767	"Document is empty\n", NULL, NULL);
4768	}
4769
4770	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4771	ctxt->sax->startDocument(ctxt->userData);
4772
4773
4774	/*
4775	* Parse possible comments and PIs before any content
4776	*/
4777	while (((CUR == '<') && (NXT(1) == '!') &&
4778	(NXT(2) == '-') && (NXT(3) == '-')) \|\|
4779	((CUR == '<') && (NXT(1) == '?'))) {
4780	htmlParseComment(ctxt);
4781	htmlParsePI(ctxt);
4782	SKIP_BLANKS;
4783	}
4784
4785
4786	/*
4787	* Then possibly doc type declaration(s) and more Misc
4788	* (doctypedecl Misc*)?
4789	*/
4790	if ((CUR == '<') && (NXT(1) == '!') &&
4791	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4792	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4793	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4794	(UPP(8) == 'E')) {
4795	htmlParseDocTypeDecl(ctxt);
4796	}
4797	SKIP_BLANKS;
4798
4799	/*
4800	* Parse possible comments and PIs before any content
4801	*/
4802	while (((CUR == '<') && (NXT(1) == '!') &&
4803	(NXT(2) == '-') && (NXT(3) == '-')) \|\|
4804	((CUR == '<') && (NXT(1) == '?'))) {
4805	htmlParseComment(ctxt);
4806	htmlParsePI(ctxt);
4807	SKIP_BLANKS;
4808	}
4809
4810	/*
4811	* Time to start parsing the tree itself
4812	*/
4813	htmlParseContentInternal(ctxt);
4814
4815	/*
4816	* autoclose
4817	*/
4818	if (CUR == 0)
4819	htmlAutoCloseOnEnd(ctxt);
4820
4821
4822	/*
4823	* SAX: end of the document processing.
4824	*/
4825	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4826	ctxt->sax->endDocument(ctxt->userData);
4827
4828	if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4829	dtd = xmlGetIntSubset(ctxt->myDoc);
4830	if (dtd == NULL)
4831	ctxt->myDoc->intSubset =
4832	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4833	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4834	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4835	}
4836	if (! ctxt->wellFormed) return(-1);
4837	return(0);
4838	}
4839
4840
4841	/************************************************************************
4842	* *
4843	* Parser contexts handling *
4844	* *
4845	************************************************************************/
4846
4847	/**
4848	* htmlInitParserCtxt:
4849	* @ctxt: an HTML parser context
4850	*
4851	* Initialize a parser context
4852	*
4853	* Returns 0 in case of success and -1 in case of error
4854	*/
4855
4856	static int
4857	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4858	{
4859	htmlSAXHandler *sax;
4860
4861	if (ctxt == NULL) return(-1);
4862	memset(ctxt, 0, sizeof(htmlParserCtxt));
4863
4864	ctxt->dict = xmlDictCreate();
4865	if (ctxt->dict == NULL) {
4866	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4867	return(-1);
4868	}
4869	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4870	if (sax == NULL) {
4871	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4872	return(-1);
4873	}
4874	else
4875	memset(sax, 0, sizeof(htmlSAXHandler));
4876
4877	/* Allocate the Input stack */
4878	ctxt->inputTab = (htmlParserInputPtr *)
4879	xmlMalloc(5 * sizeof(htmlParserInputPtr));
4880	if (ctxt->inputTab == NULL) {
4881	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4882	ctxt->inputNr = 0;
4883	ctxt->inputMax = 0;
4884	ctxt->input = NULL;
4885	return(-1);
4886	}
4887	ctxt->inputNr = 0;
4888	ctxt->inputMax = 5;
4889	ctxt->input = NULL;
4890	ctxt->version = NULL;
4891	ctxt->encoding = NULL;
4892	ctxt->standalone = -1;
4893	ctxt->instate = XML_PARSER_START;
4894
4895	/* Allocate the Node stack */
4896	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
4897	if (ctxt->nodeTab == NULL) {
4898	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4899	ctxt->nodeNr = 0;
4900	ctxt->nodeMax = 0;
4901	ctxt->node = NULL;
4902	ctxt->inputNr = 0;
4903	ctxt->inputMax = 0;
4904	ctxt->input = NULL;
4905	return(-1);
4906	}
4907	ctxt->nodeNr = 0;
4908	ctxt->nodeMax = 10;
4909	ctxt->node = NULL;
4910
4911	/* Allocate the Name stack */
4912	ctxt->nameTab = (const xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
4913	if (ctxt->nameTab == NULL) {
4914	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4915	ctxt->nameNr = 0;
4916	ctxt->nameMax = 0;
4917	ctxt->name = NULL;
4918	ctxt->nodeNr = 0;
4919	ctxt->nodeMax = 0;
4920	ctxt->node = NULL;
4921	ctxt->inputNr = 0;
4922	ctxt->inputMax = 0;
4923	ctxt->input = NULL;
4924	return(-1);
4925	}
4926	ctxt->nameNr = 0;
4927	ctxt->nameMax = 10;
4928	ctxt->name = NULL;
4929
4930	ctxt->nodeInfoTab = NULL;
4931	ctxt->nodeInfoNr = 0;
4932	ctxt->nodeInfoMax = 0;
4933
4934	if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4935	else {
4936	ctxt->sax = sax;
4937	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4938	}
4939	ctxt->userData = ctxt;
4940	ctxt->myDoc = NULL;
4941	ctxt->wellFormed = 1;
4942	ctxt->replaceEntities = 0;
4943	ctxt->linenumbers = xmlLineNumbersDefaultValue;
4944	ctxt->html = 1;
4945	ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4946	ctxt->vctxt.userData = ctxt;
4947	ctxt->vctxt.error = xmlParserValidityError;
4948	ctxt->vctxt.warning = xmlParserValidityWarning;
4949	ctxt->record_info = 0;
4950	ctxt->validate = 0;
4951	ctxt->nbChars = 0;
4952	ctxt->checkIndex = 0;
4953	ctxt->catalogs = NULL;
4954	xmlInitNodeInfoSeq(&ctxt->node_seq);
4955	return(0);
4956	}
4957
4958	/**
4959	* htmlFreeParserCtxt:
4960	* @ctxt: an HTML parser context
4961	*
4962	* Free all the memory used by a parser context. However the parsed
4963	* document in ctxt->myDoc is not freed.
4964	*/
4965
4966	void
4967	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4968	{
4969	xmlFreeParserCtxt(ctxt);
4970	}
4971
4972	/**
4973	* htmlNewParserCtxt:
4974	*
4975	* Allocate and initialize a new parser context.
4976	*
4977	* Returns the htmlParserCtxtPtr or NULL in case of allocation error
4978	*/
4979
4980	htmlParserCtxtPtr
4981	htmlNewParserCtxt(void)
4982	{
4983	xmlParserCtxtPtr ctxt;
4984
4985	ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4986	if (ctxt == NULL) {
4987	htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4988	return(NULL);
4989	}
4990	memset(ctxt, 0, sizeof(xmlParserCtxt));
4991	if (htmlInitParserCtxt(ctxt) < 0) {
4992	htmlFreeParserCtxt(ctxt);
4993	return(NULL);
4994	}
4995	return(ctxt);
4996	}
4997
4998	/**
4999	* htmlCreateMemoryParserCtxt:
5000	* @buffer: a pointer to a char array
5001	* @size: the size of the array
5002	*
5003	* Create a parser context for an HTML in-memory document.
5004	*
5005	* Returns the new parser context or NULL
5006	*/
5007	htmlParserCtxtPtr
5008	htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5009	xmlParserCtxtPtr ctxt;
5010	xmlParserInputPtr input;
5011	xmlParserInputBufferPtr buf;
5012
5013	if (buffer == NULL)
5014	return(NULL);
5015	if (size <= 0)
5016	return(NULL);
5017
5018	ctxt = htmlNewParserCtxt();
5019	if (ctxt == NULL)
5020	return(NULL);
5021
5022	buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5023	if (buf == NULL) return(NULL);
5024
5025	input = xmlNewInputStream(ctxt);
5026	if (input == NULL) {
5027	xmlFreeParserCtxt(ctxt);
5028	return(NULL);
5029	}
5030
5031	input->filename = NULL;
5032	input->buf = buf;
5033	xmlBufResetInput(buf->buffer, input);
5034
5035	inputPush(ctxt, input);
5036	return(ctxt);
5037	}
5038
5039	/**
5040	* htmlCreateDocParserCtxt:
5041	* @cur: a pointer to an array of xmlChar
5042	* @encoding: a free form C string describing the HTML document encoding, or NULL
5043	*
5044	* Create a parser context for an HTML document.
5045	*
5046	* TODO: check the need to add encoding handling there
5047	*
5048	* Returns the new parser context or NULL
5049	*/
5050	static htmlParserCtxtPtr
5051	htmlCreateDocParserCtxt(const xmlChar cur, const char encoding) {
5052	int len;
5053	htmlParserCtxtPtr ctxt;
5054
5055	if (cur == NULL)
5056	return(NULL);
5057	len = xmlStrlen(cur);
5058	ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5059	if (ctxt == NULL)
5060	return(NULL);
5061
5062	if (encoding != NULL) {
5063	xmlCharEncoding enc;
5064	xmlCharEncodingHandlerPtr handler;
5065
5066	if (ctxt->input->encoding != NULL)
5067	xmlFree((xmlChar *) ctxt->input->encoding);
5068	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5069
5070	enc = xmlParseCharEncoding(encoding);
5071	/*
5072	* registered set of known encodings
5073	*/
5074	if (enc != XML_CHAR_ENCODING_ERROR) {
5075	xmlSwitchEncoding(ctxt, enc);
5076	if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5077	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5078	"Unsupported encoding %s\n",
5079	(const xmlChar *) encoding, NULL);
5080	}
5081	} else {
5082	/*
5083	* fallback for unknown encodings
5084	*/
5085	handler = xmlFindCharEncodingHandler((const char *) encoding);
5086	if (handler != NULL) {
5087	xmlSwitchToEncoding(ctxt, handler);
5088	} else {
5089	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5090	"Unsupported encoding %s\n",
5091	(const xmlChar *) encoding, NULL);
5092	}
5093	}
5094	}
5095	return(ctxt);
5096	}
5097
5098	#ifdef LIBXML_PUSH_ENABLED
5099	/************************************************************************
5100	* *
5101	* Progressive parsing interfaces *
5102	* *
5103	************************************************************************/
5104
5105	/**
5106	* htmlParseLookupSequence:
5107	* @ctxt: an HTML parser context
5108	* @first: the first char to lookup
5109	* @next: the next char to lookup or zero
5110	* @third: the next char to lookup or zero
5111	* @comment: flag to force checking inside comments
5112	*
5113	* Try to find if a sequence (first, next, third) or just (first next) or
5114	* (first) is available in the input stream.
5115	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
5116	* to avoid rescanning sequences of bytes, it DOES change the state of the
5117	* parser, do not use liberally.
5118	* This is basically similar to xmlParseLookupSequence()
5119	*
5120	* Returns the index to the current parsing point if the full sequence
5121	* is available, -1 otherwise.
5122	*/
5123	static int
5124	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5125	xmlChar next, xmlChar third, int iscomment,
5126	int ignoreattrval)
5127	{
5128	int base, len;
5129	htmlParserInputPtr in;
5130	const xmlChar *buf;
5131	int incomment = 0;
5132	int invalue = 0;
5133	char valdellim = 0x0;
5134
5135	in = ctxt->input;
5136	if (in == NULL)
5137	return (-1);
5138
5139	base = in->cur - in->base;
5140	if (base < 0)
5141	return (-1);
5142
5143	if (ctxt->checkIndex > base)
5144	base = ctxt->checkIndex;
5145
5146	if (in->buf == NULL) {
5147	buf = in->base;
5148	len = in->length;
5149	} else {
5150	buf = xmlBufContent(in->buf->buffer);
5151	len = xmlBufUse(in->buf->buffer);
5152	}
5153
5154	/* take into account the sequence length */
5155	if (third)
5156	len -= 2;
5157	else if (next)
5158	len--;
5159	for (; base < len; base++) {
5160	if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5161	if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5162	(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5163	incomment = 1;
5164	/* do not increment past <! - some people use <!--> */
5165	base += 2;
5166	}
5167	}
5168	if (ignoreattrval) {
5169	if (buf[base] == '"' \|\| buf[base] == '\'') {
5170	if (invalue) {
5171	if (buf[base] == valdellim) {
5172	invalue = 0;
5173	continue;
5174	}
5175	} else {
5176	valdellim = buf[base];
5177	invalue = 1;
5178	continue;
5179	}
5180	} else if (invalue) {
5181	continue;
5182	}
5183	}
5184	if (incomment) {
5185	if (base + 3 > len)
5186	return (-1);
5187	if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5188	(buf[base + 2] == '>')) {
5189	incomment = 0;
5190	base += 2;
5191	}
5192	continue;
5193	}
5194	if (buf[base] == first) {
5195	if (third != 0) {
5196	if ((buf[base + 1] != next) \|\| (buf[base + 2] != third))
5197	continue;
5198	} else if (next != 0) {
5199	if (buf[base + 1] != next)
5200	continue;
5201	}
5202	ctxt->checkIndex = 0;
5203	#ifdef DEBUG_PUSH
5204	if (next == 0)
5205	xmlGenericError(xmlGenericErrorContext,
5206	"HPP: lookup '%c' found at %d\n",
5207	first, base);
5208	else if (third == 0)
5209	xmlGenericError(xmlGenericErrorContext,
5210	"HPP: lookup '%c%c' found at %d\n",
5211	first, next, base);
5212	else
5213	xmlGenericError(xmlGenericErrorContext,
5214	"HPP: lookup '%c%c%c' found at %d\n",
5215	first, next, third, base);
5216	#endif
5217	return (base - (in->cur - in->base));
5218	}
5219	}
5220	if ((!incomment) && (!invalue))
5221	ctxt->checkIndex = base;
5222	#ifdef DEBUG_PUSH
5223	if (next == 0)
5224	xmlGenericError(xmlGenericErrorContext,
5225	"HPP: lookup '%c' failed\n", first);
5226	else if (third == 0)
5227	xmlGenericError(xmlGenericErrorContext,
5228	"HPP: lookup '%c%c' failed\n", first, next);
5229	else
5230	xmlGenericError(xmlGenericErrorContext,
5231	"HPP: lookup '%c%c%c' failed\n", first, next,
5232	third);
5233	#endif
5234	return (-1);
5235	}
5236
5237	/**
5238	* htmlParseLookupChars:
5239	* @ctxt: an HTML parser context
5240	* @stop: Array of chars, which stop the lookup.
5241	* @stopLen: Length of stop-Array
5242	*
5243	* Try to find if any char of the stop-Array is available in the input
5244	* stream.
5245	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
5246	* to avoid rescanning sequences of bytes, it DOES change the state of the
5247	* parser, do not use liberally.
5248	*
5249	* Returns the index to the current parsing point if a stopChar
5250	* is available, -1 otherwise.
5251	*/
5252	static int
5253	htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5254	int stopLen)
5255	{
5256	int base, len;
5257	htmlParserInputPtr in;
5258	const xmlChar *buf;
5259	int incomment = 0;
5260	int i;
5261
5262	in = ctxt->input;
5263	if (in == NULL)
5264	return (-1);
5265
5266	base = in->cur - in->base;
5267	if (base < 0)
5268	return (-1);
5269
5270	if (ctxt->checkIndex > base)
5271	base = ctxt->checkIndex;
5272
5273	if (in->buf == NULL) {
5274	buf = in->base;
5275	len = in->length;
5276	} else {
5277	buf = xmlBufContent(in->buf->buffer);
5278	len = xmlBufUse(in->buf->buffer);
5279	}
5280
5281	for (; base < len; base++) {
5282	if (!incomment && (base + 4 < len)) {
5283	if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5284	(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5285	incomment = 1;
5286	/* do not increment past <! - some people use <!--> */
5287	base += 2;
5288	}
5289	}
5290	if (incomment) {
5291	if (base + 3 > len)
5292	return (-1);
5293	if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5294	(buf[base + 2] == '>')) {
5295	incomment = 0;
5296	base += 2;
5297	}
5298	continue;
5299	}
5300	for (i = 0; i < stopLen; ++i) {
5301	if (buf[base] == stop[i]) {
5302	ctxt->checkIndex = 0;
5303	return (base - (in->cur - in->base));
5304	}
5305	}
5306	}
5307	ctxt->checkIndex = base;
5308	return (-1);
5309	}
5310
5311	/**
5312	* htmlParseTryOrFinish:
5313	* @ctxt: an HTML parser context
5314	* @terminate: last chunk indicator
5315	*
5316	* Try to progress on parsing
	*
	* Push-mode driver: loops over the pending input and dispatches on
	* ctxt->instate. Each state either consumes input by calling one of the
	* htmlParse* routines, switches to another state, or jumps to "done" when
	* the data needed to continue is not available yet. When @terminate is
	* zero, most constructs are only parsed once their closing delimiter has
	* been located with htmlParseLookupSequence()/htmlParseLookupChars().
	*
	* On exit, if input is exhausted and @terminate is set, open elements are
	* auto-closed and endDocument is signalled; a default HTML 4.0
	* Transitional internal subset is attached to ctxt->myDoc when none
	* exists and HTML_PARSE_NODEFDTD is not set.
5317	*
5318	* Returns zero if no parsing was possible
	* NOTE(review): the local "ret" is initialised to 0 and is not assigned
	* anywhere else in this function, so the value returned is always 0.
5319	*/
5320	static int
5321	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5322	int ret = 0;
5323	htmlParserInputPtr in;
5324	int avail = 0;
5325	xmlChar cur, next;
5326
5327	htmlParserNodeInfo node_info;
5328
	/* Trace the state on entry (only compiled with DEBUG_PUSH). */
5329	#ifdef DEBUG_PUSH
5330	switch (ctxt->instate) {
5331	case XML_PARSER_EOF:
5332	xmlGenericError(xmlGenericErrorContext,
5333	"HPP: try EOF\n"); break;
5334	case XML_PARSER_START:
5335	xmlGenericError(xmlGenericErrorContext,
5336	"HPP: try START\n"); break;
5337	case XML_PARSER_MISC:
5338	xmlGenericError(xmlGenericErrorContext,
5339	"HPP: try MISC\n");break;
5340	case XML_PARSER_COMMENT:
5341	xmlGenericError(xmlGenericErrorContext,
5342	"HPP: try COMMENT\n");break;
5343	case XML_PARSER_PROLOG:
5344	xmlGenericError(xmlGenericErrorContext,
5345	"HPP: try PROLOG\n");break;
5346	case XML_PARSER_START_TAG:
5347	xmlGenericError(xmlGenericErrorContext,
5348	"HPP: try START_TAG\n");break;
5349	case XML_PARSER_CONTENT:
5350	xmlGenericError(xmlGenericErrorContext,
5351	"HPP: try CONTENT\n");break;
5352	case XML_PARSER_CDATA_SECTION:
5353	xmlGenericError(xmlGenericErrorContext,
5354	"HPP: try CDATA_SECTION\n");break;
5355	case XML_PARSER_END_TAG:
5356	xmlGenericError(xmlGenericErrorContext,
5357	"HPP: try END_TAG\n");break;
5358	case XML_PARSER_ENTITY_DECL:
5359	xmlGenericError(xmlGenericErrorContext,
5360	"HPP: try ENTITY_DECL\n");break;
5361	case XML_PARSER_ENTITY_VALUE:
5362	xmlGenericError(xmlGenericErrorContext,
5363	"HPP: try ENTITY_VALUE\n");break;
5364	case XML_PARSER_ATTRIBUTE_VALUE:
5365	xmlGenericError(xmlGenericErrorContext,
5366	"HPP: try ATTRIBUTE_VALUE\n");break;
5367	case XML_PARSER_DTD:
5368	xmlGenericError(xmlGenericErrorContext,
5369	"HPP: try DTD\n");break;
5370	case XML_PARSER_EPILOG:
5371	xmlGenericError(xmlGenericErrorContext,
5372	"HPP: try EPILOG\n");break;
5373	case XML_PARSER_PI:
5374	xmlGenericError(xmlGenericErrorContext,
5375	"HPP: try PI\n");break;
5376	case XML_PARSER_SYSTEM_LITERAL:
5377	xmlGenericError(xmlGenericErrorContext,
5378	"HPP: try SYSTEM_LITERAL\n");break;
5379	}
5380	#endif
5381
5382	while (1) {
5383
	/* Recompute the number of unread bytes at the top of every pass. */
5384	in = ctxt->input;
5385	if (in == NULL) break;
5386	if (in->buf == NULL)
5387	avail = in->length - (in->cur - in->base);
5388	else
5389	avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5390	if ((avail == 0) && (terminate)) {
5391	htmlAutoCloseOnEnd(ctxt);
5392	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5393	/*
5394	* SAX: end of the document processing.
5395	*/
5396	ctxt->instate = XML_PARSER_EOF;
5397	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5398	ctxt->sax->endDocument(ctxt->userData);
5399	}
5400	}
5401	if (avail < 1)
5402	goto done;
5403	cur = in->cur[0];
	/* A zero byte in the input is stepped over with SKIP(1). */
5404	if (cur == 0) {
5405	SKIP(1);
5406	continue;
5407	}
5408
5409	switch (ctxt->instate) {
5410	case XML_PARSER_EOF:
5411	/*
5412	* Document parsing is done !
5413	*/
5414	goto done;
5415	case XML_PARSER_START:
5416	/*
5417	* Very first chars read from the document flow.
5418	*/
5419	cur = in->cur[0];
5420	if (IS_BLANK_CH(cur)) {
5421	SKIP_BLANKS;
5422	if (in->buf == NULL)
5423	avail = in->length - (in->cur - in->base);
5424	else
5425	avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5426	}
5427	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5428	ctxt->sax->setDocumentLocator(ctxt->userData,
5429	&xmlDefaultSAXLocator);
5430	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5431	(!ctxt->disableSAX))
5432	ctxt->sax->startDocument(ctxt->userData);
5433
	/*
	* NOTE(review): cur[1] and UPP(2)..UPP(8) are read here without a
	* check of avail in this state - confirm the input buffer
	* guarantees enough readable bytes past cur.
	*/
5434	cur = in->cur[0];
5435	next = in->cur[1];
5436	if ((cur == '<') && (next == '!') &&
5437	(UPP(2) == 'D') && (UPP(3) == 'O') &&
5438	(UPP(4) == 'C') && (UPP(5) == 'T') &&
5439	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5440	(UPP(8) == 'E')) {
5441	if ((!terminate) &&
5442	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5443	goto done;
5444	#ifdef DEBUG_PUSH
5445	xmlGenericError(xmlGenericErrorContext,
5446	"HPP: Parsing internal subset\n");
5447	#endif
5448	htmlParseDocTypeDecl(ctxt);
5449	ctxt->instate = XML_PARSER_PROLOG;
5450	#ifdef DEBUG_PUSH
5451	xmlGenericError(xmlGenericErrorContext,
5452	"HPP: entering PROLOG\n");
5453	#endif
5454	} else {
5455	ctxt->instate = XML_PARSER_MISC;
5456	#ifdef DEBUG_PUSH
5457	xmlGenericError(xmlGenericErrorContext,
5458	"HPP: entering MISC\n");
5459	#endif
5460	}
5461	break;
5462	case XML_PARSER_MISC:
5463	SKIP_BLANKS;
5464	if (in->buf == NULL)
5465	avail = in->length - (in->cur - in->base);
5466	else
5467	avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5468	/*
5469	* no chars in buffer
5470	*/
5471	if (avail < 1)
5472	goto done;
5473	/*
5474	* not enough chars in buffer
5475	*/
5476	if (avail < 2) {
5477	if (!terminate)
5478	goto done;
5479	else
5480	next = ' ';
5481	} else {
5482	next = in->cur[1];
5483	}
5484	cur = in->cur[0];
	/*
	* NOTE(review): cur[2]/cur[3] are read below while only avail >= 2
	* (or 1 when terminating) has been established - confirm this is
	* safe for short buffers.
	*/
5485	if ((cur == '<') && (next == '!') &&
5486	(in->cur[2] == '-') && (in->cur[3] == '-')) {
5487	if ((!terminate) &&
5488	(htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5489	goto done;
5490	#ifdef DEBUG_PUSH
5491	xmlGenericError(xmlGenericErrorContext,
5492	"HPP: Parsing Comment\n");
5493	#endif
5494	htmlParseComment(ctxt);
5495	ctxt->instate = XML_PARSER_MISC;
5496	} else if ((cur == '<') && (next == '?')) {
5497	if ((!terminate) &&
5498	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5499	goto done;
5500	#ifdef DEBUG_PUSH
5501	xmlGenericError(xmlGenericErrorContext,
5502	"HPP: Parsing PI\n");
5503	#endif
5504	htmlParsePI(ctxt);
5505	ctxt->instate = XML_PARSER_MISC;
5506	} else if ((cur == '<') && (next == '!') &&
5507	(UPP(2) == 'D') && (UPP(3) == 'O') &&
5508	(UPP(4) == 'C') && (UPP(5) == 'T') &&
5509	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5510	(UPP(8) == 'E')) {
5511	if ((!terminate) &&
5512	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5513	goto done;
5514	#ifdef DEBUG_PUSH
5515	xmlGenericError(xmlGenericErrorContext,
5516	"HPP: Parsing internal subset\n");
5517	#endif
5518	htmlParseDocTypeDecl(ctxt);
5519	ctxt->instate = XML_PARSER_PROLOG;
5520	#ifdef DEBUG_PUSH
5521	xmlGenericError(xmlGenericErrorContext,
5522	"HPP: entering PROLOG\n");
5523	#endif
5524	} else if ((cur == '<') && (next == '!') &&
5525	(avail < 9)) {
	/* "<!" seen but too few bytes to tell DOCTYPE apart: wait. */
5526	goto done;
5527	} else {
5528	ctxt->instate = XML_PARSER_START_TAG;
5529	#ifdef DEBUG_PUSH
5530	xmlGenericError(xmlGenericErrorContext,
5531	"HPP: entering START_TAG\n");
5532	#endif
5533	}
5534	break;
5535	case XML_PARSER_PROLOG:
5536	SKIP_BLANKS;
5537	if (in->buf == NULL)
5538	avail = in->length - (in->cur - in->base);
5539	else
5540	avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5541	if (avail < 2)
5542	goto done;
5543	cur = in->cur[0];
5544	next = in->cur[1];
5545	if ((cur == '<') && (next == '!') &&
5546	(in->cur[2] == '-') && (in->cur[3] == '-')) {
5547	if ((!terminate) &&
5548	(htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5549	goto done;
5550	#ifdef DEBUG_PUSH
5551	xmlGenericError(xmlGenericErrorContext,
5552	"HPP: Parsing Comment\n");
5553	#endif
5554	htmlParseComment(ctxt);
5555	ctxt->instate = XML_PARSER_PROLOG;
5556	} else if ((cur == '<') && (next == '?')) {
5557	if ((!terminate) &&
5558	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5559	goto done;
5560	#ifdef DEBUG_PUSH
5561	xmlGenericError(xmlGenericErrorContext,
5562	"HPP: Parsing PI\n");
5563	#endif
5564	htmlParsePI(ctxt);
5565	ctxt->instate = XML_PARSER_PROLOG;
5566	} else if ((cur == '<') && (next == '!') &&
5567	(avail < 4)) {
5568	goto done;
5569	} else {
5570	ctxt->instate = XML_PARSER_START_TAG;
5571	#ifdef DEBUG_PUSH
5572	xmlGenericError(xmlGenericErrorContext,
5573	"HPP: entering START_TAG\n");
5574	#endif
5575	}
5576	break;
5577	case XML_PARSER_EPILOG:
5578	if (in->buf == NULL)
5579	avail = in->length - (in->cur - in->base);
5580	else
5581	avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5582	if (avail < 1)
5583	goto done;
5584	cur = in->cur[0];
5585	if (IS_BLANK_CH(cur)) {
5586	htmlParseCharData(ctxt);
5587	goto done;
5588	}
5589	if (avail < 2)
5590	goto done;
5591	next = in->cur[1];
5592	if ((cur == '<') && (next == '!') &&
5593	(in->cur[2] == '-') && (in->cur[3] == '-')) {
5594	if ((!terminate) &&
5595	(htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5596	goto done;
5597	#ifdef DEBUG_PUSH
5598	xmlGenericError(xmlGenericErrorContext,
5599	"HPP: Parsing Comment\n");
5600	#endif
5601	htmlParseComment(ctxt);
5602	ctxt->instate = XML_PARSER_EPILOG;
5603	} else if ((cur == '<') && (next == '?')) {
5604	if ((!terminate) &&
5605	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5606	goto done;
5607	#ifdef DEBUG_PUSH
5608	xmlGenericError(xmlGenericErrorContext,
5609	"HPP: Parsing PI\n");
5610	#endif
5611	htmlParsePI(ctxt);
5612	ctxt->instate = XML_PARSER_EPILOG;
5613	} else if ((cur == '<') && (next == '!') &&
5614	(avail < 4)) {
5615	goto done;
5616	} else {
	/* Anything else after the root element ends the document as not well-formed. */
5617	ctxt->errNo = XML_ERR_DOCUMENT_END;
5618	ctxt->wellFormed = 0;
5619	ctxt->instate = XML_PARSER_EOF;
5620	#ifdef DEBUG_PUSH
5621	xmlGenericError(xmlGenericErrorContext,
5622	"HPP: entering EOF\n");
5623	#endif
5624	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5625	ctxt->sax->endDocument(ctxt->userData);
5626	goto done;
5627	}
5628	break;
5629	case XML_PARSER_START_TAG: {
5630	const xmlChar *name;
5631	int failed;
5632	const htmlElemDesc * info;
5633
5634	/*
5635	* no chars in buffer
5636	*/
5637	if (avail < 1)
5638	goto done;
5639	/*
5640	* not enough chars in buffer
5641	*/
5642	if (avail < 2) {
5643	if (!terminate)
5644	goto done;
5645	else
5646	next = ' ';
5647	} else {
5648	next = in->cur[1];
5649	}
5650	cur = in->cur[0];
5651	if (cur != '<') {
5652	ctxt->instate = XML_PARSER_CONTENT;
5653	#ifdef DEBUG_PUSH
5654	xmlGenericError(xmlGenericErrorContext,
5655	"HPP: entering CONTENT\n");
5656	#endif
5657	break;
5658	}
5659	if (next == '/') {
5660	ctxt->instate = XML_PARSER_END_TAG;
5661	ctxt->checkIndex = 0;
5662	#ifdef DEBUG_PUSH
5663	xmlGenericError(xmlGenericErrorContext,
5664	"HPP: entering END_TAG\n");
5665	#endif
5666	break;
5667	}
	/* Unless terminating, wait until the closing '>' of the tag is buffered. */
5668	if ((!terminate) &&
5669	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5670	goto done;
5671
5672	/* Capture start position */
5673	if (ctxt->record_info) {
5674	node_info.begin_pos = ctxt->input->consumed +
5675	(CUR_PTR - ctxt->input->base);
5676	node_info.begin_line = ctxt->input->line;
5677	}
5678
5679
5680	failed = htmlParseStartTag(ctxt);
5681	name = ctxt->name;
5682	if ((failed == -1) \|\|
5683	(name == NULL)) {
5684	if (CUR == '>')
5685	NEXT;
5686	break;
5687	}
5688
5689	/*
5690	* Lookup the info for that element.
5691	*/
5692	info = htmlTagLookup(name);
5693	if (info == NULL) {
5694	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5695	"Tag %s invalid\n", name, NULL);
5696	}
5697
5698	/*
5699	* Check for an Empty Element labeled the XML/SGML way
5700	*/
5701	if ((CUR == '/') && (NXT(1) == '>')) {
5702	SKIP(2);
5703	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5704	ctxt->sax->endElement(ctxt->userData, name);
5705	htmlnamePop(ctxt);
5706	ctxt->instate = XML_PARSER_CONTENT;
5707	#ifdef DEBUG_PUSH
5708	xmlGenericError(xmlGenericErrorContext,
5709	"HPP: entering CONTENT\n");
5710	#endif
5711	break;
5712	}
5713
5714	if (CUR == '>') {
5715	NEXT;
5716	} else {
5717	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5718	"Couldn't find end of Start Tag %s\n",
5719	name, NULL);
5720
5721	/*
5722	* end of parsing of this node.
5723	*/
5724	if (xmlStrEqual(name, ctxt->name)) {
5725	nodePop(ctxt);
5726	htmlnamePop(ctxt);
5727	}
5728
5729	if (ctxt->record_info)
5730	htmlNodeInfoPush(ctxt, &node_info);
5731
5732	ctxt->instate = XML_PARSER_CONTENT;
5733	#ifdef DEBUG_PUSH
5734	xmlGenericError(xmlGenericErrorContext,
5735	"HPP: entering CONTENT\n");
5736	#endif
5737	break;
5738	}
5739
5740	/*
5741	* Check for an Empty Element from DTD definition
5742	*/
5743	if ((info != NULL) && (info->empty)) {
5744	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5745	ctxt->sax->endElement(ctxt->userData, name);
5746	htmlnamePop(ctxt);
5747	}
5748
5749	if (ctxt->record_info)
5750	htmlNodeInfoPush(ctxt, &node_info);
5751
5752	ctxt->instate = XML_PARSER_CONTENT;
5753	#ifdef DEBUG_PUSH
5754	xmlGenericError(xmlGenericErrorContext,
5755	"HPP: entering CONTENT\n");
5756	#endif
5757	break;
5758	}
5759	case XML_PARSER_CONTENT: {
	/* Snapshot of ctxt->nbChars, compared after parsing to detect lack of progress. */
5760	long cons;
5761	/*
5762	* Handle preparsed entities and charRef
5763	*/
5764	if (ctxt->token != 0) {
5765	xmlChar chr[2] = { 0 , 0 } ;
5766
5767	chr[0] = (xmlChar) ctxt->token;
5768	htmlCheckParagraph(ctxt);
5769	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5770	ctxt->sax->characters(ctxt->userData, chr, 1);
5771	ctxt->token = 0;
5772	ctxt->checkIndex = 0;
5773	}
	/* Last single byte of the document: deliver it directly if it is plain text. */
5774	if ((avail == 1) && (terminate)) {
5775	cur = in->cur[0];
5776	if ((cur != '<') && (cur != '&')) {
5777	if (ctxt->sax != NULL) {
5778	if (IS_BLANK_CH(cur)) {
5779	if (ctxt->keepBlanks) {
5780	if (ctxt->sax->characters != NULL)
5781	ctxt->sax->characters(
5782	ctxt->userData, &in->cur[0], 1);
5783	} else {
5784	if (ctxt->sax->ignorableWhitespace != NULL)
5785	ctxt->sax->ignorableWhitespace(
5786	ctxt->userData, &in->cur[0], 1);
5787	}
5788	} else {
5789	htmlCheckParagraph(ctxt);
5790	if (ctxt->sax->characters != NULL)
5791	ctxt->sax->characters(
5792	ctxt->userData, &in->cur[0], 1);
5793	}
5794	}
5795	ctxt->token = 0;
5796	ctxt->checkIndex = 0;
5797	in->cur++;
5798	break;
5799	}
5800	}
5801	if (avail < 2)
5802	goto done;
5803	cur = in->cur[0];
5804	next = in->cur[1];
5805	cons = ctxt->nbChars;
5806	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
5807	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5808	/*
5809	* Handle SCRIPT/STYLE separately
5810	*/
5811	if (!terminate) {
5812	int idx;
5813	xmlChar val;
5814
	/* Wait until "</" plus one following non-zero byte is buffered. */
5815	idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
5816	if (idx < 0)
5817	goto done;
5818	val = in->cur[idx + 2];
5819	if (val == 0) /* bad cut of input */
5820	goto done;
5821	}
5822	htmlParseScript(ctxt);
	/* cur/next still hold the two bytes read before htmlParseScript() ran. */
5823	if ((cur == '<') && (next == '/')) {
5824	ctxt->instate = XML_PARSER_END_TAG;
5825	ctxt->checkIndex = 0;
5826	#ifdef DEBUG_PUSH
5827	xmlGenericError(xmlGenericErrorContext,
5828	"HPP: entering END_TAG\n");
5829	#endif
5830	break;
5831	}
5832	} else {
5833	/*
5834	* Sometimes DOCTYPE arrives in the middle of the document
5835	*/
5836	if ((cur == '<') && (next == '!') &&
5837	(UPP(2) == 'D') && (UPP(3) == 'O') &&
5838	(UPP(4) == 'C') && (UPP(5) == 'T') &&
5839	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5840	(UPP(8) == 'E')) {
5841	if ((!terminate) &&
5842	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5843	goto done;
5844	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5845	"Misplaced DOCTYPE declaration\n",
5846	BAD_CAST "DOCTYPE" , NULL);
5847	htmlParseDocTypeDecl(ctxt);
5848	} else if ((cur == '<') && (next == '!') &&
5849	(in->cur[2] == '-') && (in->cur[3] == '-')) {
5850	if ((!terminate) &&
5851	(htmlParseLookupSequence(
5852	ctxt, '-', '-', '>', 1, 1) < 0))
5853	goto done;
5854	#ifdef DEBUG_PUSH
5855	xmlGenericError(xmlGenericErrorContext,
5856	"HPP: Parsing Comment\n");
5857	#endif
5858	htmlParseComment(ctxt);
5859	ctxt->instate = XML_PARSER_CONTENT;
5860	} else if ((cur == '<') && (next == '?')) {
5861	if ((!terminate) &&
5862	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5863	goto done;
5864	#ifdef DEBUG_PUSH
5865	xmlGenericError(xmlGenericErrorContext,
5866	"HPP: Parsing PI\n");
5867	#endif
5868	htmlParsePI(ctxt);
5869	ctxt->instate = XML_PARSER_CONTENT;
5870	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
5871	goto done;
5872	} else if ((cur == '<') && (next == '/')) {
5873	ctxt->instate = XML_PARSER_END_TAG;
5874	ctxt->checkIndex = 0;
5875	#ifdef DEBUG_PUSH
5876	xmlGenericError(xmlGenericErrorContext,
5877	"HPP: entering END_TAG\n");
5878	#endif
5879	break;
5880	} else if (cur == '<') {
5881	ctxt->instate = XML_PARSER_START_TAG;
5882	ctxt->checkIndex = 0;
5883	#ifdef DEBUG_PUSH
5884	xmlGenericError(xmlGenericErrorContext,
5885	"HPP: entering START_TAG\n");
5886	#endif
5887	break;
5888	} else if (cur == '&') {
	/* Unless terminating, wait for one of ';', ' ', '>' or '/' after the '&'. */
5889	if ((!terminate) &&
5890	(htmlParseLookupChars(ctxt,
5891	BAD_CAST "; >/", 4) < 0))
5892	goto done;
5893	#ifdef DEBUG_PUSH
5894	xmlGenericError(xmlGenericErrorContext,
5895	"HPP: Parsing Reference\n");
5896	#endif
5897	/* TODO: check generation of subtrees if noent !!! */
5898	htmlParseReference(ctxt);
5899	} else {
5900	/*
5901	* check that the text sequence is complete
5902	* before handing out the data to the parser
5903	* to avoid problems with erroneous end of
5904	* data detection.
5905	*/
5906	if ((!terminate) &&
5907	(htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
5908	goto done;
5909	ctxt->checkIndex = 0;
5910	#ifdef DEBUG_PUSH
5911	xmlGenericError(xmlGenericErrorContext,
5912	"HPP: Parsing char data\n");
5913	#endif
5914	htmlParseCharData(ctxt);
5915	}
5916	}
	/*
	* ctxt->nbChars unchanged since the snapshot: report an error (when
	* a current node exists) and step with NEXT before the next pass.
	*/
5917	if (cons == ctxt->nbChars) {
5918	if (ctxt->node != NULL) {
5919	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5920	"detected an error in element content\n",
5921	NULL, NULL);
5922	}
5923	NEXT;
5924	break;
5925	}
5926
5927	break;
5928	}
5929	case XML_PARSER_END_TAG:
5930	if (avail < 2)
5931	goto done;
5932	if ((!terminate) &&
5933	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5934	goto done;
5935	htmlParseEndTag(ctxt);
	/* No open element left means we are past the root: go to EPILOG. */
5936	if (ctxt->nameNr == 0) {
5937	ctxt->instate = XML_PARSER_EPILOG;
5938	} else {
5939	ctxt->instate = XML_PARSER_CONTENT;
5940	}
5941	ctxt->checkIndex = 0;
5942	#ifdef DEBUG_PUSH
5943	xmlGenericError(xmlGenericErrorContext,
5944	"HPP: entering CONTENT\n");
5945	#endif
5946	break;
	/*
	* The remaining states are never set by this function; each is
	* reported as an internal error and recovered by switching to
	* CONTENT (START_TAG for ATTRIBUTE_VALUE).
	*/
5947	case XML_PARSER_CDATA_SECTION:
5948	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5949	"HPP: internal error, state == CDATA\n",
5950	NULL, NULL);
5951	ctxt->instate = XML_PARSER_CONTENT;
5952	ctxt->checkIndex = 0;
5953	#ifdef DEBUG_PUSH
5954	xmlGenericError(xmlGenericErrorContext,
5955	"HPP: entering CONTENT\n");
5956	#endif
5957	break;
5958	case XML_PARSER_DTD:
5959	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5960	"HPP: internal error, state == DTD\n",
5961	NULL, NULL);
5962	ctxt->instate = XML_PARSER_CONTENT;
5963	ctxt->checkIndex = 0;
5964	#ifdef DEBUG_PUSH
5965	xmlGenericError(xmlGenericErrorContext,
5966	"HPP: entering CONTENT\n");
5967	#endif
5968	break;
5969	case XML_PARSER_COMMENT:
5970	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5971	"HPP: internal error, state == COMMENT\n",
5972	NULL, NULL);
5973	ctxt->instate = XML_PARSER_CONTENT;
5974	ctxt->checkIndex = 0;
5975	#ifdef DEBUG_PUSH
5976	xmlGenericError(xmlGenericErrorContext,
5977	"HPP: entering CONTENT\n");
5978	#endif
5979	break;
5980	case XML_PARSER_PI:
5981	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5982	"HPP: internal error, state == PI\n",
5983	NULL, NULL);
5984	ctxt->instate = XML_PARSER_CONTENT;
5985	ctxt->checkIndex = 0;
5986	#ifdef DEBUG_PUSH
5987	xmlGenericError(xmlGenericErrorContext,
5988	"HPP: entering CONTENT\n");
5989	#endif
5990	break;
5991	case XML_PARSER_ENTITY_DECL:
5992	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5993	"HPP: internal error, state == ENTITY_DECL\n",
5994	NULL, NULL);
5995	ctxt->instate = XML_PARSER_CONTENT;
5996	ctxt->checkIndex = 0;
5997	#ifdef DEBUG_PUSH
5998	xmlGenericError(xmlGenericErrorContext,
5999	"HPP: entering CONTENT\n");
6000	#endif
6001	break;
6002	case XML_PARSER_ENTITY_VALUE:
6003	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6004	"HPP: internal error, state == ENTITY_VALUE\n",
6005	NULL, NULL);
6006	ctxt->instate = XML_PARSER_CONTENT;
6007	ctxt->checkIndex = 0;
	/* NOTE(review): the trace below says DTD but the state set above is CONTENT. */
6008	#ifdef DEBUG_PUSH
6009	xmlGenericError(xmlGenericErrorContext,
6010	"HPP: entering DTD\n");
6011	#endif
6012	break;
6013	case XML_PARSER_ATTRIBUTE_VALUE:
6014	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6015	"HPP: internal error, state == ATTRIBUTE_VALUE\n",
6016	NULL, NULL);
6017	ctxt->instate = XML_PARSER_START_TAG;
6018	ctxt->checkIndex = 0;
6019	#ifdef DEBUG_PUSH
6020	xmlGenericError(xmlGenericErrorContext,
6021	"HPP: entering START_TAG\n");
6022	#endif
6023	break;
6024	case XML_PARSER_SYSTEM_LITERAL:
6025	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6026	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6027	NULL, NULL);
6028	ctxt->instate = XML_PARSER_CONTENT;
6029	ctxt->checkIndex = 0;
6030	#ifdef DEBUG_PUSH
6031	xmlGenericError(xmlGenericErrorContext,
6032	"HPP: entering CONTENT\n");
6033	#endif
6034	break;
6035	case XML_PARSER_IGNORE:
6036	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6037	"HPP: internal error, state == XML_PARSER_IGNORE\n",
6038	NULL, NULL);
6039	ctxt->instate = XML_PARSER_CONTENT;
6040	ctxt->checkIndex = 0;
6041	#ifdef DEBUG_PUSH
6042	xmlGenericError(xmlGenericErrorContext,
6043	"HPP: entering CONTENT\n");
6044	#endif
6045	break;
6046	case XML_PARSER_PUBLIC_LITERAL:
6047	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6048	"HPP: internal error, state == XML_PARSER_LITERAL\n",
6049	NULL, NULL);
6050	ctxt->instate = XML_PARSER_CONTENT;
6051	ctxt->checkIndex = 0;
6052	#ifdef DEBUG_PUSH
6053	xmlGenericError(xmlGenericErrorContext,
6054	"HPP: entering CONTENT\n");
6055	#endif
6056	break;
6057
6058	}
6059	}
	/* Common exit: finish the document if this was the last chunk and nothing is left. */
6060	done:
6061	if ((avail == 0) && (terminate)) {
6062	htmlAutoCloseOnEnd(ctxt);
6063	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6064	/*
6065	* SAX: end of the document processing.
6066	*/
6067	ctxt->instate = XML_PARSER_EOF;
6068	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6069	ctxt->sax->endDocument(ctxt->userData);
6070	}
6071	}
	/*
	* Unless HTML_PARSE_NODEFDTD is set, give the document a default
	* HTML 4.0 Transitional internal subset if it has none.
	*/
6072	if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6073	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
6074	(ctxt->instate == XML_PARSER_EPILOG))) {
6075	xmlDtdPtr dtd;
6076	dtd = xmlGetIntSubset(ctxt->myDoc);
6077	if (dtd == NULL)
6078	ctxt->myDoc->intSubset =
6079	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6080	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6081	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6082	}
6083	#ifdef DEBUG_PUSH
6084	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6085	#endif
6086	return(ret);
6087	}
6088
6089	/**
6090	* htmlParseChunk:
6091	* @ctxt: an HTML parser context
6092	* @chunk: an char array
6093	* @size: the size in byte of the chunk
6094	* @terminate: last chunk indicator
6095	*
6096	* Parse a Chunk of memory
6097	*
6098	* Returns zero if no error, the xmlParserErrors otherwise.
6099	*/
6100	int
6101	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6102	int terminate) {
6103	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
6104	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6105	"htmlParseChunk: context error\n", NULL, NULL);
6106	return(XML_ERR_INTERNAL_ERROR);
6107	}
6108	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6109	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
6110	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6111	size_t cur = ctxt->input->cur - ctxt->input->base;
6112	int res;
6113
6114	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6115	if (res < 0) {
6116	ctxt->errNo = XML_PARSER_EOF;
6117	ctxt->disableSAX = 1;
6118	return (XML_PARSER_EOF);
6119	}
6120	xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6121	#ifdef DEBUG_PUSH
6122	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6123	#endif
6124
6125	#if 0
6126	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
6127	htmlParseTryOrFinish(ctxt, terminate);
6128	#endif
6129	} else if (ctxt->instate != XML_PARSER_EOF) {
6130	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6131	xmlParserInputBufferPtr in = ctxt->input->buf;
6132	if ((in->encoder != NULL) && (in->buffer != NULL) &&
6133	(in->raw != NULL)) {
6134	int nbchars;
6135	size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6136	size_t current = ctxt->input->cur - ctxt->input->base;
6137
6138	nbchars = xmlCharEncInput(in, terminate);
6139	if (nbchars < 0) {
6140	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6141	"encoder error\n", NULL, NULL);
6142	return(XML_ERR_INVALID_ENCODING);
6143	}
6144	xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6145	}
6146	}
6147	}
6148	htmlParseTryOrFinish(ctxt, terminate);
6149	if (terminate) {
6150	if ((ctxt->instate != XML_PARSER_EOF) &&
6151	(ctxt->instate != XML_PARSER_EPILOG) &&
6152	(ctxt->instate != XML_PARSER_MISC)) {
6153	ctxt->errNo = XML_ERR_DOCUMENT_END;
6154	ctxt->wellFormed = 0;
6155	}
6156	if (ctxt->instate != XML_PARSER_EOF) {
6157	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6158	ctxt->sax->endDocument(ctxt->userData);
6159	}
6160	ctxt->instate = XML_PARSER_EOF;
6161	}
6162	return((xmlParserErrors) ctxt->errNo);
6163	}
6164
6165	/************************************************************************
6166	* *
6167	* User entry points *
6168	* *
6169	************************************************************************/
6170
6171	/**
6172	* htmlCreatePushParserCtxt:
6173	* @sax: a SAX handler
6174	* @user_data: The user data returned on SAX callbacks
6175	* @chunk: a pointer to an array of chars
6176	* @size: number of chars in the array
6177	* @filename: an optional file name or URI
6178	* @enc: an optional encoding
6179	*
6180	* Create a parser context for using the HTML parser in push mode
6181	* The value of @filename is used for fetching external entities
6182	* and error/warning reports.
6183	*
6184	* Returns the new parser context or NULL
6185	*/
6186	htmlParserCtxtPtr
6187	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6188	const char chunk, int size, const char filename,
6189	xmlCharEncoding enc) {
6190	htmlParserCtxtPtr ctxt;
6191	htmlParserInputPtr inputStream;
6192	xmlParserInputBufferPtr buf;
6193
6194	xmlInitParser();
6195
6196	buf = xmlAllocParserInputBuffer(enc);
6197	if (buf == NULL) return(NULL);
6198
6199	ctxt = htmlNewParserCtxt();
6200	if (ctxt == NULL) {
6201	xmlFreeParserInputBuffer(buf);
6202	return(NULL);
6203	}
6204	if(enc==XML_CHAR_ENCODING_UTF8 \|\| buf->encoder)
6205	ctxt->charset=XML_CHAR_ENCODING_UTF8;
6206	if (sax != NULL) {
6207	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6208	xmlFree(ctxt->sax);
6209	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6210	if (ctxt->sax == NULL) {
6211	xmlFree(buf);
6212	xmlFree(ctxt);
6213	return(NULL);
6214	}
6215	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6216	if (user_data != NULL)
6217	ctxt->userData = user_data;
6218	}
6219	if (filename == NULL) {
6220	ctxt->directory = NULL;
6221	} else {
6222	ctxt->directory = xmlParserGetDirectory(filename);
6223	}
6224
6225	inputStream = htmlNewInputStream(ctxt);
6226	if (inputStream == NULL) {
6227	xmlFreeParserCtxt(ctxt);
6228	xmlFree(buf);
6229	return(NULL);
6230	}
6231
6232	if (filename == NULL)
6233	inputStream->filename = NULL;
6234	else
6235	inputStream->filename = (char *)
6236	xmlCanonicPath((const xmlChar *) filename);
6237	inputStream->buf = buf;
6238	xmlBufResetInput(buf->buffer, inputStream);
6239
6240	inputPush(ctxt, inputStream);
6241
6242	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6243	(ctxt->input->buf != NULL)) {
6244	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6245	size_t cur = ctxt->input->cur - ctxt->input->base;
6246
6247	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6248
6249	xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6250	#ifdef DEBUG_PUSH
6251	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6252	#endif
6253	}
6254	ctxt->progressive = 1;
6255
6256	return(ctxt);
6257	}
6258	#endif /* LIBXML_PUSH_ENABLED */
6259
6260	/**
6261	* htmlSAXParseDoc:
6262	* @cur: a pointer to an array of xmlChar
6263	* @encoding: a free form C string describing the HTML document encoding, or NULL
6264	* @sax: the SAX handler block
6265	* @userData: if using SAX, this pointer will be provided on callbacks.
6266	*
6267	* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6268	* to handle parse events. If sax is NULL, fallback to the default DOM
6269	* behavior and return a tree.
6270	*
6271	* Returns the resulting document tree unless SAX is NULL or the document is
6272	* not well formed.
6273	*/
6274
6275	htmlDocPtr
6276	htmlSAXParseDoc(xmlChar cur, const char encoding, htmlSAXHandlerPtr sax, void *userData) {
6277	htmlDocPtr ret;
6278	htmlParserCtxtPtr ctxt;
6279
6280	xmlInitParser();
6281
6282	if (cur == NULL) return(NULL);
6283
6284
6285	ctxt = htmlCreateDocParserCtxt(cur, encoding);
6286	if (ctxt == NULL) return(NULL);
6287	if (sax != NULL) {
6288	if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6289	ctxt->sax = sax;
6290	ctxt->userData = userData;
6291	}
6292
6293	htmlParseDocument(ctxt);
6294	ret = ctxt->myDoc;
6295	if (sax != NULL) {
6296	ctxt->sax = NULL;
6297	ctxt->userData = NULL;
6298	}
6299	htmlFreeParserCtxt(ctxt);
6300
6301	return(ret);
6302	}
6303
6304	/**
6305	* htmlParseDoc:
6306	* @cur: a pointer to an array of xmlChar
6307	* @encoding: a free form C string describing the HTML document encoding, or NULL
6308	*
6309	* parse an HTML in-memory document and build a tree.
6310	*
6311	* Returns the resulting document tree
6312	*/
6313
6314	htmlDocPtr
6315	htmlParseDoc(xmlChar cur, const char encoding) {
6316	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6317	}
6318
6319
6320	/**
6321	* htmlCreateFileParserCtxt:
6322	* @filename: the filename
6323	* @encoding: a free form C string describing the HTML document encoding, or NULL
6324	*
6325	* Create a parser context for a file content.
6326	* Automatic support for ZLIB/Compress compressed document is provided
6327	* by default if found at compile-time.
6328	*
6329	* Returns the new parser context or NULL
6330	*/
6331	htmlParserCtxtPtr
6332	htmlCreateFileParserCtxt(const char filename, const char encoding)
6333	{
6334	htmlParserCtxtPtr ctxt;
6335	htmlParserInputPtr inputStream;
6336	char *canonicFilename;
6337	/* htmlCharEncoding enc; */
6338	xmlChar content, content_line = (xmlChar *) "charset=";
6339
6340	if (filename == NULL)
6341	return(NULL);
6342
6343	ctxt = htmlNewParserCtxt();
6344	if (ctxt == NULL) {
6345	return(NULL);
6346	}
6347	canonicFilename = (char ) xmlCanonicPath((const xmlChar ) filename);
6348	if (canonicFilename == NULL) {
6349	#ifdef LIBXML_SAX1_ENABLED
6350	if (xmlDefaultSAXHandler.error != NULL) {
6351	xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6352	}
6353	#endif
6354	xmlFreeParserCtxt(ctxt);
6355	return(NULL);
6356	}
6357
6358	inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6359	xmlFree(canonicFilename);
6360	if (inputStream == NULL) {
6361	xmlFreeParserCtxt(ctxt);
6362	return(NULL);
6363	}
6364
6365	inputPush(ctxt, inputStream);
6366
6367	/* set encoding */
6368	if (encoding) {
6369	size_t l = strlen(encoding);
6370
6371	if (l < 1000) {
6372	content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6373	if (content) {
6374	strcpy ((char )content, (char )content_line);
6375	strcat ((char )content, (char )encoding);
6376	htmlCheckEncoding (ctxt, content);
6377	xmlFree (content);
6378	}
6379	}
6380	}
6381
6382	return(ctxt);
6383	}
6384
6385	/**
6386	* htmlSAXParseFile:
6387	* @filename: the filename
6388	* @encoding: a free form C string describing the HTML document encoding, or NULL
6389	* @sax: the SAX handler block
6390	* @userData: if using SAX, this pointer will be provided on callbacks.
6391	*
6392	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6393	* compressed document is provided by default if found at compile-time.
6394	* It use the given SAX function block to handle the parsing callback.
6395	* If sax is NULL, fallback to the default DOM tree building routines.
6396	*
6397	* Returns the resulting document tree unless SAX is NULL or the document is
6398	* not well formed.
6399	*/
6400
6401	htmlDocPtr
6402	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
6403	void *userData) {
6404	htmlDocPtr ret;
6405	htmlParserCtxtPtr ctxt;
6406	htmlSAXHandlerPtr oldsax = NULL;
6407
6408	xmlInitParser();
6409
6410	ctxt = htmlCreateFileParserCtxt(filename, encoding);
6411	if (ctxt == NULL) return(NULL);
6412	if (sax != NULL) {
6413	oldsax = ctxt->sax;
6414	ctxt->sax = sax;
6415	ctxt->userData = userData;
6416	}
6417
6418	htmlParseDocument(ctxt);
6419
6420	ret = ctxt->myDoc;
6421	if (sax != NULL) {
6422	ctxt->sax = oldsax;
6423	ctxt->userData = NULL;
6424	}
6425	htmlFreeParserCtxt(ctxt);
6426
6427	return(ret);
6428	}
6429
6430	/**
6431	* htmlParseFile:
6432	* @filename: the filename
6433	* @encoding: a free form C string describing the HTML document encoding, or NULL
6434	*
6435	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6436	* compressed document is provided by default if found at compile-time.
6437	*
6438	* Returns the resulting document tree
6439	*/
6440
6441	htmlDocPtr
6442	htmlParseFile(const char filename, const char encoding) {
6443	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6444	}
6445
6446	/**
6447	* htmlHandleOmittedElem:
6448	* @val: int 0 or 1
6449	*
6450	* Set and return the previous value for handling HTML omitted tags.
6451	*
6452	* Returns the last value for 0 for no handling, 1 for auto insertion.
6453	*/
6454
6455	int
6456	htmlHandleOmittedElem(int val) {
6457	int old = htmlOmittedDefaultValue;
6458
6459	htmlOmittedDefaultValue = val;
6460	return(old);
6461	}
6462
6463	/**
6464	* htmlElementAllowedHere:
6465	* @parent: HTML parent element
6466	* @elt: HTML element
6467	*
6468	* Checks whether an HTML element may be a direct child of a parent element.
6469	* Note - doesn't check for deprecated elements
6470	*
6471	* Returns 1 if allowed; 0 otherwise.
6472	*/
6473	int
6474	htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6475	const char** p ;
6476
6477	if ( ! elt \|\| ! parent \|\| ! parent->subelts )
6478	return 0 ;
6479
6480	for ( p = parent->subelts; *p; ++p )
6481	if ( !xmlStrcmp((const xmlChar )p, elt) )
6482	return 1 ;
6483
6484	return 0 ;
6485	}
6486	/**
6487	* htmlElementStatusHere:
6488	* @parent: HTML parent element
6489	* @elt: HTML element
6490	*
6491	* Checks whether an HTML element may be a direct child of a parent element.
6492	* and if so whether it is valid or deprecated.
6493	*
6494	* Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6495	*/
6496	htmlStatus
6497	htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6498	if ( ! parent \|\| ! elt )
6499	return HTML_INVALID ;
6500	if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6501	return HTML_INVALID ;
6502
6503	return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6504	}
6505	/**
6506	* htmlAttrAllowed:
6507	* @elt: HTML element
6508	* @attr: HTML attribute
6509	* @legacy: whether to allow deprecated attributes
6510	*
6511	* Checks whether an attribute is valid for an element
6512	* Has full knowledge of Required and Deprecated attributes
6513	*
6514	* Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6515	*/
6516	htmlStatus
6517	htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6518	const char** p ;
6519
6520	if ( !elt \|\| ! attr )
6521	return HTML_INVALID ;
6522
6523	if ( elt->attrs_req )
6524	for ( p = elt->attrs_req; *p; ++p)
6525	if ( !xmlStrcmp((const xmlChar)p, attr) )
6526	return HTML_REQUIRED ;
6527
6528	if ( elt->attrs_opt )
6529	for ( p = elt->attrs_opt; *p; ++p)
6530	if ( !xmlStrcmp((const xmlChar)p, attr) )
6531	return HTML_VALID ;
6532
6533	if ( legacy && elt->attrs_depr )
6534	for ( p = elt->attrs_depr; *p; ++p)
6535	if ( !xmlStrcmp((const xmlChar)p, attr) )
6536	return HTML_DEPRECATED ;
6537
6538	return HTML_INVALID ;
6539	}
6540	/**
6541	* htmlNodeStatus:
6542	* @node: an htmlNodePtr in a tree
6543	* @legacy: whether to allow deprecated elements (YES is faster here
6544	* for Element nodes)
6545	*
6546	* Checks whether the tree node is valid. Experimental (the author
6547	* only uses the HTML enhancements in a SAX parser)
6548	*
6549	* Return: for Element nodes, a return from htmlElementAllowedHere (if
6550	* legacy allowed) or htmlElementStatusHere (otherwise).
6551	* for Attribute nodes, a return from htmlAttrAllowed
6552	* for other nodes, HTML_NA (no checks performed)
6553	*/
6554	htmlStatus
6555	htmlNodeStatus(const htmlNodePtr node, int legacy) {
6556	if ( ! node )
6557	return HTML_INVALID ;
6558
6559	switch ( node->type ) {
6560	case XML_ELEMENT_NODE:
6561	return legacy
6562	? ( htmlElementAllowedHere (
6563	htmlTagLookup(node->parent->name) , node->name
6564	) ? HTML_VALID : HTML_INVALID )
6565	: htmlElementStatusHere(
6566	htmlTagLookup(node->parent->name) ,
6567	htmlTagLookup(node->name) )
6568	;
6569	case XML_ATTRIBUTE_NODE:
6570	return htmlAttrAllowed(
6571	htmlTagLookup(node->parent->name) , node->name, legacy) ;
6572	default: return HTML_NA ;
6573	}
6574	}
6575	/************************************************************************
6576	* *
6577	* New set (2.6.0) of simpler and more flexible APIs *
6578	* *
6579	************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named "dict" must therefore be visible where
 * the macro is used. @str is evaluated more than once, so it must not
 * have side effects.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement (safe inside an unbraced if/else); the caller supplies
 * the terminating semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
        if ((str) && ((!dict) ||				\
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
            xmlFree((char *)(str));				\
    } while (0)
6591
6592	/**
6593	* htmlCtxtReset:
6594	* @ctxt: an HTML parser context
6595	*
6596	* Reset a parser context
6597	*/
6598	void
6599	htmlCtxtReset(htmlParserCtxtPtr ctxt)
6600	{
6601	xmlParserInputPtr input;
6602	xmlDictPtr dict;
6603
6604	if (ctxt == NULL)
6605	return;
6606
6607	xmlInitParser();
6608	dict = ctxt->dict;
6609
6610	while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6611	xmlFreeInputStream(input);
6612	}
6613	ctxt->inputNr = 0;
6614	ctxt->input = NULL;
6615
6616	ctxt->spaceNr = 0;
6617	if (ctxt->spaceTab != NULL) {
6618	ctxt->spaceTab[0] = -1;
6619	ctxt->space = &ctxt->spaceTab[0];
6620	} else {
6621	ctxt->space = NULL;
6622	}
6623
6624
6625	ctxt->nodeNr = 0;
6626	ctxt->node = NULL;
6627
6628	ctxt->nameNr = 0;
6629	ctxt->name = NULL;
6630
6631	DICT_FREE(ctxt->version);
6632	ctxt->version = NULL;
6633	DICT_FREE(ctxt->encoding);
6634	ctxt->encoding = NULL;
6635	DICT_FREE(ctxt->directory);
6636	ctxt->directory = NULL;
6637	DICT_FREE(ctxt->extSubURI);
6638	ctxt->extSubURI = NULL;
6639	DICT_FREE(ctxt->extSubSystem);
6640	ctxt->extSubSystem = NULL;
6641	if (ctxt->myDoc != NULL)
6642	xmlFreeDoc(ctxt->myDoc);
6643	ctxt->myDoc = NULL;
6644
6645	ctxt->standalone = -1;
6646	ctxt->hasExternalSubset = 0;
6647	ctxt->hasPErefs = 0;
6648	ctxt->html = 1;
6649	ctxt->external = 0;
6650	ctxt->instate = XML_PARSER_START;
6651	ctxt->token = 0;
6652
6653	ctxt->wellFormed = 1;
6654	ctxt->nsWellFormed = 1;
6655	ctxt->disableSAX = 0;
6656	ctxt->valid = 1;
6657	ctxt->vctxt.userData = ctxt;
6658	ctxt->vctxt.error = xmlParserValidityError;
6659	ctxt->vctxt.warning = xmlParserValidityWarning;
6660	ctxt->record_info = 0;
6661	ctxt->nbChars = 0;
6662	ctxt->checkIndex = 0;
6663	ctxt->inSubset = 0;
6664	ctxt->errNo = XML_ERR_OK;
6665	ctxt->depth = 0;
6666	ctxt->charset = XML_CHAR_ENCODING_NONE;
6667	ctxt->catalogs = NULL;
6668	xmlInitNodeInfoSeq(&ctxt->node_seq);
6669
6670	if (ctxt->attsDefault != NULL) {
6671	xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6672	ctxt->attsDefault = NULL;
6673	}
6674	if (ctxt->attsSpecial != NULL) {
6675	xmlHashFree(ctxt->attsSpecial, NULL);
6676	ctxt->attsSpecial = NULL;
6677	}
6678	}
6679
6680	/**
6681	* htmlCtxtUseOptions:
6682	* @ctxt: an HTML parser context
6683	* @options: a combination of htmlParserOption(s)
6684	*
6685	* Applies the options to the parser context
6686	*
6687	* Returns 0 in case of success, the set of unknown or unimplemented options
6688	* in case of error.
6689	*/
6690	int
6691	htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6692	{
6693	if (ctxt == NULL)
6694	return(-1);
6695
6696	if (options & HTML_PARSE_NOWARNING) {
6697	ctxt->sax->warning = NULL;
6698	ctxt->vctxt.warning = NULL;
6699	options -= XML_PARSE_NOWARNING;
6700	ctxt->options \|= XML_PARSE_NOWARNING;
6701	}
6702	if (options & HTML_PARSE_NOERROR) {
6703	ctxt->sax->error = NULL;
6704	ctxt->vctxt.error = NULL;
6705	ctxt->sax->fatalError = NULL;
6706	options -= XML_PARSE_NOERROR;
6707	ctxt->options \|= XML_PARSE_NOERROR;
6708	}
6709	if (options & HTML_PARSE_PEDANTIC) {
6710	ctxt->pedantic = 1;
6711	options -= XML_PARSE_PEDANTIC;
6712	ctxt->options \|= XML_PARSE_PEDANTIC;
6713	} else
6714	ctxt->pedantic = 0;
6715	if (options & XML_PARSE_NOBLANKS) {
6716	ctxt->keepBlanks = 0;
6717	ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6718	options -= XML_PARSE_NOBLANKS;
6719	ctxt->options \|= XML_PARSE_NOBLANKS;
6720	} else
6721	ctxt->keepBlanks = 1;
6722	if (options & HTML_PARSE_RECOVER) {
6723	ctxt->recovery = 1;
6724	options -= HTML_PARSE_RECOVER;
6725	} else
6726	ctxt->recovery = 0;
6727	if (options & HTML_PARSE_COMPACT) {
6728	ctxt->options \|= HTML_PARSE_COMPACT;
6729	options -= HTML_PARSE_COMPACT;
6730	}
6731	if (options & XML_PARSE_HUGE) {
6732	ctxt->options \|= XML_PARSE_HUGE;
6733	options -= XML_PARSE_HUGE;
6734	}
6735	if (options & HTML_PARSE_NODEFDTD) {
6736	ctxt->options \|= HTML_PARSE_NODEFDTD;
6737	options -= HTML_PARSE_NODEFDTD;
6738	}
6739	if (options & HTML_PARSE_IGNORE_ENC) {
6740	ctxt->options \|= HTML_PARSE_IGNORE_ENC;
6741	options -= HTML_PARSE_IGNORE_ENC;
6742	}
6743	if (options & HTML_PARSE_NOIMPLIED) {
6744	ctxt->options \|= HTML_PARSE_NOIMPLIED;
6745	options -= HTML_PARSE_NOIMPLIED;
6746	}
6747	ctxt->dictNames = 0;
6748	return (options);
6749	}
6750
6751	/**
6752	* htmlDoRead:
6753	* @ctxt: an HTML parser context
6754	* @URL: the base URL to use for the document
6755	* @encoding: the document encoding, or NULL
6756	* @options: a combination of htmlParserOption(s)
6757	* @reuse: keep the context for reuse
6758	*
6759	* Common front-end for the htmlRead functions
6760	*
6761	* Returns the resulting document tree or NULL
6762	*/
6763	static htmlDocPtr
6764	htmlDoRead(htmlParserCtxtPtr ctxt, const char URL, const char encoding,
6765	int options, int reuse)
6766	{
6767	htmlDocPtr ret;
6768
6769	htmlCtxtUseOptions(ctxt, options);
6770	ctxt->html = 1;
6771	if (encoding != NULL) {
6772	xmlCharEncodingHandlerPtr hdlr;
6773
6774	hdlr = xmlFindCharEncodingHandler(encoding);
6775	if (hdlr != NULL) {
6776	xmlSwitchToEncoding(ctxt, hdlr);
6777	if (ctxt->input->encoding != NULL)
6778	xmlFree((xmlChar *) ctxt->input->encoding);
6779	ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6780	}
6781	}
6782	if ((URL != NULL) && (ctxt->input != NULL) &&
6783	(ctxt->input->filename == NULL))
6784	ctxt->input->filename = (char ) xmlStrdup((const xmlChar ) URL);
6785	htmlParseDocument(ctxt);
6786	ret = ctxt->myDoc;
6787	ctxt->myDoc = NULL;
6788	if (!reuse) {
6789	if ((ctxt->dictNames) &&
6790	(ret != NULL) &&
6791	(ret->dict == ctxt->dict))
6792	ctxt->dict = NULL;
6793	xmlFreeParserCtxt(ctxt);
6794	}
6795	return (ret);
6796	}
6797
6798	/**
6799	* htmlReadDoc:
6800	* @cur: a pointer to a zero terminated string
6801	* @URL: the base URL to use for the document
6802	* @encoding: the document encoding, or NULL
6803	* @options: a combination of htmlParserOption(s)
6804	*
6805	* parse an XML in-memory document and build a tree.
6806	*
6807	* Returns the resulting document tree
6808	*/
6809	htmlDocPtr
6810	htmlReadDoc(const xmlChar * cur, const char URL, const char encoding, int options)
6811	{
6812	htmlParserCtxtPtr ctxt;
6813
6814	if (cur == NULL)
6815	return (NULL);
6816
6817	xmlInitParser();
6818	ctxt = htmlCreateDocParserCtxt(cur, NULL);
6819	if (ctxt == NULL)
6820	return (NULL);
6821	return (htmlDoRead(ctxt, URL, encoding, options, 0));
6822	}
6823
6824	/**
6825	* htmlReadFile:
6826	* @filename: a file or URL
6827	* @encoding: the document encoding, or NULL
6828	* @options: a combination of htmlParserOption(s)
6829	*
6830	* parse an XML file from the filesystem or the network.
6831	*
6832	* Returns the resulting document tree
6833	*/
6834	htmlDocPtr
6835	htmlReadFile(const char filename, const char encoding, int options)
6836	{
6837	htmlParserCtxtPtr ctxt;
6838
6839	xmlInitParser();
6840	ctxt = htmlCreateFileParserCtxt(filename, encoding);
6841	if (ctxt == NULL)
6842	return (NULL);
6843	return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6844	}
6845
6846	/**
6847	* htmlReadMemory:
6848	* @buffer: a pointer to a char array
6849	* @size: the size of the array
6850	* @URL: the base URL to use for the document
6851	* @encoding: the document encoding, or NULL
6852	* @options: a combination of htmlParserOption(s)
6853	*
6854	* parse an XML in-memory document and build a tree.
6855	*
6856	* Returns the resulting document tree
6857	*/
6858	htmlDocPtr
6859	htmlReadMemory(const char buffer, int size, const char URL, const char *encoding, int options)
6860	{
6861	htmlParserCtxtPtr ctxt;
6862
6863	xmlInitParser();
6864	ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6865	if (ctxt == NULL)
6866	return (NULL);
6867	htmlDefaultSAXHandlerInit();
6868	if (ctxt->sax != NULL)
6869	memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6870	return (htmlDoRead(ctxt, URL, encoding, options, 0));
6871	}
6872
6873	/**
6874	* htmlReadFd:
6875	* @fd: an open file descriptor
6876	* @URL: the base URL to use for the document
6877	* @encoding: the document encoding, or NULL
6878	* @options: a combination of htmlParserOption(s)
6879	*
6880	* parse an XML from a file descriptor and build a tree.
6881	*
6882	* Returns the resulting document tree
6883	*/
6884	htmlDocPtr
6885	htmlReadFd(int fd, const char URL, const char encoding, int options)
6886	{
6887	htmlParserCtxtPtr ctxt;
6888	xmlParserInputBufferPtr input;
6889	xmlParserInputPtr stream;
6890
6891	if (fd < 0)
6892	return (NULL);
6893	xmlInitParser();
6894
6895	xmlInitParser();
6896	input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6897	if (input == NULL)
6898	return (NULL);
6899	ctxt = xmlNewParserCtxt();
6900	if (ctxt == NULL) {
6901	xmlFreeParserInputBuffer(input);
6902	return (NULL);
6903	}
6904	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6905	if (stream == NULL) {
6906	xmlFreeParserInputBuffer(input);
6907	xmlFreeParserCtxt(ctxt);
6908	return (NULL);
6909	}
6910	inputPush(ctxt, stream);
6911	return (htmlDoRead(ctxt, URL, encoding, options, 0));
6912	}
6913
6914	/**
6915	* htmlReadIO:
6916	* @ioread: an I/O read function
6917	* @ioclose: an I/O close function
6918	* @ioctx: an I/O handler
6919	* @URL: the base URL to use for the document
6920	* @encoding: the document encoding, or NULL
6921	* @options: a combination of htmlParserOption(s)
6922	*
6923	* parse an HTML document from I/O functions and source and build a tree.
6924	*
6925	* Returns the resulting document tree
6926	*/
6927	htmlDocPtr
6928	htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6929	void ioctx, const char URL, const char *encoding, int options)
6930	{
6931	htmlParserCtxtPtr ctxt;
6932	xmlParserInputBufferPtr input;
6933	xmlParserInputPtr stream;
6934
6935	if (ioread == NULL)
6936	return (NULL);
6937	xmlInitParser();
6938
6939	input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6940	XML_CHAR_ENCODING_NONE);
6941	if (input == NULL) {
6942	if (ioclose != NULL)
6943	ioclose(ioctx);
6944	return (NULL);
6945	}
6946	ctxt = htmlNewParserCtxt();
6947	if (ctxt == NULL) {
6948	xmlFreeParserInputBuffer(input);
6949	return (NULL);
6950	}
6951	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6952	if (stream == NULL) {
6953	xmlFreeParserInputBuffer(input);
6954	xmlFreeParserCtxt(ctxt);
6955	return (NULL);
6956	}
6957	inputPush(ctxt, stream);
6958	return (htmlDoRead(ctxt, URL, encoding, options, 0));
6959	}
6960
6961	/**
6962	* htmlCtxtReadDoc:
6963	* @ctxt: an HTML parser context
6964	* @cur: a pointer to a zero terminated string
6965	* @URL: the base URL to use for the document
6966	* @encoding: the document encoding, or NULL
6967	* @options: a combination of htmlParserOption(s)
6968	*
6969	* parse an XML in-memory document and build a tree.
6970	* This reuses the existing @ctxt parser context
6971	*
6972	* Returns the resulting document tree
6973	*/
6974	htmlDocPtr
6975	htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6976	const char URL, const char encoding, int options)
6977	{
6978	xmlParserInputPtr stream;
6979
6980	if (cur == NULL)
6981	return (NULL);
6982	if (ctxt == NULL)
6983	return (NULL);
6984	xmlInitParser();
6985
6986	htmlCtxtReset(ctxt);
6987
6988	stream = xmlNewStringInputStream(ctxt, cur);
6989	if (stream == NULL) {
6990	return (NULL);
6991	}
6992	inputPush(ctxt, stream);
6993	return (htmlDoRead(ctxt, URL, encoding, options, 1));
6994	}
6995
6996	/**
6997	* htmlCtxtReadFile:
6998	* @ctxt: an HTML parser context
6999	* @filename: a file or URL
7000	* @encoding: the document encoding, or NULL
7001	* @options: a combination of htmlParserOption(s)
7002	*
7003	* parse an XML file from the filesystem or the network.
7004	* This reuses the existing @ctxt parser context
7005	*
7006	* Returns the resulting document tree
7007	*/
7008	htmlDocPtr
7009	htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7010	const char *encoding, int options)
7011	{
7012	xmlParserInputPtr stream;
7013
7014	if (filename == NULL)
7015	return (NULL);
7016	if (ctxt == NULL)
7017	return (NULL);
7018	xmlInitParser();
7019
7020	htmlCtxtReset(ctxt);
7021
7022	stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7023	if (stream == NULL) {
7024	return (NULL);
7025	}
7026	inputPush(ctxt, stream);
7027	return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7028	}
7029
7030	/**
7031	* htmlCtxtReadMemory:
7032	* @ctxt: an HTML parser context
7033	* @buffer: a pointer to a char array
7034	* @size: the size of the array
7035	* @URL: the base URL to use for the document
7036	* @encoding: the document encoding, or NULL
7037	* @options: a combination of htmlParserOption(s)
7038	*
7039	* parse an XML in-memory document and build a tree.
7040	* This reuses the existing @ctxt parser context
7041	*
7042	* Returns the resulting document tree
7043	*/
7044	htmlDocPtr
7045	htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7046	const char URL, const char encoding, int options)
7047	{
7048	xmlParserInputBufferPtr input;
7049	xmlParserInputPtr stream;
7050
7051	if (ctxt == NULL)
7052	return (NULL);
7053	if (buffer == NULL)
7054	return (NULL);
7055	xmlInitParser();
7056
7057	htmlCtxtReset(ctxt);
7058
7059	input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7060	if (input == NULL) {
7061	return(NULL);
7062	}
7063
7064	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7065	if (stream == NULL) {
7066	xmlFreeParserInputBuffer(input);
7067	return(NULL);
7068	}
7069
7070	inputPush(ctxt, stream);
7071	return (htmlDoRead(ctxt, URL, encoding, options, 1));
7072	}
7073
7074	/**
7075	* htmlCtxtReadFd:
7076	* @ctxt: an HTML parser context
7077	* @fd: an open file descriptor
7078	* @URL: the base URL to use for the document
7079	* @encoding: the document encoding, or NULL
7080	* @options: a combination of htmlParserOption(s)
7081	*
7082	* parse an XML from a file descriptor and build a tree.
7083	* This reuses the existing @ctxt parser context
7084	*
7085	* Returns the resulting document tree
7086	*/
7087	htmlDocPtr
7088	htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7089	const char URL, const char encoding, int options)
7090	{
7091	xmlParserInputBufferPtr input;
7092	xmlParserInputPtr stream;
7093
7094	if (fd < 0)
7095	return (NULL);
7096	if (ctxt == NULL)
7097	return (NULL);
7098	xmlInitParser();
7099
7100	htmlCtxtReset(ctxt);
7101
7102
7103	input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7104	if (input == NULL)
7105	return (NULL);
7106	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7107	if (stream == NULL) {
7108	xmlFreeParserInputBuffer(input);
7109	return (NULL);
7110	}
7111	inputPush(ctxt, stream);
7112	return (htmlDoRead(ctxt, URL, encoding, options, 1));
7113	}
7114
7115	/**
7116	* htmlCtxtReadIO:
7117	* @ctxt: an HTML parser context
7118	* @ioread: an I/O read function
7119	* @ioclose: an I/O close function
7120	* @ioctx: an I/O handler
7121	* @URL: the base URL to use for the document
7122	* @encoding: the document encoding, or NULL
7123	* @options: a combination of htmlParserOption(s)
7124	*
7125	* parse an HTML document from I/O functions and source and build a tree.
7126	* This reuses the existing @ctxt parser context
7127	*
7128	* Returns the resulting document tree
7129	*/
7130	htmlDocPtr
7131	htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7132	xmlInputCloseCallback ioclose, void *ioctx,
7133	const char *URL,
7134	const char *encoding, int options)
7135	{
7136	xmlParserInputBufferPtr input;
7137	xmlParserInputPtr stream;
7138
7139	if (ioread == NULL)
7140	return (NULL);
7141	if (ctxt == NULL)
7142	return (NULL);
7143	xmlInitParser();
7144
7145	htmlCtxtReset(ctxt);
7146
7147	input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7148	XML_CHAR_ENCODING_NONE);
7149	if (input == NULL) {
7150	if (ioclose != NULL)
7151	ioclose(ioctx);
7152	return (NULL);
7153	}
7154	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7155	if (stream == NULL) {
7156	xmlFreeParserInputBuffer(input);
7157	return (NULL);
7158	}
7159	inputPush(ctxt, stream);
7160	return (htmlDoRead(ctxt, URL, encoding, options, 1));
7161	}
7162
7163	#define bottom_HTMLparser
7164	#include "elfgcchack.h"
7165	#endif /* LIBXML_HTML_ENABLED */

注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

source: vbox/trunk/src/libs/libxml2-2.9.4/HTMLparser.c@ 66550

以其他格式下載: