updated to more recent libxml2 version (work in progress)

2006-06-18 23:22:39 +00:00
parent c91d74bfa3
commit f2636f8d4b
70 changed files with 12357 additions and 3518 deletions
--- a/Extras/LibXML/HTMLparser.c
+++ b/Extras/LibXML/HTMLparser.c
@@ -192,16 +192,16 @@ htmlnamePop(htmlParserCtxtPtr ctxt)
    const xmlChar *ret;

    if (ctxt->nameNr <= 0)
-        return (0);
+        return (NULL);
    ctxt->nameNr--;
    if (ctxt->nameNr < 0)
-        return (0);
+        return (NULL);
    if (ctxt->nameNr > 0)
        ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
    else
        ctxt->name = NULL;
    ret = ctxt->nameTab[ctxt->nameNr];
-    ctxt->nameTab[ctxt->nameNr] = 0;
+    ctxt->nameTab[ctxt->nameNr] = NULL;
    return (ret);
 }

@@ -230,7 +230,7 @@ htmlnamePop(htmlParserCtxtPtr ctxt)
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
- *   NEXTL(l) Skip the current Unicode character of l xmlChars long.
+ *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 */

@@ -964,7 +964,6 @@ NULL
 static const char *htmlNoContentElements[] = {
    "html",
    "head",
-    "body",
    NULL
 };

@@ -1745,7 +1744,7 @@ htmlEntityLookup(const xmlChar *name) {

 /**
 * htmlEntityValueLookup:
- * @value: the entity's Unicode value
+ * @value: the entity's unicode value
 *
 * Lookup the given entity in EntitiesTable
 *
@@ -2042,6 +2041,7 @@ static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
    unsigned int i;
    int j;
    xmlNodePtr lastChild;
+    xmlDtdPtr dtd;

    for (j = 0;j < len;j++)
        if (!(IS_BLANK_CH(str[j]))) return(0);
@@ -2054,8 +2054,17 @@ static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
 	return(1);
    if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
 	return(1);
-    if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
-	return(1);
+
+    /* Only strip CDATA children of the body tag for strict HTML DTDs */
+    if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
+        dtd = xmlGetIntSubset(ctxt->myDoc);
+        if (dtd != NULL && dtd->ExternalID != NULL) {
+            if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
+                    !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
+                return(1);
+        }
+    }
+
    if (ctxt->node == NULL) return(0);
    lastChild = xmlGetLastChild(ctxt->node);
    while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
@@ -2627,12 +2636,12 @@ htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
 */
 static void
 htmlParseScript(htmlParserCtxtPtr ctxt) {
-    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
+    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
    int nbchar = 0;
-    xmlChar cur;
+    int cur,l;

    SHRINK;
-    cur = CUR;
+    cur = CUR_CHAR(l);
    while (IS_CHAR_CH(cur)) {
 	if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
 	    (NXT(3) == '-')) {
@@ -2648,20 +2657,39 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
 	    }
 	    nbchar = 0;
 	    htmlParseComment(ctxt);
-	    cur = CUR;
+	    cur = CUR_CHAR(l);
 	    continue;
 	} else if ((cur == '<') && (NXT(1) == '/')) {
-	    /*
-	     * One should break here, the specification is clear:
-	     * Authors should therefore escape "</" within the content.
-	     * Escape mechanisms are specific to each scripting or
-	     * style sheet language.
-	     */
-	    if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
-	        ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
-		break; /* while */
+            /*
+             * One should break here, the specification is clear:
+             * Authors should therefore escape "</" within the content.
+             * Escape mechanisms are specific to each scripting or
+             * style sheet language.
+             *
+             * In recovery mode, only break if end tag match the
+             * current tag, effectively ignoring all tags inside the
+             * script/style block and treating the entire block as
+             * CDATA.
+             */
+            if (ctxt->recovery) {
+                if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2, 
+				   xmlStrlen(ctxt->name)) == 0) 
+                {
+                    break; /* while */
+                } else {
+		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
+				 "Element %s embeds close tag\n",
+		                 ctxt->name, NULL);
+		}
+            } else {
+                if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
+                    ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) 
+                {
+                    break; /* while */
+                }
+            }
 	}
-	buf[nbchar++] = cur;
+	COPY_BUF(l,buf,nbchar,cur);
 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
 	    if (ctxt->sax->cdataBlock!= NULL) {
 		/*
@@ -2673,9 +2701,11 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
 	    }
 	    nbchar = 0;
 	}
-	NEXT;
-	cur = CUR;
+	GROW;
+	NEXTL(l);
+	cur = CUR_CHAR(l);
    }
+
    if (!(IS_CHAR_CH(cur))) {
 	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
 	                "Invalid char in CDATA 0x%X\n", cur);
@@ -2743,6 +2773,8 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
 	}
    }
    if (nbchar != 0) {
+        buf[nbchar] = 0;
+
 	/*
 	 * Ok the segment is to be consumed as chars.
 	 */
@@ -3349,27 +3381,31 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
 *
 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
 *
+ * Returns 0 in case of success and -1 in case of error.
 */

-static void
+static int
 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
    const xmlChar *name;
    const xmlChar *attname;
    xmlChar *attvalue;
-    const xmlChar **atts = ctxt->atts;
+    const xmlChar **atts;
    int nbatts = 0;
-    int maxatts = ctxt->maxatts;
+    int maxatts;
    int meta = 0;
    int i;

    if ((ctxt == NULL) || (ctxt->input == NULL)) {
 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
 		     "htmlParseStartTag: context error\n", NULL, NULL);
-	return;
+	return -1;
    }
-    if (CUR != '<') return;
+    if (CUR != '<') return -1;
    NEXT;

+    atts = ctxt->atts;
+    maxatts = ctxt->maxatts;
+
    GROW;
    name = htmlParseHTMLName(ctxt);
    if (name == NULL) {
@@ -3379,7 +3415,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
 	/* Dump the bogus tag like browsers do */
 	while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
 	    NEXT;
-        return;
+        return -1;
    }
    if (xmlStrEqual(name, BAD_CAST"meta"))
 	meta = 1;
@@ -3402,14 +3438,14 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
 	             "htmlParseStartTag: misplaced <html> tag\n",
 		     name, NULL);
-	return;
+	return 0;
    }
    if ((ctxt->nameNr != 1) && 
 	(xmlStrEqual(name, BAD_CAST"head"))) {
 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
 	             "htmlParseStartTag: misplaced <head> tag\n",
 		     name, NULL);
-	return;
+	return 0;
    }
    if (xmlStrEqual(name, BAD_CAST"body")) {
 	int indx;
@@ -3420,7 +3456,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
 			     name, NULL);
 		while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
 		    NEXT;
-		return;
+		return 0;
 	    }
 	}
    }
@@ -3533,6 +3569,8 @@ failed:
 		xmlFree((xmlChar *) atts[i]);
 	}
    }
+
+    return 0;
 }

 /**
@@ -3575,6 +3613,15 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
    if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
 	             "End tag : expected '>'\n", NULL, NULL);
+	if (ctxt->recovery) {
+	    /*
+	     * We're not at the ending > !!
+	     * Error, unless in recover mode where we search forwards
+	     * until we find a >
+	     */
+	    while (CUR != '\0' && CUR != '>') NEXT;
+	    NEXT;
+	}
    } else
        NEXT;

@@ -3709,10 +3756,8 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {
 /**
 * htmlParseContent:
 * @ctxt:  an HTML parser context
- * @name:  the node name
 *
 * Parse a content: comment, sub-element, reference or text.
- *
 */

 static void
@@ -3830,6 +3875,19 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
    if (currentNode != NULL) xmlFree(currentNode);
 }

+/**
+ * htmlParseContent:
+ * @ctxt:  an HTML parser context
+ *
+ * Parse a content: comment, sub-element, reference or text.
+ */
+
+void
+__htmlParseContent(void *ctxt) {
+    if (ctxt != NULL)
+	htmlParseContent((htmlParserCtxtPtr) ctxt);
+}
+
 /**
 * htmlParseElement:
 * @ctxt:  an HTML parser context
@@ -3847,16 +3905,15 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
    xmlChar *currentNode = NULL;
    const htmlElemDesc * info;
    htmlParserNodeInfo node_info;
-    const xmlChar *oldname;
+    int failed;
    int depth;
    const xmlChar *oldptr;

    if ((ctxt == NULL) || (ctxt->input == NULL)) {
 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
-		     "htmlParseStartTag: context error\n", NULL, NULL);
+		     "htmlParseElement: context error\n", NULL, NULL);
 	return;
    }
-    depth = ctxt->nameNr;
    /* Capture start position */
    if (ctxt->record_info) {
        node_info.begin_pos = ctxt->input->consumed +
@@ -3864,11 +3921,9 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
 	node_info.begin_line = ctxt->input->line;
    }

-    oldname = ctxt->name;
-    htmlParseStartTag(ctxt);
+    failed = htmlParseStartTag(ctxt);
    name = ctxt->name;
-    if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
-        (name == NULL)) {
+    if (failed || (name == NULL)) {
 	if (CUR == '>')
 	    NEXT;
        return;
@@ -3911,7 +3966,7 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
 	/*
 	 * Capture end position and add node
 	 */
-	if ( currentNode != NULL && ctxt->record_info ) {
+	if (ctxt->record_info) {
 	   node_info.end_pos = ctxt->input->consumed +
 			      (CUR_PTR - ctxt->input->base);
 	   node_info.end_line = ctxt->input->line;
@@ -4577,11 +4632,11 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 #endif
                } else {
 		    ctxt->instate = XML_PARSER_MISC;
-		}
 #ifdef DEBUG_PUSH
-		xmlGenericError(xmlGenericErrorContext,
-			"HPP: entering MISC\n");
+		    xmlGenericError(xmlGenericErrorContext,
+			    "HPP: entering MISC\n");
 #endif
+		}
 		break;
            case XML_PARSER_MISC:
 		SKIP_BLANKS;
@@ -4738,8 +4793,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		}
 		break;
            case XML_PARSER_START_TAG: {
-	        const xmlChar *name, *oldname;
-		int depth = ctxt->nameNr;
+	        const xmlChar *name;
+		int failed;
 		const htmlElemDesc * info;

 		if (avail < 2)
@@ -4766,11 +4821,9 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
 		    goto done;

-		oldname = ctxt->name;
-		htmlParseStartTag(ctxt);
+		failed = htmlParseStartTag(ctxt);
 		name = ctxt->name;
-		if (((depth == ctxt->nameNr) &&
-		     (xmlStrEqual(oldname, ctxt->name))) ||
+		if (failed ||
 		    (name == NULL)) {
 		    if (CUR == '>')
 			NEXT;
@@ -4793,7 +4846,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		    SKIP(2);
 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
 			ctxt->sax->endElement(ctxt->userData, name);
-		    oldname = htmlnamePop(ctxt);
+		    htmlnamePop(ctxt);
 		    ctxt->instate = XML_PARSER_CONTENT;
 #ifdef DEBUG_PUSH
 		    xmlGenericError(xmlGenericErrorContext,
@@ -4814,7 +4867,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		     */
 		    if (xmlStrEqual(name, ctxt->name)) { 
 			nodePop(ctxt);
-			oldname = htmlnamePop(ctxt);
+			htmlnamePop(ctxt);
 		    }    

 		    ctxt->instate = XML_PARSER_CONTENT;
@@ -4831,7 +4884,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		if ((info != NULL) && (info->empty)) {
 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
 			ctxt->sax->endElement(ctxt->userData, name);
-		    oldname = htmlnamePop(ctxt);
+		    htmlnamePop(ctxt);
 		}
 		ctxt->instate = XML_PARSER_CONTENT;
 #ifdef DEBUG_PUSH
@@ -5178,10 +5231,18 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
        (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
 	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
 	int cur = ctxt->input->cur - ctxt->input->base;
+	int res;
 	
-	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);	      
+	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);	      
+	if (res < 0) {
+	    ctxt->errNo = XML_PARSER_EOF;
+	    ctxt->disableSAX = 1;
+	    return (XML_PARSER_EOF);
+	}
 	ctxt->input->base = ctxt->input->buf->buffer->content + base;
 	ctxt->input->cur = ctxt->input->base + cur;
+	ctxt->input->end =
+	  &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
 #ifdef DEBUG_PUSH
 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
 #endif
@@ -5779,6 +5840,14 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
 	ctxt->options |= XML_PARSE_NOBLANKS;
    } else
        ctxt->keepBlanks = 1;
+    if (options & HTML_PARSE_RECOVER) {
+        ctxt->recovery = 1;
+    } else
+        ctxt->recovery = 0;
+    if (options & HTML_PARSE_COMPACT) {
+	ctxt->options |= HTML_PARSE_COMPACT;
+        options -= HTML_PARSE_COMPACT;
+    }
    ctxt->dictNames = 0;
    return (options);
 }