001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.internal.Normalizer; 005import org.jsoup.internal.StringUtil; 006import org.jsoup.nodes.Attributes; 007import org.jsoup.nodes.CDataNode; 008import org.jsoup.nodes.Comment; 009import org.jsoup.nodes.DataNode; 010import org.jsoup.nodes.Document; 011import org.jsoup.nodes.Element; 012import org.jsoup.nodes.FormElement; 013import org.jsoup.nodes.Node; 014import org.jsoup.nodes.TextNode; 015import org.jspecify.annotations.Nullable; 016 017import java.io.Reader; 018import java.util.ArrayList; 019import java.util.List; 020 021import static org.jsoup.internal.StringUtil.inSorted; 022import static org.jsoup.parser.HtmlTreeBuilderState.Constants.InTableFoster; 023import static org.jsoup.parser.HtmlTreeBuilderState.ForeignContent; 024import static org.jsoup.parser.Parser.*; 025 026/** 027 * HTML Tree Builder; creates a DOM from Tokens. 028 */ 029public class HtmlTreeBuilder extends TreeBuilder { 030 // tag searches. must be sorted, used in inSorted. HtmlTreeBuilderTest validates they're sorted. 031 static final String[] TagsSearchInScope = new String[]{ // a particular element in scope 032 "applet", "caption", "html", "marquee", "object", "table", "td", "template", "th" 033 }; 034 // math and svg namespaces for particular element in scope 035 static final String[]TagSearchInScopeMath = new String[] { 036 "annotation-xml", "mi", "mn", "mo", "ms", "mtext" 037 }; 038 static final String[]TagSearchInScopeSvg = new String[] { 039 "desc", "foreignobject", "title" // note normalized to lowercase to match other scope searches; will preserve input case as appropriate 040 }; 041 042 static final String[] TagSearchList = new String[]{"ol", "ul"}; 043 static final String[] TagSearchButton = new String[]{"button"}; 044 static final String[] TagSearchTableScope = new String[]{"html", "table"}; 045 static final String[] TagSearchSelectScope = new String[]{"optgroup", "option"}; 046 static final String[] TagSearchEndTags = new String[]{"dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc"}; 047 static final String[] TagThoroughSearchEndTags = new String[]{"caption", "colgroup", "dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc", "tbody", "td", "tfoot", "th", "thead", "tr"}; 048 static final String[] TagSearchSpecial = new String[]{ 049 "address", "applet", "area", "article", "aside", "base", "basefont", "bgsound", "blockquote", "body", "br", 050 "button", "caption", "center", "col", "colgroup", "dd", "details", "dir", "div", "dl", "dt", "embed", 051 "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", 052 "head", "header", "hgroup", "hr", "html", "iframe", "img", "input", "keygen", "li", "link", "listing", "main", 053 "marquee", "menu", "meta", "nav", "noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", 054 "pre", "script", "search", "section", "select", "source", "style", "summary", "table", "tbody", "td", 055 "template", "textarea", "tfoot", "th", "thead", "title", "tr", "track", "ul", "wbr", "xmp"}; 056 static String[] TagSearchSpecialMath = {"annotation-xml", "mi", "mn", "mo", "ms", "mtext"}; // differs to MathML text integration point; adds annotation-xml 057 static final String[] TagMathMlTextIntegration = new String[]{"mi", "mn", "mo", "ms", "mtext"}; 058 static final String[] TagSvgHtmlIntegration = new String[]{"desc", "foreignObject", "title"}; 059 static final String[] TagFormListed = { 060 "button", "fieldset", "input", "keygen", "object", "output", "select", "textarea" 061 }; 062 063 /** @deprecated Not used anymore; configure parser depth via {@link Parser#setMaxDepth(int)}. Will be removed in jsoup 1.24.1. */ 064 @Deprecated 065 public static final int MaxScopeSearchDepth = 100; 066 067 private HtmlTreeBuilderState state; // the current state 068 private HtmlTreeBuilderState originalState; // original / marked state 069 070 private boolean baseUriSetFromDoc; 071 private @Nullable Element headElement; // the current head element 072 private @Nullable FormElement formElement; // the current form element 073 private @Nullable Element contextElement; // fragment parse root; name only copy of context. could be null even if fragment parsing 074 ArrayList<Element> formattingElements; // active (open) formatting elements 075 private ArrayList<HtmlTreeBuilderState> tmplInsertMode; // stack of Template Insertion modes 076 private List<Token.Character> pendingTableCharacters; // chars in table to be shifted out 077 private Token.EndTag emptyEnd; // reused empty end tag 078 079 private boolean framesetOk; // if ok to go into frameset 080 private boolean fosterInserts; // if next inserts should be fostered 081 private boolean fragmentParsing; // if parsing a fragment of html 082 083 @Override ParseSettings defaultSettings() { 084 return ParseSettings.htmlDefault; 085 } 086 087 @Override 088 HtmlTreeBuilder newInstance() { 089 return new HtmlTreeBuilder(); 090 } 091 092 @Override 093 protected void initialiseParse(Reader input, String baseUri, Parser parser) { 094 super.initialiseParse(input, baseUri, parser); 095 096 // this is a bit mucky. todo - probably just create new parser objects to ensure all reset. 097 state = HtmlTreeBuilderState.Initial; 098 originalState = null; 099 baseUriSetFromDoc = false; 100 headElement = null; 101 formElement = null; 102 contextElement = null; 103 formattingElements = new ArrayList<>(); 104 tmplInsertMode = new ArrayList<>(); 105 pendingTableCharacters = new ArrayList<>(); 106 emptyEnd = new Token.EndTag(this); 107 framesetOk = true; 108 fosterInserts = false; 109 fragmentParsing = false; 110 } 111 112 @Override void initialiseParseFragment(@Nullable Element context) { 113 // context may be null 114 state = HtmlTreeBuilderState.Initial; 115 fragmentParsing = true; 116 117 if (context != null) { 118 final String contextName = context.normalName(); 119 contextElement = new Element(tagFor(contextName, contextName, defaultNamespace(), settings), baseUri); 120 if (context.ownerDocument() != null) // quirks setup: 121 doc.quirksMode(context.ownerDocument().quirksMode()); 122 123 // initialise the tokeniser state: 124 switch (contextName) { 125 case "script": 126 tokeniser.transition(TokeniserState.ScriptData); 127 break; 128 case "plaintext": 129 tokeniser.transition(TokeniserState.PLAINTEXT); 130 break; 131 case "template": 132 tokeniser.transition(TokeniserState.Data); 133 pushTemplateMode(HtmlTreeBuilderState.InTemplate); 134 break; 135 default: 136 Tag tag = contextElement.tag(); 137 TokeniserState textState = tag.textState(); 138 if (textState != null) 139 tokeniser.transition(textState); // style, xmp, title, textarea, etc; or custom 140 else 141 tokeniser.transition(TokeniserState.Data); 142 } 143 doc.appendChild(contextElement); 144 push(contextElement); 145 resetInsertionMode(); 146 147 // setup form element to nearest form on context (up ancestor chain). ensures form controls are associated 148 // with form correctly 149 Element formSearch = context; 150 while (formSearch != null) { 151 if (formSearch instanceof FormElement) { 152 formElement = (FormElement) formSearch; 153 break; 154 } 155 formSearch = formSearch.parent(); 156 } 157 } 158 } 159 160 @Override List<Node> completeParseFragment() { 161 if (contextElement != null) { 162 // depending on context and the input html, content may have been added outside of the root el 163 // e.g. context=p, input=div, the div will have been pushed out. 164 List<Node> nodes = contextElement.siblingNodes(); 165 if (!nodes.isEmpty()) 166 contextElement.insertChildren(-1, nodes); 167 return contextElement.childNodes(); 168 } 169 else 170 return doc.childNodes(); 171 } 172 173 @Override 174 protected boolean process(Token token) { 175 HtmlTreeBuilderState dispatch = useCurrentOrForeignInsert(token) ? this.state : ForeignContent; 176 return dispatch.process(token, this); 177 } 178 179 boolean useCurrentOrForeignInsert(Token token) { 180 // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction 181 // If the stack of open elements is empty 182 if (stack.isEmpty()) 183 return true; 184 final Element el = currentElement(); 185 final String ns = el.tag().namespace(); 186 187 // If the adjusted current node is an element in the HTML namespace 188 if (NamespaceHtml.equals(ns)) 189 return true; 190 191 // If the adjusted current node is a MathML text integration point and the token is a start tag whose tag name is neither "mglyph" nor "malignmark" 192 // If the adjusted current node is a MathML text integration point and the token is a character token 193 if (isMathmlTextIntegration(el)) { 194 if (token.isStartTag() 195 && !"mglyph".equals(token.asStartTag().normalName) 196 && !"malignmark".equals(token.asStartTag().normalName)) 197 return true; 198 if (token.isCharacter()) 199 return true; 200 } 201 // If the adjusted current node is a MathML annotation-xml element and the token is a start tag whose tag name is "svg" 202 if (Parser.NamespaceMathml.equals(ns) 203 && el.nameIs("annotation-xml") 204 && token.isStartTag() 205 && "svg".equals(token.asStartTag().normalName)) 206 return true; 207 208 // If the adjusted current node is an HTML integration point and the token is a start tag 209 // If the adjusted current node is an HTML integration point and the token is a character token 210 if (isHtmlIntegration(el) 211 && (token.isStartTag() || token.isCharacter())) 212 return true; 213 214 // If the token is an end-of-file token 215 return token.isEOF(); 216 } 217 218 static boolean isMathmlTextIntegration(Element el) { 219 /* 220 A node is a MathML text integration point if it is one of the following elements: 221 A MathML mi element 222 A MathML mo element 223 A MathML mn element 224 A MathML ms element 225 A MathML mtext element 226 */ 227 return (Parser.NamespaceMathml.equals(el.tag().namespace()) 228 && StringUtil.inSorted(el.normalName(), TagMathMlTextIntegration)); 229 } 230 231 static boolean isHtmlIntegration(Element el) { 232 /* 233 A node is an HTML integration point if it is one of the following elements: 234 A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "text/html" 235 A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "application/xhtml+xml" 236 An SVG foreignObject element 237 An SVG desc element 238 An SVG title element 239 */ 240 if (Parser.NamespaceMathml.equals(el.tag().namespace()) 241 && el.nameIs("annotation-xml")) { 242 String encoding = Normalizer.normalize(el.attr("encoding")); 243 if (encoding.equals("text/html") || encoding.equals("application/xhtml+xml")) 244 return true; 245 } 246 // note using .tagName for case-sensitive hit here of foreignObject 247 return Parser.NamespaceSvg.equals(el.tag().namespace()) && StringUtil.in(el.tagName(), TagSvgHtmlIntegration); 248 } 249 250 boolean process(Token token, HtmlTreeBuilderState state) { 251 return state.process(token, this); 252 } 253 254 void transition(HtmlTreeBuilderState state) { 255 this.state = state; 256 } 257 258 HtmlTreeBuilderState state() { 259 return state; 260 } 261 262 void markInsertionMode() { 263 originalState = state; 264 } 265 266 HtmlTreeBuilderState originalState() { 267 return originalState; 268 } 269 270 void framesetOk(boolean framesetOk) { 271 this.framesetOk = framesetOk; 272 } 273 274 boolean framesetOk() { 275 return framesetOk; 276 } 277 278 Document getDocument() { 279 return doc; 280 } 281 282 String getBaseUri() { 283 return baseUri; 284 } 285 286 void maybeSetBaseUri(Element base) { 287 if (baseUriSetFromDoc) // only listen to the first <base href> in parse 288 return; 289 290 String href = base.absUrl("href"); 291 if (href.length() != 0) { // ignore <base target> etc 292 baseUri = href; 293 baseUriSetFromDoc = true; 294 doc.setBaseUri(href); // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants 295 } 296 } 297 298 boolean isFragmentParsing() { 299 return fragmentParsing; 300 } 301 302 void error(HtmlTreeBuilderState state) { 303 if (parser.getErrors().canAddError()) 304 parser.getErrors().add(new ParseError(reader, "Unexpected %s token [%s] when in state [%s]", 305 currentToken.tokenType(), currentToken, state)); 306 } 307 308 Element createElementFor(Token.StartTag startTag, String namespace, boolean forcePreserveCase) { 309 // dedupe and normalize the attributes: 310 Attributes attributes = startTag.attributes; 311 if (attributes != null && !attributes.isEmpty()) { 312 if (!forcePreserveCase) 313 settings.normalizeAttributes(attributes); 314 int dupes = attributes.deduplicate(settings); 315 if (dupes > 0) { 316 error("Dropped duplicate attribute(s) in tag [%s]", startTag.normalName); 317 } 318 } 319 320 Tag tag = tagFor(startTag.name(), startTag.normalName, namespace, 321 forcePreserveCase ? ParseSettings.preserveCase : settings); 322 323 return (tag.normalName().equals("form")) ? 324 new FormElement(tag, null, attributes) : 325 new Element(tag, null, attributes); 326 } 327 328 /** Inserts an HTML element for the given tag */ 329 Element insertElementFor(final Token.StartTag startTag) { 330 Element el = createElementFor(startTag, NamespaceHtml, false); 331 doInsertElement(el); 332 333 // handle self-closing tags. when the spec expects an empty (void) tag, will directly hit insertEmpty, so won't generate this fake end tag. 334 if (startTag.isSelfClosing()) { 335 Tag tag = el.tag(); 336 tag.setSeenSelfClose(); // can infer output if in xml syntax 337 if (tag.isEmpty()) { 338 // treated as empty below; nothing further 339 } else if (tag.isKnownTag() && tag.isSelfClosing()) { 340 // ok, allow it. effectively a pop, but fiddles with the state. handles empty style, title etc which would otherwise leave us in data state 341 tokeniser.transition(TokeniserState.Data); // handles <script />, otherwise needs breakout steps from script data 342 tokeniser.emit(emptyEnd.reset().name(el.tagName())); // ensure we get out of whatever state we are in. emitted for yielded processing 343 } else { 344 // error it, and leave the inserted element on 345 tokeniser.error("Tag [%s] cannot be self-closing; not a void tag", tag.normalName()); 346 } 347 } 348 349 if (el.tag().isEmpty()) { 350 pop(); // custom void tags behave like built-in voids (no children, not left on the stack); known empty go via insertEmpty 351 } 352 353 return el; 354 } 355 356 /** 357 Inserts a foreign element. Preserves the case of the tag name and of the attributes. 358 */ 359 Element insertForeignElementFor(final Token.StartTag startTag, String namespace) { 360 Element el = createElementFor(startTag, namespace, true); 361 doInsertElement(el); 362 363 if (startTag.isSelfClosing()) { // foreign els are OK to self-close 364 el.tag().setSeenSelfClose(); // remember this is self-closing for output 365 pop(); 366 } 367 368 return el; 369 } 370 371 Element insertEmptyElementFor(Token.StartTag startTag) { 372 Element el = createElementFor(startTag, NamespaceHtml, false); 373 doInsertElement(el); 374 pop(); 375 return el; 376 } 377 378 FormElement insertFormElement(Token.StartTag startTag, boolean onStack, boolean checkTemplateStack) { 379 FormElement el = (FormElement) createElementFor(startTag, NamespaceHtml, false); 380 381 if (checkTemplateStack) { 382 if(!onStack("template")) 383 setFormElement(el); 384 } else 385 setFormElement(el); 386 387 doInsertElement(el); 388 if (!onStack) pop(); 389 return el; 390 } 391 392 /** Inserts the Element onto the stack. All element inserts must run through this method. Performs any general 393 tests on the Element before insertion. 394 * @param el the Element to insert and make the current element 395 */ 396 private void doInsertElement(Element el) { 397 enforceStackDepthLimit(); 398 399 if (formElement != null && el.tag().namespace.equals(NamespaceHtml) && StringUtil.inSorted(el.normalName(), TagFormListed)) 400 formElement.addElement(el); // connect form controls to their form element 401 402 // in HTML, the xmlns attribute if set must match what the parser set the tag's namespace to 403 if (parser.getErrors().canAddError() && el.hasAttr("xmlns") && !el.attr("xmlns").equals(el.tag().namespace())) 404 error("Invalid xmlns attribute [%s] on tag [%s]", el.attr("xmlns"), el.tagName()); 405 406 if (isFosterInserts() && StringUtil.inSorted(currentElement().normalName(), InTableFoster)) 407 insertInFosterParent(el); 408 else 409 currentElement().appendChild(el); 410 411 push(el); 412 } 413 414 void insertCommentNode(Token.Comment token) { 415 Comment node = new Comment(token.getData()); 416 currentElement().appendChild(node); 417 onNodeInserted(node); 418 } 419 420 /** Inserts the provided character token into the current element. Any nulls in the data will be removed. */ 421 void insertCharacterNode(Token.Character characterToken) { 422 insertCharacterNode(characterToken, false); 423 } 424 425 /** 426 Inserts the provided character token into the current element. The tokenizer will have already raised precise character errors. 427 428 @param characterToken the character token to insert 429 @param replace if true, replaces any null chars in the data with the replacement char (U+FFFD). If false, removes 430 null chars. 431 */ 432 void insertCharacterNode(Token.Character characterToken, boolean replace) { 433 characterToken.normalizeNulls(replace); 434 Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack) 435 insertCharacterToElement(characterToken, el); 436 } 437 438 /** Inserts the provided character token into the provided element. */ 439 void insertCharacterToElement(Token.Character characterToken, Element el) { 440 final Node node; 441 final String data = characterToken.getData(); 442 443 if (characterToken.isCData()) 444 node = new CDataNode(data); 445 else if (el.tag().is(Tag.Data)) 446 node = new DataNode(data); 447 else 448 node = new TextNode(data); 449 el.appendChild(node); // doesn't use insertNode, because we don't foster these; and will always have a stack. 450 onNodeInserted(node); 451 } 452 453 ArrayList<Element> getStack() { 454 return stack; 455 } 456 457 boolean onStack(Element el) { 458 return onStack(stack, el); 459 } 460 461 /** Checks if there is an HTML element with the given name on the stack. */ 462 boolean onStack(String elName) { 463 return getFromStack(elName) != null; 464 } 465 466 private static final int maxQueueDepth = 256; // an arbitrary tension point between real HTML and crafted pain 467 private static boolean onStack(ArrayList<Element> queue, Element element) { 468 final int bottom = queue.size() - 1; 469 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 470 for (int pos = bottom; pos >= upper; pos--) { 471 Element next = queue.get(pos); 472 if (next == element) { 473 return true; 474 } 475 } 476 return false; 477 } 478 479 /** Gets the nearest (lowest) HTML element with the given name from the stack. */ 480 @Nullable 481 Element getFromStack(String elName) { 482 final int bottom = stack.size() - 1; 483 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 484 for (int pos = bottom; pos >= upper; pos--) { 485 Element next = stack.get(pos); 486 if (next.elementIs(elName, NamespaceHtml)) { 487 return next; 488 } 489 } 490 return null; 491 } 492 493 boolean removeFromStack(Element el) { 494 for (int pos = stack.size() -1; pos >= 0; pos--) { 495 Element next = stack.get(pos); 496 if (next == el) { 497 stack.remove(pos); 498 onNodeClosed(el); 499 return true; 500 } 501 } 502 return false; 503 } 504 505 @Override 506 void onStackPrunedForDepth(Element element) { 507 // handle other effects of popping to keep state correct 508 if (element == headElement) headElement = null; 509 if (element == formElement) setFormElement(null); 510 removeFromActiveFormattingElements(element); 511 if (element.nameIs("template")) { 512 clearFormattingElementsToLastMarker(); 513 if (templateModeSize() > 0) 514 popTemplateMode(); 515 resetInsertionMode(); 516 } 517 } 518 519 /** Pops the stack until the given HTML element is removed. */ 520 @Nullable 521 Element popStackToClose(String elName) { 522 for (int pos = stack.size() -1; pos >= 0; pos--) { 523 Element el = pop(); 524 if (el.elementIs(elName, NamespaceHtml)) { 525 return el; 526 } 527 } 528 return null; 529 } 530 531 /** Pops the stack until an element with the supplied name is removed, irrespective of namespace. */ 532 @Nullable 533 Element popStackToCloseAnyNamespace(String elName) { 534 for (int pos = stack.size() -1; pos >= 0; pos--) { 535 Element el = pop(); 536 if (el.nameIs(elName)) { 537 return el; 538 } 539 } 540 return null; 541 } 542 543 /** Pops the stack until one of the given HTML elements is removed. */ 544 void popStackToClose(String... elNames) { // elnames is sorted, comes from Constants 545 for (int pos = stack.size() -1; pos >= 0; pos--) { 546 Element el = pop(); 547 if (inSorted(el.normalName(), elNames) && NamespaceHtml.equals(el.tag().namespace())) { 548 break; 549 } 550 } 551 } 552 553 void clearStackToTableContext() { 554 clearStackToContext("table", "template"); 555 } 556 557 void clearStackToTableBodyContext() { 558 clearStackToContext("tbody", "tfoot", "thead", "template"); 559 } 560 561 void clearStackToTableRowContext() { 562 clearStackToContext("tr", "template"); 563 } 564 565 /** Removes elements from the stack until one of the supplied HTML elements is removed. */ 566 private void clearStackToContext(String... nodeNames) { 567 for (int pos = stack.size() -1; pos >= 0; pos--) { 568 Element next = stack.get(pos); 569 if (NamespaceHtml.equals(next.tag().namespace()) && 570 (StringUtil.in(next.normalName(), nodeNames) || next.nameIs("html"))) 571 break; 572 else 573 pop(); 574 } 575 } 576 577 /** 578 Gets the Element immediately above the supplied element on the stack. Which due to adoption, may not necessarily be 579 its parent. 580 581 @param el 582 @return the Element immediately above the supplied element, or null if there is no such element. 583 */ 584 @Nullable Element aboveOnStack(Element el) { 585 if (!onStack(el)) return null; 586 for (int pos = stack.size() -1; pos > 0; pos--) { 587 Element next = stack.get(pos); 588 if (next == el) { 589 return stack.get(pos-1); 590 } 591 } 592 return null; 593 } 594 595 void insertOnStackAfter(Element after, Element in) { 596 int i = stack.lastIndexOf(after); 597 if (i == -1) { 598 error("Did not find element on stack to insert after"); 599 stack.add(in); 600 // may happen on particularly malformed inputs during adoption 601 } else { 602 stack.add(i+1, in); 603 } 604 } 605 606 void replaceOnStack(Element out, Element in) { 607 replaceInQueue(stack, out, in); 608 } 609 610 private static void replaceInQueue(ArrayList<Element> queue, Element out, Element in) { 611 int i = queue.lastIndexOf(out); 612 Validate.isTrue(i != -1); 613 queue.set(i, in); 614 } 615 616 /** 617 * Reset the insertion mode, by searching up the stack for an appropriate insertion mode. The stack search depth 618 * is limited to {@link #maxQueueDepth}. 619 * @return true if the insertion mode was actually changed. 620 */ 621 boolean resetInsertionMode() { 622 // https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode 623 boolean last = false; 624 final int bottom = stack.size() - 1; 625 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 626 final HtmlTreeBuilderState origState = this.state; 627 628 if (stack.size() == 0) { // nothing left of stack, just get to body 629 transition(HtmlTreeBuilderState.InBody); 630 } 631 632 LOOP: for (int pos = bottom; pos >= upper; pos--) { 633 Element node = stack.get(pos); 634 if (pos == upper) { 635 last = true; 636 if (fragmentParsing) 637 node = contextElement; 638 } 639 String name = node != null ? node.normalName() : ""; 640 if (!NamespaceHtml.equals(node.tag().namespace())) 641 continue; // only looking for HTML elements here 642 643 switch (name) { 644 case "select": 645 transition(HtmlTreeBuilderState.InSelect); 646 // todo - should loop up (with some limit) and check for table or template hits 647 break LOOP; 648 case "td": 649 case "th": 650 if (!last) { 651 transition(HtmlTreeBuilderState.InCell); 652 break LOOP; 653 } 654 break; 655 case "tr": 656 transition(HtmlTreeBuilderState.InRow); 657 break LOOP; 658 case "tbody": 659 case "thead": 660 case "tfoot": 661 transition(HtmlTreeBuilderState.InTableBody); 662 break LOOP; 663 case "caption": 664 transition(HtmlTreeBuilderState.InCaption); 665 break LOOP; 666 case "colgroup": 667 transition(HtmlTreeBuilderState.InColumnGroup); 668 break LOOP; 669 case "table": 670 transition(HtmlTreeBuilderState.InTable); 671 break LOOP; 672 case "template": 673 HtmlTreeBuilderState tmplState = currentTemplateMode(); 674 Validate.notNull(tmplState, "Bug: no template insertion mode on stack!"); 675 transition(tmplState); 676 break LOOP; 677 case "head": 678 if (!last) { 679 transition(HtmlTreeBuilderState.InHead); 680 break LOOP; 681 } 682 break; 683 case "body": 684 transition(HtmlTreeBuilderState.InBody); 685 break LOOP; 686 case "frameset": 687 transition(HtmlTreeBuilderState.InFrameset); 688 break LOOP; 689 case "html": 690 transition(headElement == null ? HtmlTreeBuilderState.BeforeHead : HtmlTreeBuilderState.AfterHead); 691 break LOOP; 692 } 693 if (last) { 694 transition(HtmlTreeBuilderState.InBody); 695 break; 696 } 697 } 698 return state != origState; 699 } 700 701 /** Places the body back onto the stack and moves to InBody, for cases in AfterBody / AfterAfterBody when more content comes */ 702 void resetBody() { 703 if (!onStack("body")) { 704 stack.add(doc.body()); // not onNodeInserted, as already seen 705 } 706 transition(HtmlTreeBuilderState.InBody); 707 } 708 709 // todo: tidy up in specific scope methods 710 private final String[] specificScopeTarget = {null}; 711 712 private boolean inSpecificScope(String targetName, String[] baseTypes, String[] extraTypes) { 713 specificScopeTarget[0] = targetName; 714 return inSpecificScope(specificScopeTarget, baseTypes, extraTypes); 715 } 716 717 private boolean inSpecificScope(String[] targetNames, String[] baseTypes, @Nullable String[] extraTypes) { 718 // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-the-specific-scope 719 final int bottom = stack.size() -1; 720 // don't walk too far up the tree 721 for (int pos = bottom; pos >= 0; pos--) { 722 Element el = stack.get(pos); 723 String elName = el.normalName(); 724 // namespace checks - arguments provided are always in html ns, with this bolt-on for math and svg: 725 String ns = el.tag().namespace(); 726 if (ns.equals(NamespaceHtml)) { 727 if (inSorted(elName, targetNames)) 728 return true; 729 if (inSorted(elName, baseTypes)) 730 return false; 731 if (extraTypes != null && inSorted(elName, extraTypes)) 732 return false; 733 } else if (baseTypes == TagsSearchInScope) { 734 if (ns.equals(NamespaceMathml) && inSorted(elName, TagSearchInScopeMath)) 735 return false; 736 if (ns.equals(NamespaceSvg) && inSorted(elName, TagSearchInScopeSvg)) 737 return false; 738 } 739 } 740 //Validate.fail("Should not be reachable"); // would end up false because hitting 'html' at root (basetypes) 741 return false; 742 } 743 744 boolean inScope(String[] targetNames) { 745 return inSpecificScope(targetNames, TagsSearchInScope, null); 746 } 747 748 boolean inScope(String targetName) { 749 return inScope(targetName, null); 750 } 751 752 boolean inScope(String targetName, String[] extras) { 753 return inSpecificScope(targetName, TagsSearchInScope, extras); 754 } 755 756 boolean inListItemScope(String targetName) { 757 return inScope(targetName, TagSearchList); 758 } 759 760 boolean inButtonScope(String targetName) { 761 return inScope(targetName, TagSearchButton); 762 } 763 764 boolean inTableScope(String targetName) { 765 return inSpecificScope(targetName, TagSearchTableScope, null); 766 } 767 768 boolean inSelectScope(String targetName) { 769 for (int pos = stack.size() -1; pos >= 0; pos--) { 770 Element el = stack.get(pos); 771 String elName = el.normalName(); 772 if (elName.equals(targetName)) 773 return true; 774 if (!inSorted(elName, TagSearchSelectScope)) // all elements except 775 return false; 776 } 777 return false; // nothing left on stack 778 } 779 780 /** Tests if there is some element on the stack that is not in the provided set. */ 781 boolean onStackNot(String[] allowedTags) { 782 for (int pos = stack.size() - 1; pos >= 0; pos--) { 783 final String elName = stack.get(pos).normalName(); 784 if (!inSorted(elName, allowedTags)) 785 return true; 786 } 787 return false; 788 } 789 790 void setHeadElement(Element headElement) { 791 this.headElement = headElement; 792 } 793 794 Element getHeadElement() { 795 return headElement; 796 } 797 798 boolean isFosterInserts() { 799 return fosterInserts; 800 } 801 802 void setFosterInserts(boolean fosterInserts) { 803 this.fosterInserts = fosterInserts; 804 } 805 806 @Nullable FormElement getFormElement() { 807 return formElement; 808 } 809 810 void setFormElement(FormElement formElement) { 811 this.formElement = formElement; 812 } 813 814 void resetPendingTableCharacters() { 815 pendingTableCharacters.clear(); 816 } 817 818 List<Token.Character> getPendingTableCharacters() { 819 return pendingTableCharacters; 820 } 821 822 void addPendingTableCharacters(Token.Character c) { 823 // make a copy of the token to maintain its state (as Tokens are otherwise reset) 824 Token.Character copy = new Token.Character(c); 825 pendingTableCharacters.add(copy); 826 } 827 828 /** 829 13.2.6.3 Closing elements that have implied end tags 830 When the steps below require the UA to generate implied end tags, then, while the current node is a dd element, a dt element, an li element, an optgroup element, an option element, a p element, an rb element, an rp element, an rt element, or an rtc element, the UA must pop the current node off the stack of open elements. 831 832 If a step requires the UA to generate implied end tags but lists an element to exclude from the process, then the UA must perform the above steps as if that element was not in the above list. 833 834 When the steps below require the UA to generate all implied end tags thoroughly, then, while the current node is a caption element, a colgroup element, a dd element, a dt element, an li element, an optgroup element, an option element, a p element, an rb element, an rp element, an rt element, an rtc element, a tbody element, a td element, a tfoot element, a th element, a thead element, or a tr element, the UA must pop the current node off the stack of open elements. 835 836 @param excludeTag If a step requires the UA to generate implied end tags but lists an element to exclude from the 837 process, then the UA must perform the above steps as if that element was not in the above list. 838 */ 839 void generateImpliedEndTags(String excludeTag) { 840 while (inSorted(currentElement().normalName(), TagSearchEndTags)) { 841 if (excludeTag != null && currentElementIs(excludeTag)) 842 break; 843 pop(); 844 } 845 } 846 847 void generateImpliedEndTags() { 848 generateImpliedEndTags(false); 849 } 850 851 /** 852 Pops HTML elements off the stack according to the implied end tag rules 853 @param thorough if we are thorough (includes table elements etc) or not 854 */ 855 void generateImpliedEndTags(boolean thorough) { 856 final String[] search = thorough ? TagThoroughSearchEndTags : TagSearchEndTags; 857 while (NamespaceHtml.equals(currentElement().tag().namespace()) 858 && inSorted(currentElement().normalName(), search)) { 859 pop(); 860 } 861 } 862 863 void closeElement(String name) { 864 generateImpliedEndTags(name); 865 if (!name.equals(currentElement().normalName())) error(state()); 866 popStackToClose(name); 867 } 868 869 static boolean isSpecial(Element el) { 870 String namespace = el.tag().namespace(); 871 String name = el.normalName(); 872 switch (namespace) { 873 case NamespaceHtml: 874 return inSorted(name, TagSearchSpecial); 875 case Parser.NamespaceMathml: 876 return inSorted(name, TagSearchSpecialMath); 877 case Parser.NamespaceSvg: 878 return inSorted(name, TagSvgHtmlIntegration); 879 default: 880 return false; 881 } 882 } 883 884 Element lastFormattingElement() { 885 return formattingElements.size() > 0 ? formattingElements.get(formattingElements.size()-1) : null; 886 } 887 888 int positionOfElement(Element el){ 889 for (int i = 0; i < formattingElements.size(); i++){ 890 if (el == formattingElements.get(i)) 891 return i; 892 } 893 return -1; 894 } 895 896 Element removeLastFormattingElement() { 897 int size = formattingElements.size(); 898 if (size > 0) 899 return formattingElements.remove(size-1); 900 else 901 return null; 902 } 903 904 // active formatting elements 905 void pushActiveFormattingElements(Element in) { 906 checkActiveFormattingElements(in); 907 formattingElements.add(in); 908 } 909 910 void pushWithBookmark(Element in, int bookmark){ 911 checkActiveFormattingElements(in); 912 // catch any range errors and assume bookmark is incorrect - saves a redundant range check. 913 try { 914 formattingElements.add(bookmark, in); 915 } catch (IndexOutOfBoundsException e) { 916 formattingElements.add(in); 917 } 918 } 919 920 void checkActiveFormattingElements(Element in){ 921 int numSeen = 0; 922 final int size = formattingElements.size() -1; 923 int ceil = size - maxUsedFormattingElements; if (ceil <0) ceil = 0; 924 925 for (int pos = size; pos >= ceil; pos--) { 926 Element el = formattingElements.get(pos); 927 if (el == null) // marker 928 break; 929 930 if (isSameFormattingElement(in, el)) 931 numSeen++; 932 933 if (numSeen == 3) { 934 formattingElements.remove(pos); 935 break; 936 } 937 } 938 } 939 940 private static boolean isSameFormattingElement(Element a, Element b) { 941 // same if: same namespace, tag, and attributes. Element.equals only checks tag, might in future check children 942 return a.normalName().equals(b.normalName()) && 943 // a.namespace().equals(b.namespace()) && 944 a.attributes().equals(b.attributes()); 945 // todo: namespaces 946 } 947 948 void reconstructFormattingElements() { 949 if (stack.size() > maxQueueDepth) 950 return; 951 Element last = lastFormattingElement(); 952 if (last == null || onStack(last)) 953 return; 954 955 Element entry = last; 956 int size = formattingElements.size(); 957 int ceil = size - maxUsedFormattingElements; if (ceil <0) ceil = 0; 958 int pos = size - 1; 959 boolean skip = false; 960 while (true) { 961 if (pos == ceil) { // step 4. if none before, skip to 8 962 skip = true; 963 break; 964 } 965 entry = formattingElements.get(--pos); // step 5. one earlier than entry 966 if (entry == null || onStack(entry)) // step 6 - neither marker nor on stack 967 break; // jump to 8, else continue back to 4 968 } 969 while(true) { 970 if (!skip) // step 7: on later than entry 971 entry = formattingElements.get(++pos); 972 Validate.notNull(entry); // should not occur, as we break at last element 973 974 // 8. create new element from element, 9 insert into current node, onto stack 975 skip = false; // can only skip increment from 4. 976 Element newEl = new Element(tagFor(entry.nodeName(), entry.normalName(), defaultNamespace(), settings), null, entry.attributes().clone()); 977 doInsertElement(newEl); 978 979 // 10. replace entry with new entry 980 formattingElements.set(pos, newEl); 981 982 // 11 983 if (pos == size-1) // if not last entry in list, jump to 7 984 break; 985 } 986 } 987 private static final int maxUsedFormattingElements = 12; // limit how many elements get recreated 988 989 void clearFormattingElementsToLastMarker() { 990 while (!formattingElements.isEmpty()) { 991 Element el = removeLastFormattingElement(); 992 if (el == null) 993 break; 994 } 995 } 996 997 void removeFromActiveFormattingElements(Element el) { 998 for (int pos = formattingElements.size() -1; pos >= 0; pos--) { 999 Element next = formattingElements.get(pos); 1000 if (next == el) { 1001 formattingElements.remove(pos); 1002 break; 1003 } 1004 } 1005 } 1006 1007 boolean isInActiveFormattingElements(Element el) { 1008 return onStack(formattingElements, el); 1009 } 1010 1011 @Nullable 1012 Element getActiveFormattingElement(String nodeName) { 1013 for (int pos = formattingElements.size() -1; pos >= 0; pos--) { 1014 Element next = formattingElements.get(pos); 1015 if (next == null) // scope marker 1016 break; 1017 else if (next.nameIs(nodeName)) 1018 return next; 1019 } 1020 return null; 1021 } 1022 1023 void replaceActiveFormattingElement(Element out, Element in) { 1024 replaceInQueue(formattingElements, out, in); 1025 } 1026 1027 void insertMarkerToFormattingElements() { 1028 formattingElements.add(null); 1029 } 1030 1031 void insertInFosterParent(Node in) { 1032 Element fosterParent; 1033 Element lastTable = getFromStack("table"); 1034 boolean isLastTableParent = false; 1035 if (lastTable != null) { 1036 if (lastTable.parent() != null) { 1037 fosterParent = lastTable.parent(); 1038 isLastTableParent = true; 1039 } else 1040 fosterParent = aboveOnStack(lastTable); 1041 } else { // no table == frag 1042 fosterParent = stack.get(0); 1043 } 1044 1045 if (isLastTableParent) { 1046 Validate.notNull(lastTable); // last table cannot be null by this point. 1047 lastTable.before(in); 1048 } 1049 else 1050 fosterParent.appendChild(in); 1051 } 1052 1053 // Template Insertion Mode stack 1054 void pushTemplateMode(HtmlTreeBuilderState state) { 1055 tmplInsertMode.add(state); 1056 } 1057 1058 @Nullable HtmlTreeBuilderState popTemplateMode() { 1059 if (tmplInsertMode.size() > 0) { 1060 return tmplInsertMode.remove(tmplInsertMode.size() -1); 1061 } else { 1062 return null; 1063 } 1064 } 1065 1066 int templateModeSize() { 1067 return tmplInsertMode.size(); 1068 } 1069 1070 @Nullable HtmlTreeBuilderState currentTemplateMode() { 1071 return (tmplInsertMode.size() > 0) ? tmplInsertMode.get(tmplInsertMode.size() -1) : null; 1072 } 1073 1074 @Override 1075 public String toString() { 1076 return "TreeBuilder{" + 1077 "currentToken=" + currentToken + 1078 ", state=" + state + 1079 ", currentElement=" + currentElement() + 1080 '}'; 1081 } 1082 1083}