001package org.jsoup.nodes; 002 003import org.jsoup.Connection; 004import org.jsoup.Jsoup; 005import org.jsoup.helper.DataUtil; 006import org.jsoup.helper.Validate; 007import org.jsoup.internal.StringUtil; 008import org.jsoup.parser.ParseSettings; 009import org.jsoup.parser.Parser; 010import org.jsoup.parser.Tag; 011import org.jsoup.select.Elements; 012import org.jsoup.select.Evaluator; 013import org.jsoup.select.Selector; 014import org.jspecify.annotations.Nullable; 015 016import java.nio.charset.Charset; 017import java.util.List; 018 019import static org.jsoup.parser.Parser.NamespaceHtml; 020 021/** 022 A HTML Document. 023 024 @author Jonathan Hedley, jonathan@hedley.net */ 025public class Document extends Element { 026 private @Nullable Connection connection; // the connection this doc was fetched from, if any 027 private OutputSettings outputSettings = new OutputSettings(); 028 private Parser parser; // the parser used to parse this document 029 private QuirksMode quirksMode = QuirksMode.noQuirks; 030 private final String location; 031 032 /** 033 Create a new, empty Document, in the specified namespace. 034 @param namespace the namespace of this Document's root node. 035 @param baseUri base URI of document 036 @see org.jsoup.Jsoup#parse 037 @see #createShell 038 */ 039 public Document(String namespace, String baseUri) { 040 this(namespace, baseUri, Parser.htmlParser()); // default HTML parser, but overridable 041 } 042 043 private Document(String namespace, String baseUri, Parser parser) { 044 super(new Tag("#root", namespace), baseUri); 045 this.location = baseUri; 046 this.parser = parser; 047 } 048 049 /** 050 Create a new, empty Document, in the HTML namespace. 051 @param baseUri base URI of document 052 @see org.jsoup.Jsoup#parse 053 @see #Document(String namespace, String baseUri) 054 */ 055 public Document(String baseUri) { 056 this(NamespaceHtml, baseUri); 057 } 058 059 /** 060 Create a valid, empty shell of an HTML document, suitable for adding more elements to. 061 @param baseUri baseUri of document 062 @return document with html, head, and body elements. 063 */ 064 public static Document createShell(String baseUri) { 065 Validate.notNull(baseUri); 066 067 Document doc = new Document(baseUri); 068 Element html = doc.appendElement("html"); 069 html.appendElement("head"); 070 html.appendElement("body"); 071 072 return doc; 073 } 074 075 /** 076 * Get the URL this Document was parsed from. If the starting URL is a redirect, 077 * this will return the final URL from which the document was served from. 078 * <p>Will return an empty string if the location is unknown (e.g. if parsed from a String). 079 * @return location 080 */ 081 public String location() { 082 return location; 083 } 084 085 /** 086 Returns the Connection (Request/Response) object that was used to fetch this document, if any; otherwise, a new 087 default Connection object. This can be used to continue a session, preserving settings and cookies, etc. 088 @return the Connection (session) associated with this Document, or an empty one otherwise. 089 @see Connection#newRequest() 090 */ 091 public Connection connection() { 092 if (connection == null) 093 return Jsoup.newSession(); 094 else 095 return connection; 096 } 097 098 /** 099 * Returns this Document's doctype. 100 * @return document type, or null if not set 101 */ 102 public @Nullable DocumentType documentType() { 103 for (Node node : childNodes) { 104 if (node instanceof DocumentType) 105 return (DocumentType) node; 106 else if (!(node instanceof LeafNode)) // scans forward across comments, text, processing instructions etc 107 break; 108 } 109 return null; 110 } 111 112 /** 113 Find the root HTML element, or create it if it doesn't exist. 114 @return the root HTML element. 115 */ 116 private Element htmlEl() { 117 Element el = firstElementChild(); 118 while (el != null) { 119 if (el.nameIs("html")) 120 return el; 121 el = el.nextElementSibling(); 122 } 123 return appendElement("html"); 124 } 125 126 /** 127 Get this document's {@code head} element. 128 <p> 129 As a side effect, if this Document does not already have an HTML structure, it will be created. If you do not want 130 that, use {@code #selectFirst("head")} instead. 131 132 @return {@code head} element. 133 */ 134 public Element head() { 135 final Element html = htmlEl(); 136 Element el = html.firstElementChild(); 137 while (el != null) { 138 if (el.nameIs("head")) 139 return el; 140 el = el.nextElementSibling(); 141 } 142 return html.prependElement("head"); 143 } 144 145 /** 146 Get this document's {@code <body>} or {@code <frameset>} element. 147 <p> 148 As a <b>side-effect</b>, if this Document does not already have an HTML structure, it will be created with a {@code 149 <body>} element. If you do not want that, use {@code #selectFirst("body")} instead. 150 151 @return {@code body} element for documents with a {@code <body>}, a new {@code <body>} element if the document 152 had no contents, or the outermost {@code <frameset> element} for frameset documents. 153 */ 154 public Element body() { 155 final Element html = htmlEl(); 156 Element el = html.firstElementChild(); 157 while (el != null) { 158 if (el.nameIs("body") || el.nameIs("frameset")) 159 return el; 160 el = el.nextElementSibling(); 161 } 162 return html.appendElement("body"); 163 } 164 165 /** 166 Get each of the {@code <form>} elements contained in this document. 167 @return a List of FormElement objects, which will be empty if there are none. 168 @see Elements#forms() 169 @see FormElement#elements() 170 @since 1.15.4 171 */ 172 public List<FormElement> forms() { 173 return select("form").forms(); 174 } 175 176 /** 177 Selects the first {@link FormElement} in this document that matches the query. If none match, throws an 178 {@link IllegalArgumentException}. 179 @param cssQuery a {@link Selector} CSS query 180 @return the first matching {@code <form>} element 181 @throws IllegalArgumentException if no match is found 182 @since 1.15.4 183 */ 184 public FormElement expectForm(String cssQuery) { 185 Elements els = select(cssQuery); 186 for (Element el : els) { 187 if (el instanceof FormElement) return (FormElement) el; 188 } 189 Validate.fail("No form elements matched the query '%s' in the document.", cssQuery); 190 return null; // (not really) 191 } 192 193 /** 194 Get the string contents of the document's {@code title} element. 195 @return Trimmed title, or empty string if none set. 196 */ 197 public String title() { 198 // title is a preserve whitespace tag (for document output), but normalised here 199 Element titleEl = head().selectFirst(titleEval); 200 return titleEl != null ? StringUtil.normaliseWhitespace(titleEl.text()).trim() : ""; 201 } 202 private static final Evaluator titleEval = new Evaluator.Tag("title"); 203 204 /** 205 Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if 206 not present 207 @param title string to set as title 208 */ 209 public void title(String title) { 210 Validate.notNull(title); 211 Element titleEl = head().selectFirst(titleEval); 212 if (titleEl == null) // add to head 213 titleEl = head().appendElement("title"); 214 titleEl.text(title); 215 } 216 217 /** 218 Create a new Element, with this document's base uri. Does not make the new element a child of this document. 219 @param tagName element tag name (e.g. {@code a}) 220 @return new element 221 */ 222 public Element createElement(String tagName) { 223 return new Element( 224 parser.tagSet().valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), 225 searchUpForAttribute(this, BaseUriKey) 226 ); 227 } 228 229 @Override 230 public String outerHtml() { 231 return super.html(); // no outer wrapper tag 232 } 233 234 /** 235 Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared. 236 @param text un-encoded text 237 @return this document 238 */ 239 @Override 240 public Element text(String text) { 241 body().text(text); // overridden to not nuke doc structure 242 return this; 243 } 244 245 @Override 246 public String nodeName() { 247 return "#document"; 248 } 249 250 /** 251 Set the output character set of this Document. This method is equivalent to 252 {@link OutputSettings#charset(java.nio.charset.Charset) OutputSettings.charset(Charset)}, but additionally adds or 253 updates the charset / encoding element within the Document. 254 255 <p>If there's no existing element with charset / encoding information yet, one will 256 be created. Obsolete charset / encoding definitions are removed.</p> 257 258 <p><b>Elements used:</b></p> 259 260 <ul> 261 <li><b>HTML:</b> <i><meta charset="CHARSET"></i></li> 262 <li><b>XML:</b> <i><?xml version="1.0" encoding="CHARSET"></i></li> 263 </ul> 264 265 @param charset Charset 266 @see OutputSettings#charset(java.nio.charset.Charset) 267 */ 268 public void charset(Charset charset) { 269 outputSettings.charset(charset); 270 ensureMetaCharsetElement(); 271 } 272 273 /** 274 Get the output character set of this Document. This method is equivalent to {@link OutputSettings#charset()}. 275 276 @return the current Charset 277 @see OutputSettings#charset() 278 */ 279 public Charset charset() { 280 return outputSettings.charset(); 281 } 282 283 @Override 284 public Document clone() { 285 Document clone = (Document) super.clone(); 286 if (attributes != null) clone.attributes = attributes.clone(); 287 clone.outputSettings = this.outputSettings.clone(); 288 // parser is pointer copy 289 return clone; 290 } 291 292 @Override 293 public Document shallowClone() { 294 Document clone = new Document(this.tag().namespace(), baseUri(), parser); // preserves parser pointer 295 if (attributes != null) clone.attributes = attributes.clone(); 296 clone.outputSettings = this.outputSettings.clone(); 297 return clone; 298 } 299 300 301 private void ensureMetaCharsetElement() { 302 OutputSettings.Syntax syntax = outputSettings().syntax(); 303 304 if (syntax == OutputSettings.Syntax.html) { 305 Element metaCharset = selectFirst("meta[charset]"); 306 if (metaCharset != null) { 307 metaCharset.attr("charset", charset().displayName()); 308 } else { 309 head().appendElement("meta").attr("charset", charset().displayName()); 310 } 311 select("meta[name=charset]").remove(); // Remove obsolete elements 312 } else if (syntax == OutputSettings.Syntax.xml) { 313 XmlDeclaration decl = ensureXmlDecl(); 314 decl.attr("version", "1.0"); 315 decl.attr("encoding", charset().displayName()); 316 } 317 } 318 319 private XmlDeclaration ensureXmlDecl() { 320 Node node = firstChild(); 321 if (node instanceof XmlDeclaration) { 322 XmlDeclaration decl = (XmlDeclaration) node; 323 if (decl.name().equals("xml")) return decl; 324 } 325 XmlDeclaration decl = new XmlDeclaration("xml", false); 326 prependChild(decl); 327 return decl; 328 } 329 330 331 /** 332 * A Document's output settings control the form of the text() and html() methods. 333 */ 334 public static class OutputSettings implements Cloneable { 335 /** 336 * The output serialization syntax. 337 */ 338 public enum Syntax {html, xml} 339 private Entities.EscapeMode escapeMode = Entities.EscapeMode.base; 340 private Charset charset = DataUtil.UTF_8; 341 private boolean prettyPrint = true; 342 private boolean outline = false; 343 private int indentAmount = 1; 344 private int maxPaddingWidth = 30; 345 private Syntax syntax = Syntax.html; 346 347 /** 348 Create a new OutputSettings object, with the default settings (UTF-8, HTML, EscapeMode.base, pretty-printing, 349 indent amount of 1). 350 */ 351 public OutputSettings() { 352 } 353 354 /** 355 Get the document's current entity escape mode: 356 <ul> 357 <li><code>xhtml</code>, the minimal named entities in XHTML / XML</li> 358 <li><code>base</code>, which provides a limited set of named HTML 359 entities and escapes other characters as numbered entities for maximum compatibility</li> 360 <li><code>extended</code>, 361 which uses the complete set of HTML named entities.</li> 362 </ul> 363 <p>The default escape mode is <code>base</code>. 364 @return the document's current escape mode 365 */ 366 public Entities.EscapeMode escapeMode() { 367 return escapeMode; 368 } 369 370 /** 371 * Set the document's escape mode, which determines how characters are escaped when the output character set 372 * does not support a given character:- using either a named or a numbered escape. 373 * @param escapeMode the new escape mode to use 374 * @return the document's output settings, for chaining 375 */ 376 public OutputSettings escapeMode(Entities.EscapeMode escapeMode) { 377 this.escapeMode = escapeMode; 378 return this; 379 } 380 381 /** 382 * Get the document's current output charset, which is used to control which characters are escaped when 383 * generating HTML (via the <code>html()</code> methods), and which are kept intact. 384 * <p> 385 * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the 386 * input charset. Otherwise, it defaults to UTF-8. 387 * @return the document's current charset. 388 */ 389 public Charset charset() { 390 return charset; 391 } 392 393 /** 394 * Update the document's output charset. 395 * @param charset the new charset to use. 396 * @return the document's output settings, for chaining 397 */ 398 public OutputSettings charset(Charset charset) { 399 this.charset = charset; 400 return this; 401 } 402 403 /** 404 * Update the document's output charset. 405 * @param charset the new charset (by name) to use. 406 * @return the document's output settings, for chaining 407 */ 408 public OutputSettings charset(String charset) { 409 charset(Charset.forName(charset)); 410 return this; 411 } 412 413 /** 414 * Get the document's current output syntax. 415 * @return current syntax 416 */ 417 public Syntax syntax() { 418 return syntax; 419 } 420 421 /** 422 * Set the document's output syntax. Either {@code html}, with empty tags and boolean attributes (etc), or 423 * {@code xml}, with self-closing tags. 424 * <p>When set to {@link Document.OutputSettings.Syntax#xml xml}, the {@link #escapeMode() escapeMode} is 425 * automatically set to {@link Entities.EscapeMode#xhtml}, but may be subsequently changed if desired.</p> 426 * @param syntax serialization syntax 427 * @return the document's output settings, for chaining 428 */ 429 public OutputSettings syntax(Syntax syntax) { 430 this.syntax = syntax; 431 if (syntax == Syntax.xml) 432 this.escapeMode(Entities.EscapeMode.xhtml); 433 return this; 434 } 435 436 /** 437 * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format 438 * the output, and the output will generally look like the input. 439 * @return if pretty printing is enabled. 440 */ 441 public boolean prettyPrint() { 442 return prettyPrint; 443 } 444 445 /** 446 * Enable or disable pretty printing. 447 * @param pretty new pretty print setting 448 * @return this, for chaining 449 */ 450 public OutputSettings prettyPrint(boolean pretty) { 451 prettyPrint = pretty; 452 return this; 453 } 454 455 /** 456 * Get if outline mode is enabled. Default is false. If enabled, the HTML output methods will consider 457 * all tags as block. 458 * @return if outline mode is enabled. 459 */ 460 public boolean outline() { 461 return outline; 462 } 463 464 /** 465 * Enable or disable HTML outline mode. 466 * @param outlineMode new outline setting 467 * @return this, for chaining 468 */ 469 public OutputSettings outline(boolean outlineMode) { 470 outline = outlineMode; 471 return this; 472 } 473 474 /** 475 * Get the current tag indent amount, used when pretty printing. 476 * @return the current indent amount 477 */ 478 public int indentAmount() { 479 return indentAmount; 480 } 481 482 /** 483 * Set the indent amount for pretty printing 484 * @param indentAmount number of spaces to use for indenting each level. Must be {@literal >=} 0. 485 * @return this, for chaining 486 */ 487 public OutputSettings indentAmount(int indentAmount) { 488 Validate.isTrue(indentAmount >= 0); 489 this.indentAmount = indentAmount; 490 return this; 491 } 492 493 /** 494 * Get the current max padding amount, used when pretty printing 495 * so very deeply nested nodes don't get insane padding amounts. 496 * @return the current indent amount 497 */ 498 public int maxPaddingWidth() { 499 return maxPaddingWidth; 500 } 501 502 /** 503 * Set the max padding amount for pretty printing so very deeply nested nodes don't get insane padding amounts. 504 * @param maxPaddingWidth number of spaces to use for indenting each level of nested nodes. Must be {@literal >=} -1. 505 * Default is 30 and -1 means unlimited. 506 * @return this, for chaining 507 */ 508 public OutputSettings maxPaddingWidth(int maxPaddingWidth) { 509 Validate.isTrue(maxPaddingWidth >= -1); 510 this.maxPaddingWidth = maxPaddingWidth; 511 return this; 512 } 513 514 @Override 515 public OutputSettings clone() { 516 OutputSettings clone; 517 try { 518 clone = (OutputSettings) super.clone(); 519 } catch (CloneNotSupportedException e) { 520 throw new RuntimeException(e); 521 } 522 clone.charset(charset.name()); // new charset, coreCharset, and charset encoder 523 clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name()); 524 // indentAmount, maxPaddingWidth, and prettyPrint are primitives so object.clone() will handle 525 return clone; 526 } 527 } 528 529 /** 530 * Get the document's current output settings. 531 * @return the document's current output settings. 532 */ 533 public OutputSettings outputSettings() { 534 return outputSettings; 535 } 536 537 /** 538 * Set the document's output settings. 539 * @param outputSettings new output settings. 540 * @return this document, for chaining. 541 */ 542 public Document outputSettings(OutputSettings outputSettings) { 543 Validate.notNull(outputSettings); 544 this.outputSettings = outputSettings; 545 return this; 546 } 547 548 public enum QuirksMode { 549 noQuirks, quirks, limitedQuirks 550 } 551 552 public QuirksMode quirksMode() { 553 return quirksMode; 554 } 555 556 public Document quirksMode(QuirksMode quirksMode) { 557 this.quirksMode = quirksMode; 558 return this; 559 } 560 561 /** 562 * Get the parser that was used to parse this document. 563 * @return the parser 564 */ 565 public Parser parser() { 566 return parser; 567 } 568 569 /** 570 * Set the parser used to create this document. This parser is then used when further parsing within this document 571 * is required. 572 * @param parser the configured parser to use when further parsing is required for this document. 573 * @return this document, for chaining. 574 */ 575 public Document parser(Parser parser) { 576 this.parser = parser; 577 return this; 578 } 579 580 /** 581 Set the Connection used to fetch this document. This Connection is used as a session object when further requests are 582 made (e.g. when a form is submitted). 583 584 @param connection to set 585 @return this document, for chaining 586 @see Connection#newRequest() 587 @since 1.14.1 588 */ 589 public Document connection(Connection connection) { 590 Validate.notNull(connection); 591 this.connection = connection; 592 return this; 593 } 594}