001package org.jsoup.nodes;
002
003import org.jsoup.Connection;
004import org.jsoup.Jsoup;
005import org.jsoup.helper.DataUtil;
006import org.jsoup.helper.Validate;
007import org.jsoup.internal.StringUtil;
008import org.jsoup.parser.ParseSettings;
009import org.jsoup.parser.Parser;
010import org.jsoup.parser.Tag;
011import org.jsoup.select.Elements;
012import org.jsoup.select.Evaluator;
013import org.jsoup.select.Selector;
014import org.jspecify.annotations.Nullable;
015
016import java.nio.charset.Charset;
017import java.util.List;
018
019import static org.jsoup.parser.Parser.NamespaceHtml;
020
/**
 An HTML Document.

 @author Jonathan Hedley, jonathan@hedley.net */
025public class Document extends Element {
    // The connection this doc was fetched from, if any. When null, connection() hands out a fresh session instead.
    private @Nullable Connection connection;
    // Output (serialization) settings for this document; replaced wholesale via outputSettings(OutputSettings).
    private OutputSettings outputSettings = new OutputSettings();
    // The parser used to parse this document; createElement also reads its tag set and default namespace.
    private Parser parser;
    // Quirks mode of this document; starts as noQuirks until changed via quirksMode(QuirksMode).
    private QuirksMode quirksMode = QuirksMode.noQuirks;
    // Value returned by location(); assigned once, in the constructor, from the supplied base URI.
    private final String location;
031
    /**
     Create a new, empty Document, in the specified namespace. The document starts out associated with the default
     HTML parser; that can be replaced afterwards with {@link #parser(Parser)}.
     @param namespace the namespace of this Document's root node.
     @param baseUri base URI of document; also becomes the value reported by {@link #location()}
     @see org.jsoup.Jsoup#parse
     @see #createShell
     */
    public Document(String namespace, String baseUri) {
        this(namespace, baseUri, Parser.htmlParser()); // default HTML parser, but overridable
    }
042
043    private Document(String namespace, String baseUri, Parser parser) {
044        super(new Tag("#root", namespace), baseUri);
045        this.location = baseUri;
046        this.parser = parser;
047    }
048
049    /**
050     Create a new, empty Document, in the HTML namespace.
051     @param baseUri base URI of document
052     @see org.jsoup.Jsoup#parse
053     @see #Document(String namespace, String baseUri)
054     */
055    public Document(String baseUri) {
056        this(NamespaceHtml, baseUri);
057    }
058
059    /**
060     Create a valid, empty shell of an HTML document, suitable for adding more elements to.
061     @param baseUri baseUri of document
062     @return document with html, head, and body elements.
063     */
064    public static Document createShell(String baseUri) {
065        Validate.notNull(baseUri);
066
067        Document doc = new Document(baseUri);
068        Element html = doc.appendElement("html");
069        html.appendElement("head");
070        html.appendElement("body");
071
072        return doc;
073    }
074
075    /**
076     * Get the URL this Document was parsed from. If the starting URL is a redirect,
077     * this will return the final URL from which the document was served from.
078     * <p>Will return an empty string if the location is unknown (e.g. if parsed from a String).
079     * @return location
080     */
081    public String location() {
082        return location;
083    }
084
085    /**
086     Returns the Connection (Request/Response) object that was used to fetch this document, if any; otherwise, a new
087     default Connection object. This can be used to continue a session, preserving settings and cookies, etc.
088     @return the Connection (session) associated with this Document, or an empty one otherwise.
089     @see Connection#newRequest()
090     */
091    public Connection connection() {
092        if (connection == null)
093            return Jsoup.newSession();
094        else
095            return connection;
096    }
097
098    /**
099     * Returns this Document's doctype.
100     * @return document type, or null if not set
101     */
102    public @Nullable DocumentType documentType() {
103        for (Node node : childNodes) {
104            if (node instanceof DocumentType)
105                return (DocumentType) node;
106            else if (!(node instanceof LeafNode)) // scans forward across comments, text, processing instructions etc
107                break;
108        }
109        return null;
110    }
111
112    /**
113     Find the root HTML element, or create it if it doesn't exist.
114     @return the root HTML element.
115     */
116    private Element htmlEl() {
117        Element el = firstElementChild();
118        while (el != null) {
119            if (el.nameIs("html"))
120                return el;
121            el = el.nextElementSibling();
122        }
123        return appendElement("html");
124    }
125
126    /**
127     Get this document's {@code head} element.
128     <p>
129     As a side effect, if this Document does not already have an HTML structure, it will be created. If you do not want
130     that, use {@code #selectFirst("head")} instead.
131
132     @return {@code head} element.
133     */
134    public Element head() {
135        final Element html = htmlEl();
136        Element el = html.firstElementChild();
137        while (el != null) {
138            if (el.nameIs("head"))
139                return el;
140            el = el.nextElementSibling();
141        }
142        return html.prependElement("head");
143    }
144
145    /**
146     Get this document's {@code <body>} or {@code <frameset>} element.
147     <p>
148     As a <b>side-effect</b>, if this Document does not already have an HTML structure, it will be created with a {@code
149    <body>} element. If you do not want that, use {@code #selectFirst("body")} instead.
150
151     @return {@code body} element for documents with a {@code <body>}, a new {@code <body>} element if the document
152     had no contents, or the outermost {@code <frameset> element} for frameset documents.
153     */
154    public Element body() {
155        final Element html = htmlEl();
156        Element el = html.firstElementChild();
157        while (el != null) {
158            if (el.nameIs("body") || el.nameIs("frameset"))
159                return el;
160            el = el.nextElementSibling();
161        }
162        return html.appendElement("body");
163    }
164
165    /**
166     Get each of the {@code <form>} elements contained in this document.
167     @return a List of FormElement objects, which will be empty if there are none.
168     @see Elements#forms()
169     @see FormElement#elements()
170     @since 1.15.4
171     */
172    public List<FormElement> forms() {
173        return select("form").forms();
174    }
175
176    /**
177     Selects the first {@link FormElement} in this document that matches the query. If none match, throws an
178     {@link IllegalArgumentException}.
179     @param cssQuery a {@link Selector} CSS query
180     @return the first matching {@code <form>} element
181     @throws IllegalArgumentException if no match is found
182     @since 1.15.4
183     */
184    public FormElement expectForm(String cssQuery) {
185        Elements els = select(cssQuery);
186        for (Element el : els) {
187            if (el instanceof FormElement) return (FormElement) el;
188        }
189        Validate.fail("No form elements matched the query '%s' in the document.", cssQuery);
190        return null; // (not really)
191    }
192
193    /**
194     Get the string contents of the document's {@code title} element.
195     @return Trimmed title, or empty string if none set.
196     */
197    public String title() {
198        // title is a preserve whitespace tag (for document output), but normalised here
199        Element titleEl = head().selectFirst(titleEval);
200        return titleEl != null ? StringUtil.normaliseWhitespace(titleEl.text()).trim() : "";
201    }
202    private static final Evaluator titleEval = new Evaluator.Tag("title");
203
204    /**
205     Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if
206     not present
207     @param title string to set as title
208     */
209    public void title(String title) {
210        Validate.notNull(title);
211        Element titleEl = head().selectFirst(titleEval);
212        if (titleEl == null) // add to head
213            titleEl = head().appendElement("title");
214        titleEl.text(title);
215    }
216
217    /**
218     Create a new Element, with this document's base uri. Does not make the new element a child of this document.
219     @param tagName element tag name (e.g. {@code a})
220     @return new element
221     */
222    public Element createElement(String tagName) {
223        return new Element(
224            parser.tagSet().valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase),
225            searchUpForAttribute(this, BaseUriKey)
226        );
227    }
228
    /**
     Get the outer HTML of this document. A Document has no wrapper tag of its own, so this is the same as its inner
     HTML.
     @return the document's HTML
     */
    @Override
    public String outerHtml() {
        return super.html(); // no outer wrapper tag
    }
233
234    /**
235     Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared.
236     @param text un-encoded text
237     @return this document
238     */
239    @Override
240    public Element text(String text) {
241        body().text(text); // overridden to not nuke doc structure
242        return this;
243    }
244
    /**
     Get the node name of this document, which is always the fixed string {@code #document}.
     @return {@code "#document"}
     */
    @Override
    public String nodeName() {
        return "#document";
    }
249
250    /**
251     Set the output character set of this Document. This method is equivalent to
252     {@link OutputSettings#charset(java.nio.charset.Charset) OutputSettings.charset(Charset)}, but additionally adds or
253     updates the charset / encoding element within the Document.
254
255     <p>If there's no existing element with charset / encoding information yet, one will
256     be created. Obsolete charset / encoding definitions are removed.</p>
257
258     <p><b>Elements used:</b></p>
259
260     <ul>
261     <li><b>HTML:</b> <i>&lt;meta charset="CHARSET"&gt;</i></li>
262     <li><b>XML:</b> <i>&lt;?xml version="1.0" encoding="CHARSET"&gt;</i></li>
263     </ul>
264
265     @param charset Charset
266     @see OutputSettings#charset(java.nio.charset.Charset)
267     */
268    public void charset(Charset charset) {
269        outputSettings.charset(charset);
270        ensureMetaCharsetElement();
271    }
272
273    /**
274     Get the output character set of this Document. This method is equivalent to {@link OutputSettings#charset()}.
275
276     @return the current Charset
277     @see OutputSettings#charset()
278     */
279    public Charset charset() {
280        return outputSettings.charset();
281    }
282
283    @Override
284    public Document clone() {
285        Document clone = (Document) super.clone();
286        if (attributes != null) clone.attributes = attributes.clone();
287        clone.outputSettings = this.outputSettings.clone();
288        // parser is pointer copy
289        return clone;
290    }
291
292    @Override
293    public Document shallowClone() {
294        Document clone = new Document(this.tag().namespace(), baseUri(), parser); // preserves parser pointer
295        if (attributes != null) clone.attributes = attributes.clone();
296        clone.outputSettings = this.outputSettings.clone();
297        return clone;
298    }
299    
300
301    private void ensureMetaCharsetElement() {
302        OutputSettings.Syntax syntax = outputSettings().syntax();
303
304        if (syntax == OutputSettings.Syntax.html) {
305            Element metaCharset = selectFirst("meta[charset]");
306            if (metaCharset != null) {
307                metaCharset.attr("charset", charset().displayName());
308            } else {
309                head().appendElement("meta").attr("charset", charset().displayName());
310            }
311            select("meta[name=charset]").remove(); // Remove obsolete elements
312        } else if (syntax == OutputSettings.Syntax.xml) {
313            XmlDeclaration decl = ensureXmlDecl();
314            decl.attr("version", "1.0");
315            decl.attr("encoding", charset().displayName());
316        }
317    }
318
319    private XmlDeclaration ensureXmlDecl() {
320        Node node = firstChild();
321        if (node instanceof XmlDeclaration) {
322            XmlDeclaration decl = (XmlDeclaration) node;
323            if (decl.name().equals("xml")) return decl;
324        }
325        XmlDeclaration decl = new XmlDeclaration("xml", false);
326        prependChild(decl);
327        return decl;
328    }
329
330
331    /**
332     * A Document's output settings control the form of the text() and html() methods.
333     */
334    public static class OutputSettings implements Cloneable {
335        /**
336         * The output serialization syntax.
337         */
338        public enum Syntax {html, xml}
339        private Entities.EscapeMode escapeMode = Entities.EscapeMode.base;
340        private Charset charset = DataUtil.UTF_8;
341        private boolean prettyPrint = true;
342        private boolean outline = false;
343        private int indentAmount = 1;
344        private int maxPaddingWidth = 30;
345        private Syntax syntax = Syntax.html;
346
347        /**
348         Create a new OutputSettings object, with the default settings (UTF-8, HTML, EscapeMode.base, pretty-printing,
349         indent amount of 1).
350         */
351        public OutputSettings() {
352        }
353
354        /**
355         Get the document's current entity escape mode:
356         <ul>
357         <li><code>xhtml</code>, the minimal named entities in XHTML / XML</li>
358         <li><code>base</code>, which provides a limited set of named HTML
359         entities and escapes other characters as numbered entities for maximum compatibility</li>
360         <li><code>extended</code>,
361         which uses the complete set of HTML named entities.</li>
362         </ul>
363         <p>The default escape mode is <code>base</code>.
364         @return the document's current escape mode
365         */
366        public Entities.EscapeMode escapeMode() {
367            return escapeMode;
368        }
369
370        /**
371         * Set the document's escape mode, which determines how characters are escaped when the output character set
372         * does not support a given character:- using either a named or a numbered escape.
373         * @param escapeMode the new escape mode to use
374         * @return the document's output settings, for chaining
375         */
376        public OutputSettings escapeMode(Entities.EscapeMode escapeMode) {
377            this.escapeMode = escapeMode;
378            return this;
379        }
380
381        /**
382         * Get the document's current output charset, which is used to control which characters are escaped when
383         * generating HTML (via the <code>html()</code> methods), and which are kept intact.
384         * <p>
385         * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the
386         * input charset. Otherwise, it defaults to UTF-8.
387         * @return the document's current charset.
388         */
389        public Charset charset() {
390            return charset;
391        }
392
393        /**
394         * Update the document's output charset.
395         * @param charset the new charset to use.
396         * @return the document's output settings, for chaining
397         */
398        public OutputSettings charset(Charset charset) {
399            this.charset = charset;
400            return this;
401        }
402
403        /**
404         * Update the document's output charset.
405         * @param charset the new charset (by name) to use.
406         * @return the document's output settings, for chaining
407         */
408        public OutputSettings charset(String charset) {
409            charset(Charset.forName(charset));
410            return this;
411        }
412
413        /**
414         * Get the document's current output syntax.
415         * @return current syntax
416         */
417        public Syntax syntax() {
418            return syntax;
419        }
420
421        /**
422         * Set the document's output syntax. Either {@code html}, with empty tags and boolean attributes (etc), or
423         * {@code xml}, with self-closing tags.
424         * <p>When set to {@link Document.OutputSettings.Syntax#xml xml}, the {@link #escapeMode() escapeMode} is
425         * automatically set to {@link Entities.EscapeMode#xhtml}, but may be subsequently changed if desired.</p>
426         * @param syntax serialization syntax
427         * @return the document's output settings, for chaining
428         */
429        public OutputSettings syntax(Syntax syntax) {
430            this.syntax = syntax;
431            if (syntax == Syntax.xml)
432                this.escapeMode(Entities.EscapeMode.xhtml);
433            return this;
434        }
435
436        /**
437         * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format
438         * the output, and the output will generally look like the input.
439         * @return if pretty printing is enabled.
440         */
441        public boolean prettyPrint() {
442            return prettyPrint;
443        }
444
445        /**
446         * Enable or disable pretty printing.
447         * @param pretty new pretty print setting
448         * @return this, for chaining
449         */
450        public OutputSettings prettyPrint(boolean pretty) {
451            prettyPrint = pretty;
452            return this;
453        }
454        
455        /**
456         * Get if outline mode is enabled. Default is false. If enabled, the HTML output methods will consider
457         * all tags as block.
458         * @return if outline mode is enabled.
459         */
460        public boolean outline() {
461            return outline;
462        }
463        
464        /**
465         * Enable or disable HTML outline mode.
466         * @param outlineMode new outline setting
467         * @return this, for chaining
468         */
469        public OutputSettings outline(boolean outlineMode) {
470            outline = outlineMode;
471            return this;
472        }
473
474        /**
475         * Get the current tag indent amount, used when pretty printing.
476         * @return the current indent amount
477         */
478        public int indentAmount() {
479            return indentAmount;
480        }
481
482        /**
483         * Set the indent amount for pretty printing
484         * @param indentAmount number of spaces to use for indenting each level. Must be {@literal >=} 0.
485         * @return this, for chaining
486         */
487        public OutputSettings indentAmount(int indentAmount) {
488            Validate.isTrue(indentAmount >= 0);
489            this.indentAmount = indentAmount;
490            return this;
491        }
492
493        /**
494         * Get the current max padding amount, used when pretty printing
495         * so very deeply nested nodes don't get insane padding amounts.
496         * @return the current indent amount
497         */
498        public int maxPaddingWidth() {
499            return maxPaddingWidth;
500        }
501
502        /**
503         * Set the max padding amount for pretty printing so very deeply nested nodes don't get insane padding amounts.
504         * @param maxPaddingWidth number of spaces to use for indenting each level of nested nodes. Must be {@literal >=} -1.
505         *        Default is 30 and -1 means unlimited.
506         * @return this, for chaining
507         */
508        public OutputSettings maxPaddingWidth(int maxPaddingWidth) {
509            Validate.isTrue(maxPaddingWidth >= -1);
510            this.maxPaddingWidth = maxPaddingWidth;
511            return this;
512        }
513
514        @Override
515        public OutputSettings clone() {
516            OutputSettings clone;
517            try {
518                clone = (OutputSettings) super.clone();
519            } catch (CloneNotSupportedException e) {
520                throw new RuntimeException(e);
521            }
522            clone.charset(charset.name()); // new charset, coreCharset, and charset encoder
523            clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name());
524            // indentAmount, maxPaddingWidth, and prettyPrint are primitives so object.clone() will handle
525            return clone;
526        }
527    }
528
529    /**
530     * Get the document's current output settings.
531     * @return the document's current output settings.
532     */
533    public OutputSettings outputSettings() {
534        return outputSettings;
535    }
536
537    /**
538     * Set the document's output settings.
539     * @param outputSettings new output settings.
540     * @return this document, for chaining.
541     */
542    public Document outputSettings(OutputSettings outputSettings) {
543        Validate.notNull(outputSettings);
544        this.outputSettings = outputSettings;
545        return this;
546    }
547
548    public enum QuirksMode {
549        noQuirks, quirks, limitedQuirks
550    }
551
552    public QuirksMode quirksMode() {
553        return quirksMode;
554    }
555
556    public Document quirksMode(QuirksMode quirksMode) {
557        this.quirksMode = quirksMode;
558        return this;
559    }
560
561    /**
562     * Get the parser that was used to parse this document.
563     * @return the parser
564     */
565    public Parser parser() {
566        return parser;
567    }
568
569    /**
570     * Set the parser used to create this document. This parser is then used when further parsing within this document
571     * is required.
572     * @param parser the configured parser to use when further parsing is required for this document.
573     * @return this document, for chaining.
574     */
575    public Document parser(Parser parser) {
576        this.parser = parser;
577        return this;
578    }
579
580    /**
581     Set the Connection used to fetch this document. This Connection is used as a session object when further requests are
582     made (e.g. when a form is submitted).
583
584     @param connection to set
585     @return this document, for chaining
586     @see Connection#newRequest()
587     @since 1.14.1
588     */
589    public Document connection(Connection connection) {
590        Validate.notNull(connection);
591        this.connection = connection;
592        return this;
593    }
594}