001package org.jsoup.parser;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.nodes.Document;
005import org.jsoup.nodes.Element;
006import org.jsoup.nodes.Node;
007import org.jspecify.annotations.Nullable;
008
009import java.io.Reader;
010import java.io.StringReader;
011import java.util.List;
012import java.util.concurrent.locks.ReentrantLock;
013
014/**
015 Parses HTML or XML into a {@link org.jsoup.nodes.Document}. Generally, it is simpler to use one of the parse methods in
016 {@link org.jsoup.Jsoup}.
017 <p>Note that a given Parser instance object is threadsafe, but not concurrent. (Concurrent parse calls will
018 synchronize.) To reuse a Parser configuration in a multithreaded environment, use {@link #newInstance()} to make
019 copies.</p>
020 */
021public class Parser implements Cloneable {
    /** Namespace URI for HTML (XHTML) elements. */
    public static final String NamespaceHtml = "http://www.w3.org/1999/xhtml";
    /** The XML namespace URI. */
    public static final String NamespaceXml = "http://www.w3.org/XML/1998/namespace";
    /** Namespace URI for MathML elements. */
    public static final String NamespaceMathml = "http://www.w3.org/1998/Math/MathML";
    /** Namespace URI for SVG elements. */
    public static final String NamespaceSvg = "http://www.w3.org/2000/svg";

    // The TreeBuilder that performs the parse; fixed for the lifetime of this Parser.
    private final TreeBuilder treeBuilder;
    // Parse error list; starts as a no-tracking list and is replaced by setTrackErrors(int).
    private ParseErrorList errors;
    // Case-sensitivity settings; initialised from the TreeBuilder's defaults.
    private ParseSettings settings;
    // Whether Nodes should record their source position; off by default.
    private boolean trackPosition = false;
    // Null until first assigned; tagSet() lazily fills it from the TreeBuilder's default TagSet.
    private @Nullable TagSet tagSet;
    // Held for the duration of parseInput / parseFragmentInput, so those calls on one instance run one at a time.
    private final ReentrantLock lock = new ReentrantLock();
    // Maximum number of open elements; initialised from the TreeBuilder's default max depth.
    private int maxDepth;
034
035    /**
036     * Create a new Parser, using the specified TreeBuilder
037     * @param treeBuilder TreeBuilder to use to parse input into Documents.
038     */
039    public Parser(TreeBuilder treeBuilder) {
040        this.treeBuilder = treeBuilder;
041        settings = treeBuilder.defaultSettings();
042        errors = ParseErrorList.noTracking();
043        maxDepth = treeBuilder.defaultMaxDepth();
044    }
045
046    /**
047     Creates a new Parser as a deep copy of this; including initializing a new TreeBuilder. Allows independent (multi-threaded) use.
048     @return a copied parser
049     */
050    public Parser newInstance() {
051        return new Parser(this);
052    }
053
054    @SuppressWarnings("MethodDoesntCallSuperMethod") // because we use the copy constructor instead
055    @Override
056    public Parser clone() {
057        return new Parser(this);
058    }
059
060    private Parser(Parser copy) {
061        treeBuilder = copy.treeBuilder.newInstance(); // because extended
062        errors = new ParseErrorList(copy.errors); // only copies size, not contents
063        settings = new ParseSettings(copy.settings);
064        trackPosition = copy.trackPosition;
065        maxDepth = copy.maxDepth;
066        tagSet = new TagSet(copy.tagSet());
067    }
068
069    /**
070     Parse the contents of a String.
071
072     @param html HTML to parse
073     @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
074     @return parsed Document
075     */
076    public Document parseInput(String html, String baseUri) {
077        return parseInput(new StringReader(html), baseUri);
078    }
079
080    /**
081     Parse the contents of Reader.
082
083     @param inputHtml HTML to parse
084     @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
085     @return parsed Document
086     @throws java.io.UncheckedIOException if an I/O error occurs in the Reader
087     */
088    public Document parseInput(Reader inputHtml, String baseUri) {
089        try {
090            lock.lock(); // using a lock vs synchronized to support loom threads
091            return treeBuilder.parse(inputHtml, baseUri, this);
092        } finally {
093            lock.unlock();
094        }
095    }
096
097    /**
098     Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
099
100     @param fragment the fragment of HTML to parse
101     @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML).
102     @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
103     @return list of nodes parsed from the input HTML.
104     */
105    public List<Node> parseFragmentInput(String fragment, @Nullable Element context, String baseUri) {
106        return parseFragmentInput(new StringReader(fragment), context, baseUri);
107    }
108
109    /**
110     Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
111
112     @param fragment the fragment of HTML to parse
113     @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML).
114     @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
115     @return list of nodes parsed from the input HTML.
116     @throws java.io.UncheckedIOException if an I/O error occurs in the Reader
117     */
118    public List<Node> parseFragmentInput(Reader fragment, @Nullable Element context, String baseUri) {
119        try {
120            lock.lock();
121            return treeBuilder.parseFragment(fragment, context, baseUri, this);
122        } finally {
123            lock.unlock();
124        }
125    }
126
127    // gets & sets
128    /**
129     * Get the TreeBuilder currently in use.
130     * @return current TreeBuilder.
131     */
132    public TreeBuilder getTreeBuilder() {
133        return treeBuilder;
134    }
135
136    /**
137     * Check if parse error tracking is enabled.
138     * @return current track error state.
139     */
140    public boolean isTrackErrors() {
141        return errors.getMaxSize() > 0;
142    }
143
144    /**
145     * Enable or disable parse error tracking for the next parse.
146     * @param maxErrors the maximum number of errors to track. Set to 0 to disable.
147     * @return this, for chaining
148     */
149    public Parser setTrackErrors(int maxErrors) {
150        errors = maxErrors > 0 ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking();
151        return this;
152    }
153
154    /**
155     * Retrieve the parse errors, if any, from the last parse.
156     * @return list of parse errors, up to the size of the maximum errors tracked.
157     * @see #setTrackErrors(int)
158     */
159    public ParseErrorList getErrors() {
160        return errors;
161    }
162
163    /**
164     Test if position tracking is enabled. If it is, Nodes will have a Position to track where in the original input
165     source they were created from. By default, tracking is not enabled.
166     * @return current track position setting
167     */
168    public boolean isTrackPosition() {
169        return trackPosition;
170    }
171
172    /**
173     Enable or disable source position tracking. If enabled, Nodes will have a Position to track where in the original
174     input source they were created from.
175     @param trackPosition position tracking setting; {@code true} to enable
176     @return this Parser, for chaining
177     */
178    public Parser setTrackPosition(boolean trackPosition) {
179        this.trackPosition = trackPosition;
180        return this;
181    }
182
183    /**
184     Update the ParseSettings of this Parser, to control the case sensitivity of tags and attributes.
185     * @param settings the new settings
186     * @return this Parser
187     */
188    public Parser settings(ParseSettings settings) {
189        this.settings = settings;
190        return this;
191    }
192
193    /**
194     Gets the current ParseSettings for this Parser
195     * @return current ParseSettings
196     */
197    public ParseSettings settings() {
198        return settings;
199    }
200
201    /**
202     Set the parser's maximum stack depth (maximum number of open elements). When reached, new open elements will be
203     removed to prevent excessive nesting. Defaults to 512 for the HTML parser, and unlimited for the XML
204     parser.
205
206     @param maxDepth maximum parser depth; must be >= 1
207     @return this Parser, for chaining
208     */
209    public Parser setMaxDepth(int maxDepth) {
210        Validate.isTrue(maxDepth >= 1, "maxDepth must be >= 1");
211        this.maxDepth = maxDepth;
212        return this;
213    }
214
215    /**
216     * Get the maximum parser depth (maximum number of open elements).
217     * @return the current max parser depth
218     */
219    public int getMaxDepth() {
220        return maxDepth;
221    }
222
223    /**
224     Set a custom TagSet to use for this Parser. This allows you to define your own tags, and control how they are
225     parsed. For example, you can set a tag to preserve whitespace, or to be treated as a block tag.
226     <p>You can start with the {@link TagSet#Html()} defaults and customize, or a new empty TagSet.</p>
227
228     @param tagSet the TagSet to use. This gets copied, so that changes that the parse makes (tags found in the document will be added) do not clobber the original TagSet.
229     @return this Parser
230     @since 1.20.1
231     */
232    public Parser tagSet(TagSet tagSet) {
233        Validate.notNull(tagSet);
234        this.tagSet = new TagSet(tagSet); // copy it as we are going to mutate it
235        return this;
236    }
237
238    /**
239     Get the current TagSet for this Parser, which will be either this parser's default, or one that you have set.
240     @return the current TagSet. After the parse, this will contain any new tags that were found in the document.
241     @since 1.20.1
242     */
243    public TagSet tagSet() {
244        if (tagSet == null)
245            tagSet = treeBuilder.defaultTagSet();
246        return tagSet;
247    }
248
249    public String defaultNamespace() {
250        return getTreeBuilder().defaultNamespace();
251    }
252
253    // static parse functions below
254    /**
255     * Parse HTML into a Document.
256     *
257     * @param html HTML to parse
258     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
259     *
260     * @return parsed Document
261     */
262    public static Document parse(String html, String baseUri) {
263        TreeBuilder treeBuilder = new HtmlTreeBuilder();
264        return treeBuilder.parse(new StringReader(html), baseUri, new Parser(treeBuilder));
265    }
266
267    /**
268     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
269     *
270     * @param fragmentHtml the fragment of HTML to parse
271     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
272     * provides stack context (for implicit element creation).
273     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
274     *
275     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
276     */
277    public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) {
278        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
279        return treeBuilder.parseFragment(new StringReader(fragmentHtml), context, baseUri, new Parser(treeBuilder));
280    }
281
282    /**
283     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
284     *
285     * @param fragmentHtml the fragment of HTML to parse
286     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
287     * provides stack context (for implicit element creation).
288     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
289     * @param errorList list to add errors to
290     *
291     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
292     */
293    public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri, ParseErrorList errorList) {
294        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
295        Parser parser = new Parser(treeBuilder);
296        parser.errors = errorList;
297        return treeBuilder.parseFragment(new StringReader(fragmentHtml), context, baseUri, parser);
298    }
299
300    /**
301     * Parse a fragment of XML into a list of nodes.
302     *
303     * @param fragmentXml the fragment of XML to parse
304     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
305     * @return list of nodes parsed from the input XML.
306     */
307    public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) {
308        XmlTreeBuilder treeBuilder = new XmlTreeBuilder();
309        return treeBuilder.parseFragment(new StringReader(fragmentXml), null, baseUri, new Parser(treeBuilder));
310    }
311
312    /**
313     * Parse a fragment of HTML into the {@code body} of a Document.
314     *
315     * @param bodyHtml fragment of HTML
316     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
317     *
318     * @return Document, with empty head, and HTML parsed into body
319     */
320    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
321        Document doc = Document.createShell(baseUri);
322        Element body = doc.body();
323        List<Node> nodeList = parseFragment(bodyHtml, body, baseUri);
324        body.appendChildren(nodeList);
325        return doc;
326    }
327
328    /**
329     Utility method to unescape HTML entities from a string.
330     <p>To track errors while unescaping, use
331     {@link #unescape(String, boolean)} with a Parser instance that has error tracking enabled.</p>
332
333     @param string HTML escaped string
334     @param inAttribute if the string is to be escaped in strict mode (as attributes are)
335     @return an unescaped string
336     @see #unescape(String, boolean)
337     */
338    public static String unescapeEntities(String string, boolean inAttribute) {
339        Validate.notNull(string);
340        if (string.indexOf('&') < 0) return string; // nothing to unescape
341        return Parser.htmlParser().unescape(string, inAttribute);
342    }
343
344    /**
345     Utility method to unescape HTML entities from a string, using this {@code Parser}'s configuration (for example, to
346     collect errors while unescaping).
347
348     @param string HTML escaped string
349     @param inAttribute if the string is to be escaped in strict mode (as attributes are)
350     @return an unescaped string
351     @see #setTrackErrors(int)
352     @see #unescapeEntities(String, boolean)
353     */
354    public String unescape(String string, boolean inAttribute) {
355        Validate.notNull(string);
356        if (string.indexOf('&') < 0) return string; // nothing to unescape
357        this.treeBuilder.initialiseParse(new StringReader(string), "", this);
358        Tokeniser tokeniser = new Tokeniser(this.treeBuilder);
359        return tokeniser.unescapeEntities(inAttribute);
360    }
361
362    // builders
363
364    /**
365     * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document,
366     * based on a knowledge of the semantics of the incoming tags.
367     * @return a new HTML parser.
368     */
369    public static Parser htmlParser() {
370        return new Parser(new HtmlTreeBuilder());
371    }
372
373    /**
374     * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML,
375     * rather creates a simple tree directly from the input.
376     * @return a new simple XML parser.
377     */
378    public static Parser xmlParser() {
379        return new Parser(new XmlTreeBuilder()).setMaxDepth(Integer.MAX_VALUE);
380    }
381}