001package org.jsoup;
002
003import org.jsoup.helper.DataUtil;
004import org.jsoup.helper.HttpConnection;
005import org.jsoup.nodes.Document;
006import org.jsoup.nodes.Element;
007import org.jsoup.parser.Parser;
008import org.jsoup.safety.Cleaner;
009import org.jsoup.safety.Safelist;
010import org.jspecify.annotations.Nullable;
011
012import java.io.File;
013import java.io.IOException;
014import java.io.InputStream;
015import java.net.URL;
016import java.nio.file.Path;
017
018import static org.jsoup.internal.SharedConstants.DummyUri;
019
020/**
021 The core public access point to the jsoup functionality.
022
023 @author Jonathan Hedley */
024
025public class Jsoup {
026    private Jsoup() {}
027
028    /**
029     Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML.
030
031     @param html    HTML to parse
032     @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
033     before the HTML declares a {@code <base href>} tag.
034     @return sane HTML
035     */
036    public static Document parse(String html, String baseUri) {
037        return Parser.parse(html, baseUri);
038    }
039
040    /**
041     Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML
042     (non-HTML) parser.
043
044     @param html    HTML to parse
045     @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
046     before the HTML declares a {@code <base href>} tag.
047     @param parser alternate {@link Parser#xmlParser() parser} to use.
048     @return sane HTML
049     */
050    public static Document parse(String html, String baseUri, Parser parser) {
051        return parser.parseInput(html, baseUri);
052    }
053
054    /**
055     Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML
056     (non-HTML) parser.  As no base URI is specified, absolute URL resolution, if required, relies on the HTML including
057     a {@code <base href>} tag.
058
059     @param html    HTML to parse
060     before the HTML declares a {@code <base href>} tag.
061     @param parser alternate {@link Parser#xmlParser() parser} to use.
062     @return sane HTML
063     */
064    public static Document parse(String html, Parser parser) {
065        return parser.parseInput(html, "");
066    }
067
068    /**
069     Parse HTML into a Document. As no base URI is specified, absolute URL resolution, if required, relies on the HTML
070     including a {@code <base href>} tag.
071
072     @param html HTML to parse
073     @return sane HTML
074
075     @see #parse(String, String)
076     */
077    public static Document parse(String html) {
078        return Parser.parse(html, "");
079    }
080
081    /**
082     * Creates a new {@link Connection} (session), with the defined request URL. Use to fetch and parse a HTML page.
083     * <p>
084     * Use examples:
085     * <ul>
086     *  <li><code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code></li>
087     *  <li><code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();</code></li>
088     * </ul>
089     * @param url URL to connect to. The protocol must be {@code http} or {@code https}.
090     * @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method; and then execute.
091     * @see #newSession()
092     * @see Connection#newRequest()
093     */
094    public static Connection connect(String url) {
095        return HttpConnection.connect(url);
096    }
097
098    /**
099     Creates a new {@link Connection} to use as a session. Connection settings (user-agent, timeouts, URL, etc), and
100     cookies will be maintained for the session. Use examples:
101<pre><code>
102Connection session = Jsoup.newSession()
103     .timeout(20 * 1000)
104     .userAgent("FooBar 2000");
105
106Document doc1 = session.newRequest()
107     .url("https://jsoup.org/").data("ref", "example")
108     .get();
109Document doc2 = session.newRequest()
110     .url("https://en.wikipedia.org/wiki/Main_Page")
111     .get();
112Connection con3 = session.newRequest();
113</code></pre>
114
115     <p>For multi-threaded requests, it is safe to use this session between threads, but take care to call {@link
116    Connection#newRequest()} per request and not share that instance between threads when executing or parsing.</p>
117
118     @return a connection
119     @since 1.14.1
120     */
121    public static Connection newSession() {
122        return new HttpConnection();
123    }
124
125    /**
126     Parse the contents of a file as HTML.
127
128     @param file          file to load HTML from. Supports gzipped files (ending in .z or .gz).
129     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
130     present, or fall back to {@code UTF-8} (which is often safe to do).
131     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
132     @return sane HTML
133
134     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
135     */
136    public static Document parse(File file, @Nullable String charsetName, String baseUri) throws IOException {
137        return DataUtil.load(file, charsetName, baseUri);
138    }
139
140    /**
141     Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
142
143     @param file        file to load HTML from. Supports gzipped files (ending in .z or .gz).
144     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
145     present, or fall back to {@code UTF-8} (which is often safe to do).
146     @return sane HTML
147
148     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
149     @see #parse(File, String, String) parse(file, charset, baseUri)
150     */
151    public static Document parse(File file, @Nullable String charsetName) throws IOException {
152        return DataUtil.load(file, charsetName, file.getAbsolutePath());
153    }
154
155    /**
156     Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
157     The charset used to read the file will be determined by the byte-order-mark (BOM), or a {@code <meta charset>} tag,
158     or if neither is present, will be {@code UTF-8}.
159
160     <p>This is the equivalent of calling {@link #parse(File, String) parse(file, null)}</p>
161
162     @param file the file to load HTML from. Supports gzipped files (ending in .z or .gz).
163     @return sane HTML
164     @throws IOException if the file could not be found or read.
165     @see #parse(File, String, String) parse(file, charset, baseUri)
166     @since 1.15.1
167     */
168    public static Document parse(File file) throws IOException {
169        return DataUtil.load(file, null, file.getAbsolutePath());
170    }
171
172    /**
173     Parse the contents of a file as HTML.
174
175     @param file          file to load HTML from. Supports gzipped files (ending in .z or .gz).
176     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
177     present, or fall back to {@code UTF-8} (which is often safe to do).
178     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
179     @param parser alternate {@link Parser#xmlParser() parser} to use.
180     @return sane HTML
181
182     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
183     @since 1.14.2
184     */
185    public static Document parse(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
186        return DataUtil.load(file, charsetName, baseUri, parser);
187    }
188
189    /**
190     Parse the contents of a file as HTML.
191
192     @param path          file to load HTML from. Supports gzipped files (ending in .z or .gz).
193     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
194     present, or fall back to {@code UTF-8} (which is often safe to do).
195     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
196     @return sane HTML
197
198     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
199     @since 1.18.1
200     */
201    public static Document parse(Path path, @Nullable String charsetName, String baseUri) throws IOException {
202        return DataUtil.load(path, charsetName, baseUri);
203    }
204
205    /**
206     Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
207
208     @param path        file to load HTML from. Supports gzipped files (ending in .z or .gz).
209     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
210     present, or fall back to {@code UTF-8} (which is often safe to do).
211     @return sane HTML
212
213     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
214     @see #parse(File, String, String) parse(file, charset, baseUri)
215     @since 1.18.1
216     */
217    public static Document parse(Path path, @Nullable String charsetName) throws IOException {
218        return DataUtil.load(path, charsetName, path.toAbsolutePath().toString());
219    }
220
221    /**
222     Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
223     The charset used to read the file will be determined by the byte-order-mark (BOM), or a {@code <meta charset>} tag,
224     or if neither is present, will be {@code UTF-8}.
225
226     <p>This is the equivalent of calling {@link #parse(File, String) parse(file, null)}</p>
227
228     @param path the file to load HTML from. Supports gzipped files (ending in .z or .gz).
229     @return sane HTML
230     @throws IOException if the file could not be found or read.
231     @see #parse(Path, String, String) parse(file, charset, baseUri)
232     @since 1.18.1
233     */
234    public static Document parse(Path path) throws IOException {
235        return DataUtil.load(path, null, path.toAbsolutePath().toString());
236    }
237
238    /**
239     Parse the contents of a file as HTML.
240
241     @param path          file to load HTML from. Supports gzipped files (ending in .z or .gz).
242     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
243     present, or fall back to {@code UTF-8} (which is often safe to do).
244     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
245     @param parser alternate {@link Parser#xmlParser() parser} to use.
246     @return sane HTML
247
248     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
249     @since 1.18.1
250     */
251    public static Document parse(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
252        return DataUtil.load(path, charsetName, baseUri, parser);
253    }
254
255     /**
256     Read an input stream, and parse it to a Document.
257
258     @param in          input stream to read. The stream will be closed after reading.
259     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
260     present, or fall back to {@code UTF-8} (which is often safe to do).
261     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
262     @return sane HTML
263
264     @throws IOException if the stream could not be read, or if the charsetName is invalid.
265     */
266    public static Document parse(InputStream in, @Nullable String charsetName, String baseUri) throws IOException {
267        return DataUtil.load(in, charsetName, baseUri);
268    }
269
270    /**
271     Read an input stream, and parse it to a Document. You can provide an alternate parser, such as a simple XML
272     (non-HTML) parser.
273
274     @param in          input stream to read. Make sure to close it after parsing.
275     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
276     present, or fall back to {@code UTF-8} (which is often safe to do).
277     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
278     @param parser alternate {@link Parser#xmlParser() parser} to use.
279     @return sane HTML
280
281     @throws IOException if the stream could not be read, or if the charsetName is invalid.
282     */
283    public static Document parse(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
284        return DataUtil.load(in, charsetName, baseUri, parser);
285    }
286
287    /**
288     Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML.
289
290     @param bodyHtml body HTML fragment
291     @param baseUri  URL to resolve relative URLs against.
292     @return sane HTML document
293
294     @see Document#body()
295     */
296    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
297        return Parser.parseBodyFragment(bodyHtml, baseUri);
298    }
299
300    /**
301     Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML.
302
303     @param bodyHtml body HTML fragment
304     @return sane HTML document
305
306     @see Document#body()
307     */
308    public static Document parseBodyFragment(String bodyHtml) {
309        return Parser.parseBodyFragment(bodyHtml, "");
310    }
311
312    /**
313     Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use {@link #connect(String)} instead.
314     <p>
315     The encoding character set is determined by the content-type header or http-equiv meta tag, or falls back to {@code UTF-8}.
316
317     @param url           URL to fetch (with a GET). The protocol must be {@code http} or {@code https}.
318     @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown.
319     @return The parsed HTML.
320
321     @throws java.net.MalformedURLException if the request URL is not a HTTP or HTTPS URL, or is otherwise malformed
322     @throws HttpStatusException if the response is not OK and HTTP response errors are not ignored
323     @throws UnsupportedMimeTypeException if the response mime type is not supported and those errors are not ignored
324     @throws java.net.SocketTimeoutException if the connection times out
325     @throws IOException if a connection or read error occurs
326
327     @see #connect(String)
328     */
329    public static Document parse(URL url, int timeoutMillis) throws IOException {
330        Connection con = HttpConnection.connect(url);
331        con.timeout(timeoutMillis);
332        return con.get();
333    }
334
335    /**
336     Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through an allow-list of safe
337     tags and attributes.
338
339     @param bodyHtml  input untrusted HTML (body fragment)
340     @param baseUri   URL to resolve relative URLs against
341     @param safelist  list of permitted HTML elements
342     @return safe HTML (body fragment)
343
344     @see Cleaner#clean(Document)
345     */
346    public static String clean(String bodyHtml, String baseUri, Safelist safelist) {
347        if (baseUri.isEmpty() && safelist.preserveRelativeLinks()) {
348            baseUri = DummyUri; // set a placeholder URI to allow relative links to pass abs resolution for protocol tests; won't leak to output
349        }
350
351        Document dirty = parseBodyFragment(bodyHtml, baseUri);
352        Cleaner cleaner = new Cleaner(safelist);
353        Document clean = cleaner.clean(dirty);
354        return clean.body().html();
355    }
356
357    /**
358     Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a safe-list of permitted
359     tags and attributes.
360
361     <p>Note that as this method does not take a base href URL to resolve attributes with relative URLs against, those
362     URLs will be removed, unless the input HTML contains a {@code <base href> tag}. If you wish to preserve those, use
363     the {@link Jsoup#clean(String html, String baseHref, Safelist)} method instead, and enable
364     {@link Safelist#preserveRelativeLinks(boolean)}.</p>
365
366     <p>Note that the output of this method is still <b>HTML</b> even when using the TextNode only
367     {@link Safelist#none()}, and so any HTML entities in the output will be appropriately escaped.
368     If you want plain text, not HTML, you should use a text method such as {@link Element#text()} instead, after
369     cleaning the document.</p>
370     <p>Example:</p>
371     <pre>{@code
372     String sourceBodyHtml = "<p>5 is &lt; 6.</p>";
373     String html = Jsoup.clean(sourceBodyHtml, Safelist.none());
374
375     Cleaner cleaner = new Cleaner(Safelist.none());
376     String text = cleaner.clean(Jsoup.parse(sourceBodyHtml)).text();
377
378     // html is: 5 is &lt; 6.
379     // text is: 5 is < 6.
380     }</pre>
381
382     @param bodyHtml input untrusted HTML (body fragment)
383     @param safelist list of permitted HTML elements
384     @return safe HTML (body fragment)
385     @see Cleaner#clean(Document)
386     */
387    public static String clean(String bodyHtml, Safelist safelist) {
388        return clean(bodyHtml, "", safelist);
389    }
390
391    /**
392     * Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a safe-list of
393     * permitted tags and attributes.
394     * <p>The HTML is treated as a body fragment; it's expected the cleaned HTML will be used within the body of an
395     * existing document. If you want to clean full documents, use {@link Cleaner#clean(Document)} instead, and add
396     * structural tags (<code>html, head, body</code> etc) to the safelist.
397     *
398     * @param bodyHtml input untrusted HTML (body fragment)
399     * @param baseUri URL to resolve relative URLs against
400     * @param safelist list of permitted HTML elements
401     * @param outputSettings document output settings; use to control pretty-printing and entity escape modes
402     * @return safe HTML (body fragment)
403     * @see Cleaner#clean(Document)
404     */
405    public static String clean(String bodyHtml, String baseUri, Safelist safelist, Document.OutputSettings outputSettings) {
406        Document dirty = parseBodyFragment(bodyHtml, baseUri);
407        Cleaner cleaner = new Cleaner(safelist);
408        Document clean = cleaner.clean(dirty);
409        clean.outputSettings(outputSettings);
410        return clean.body().html();
411    }
412
413    /**
414     Test if the input body HTML has only tags and attributes allowed by the Safelist. Useful for form validation.
415     <p>
416     This method is intended to be used in a user interface as a validator for user input. Note that regardless of the
417     output of this method, the input document <b>must always</b> be normalized using a method such as
418     {@link #clean(String, String, Safelist)}, and the result of that method used to store or serialize the document
419     before later reuse such as presentation to end users. This ensures that enforced attributes are set correctly, and
420     that any differences between how a given browser and how jsoup parses the input HTML are normalized.
421     </p>
422     <p>Example:</p>
423     <pre>{@code
424     Safelist safelist = Safelist.relaxed();
425     boolean isValid = Jsoup.isValid(sourceBodyHtml, safelist);
426     String normalizedHtml = Jsoup.clean(sourceBodyHtml, "https://example.com/", safelist);
427     }</pre>
428     <p>Assumes the HTML is a body fragment (i.e. will be used in an existing HTML document body.)
429     @param bodyHtml HTML to test
430     @param safelist safelist to test against
431     @return true if no tags or attributes were removed; false otherwise
432     @see #clean(String, Safelist)
433     */
434    public static boolean isValid(String bodyHtml, Safelist safelist) {
435        return new Cleaner(safelist).isValidBodyHtml(bodyHtml);
436    }
437}