001package org.jsoup; 002 003import org.jsoup.helper.DataUtil; 004import org.jsoup.helper.HttpConnection; 005import org.jsoup.nodes.Document; 006import org.jsoup.nodes.Element; 007import org.jsoup.parser.Parser; 008import org.jsoup.safety.Cleaner; 009import org.jsoup.safety.Safelist; 010import org.jspecify.annotations.Nullable; 011 012import java.io.File; 013import java.io.IOException; 014import java.io.InputStream; 015import java.net.URL; 016import java.nio.file.Path; 017 018import static org.jsoup.internal.SharedConstants.DummyUri; 019 020/** 021 The core public access point to the jsoup functionality. 022 023 @author Jonathan Hedley */ 024 025public class Jsoup { 026 private Jsoup() {} 027 028 /** 029 Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML. 030 031 @param html HTML to parse 032 @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur 033 before the HTML declares a {@code <base href>} tag. 034 @return sane HTML 035 */ 036 public static Document parse(String html, String baseUri) { 037 return Parser.parse(html, baseUri); 038 } 039 040 /** 041 Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML 042 (non-HTML) parser. 043 044 @param html HTML to parse 045 @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur 046 before the HTML declares a {@code <base href>} tag. 047 @param parser alternate {@link Parser#xmlParser() parser} to use. 048 @return sane HTML 049 */ 050 public static Document parse(String html, String baseUri, Parser parser) { 051 return parser.parseInput(html, baseUri); 052 } 053 054 /** 055 Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML 056 (non-HTML) parser. As no base URI is specified, absolute URL resolution, if required, relies on the HTML including 057 a {@code <base href>} tag. 058 059 @param html HTML to parse 060 before the HTML declares a {@code <base href>} tag. 061 @param parser alternate {@link Parser#xmlParser() parser} to use. 062 @return sane HTML 063 */ 064 public static Document parse(String html, Parser parser) { 065 return parser.parseInput(html, ""); 066 } 067 068 /** 069 Parse HTML into a Document. As no base URI is specified, absolute URL resolution, if required, relies on the HTML 070 including a {@code <base href>} tag. 071 072 @param html HTML to parse 073 @return sane HTML 074 075 @see #parse(String, String) 076 */ 077 public static Document parse(String html) { 078 return Parser.parse(html, ""); 079 } 080 081 /** 082 * Creates a new {@link Connection} (session), with the defined request URL. Use to fetch and parse a HTML page. 083 * <p> 084 * Use examples: 085 * <ul> 086 * <li><code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code></li> 087 * <li><code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();</code></li> 088 * </ul> 089 * @param url URL to connect to. The protocol must be {@code http} or {@code https}. 090 * @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method; and then execute. 091 * @see #newSession() 092 * @see Connection#newRequest() 093 */ 094 public static Connection connect(String url) { 095 return HttpConnection.connect(url); 096 } 097 098 /** 099 Creates a new {@link Connection} to use as a session. Connection settings (user-agent, timeouts, URL, etc), and 100 cookies will be maintained for the session. Use examples: 101<pre><code> 102Connection session = Jsoup.newSession() 103 .timeout(20 * 1000) 104 .userAgent("FooBar 2000"); 105 106Document doc1 = session.newRequest() 107 .url("https://jsoup.org/").data("ref", "example") 108 .get(); 109Document doc2 = session.newRequest() 110 .url("https://en.wikipedia.org/wiki/Main_Page") 111 .get(); 112Connection con3 = session.newRequest(); 113</code></pre> 114 115 <p>For multi-threaded requests, it is safe to use this session between threads, but take care to call {@link 116 Connection#newRequest()} per request and not share that instance between threads when executing or parsing.</p> 117 118 @return a connection 119 @since 1.14.1 120 */ 121 public static Connection newSession() { 122 return new HttpConnection(); 123 } 124 125 /** 126 Parse the contents of a file as HTML. 127 128 @param file file to load HTML from. Supports gzipped files (ending in .z or .gz). 129 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 130 present, or fall back to {@code UTF-8} (which is often safe to do). 131 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. 132 @return sane HTML 133 134 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 135 */ 136 public static Document parse(File file, @Nullable String charsetName, String baseUri) throws IOException { 137 return DataUtil.load(file, charsetName, baseUri); 138 } 139 140 /** 141 Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. 142 143 @param file file to load HTML from. Supports gzipped files (ending in .z or .gz). 144 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 145 present, or fall back to {@code UTF-8} (which is often safe to do). 146 @return sane HTML 147 148 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 149 @see #parse(File, String, String) parse(file, charset, baseUri) 150 */ 151 public static Document parse(File file, @Nullable String charsetName) throws IOException { 152 return DataUtil.load(file, charsetName, file.getAbsolutePath()); 153 } 154 155 /** 156 Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. 157 The charset used to read the file will be determined by the byte-order-mark (BOM), or a {@code <meta charset>} tag, 158 or if neither is present, will be {@code UTF-8}. 159 160 <p>This is the equivalent of calling {@link #parse(File, String) parse(file, null)}</p> 161 162 @param file the file to load HTML from. Supports gzipped files (ending in .z or .gz). 163 @return sane HTML 164 @throws IOException if the file could not be found or read. 165 @see #parse(File, String, String) parse(file, charset, baseUri) 166 @since 1.15.1 167 */ 168 public static Document parse(File file) throws IOException { 169 return DataUtil.load(file, null, file.getAbsolutePath()); 170 } 171 172 /** 173 Parse the contents of a file as HTML. 174 175 @param file file to load HTML from. Supports gzipped files (ending in .z or .gz). 176 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 177 present, or fall back to {@code UTF-8} (which is often safe to do). 178 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. 179 @param parser alternate {@link Parser#xmlParser() parser} to use. 180 @return sane HTML 181 182 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 183 @since 1.14.2 184 */ 185 public static Document parse(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 186 return DataUtil.load(file, charsetName, baseUri, parser); 187 } 188 189 /** 190 Parse the contents of a file as HTML. 191 192 @param path file to load HTML from. Supports gzipped files (ending in .z or .gz). 193 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 194 present, or fall back to {@code UTF-8} (which is often safe to do). 195 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. 196 @return sane HTML 197 198 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 199 @since 1.18.1 200 */ 201 public static Document parse(Path path, @Nullable String charsetName, String baseUri) throws IOException { 202 return DataUtil.load(path, charsetName, baseUri); 203 } 204 205 /** 206 Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. 207 208 @param path file to load HTML from. Supports gzipped files (ending in .z or .gz). 209 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 210 present, or fall back to {@code UTF-8} (which is often safe to do). 211 @return sane HTML 212 213 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 214 @see #parse(File, String, String) parse(file, charset, baseUri) 215 @since 1.18.1 216 */ 217 public static Document parse(Path path, @Nullable String charsetName) throws IOException { 218 return DataUtil.load(path, charsetName, path.toAbsolutePath().toString()); 219 } 220 221 /** 222 Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. 223 The charset used to read the file will be determined by the byte-order-mark (BOM), or a {@code <meta charset>} tag, 224 or if neither is present, will be {@code UTF-8}. 225 226 <p>This is the equivalent of calling {@link #parse(File, String) parse(file, null)}</p> 227 228 @param path the file to load HTML from. Supports gzipped files (ending in .z or .gz). 229 @return sane HTML 230 @throws IOException if the file could not be found or read. 231 @see #parse(Path, String, String) parse(file, charset, baseUri) 232 @since 1.18.1 233 */ 234 public static Document parse(Path path) throws IOException { 235 return DataUtil.load(path, null, path.toAbsolutePath().toString()); 236 } 237 238 /** 239 Parse the contents of a file as HTML. 240 241 @param path file to load HTML from. Supports gzipped files (ending in .z or .gz). 242 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 243 present, or fall back to {@code UTF-8} (which is often safe to do). 244 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. 245 @param parser alternate {@link Parser#xmlParser() parser} to use. 246 @return sane HTML 247 248 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 249 @since 1.18.1 250 */ 251 public static Document parse(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 252 return DataUtil.load(path, charsetName, baseUri, parser); 253 } 254 255 /** 256 Read an input stream, and parse it to a Document. 257 258 @param in input stream to read. The stream will be closed after reading. 259 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 260 present, or fall back to {@code UTF-8} (which is often safe to do). 261 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. 262 @return sane HTML 263 264 @throws IOException if the stream could not be read, or if the charsetName is invalid. 265 */ 266 public static Document parse(InputStream in, @Nullable String charsetName, String baseUri) throws IOException { 267 return DataUtil.load(in, charsetName, baseUri); 268 } 269 270 /** 271 Read an input stream, and parse it to a Document. You can provide an alternate parser, such as a simple XML 272 (non-HTML) parser. 273 274 @param in input stream to read. Make sure to close it after parsing. 275 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 276 present, or fall back to {@code UTF-8} (which is often safe to do). 277 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. 278 @param parser alternate {@link Parser#xmlParser() parser} to use. 279 @return sane HTML 280 281 @throws IOException if the stream could not be read, or if the charsetName is invalid. 282 */ 283 public static Document parse(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 284 return DataUtil.load(in, charsetName, baseUri, parser); 285 } 286 287 /** 288 Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. 289 290 @param bodyHtml body HTML fragment 291 @param baseUri URL to resolve relative URLs against. 292 @return sane HTML document 293 294 @see Document#body() 295 */ 296 public static Document parseBodyFragment(String bodyHtml, String baseUri) { 297 return Parser.parseBodyFragment(bodyHtml, baseUri); 298 } 299 300 /** 301 Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. 302 303 @param bodyHtml body HTML fragment 304 @return sane HTML document 305 306 @see Document#body() 307 */ 308 public static Document parseBodyFragment(String bodyHtml) { 309 return Parser.parseBodyFragment(bodyHtml, ""); 310 } 311 312 /** 313 Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use {@link #connect(String)} instead. 314 <p> 315 The encoding character set is determined by the content-type header or http-equiv meta tag, or falls back to {@code UTF-8}. 316 317 @param url URL to fetch (with a GET). The protocol must be {@code http} or {@code https}. 318 @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown. 319 @return The parsed HTML. 320 321 @throws java.net.MalformedURLException if the request URL is not a HTTP or HTTPS URL, or is otherwise malformed 322 @throws HttpStatusException if the response is not OK and HTTP response errors are not ignored 323 @throws UnsupportedMimeTypeException if the response mime type is not supported and those errors are not ignored 324 @throws java.net.SocketTimeoutException if the connection times out 325 @throws IOException if a connection or read error occurs 326 327 @see #connect(String) 328 */ 329 public static Document parse(URL url, int timeoutMillis) throws IOException { 330 Connection con = HttpConnection.connect(url); 331 con.timeout(timeoutMillis); 332 return con.get(); 333 } 334 335 /** 336 Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through an allow-list of safe 337 tags and attributes. 338 339 @param bodyHtml input untrusted HTML (body fragment) 340 @param baseUri URL to resolve relative URLs against 341 @param safelist list of permitted HTML elements 342 @return safe HTML (body fragment) 343 344 @see Cleaner#clean(Document) 345 */ 346 public static String clean(String bodyHtml, String baseUri, Safelist safelist) { 347 if (baseUri.isEmpty() && safelist.preserveRelativeLinks()) { 348 baseUri = DummyUri; // set a placeholder URI to allow relative links to pass abs resolution for protocol tests; won't leak to output 349 } 350 351 Document dirty = parseBodyFragment(bodyHtml, baseUri); 352 Cleaner cleaner = new Cleaner(safelist); 353 Document clean = cleaner.clean(dirty); 354 return clean.body().html(); 355 } 356 357 /** 358 Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a safe-list of permitted 359 tags and attributes. 360 361 <p>Note that as this method does not take a base href URL to resolve attributes with relative URLs against, those 362 URLs will be removed, unless the input HTML contains a {@code <base href> tag}. If you wish to preserve those, use 363 the {@link Jsoup#clean(String html, String baseHref, Safelist)} method instead, and enable 364 {@link Safelist#preserveRelativeLinks(boolean)}.</p> 365 366 <p>Note that the output of this method is still <b>HTML</b> even when using the TextNode only 367 {@link Safelist#none()}, and so any HTML entities in the output will be appropriately escaped. 368 If you want plain text, not HTML, you should use a text method such as {@link Element#text()} instead, after 369 cleaning the document.</p> 370 <p>Example:</p> 371 <pre>{@code 372 String sourceBodyHtml = "<p>5 is < 6.</p>"; 373 String html = Jsoup.clean(sourceBodyHtml, Safelist.none()); 374 375 Cleaner cleaner = new Cleaner(Safelist.none()); 376 String text = cleaner.clean(Jsoup.parse(sourceBodyHtml)).text(); 377 378 // html is: 5 is < 6. 379 // text is: 5 is < 6. 380 }</pre> 381 382 @param bodyHtml input untrusted HTML (body fragment) 383 @param safelist list of permitted HTML elements 384 @return safe HTML (body fragment) 385 @see Cleaner#clean(Document) 386 */ 387 public static String clean(String bodyHtml, Safelist safelist) { 388 return clean(bodyHtml, "", safelist); 389 } 390 391 /** 392 * Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a safe-list of 393 * permitted tags and attributes. 394 * <p>The HTML is treated as a body fragment; it's expected the cleaned HTML will be used within the body of an 395 * existing document. If you want to clean full documents, use {@link Cleaner#clean(Document)} instead, and add 396 * structural tags (<code>html, head, body</code> etc) to the safelist. 397 * 398 * @param bodyHtml input untrusted HTML (body fragment) 399 * @param baseUri URL to resolve relative URLs against 400 * @param safelist list of permitted HTML elements 401 * @param outputSettings document output settings; use to control pretty-printing and entity escape modes 402 * @return safe HTML (body fragment) 403 * @see Cleaner#clean(Document) 404 */ 405 public static String clean(String bodyHtml, String baseUri, Safelist safelist, Document.OutputSettings outputSettings) { 406 Document dirty = parseBodyFragment(bodyHtml, baseUri); 407 Cleaner cleaner = new Cleaner(safelist); 408 Document clean = cleaner.clean(dirty); 409 clean.outputSettings(outputSettings); 410 return clean.body().html(); 411 } 412 413 /** 414 Test if the input body HTML has only tags and attributes allowed by the Safelist. Useful for form validation. 415 <p> 416 This method is intended to be used in a user interface as a validator for user input. Note that regardless of the 417 output of this method, the input document <b>must always</b> be normalized using a method such as 418 {@link #clean(String, String, Safelist)}, and the result of that method used to store or serialize the document 419 before later reuse such as presentation to end users. This ensures that enforced attributes are set correctly, and 420 that any differences between how a given browser and how jsoup parses the input HTML are normalized. 421 </p> 422 <p>Example:</p> 423 <pre>{@code 424 Safelist safelist = Safelist.relaxed(); 425 boolean isValid = Jsoup.isValid(sourceBodyHtml, safelist); 426 String normalizedHtml = Jsoup.clean(sourceBodyHtml, "https://example.com/", safelist); 427 }</pre> 428 <p>Assumes the HTML is a body fragment (i.e. will be used in an existing HTML document body.) 429 @param bodyHtml HTML to test 430 @param safelist safelist to test against 431 @return true if no tags or attributes were removed; false otherwise 432 @see #clean(String, Safelist) 433 */ 434 public static boolean isValid(String bodyHtml, Safelist safelist) { 435 return new Cleaner(safelist).isValidBodyHtml(bodyHtml); 436 } 437}