package org.jsoup.parser;

import org.jsoup.helper.Validate;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jspecify.annotations.Nullable;

import java.io.Reader;
import java.io.StringReader;
import java.util.List;
import java.util.concurrent.locks.ReentrantLock;

/**
 Parses HTML or XML into a {@link org.jsoup.nodes.Document}. Generally, it is simpler to use one of the parse methods in
 {@link org.jsoup.Jsoup}.
 <p>Note that a given Parser instance object is threadsafe, but not concurrent. (Concurrent parse calls will
 synchronize.) To reuse a Parser configuration in a multithreaded environment, use {@link #newInstance()} to make
 copies.</p>
 */
public class Parser implements Cloneable {
    public static final String NamespaceHtml = "http://www.w3.org/1999/xhtml";
    public static final String NamespaceXml = "http://www.w3.org/XML/1998/namespace";
    public static final String NamespaceMathml = "http://www.w3.org/1998/Math/MathML";
    public static final String NamespaceSvg = "http://www.w3.org/2000/svg";

    private final TreeBuilder treeBuilder;
    private ParseErrorList errors;
    private ParseSettings settings;
    private boolean trackPosition = false;
    private @Nullable TagSet tagSet; // lazily populated from the TreeBuilder's default; see tagSet()
    private final ReentrantLock lock = new ReentrantLock(); // serialises every use of the shared TreeBuilder
    private int maxDepth;

    /**
     * Create a new Parser, using the specified TreeBuilder
     * @param treeBuilder TreeBuilder to use to parse input into Documents.
     */
    public Parser(TreeBuilder treeBuilder) {
        this.treeBuilder = treeBuilder;
        settings = treeBuilder.defaultSettings();
        errors = ParseErrorList.noTracking();
        maxDepth = treeBuilder.defaultMaxDepth();
    }

    /**
     Creates a new Parser as a deep copy of this; including initializing a new TreeBuilder. Allows independent (multi-threaded) use.
     @return a copied parser
     */
    public Parser newInstance() {
        return new Parser(this);
    }

    /**
     Creates a copy of this Parser via the copy constructor; equivalent to {@link #newInstance()}.
     @return a copied parser
     */
    @SuppressWarnings("MethodDoesntCallSuperMethod") // because we use the copy constructor instead
    @Override
    public Parser clone() {
        return new Parser(this);
    }

    /**
     Copy constructor. The copy gets its own TreeBuilder instance, error list, settings, TagSet, and lock, so that it
     can be used independently of the original.
     @param copy the Parser to copy the configuration from
     */
    private Parser(Parser copy) {
        treeBuilder = copy.treeBuilder.newInstance(); // because extended
        errors = new ParseErrorList(copy.errors); // only copies size, not contents
        settings = new ParseSettings(copy.settings);
        trackPosition = copy.trackPosition;
        maxDepth = copy.maxDepth;
        tagSet = new TagSet(copy.tagSet()); // via the getter, so an unset TagSet resolves to the builder default
    }

    /**
     Parse the contents of a String.

     @param html HTML to parse
     @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     @return parsed Document
     */
    public Document parseInput(String html, String baseUri) {
        return parseInput(new StringReader(html), baseUri);
    }

    /**
     Parse the contents of Reader.

     @param inputHtml HTML to parse
     @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     @return parsed Document
     @throws java.io.UncheckedIOException if an I/O error occurs in the Reader
     */
    public Document parseInput(Reader inputHtml, String baseUri) {
        // using a lock vs synchronized to support loom threads. Acquired before the try block, so that the finally
        // clause never attempts to unlock a lock that this thread failed to acquire.
        lock.lock();
        try {
            return treeBuilder.parse(inputHtml, baseUri, this);
        } finally {
            lock.unlock();
        }
    }

    /**
     Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.

     @param fragment the fragment of HTML to parse
     @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML).
     @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     @return list of nodes parsed from the input HTML.
     */
    public List<Node> parseFragmentInput(String fragment, @Nullable Element context, String baseUri) {
        return parseFragmentInput(new StringReader(fragment), context, baseUri);
    }

    /**
     Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.

     @param fragment the fragment of HTML to parse
     @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML).
     @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     @return list of nodes parsed from the input HTML.
     @throws java.io.UncheckedIOException if an I/O error occurs in the Reader
     */
    public List<Node> parseFragmentInput(Reader fragment, @Nullable Element context, String baseUri) {
        lock.lock(); // acquired outside the try, so a failed acquisition is never followed by an unlock
        try {
            return treeBuilder.parseFragment(fragment, context, baseUri, this);
        } finally {
            lock.unlock();
        }
    }

    // gets & sets
    /**
     * Get the TreeBuilder currently in use.
     * @return current TreeBuilder.
     */
    public TreeBuilder getTreeBuilder() {
        return treeBuilder;
    }

    /**
     * Check if parse error tracking is enabled.
     * @return current track error state.
     */
    public boolean isTrackErrors() {
        return errors.getMaxSize() > 0;
    }

    /**
     * Enable or disable parse error tracking for the next parse.
     * @param maxErrors the maximum number of errors to track. Set to 0 to disable.
     * @return this, for chaining
     */
    public Parser setTrackErrors(int maxErrors) {
        errors = maxErrors > 0 ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking();
        return this;
    }

    /**
     * Retrieve the parse errors, if any, from the last parse.
     * @return list of parse errors, up to the size of the maximum errors tracked.
     * @see #setTrackErrors(int)
     */
    public ParseErrorList getErrors() {
        return errors;
    }

    /**
     Test if position tracking is enabled. If it is, Nodes will have a Position to track where in the original input
     source they were created from. By default, tracking is not enabled.
     * @return current track position setting
     */
    public boolean isTrackPosition() {
        return trackPosition;
    }

    /**
     Enable or disable source position tracking. If enabled, Nodes will have a Position to track where in the original
     input source they were created from.
     @param trackPosition position tracking setting; {@code true} to enable
     @return this Parser, for chaining
     */
    public Parser setTrackPosition(boolean trackPosition) {
        this.trackPosition = trackPosition;
        return this;
    }

    /**
     Update the ParseSettings of this Parser, to control the case sensitivity of tags and attributes.
     * @param settings the new settings
     * @return this Parser
     */
    public Parser settings(ParseSettings settings) {
        this.settings = settings;
        return this;
    }

    /**
     Gets the current ParseSettings for this Parser
     * @return current ParseSettings
     */
    public ParseSettings settings() {
        return settings;
    }

    /**
     Set the parser's maximum stack depth (maximum number of open elements). When reached, new open elements will be
     removed to prevent excessive nesting. Defaults to 512 for the HTML parser, and unlimited for the XML
     parser.

     @param maxDepth maximum parser depth; must be >= 1
     @return this Parser, for chaining
     */
    public Parser setMaxDepth(int maxDepth) {
        Validate.isTrue(maxDepth >= 1, "maxDepth must be >= 1");
        this.maxDepth = maxDepth;
        return this;
    }

    /**
     * Get the maximum parser depth (maximum number of open elements).
     * @return the current max parser depth
     */
    public int getMaxDepth() {
        return maxDepth;
    }

    /**
     Set a custom TagSet to use for this Parser. This allows you to define your own tags, and control how they are
     parsed. For example, you can set a tag to preserve whitespace, or to be treated as a block tag.
     <p>You can start with the {@link TagSet#Html()} defaults and customize, or a new empty TagSet.</p>

     @param tagSet the TagSet to use. This gets copied, so that changes that the parse makes (tags found in the document will be added) do not clobber the original TagSet.
     @return this Parser
     @since 1.20.1
     */
    public Parser tagSet(TagSet tagSet) {
        Validate.notNull(tagSet);
        this.tagSet = new TagSet(tagSet); // copy it as we are going to mutate it
        return this;
    }

    /**
     Get the current TagSet for this Parser, which will be either this parser's default, or one that you have set.
     @return the current TagSet. After the parse, this will contain any new tags that were found in the document.
     @since 1.20.1
     */
    public TagSet tagSet() {
        if (tagSet == null)
            tagSet = treeBuilder.defaultTagSet(); // resolved on first access, then retained
        return tagSet;
    }

    /**
     Get the default namespace for this Parser, as reported by its TreeBuilder.
     @return the TreeBuilder's default namespace
     */
    public String defaultNamespace() {
        return getTreeBuilder().defaultNamespace();
    }

    // static parse functions below
    /**
     * Parse HTML into a Document.
     *
     * @param html HTML to parse
     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     *
     * @return parsed Document
     */
    public static Document parse(String html, String baseUri) {
        TreeBuilder treeBuilder = new HtmlTreeBuilder();
        return treeBuilder.parse(new StringReader(html), baseUri, new Parser(treeBuilder));
    }

    /**
     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
     *
     * @param fragmentHtml the fragment of HTML to parse
     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
     * provides stack context (for implicit element creation).
     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     *
     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
     */
    public static List<Node> parseFragment(String fragmentHtml, @Nullable Element context, String baseUri) {
        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
        return treeBuilder.parseFragment(new StringReader(fragmentHtml), context, baseUri, new Parser(treeBuilder));
    }

    /**
     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
     *
     * @param fragmentHtml the fragment of HTML to parse
     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
     * provides stack context (for implicit element creation).
     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     * @param errorList list to add errors to
     *
     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
     */
    public static List<Node> parseFragment(String fragmentHtml, @Nullable Element context, String baseUri, ParseErrorList errorList) {
        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
        Parser parser = new Parser(treeBuilder);
        parser.errors = errorList; // use the caller's list directly, so errors are visible to the caller
        return treeBuilder.parseFragment(new StringReader(fragmentHtml), context, baseUri, parser);
    }

    /**
     * Parse a fragment of XML into a list of nodes.
     *
     * @param fragmentXml the fragment of XML to parse
     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     * @return list of nodes parsed from the input XML.
     */
    public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) {
        XmlTreeBuilder treeBuilder = new XmlTreeBuilder();
        return treeBuilder.parseFragment(new StringReader(fragmentXml), null, baseUri, new Parser(treeBuilder));
    }

    /**
     * Parse a fragment of HTML into the {@code body} of a Document.
     *
     * @param bodyHtml fragment of HTML
     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     *
     * @return Document, with empty head, and HTML parsed into body
     */
    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
        Document doc = Document.createShell(baseUri);
        Element body = doc.body();
        List<Node> nodeList = parseFragment(bodyHtml, body, baseUri); // body is the parse context only
        body.appendChildren(nodeList);
        return doc;
    }

    /**
     Utility method to unescape HTML entities from a string.
     <p>To track errors while unescaping, use
     {@link #unescape(String, boolean)} with a Parser instance that has error tracking enabled.</p>

     @param string HTML escaped string
     @param inAttribute if the string is to be unescaped in strict mode (as attributes are)
     @return an unescaped string
     @see #unescape(String, boolean)
     */
    public static String unescapeEntities(String string, boolean inAttribute) {
        Validate.notNull(string);
        if (string.indexOf('&') < 0) return string; // nothing to unescape
        return Parser.htmlParser().unescape(string, inAttribute);
    }

    /**
     Utility method to unescape HTML entities from a string, using this {@code Parser}'s configuration (for example, to
     collect errors while unescaping).
     <p>This re-initialises this Parser's TreeBuilder, so it holds the same lock as the parse methods and will
     synchronize with any concurrent parse on this Parser instance.</p>

     @param string HTML escaped string
     @param inAttribute if the string is to be unescaped in strict mode (as attributes are)
     @return an unescaped string
     @see #setTrackErrors(int)
     @see #unescapeEntities(String, boolean)
     */
    public String unescape(String string, boolean inAttribute) {
        Validate.notNull(string);
        if (string.indexOf('&') < 0) return string; // nothing to unescape
        lock.lock(); // the shared TreeBuilder is re-initialised below; must not interleave with a parse
        try {
            treeBuilder.initialiseParse(new StringReader(string), "", this);
            Tokeniser tokeniser = new Tokeniser(treeBuilder);
            return tokeniser.unescapeEntities(inAttribute);
        } finally {
            lock.unlock();
        }
    }

    // builders

    /**
     * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document,
     * based on a knowledge of the semantics of the incoming tags.
     * @return a new HTML parser.
     */
    public static Parser htmlParser() {
        return new Parser(new HtmlTreeBuilder());
    }

    /**
     * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML,
     * rather creates a simple tree directly from the input.
     * @return a new simple XML parser.
     */
    public static Parser xmlParser() {
        return new Parser(new XmlTreeBuilder()).setMaxDepth(Integer.MAX_VALUE);
    }
}