001package org.jsoup.parser; 002 003import org.jsoup.Connection; 004import org.jsoup.helper.Validate; 005import org.jsoup.nodes.Document; 006import org.jsoup.nodes.Element; 007import org.jsoup.nodes.Node; 008import org.jsoup.select.Evaluator; 009import org.jsoup.select.NodeVisitor; 010import org.jsoup.select.Selector; 011import org.jspecify.annotations.Nullable; 012 013import java.io.Closeable; 014import java.io.IOException; 015import java.io.Reader; 016import java.io.StringReader; 017import java.io.UncheckedIOException; 018import java.util.Iterator; 019import java.util.LinkedList; 020import java.util.List; 021import java.util.NoSuchElementException; 022import java.util.Queue; 023import java.util.Spliterator; 024import java.util.Spliterators; 025import java.util.stream.Stream; 026import java.util.stream.StreamSupport; 027 028/** 029 A StreamParser provides a progressive parse of its input. As each Element is completed, it is emitted via a Stream or 030 Iterator interface. Elements returned will be complete with all their children, and an (empty) next sibling, if 031 applicable. 032 <p>To conserve memory, you can {@link Node#remove() remove()} Elements (or their children) from the DOM during the 033 parse. This provides a mechanism to parse an input document that would otherwise be too large to fit into memory, yet 034 still providing a DOM interface to the document and its elements.</p> 035 <p> 036 Additionally, the parser provides a {@link #selectFirst(String query)} / {@link #selectNext(String query)}, which will 037 run the parser until a hit is found, at which point the parse is suspended. It can be resumed via another 038 {@code select()} call, or via the {@link #stream()} or {@link #iterator()} methods. 039 </p> 040 <p>Once the input has been fully read, the input Reader will be closed. Or, if the whole document does not need to be 041 read, call {@link #stop()} and {@link #close()}.</p> 042 <p>The {@link #document()} method will return the Document being parsed into, which will be only partially complete 043 until the input is fully consumed.</p> 044 <p>A StreamParser can be reused via a new {@link #parse(Reader, String)}, but is not thread-safe for concurrent inputs. 045 New parsers should be used in each thread.</p> 046 <p>If created via {@link Connection.Response#streamParser()}, or another Reader that is I/O backed, the iterator and 047 stream consumers will throw an {@link java.io.UncheckedIOException} if the underlying Reader errors during read.</p> 048 <p>For examples, see the jsoup 049 <a href="https://jsoup.org/cookbook/input/streamparser-dom-sax">StreamParser cookbook.</a></p> 050 <p> 051 Selectors that depend on knowing all siblings (e.g. {@code :last-child}, {@code :last-of-type}, {@code :nth-last-child}, 052 {@code :only-child} and their negations) cannot be correctly evaluated while streaming, because the parser does not know 053 if a later sibling will appear. For those cases, run {@link #complete()} first to finish the parse (which is effectively 054 the same as using {@code Jsoup.parse(...)} unless you have already removed nodes during streaming). 055 </p> 056 @since 1.18.1 */ 057public class StreamParser implements Closeable { 058 final private Parser parser; 059 final private TreeBuilder treeBuilder; 060 final private ElementIterator it = new ElementIterator(); 061 @Nullable private Document document; 062 private boolean stopped = false; 063 064 /** 065 Construct a new StreamParser, using the supplied base Parser. 066 @param parser the configured base parser 067 */ 068 public StreamParser(Parser parser) { 069 this.parser = parser; 070 treeBuilder = parser.getTreeBuilder(); 071 treeBuilder.nodeListener(it); 072 } 073 074 /** 075 Provide the input for a Document parse. The input is not read until a consuming operation is called. 076 @param input the input to be read. 077 @param baseUri the URL of this input, for absolute link resolution 078 @return this parser, for chaining 079 */ 080 public StreamParser parse(Reader input, String baseUri) { 081 close(); // probably a no-op, but ensures any previous reader is closed 082 it.reset(); 083 treeBuilder.initialiseParse(input, baseUri, parser); // reader is not read, so no chance of IO error 084 document = treeBuilder.doc; 085 return this; 086 } 087 088 /** 089 Provide the input for a Document parse. The input is not read until a consuming operation is called. 090 @param input the input to be read 091 @param baseUri the URL of this input, for absolute link resolution 092 @return this parser 093 */ 094 public StreamParser parse(String input, String baseUri) { 095 return parse(new StringReader(input), baseUri); 096 } 097 098 /** 099 Provide the input for a fragment parse. The input is not read until a consuming operation is called. 100 @param input the input to be read 101 @param context the optional fragment context element 102 @param baseUri the URL of this input, for absolute link resolution 103 @return this parser 104 @see #completeFragment() 105 */ 106 public StreamParser parseFragment(Reader input, @Nullable Element context, String baseUri) { 107 parse(input, baseUri); 108 treeBuilder.initialiseParseFragment(context); 109 return this; 110 } 111 112 /** 113 Provide the input for a fragment parse. The input is not read until a consuming operation is called. 114 @param input the input to be read 115 @param context the optional fragment context element 116 @param baseUri the URL of this input, for absolute link resolution 117 @return this parser 118 @see #completeFragment() 119 */ 120 public StreamParser parseFragment(String input, @Nullable Element context, String baseUri) { 121 return parseFragment(new StringReader(input), context, baseUri); 122 } 123 124 /** 125 Creates a {@link Stream} of {@link Element}s, with the input being parsed as each element is consumed. Each 126 Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that 127 (empty) sibling will exist at {@link Element#nextElementSibling()}). The stream will be emitted in document order as 128 each element is closed. That means that child elements will be returned prior to their parents. 129 <p>The stream will start from the current position of the backing iterator and the parse.</p> 130 <p>When consuming the stream, if the Reader that the Parser is reading throws an I/O exception (for example a 131 SocketTimeoutException), that will be emitted as an {@link UncheckedIOException}</p> 132 @return a stream of Element objects 133 @throws UncheckedIOException if the underlying Reader excepts during a read (in stream consuming methods) 134 */ 135 public Stream<Element> stream() { 136 return StreamSupport.stream( 137 Spliterators.spliteratorUnknownSize( 138 it, Spliterator.DISTINCT | Spliterator.NONNULL | Spliterator.ORDERED), 139 false); 140 } 141 142 /** 143 Returns an {@link Iterator} of {@link Element}s, with the input being parsed as each element is consumed. Each 144 Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that 145 (empty) sibling will exist at {@link Element#nextElementSibling()}). The elements will be emitted in document order as 146 each element is closed. That means that child elements will be returned prior to their parents. 147 <p>The iterator will start from the current position of the parse.</p> 148 <p>The iterator is backed by this StreamParser, and the resources it holds.</p> 149 @return a stream of Element objects 150 */ 151 public Iterator<Element> iterator() { 152 //noinspection ReturnOfInnerClass 153 return it; 154 } 155 156 /** 157 Flags that the parse should be stopped; the backing iterator will not return any more Elements. 158 @return this parser 159 */ 160 public StreamParser stop() { 161 stopped = true; 162 return this; 163 } 164 165 /** 166 Closes the input and releases resources including the underlying parser and reader. 167 <p>The parser will also be closed when the input is fully read.</p> 168 <p>The parser can be reused with another call to {@link #parse(Reader, String)}.</p> 169 */ 170 @Override public void close() { 171 treeBuilder.completeParse(); // closes the reader, frees resources 172 } 173 174 /** 175 Get the current {@link Document} as it is being parsed. It will be only partially complete until the input is fully 176 read. Structural changes (e.g. insert, remove) may be made to the Document contents. 177 @return the (partial) Document 178 */ 179 public Document document() { 180 document = treeBuilder.doc; 181 Validate.notNull(document, "Must run parse() before calling."); 182 return document; 183 } 184 185 /** 186 Runs the parser until the input is fully read, and returns the completed Document. 187 @return the completed Document 188 @throws IOException if an I/O error occurs 189 */ 190 public Document complete() throws IOException { 191 Document doc = document(); 192 treeBuilder.runParser(); 193 return doc; 194 } 195 196 /** 197 When initialized as a fragment parse, runs the parser until the input is fully read, and returns the completed 198 fragment child nodes. 199 @return the completed child nodes 200 @throws IOException if an I/O error occurs 201 @see #parseFragment(Reader, Element, String) 202 */ 203 public List<Node> completeFragment() throws IOException { 204 treeBuilder.runParser(); 205 return treeBuilder.completeParseFragment(); 206 } 207 208 /** 209 Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the 210 input will be parsed until the first match is found, or the input is completely read. 211 @param query the {@link org.jsoup.select.Selector} query. 212 @return the first matching {@link Element}, or {@code null} if there's no match 213 @throws IOException if an I/O error occurs 214 @see #selectFirst(Evaluator) 215 */ 216 public @Nullable Element selectFirst(String query) throws IOException { 217 return selectFirst(Selector.evaluatorOf(query)); 218 } 219 220 /** 221 Just like {@link #selectFirst(String)}, but if there is no match, throws an {@link IllegalArgumentException}. This 222 is useful if you want to simply abort processing on a failed match. 223 @param query the {@link org.jsoup.select.Selector} query. 224 @return the first matching element 225 @throws IllegalArgumentException if no match is found 226 @throws IOException if an I/O error occurs 227 */ 228 public Element expectFirst(String query) throws IOException { 229 return Validate.expectNotNull( 230 selectFirst(query), 231 "No elements matched the query '%s' in the document." 232 , query 233 ); 234 } 235 236 /** 237 Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the 238 input will be parsed until the first match is found, or the input is completely read. 239 <p>By providing a compiled evaluator vs a CSS selector, this method may be more efficient when executing the same 240 query against multiple documents.</p> 241 @param eval the {@link org.jsoup.select.Selector} evaluator. 242 @return the first matching {@link Element}, or {@code null} if there's no match 243 @throws IOException if an I/O error occurs 244 @see Selector#evaluatorOf(String css) 245 */ 246 public @Nullable Element selectFirst(Evaluator eval) throws IOException { 247 final Document doc = document(); 248 249 // run the query on the existing (partial) doc first, as there may be a hit already parsed 250 Element first = doc.selectFirst(eval); 251 if (first != null) return first; 252 253 return selectNext(eval); 254 } 255 256 /** 257 Finds the next Element that matches the provided query. The input will be parsed until the next match is found, or 258 the input is completely read. 259 @param query the {@link org.jsoup.select.Selector} query. 260 @return the next matching {@link Element}, or {@code null} if there's no match 261 @throws IOException if an I/O error occurs 262 @see #selectNext(Evaluator) 263 */ 264 public @Nullable Element selectNext(String query) throws IOException { 265 return selectNext(Selector.evaluatorOf(query)); 266 } 267 268 /** 269 Just like {@link #selectFirst(String)}, but if there is no match, throws an {@link IllegalArgumentException}. This 270 is useful if you want to simply abort processing on a failed match. 271 @param query the {@link org.jsoup.select.Selector} query. 272 @return the first matching element 273 @throws IllegalArgumentException if no match is found 274 @throws IOException if an I/O error occurs 275 */ 276 public Element expectNext(String query) throws IOException { 277 return Validate.expectNotNull( 278 selectNext(query), 279 "No elements matched the query '%s' in the document." 280 , query 281 ); 282 } 283 284 /** 285 Finds the next Element that matches the provided query. The input will be parsed until the next match is found, or 286 the input is completely read. 287 <p>By providing a compiled evaluator vs a CSS selector, this method may be more efficient when executing the same 288 query against multiple documents.</p> 289 @param eval the {@link org.jsoup.select.Selector} evaluator. 290 @return the next matching {@link Element}, or {@code null} if there's no match 291 @throws IOException if an I/O error occurs 292 @see Selector#evaluatorOf(String css) 293 */ 294 public @Nullable Element selectNext(Evaluator eval) throws IOException { 295 try { 296 final Document doc = document(); // validates the parse was initialized, keeps stack trace out of stream 297 return stream() 298 .filter(eval.asPredicate(doc)) 299 .findFirst() 300 .orElse(null); 301 } catch (UncheckedIOException e) { 302 // Reader threw an IO exception emitted via Iterator's next() 303 throw e.getCause(); 304 } 305 } 306 307 final class ElementIterator implements Iterator<Element>, NodeVisitor { 308 // listeners add to a next emit queue, as a single token read step may yield multiple elements 309 final private Queue<Element> emitQueue = new LinkedList<>(); 310 private @Nullable Element current; // most recently emitted 311 private @Nullable Element next; // element waiting to be picked up 312 private @Nullable Element tail; // The last tailed element (</html>), on hold for final pop 313 314 void reset() { 315 emitQueue.clear(); 316 current = next = tail = null; 317 stopped = false; 318 } 319 320 // Iterator Interface: 321 /** 322 {@inheritDoc} 323 @throws UncheckedIOException if the underlying Reader errors during a read 324 */ 325 @Override public boolean hasNext() { 326 maybeFindNext(); 327 return next != null; 328 } 329 330 /** 331 {@inheritDoc} 332 @throws UncheckedIOException if the underlying Reader errors during a read 333 */ 334 @Override public Element next() { 335 maybeFindNext(); 336 if (next == null) throw new NoSuchElementException(); 337 current = next; 338 next = null; 339 return current; 340 } 341 342 private void maybeFindNext() { 343 if (stopped || next != null) return; 344 345 // drain the current queue before stepping to get more 346 if (!emitQueue.isEmpty()) { 347 next = emitQueue.remove(); 348 return; 349 } 350 351 // step the parser, which will hit the node listeners to add to the queue: 352 while (treeBuilder.stepParser()) { 353 if (!emitQueue.isEmpty()) { 354 next = emitQueue.remove(); 355 return; 356 } 357 } 358 stop(); 359 close(); 360 361 // send the final element out: 362 if (tail != null) { 363 next = tail; 364 tail = null; 365 } 366 } 367 368 @Override public void remove() { 369 if (current == null) throw new NoSuchElementException(); 370 current.remove(); 371 } 372 373 // NodeVisitor Interface: 374 @Override public void head(Node node, int depth) { 375 if (node instanceof Element) { 376 Element prev = node.previousElementSibling(); 377 // We prefer to wait until an element has a next sibling before emitting it; otherwise, get it in tail 378 if (prev != null) emitQueue.add(prev); 379 } 380 } 381 382 @Override public void tail(Node node, int depth) { 383 if (node instanceof Element) { 384 tail = (Element) node; // kept for final hit 385 Element lastChild = tail.lastElementChild(); // won't get a nextsib, so emit that: 386 if (lastChild != null) emitQueue.add(lastChild); 387 } 388 } 389 } 390}