001package org.jsoup.parser;
002
003import org.jsoup.Connection;
004import org.jsoup.helper.Validate;
005import org.jsoup.nodes.Document;
006import org.jsoup.nodes.Element;
007import org.jsoup.nodes.Node;
008import org.jsoup.select.Evaluator;
009import org.jsoup.select.NodeVisitor;
010import org.jsoup.select.Selector;
011import org.jspecify.annotations.Nullable;
012
013import java.io.Closeable;
014import java.io.IOException;
015import java.io.Reader;
016import java.io.StringReader;
017import java.io.UncheckedIOException;
018import java.util.Iterator;
019import java.util.LinkedList;
020import java.util.List;
021import java.util.NoSuchElementException;
022import java.util.Queue;
023import java.util.Spliterator;
024import java.util.Spliterators;
025import java.util.stream.Stream;
026import java.util.stream.StreamSupport;
027
028/**
029 A StreamParser provides a progressive parse of its input. As each Element is completed, it is emitted via a Stream or
030 Iterator interface. Elements returned will be complete with all their children, and an (empty) next sibling, if
031 applicable.
032 <p>To conserve memory, you can {@link Node#remove() remove()} Elements (or their children) from the DOM during the
033 parse. This provides a mechanism to parse an input document that would otherwise be too large to fit into memory, yet
034 still providing a DOM interface to the document and its elements.</p>
035 <p>
 Additionally, the parser provides {@link #selectFirst(String query)} and {@link #selectNext(String query)} methods,
 which run the parser until a hit is found, at which point the parse is suspended. It can be resumed via another
 {@code select()} call, or via the {@link #stream()} or {@link #iterator()} methods.
039 </p>
040 <p>Once the input has been fully read, the input Reader will be closed. Or, if the whole document does not need to be
041 read, call {@link #stop()} and {@link #close()}.</p>
042 <p>The {@link #document()} method will return the Document being parsed into, which will be only partially complete
043 until the input is fully consumed.</p>
044 <p>A StreamParser can be reused via a new {@link #parse(Reader, String)}, but is not thread-safe for concurrent inputs.
045 New parsers should be used in each thread.</p>
046 <p>If created via {@link Connection.Response#streamParser()}, or another Reader that is I/O backed, the iterator and
047 stream consumers will throw an {@link java.io.UncheckedIOException} if the underlying Reader errors during read.</p>
048 <p>For examples, see the jsoup
049 <a href="https://jsoup.org/cookbook/input/streamparser-dom-sax">StreamParser cookbook.</a></p>
050 <p>
051 Selectors that depend on knowing all siblings (e.g. {@code :last-child}, {@code :last-of-type}, {@code :nth-last-child},
052 {@code :only-child} and their negations) cannot be correctly evaluated while streaming, because the parser does not know
053 if a later sibling will appear. For those cases, run {@link #complete()} first to finish the parse (which is effectively
054 the same as using {@code Jsoup.parse(...)} unless you have already removed nodes during streaming).
055 </p>
056 @since 1.18.1 */
057public class StreamParser implements Closeable {
058    final private Parser parser;
059    final private TreeBuilder treeBuilder;
060    final private ElementIterator it = new ElementIterator();
061    @Nullable private Document document;
062    private boolean stopped = false;
063
064    /**
065     Construct a new StreamParser, using the supplied base Parser.
066     @param parser the configured base parser
067     */
068    public StreamParser(Parser parser) {
069        this.parser = parser;
070        treeBuilder = parser.getTreeBuilder();
071        treeBuilder.nodeListener(it);
072    }
073
074    /**
075     Provide the input for a Document parse. The input is not read until a consuming operation is called.
076     @param input the input to be read.
077     @param baseUri the URL of this input, for absolute link resolution
078     @return this parser, for chaining
079     */
080    public StreamParser parse(Reader input, String baseUri) {
081        close(); // probably a no-op, but ensures any previous reader is closed
082        it.reset();
083        treeBuilder.initialiseParse(input, baseUri, parser); // reader is not read, so no chance of IO error
084        document = treeBuilder.doc;
085        return this;
086    }
087
088    /**
089     Provide the input for a Document parse. The input is not read until a consuming operation is called.
090     @param input the input to be read
091     @param baseUri the URL of this input, for absolute link resolution
092     @return this parser
093     */
094    public StreamParser parse(String input, String baseUri) {
095        return parse(new StringReader(input), baseUri);
096    }
097
098    /**
099     Provide the input for a fragment parse. The input is not read until a consuming operation is called.
100     @param input the input to be read
101     @param context the optional fragment context element
102     @param baseUri the URL of this input, for absolute link resolution
103     @return this parser
104     @see #completeFragment()
105     */
106    public StreamParser parseFragment(Reader input, @Nullable Element context, String baseUri) {
107        parse(input, baseUri);
108        treeBuilder.initialiseParseFragment(context);
109        return this;
110    }
111
112    /**
113     Provide the input for a fragment parse. The input is not read until a consuming operation is called.
114     @param input the input to be read
115     @param context the optional fragment context element
116     @param baseUri the URL of this input, for absolute link resolution
117     @return this parser
118     @see #completeFragment()
119     */
120    public StreamParser parseFragment(String input, @Nullable Element context, String baseUri) {
121        return parseFragment(new StringReader(input), context, baseUri);
122    }
123
124    /**
125     Creates a {@link Stream} of {@link Element}s, with the input being parsed as each element is consumed. Each
126     Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that
127     (empty) sibling will exist at {@link Element#nextElementSibling()}). The stream will be emitted in document order as
128     each element is closed. That means that child elements will be returned prior to their parents.
129     <p>The stream will start from the current position of the backing iterator and the parse.</p>
130     <p>When consuming the stream, if the Reader that the Parser is reading throws an I/O exception (for example a
131     SocketTimeoutException), that will be emitted as an {@link UncheckedIOException}</p>
132     @return a stream of Element objects
133     @throws UncheckedIOException if the underlying Reader excepts during a read (in stream consuming methods)
134     */
135    public Stream<Element> stream() {
136        return StreamSupport.stream(
137            Spliterators.spliteratorUnknownSize(
138                it, Spliterator.DISTINCT | Spliterator.NONNULL | Spliterator.ORDERED),
139            false);
140    }
141
142    /**
143     Returns an {@link Iterator} of {@link Element}s, with the input being parsed as each element is consumed. Each
144     Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that
145     (empty) sibling will exist at {@link Element#nextElementSibling()}). The elements will be emitted in document order as
146     each element is closed. That means that child elements will be returned prior to their parents.
147     <p>The iterator will start from the current position of the parse.</p>
148     <p>The iterator is backed by this StreamParser, and the resources it holds.</p>
149     @return a stream of Element objects
150     */
151    public Iterator<Element> iterator() {
152        //noinspection ReturnOfInnerClass
153        return it;
154    }
155
156    /**
157     Flags that the parse should be stopped; the backing iterator will not return any more Elements.
158     @return this parser
159     */
160    public StreamParser stop() {
161        stopped = true;
162        return this;
163    }
164
165    /**
166     Closes the input and releases resources including the underlying parser and reader.
167     <p>The parser will also be closed when the input is fully read.</p>
168     <p>The parser can be reused with another call to {@link #parse(Reader, String)}.</p>
169     */
170    @Override public void close() {
171        treeBuilder.completeParse(); // closes the reader, frees resources
172    }
173
174    /**
175     Get the current {@link Document} as it is being parsed. It will be only partially complete until the input is fully
176     read. Structural changes (e.g. insert, remove) may be made to the Document contents.
177     @return the (partial) Document
178     */
179    public Document document() {
180        document = treeBuilder.doc;
181        Validate.notNull(document, "Must run parse() before calling.");
182        return document;
183    }
184
185    /**
186     Runs the parser until the input is fully read, and returns the completed Document.
187     @return the completed Document
188     @throws IOException if an I/O error occurs
189     */
190    public Document complete() throws IOException {
191        Document doc = document();
192        treeBuilder.runParser();
193        return doc;
194    }
195
196    /**
197     When initialized as a fragment parse, runs the parser until the input is fully read, and returns the completed
198     fragment child nodes.
199     @return the completed child nodes
200     @throws IOException if an I/O error occurs
201     @see #parseFragment(Reader, Element, String)
202     */
203    public List<Node> completeFragment() throws IOException {
204        treeBuilder.runParser();
205        return treeBuilder.completeParseFragment();
206    }
207
208    /**
209     Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the
210     input will be parsed until the first match is found, or the input is completely read.
211     @param query the {@link org.jsoup.select.Selector} query.
212     @return the first matching {@link Element}, or {@code null} if there's no match
213     @throws IOException if an I/O error occurs
214     @see #selectFirst(Evaluator)
215     */
216    public @Nullable Element selectFirst(String query) throws IOException {
217        return selectFirst(Selector.evaluatorOf(query));
218    }
219
220    /**
221     Just like {@link #selectFirst(String)}, but if there is no match, throws an {@link IllegalArgumentException}. This
222     is useful if you want to simply abort processing on a failed match.
223     @param query the {@link org.jsoup.select.Selector} query.
224     @return the first matching element
225     @throws IllegalArgumentException if no match is found
226     @throws IOException if an I/O error occurs
227     */
228    public Element expectFirst(String query) throws IOException {
229        return Validate.expectNotNull(
230            selectFirst(query),
231            "No elements matched the query '%s' in the document."
232            , query
233        );
234    }
235
236    /**
237     Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the
238     input will be parsed until the first match is found, or the input is completely read.
239     <p>By providing a compiled evaluator vs a CSS selector, this method may be more efficient when executing the same
240     query against multiple documents.</p>
241     @param eval the {@link org.jsoup.select.Selector} evaluator.
242     @return the first matching {@link Element}, or {@code null} if there's no match
243     @throws IOException if an I/O error occurs
244     @see Selector#evaluatorOf(String css)
245     */
246    public @Nullable Element selectFirst(Evaluator eval) throws IOException {
247        final Document doc = document();
248
249        // run the query on the existing (partial) doc first, as there may be a hit already parsed
250        Element first = doc.selectFirst(eval);
251        if (first != null) return first;
252
253        return selectNext(eval);
254    }
255
256    /**
257     Finds the next Element that matches the provided query. The input will be parsed until the next match is found, or
258     the input is completely read.
259     @param query the {@link org.jsoup.select.Selector} query.
260     @return the next matching {@link Element}, or {@code null} if there's no match
261     @throws IOException if an I/O error occurs
262     @see #selectNext(Evaluator)
263     */
264    public @Nullable Element selectNext(String query) throws IOException {
265        return selectNext(Selector.evaluatorOf(query));
266    }
267
268    /**
269     Just like {@link #selectFirst(String)}, but if there is no match, throws an {@link IllegalArgumentException}. This
270     is useful if you want to simply abort processing on a failed match.
271     @param query the {@link org.jsoup.select.Selector} query.
272     @return the first matching element
273     @throws IllegalArgumentException if no match is found
274     @throws IOException if an I/O error occurs
275     */
276    public Element expectNext(String query) throws IOException {
277        return Validate.expectNotNull(
278            selectNext(query),
279            "No elements matched the query '%s' in the document."
280            , query
281        );
282    }
283
284    /**
285     Finds the next Element that matches the provided query. The input will be parsed until the next match is found, or
286     the input is completely read.
287     <p>By providing a compiled evaluator vs a CSS selector, this method may be more efficient when executing the same
288     query against multiple documents.</p>
289     @param eval the {@link org.jsoup.select.Selector} evaluator.
290     @return the next matching {@link Element}, or {@code null} if there's no match
291     @throws IOException if an I/O error occurs
292     @see Selector#evaluatorOf(String css)
293     */
294    public @Nullable Element selectNext(Evaluator eval) throws IOException {
295        try {
296            final Document doc = document(); // validates the parse was initialized, keeps stack trace out of stream
297            return stream()
298                .filter(eval.asPredicate(doc))
299                .findFirst()
300                .orElse(null);
301        } catch (UncheckedIOException e) {
302            // Reader threw an IO exception emitted via Iterator's next()
303            throw e.getCause();
304        }
305    }
306
307    final class ElementIterator implements Iterator<Element>, NodeVisitor {
308        // listeners add to a next emit queue, as a single token read step may yield multiple elements
309        final private Queue<Element> emitQueue = new LinkedList<>();
310        private @Nullable Element current;  // most recently emitted
311        private @Nullable Element next;     // element waiting to be picked up
312        private @Nullable Element tail;     // The last tailed element (</html>), on hold for final pop
313
314        void reset() {
315            emitQueue.clear();
316            current = next = tail = null;
317            stopped = false;
318        }
319
320        // Iterator Interface:
321        /**
322         {@inheritDoc}
323         @throws UncheckedIOException if the underlying Reader errors during a read
324         */
325        @Override public boolean hasNext() {
326            maybeFindNext();
327            return next != null;
328        }
329
330        /**
331         {@inheritDoc}
332         @throws UncheckedIOException if the underlying Reader errors during a read
333         */
334        @Override public Element next() {
335            maybeFindNext();
336            if (next == null) throw new NoSuchElementException();
337            current = next;
338            next = null;
339            return current;
340        }
341
342        private void maybeFindNext() {
343            if (stopped || next != null) return;
344
345            // drain the current queue before stepping to get more
346            if (!emitQueue.isEmpty()) {
347                next = emitQueue.remove();
348                return;
349            }
350
351            // step the parser, which will hit the node listeners to add to the queue:
352            while (treeBuilder.stepParser()) {
353                if (!emitQueue.isEmpty()) {
354                    next = emitQueue.remove();
355                    return;
356                }
357            }
358            stop();
359            close();
360
361            // send the final element out:
362            if (tail != null) {
363                next = tail;
364                tail = null;
365            }
366        }
367
368        @Override public void remove() {
369            if (current == null) throw new NoSuchElementException();
370            current.remove();
371        }
372
373        // NodeVisitor Interface:
374        @Override public void head(Node node, int depth) {
375            if (node instanceof Element) {
376                Element prev = node.previousElementSibling();
377                // We prefer to wait until an element has a next sibling before emitting it; otherwise, get it in tail
378                if (prev != null) emitQueue.add(prev);
379            }
380        }
381
382        @Override public void tail(Node node, int depth) {
383            if (node instanceof Element) {
384                tail = (Element) node; // kept for final hit
385                Element lastChild = tail.lastElementChild(); // won't get a nextsib, so emit that:
386                if (lastChild != null) emitQueue.add(lastChild);
387            }
388        }
389    }
390}