001package org.jsoup.parser;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.internal.SoftPool;
005import org.jsoup.internal.StringUtil;
006import org.jspecify.annotations.Nullable;
007
008import java.io.IOException;
009import java.io.UncheckedIOException;
010import java.io.Reader;
011import java.io.StringReader;
012import java.util.ArrayList;
013import java.util.Arrays;
014import java.util.Collections;
015import java.util.Locale;
016
017/**
018 CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes.
019 <p>If the underlying reader throws an IOException during any operation, the CharacterReader will throw an
020 {@link UncheckedIOException}. That won't happen with String / StringReader inputs.</p>
021 */
022public final class CharacterReader implements AutoCloseable {
023    static final char EOF = (char) -1;
024    private static final int MaxStringCacheLen = 12;
025    private static final int StringCacheSize = 512;
026    private String[] stringCache; // holds reused strings in this doc, to lessen garbage
027    private static final SoftPool<String[]> StringPool = new SoftPool<>(() -> new String[StringCacheSize]); // reuse cache between iterations
028
029    static final int BufferSize = 1024 * 2;         // visible for testing
030    static final int RefillPoint = BufferSize / 2;  // when bufPos characters read, refill; visible for testing
031    private static final int RewindLimit = 1024;    // the maximum we can rewind. No HTML entities can be larger than this.
032
033    private Reader reader;      // underlying Reader, will be backed by a buffered+controlled input stream, or StringReader
034    private char[] charBuf;     // character buffer we consume from; filled from Reader
035    private int bufPos;         // position in charBuf that's been consumed to
036    private int bufLength;      // the num of characters actually buffered in charBuf, <= charBuf.length
037    private int fillPoint = 0;  // how far into the charBuf we read before re-filling. 0.5 of charBuf.length after bufferUp
038    private int consumed;       // how many characters total have been consumed from this CharacterReader (less the current bufPos)
039    private int bufMark = -1;   // if not -1, the marked rewind position
040    private boolean readFully;  // if the underlying stream has been completely read, no value in further buffering
041
042    private static final SoftPool<char[]> BufferPool = new SoftPool<>(() -> new char[BufferSize]); // recycled char buffer
043
044    @Nullable private ArrayList<Integer> newlinePositions = null; // optionally track the pos() position of newlines - scans during bufferUp()
045    private int lineNumberOffset = 1; // line numbers start at 1; += newlinePosition[indexof(pos)]
046
047    public CharacterReader(Reader input, int sz) {
048        this(input); // sz is no longer used
049    }
050
051    public CharacterReader(Reader input) {
052        Validate.notNull(input);
053        reader = input;
054        charBuf = BufferPool.borrow();
055        stringCache = StringPool.borrow();
056        bufferUp();
057    }
058
059    public CharacterReader(String input) {
060        this(new StringReader(input));
061    }
062
063    @Override
064    public void close() {
065        if (reader == null)
066            return;
067        try {
068            reader.close();
069        } catch (IOException ignored) {
070        } finally {
071            reader = null;
072            Arrays.fill(charBuf, (char) 0); // before release, clear the buffer. Not required, but acts as a safety net, and makes debug view clearer
073            BufferPool.release(charBuf);
074            charBuf = null;
075            StringPool.release(stringCache); // conversely, we don't clear the string cache, so we can reuse the contents
076            stringCache = null;
077        }
078    }
079
080    private void bufferUp() {
081        if (readFully || bufPos < fillPoint || bufMark != -1)
082            return;
083        doBufferUp(); // structured so bufferUp may become an intrinsic candidate
084    }
085
086    /**
087     Reads into the buffer. Will throw an UncheckedIOException if the underling reader throws an IOException.
088     @throws UncheckedIOException if the underlying reader throws an IOException
089     */
090    private void doBufferUp() {
091        /*
092        The flow:
093        - if read fully, or if bufPos < fillPoint, or if marked - do not fill.
094        - update readerPos (total amount consumed from this CharacterReader) += bufPos
095        - shift charBuf contents such that bufPos = 0; set next read offset (bufLength) -= shift amount
096        - loop read the Reader until we fill charBuf. bufLength += read.
097        - readFully = true when read = -1
098         */
099        consumed += bufPos;
100        bufLength -= bufPos;
101        if (bufLength > 0)
102            System.arraycopy(charBuf, bufPos, charBuf, 0, bufLength);
103        bufPos = 0;
104        while (bufLength < BufferSize) {
105            try {
106                int read = reader.read(charBuf, bufLength, charBuf.length - bufLength);
107                if (read == -1) {
108                    readFully = true;
109                    break;
110                }
111                if (read == 0) {
112                    break; // if we have a surrogate on the buffer boundary and trying to read 1; will have enough in our buffer to proceed
113                }
114                bufLength += read;
115            } catch (IOException e) {
116                throw new UncheckedIOException(e);
117            }
118        }
119        fillPoint = Math.min(bufLength, RefillPoint);
120
121        scanBufferForNewlines(); // if enabled, we index newline positions for line number tracking
122        lastIcSeq = null; // cache for last containsIgnoreCase(seq)
123    }
124
125    void mark() {
126        // make sure there is enough look ahead capacity
127        if (bufLength - bufPos < RewindLimit)
128            fillPoint = 0;
129
130        bufferUp();
131        bufMark = bufPos;
132    }
133
134    void unmark() {
135        bufMark = -1;
136    }
137
138    void rewindToMark() {
139        if (bufMark == -1)
140            throw new UncheckedIOException(new IOException("Mark invalid"));
141
142        bufPos = bufMark;
143        unmark();
144    }
145
146    /**
147     * Gets the position currently read to in the content. Starts at 0.
148     * @return current position
149     */
150    public int pos() {
151        return consumed + bufPos;
152    }
153
154    /** Tests if the buffer has been fully read. */
155    boolean readFully() {
156        return readFully;
157    }
158
159    /**
160     Enables or disables line number tracking. By default, will be <b>off</b>.Tracking line numbers improves the
161     legibility of parser error messages, for example. Tracking should be enabled before any content is read to be of
162     use.
163
164     @param track set tracking on|off
165     @since 1.14.3
166     */
167    public void trackNewlines(boolean track) {
168        if (track && newlinePositions == null) {
169            newlinePositions = new ArrayList<>(BufferSize / 80); // rough guess of likely count
170            scanBufferForNewlines(); // first pass when enabled; subsequently called during bufferUp
171        }
172        else if (!track)
173            newlinePositions = null;
174    }
175
176    /**
177     Check if the tracking of newlines is enabled.
178     @return the current newline tracking state
179     @since 1.14.3
180     */
181    public boolean isTrackNewlines() {
182        return newlinePositions != null;
183    }
184
185    /**
186     Get the current line number (that the reader has consumed to). Starts at line #1.
187     @return the current line number, or 1 if line tracking is not enabled.
188     @since 1.14.3
189     @see #trackNewlines(boolean)
190     */
191    public int lineNumber() {
192        return lineNumber(pos());
193    }
194
195    int lineNumber(int pos) {
196        // note that this impl needs to be called before the next buffer up or line numberoffset will be wrong. if that
197        // causes issues, can remove the reset of newlinepositions during buffer, at the cost of a larger tracking array
198        if (!isTrackNewlines())
199            return 1;
200
201        int i = lineNumIndex(pos);
202        if (i == -1)
203            return lineNumberOffset; // first line
204        return i + lineNumberOffset + 1;
205    }
206
207    /**
208     Get the current column number (that the reader has consumed to). Starts at column #1.
209     @return the current column number
210     @since 1.14.3
211     @see #trackNewlines(boolean)
212     */
213    public int columnNumber() {
214        return columnNumber(pos());
215    }
216
217    int columnNumber(int pos) {
218        if (!isTrackNewlines())
219            return pos + 1;
220
221        int i = lineNumIndex(pos);
222        if (i == -1)
223          return pos + 1;
224        return pos - newlinePositions.get(i) + 1;
225    }
226
227    /**
228     Get a formatted string representing the current line and column positions. E.g. <code>5:10</code> indicating line
229     number 5 and column number 10.
230     @return line:col position
231     @since 1.14.3
232     @see #trackNewlines(boolean)
233     */
234    String posLineCol() {
235        return lineNumber() + ":" + columnNumber();
236    }
237
238    private int lineNumIndex(int pos) {
239        if (!isTrackNewlines()) return 0;
240        int i = Collections.binarySearch(newlinePositions, pos);
241        if (i < -1) i = Math.abs(i) - 2;
242        return i;
243    }
244
245    /**
246     Scans the buffer for newline position, and tracks their location in newlinePositions.
247     */
248    private void scanBufferForNewlines() {
249        if (!isTrackNewlines())
250            return;
251
252        if (newlinePositions.size() > 0) {
253            // work out the line number that we have read up to (as we have likely scanned past this point)
254            int index = lineNumIndex(consumed);
255            if (index == -1) index = 0; // first line
256            int linePos = newlinePositions.get(index);
257            lineNumberOffset += index; // the num lines we've read up to
258            newlinePositions.clear();
259            newlinePositions.add(linePos); // roll the last read pos to first, for cursor num after buffer
260        }
261
262        for (int i = bufPos; i < bufLength; i++) {
263            if (charBuf[i] == '\n')
264                newlinePositions.add(1 + consumed + i);
265        }
266    }
267
268    /**
269     * Tests if all the content has been read.
270     * @return true if nothing left to read.
271     */
272    public boolean isEmpty() {
273        bufferUp();
274        return bufPos >= bufLength;
275    }
276
277    private boolean isEmptyNoBufferUp() {
278        return bufPos >= bufLength;
279    }
280
281    /**
282     * Get the char at the current position.
283     * @return char
284     */
285    public char current() {
286        bufferUp();
287        return isEmptyNoBufferUp() ? EOF : charBuf[bufPos];
288    }
289
290    /**
291     Consume one character off the queue.
292     @return first character on queue, or EOF if the queue is empty.
293     */
294    public char consume() {
295        bufferUp();
296        char val = isEmptyNoBufferUp() ? EOF : charBuf[bufPos];
297        bufPos++;
298        return val;
299    }
300
301    /**
302     Unconsume one character (bufPos--). MUST only be called directly after a consume(), and no chance of a bufferUp.
303     */
304    void unconsume() {
305        if (bufPos < 1)
306            throw new UncheckedIOException(new IOException("WTF: No buffer left to unconsume.")); // a bug if this fires, need to trace it.
307
308        bufPos--;
309    }
310
311    /**
312     * Moves the current position by one.
313     */
314    public void advance() {
315        bufPos++;
316    }
317
318    /**
319     * Returns the number of characters between the current position and the next instance of the input char
320     * @param c scan target
321     * @return offset between current position and next instance of target. -1 if not found.
322     */
323    int nextIndexOf(char c) {
324        // doesn't handle scanning for surrogates
325        bufferUp();
326        for (int i = bufPos; i < bufLength; i++) {
327            if (c == charBuf[i])
328                return i - bufPos;
329        }
330        return -1;
331    }
332
333    /**
334     * Returns the number of characters between the current position and the next instance of the input sequence
335     *
336     * @param seq scan target
337     * @return offset between current position and next instance of target. -1 if not found.
338     */
339    int nextIndexOf(CharSequence seq) {
340        bufferUp();
341        // doesn't handle scanning for surrogates
342        char startChar = seq.charAt(0);
343        for (int offset = bufPos; offset < bufLength; offset++) {
344            // scan to first instance of startchar:
345            if (startChar != charBuf[offset])
346                while(++offset < bufLength && startChar != charBuf[offset]) { /* empty */ }
347            int i = offset + 1;
348            int last = i + seq.length()-1;
349            if (offset < bufLength && last <= bufLength) {
350                for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ }
351                if (i == last) // found full sequence
352                    return offset - bufPos;
353            }
354        }
355        return -1;
356    }
357
358    /**
359     * Reads characters up to the specific char.
360     * @param c the delimiter
361     * @return the chars read
362     */
363    public String consumeTo(char c) {
364        int offset = nextIndexOf(c);
365        if (offset != -1) {
366            String consumed = cacheString(charBuf, stringCache, bufPos, offset);
367            bufPos += offset;
368            return consumed;
369        } else {
370            return consumeToEnd();
371        }
372    }
373
374    /**
375     Reads the characters up to (but not including) the specified case-sensitive string.
376     <p>If the sequence is not found in the buffer, will return the remainder of the current buffered amount, less the
377     length of the sequence, such that this call may be repeated.
378     @param seq the delimiter
379     @return the chars read
380     */
381    public String consumeTo(String seq) {
382        int offset = nextIndexOf(seq);
383        if (offset != -1) {
384            String consumed = cacheString(charBuf, stringCache, bufPos, offset);
385            bufPos += offset;
386            return consumed;
387        } else if (bufLength - bufPos < seq.length()) {
388            // nextIndexOf() did a bufferUp(), so if the buffer is shorter than the search string, we must be at EOF
389            return consumeToEnd();
390        } else {
391            // the string we're looking for may be straddling a buffer boundary, so keep (length - 1) characters
392            // unread in case they contain the beginning of the search string
393            int endPos = bufLength - seq.length() + 1;
394            String consumed = cacheString(charBuf, stringCache, bufPos, endPos - bufPos);
395            bufPos = endPos;
396            return consumed;
397        }
398    }
399
400    /**
401     Read characters while the input predicate returns true.
402     @return characters read
403     */
404    String consumeMatching(CharPredicate func) {
405        return consumeMatching(func, -1);
406    }
407
408    /**
409     Read characters while the input predicate returns true, up to a maximum length.
410     @param func predicate to test
411     @param maxLength maximum length to read. -1 indicates no maximum
412     @return characters read
413     */
414    String consumeMatching(CharPredicate func, int maxLength) {
415        bufferUp();
416        int pos = bufPos;
417        final int start = pos;
418        final int remaining = bufLength;
419        final char[] val = charBuf;
420
421        while (pos < remaining && (maxLength == -1 || pos - start < maxLength) && func.test(val[pos])) {
422            pos++;
423        }
424
425        bufPos = pos;
426        return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
427    }
428
429    /**
430     * Read characters until the first of any delimiters is found.
431     * @param chars delimiters to scan for
432     * @return characters read up to the matched delimiter.
433     */
434    public String consumeToAny(final char... chars) {
435        return consumeMatching(c -> { // seeks until we see one of the terminating chars
436            for (char seek : chars)
437                if (c == seek) return false;
438            return true;
439        });
440    }
441
442    String consumeToAnySorted(final char... chars) {
443        return consumeMatching(c -> Arrays.binarySearch(chars, c) < 0); // matches until a hit
444    }
445
446    String consumeData() {
447        // consumes until &, <, null
448        return consumeMatching(c -> c != '&' && c != '<' && c != TokeniserState.nullChar);
449    }
450
451    String consumeAttributeQuoted(final boolean single) {
452        // null, " or ', &
453        return consumeMatching(c -> c != TokeniserState.nullChar && c != '&' && (single ? c != '\'' : c != '"'));
454    }
455
456    String consumeRawData() {
457        // <, null
458        return consumeMatching(c -> c != '<' && c != TokeniserState.nullChar);
459    }
460
461    String consumeTagName() {
462        // '\t', '\n', '\r', '\f', ' ', '/', '>'
463        // NOTE: out of spec; does not stop and append on nullChar but eats
464        return consumeMatching(c -> {
465            switch (c) {
466                case '\t':
467                case '\n':
468                case '\r':
469                case '\f':
470                case ' ':
471                case '/':
472                case '>':
473                    return false;
474            }
475            return true;
476        });
477    }
478
479    String consumeToEnd() {
480        bufferUp();
481        String data = cacheString(charBuf, stringCache, bufPos, bufLength - bufPos);
482        bufPos = bufLength;
483        return data;
484    }
485
486    String consumeLetterSequence() {
487        return consumeMatching(Character::isLetter);
488    }
489
490    String consumeLetterThenDigitSequence() {
491        bufferUp();
492        int start = bufPos;
493        while (bufPos < bufLength) {
494            if (StringUtil.isAsciiLetter(charBuf[bufPos])) bufPos++;
495            else break;
496        }
497        while (!isEmptyNoBufferUp()) {
498            if (StringUtil.isDigit(charBuf[bufPos])) bufPos++;
499            else break;
500        }
501
502        return cacheString(charBuf, stringCache, start, bufPos - start);
503    }
504
505    String consumeHexSequence() {
506        return consumeMatching(StringUtil::isHexDigit);
507    }
508
509    String consumeDigitSequence() {
510        return consumeMatching(c -> c >= '0' && c <= '9');
511    }
512
513    boolean matches(char c) {
514        return !isEmpty() && charBuf[bufPos] == c;
515    }
516
517    boolean matches(String seq) {
518        bufferUp();
519        int scanLength = seq.length();
520        if (scanLength > bufLength - bufPos)
521            return false;
522
523        for (int offset = 0; offset < scanLength; offset++)
524            if (seq.charAt(offset) != charBuf[bufPos +offset])
525                return false;
526        return true;
527    }
528
529    boolean matchesIgnoreCase(String seq) {
530        bufferUp();
531        int scanLength = seq.length();
532        if (scanLength > bufLength - bufPos)
533            return false;
534
535        for (int offset = 0; offset < scanLength; offset++) {
536            char scan = seq.charAt(offset);
537            char target = charBuf[bufPos + offset];
538            if (scan == target) continue;
539
540            scan = Character.toUpperCase(scan);
541            target = Character.toUpperCase(target);
542            if (scan != target) return false;
543        }
544        return true;
545    }
546
547    /**
548     Tests if the next character in the queue matches any of the characters in the sequence, case sensitively.
549     @param seq list of characters to check for
550     @return true if any matched, false if none did
551     */
552    boolean matchesAny(char... seq) {
553        if (isEmpty())
554            return false;
555
556        bufferUp();
557        char c = charBuf[bufPos];
558        for (char seek : seq) {
559            if (seek == c)
560                return true;
561        }
562        return false;
563    }
564
565    boolean matchesAnySorted(char[] seq) {
566        bufferUp();
567        return !isEmpty() && Arrays.binarySearch(seq, charBuf[bufPos]) >= 0;
568    }
569
570    /**
571     Checks if the current pos matches an ascii alpha (A-Z a-z) per https://infra.spec.whatwg.org/#ascii-alpha
572     @return if it matches or not
573     */
574    boolean matchesAsciiAlpha() {
575        if (isEmpty()) return false;
576        return StringUtil.isAsciiLetter(charBuf[bufPos]);
577    }
578
579    boolean matchesDigit() {
580        if (isEmpty()) return false;
581        return StringUtil.isDigit(charBuf[bufPos]);
582    }
583
584    boolean matchConsume(String seq) {
585        bufferUp();
586        if (matches(seq)) {
587            bufPos += seq.length();
588            return true;
589        } else {
590            return false;
591        }
592    }
593
594    boolean matchConsumeIgnoreCase(String seq) {
595        if (matchesIgnoreCase(seq)) {
596            bufPos += seq.length();
597            return true;
598        } else {
599            return false;
600        }
601    }
602
603    // we maintain a cache of the previously scanned sequence, and return that if applicable on repeated scans.
604    // that improves the situation where there is a sequence of <p<p<p<p<p<p<p...</title> and we're bashing on the <p
605    // looking for the </title>. Resets in bufferUp()
606    @Nullable private String lastIcSeq; // scan cache
607    private int lastIcIndex; // nearest found indexOf
608
609    /** Used to check presence of </title>, </style> when we're in RCData and see a <xxx. Only finds consistent case. */
610    boolean containsIgnoreCase(String seq) {
611        if (seq.equals(lastIcSeq)) {
612            if (lastIcIndex == -1) return false;
613            if (lastIcIndex >= bufPos) return true;
614        }
615        lastIcSeq = seq;
616
617        String loScan = seq.toLowerCase(Locale.ENGLISH);
618        int lo = nextIndexOf(loScan);
619        if (lo > -1) {
620            lastIcIndex = bufPos + lo; return true;
621        }
622
623        String hiScan = seq.toUpperCase(Locale.ENGLISH);
624        int hi = nextIndexOf(hiScan);
625        boolean found = hi > -1;
626        lastIcIndex = found ? bufPos + hi : -1; // we don't care about finding the nearest, just that buf contains
627        return found;
628    }
629
630    @Override
631    public String toString() {
632        if (bufLength - bufPos < 0) return "";
633        return new String(charBuf, bufPos, bufLength - bufPos);
634    }
635
636    /**
637     * Caches short strings, as a flyweight pattern, to reduce GC load. Just for this doc, to prevent leaks.
638     * <p />
639     * Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list.
640     * That saves both having to create objects as hash keys, and running through the entry list, at the expense of
641     * some more duplicates.
642     */
643    private static String cacheString(final char[] charBuf, final String[] stringCache, final int start, final int count) {
644        if (count > MaxStringCacheLen) // don't cache strings that are too big
645            return new String(charBuf, start, count);
646        if (count < 1)
647            return "";
648
649        // calculate hash:
650        int hash = 0;
651        int end = count + start;
652        for (int i = start; i < end; i++) {
653            hash = 31 * hash + charBuf[i];
654        }
655
656        // get from cache
657        final int index = hash & StringCacheSize - 1;
658        String cached = stringCache[index];
659
660        if (cached != null && rangeEquals(charBuf, start, count, cached)) // positive hit
661            return cached;
662        else {
663            cached = new String(charBuf, start, count);
664            stringCache[index] = cached; // add or replace, assuming most recently used are most likely to recur next
665        }
666
667        return cached;
668    }
669
670    /**
671     * Check if the value of the provided range equals the string.
672     */
673    static boolean rangeEquals(final char[] charBuf, final int start, int count, final String cached) {
674        if (count == cached.length()) {
675            int i = start;
676            int j = 0;
677            while (count-- != 0) {
678                if (charBuf[i++] != cached.charAt(j++))
679                    return false;
680            }
681            return true;
682        }
683        return false;
684    }
685
686    // just used for testing
687    boolean rangeEquals(final int start, final int count, final String cached) {
688        return rangeEquals(charBuf, start, count, cached);
689    }
690
691    @FunctionalInterface
692    interface CharPredicate {
693        boolean test(char c);
694    }
695}