001package org.jsoup.parser;
002
003import org.jsoup.internal.StringUtil;
004import org.jsoup.helper.Validate;
005
006/**
007 A character reader with helpers focusing on parsing CSS selectors. Used internally by jsoup. API subject to changes.
008 */
009
010public class TokenQueue implements AutoCloseable {
011    private static final char Esc = '\\'; // escape char for chomp balanced.
012    private static final char Hyphen_Minus = '-';
013    private static final char Unicode_Null = '\u0000';
014    private static final char Replacement = '\uFFFD';
015
016    private final CharacterReader reader;
017
018    /**
019     Create a new TokenQueue.
020     @param data string of data to back queue.
021     */
022    public TokenQueue(String data) {
023        reader = new CharacterReader(data);
024    }
025
026    /**
027     Is the queue empty?
028     @return true if no data left in queue.
029     */
030    public boolean isEmpty() {
031        return reader.isEmpty();
032    }
033
034    /**
035     Consume one character off queue.
036     @return first character on queue.
037     */
038    public char consume() {
039        return reader.consume();
040    }
041
042    /**
043     Drops the next character off the queue.
044     */
045    public void advance() {
046        if (!isEmpty()) reader.advance();
047    }
048
049    char current() {
050        return reader.current();
051    }
052
053    /**
054     Tests if the next characters on the queue match the sequence, case-insensitively.
055     @param seq String to check queue for.
056     @return true if the next characters match.
057     */
058    public boolean matches(String seq) {
059        return reader.matchesIgnoreCase(seq);
060    }
061
062    /** Tests if the next character on the queue matches the character, case-sensitively. */
063    public boolean matches(char c) {
064        return reader.matches(c);
065    }
066
067    /**
068     Tests if the next characters match any of the sequences, case-<b>sensitively</b>.
069     @param seq list of chars to case-sensitively check for
070     @return true of any matched, false if none did
071     */
072    public boolean matchesAny(char... seq) {
073        return reader.matchesAny(seq);
074    }
075
076    /**
077     If the queue case-insensitively matches the supplied string, consume it off the queue.
078     @param seq String to search for, and if found, remove from queue.
079     @return true if found and removed, false if not found.
080     */
081    public boolean matchChomp(String seq) {
082        return reader.matchConsumeIgnoreCase(seq);
083    }
084
085    /** If the queue matches the supplied (case-sensitive) character, consume it off the queue. */
086    public boolean matchChomp(char c) {
087        if (reader.matches(c)) {
088            consume();
089            return true;
090        }
091        return false;
092    }
093
094    /**
095     Tests if queue starts with a whitespace character.
096     @return if starts with whitespace
097     */
098    public boolean matchesWhitespace() {
099        return StringUtil.isWhitespace(reader.current());
100    }
101
102    /**
103     Test if the queue matches a tag word character (letter or digit).
104     @return if matches a word character
105     */
106    public boolean matchesWord() {
107        return Character.isLetterOrDigit(reader.current());
108    }
109
110    /**
111     Consumes the supplied sequence of the queue, case-insensitively. If the queue does not start with the supplied
112     sequence, will throw an illegal state exception -- but you should be running match() against that condition.
113
114     @param seq sequence to remove from head of queue.
115     */
116    public void consume(String seq) {
117        boolean found = reader.matchConsumeIgnoreCase(seq);
118        if (!found) throw new IllegalStateException("Queue did not match expected sequence");
119    }
120
121    /**
122     Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out.
123     @param seq String to end on (and not include in return, but leave on queue). <b>Case-sensitive.</b>
124     @return The matched data consumed from queue.
125     */
126    public String consumeTo(String seq) {
127        return reader.consumeTo(seq);
128    }
129
130    /**
131     Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue.
132     @param seq any number of terminators to consume to. <b>Case-insensitive.</b>
133     @return consumed string
134     */
135    public String consumeToAny(String... seq) {
136        StringBuilder sb = StringUtil.borrowBuilder();
137        OUT: while (!isEmpty()) {
138            for (String s : seq) {
139                if (reader.matchesIgnoreCase(s)) break OUT;
140            }
141            sb.append(consume());
142        }
143        return StringUtil.releaseBuilder(sb);
144    }
145
146    /**
147     Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three",
148     and leave " four" on the queue. Unbalanced openers and closers can be quoted (with ' or ") or escaped (with \).
149     Those escapes will be left in the returned string, which is suitable for regexes (where we need to preserve the
150     escape), but unsuitable for contains text strings; use unescape for that.
151
152     @param open opener
153     @param close closer
154     @return data matched from the queue
155     */
156    public String chompBalanced(char open, char close) {
157        StringBuilder accum = StringUtil.borrowBuilder();
158        int depth = 0;
159        char prev = 0;
160        boolean inSingle = false;
161        boolean inDouble = false;
162        boolean inRegexQE = false; // regex \Q .. \E escapes from Pattern.quote()
163        reader.mark(); // mark the initial position to restore if needed
164
165        do {
166            if (isEmpty()) break;
167            char c = consume();
168            if (prev == Esc) {
169                if      (c == 'Q') inRegexQE = true;
170                else if (c == 'E') inRegexQE = false;
171                accum.append(c);
172            } else {
173                if      (c == '\'' && c != open && !inDouble) inSingle = !inSingle;
174                else if (c == '"'  && c != open && !inSingle) inDouble = !inDouble;
175
176                if (inSingle || inDouble || inRegexQE) {
177                    accum.append(c);
178                } else if (c == open) {
179                    depth++;
180                    if (depth > 1) accum.append(c); // don't include the outer match pair in the return
181                } else if (c == close) {
182                    depth--;
183                    if (depth > 0) accum.append(c);
184                } else {
185                    accum.append(c);
186                }
187            }
188            prev = c;
189        } while (depth > 0);
190
191        String out = StringUtil.releaseBuilder(accum);
192        if (depth > 0) {// ran out of queue before seeing enough )
193            reader.rewindToMark(); // restore position if we don't have a balanced string
194            Validate.fail("Did not find balanced marker at '" + out + "'");
195        }
196        return out;
197    }
198    
199    /**
200     * Unescape a \ escaped string.
201     * @param in backslash escaped string
202     * @return unescaped string
203     */
204    public static String unescape(String in) {
205        if (in.indexOf(Esc) == -1) return in;
206
207        StringBuilder out = StringUtil.borrowBuilder();
208        char last = 0;
209        for (char c : in.toCharArray()) {
210            if (c == Esc) {
211                if (last == Esc) {
212                    out.append(c);
213                    c = 0;
214                }
215            }
216            else 
217                out.append(c);
218            last = c;
219        }
220        return StringUtil.releaseBuilder(out);
221    }
222
223    /**
224     Given a CSS identifier (such as a tag, ID, or class), escape any CSS special characters that would otherwise not be
225     valid in a selector.
226
227     @see <a href="https://www.w3.org/TR/cssom-1/#serialize-an-identifier">CSS Object Model, serialize an identifier</a>
228     */
229    public static String escapeCssIdentifier(String in) {
230        if (in.isEmpty()) return in;
231
232        StringBuilder out = StringUtil.borrowBuilder();
233        TokenQueue q = new TokenQueue(in);
234
235        char firstChar = q.current();
236        if (firstChar == Hyphen_Minus) {
237            q.advance();
238            if (q.isEmpty()) {
239                // If the character is the first character and is a "-" (U+002D), and there is no second character, then
240                // the escaped character.
241                appendEscaped(out, Hyphen_Minus);
242            } else {
243                out.append(Hyphen_Minus);
244
245                char secondChar = q.current();
246                if (StringUtil.isDigit(secondChar)) {
247                    // If the character is the second character and is in the range [0-9] (U+0030 to U+0039) and the
248                    // first character is a "-" (U+002D), then the character escaped as code point.
249                    appendEscapedCodepoint(out, q.consume());
250                }
251            }
252        } else if (StringUtil.isDigit(firstChar)) {
253            // If the character is the first character and is in the range [0-9] (U+0030 to U+0039), then the character
254            // escaped as code point.
255            appendEscapedCodepoint(out, q.consume());
256        }
257
258        while (!q.isEmpty()) {
259            // Note: It's fine to iterate on chars because non-ASCII characters are never escaped. So surrogate pairs
260            // are kept intact.
261            char c = q.consume();
262            if (c == Unicode_Null) {
263                // If the character is NULL (U+0000), then the REPLACEMENT CHARACTER (U+FFFD).
264                out.append(Replacement);
265            } else if (c <= '\u001F' || c == '\u007F') {
266                // If the character is in the range [\1-\1f] (U+0001 to U+001F) or is U+007F, then the character
267                // escaped as code point.
268                appendEscapedCodepoint(out, c);
269            } else if (isIdent(c)) {
270                // If the character is not handled by one of the above rules and is greater than or equal to U+0080,
271                // is "-" (U+002D) or "_" (U+005F), or is in one of the ranges [0-9] (U+0030 to U+0039),
272                // [A-Z] (U+0041 to U+005A), or [a-z] (U+0061 to U+007A), then the character itself.
273                out.append(c);
274            } else {
275                // Otherwise, the escaped character.
276                appendEscaped(out, c);
277            }
278        }
279
280        q.close();
281        return StringUtil.releaseBuilder(out);
282    }
283
284    private static void appendEscaped(StringBuilder out, char c) {
285        out.append(Esc).append(c);
286    }
287
288    private static void appendEscapedCodepoint(StringBuilder out, char c) {
289        out.append(Esc).append(Integer.toHexString(c)).append(' ');
290    }
291
292    /**
293     * Pulls the next run of whitespace characters of the queue.
294     * @return Whether consuming whitespace or not
295     */
296    public boolean consumeWhitespace() {
297        boolean seen = false;
298        while (matchesWhitespace()) {
299            advance();
300            seen = true;
301        }
302        return seen;
303    }
304
305    /**
306     * Consume a CSS element selector (tag name, but | instead of : for namespaces (or *| for wildcard namespace), to not conflict with :pseudo selects).
307     * 
308     * @return tag name
309     */
310    public String consumeElementSelector() {
311        return consumeEscapedCssIdentifier(ElementSelectorChars);
312    }
313    private static final char[] ElementSelectorChars = {'*', '|', '_', '-'};
314
315    /**
316     Consume a CSS identifier (ID or class) off the queue.
317     <p>Note: For backwards compatibility this method supports improperly formatted CSS identifiers, e.g. {@code 1} instead
318     of {@code \31}.</p>
319
320     @return The unescaped identifier.
321     @throws IllegalArgumentException if an invalid escape sequence was found. Afterward, the state of the TokenQueue
322     is undefined.
323     @see <a href="https://www.w3.org/TR/css-syntax-3/#consume-name">CSS Syntax Module Level 3, Consume an ident sequence</a>
324     @see <a href="https://www.w3.org/TR/css-syntax-3/#typedef-ident-token">CSS Syntax Module Level 3, ident-token</a>
325     */
326    public String consumeCssIdentifier() {
327        if (isEmpty()) throw new IllegalArgumentException("CSS identifier expected, but end of input found");
328
329        // Fast path for CSS identifiers that don't contain escape sequences.
330        String identifier = reader.consumeMatching(TokenQueue::isIdent);
331        char c = current();
332        if (c != Esc && c != Unicode_Null) {
333            // If we didn't end on an Esc or a Null, we consumed the whole identifier
334            return identifier;
335        }
336
337        // An escape sequence was found. Use a StringBuilder to store the decoded CSS identifier.
338        StringBuilder out = StringUtil.borrowBuilder();
339        if (!identifier.isEmpty()) {
340            // Copy the CSS identifier up to the first escape sequence.
341            out.append(identifier);
342        }
343
344        while (!isEmpty()) {
345            c = current();
346            if (isIdent(c)) {
347                out.append(consume());
348            } else if (c == Unicode_Null) {
349                // https://www.w3.org/TR/css-syntax-3/#input-preprocessing
350                advance();
351                out.append(Replacement);
352            } else if (c == Esc) {
353                advance();
354                if (!isEmpty() && isNewline(current())) {
355                    // Not a valid escape sequence. This is treated as the end of the CSS identifier.
356                    reader.unconsume();
357                    break;
358                } else {
359                    consumeCssEscapeSequenceInto(out);
360                }
361            } else {
362                break;
363            }
364        }
365        return StringUtil.releaseBuilder(out);
366    }
367
368    private void consumeCssEscapeSequenceInto(StringBuilder out) {
369        if (isEmpty()) {
370            out.append(Replacement);
371            return;
372        }
373
374        char firstEscaped = consume();
375        if (!StringUtil.isHexDigit(firstEscaped)) {
376            out.append(firstEscaped);
377        } else {
378            reader.unconsume(); // put back the first hex digit
379            String hexString = reader.consumeMatching(StringUtil::isHexDigit, 6); // consume up to 6 hex digits
380            int codePoint;
381            try {
382                codePoint = Integer.parseInt(hexString, 16);
383            } catch (NumberFormatException e) {
384                throw new IllegalArgumentException("Invalid escape sequence: " + hexString, e);
385            }
386            if (isValidCodePoint(codePoint)) {
387                out.appendCodePoint(codePoint);
388            } else {
389                out.append(Replacement);
390            }
391
392            if (!isEmpty()) {
393                char c = current();
394                if (c == '\r') {
395                    // Since there's currently no input preprocessing, check for CRLF here.
396                    // https://www.w3.org/TR/css-syntax-3/#input-preprocessing
397                    advance();
398                    if (!isEmpty() && current() == '\n') advance();
399                } else if (c == ' ' || c == '\t' || isNewline(c)) {
400                    advance();
401                }
402            }
403        }
404    }
405
406    // statics below specifically for CSS identifiers:
407
408    // https://www.w3.org/TR/css-syntax-3/#non-ascii-code-point
409    private static boolean isNonAscii(char c) {
410        return c >= '\u0080';
411    }
412
413    // https://www.w3.org/TR/css-syntax-3/#ident-start-code-point
414    private static boolean isIdentStart(char c) {
415        return c == '_' || StringUtil.isAsciiLetter(c) || isNonAscii(c);
416    }
417
418    // https://www.w3.org/TR/css-syntax-3/#ident-code-point
419    private static boolean isIdent(char c) {
420        return c == Hyphen_Minus || StringUtil.isDigit(c) || isIdentStart(c);
421    }
422
423    // https://www.w3.org/TR/css-syntax-3/#newline
424    // Note: currently there's no preprocessing happening.
425    private static boolean isNewline(char c) {
426        return c == '\n' || c == '\r' || c == '\f';
427    }
428
429    // https://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point
430    private static boolean isValidCodePoint(int codePoint) {
431        return codePoint != 0 && Character.isValidCodePoint(codePoint) && !Character.isSurrogate((char) codePoint);
432    }
433
434    private static final char[] CssIdentifierChars = {'-', '_'};
435
436    private String consumeEscapedCssIdentifier(char... matches) {
437        StringBuilder sb = StringUtil.borrowBuilder();
438        while (!isEmpty()) {
439            char c = current();
440            if (c == Esc) {
441                advance();
442                if (!isEmpty()) sb.append(consume());
443                else break;
444            } else if (matchesCssIdentifier(matches)) {
445                sb.append(c);
446                advance();
447            } else {
448                break;
449            }
450        }
451        return StringUtil.releaseBuilder(sb);
452    }
453
454    private boolean matchesCssIdentifier(char... matches) {
455        return matchesWord() || reader.matchesAny(matches);
456    }
457
458    /**
459     Consume and return whatever is left on the queue.
460     @return remainder of queue.
461     */
462    public String remainder() {
463        return reader.consumeToEnd();
464    }
465
466    @Override
467    public String toString() {
468        return reader.toString();
469    }
470
471    @Override
472    public void close() {
473        reader.close(); // releases buffer back to pool
474    }
475}