001package org.jsoup.select;
002
003import org.jsoup.helper.Regex;
004import org.jsoup.internal.StringUtil;
005import org.jsoup.helper.Validate;
006import org.jsoup.nodes.CDataNode;
007import org.jsoup.nodes.Comment;
008import org.jsoup.nodes.DataNode;
009import org.jsoup.nodes.LeafNode;
010import org.jsoup.nodes.Node;
011import org.jsoup.nodes.TextNode;
012import org.jsoup.parser.TokenQueue;
013import org.jspecify.annotations.Nullable;
014
015import java.util.function.Function;
016import java.util.regex.Matcher;
017import java.util.regex.Pattern;
018
019import static org.jsoup.select.StructuralEvaluator.ImmediateParentRun;
020import static org.jsoup.internal.Normalizer.normalize;
021
022/**
023 * Parses a CSS selector into an Evaluator tree.
024 */
025public class QueryParser implements AutoCloseable {
026    private final static char[] Combinators = {'>', '+', '~'}; // ' ' is also a combinator, but found implicitly
027    private final static String[] AttributeEvals = new String[]{"=", "!=", "^=", "$=", "*=", "~="};
028    private final static char[] SequenceEnders = {',', ')'};
029
030    private final TokenQueue tq;
031    private final String query;
032    private boolean inNodeContext; // ::comment:contains should act on node value, vs element text
033
034    /**
035     * Create a new QueryParser.
036     * @param query CSS query
037     */
038    private QueryParser(String query) {
039        Validate.notEmpty(query);
040        query = query.trim();
041        this.query = query;
042        this.tq = new TokenQueue(query);
043    }
044
045    /**
046     Parse a CSS query into an Evaluator. If you are evaluating the same query repeatedly, it may be more efficient to
047     parse it once and reuse the Evaluator.
048
049     @param query CSS query
050     @return Evaluator
051     @see Selector selector query syntax
052     @throws Selector.SelectorParseException if the CSS query is invalid
053     */
054    public static Evaluator parse(String query) {
055        try (QueryParser p = new QueryParser(query)) {
056            return p.parse();
057        } catch (IllegalArgumentException e) {
058            throw new Selector.SelectorParseException(e.getMessage());
059        }
060    }
061
062    /**
063     Parse the query. We use this simplified expression of the grammar:
064     <pre>
065     SelectorGroup   ::= Selector (',' Selector)*
066     Selector        ::= [ Combinator ] SimpleSequence ( Combinator SimpleSequence )*
067     SimpleSequence  ::= [ TypeSelector ] ( ID | Class | Attribute | Pseudo )*
068     Pseudo           ::= ':' Name [ '(' SelectorGroup ')' ]
069     Combinator      ::= S+         // descendant (whitespace)
070     | '>'       // child
071     | '+'       // adjacent sibling
072     | '~'       // general sibling
073     </pre>
074
075     See <a href="https://www.w3.org/TR/selectors-4/#grammar">selectors-4</a> for the real thing
076     */
077    Evaluator parse() {
078        Evaluator eval = parseSelectorGroup();
079        tq.consumeWhitespace();
080        if (!tq.isEmpty())
081            throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
082        return eval;
083    }
084
085    Evaluator parseSelectorGroup() {
086        // SelectorGroup. Into an Or if > 1 Selector
087        Evaluator left = parseSelector();
088        while (tq.matchChomp(',')) {
089            Evaluator right = parseSelector();
090            left = or(left, right);
091        }
092        return left;
093    }
094
095    Evaluator parseSelector() {
096        // Selector ::= [ Combinator ] SimpleSequence ( Combinator SimpleSequence )*
097        tq.consumeWhitespace();
098
099        Evaluator left;
100        if (tq.matchesAny(Combinators)) {
101            // e.g. query is "> div"; left side is root element
102            left = new StructuralEvaluator.Root();
103        } else {
104            left = parseSimpleSequence();
105        }
106
107        while (true) {
108            char combinator = 0;
109            if (tq.consumeWhitespace())
110                combinator = ' ';            // maybe descendant?
111            if (tq.matchesAny(Combinators)) // no, explicit
112                combinator = tq.consume();
113            else if (tq.matchesAny(SequenceEnders)) // , - space after simple like "foo , bar"; ) - close of :has()
114                break;
115
116            if (combinator != 0) {
117                Evaluator right = parseSimpleSequence();
118                left = combinator(left, combinator, right);
119            } else {
120                break;
121            }
122        }
123        return left;
124    }
125
126    Evaluator parseSimpleSequence() {
127        // SimpleSequence ::= TypeSelector? ( Hash | Class | Pseudo )*
128        Evaluator left = null;
129        tq.consumeWhitespace();
130
131        // one optional type selector
132        if (tq.matchesWord() || tq.matches("*|"))
133            left = byTag();
134        else if (tq.matchChomp('*'))
135            left = new Evaluator.AllElements();
136
137        // zero or more subclasses (#, ., [)
138        while(true) {
139            Evaluator right = parseSubclass();
140            if (right != null) {
141                left = and(left, right);
142            }
143            else break; // no more simple tokens
144        }
145
146        if (left == null)
147            throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
148        return left;
149    }
150
151    static Evaluator combinator(Evaluator left, char combinator, Evaluator right) {
152        switch (combinator) {
153            case '>':
154                ImmediateParentRun run = left instanceof ImmediateParentRun ?
155                    (ImmediateParentRun) left : new ImmediateParentRun(left);
156                run.add(right);
157                return run;
158            case ' ':
159                return and(new StructuralEvaluator.Ancestor(left), right);
160            case '+':
161                return and(new StructuralEvaluator.ImmediatePreviousSibling(left), right);
162            case '~':
163                return and(new StructuralEvaluator.PreviousSibling(left), right);
164            default:
165                throw new Selector.SelectorParseException("Unknown combinator '%s'", combinator);
166        }
167    }
168
169    @Nullable Evaluator parseSubclass() {
170        //  Subclass: ID | Class | Attribute | Pseudo
171        if      (tq.matchChomp('#'))    return byId();
172        else if (tq.matchChomp('.'))    return byClass();
173        else if (tq.matches('['))       return byAttribute();
174        else if (tq.matchChomp("::"))   return parseNodeSelector(); // ::comment etc
175        else if (tq.matchChomp(':'))    return parsePseudoSelector();
176        else                            return null;
177    }
178
179    /** Merge two evals into an Or. */
180    static Evaluator or(Evaluator left, Evaluator right) {
181        if (left instanceof CombiningEvaluator.Or) {
182            ((CombiningEvaluator.Or) left).add(right);
183            return left;
184        }
185        return new CombiningEvaluator.Or(left, right);
186    }
187
188    /** Merge two evals into an And. */
189    static Evaluator and(@Nullable Evaluator left, Evaluator right) {
190        if (left == null) return right;
191        if (left instanceof CombiningEvaluator.And) {
192            ((CombiningEvaluator.And) left).add(right);
193            return left;
194        }
195        return new CombiningEvaluator.And(left, right);
196    }
197
198    private Evaluator parsePseudoSelector() {
199        final String pseudo = tq.consumeCssIdentifier();
200        switch (pseudo) {
201            case "lt":
202                return new Evaluator.IndexLessThan(consumeIndex());
203            case "gt":
204                return new Evaluator.IndexGreaterThan(consumeIndex());
205            case "eq":
206                return new Evaluator.IndexEquals(consumeIndex());
207            case "has":
208                return has();
209            case "is":
210                return is();
211            case "contains":
212                return contains(false);
213            case "containsOwn":
214                return contains(true);
215            case "containsWholeText":
216                return containsWholeText(false);
217            case "containsWholeOwnText":
218                return containsWholeText(true);
219            case "containsData":
220                return containsData();
221            case "matches":
222                return matches(false);
223            case "matchesOwn":
224                return matches(true);
225            case "matchesWholeText":
226                return matchesWholeText(false);
227            case "matchesWholeOwnText":
228                return matchesWholeText(true);
229            case "not":
230                return not();
231            case "nth-child":
232                return cssNthChild(false, false);
233            case "nth-last-child":
234                return cssNthChild(true, false);
235            case "nth-of-type":
236                return cssNthChild(false, true);
237            case "nth-last-of-type":
238                return cssNthChild(true, true);
239            case "first-child":
240                return new Evaluator.IsFirstChild();
241            case "last-child":
242                return new Evaluator.IsLastChild();
243            case "first-of-type":
244                return new Evaluator.IsFirstOfType();
245            case "last-of-type":
246                return new Evaluator.IsLastOfType();
247            case "only-child":
248                return new Evaluator.IsOnlyChild();
249            case "only-of-type":
250                return new Evaluator.IsOnlyOfType();
251            case "empty":
252                return new Evaluator.IsEmpty();
253            case "blank":
254                return new NodeEvaluator.BlankValue();
255            case "root":
256                return new Evaluator.IsRoot();
257            case "matchText":
258                return new Evaluator.MatchText();
259            default:
260                throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
261        }
262    }
263
264    // ::comment etc
265    private Evaluator parseNodeSelector() {
266        final String pseudo = tq.consumeCssIdentifier();
267        inNodeContext = true;  // Enter node context
268
269        Evaluator left;
270        switch (pseudo) {
271            case "node":
272                left = new NodeEvaluator.InstanceType(Node.class, pseudo);
273                break;
274            case "leafnode":
275                left = new NodeEvaluator.InstanceType(LeafNode.class, pseudo);
276                break;
277            case "text":
278                left = new NodeEvaluator.InstanceType(TextNode.class, pseudo);
279                break;
280            case "comment":
281                left = new NodeEvaluator.InstanceType(Comment.class, pseudo);
282                break;
283            case "data":
284                left = new NodeEvaluator.InstanceType(DataNode.class, pseudo);
285                break;
286            case "cdata":
287                left = new NodeEvaluator.InstanceType(CDataNode.class, pseudo);
288                break;
289            default:
290                throw new Selector.SelectorParseException(
291                    "Could not parse query '%s': unknown node type '::%s'", query, pseudo);
292        }
293
294        // Handle following subclasses in node context (like ::comment:contains())
295        Evaluator right;
296        while ((right = parseSubclass()) != null) {
297            left = and(left, right);
298        }
299
300        inNodeContext = false;
301        return left;
302    }
303
304    private Evaluator byId() {
305        String id = tq.consumeCssIdentifier();
306        Validate.notEmpty(id);
307        return new Evaluator.Id(id);
308    }
309
310    private Evaluator byClass() {
311        String className = tq.consumeCssIdentifier();
312        Validate.notEmpty(className);
313        return new Evaluator.Class(className.trim());
314    }
315
316    private Evaluator byTag() {
317        // todo - these aren't dealing perfectly with case sensitivity. For case sensitive parsers, we should also make
318        // the tag in the selector case-sensitive (and also attribute names). But for now, normalize (lower-case) for
319        // consistency - both the selector and the element tag
320        String tagName = normalize(tq.consumeElementSelector());
321        Validate.notEmpty(tagName);
322
323        // namespaces:
324        if (tagName.startsWith("*|")) { // namespaces: wildcard match equals(tagName) or ending in ":"+tagName
325            String plainTag = tagName.substring(2); // strip *|
326            return new CombiningEvaluator.Or(
327                new Evaluator.Tag(plainTag),
328                new Evaluator.TagEndsWith(":" + plainTag)
329            );
330        } else if (tagName.endsWith("|*")) { // ns|*
331            String ns = tagName.substring(0, tagName.length() - 2) + ":"; // strip |*, to ns:
332            return new Evaluator.TagStartsWith(ns);
333        } else if (tagName.contains("|")) { // flip "abc|def" to "abc:def"
334            tagName = tagName.replace("|", ":");
335        }
336
337        return new Evaluator.Tag(tagName);
338    }
339
340    private Evaluator byAttribute() {
341        try (TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']'))) {
342            return evaluatorForAttribute(cq);
343        }
344    }
345
346    private Evaluator evaluatorForAttribute(TokenQueue cq) {
347        String key = cq.consumeToAny(AttributeEvals); // eq, not, start, end, contain, match, (no val)
348        key = normalize(key);
349        Validate.notEmpty(key);
350        Validate.isFalse(key.equals("abs:"), "Absolute attribute key must have a name");
351        cq.consumeWhitespace();
352        final Evaluator eval;
353
354        if (cq.isEmpty()) {
355            if (key.startsWith("^"))
356                eval = new Evaluator.AttributeStarting(key.substring(1));
357            else if (key.equals("*")) // any attribute
358                eval = new Evaluator.AttributeStarting("");
359            else
360                eval = new Evaluator.Attribute(key);
361        } else {
362            if (cq.matchChomp('='))
363                eval = new Evaluator.AttributeWithValue(key, cq.remainder());
364            else if (cq.matchChomp("!="))
365                eval = new Evaluator.AttributeWithValueNot(key, cq.remainder());
366            else if (cq.matchChomp("^="))
367                eval = new Evaluator.AttributeWithValueStarting(key, cq.remainder());
368            else if (cq.matchChomp("$="))
369                eval = new Evaluator.AttributeWithValueEnding(key, cq.remainder());
370            else if (cq.matchChomp("*="))
371                eval = new Evaluator.AttributeWithValueContaining(key, cq.remainder());
372            else if (cq.matchChomp("~="))
373                eval = new Evaluator.AttributeWithValueMatching(key, Regex.compile(cq.remainder()));
374            else
375                throw new Selector.SelectorParseException(
376                    "Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder());
377        }
378        return eval;
379    }
380
381    //pseudo selectors :first-child, :last-child, :nth-child, ...
382    private static final Pattern NthStepOffset = Pattern.compile("(([+-])?(\\d+)?)n(\\s*([+-])?\\s*\\d+)?", Pattern.CASE_INSENSITIVE);
383    private static final Pattern NthOffset = Pattern.compile("([+-])?(\\d+)");
384
385    private Evaluator cssNthChild(boolean last, boolean ofType) {
386        String arg = normalize(consumeParens()); // arg is like "odd", or "-n+2", within nth-child(odd)
387        final int step, offset;
388        if ("odd".equals(arg)) {
389            step = 2;
390            offset = 1;
391        } else if ("even".equals(arg)) {
392            step = 2;
393            offset = 0;
394        } else {
395            Matcher stepOffsetM, stepM;
396            if ((stepOffsetM = NthStepOffset.matcher(arg)).matches()) {
397                if (stepOffsetM.group(3) != null) // has digits, like 3n+2 or -3n+2
398                    step = Integer.parseInt(stepOffsetM.group(1).replaceFirst("^\\+", ""));
399                else // no digits, might be like n+2, or -n+2. if group(2) == "-", it’s -1;
400                    step = "-".equals(stepOffsetM.group(2)) ? -1 : 1;
401                offset =
402                    stepOffsetM.group(4) != null ? Integer.parseInt(stepOffsetM.group(4).replaceFirst("^\\+", "")) : 0;
403            } else if ((stepM = NthOffset.matcher(arg)).matches()) {
404                step = 0;
405                offset = Integer.parseInt(stepM.group().replaceFirst("^\\+", ""));
406            } else {
407                throw new Selector.SelectorParseException("Could not parse nth-index '%s': unexpected format", arg);
408            }
409        }
410
411        return ofType
412            ? (last ? new Evaluator.IsNthLastOfType(step, offset) : new Evaluator.IsNthOfType(step, offset))
413            : (last ? new Evaluator.IsNthLastChild(step, offset) : new Evaluator.IsNthChild(step, offset));
414    }
415
416    private String consumeParens() {
417        return tq.chompBalanced('(', ')');
418    }
419
420    private int consumeIndex() {
421        String index = consumeParens().trim();
422        Validate.isTrue(StringUtil.isNumeric(index), "Index must be numeric");
423        return Integer.parseInt(index);
424    }
425
426    // pseudo selector :has(el)
427    private Evaluator has() {
428        return parseNested(StructuralEvaluator.Has::new, ":has() must have a selector");
429    }
430
431    // pseudo selector :is()
432    private Evaluator is() {
433        return parseNested(StructuralEvaluator.Is::new, ":is() must have a selector");
434    }
435
436    private Evaluator parseNested(Function<Evaluator, Evaluator> func, String err) {
437        Validate.isTrue(tq.matchChomp('('), err);
438        Evaluator eval = parseSelectorGroup();
439        Validate.isTrue(tq.matchChomp(')'), err);
440        return func.apply(eval);
441    }
442
443    // pseudo selector :contains(text), containsOwn(text)
444    private Evaluator contains(boolean own) {
445        String query = own ? ":containsOwn" : ":contains";
446        String searchText = TokenQueue.unescape(consumeParens());
447        Validate.notEmpty(searchText, query + "(text) query must not be empty");
448
449        if (inNodeContext)
450            return new NodeEvaluator.ContainsValue(searchText);
451
452        return own
453            ? new Evaluator.ContainsOwnText(searchText)
454            : new Evaluator.ContainsText(searchText);
455    }
456
457    private Evaluator containsWholeText(boolean own) {
458        String query = own ? ":containsWholeOwnText" : ":containsWholeText";
459        String searchText = TokenQueue.unescape(consumeParens());
460        Validate.notEmpty(searchText, query + "(text) query must not be empty");
461        return own
462            ? new Evaluator.ContainsWholeOwnText(searchText)
463            : new Evaluator.ContainsWholeText(searchText);
464    }
465
466    // pseudo selector :containsData(data)
467    private Evaluator containsData() {
468        String searchText = TokenQueue.unescape(consumeParens());
469        Validate.notEmpty(searchText, ":containsData(text) query must not be empty");
470        return new Evaluator.ContainsData(searchText);
471    }
472
473    // :matches(regex), matchesOwn(regex)
474    private Evaluator matches(boolean own) {
475        String query = own ? ":matchesOwn" : ":matches";
476        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
477        Validate.notEmpty(regex, query + "(regex) query must not be empty");
478        Regex pattern = Regex.compile(regex);
479
480        if (inNodeContext)
481            return new NodeEvaluator.MatchesValue(pattern);
482
483        return own
484            ? new Evaluator.MatchesOwn(pattern)
485            : new Evaluator.Matches(pattern);
486    }
487
488    // :matches(regex), matchesOwn(regex)
489    private Evaluator matchesWholeText(boolean own) {
490        String query = own ? ":matchesWholeOwnText" : ":matchesWholeText";
491        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
492        Validate.notEmpty(regex, query + "(regex) query must not be empty");
493
494        Regex pattern = Regex.compile(regex);
495        return own
496            ? new Evaluator.MatchesWholeOwnText(pattern)
497            : new Evaluator.MatchesWholeText(pattern);
498    }
499
500    // :not(selector)
501    private Evaluator not() {
502        String subQuery = consumeParens();
503        Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty");
504
505        return new StructuralEvaluator.Not(parse(subQuery));
506    }
507
508    @Override
509    public String toString() {
510        return query;
511    }
512
513    @Override
514    public void close() {
515        tq.close();
516    }
517}