001package org.jsoup.select; 002 003import org.jsoup.helper.Regex; 004import org.jsoup.internal.StringUtil; 005import org.jsoup.helper.Validate; 006import org.jsoup.nodes.CDataNode; 007import org.jsoup.nodes.Comment; 008import org.jsoup.nodes.DataNode; 009import org.jsoup.nodes.LeafNode; 010import org.jsoup.nodes.Node; 011import org.jsoup.nodes.TextNode; 012import org.jsoup.parser.TokenQueue; 013import org.jspecify.annotations.Nullable; 014 015import java.util.function.Function; 016import java.util.regex.Matcher; 017import java.util.regex.Pattern; 018 019import static org.jsoup.select.StructuralEvaluator.ImmediateParentRun; 020import static org.jsoup.internal.Normalizer.normalize; 021 022/** 023 * Parses a CSS selector into an Evaluator tree. 024 */ 025public class QueryParser implements AutoCloseable { 026 private final static char[] Combinators = {'>', '+', '~'}; // ' ' is also a combinator, but found implicitly 027 private final static String[] AttributeEvals = new String[]{"=", "!=", "^=", "$=", "*=", "~="}; 028 private final static char[] SequenceEnders = {',', ')'}; 029 030 private final TokenQueue tq; 031 private final String query; 032 private boolean inNodeContext; // ::comment:contains should act on node value, vs element text 033 034 /** 035 * Create a new QueryParser. 036 * @param query CSS query 037 */ 038 private QueryParser(String query) { 039 Validate.notEmpty(query); 040 query = query.trim(); 041 this.query = query; 042 this.tq = new TokenQueue(query); 043 } 044 045 /** 046 Parse a CSS query into an Evaluator. If you are evaluating the same query repeatedly, it may be more efficient to 047 parse it once and reuse the Evaluator. 048 049 @param query CSS query 050 @return Evaluator 051 @see Selector selector query syntax 052 @throws Selector.SelectorParseException if the CSS query is invalid 053 */ 054 public static Evaluator parse(String query) { 055 try (QueryParser p = new QueryParser(query)) { 056 return p.parse(); 057 } catch (IllegalArgumentException e) { 058 throw new Selector.SelectorParseException(e.getMessage()); 059 } 060 } 061 062 /** 063 Parse the query. We use this simplified expression of the grammar: 064 <pre> 065 SelectorGroup ::= Selector (',' Selector)* 066 Selector ::= [ Combinator ] SimpleSequence ( Combinator SimpleSequence )* 067 SimpleSequence ::= [ TypeSelector ] ( ID | Class | Attribute | Pseudo )* 068 Pseudo ::= ':' Name [ '(' SelectorGroup ')' ] 069 Combinator ::= S+ // descendant (whitespace) 070 | '>' // child 071 | '+' // adjacent sibling 072 | '~' // general sibling 073 </pre> 074 075 See <a href="https://www.w3.org/TR/selectors-4/#grammar">selectors-4</a> for the real thing 076 */ 077 Evaluator parse() { 078 Evaluator eval = parseSelectorGroup(); 079 tq.consumeWhitespace(); 080 if (!tq.isEmpty()) 081 throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder()); 082 return eval; 083 } 084 085 Evaluator parseSelectorGroup() { 086 // SelectorGroup. Into an Or if > 1 Selector 087 Evaluator left = parseSelector(); 088 while (tq.matchChomp(',')) { 089 Evaluator right = parseSelector(); 090 left = or(left, right); 091 } 092 return left; 093 } 094 095 Evaluator parseSelector() { 096 // Selector ::= [ Combinator ] SimpleSequence ( Combinator SimpleSequence )* 097 tq.consumeWhitespace(); 098 099 Evaluator left; 100 if (tq.matchesAny(Combinators)) { 101 // e.g. query is "> div"; left side is root element 102 left = new StructuralEvaluator.Root(); 103 } else { 104 left = parseSimpleSequence(); 105 } 106 107 while (true) { 108 char combinator = 0; 109 if (tq.consumeWhitespace()) 110 combinator = ' '; // maybe descendant? 111 if (tq.matchesAny(Combinators)) // no, explicit 112 combinator = tq.consume(); 113 else if (tq.matchesAny(SequenceEnders)) // , - space after simple like "foo , bar"; ) - close of :has() 114 break; 115 116 if (combinator != 0) { 117 Evaluator right = parseSimpleSequence(); 118 left = combinator(left, combinator, right); 119 } else { 120 break; 121 } 122 } 123 return left; 124 } 125 126 Evaluator parseSimpleSequence() { 127 // SimpleSequence ::= TypeSelector? ( Hash | Class | Pseudo )* 128 Evaluator left = null; 129 tq.consumeWhitespace(); 130 131 // one optional type selector 132 if (tq.matchesWord() || tq.matches("*|")) 133 left = byTag(); 134 else if (tq.matchChomp('*')) 135 left = new Evaluator.AllElements(); 136 137 // zero or more subclasses (#, ., [) 138 while(true) { 139 Evaluator right = parseSubclass(); 140 if (right != null) { 141 left = and(left, right); 142 } 143 else break; // no more simple tokens 144 } 145 146 if (left == null) 147 throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder()); 148 return left; 149 } 150 151 static Evaluator combinator(Evaluator left, char combinator, Evaluator right) { 152 switch (combinator) { 153 case '>': 154 ImmediateParentRun run = left instanceof ImmediateParentRun ? 155 (ImmediateParentRun) left : new ImmediateParentRun(left); 156 run.add(right); 157 return run; 158 case ' ': 159 return and(new StructuralEvaluator.Ancestor(left), right); 160 case '+': 161 return and(new StructuralEvaluator.ImmediatePreviousSibling(left), right); 162 case '~': 163 return and(new StructuralEvaluator.PreviousSibling(left), right); 164 default: 165 throw new Selector.SelectorParseException("Unknown combinator '%s'", combinator); 166 } 167 } 168 169 @Nullable Evaluator parseSubclass() { 170 // Subclass: ID | Class | Attribute | Pseudo 171 if (tq.matchChomp('#')) return byId(); 172 else if (tq.matchChomp('.')) return byClass(); 173 else if (tq.matches('[')) return byAttribute(); 174 else if (tq.matchChomp("::")) return parseNodeSelector(); // ::comment etc 175 else if (tq.matchChomp(':')) return parsePseudoSelector(); 176 else return null; 177 } 178 179 /** Merge two evals into an Or. */ 180 static Evaluator or(Evaluator left, Evaluator right) { 181 if (left instanceof CombiningEvaluator.Or) { 182 ((CombiningEvaluator.Or) left).add(right); 183 return left; 184 } 185 return new CombiningEvaluator.Or(left, right); 186 } 187 188 /** Merge two evals into an And. */ 189 static Evaluator and(@Nullable Evaluator left, Evaluator right) { 190 if (left == null) return right; 191 if (left instanceof CombiningEvaluator.And) { 192 ((CombiningEvaluator.And) left).add(right); 193 return left; 194 } 195 return new CombiningEvaluator.And(left, right); 196 } 197 198 private Evaluator parsePseudoSelector() { 199 final String pseudo = tq.consumeCssIdentifier(); 200 switch (pseudo) { 201 case "lt": 202 return new Evaluator.IndexLessThan(consumeIndex()); 203 case "gt": 204 return new Evaluator.IndexGreaterThan(consumeIndex()); 205 case "eq": 206 return new Evaluator.IndexEquals(consumeIndex()); 207 case "has": 208 return has(); 209 case "is": 210 return is(); 211 case "contains": 212 return contains(false); 213 case "containsOwn": 214 return contains(true); 215 case "containsWholeText": 216 return containsWholeText(false); 217 case "containsWholeOwnText": 218 return containsWholeText(true); 219 case "containsData": 220 return containsData(); 221 case "matches": 222 return matches(false); 223 case "matchesOwn": 224 return matches(true); 225 case "matchesWholeText": 226 return matchesWholeText(false); 227 case "matchesWholeOwnText": 228 return matchesWholeText(true); 229 case "not": 230 return not(); 231 case "nth-child": 232 return cssNthChild(false, false); 233 case "nth-last-child": 234 return cssNthChild(true, false); 235 case "nth-of-type": 236 return cssNthChild(false, true); 237 case "nth-last-of-type": 238 return cssNthChild(true, true); 239 case "first-child": 240 return new Evaluator.IsFirstChild(); 241 case "last-child": 242 return new Evaluator.IsLastChild(); 243 case "first-of-type": 244 return new Evaluator.IsFirstOfType(); 245 case "last-of-type": 246 return new Evaluator.IsLastOfType(); 247 case "only-child": 248 return new Evaluator.IsOnlyChild(); 249 case "only-of-type": 250 return new Evaluator.IsOnlyOfType(); 251 case "empty": 252 return new Evaluator.IsEmpty(); 253 case "blank": 254 return new NodeEvaluator.BlankValue(); 255 case "root": 256 return new Evaluator.IsRoot(); 257 case "matchText": 258 return new Evaluator.MatchText(); 259 default: 260 throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder()); 261 } 262 } 263 264 // ::comment etc 265 private Evaluator parseNodeSelector() { 266 final String pseudo = tq.consumeCssIdentifier(); 267 inNodeContext = true; // Enter node context 268 269 Evaluator left; 270 switch (pseudo) { 271 case "node": 272 left = new NodeEvaluator.InstanceType(Node.class, pseudo); 273 break; 274 case "leafnode": 275 left = new NodeEvaluator.InstanceType(LeafNode.class, pseudo); 276 break; 277 case "text": 278 left = new NodeEvaluator.InstanceType(TextNode.class, pseudo); 279 break; 280 case "comment": 281 left = new NodeEvaluator.InstanceType(Comment.class, pseudo); 282 break; 283 case "data": 284 left = new NodeEvaluator.InstanceType(DataNode.class, pseudo); 285 break; 286 case "cdata": 287 left = new NodeEvaluator.InstanceType(CDataNode.class, pseudo); 288 break; 289 default: 290 throw new Selector.SelectorParseException( 291 "Could not parse query '%s': unknown node type '::%s'", query, pseudo); 292 } 293 294 // Handle following subclasses in node context (like ::comment:contains()) 295 Evaluator right; 296 while ((right = parseSubclass()) != null) { 297 left = and(left, right); 298 } 299 300 inNodeContext = false; 301 return left; 302 } 303 304 private Evaluator byId() { 305 String id = tq.consumeCssIdentifier(); 306 Validate.notEmpty(id); 307 return new Evaluator.Id(id); 308 } 309 310 private Evaluator byClass() { 311 String className = tq.consumeCssIdentifier(); 312 Validate.notEmpty(className); 313 return new Evaluator.Class(className.trim()); 314 } 315 316 private Evaluator byTag() { 317 // todo - these aren't dealing perfectly with case sensitivity. For case sensitive parsers, we should also make 318 // the tag in the selector case-sensitive (and also attribute names). But for now, normalize (lower-case) for 319 // consistency - both the selector and the element tag 320 String tagName = normalize(tq.consumeElementSelector()); 321 Validate.notEmpty(tagName); 322 323 // namespaces: 324 if (tagName.startsWith("*|")) { // namespaces: wildcard match equals(tagName) or ending in ":"+tagName 325 String plainTag = tagName.substring(2); // strip *| 326 return new CombiningEvaluator.Or( 327 new Evaluator.Tag(plainTag), 328 new Evaluator.TagEndsWith(":" + plainTag) 329 ); 330 } else if (tagName.endsWith("|*")) { // ns|* 331 String ns = tagName.substring(0, tagName.length() - 2) + ":"; // strip |*, to ns: 332 return new Evaluator.TagStartsWith(ns); 333 } else if (tagName.contains("|")) { // flip "abc|def" to "abc:def" 334 tagName = tagName.replace("|", ":"); 335 } 336 337 return new Evaluator.Tag(tagName); 338 } 339 340 private Evaluator byAttribute() { 341 try (TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']'))) { 342 return evaluatorForAttribute(cq); 343 } 344 } 345 346 private Evaluator evaluatorForAttribute(TokenQueue cq) { 347 String key = cq.consumeToAny(AttributeEvals); // eq, not, start, end, contain, match, (no val) 348 key = normalize(key); 349 Validate.notEmpty(key); 350 Validate.isFalse(key.equals("abs:"), "Absolute attribute key must have a name"); 351 cq.consumeWhitespace(); 352 final Evaluator eval; 353 354 if (cq.isEmpty()) { 355 if (key.startsWith("^")) 356 eval = new Evaluator.AttributeStarting(key.substring(1)); 357 else if (key.equals("*")) // any attribute 358 eval = new Evaluator.AttributeStarting(""); 359 else 360 eval = new Evaluator.Attribute(key); 361 } else { 362 if (cq.matchChomp('=')) 363 eval = new Evaluator.AttributeWithValue(key, cq.remainder()); 364 else if (cq.matchChomp("!=")) 365 eval = new Evaluator.AttributeWithValueNot(key, cq.remainder()); 366 else if (cq.matchChomp("^=")) 367 eval = new Evaluator.AttributeWithValueStarting(key, cq.remainder()); 368 else if (cq.matchChomp("$=")) 369 eval = new Evaluator.AttributeWithValueEnding(key, cq.remainder()); 370 else if (cq.matchChomp("*=")) 371 eval = new Evaluator.AttributeWithValueContaining(key, cq.remainder()); 372 else if (cq.matchChomp("~=")) 373 eval = new Evaluator.AttributeWithValueMatching(key, Regex.compile(cq.remainder())); 374 else 375 throw new Selector.SelectorParseException( 376 "Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder()); 377 } 378 return eval; 379 } 380 381 //pseudo selectors :first-child, :last-child, :nth-child, ... 382 private static final Pattern NthStepOffset = Pattern.compile("(([+-])?(\\d+)?)n(\\s*([+-])?\\s*\\d+)?", Pattern.CASE_INSENSITIVE); 383 private static final Pattern NthOffset = Pattern.compile("([+-])?(\\d+)"); 384 385 private Evaluator cssNthChild(boolean last, boolean ofType) { 386 String arg = normalize(consumeParens()); // arg is like "odd", or "-n+2", within nth-child(odd) 387 final int step, offset; 388 if ("odd".equals(arg)) { 389 step = 2; 390 offset = 1; 391 } else if ("even".equals(arg)) { 392 step = 2; 393 offset = 0; 394 } else { 395 Matcher stepOffsetM, stepM; 396 if ((stepOffsetM = NthStepOffset.matcher(arg)).matches()) { 397 if (stepOffsetM.group(3) != null) // has digits, like 3n+2 or -3n+2 398 step = Integer.parseInt(stepOffsetM.group(1).replaceFirst("^\\+", "")); 399 else // no digits, might be like n+2, or -n+2. if group(2) == "-", it’s -1; 400 step = "-".equals(stepOffsetM.group(2)) ? -1 : 1; 401 offset = 402 stepOffsetM.group(4) != null ? Integer.parseInt(stepOffsetM.group(4).replaceFirst("^\\+", "")) : 0; 403 } else if ((stepM = NthOffset.matcher(arg)).matches()) { 404 step = 0; 405 offset = Integer.parseInt(stepM.group().replaceFirst("^\\+", "")); 406 } else { 407 throw new Selector.SelectorParseException("Could not parse nth-index '%s': unexpected format", arg); 408 } 409 } 410 411 return ofType 412 ? (last ? new Evaluator.IsNthLastOfType(step, offset) : new Evaluator.IsNthOfType(step, offset)) 413 : (last ? new Evaluator.IsNthLastChild(step, offset) : new Evaluator.IsNthChild(step, offset)); 414 } 415 416 private String consumeParens() { 417 return tq.chompBalanced('(', ')'); 418 } 419 420 private int consumeIndex() { 421 String index = consumeParens().trim(); 422 Validate.isTrue(StringUtil.isNumeric(index), "Index must be numeric"); 423 return Integer.parseInt(index); 424 } 425 426 // pseudo selector :has(el) 427 private Evaluator has() { 428 return parseNested(StructuralEvaluator.Has::new, ":has() must have a selector"); 429 } 430 431 // pseudo selector :is() 432 private Evaluator is() { 433 return parseNested(StructuralEvaluator.Is::new, ":is() must have a selector"); 434 } 435 436 private Evaluator parseNested(Function<Evaluator, Evaluator> func, String err) { 437 Validate.isTrue(tq.matchChomp('('), err); 438 Evaluator eval = parseSelectorGroup(); 439 Validate.isTrue(tq.matchChomp(')'), err); 440 return func.apply(eval); 441 } 442 443 // pseudo selector :contains(text), containsOwn(text) 444 private Evaluator contains(boolean own) { 445 String query = own ? ":containsOwn" : ":contains"; 446 String searchText = TokenQueue.unescape(consumeParens()); 447 Validate.notEmpty(searchText, query + "(text) query must not be empty"); 448 449 if (inNodeContext) 450 return new NodeEvaluator.ContainsValue(searchText); 451 452 return own 453 ? new Evaluator.ContainsOwnText(searchText) 454 : new Evaluator.ContainsText(searchText); 455 } 456 457 private Evaluator containsWholeText(boolean own) { 458 String query = own ? ":containsWholeOwnText" : ":containsWholeText"; 459 String searchText = TokenQueue.unescape(consumeParens()); 460 Validate.notEmpty(searchText, query + "(text) query must not be empty"); 461 return own 462 ? new Evaluator.ContainsWholeOwnText(searchText) 463 : new Evaluator.ContainsWholeText(searchText); 464 } 465 466 // pseudo selector :containsData(data) 467 private Evaluator containsData() { 468 String searchText = TokenQueue.unescape(consumeParens()); 469 Validate.notEmpty(searchText, ":containsData(text) query must not be empty"); 470 return new Evaluator.ContainsData(searchText); 471 } 472 473 // :matches(regex), matchesOwn(regex) 474 private Evaluator matches(boolean own) { 475 String query = own ? ":matchesOwn" : ":matches"; 476 String regex = consumeParens(); // don't unescape, as regex bits will be escaped 477 Validate.notEmpty(regex, query + "(regex) query must not be empty"); 478 Regex pattern = Regex.compile(regex); 479 480 if (inNodeContext) 481 return new NodeEvaluator.MatchesValue(pattern); 482 483 return own 484 ? new Evaluator.MatchesOwn(pattern) 485 : new Evaluator.Matches(pattern); 486 } 487 488 // :matches(regex), matchesOwn(regex) 489 private Evaluator matchesWholeText(boolean own) { 490 String query = own ? ":matchesWholeOwnText" : ":matchesWholeText"; 491 String regex = consumeParens(); // don't unescape, as regex bits will be escaped 492 Validate.notEmpty(regex, query + "(regex) query must not be empty"); 493 494 Regex pattern = Regex.compile(regex); 495 return own 496 ? new Evaluator.MatchesWholeOwnText(pattern) 497 : new Evaluator.MatchesWholeText(pattern); 498 } 499 500 // :not(selector) 501 private Evaluator not() { 502 String subQuery = consumeParens(); 503 Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty"); 504 505 return new StructuralEvaluator.Not(parse(subQuery)); 506 } 507 508 @Override 509 public String toString() { 510 return query; 511 } 512 513 @Override 514 public void close() { 515 tq.close(); 516 } 517}