001package org.jsoup.parser; 002 003import org.jsoup.internal.StringUtil; 004import org.jsoup.helper.Validate; 005 006/** 007 A character reader with helpers focusing on parsing CSS selectors. Used internally by jsoup. API subject to changes. 008 */ 009 010public class TokenQueue implements AutoCloseable { 011 private static final char Esc = '\\'; // escape char for chomp balanced. 012 private static final char Hyphen_Minus = '-'; 013 private static final char Unicode_Null = '\u0000'; 014 private static final char Replacement = '\uFFFD'; 015 016 private final CharacterReader reader; 017 018 /** 019 Create a new TokenQueue. 020 @param data string of data to back queue. 021 */ 022 public TokenQueue(String data) { 023 reader = new CharacterReader(data); 024 } 025 026 /** 027 Is the queue empty? 028 @return true if no data left in queue. 029 */ 030 public boolean isEmpty() { 031 return reader.isEmpty(); 032 } 033 034 /** 035 Consume one character off queue. 036 @return first character on queue. 037 */ 038 public char consume() { 039 return reader.consume(); 040 } 041 042 /** 043 Drops the next character off the queue. 044 */ 045 public void advance() { 046 if (!isEmpty()) reader.advance(); 047 } 048 049 char current() { 050 return reader.current(); 051 } 052 053 /** 054 Tests if the next characters on the queue match the sequence, case-insensitively. 055 @param seq String to check queue for. 056 @return true if the next characters match. 057 */ 058 public boolean matches(String seq) { 059 return reader.matchesIgnoreCase(seq); 060 } 061 062 /** Tests if the next character on the queue matches the character, case-sensitively. */ 063 public boolean matches(char c) { 064 return reader.matches(c); 065 } 066 067 /** 068 Tests if the next characters match any of the sequences, case-<b>sensitively</b>. 069 @param seq list of chars to case-sensitively check for 070 @return true of any matched, false if none did 071 */ 072 public boolean matchesAny(char... seq) { 073 return reader.matchesAny(seq); 074 } 075 076 /** 077 If the queue case-insensitively matches the supplied string, consume it off the queue. 078 @param seq String to search for, and if found, remove from queue. 079 @return true if found and removed, false if not found. 080 */ 081 public boolean matchChomp(String seq) { 082 return reader.matchConsumeIgnoreCase(seq); 083 } 084 085 /** If the queue matches the supplied (case-sensitive) character, consume it off the queue. */ 086 public boolean matchChomp(char c) { 087 if (reader.matches(c)) { 088 consume(); 089 return true; 090 } 091 return false; 092 } 093 094 /** 095 Tests if queue starts with a whitespace character. 096 @return if starts with whitespace 097 */ 098 public boolean matchesWhitespace() { 099 return StringUtil.isWhitespace(reader.current()); 100 } 101 102 /** 103 Test if the queue matches a tag word character (letter or digit). 104 @return if matches a word character 105 */ 106 public boolean matchesWord() { 107 return Character.isLetterOrDigit(reader.current()); 108 } 109 110 /** 111 Consumes the supplied sequence of the queue, case-insensitively. If the queue does not start with the supplied 112 sequence, will throw an illegal state exception -- but you should be running match() against that condition. 113 114 @param seq sequence to remove from head of queue. 115 */ 116 public void consume(String seq) { 117 boolean found = reader.matchConsumeIgnoreCase(seq); 118 if (!found) throw new IllegalStateException("Queue did not match expected sequence"); 119 } 120 121 /** 122 Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out. 123 @param seq String to end on (and not include in return, but leave on queue). <b>Case-sensitive.</b> 124 @return The matched data consumed from queue. 125 */ 126 public String consumeTo(String seq) { 127 return reader.consumeTo(seq); 128 } 129 130 /** 131 Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue. 132 @param seq any number of terminators to consume to. <b>Case-insensitive.</b> 133 @return consumed string 134 */ 135 public String consumeToAny(String... seq) { 136 StringBuilder sb = StringUtil.borrowBuilder(); 137 OUT: while (!isEmpty()) { 138 for (String s : seq) { 139 if (reader.matchesIgnoreCase(s)) break OUT; 140 } 141 sb.append(consume()); 142 } 143 return StringUtil.releaseBuilder(sb); 144 } 145 146 /** 147 Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three", 148 and leave " four" on the queue. Unbalanced openers and closers can be quoted (with ' or ") or escaped (with \). 149 Those escapes will be left in the returned string, which is suitable for regexes (where we need to preserve the 150 escape), but unsuitable for contains text strings; use unescape for that. 151 152 @param open opener 153 @param close closer 154 @return data matched from the queue 155 */ 156 public String chompBalanced(char open, char close) { 157 StringBuilder accum = StringUtil.borrowBuilder(); 158 int depth = 0; 159 char prev = 0; 160 boolean inSingle = false; 161 boolean inDouble = false; 162 boolean inRegexQE = false; // regex \Q .. \E escapes from Pattern.quote() 163 reader.mark(); // mark the initial position to restore if needed 164 165 do { 166 if (isEmpty()) break; 167 char c = consume(); 168 if (prev == Esc) { 169 if (c == 'Q') inRegexQE = true; 170 else if (c == 'E') inRegexQE = false; 171 accum.append(c); 172 } else { 173 if (c == '\'' && c != open && !inDouble) inSingle = !inSingle; 174 else if (c == '"' && c != open && !inSingle) inDouble = !inDouble; 175 176 if (inSingle || inDouble || inRegexQE) { 177 accum.append(c); 178 } else if (c == open) { 179 depth++; 180 if (depth > 1) accum.append(c); // don't include the outer match pair in the return 181 } else if (c == close) { 182 depth--; 183 if (depth > 0) accum.append(c); 184 } else { 185 accum.append(c); 186 } 187 } 188 prev = c; 189 } while (depth > 0); 190 191 String out = StringUtil.releaseBuilder(accum); 192 if (depth > 0) {// ran out of queue before seeing enough ) 193 reader.rewindToMark(); // restore position if we don't have a balanced string 194 Validate.fail("Did not find balanced marker at '" + out + "'"); 195 } 196 return out; 197 } 198 199 /** 200 * Unescape a \ escaped string. 201 * @param in backslash escaped string 202 * @return unescaped string 203 */ 204 public static String unescape(String in) { 205 if (in.indexOf(Esc) == -1) return in; 206 207 StringBuilder out = StringUtil.borrowBuilder(); 208 char last = 0; 209 for (char c : in.toCharArray()) { 210 if (c == Esc) { 211 if (last == Esc) { 212 out.append(c); 213 c = 0; 214 } 215 } 216 else 217 out.append(c); 218 last = c; 219 } 220 return StringUtil.releaseBuilder(out); 221 } 222 223 /** 224 Given a CSS identifier (such as a tag, ID, or class), escape any CSS special characters that would otherwise not be 225 valid in a selector. 226 227 @see <a href="https://www.w3.org/TR/cssom-1/#serialize-an-identifier">CSS Object Model, serialize an identifier</a> 228 */ 229 public static String escapeCssIdentifier(String in) { 230 if (in.isEmpty()) return in; 231 232 StringBuilder out = StringUtil.borrowBuilder(); 233 TokenQueue q = new TokenQueue(in); 234 235 char firstChar = q.current(); 236 if (firstChar == Hyphen_Minus) { 237 q.advance(); 238 if (q.isEmpty()) { 239 // If the character is the first character and is a "-" (U+002D), and there is no second character, then 240 // the escaped character. 241 appendEscaped(out, Hyphen_Minus); 242 } else { 243 out.append(Hyphen_Minus); 244 245 char secondChar = q.current(); 246 if (StringUtil.isDigit(secondChar)) { 247 // If the character is the second character and is in the range [0-9] (U+0030 to U+0039) and the 248 // first character is a "-" (U+002D), then the character escaped as code point. 249 appendEscapedCodepoint(out, q.consume()); 250 } 251 } 252 } else if (StringUtil.isDigit(firstChar)) { 253 // If the character is the first character and is in the range [0-9] (U+0030 to U+0039), then the character 254 // escaped as code point. 255 appendEscapedCodepoint(out, q.consume()); 256 } 257 258 while (!q.isEmpty()) { 259 // Note: It's fine to iterate on chars because non-ASCII characters are never escaped. So surrogate pairs 260 // are kept intact. 261 char c = q.consume(); 262 if (c == Unicode_Null) { 263 // If the character is NULL (U+0000), then the REPLACEMENT CHARACTER (U+FFFD). 264 out.append(Replacement); 265 } else if (c <= '\u001F' || c == '\u007F') { 266 // If the character is in the range [\1-\1f] (U+0001 to U+001F) or is U+007F, then the character 267 // escaped as code point. 268 appendEscapedCodepoint(out, c); 269 } else if (isIdent(c)) { 270 // If the character is not handled by one of the above rules and is greater than or equal to U+0080, 271 // is "-" (U+002D) or "_" (U+005F), or is in one of the ranges [0-9] (U+0030 to U+0039), 272 // [A-Z] (U+0041 to U+005A), or [a-z] (U+0061 to U+007A), then the character itself. 273 out.append(c); 274 } else { 275 // Otherwise, the escaped character. 276 appendEscaped(out, c); 277 } 278 } 279 280 q.close(); 281 return StringUtil.releaseBuilder(out); 282 } 283 284 private static void appendEscaped(StringBuilder out, char c) { 285 out.append(Esc).append(c); 286 } 287 288 private static void appendEscapedCodepoint(StringBuilder out, char c) { 289 out.append(Esc).append(Integer.toHexString(c)).append(' '); 290 } 291 292 /** 293 * Pulls the next run of whitespace characters of the queue. 294 * @return Whether consuming whitespace or not 295 */ 296 public boolean consumeWhitespace() { 297 boolean seen = false; 298 while (matchesWhitespace()) { 299 advance(); 300 seen = true; 301 } 302 return seen; 303 } 304 305 /** 306 * Consume a CSS element selector (tag name, but | instead of : for namespaces (or *| for wildcard namespace), to not conflict with :pseudo selects). 307 * 308 * @return tag name 309 */ 310 public String consumeElementSelector() { 311 return consumeEscapedCssIdentifier(ElementSelectorChars); 312 } 313 private static final char[] ElementSelectorChars = {'*', '|', '_', '-'}; 314 315 /** 316 Consume a CSS identifier (ID or class) off the queue. 317 <p>Note: For backwards compatibility this method supports improperly formatted CSS identifiers, e.g. {@code 1} instead 318 of {@code \31}.</p> 319 320 @return The unescaped identifier. 321 @throws IllegalArgumentException if an invalid escape sequence was found. Afterward, the state of the TokenQueue 322 is undefined. 323 @see <a href="https://www.w3.org/TR/css-syntax-3/#consume-name">CSS Syntax Module Level 3, Consume an ident sequence</a> 324 @see <a href="https://www.w3.org/TR/css-syntax-3/#typedef-ident-token">CSS Syntax Module Level 3, ident-token</a> 325 */ 326 public String consumeCssIdentifier() { 327 if (isEmpty()) throw new IllegalArgumentException("CSS identifier expected, but end of input found"); 328 329 // Fast path for CSS identifiers that don't contain escape sequences. 330 String identifier = reader.consumeMatching(TokenQueue::isIdent); 331 char c = current(); 332 if (c != Esc && c != Unicode_Null) { 333 // If we didn't end on an Esc or a Null, we consumed the whole identifier 334 return identifier; 335 } 336 337 // An escape sequence was found. Use a StringBuilder to store the decoded CSS identifier. 338 StringBuilder out = StringUtil.borrowBuilder(); 339 if (!identifier.isEmpty()) { 340 // Copy the CSS identifier up to the first escape sequence. 341 out.append(identifier); 342 } 343 344 while (!isEmpty()) { 345 c = current(); 346 if (isIdent(c)) { 347 out.append(consume()); 348 } else if (c == Unicode_Null) { 349 // https://www.w3.org/TR/css-syntax-3/#input-preprocessing 350 advance(); 351 out.append(Replacement); 352 } else if (c == Esc) { 353 advance(); 354 if (!isEmpty() && isNewline(current())) { 355 // Not a valid escape sequence. This is treated as the end of the CSS identifier. 356 reader.unconsume(); 357 break; 358 } else { 359 consumeCssEscapeSequenceInto(out); 360 } 361 } else { 362 break; 363 } 364 } 365 return StringUtil.releaseBuilder(out); 366 } 367 368 private void consumeCssEscapeSequenceInto(StringBuilder out) { 369 if (isEmpty()) { 370 out.append(Replacement); 371 return; 372 } 373 374 char firstEscaped = consume(); 375 if (!StringUtil.isHexDigit(firstEscaped)) { 376 out.append(firstEscaped); 377 } else { 378 reader.unconsume(); // put back the first hex digit 379 String hexString = reader.consumeMatching(StringUtil::isHexDigit, 6); // consume up to 6 hex digits 380 int codePoint; 381 try { 382 codePoint = Integer.parseInt(hexString, 16); 383 } catch (NumberFormatException e) { 384 throw new IllegalArgumentException("Invalid escape sequence: " + hexString, e); 385 } 386 if (isValidCodePoint(codePoint)) { 387 out.appendCodePoint(codePoint); 388 } else { 389 out.append(Replacement); 390 } 391 392 if (!isEmpty()) { 393 char c = current(); 394 if (c == '\r') { 395 // Since there's currently no input preprocessing, check for CRLF here. 396 // https://www.w3.org/TR/css-syntax-3/#input-preprocessing 397 advance(); 398 if (!isEmpty() && current() == '\n') advance(); 399 } else if (c == ' ' || c == '\t' || isNewline(c)) { 400 advance(); 401 } 402 } 403 } 404 } 405 406 // statics below specifically for CSS identifiers: 407 408 // https://www.w3.org/TR/css-syntax-3/#non-ascii-code-point 409 private static boolean isNonAscii(char c) { 410 return c >= '\u0080'; 411 } 412 413 // https://www.w3.org/TR/css-syntax-3/#ident-start-code-point 414 private static boolean isIdentStart(char c) { 415 return c == '_' || StringUtil.isAsciiLetter(c) || isNonAscii(c); 416 } 417 418 // https://www.w3.org/TR/css-syntax-3/#ident-code-point 419 private static boolean isIdent(char c) { 420 return c == Hyphen_Minus || StringUtil.isDigit(c) || isIdentStart(c); 421 } 422 423 // https://www.w3.org/TR/css-syntax-3/#newline 424 // Note: currently there's no preprocessing happening. 425 private static boolean isNewline(char c) { 426 return c == '\n' || c == '\r' || c == '\f'; 427 } 428 429 // https://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point 430 private static boolean isValidCodePoint(int codePoint) { 431 return codePoint != 0 && Character.isValidCodePoint(codePoint) && !Character.isSurrogate((char) codePoint); 432 } 433 434 private static final char[] CssIdentifierChars = {'-', '_'}; 435 436 private String consumeEscapedCssIdentifier(char... matches) { 437 StringBuilder sb = StringUtil.borrowBuilder(); 438 while (!isEmpty()) { 439 char c = current(); 440 if (c == Esc) { 441 advance(); 442 if (!isEmpty()) sb.append(consume()); 443 else break; 444 } else if (matchesCssIdentifier(matches)) { 445 sb.append(c); 446 advance(); 447 } else { 448 break; 449 } 450 } 451 return StringUtil.releaseBuilder(sb); 452 } 453 454 private boolean matchesCssIdentifier(char... matches) { 455 return matchesWord() || reader.matchesAny(matches); 456 } 457 458 /** 459 Consume and return whatever is left on the queue. 460 @return remainder of queue. 461 */ 462 public String remainder() { 463 return reader.consumeToEnd(); 464 } 465 466 @Override 467 public String toString() { 468 return reader.toString(); 469 } 470 471 @Override 472 public void close() { 473 reader.close(); // releases buffer back to pool 474 } 475}