001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.internal.SoftPool; 005import org.jsoup.internal.StringUtil; 006import org.jspecify.annotations.Nullable; 007 008import java.io.IOException; 009import java.io.UncheckedIOException; 010import java.io.Reader; 011import java.io.StringReader; 012import java.util.ArrayList; 013import java.util.Arrays; 014import java.util.Collections; 015import java.util.Locale; 016 017/** 018 CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes. 019 <p>If the underlying reader throws an IOException during any operation, the CharacterReader will throw an 020 {@link UncheckedIOException}. That won't happen with String / StringReader inputs.</p> 021 */ 022public final class CharacterReader implements AutoCloseable { 023 static final char EOF = (char) -1; 024 private static final int MaxStringCacheLen = 12; 025 private static final int StringCacheSize = 512; 026 private String[] stringCache; // holds reused strings in this doc, to lessen garbage 027 private static final SoftPool<String[]> StringPool = new SoftPool<>(() -> new String[StringCacheSize]); // reuse cache between iterations 028 029 static final int BufferSize = 1024 * 2; // visible for testing 030 static final int RefillPoint = BufferSize / 2; // when bufPos characters read, refill; visible for testing 031 private static final int RewindLimit = 1024; // the maximum we can rewind. No HTML entities can be larger than this. 032 033 private Reader reader; // underlying Reader, will be backed by a buffered+controlled input stream, or StringReader 034 private char[] charBuf; // character buffer we consume from; filled from Reader 035 private int bufPos; // position in charBuf that's been consumed to 036 private int bufLength; // the num of characters actually buffered in charBuf, <= charBuf.length 037 private int fillPoint = 0; // how far into the charBuf we read before re-filling. 0.5 of charBuf.length after bufferUp 038 private int consumed; // how many characters total have been consumed from this CharacterReader (less the current bufPos) 039 private int bufMark = -1; // if not -1, the marked rewind position 040 private boolean readFully; // if the underlying stream has been completely read, no value in further buffering 041 042 private static final SoftPool<char[]> BufferPool = new SoftPool<>(() -> new char[BufferSize]); // recycled char buffer 043 044 @Nullable private ArrayList<Integer> newlinePositions = null; // optionally track the pos() position of newlines - scans during bufferUp() 045 private int lineNumberOffset = 1; // line numbers start at 1; += newlinePosition[indexof(pos)] 046 047 public CharacterReader(Reader input, int sz) { 048 this(input); // sz is no longer used 049 } 050 051 public CharacterReader(Reader input) { 052 Validate.notNull(input); 053 reader = input; 054 charBuf = BufferPool.borrow(); 055 stringCache = StringPool.borrow(); 056 bufferUp(); 057 } 058 059 public CharacterReader(String input) { 060 this(new StringReader(input)); 061 } 062 063 @Override 064 public void close() { 065 if (reader == null) 066 return; 067 try { 068 reader.close(); 069 } catch (IOException ignored) { 070 } finally { 071 reader = null; 072 Arrays.fill(charBuf, (char) 0); // before release, clear the buffer. Not required, but acts as a safety net, and makes debug view clearer 073 BufferPool.release(charBuf); 074 charBuf = null; 075 StringPool.release(stringCache); // conversely, we don't clear the string cache, so we can reuse the contents 076 stringCache = null; 077 } 078 } 079 080 private void bufferUp() { 081 if (readFully || bufPos < fillPoint || bufMark != -1) 082 return; 083 doBufferUp(); // structured so bufferUp may become an intrinsic candidate 084 } 085 086 /** 087 Reads into the buffer. Will throw an UncheckedIOException if the underling reader throws an IOException. 088 @throws UncheckedIOException if the underlying reader throws an IOException 089 */ 090 private void doBufferUp() { 091 /* 092 The flow: 093 - if read fully, or if bufPos < fillPoint, or if marked - do not fill. 094 - update readerPos (total amount consumed from this CharacterReader) += bufPos 095 - shift charBuf contents such that bufPos = 0; set next read offset (bufLength) -= shift amount 096 - loop read the Reader until we fill charBuf. bufLength += read. 097 - readFully = true when read = -1 098 */ 099 consumed += bufPos; 100 bufLength -= bufPos; 101 if (bufLength > 0) 102 System.arraycopy(charBuf, bufPos, charBuf, 0, bufLength); 103 bufPos = 0; 104 while (bufLength < BufferSize) { 105 try { 106 int read = reader.read(charBuf, bufLength, charBuf.length - bufLength); 107 if (read == -1) { 108 readFully = true; 109 break; 110 } 111 if (read == 0) { 112 break; // if we have a surrogate on the buffer boundary and trying to read 1; will have enough in our buffer to proceed 113 } 114 bufLength += read; 115 } catch (IOException e) { 116 throw new UncheckedIOException(e); 117 } 118 } 119 fillPoint = Math.min(bufLength, RefillPoint); 120 121 scanBufferForNewlines(); // if enabled, we index newline positions for line number tracking 122 lastIcSeq = null; // cache for last containsIgnoreCase(seq) 123 } 124 125 void mark() { 126 // make sure there is enough look ahead capacity 127 if (bufLength - bufPos < RewindLimit) 128 fillPoint = 0; 129 130 bufferUp(); 131 bufMark = bufPos; 132 } 133 134 void unmark() { 135 bufMark = -1; 136 } 137 138 void rewindToMark() { 139 if (bufMark == -1) 140 throw new UncheckedIOException(new IOException("Mark invalid")); 141 142 bufPos = bufMark; 143 unmark(); 144 } 145 146 /** 147 * Gets the position currently read to in the content. Starts at 0. 148 * @return current position 149 */ 150 public int pos() { 151 return consumed + bufPos; 152 } 153 154 /** Tests if the buffer has been fully read. */ 155 boolean readFully() { 156 return readFully; 157 } 158 159 /** 160 Enables or disables line number tracking. By default, will be <b>off</b>.Tracking line numbers improves the 161 legibility of parser error messages, for example. Tracking should be enabled before any content is read to be of 162 use. 163 164 @param track set tracking on|off 165 @since 1.14.3 166 */ 167 public void trackNewlines(boolean track) { 168 if (track && newlinePositions == null) { 169 newlinePositions = new ArrayList<>(BufferSize / 80); // rough guess of likely count 170 scanBufferForNewlines(); // first pass when enabled; subsequently called during bufferUp 171 } 172 else if (!track) 173 newlinePositions = null; 174 } 175 176 /** 177 Check if the tracking of newlines is enabled. 178 @return the current newline tracking state 179 @since 1.14.3 180 */ 181 public boolean isTrackNewlines() { 182 return newlinePositions != null; 183 } 184 185 /** 186 Get the current line number (that the reader has consumed to). Starts at line #1. 187 @return the current line number, or 1 if line tracking is not enabled. 188 @since 1.14.3 189 @see #trackNewlines(boolean) 190 */ 191 public int lineNumber() { 192 return lineNumber(pos()); 193 } 194 195 int lineNumber(int pos) { 196 // note that this impl needs to be called before the next buffer up or line numberoffset will be wrong. if that 197 // causes issues, can remove the reset of newlinepositions during buffer, at the cost of a larger tracking array 198 if (!isTrackNewlines()) 199 return 1; 200 201 int i = lineNumIndex(pos); 202 if (i == -1) 203 return lineNumberOffset; // first line 204 return i + lineNumberOffset + 1; 205 } 206 207 /** 208 Get the current column number (that the reader has consumed to). Starts at column #1. 209 @return the current column number 210 @since 1.14.3 211 @see #trackNewlines(boolean) 212 */ 213 public int columnNumber() { 214 return columnNumber(pos()); 215 } 216 217 int columnNumber(int pos) { 218 if (!isTrackNewlines()) 219 return pos + 1; 220 221 int i = lineNumIndex(pos); 222 if (i == -1) 223 return pos + 1; 224 return pos - newlinePositions.get(i) + 1; 225 } 226 227 /** 228 Get a formatted string representing the current line and column positions. E.g. <code>5:10</code> indicating line 229 number 5 and column number 10. 230 @return line:col position 231 @since 1.14.3 232 @see #trackNewlines(boolean) 233 */ 234 String posLineCol() { 235 return lineNumber() + ":" + columnNumber(); 236 } 237 238 private int lineNumIndex(int pos) { 239 if (!isTrackNewlines()) return 0; 240 int i = Collections.binarySearch(newlinePositions, pos); 241 if (i < -1) i = Math.abs(i) - 2; 242 return i; 243 } 244 245 /** 246 Scans the buffer for newline position, and tracks their location in newlinePositions. 247 */ 248 private void scanBufferForNewlines() { 249 if (!isTrackNewlines()) 250 return; 251 252 if (newlinePositions.size() > 0) { 253 // work out the line number that we have read up to (as we have likely scanned past this point) 254 int index = lineNumIndex(consumed); 255 if (index == -1) index = 0; // first line 256 int linePos = newlinePositions.get(index); 257 lineNumberOffset += index; // the num lines we've read up to 258 newlinePositions.clear(); 259 newlinePositions.add(linePos); // roll the last read pos to first, for cursor num after buffer 260 } 261 262 for (int i = bufPos; i < bufLength; i++) { 263 if (charBuf[i] == '\n') 264 newlinePositions.add(1 + consumed + i); 265 } 266 } 267 268 /** 269 * Tests if all the content has been read. 270 * @return true if nothing left to read. 271 */ 272 public boolean isEmpty() { 273 bufferUp(); 274 return bufPos >= bufLength; 275 } 276 277 private boolean isEmptyNoBufferUp() { 278 return bufPos >= bufLength; 279 } 280 281 /** 282 * Get the char at the current position. 283 * @return char 284 */ 285 public char current() { 286 bufferUp(); 287 return isEmptyNoBufferUp() ? EOF : charBuf[bufPos]; 288 } 289 290 /** 291 Consume one character off the queue. 292 @return first character on queue, or EOF if the queue is empty. 293 */ 294 public char consume() { 295 bufferUp(); 296 char val = isEmptyNoBufferUp() ? EOF : charBuf[bufPos]; 297 bufPos++; 298 return val; 299 } 300 301 /** 302 Unconsume one character (bufPos--). MUST only be called directly after a consume(), and no chance of a bufferUp. 303 */ 304 void unconsume() { 305 if (bufPos < 1) 306 throw new UncheckedIOException(new IOException("WTF: No buffer left to unconsume.")); // a bug if this fires, need to trace it. 307 308 bufPos--; 309 } 310 311 /** 312 * Moves the current position by one. 313 */ 314 public void advance() { 315 bufPos++; 316 } 317 318 /** 319 * Returns the number of characters between the current position and the next instance of the input char 320 * @param c scan target 321 * @return offset between current position and next instance of target. -1 if not found. 322 */ 323 int nextIndexOf(char c) { 324 // doesn't handle scanning for surrogates 325 bufferUp(); 326 for (int i = bufPos; i < bufLength; i++) { 327 if (c == charBuf[i]) 328 return i - bufPos; 329 } 330 return -1; 331 } 332 333 /** 334 * Returns the number of characters between the current position and the next instance of the input sequence 335 * 336 * @param seq scan target 337 * @return offset between current position and next instance of target. -1 if not found. 338 */ 339 int nextIndexOf(CharSequence seq) { 340 bufferUp(); 341 // doesn't handle scanning for surrogates 342 char startChar = seq.charAt(0); 343 for (int offset = bufPos; offset < bufLength; offset++) { 344 // scan to first instance of startchar: 345 if (startChar != charBuf[offset]) 346 while(++offset < bufLength && startChar != charBuf[offset]) { /* empty */ } 347 int i = offset + 1; 348 int last = i + seq.length()-1; 349 if (offset < bufLength && last <= bufLength) { 350 for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ } 351 if (i == last) // found full sequence 352 return offset - bufPos; 353 } 354 } 355 return -1; 356 } 357 358 /** 359 * Reads characters up to the specific char. 360 * @param c the delimiter 361 * @return the chars read 362 */ 363 public String consumeTo(char c) { 364 int offset = nextIndexOf(c); 365 if (offset != -1) { 366 String consumed = cacheString(charBuf, stringCache, bufPos, offset); 367 bufPos += offset; 368 return consumed; 369 } else { 370 return consumeToEnd(); 371 } 372 } 373 374 /** 375 Reads the characters up to (but not including) the specified case-sensitive string. 376 <p>If the sequence is not found in the buffer, will return the remainder of the current buffered amount, less the 377 length of the sequence, such that this call may be repeated. 378 @param seq the delimiter 379 @return the chars read 380 */ 381 public String consumeTo(String seq) { 382 int offset = nextIndexOf(seq); 383 if (offset != -1) { 384 String consumed = cacheString(charBuf, stringCache, bufPos, offset); 385 bufPos += offset; 386 return consumed; 387 } else if (bufLength - bufPos < seq.length()) { 388 // nextIndexOf() did a bufferUp(), so if the buffer is shorter than the search string, we must be at EOF 389 return consumeToEnd(); 390 } else { 391 // the string we're looking for may be straddling a buffer boundary, so keep (length - 1) characters 392 // unread in case they contain the beginning of the search string 393 int endPos = bufLength - seq.length() + 1; 394 String consumed = cacheString(charBuf, stringCache, bufPos, endPos - bufPos); 395 bufPos = endPos; 396 return consumed; 397 } 398 } 399 400 /** 401 Read characters while the input predicate returns true. 402 @return characters read 403 */ 404 String consumeMatching(CharPredicate func) { 405 return consumeMatching(func, -1); 406 } 407 408 /** 409 Read characters while the input predicate returns true, up to a maximum length. 410 @param func predicate to test 411 @param maxLength maximum length to read. -1 indicates no maximum 412 @return characters read 413 */ 414 String consumeMatching(CharPredicate func, int maxLength) { 415 bufferUp(); 416 int pos = bufPos; 417 final int start = pos; 418 final int remaining = bufLength; 419 final char[] val = charBuf; 420 421 while (pos < remaining && (maxLength == -1 || pos - start < maxLength) && func.test(val[pos])) { 422 pos++; 423 } 424 425 bufPos = pos; 426 return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : ""; 427 } 428 429 /** 430 * Read characters until the first of any delimiters is found. 431 * @param chars delimiters to scan for 432 * @return characters read up to the matched delimiter. 433 */ 434 public String consumeToAny(final char... chars) { 435 return consumeMatching(c -> { // seeks until we see one of the terminating chars 436 for (char seek : chars) 437 if (c == seek) return false; 438 return true; 439 }); 440 } 441 442 String consumeToAnySorted(final char... chars) { 443 return consumeMatching(c -> Arrays.binarySearch(chars, c) < 0); // matches until a hit 444 } 445 446 String consumeData() { 447 // consumes until &, <, null 448 return consumeMatching(c -> c != '&' && c != '<' && c != TokeniserState.nullChar); 449 } 450 451 String consumeAttributeQuoted(final boolean single) { 452 // null, " or ', & 453 return consumeMatching(c -> c != TokeniserState.nullChar && c != '&' && (single ? c != '\'' : c != '"')); 454 } 455 456 String consumeRawData() { 457 // <, null 458 return consumeMatching(c -> c != '<' && c != TokeniserState.nullChar); 459 } 460 461 String consumeTagName() { 462 // '\t', '\n', '\r', '\f', ' ', '/', '>' 463 // NOTE: out of spec; does not stop and append on nullChar but eats 464 return consumeMatching(c -> { 465 switch (c) { 466 case '\t': 467 case '\n': 468 case '\r': 469 case '\f': 470 case ' ': 471 case '/': 472 case '>': 473 return false; 474 } 475 return true; 476 }); 477 } 478 479 String consumeToEnd() { 480 bufferUp(); 481 String data = cacheString(charBuf, stringCache, bufPos, bufLength - bufPos); 482 bufPos = bufLength; 483 return data; 484 } 485 486 String consumeLetterSequence() { 487 return consumeMatching(Character::isLetter); 488 } 489 490 String consumeLetterThenDigitSequence() { 491 bufferUp(); 492 int start = bufPos; 493 while (bufPos < bufLength) { 494 if (StringUtil.isAsciiLetter(charBuf[bufPos])) bufPos++; 495 else break; 496 } 497 while (!isEmptyNoBufferUp()) { 498 if (StringUtil.isDigit(charBuf[bufPos])) bufPos++; 499 else break; 500 } 501 502 return cacheString(charBuf, stringCache, start, bufPos - start); 503 } 504 505 String consumeHexSequence() { 506 return consumeMatching(StringUtil::isHexDigit); 507 } 508 509 String consumeDigitSequence() { 510 return consumeMatching(c -> c >= '0' && c <= '9'); 511 } 512 513 boolean matches(char c) { 514 return !isEmpty() && charBuf[bufPos] == c; 515 } 516 517 boolean matches(String seq) { 518 bufferUp(); 519 int scanLength = seq.length(); 520 if (scanLength > bufLength - bufPos) 521 return false; 522 523 for (int offset = 0; offset < scanLength; offset++) 524 if (seq.charAt(offset) != charBuf[bufPos +offset]) 525 return false; 526 return true; 527 } 528 529 boolean matchesIgnoreCase(String seq) { 530 bufferUp(); 531 int scanLength = seq.length(); 532 if (scanLength > bufLength - bufPos) 533 return false; 534 535 for (int offset = 0; offset < scanLength; offset++) { 536 char scan = seq.charAt(offset); 537 char target = charBuf[bufPos + offset]; 538 if (scan == target) continue; 539 540 scan = Character.toUpperCase(scan); 541 target = Character.toUpperCase(target); 542 if (scan != target) return false; 543 } 544 return true; 545 } 546 547 /** 548 Tests if the next character in the queue matches any of the characters in the sequence, case sensitively. 549 @param seq list of characters to check for 550 @return true if any matched, false if none did 551 */ 552 boolean matchesAny(char... seq) { 553 if (isEmpty()) 554 return false; 555 556 bufferUp(); 557 char c = charBuf[bufPos]; 558 for (char seek : seq) { 559 if (seek == c) 560 return true; 561 } 562 return false; 563 } 564 565 boolean matchesAnySorted(char[] seq) { 566 bufferUp(); 567 return !isEmpty() && Arrays.binarySearch(seq, charBuf[bufPos]) >= 0; 568 } 569 570 /** 571 Checks if the current pos matches an ascii alpha (A-Z a-z) per https://infra.spec.whatwg.org/#ascii-alpha 572 @return if it matches or not 573 */ 574 boolean matchesAsciiAlpha() { 575 if (isEmpty()) return false; 576 return StringUtil.isAsciiLetter(charBuf[bufPos]); 577 } 578 579 boolean matchesDigit() { 580 if (isEmpty()) return false; 581 return StringUtil.isDigit(charBuf[bufPos]); 582 } 583 584 boolean matchConsume(String seq) { 585 bufferUp(); 586 if (matches(seq)) { 587 bufPos += seq.length(); 588 return true; 589 } else { 590 return false; 591 } 592 } 593 594 boolean matchConsumeIgnoreCase(String seq) { 595 if (matchesIgnoreCase(seq)) { 596 bufPos += seq.length(); 597 return true; 598 } else { 599 return false; 600 } 601 } 602 603 // we maintain a cache of the previously scanned sequence, and return that if applicable on repeated scans. 604 // that improves the situation where there is a sequence of <p<p<p<p<p<p<p...</title> and we're bashing on the <p 605 // looking for the </title>. Resets in bufferUp() 606 @Nullable private String lastIcSeq; // scan cache 607 private int lastIcIndex; // nearest found indexOf 608 609 /** Used to check presence of </title>, </style> when we're in RCData and see a <xxx. Only finds consistent case. */ 610 boolean containsIgnoreCase(String seq) { 611 if (seq.equals(lastIcSeq)) { 612 if (lastIcIndex == -1) return false; 613 if (lastIcIndex >= bufPos) return true; 614 } 615 lastIcSeq = seq; 616 617 String loScan = seq.toLowerCase(Locale.ENGLISH); 618 int lo = nextIndexOf(loScan); 619 if (lo > -1) { 620 lastIcIndex = bufPos + lo; return true; 621 } 622 623 String hiScan = seq.toUpperCase(Locale.ENGLISH); 624 int hi = nextIndexOf(hiScan); 625 boolean found = hi > -1; 626 lastIcIndex = found ? bufPos + hi : -1; // we don't care about finding the nearest, just that buf contains 627 return found; 628 } 629 630 @Override 631 public String toString() { 632 if (bufLength - bufPos < 0) return ""; 633 return new String(charBuf, bufPos, bufLength - bufPos); 634 } 635 636 /** 637 * Caches short strings, as a flyweight pattern, to reduce GC load. Just for this doc, to prevent leaks. 638 * <p /> 639 * Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list. 640 * That saves both having to create objects as hash keys, and running through the entry list, at the expense of 641 * some more duplicates. 642 */ 643 private static String cacheString(final char[] charBuf, final String[] stringCache, final int start, final int count) { 644 if (count > MaxStringCacheLen) // don't cache strings that are too big 645 return new String(charBuf, start, count); 646 if (count < 1) 647 return ""; 648 649 // calculate hash: 650 int hash = 0; 651 int end = count + start; 652 for (int i = start; i < end; i++) { 653 hash = 31 * hash + charBuf[i]; 654 } 655 656 // get from cache 657 final int index = hash & StringCacheSize - 1; 658 String cached = stringCache[index]; 659 660 if (cached != null && rangeEquals(charBuf, start, count, cached)) // positive hit 661 return cached; 662 else { 663 cached = new String(charBuf, start, count); 664 stringCache[index] = cached; // add or replace, assuming most recently used are most likely to recur next 665 } 666 667 return cached; 668 } 669 670 /** 671 * Check if the value of the provided range equals the string. 672 */ 673 static boolean rangeEquals(final char[] charBuf, final int start, int count, final String cached) { 674 if (count == cached.length()) { 675 int i = start; 676 int j = 0; 677 while (count-- != 0) { 678 if (charBuf[i++] != cached.charAt(j++)) 679 return false; 680 } 681 return true; 682 } 683 return false; 684 } 685 686 // just used for testing 687 boolean rangeEquals(final int start, final int count, final String cached) { 688 return rangeEquals(charBuf, start, count, cached); 689 } 690 691 @FunctionalInterface 692 interface CharPredicate { 693 boolean test(char c); 694 } 695}