package org.jsoup.helper;

import org.jsoup.Connection;
import org.jsoup.internal.ControllableInputStream;
import org.jsoup.internal.Normalizer;
import org.jsoup.internal.SimpleStreamReader;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.XmlDeclaration;
import org.jsoup.parser.Parser;
import org.jsoup.parser.StreamParser;
import org.jsoup.select.Elements;
import org.jsoup.select.Evaluator;
import org.jsoup.select.Selector;
import org.jspecify.annotations.Nullable;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import static org.jsoup.internal.SharedConstants.DefaultBufferSize;

/**
 * Internal static utilities for handling data: loading documents from files and streams, detecting the character set
 * of the input (from a BOM, a meta element, or an XML declaration), and a few small helpers.
 */
@SuppressWarnings("CharsetObjectCanBeUsed")
public final class DataUtil {
    private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*(?:[\"'])?([^\\s,;\"']*)");
    public static final Charset UTF_8 = Charset.forName("UTF-8"); // Don't use StandardCharsets, as those only appear in Android API 19, and we target 10.
    static final String defaultCharsetName = UTF_8.name(); // used if not found in header or meta charset
    private static final int firstReadBufferSize = 1024 * 5;
    private static final char[] mimeBoundaryChars =
        "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray();
    static final int boundaryLength = 32;

    private DataUtil() {}

    /**
     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in
     * {@code .gz} or {@code .z}) are supported in addition to uncompressed files.
     *
     * @param file file to load
     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
     *     the file will always override this setting.
     * @param baseUri base URI of document, to resolve relative links against
     * @return Document
     * @throws IOException on IO error
     */
    public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException {
        return load(file.toPath(), charsetName, baseUri);
    }

    /**
     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or
     * {@code .z}) are supported in addition to uncompressed files.
     *
     * @param file file to load
     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
     *     the file will always override this setting.
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser alternate {@link Parser#xmlParser() parser} to use.
     * @return Document
     * @throws IOException on IO error
     * @since 1.14.2
     */
    public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        return load(file.toPath(), charsetName, baseUri, parser);
    }

    /**
     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in
     * {@code .gz} or {@code .z}) are supported in addition to uncompressed files.
     *
     * @param path file to load
     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
     *     the file will always override this setting.
     * @param baseUri base URI of document, to resolve relative links against
     * @return Document
     * @throws IOException on IO error
     */
    public static Document load(Path path, @Nullable String charsetName, String baseUri) throws IOException {
        return load(path, charsetName, baseUri, Parser.htmlParser());
    }

    /**
     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or
     * {@code .z}) are supported in addition to uncompressed files.
     *
     * @param path file to load
     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
     *     the file will always override this setting.
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser alternate {@link Parser#xmlParser() parser} to use.
     * @return Document
     * @throws IOException on IO error
     * @since 1.17.2
     */
    public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        return parseInputStream(openStream(path), charsetName, baseUri, parser);
    }

    /**
     * Returns a {@link StreamParser} that will parse the supplied file progressively.
     * Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
     * are supported in addition to uncompressed files.
     *
     * @param path file to load
     * @param charset (optional) character set of input; specify {@code null} to attempt to autodetect from metadata.
     *     A BOM in the file will always override this setting.
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser underlying HTML or XML parser to use.
     * @return a StreamParser that has been initialized with the file's content, but not yet stepped
     * @throws IOException on IO error
     * @since 1.18.2
     * @see Connection.Response#streamParser()
     */
    public static StreamParser streamParser(Path path, @Nullable Charset charset, String baseUri, Parser parser) throws IOException {
        StreamParser streamer = new StreamParser(parser);
        String charsetName = charset != null ? charset.name() : null;
        ControllableInputStream input = null; // kept so the file can be released if setup fails
        try {
            input = openStream(path);
            CharsetDoc charsetDoc = detectCharsetForStreamParser(input, charsetName, baseUri, parser);
            Reader reader = new SimpleStreamReader(charsetDoc.input, charsetDoc.charset);
            streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
        } catch (IOException | RuntimeException e) {
            // setup failed (IO error, or an empty / unsupported charset name): release the parser and the opened file
            streamer.close();
            if (input != null) {
                try {
                    input.close();
                } catch (IOException closeFailure) {
                    e.addSuppressed(closeFailure); // don't let a close failure hide the original cause
                }
            }
            throw e;
        }
        return streamer;
    }

    /** Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. */
    private static ControllableInputStream openStream(Path path) throws IOException {
        final SeekableByteChannel byteChannel = Files.newByteChannel(path);
        InputStream stream = Channels.newInputStream(byteChannel);
        String name = Normalizer.lowerCase(path.getFileName().toString());
        if (name.endsWith(".gz") || name.endsWith(".z")) {
            try {
                final boolean zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes
                byteChannel.position(0); // reset to start of file
                if (zipped) stream = new GZIPInputStream(stream);
            } catch (IOException e) {
                stream.close(); // error during our first read; close the stream and cascade close byteChannel
                throw e;
            }
        }
        return ControllableInputStream.wrap(stream, 0);
    }

    /**
     * Parses a Document from an input stream.
     *
     * @param in input stream to parse. The stream will be closed after reading.
     * @param charsetName character set of input (optional)
     * @param baseUri base URI of document, to resolve relative links against
     * @return Document
     * @throws IOException on IO error
     */
    public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException {
        return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, Parser.htmlParser());
    }

    /**
     * Parses a Document from an input stream, using the provided Parser.
     *
     * @param in input stream to parse. The stream will be closed after reading.
     * @param charsetName character set of input (optional)
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser alternate {@link Parser#xmlParser() parser} to use.
     * @return Document
     * @throws IOException on IO error
     */
    public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, parser);
    }

    /**
     * Writes the input stream to the output stream. Doesn't close them.
     *
     * @param in input stream to read from
     * @param out output stream to write to
     * @throws IOException on IO error
     */
    static void crossStreams(final InputStream in, final OutputStream out) throws IOException {
        final byte[] buffer = new byte[DefaultBufferSize];
        int len;
        while ((len = in.read(buffer)) != -1) {
            out.write(buffer, 0, len);
        }
    }

    /** A struct to return a detected charset, and a document (if fully read). */
    static class CharsetDoc {
        Charset charset;
        InputStream input;
        @Nullable Document doc;

        CharsetDoc(Charset charset, @Nullable Document doc, InputStream input) {
            this.charset = charset;
            this.input = input;
            this.doc = doc;
        }
    }

    /**
     * Detects the charset of the input, parses it to a Document, and closes the input.
     *
     * @param input the stream to parse; if {@code null}, an empty Document is returned
     * @param charsetName character set of input, or {@code null} to detect it from the content
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser the parser to use
     * @return the parsed Document
     * @throws IOException on IO error
     */
    static Document parseInputStream(@Nullable ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        if (input == null) return new Document(baseUri); // empty body

        try {
            CharsetDoc charsetDoc = detectCharset(input, charsetName, baseUri, parser);
            return parseInputStream(charsetDoc, baseUri, parser);
        } finally {
            // charsetDoc.input is this same stream; closing it here also covers a failure inside detectCharset
            input.close();
        }
    }

    private static final Evaluator metaCharset = Selector.evaluatorOf("meta[http-equiv=content-type], meta[charset]");

    /** Detects charset for a regular parse, and may reuse a fully sniffed document. */
    static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        return detectCharset(input, charsetName, baseUri, parser, true);
    }

    /** Detects charset for a stream parse, and leaves the input readable for subsequent parsing. */
    static CharsetDoc detectCharsetForStreamParser(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        return detectCharset(input, charsetName, baseUri, parser, false);
    }

    /**
     * Shared charset detection worker; regular parse can reuse a fully sniffed doc, stream parse cannot.
     * <p>Precedence: a BOM overrides everything; otherwise the supplied {@code charsetName} is used; otherwise the
     * first {@code firstReadBufferSize} bytes are parsed as UTF-8 and searched for a meta charset or an XML
     * declaration encoding. If nothing usable is found, UTF-8 is used.</p>
     *
     * @param input the stream to inspect
     * @param charsetName caller supplied charset name, or {@code null} to detect from the content
     * @param baseUri base URI used for the sniffing parse
     * @param parser the parser used for the sniffing parse
     * @param reuseDocIfFullyRead if true, and the sniffing parse consumed the whole input with the right charset, the
     *     sniffed Document is returned in the result rather than being discarded
     * @return the resolved charset, the input, and possibly an already parsed Document
     * @throws IOException on IO error
     */
    private static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser, boolean reuseDocIfFullyRead) throws IOException {
        Document doc = null;
        // read the start of the stream and look for a BOM or meta charset:
        // look for BOM - overrides any other header or input
        String bomCharset = detectCharsetFromBom(input); // resets / consumes appropriately
        if (bomCharset != null)
            charsetName = bomCharset;

        if (charsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8
            int origMax = input.max();
            input.max(firstReadBufferSize);
            input.resetFullyRead(); // clear any pre-read (e.g., BOM) state before capped sniff
            input.mark(firstReadBufferSize);
            input.allowClose(false); // ignores closes during parse, in case we need to rewind
            try (Reader reader = new SimpleStreamReader(input, UTF_8)) { // input is currently capped to firstReadBufferSize
                doc = parser.parseInput(reader, baseUri);
                input.reset();
                input.max(origMax); // reset for a full read if required
            } catch (UncheckedIOException e) {
                throw e.getCause();
            } finally {
                input.allowClose(true);
            }

            // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
            Elements metaElements = doc.select(metaCharset);
            String foundCharset = null; // if not found, will keep utf-8 as best attempt
            for (Element meta : metaElements) {
                if (meta.hasAttr("http-equiv"))
                    foundCharset = getCharsetFromContentType(meta.attr("content"));
                if (foundCharset == null && meta.hasAttr("charset"))
                    foundCharset = meta.attr("charset");
                if (foundCharset != null)
                    break;
            }

            // look for <?xml encoding='ISO-8859-1'?>
            if (foundCharset == null && doc.childNodeSize() > 0) {
                Node first = doc.childNode(0);
                XmlDeclaration decl = null;
                if (first instanceof XmlDeclaration)
                    decl = (XmlDeclaration) first;
                else if (first instanceof Comment) {
                    Comment comment = (Comment) first;
                    if (comment.isXmlDeclaration())
                        decl = comment.asXmlDeclaration();
                }
                if (decl != null && decl.name().equalsIgnoreCase("xml")) {
                    foundCharset = decl.attr("encoding");
                }
            }
            // validateCharset trims, strips quotes, and returns null unless the name is supported
            foundCharset = validateCharset(foundCharset);
            if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case-insensitive check here to match how validate works)
                charsetName = foundCharset;
                doc = null;
            } else if (reuseDocIfFullyRead && input.baseReadFully()) { // keep the current parse if the caller can use a fully read doc
                input.close(); // the parser tried to close it
            } else {
                doc = null;
            }
        } else { // specified by content type header (or by user on file load)
            Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
        }

        // finally: prepare the return struct
        if (charsetName == null)
            charsetName = defaultCharsetName;
        Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName);
        return new CharsetDoc(charset, doc, input);
    }

    /**
     * Parses the input held by the CharsetDoc using its detected charset, unless a Document was already produced
     * during charset detection, in which case that Document is returned directly.
     *
     * @param charsetDoc the detection result
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser the parser to use
     * @return the parsed Document
     * @throws IOException on IO error
     */
    static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException {
        // if doc != null it was fully parsed during charset detection; so just return that
        if (charsetDoc.doc != null)
            return charsetDoc.doc;

        final InputStream input = charsetDoc.input;
        Validate.notNull(input);
        final Document doc;
        final Charset charset = charsetDoc.charset;
        try (Reader reader = new SimpleStreamReader(input, charset)) {
            try {
                doc = parser.parseInput(reader, baseUri);
            } catch (UncheckedIOException e) {
                // io exception when parsing (not seen before because reading the stream as we go)
                throw e.getCause();
            }
            doc.outputSettings().charset(charset);
            if (!charset.canEncode()) {
                // some charsets can read but not encode; switch to an encodable charset and update the meta el
                doc.charset(UTF_8);
            }
        }
        return doc;
    }

    /**
     * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this
     * method is executing on. The data read until being interrupted will be available.
     *
     * @param inStream the input stream to read from
     * @param maxSize the maximum size in bytes to read from the stream. Set to 0 to be unlimited.
     * @return the filled byte buffer
     * @throws IOException if an exception occurs whilst reading from the input stream.
     */
    public static ByteBuffer readToByteBuffer(InputStream inStream, int maxSize) throws IOException {
        return ControllableInputStream.readToByteBuffer(inStream, maxSize);
    }

    /** Returns a new, zero-capacity byte buffer. */
    static ByteBuffer emptyByteBuffer() {
        return ByteBuffer.allocate(0);
    }

    /**
     * Parse out a charset from a content type header. If the charset is not supported, returns null (so the default
     * will kick in.)
     *
     * @param contentType e.g. "text/html; charset=EUC-JP"
     * @return "EUC-JP", or null if not found or not supported. The name is trimmed and stripped of quotes; it is
     *     returned as written if supported in that form, otherwise uppercased.
     */
    static @Nullable String getCharsetFromContentType(@Nullable String contentType) {
        if (contentType == null) return null;
        Matcher m = charsetPattern.matcher(contentType);
        if (m.find()) {
            String charset = m.group(1).trim();
            charset = charset.replace("charset=", "");
            return validateCharset(charset);
        }
        return null;
    }

    /**
     * Normalizes a candidate charset name (trims it and removes quote characters) and checks it is supported, first
     * as given and then uppercased.
     *
     * @param cs candidate charset name, may be null
     * @return the supported charset name, or null if the input is null, empty, illegal, or unsupported
     */
    private @Nullable static String validateCharset(@Nullable String cs) {
        if (cs == null || cs.length() == 0) return null;
        cs = cs.trim().replaceAll("[\"']", "");
        try {
            if (Charset.isSupported(cs)) return cs;
            cs = cs.toUpperCase(Locale.ENGLISH);
            if (Charset.isSupported(cs)) return cs;
        } catch (IllegalCharsetNameException e) {
            // if all this charset matching fails.... we just take the default
        }
        return null;
    }

    /**
     * Creates a random string, suitable for use as a mime boundary
     */
    static String mimeBoundary() {
        final StringBuilder mime = StringUtil.borrowBuilder();
        final Random rand = new Random();
        for (int i = 0; i < boundaryLength; i++) {
            mime.append(mimeBoundaryChars[rand.nextInt(mimeBoundaryChars.length)]);
        }
        return StringUtil.releaseBuilder(mime);
    }

    /**
     * Peeks at the first four bytes of the input for a byte order mark. The input is reset after the peek; a UTF-8
     * BOM is then consumed, whereas UTF-16 and UTF-32 BOMs are left in place for the decoder.
     *
     * @param input the stream to inspect
     * @return "UTF-32", "UTF-16", or "UTF-8" when the matching BOM is present, otherwise null
     * @throws IOException on IO error
     */
    private static @Nullable String detectCharsetFromBom(ControllableInputStream input) throws IOException {
        byte[] bom = new byte[4];
        input.mark(bom.length);
        //noinspection ResultOfMethodCallIgnored
        input.read(bom, 0, 4);
        input.reset();

        // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
        if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE
            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE
            return "UTF-32"; // and I hope it's on your system
        } else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE
            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
            return "UTF-16"; // in all Javas
        } else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
            input.read(bom, 0, 3); // consume the UTF-8 BOM
            return "UTF-8"; // in all Javas
        }
        return null;
    }
}