001package org.jsoup.helper; 002 003import org.jsoup.Connection; 004import org.jsoup.internal.ControllableInputStream; 005import org.jsoup.internal.Normalizer; 006import org.jsoup.internal.SimpleStreamReader; 007import org.jsoup.internal.StringUtil; 008import org.jsoup.nodes.Comment; 009import org.jsoup.nodes.Document; 010import org.jsoup.nodes.Element; 011import org.jsoup.nodes.Node; 012import org.jsoup.nodes.XmlDeclaration; 013import org.jsoup.parser.Parser; 014import org.jsoup.parser.StreamParser; 015import org.jsoup.select.Elements; 016import org.jsoup.select.Evaluator; 017import org.jsoup.select.Selector; 018import org.jspecify.annotations.Nullable; 019 020import java.io.File; 021import java.io.IOException; 022import java.io.InputStream; 023import java.io.OutputStream; 024import java.io.Reader; 025import java.io.UncheckedIOException; 026import java.nio.ByteBuffer; 027import java.nio.channels.Channels; 028import java.nio.channels.SeekableByteChannel; 029import java.nio.charset.Charset; 030import java.nio.charset.IllegalCharsetNameException; 031import java.nio.file.Files; 032import java.nio.file.Path; 033import java.util.Locale; 034import java.util.Random; 035import java.util.regex.Matcher; 036import java.util.regex.Pattern; 037import java.util.zip.GZIPInputStream; 038 039import static org.jsoup.internal.SharedConstants.DefaultBufferSize; 040 041/** 042 * Internal static utilities for handling data. 043 * 044 */ 045@SuppressWarnings("CharsetObjectCanBeUsed") 046public final class DataUtil { 047 private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*(?:[\"'])?([^\\s,;\"']*)"); 048 public static final Charset UTF_8 = Charset.forName("UTF-8"); // Don't use StandardCharsets, as those only appear in Android API 19, and we target 10. 049 static final String defaultCharsetName = UTF_8.name(); // used if not found in header or meta charset 050 private static final int firstReadBufferSize = 1024 * 5; 051 private static final char[] mimeBoundaryChars = 052 "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray(); 053 static final int boundaryLength = 32; 054 055 private DataUtil() {} 056 057 /** 058 * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) 059 * are supported in addition to uncompressed files. 060 * 061 * @param file file to load 062 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in 063 * the file will always override this setting. 064 * @param baseUri base URI of document, to resolve relative links against 065 * @return Document 066 * @throws IOException on IO error 067 */ 068 public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException { 069 return load(file.toPath(), charsetName, baseUri); 070 } 071 072 /** 073 * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) 074 * are supported in addition to uncompressed files. 075 * 076 * @param file file to load 077 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in 078 * the file will always override this setting. 079 * @param baseUri base URI of document, to resolve relative links against 080 * @param parser alternate {@link Parser#xmlParser() parser} to use. 081 082 * @return Document 083 * @throws IOException on IO error 084 * @since 1.14.2 085 */ 086 public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 087 return load(file.toPath(), charsetName, baseUri, parser); 088 } 089 090 /** 091 * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) 092 * are supported in addition to uncompressed files. 093 * 094 * @param path file to load 095 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in 096 * the file will always override this setting. 097 * @param baseUri base URI of document, to resolve relative links against 098 * @return Document 099 * @throws IOException on IO error 100 */ 101 public static Document load(Path path, @Nullable String charsetName, String baseUri) throws IOException { 102 return load(path, charsetName, baseUri, Parser.htmlParser()); 103 } 104 105 /** 106 * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) 107 * are supported in addition to uncompressed files. 108 * 109 * @param path file to load 110 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in 111 * the file will always override this setting. 112 * @param baseUri base URI of document, to resolve relative links against 113 * @param parser alternate {@link Parser#xmlParser() parser} to use. 114 115 * @return Document 116 * @throws IOException on IO error 117 * @since 1.17.2 118 */ 119 public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 120 return parseInputStream(openStream(path), charsetName, baseUri, parser); 121 } 122 123 /** 124 * Returns a {@link StreamParser} that will parse the supplied file progressively. 125 * Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) 126 * are supported in addition to uncompressed files. 127 * 128 * @param path file to load 129 * @param charset (optional) character set of input; specify {@code null} to attempt to autodetect from metadata. 130 * A BOM in the file will always override this setting. 131 * @param baseUri base URI of document, to resolve relative links against 132 * @param parser underlying HTML or XML parser to use. 133 134 * @return Document 135 * @throws IOException on IO error 136 * @since 1.18.2 137 * @see Connection.Response#streamParser() 138 */ 139 public static StreamParser streamParser(Path path, @Nullable Charset charset, String baseUri, Parser parser) throws IOException { 140 StreamParser streamer = new StreamParser(parser); 141 String charsetName = charset != null? charset.name() : null; 142 try { 143 DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(openStream(path), charsetName, baseUri, parser); 144 Reader reader = new SimpleStreamReader(charsetDoc.input, charsetDoc.charset); 145 streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it 146 } catch (IOException e) { 147 streamer.close(); 148 throw e; 149 } 150 return streamer; 151 } 152 153 /** Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. */ 154 private static ControllableInputStream openStream(Path path) throws IOException { 155 final SeekableByteChannel byteChannel = Files.newByteChannel(path); 156 InputStream stream = Channels.newInputStream(byteChannel); 157 String name = Normalizer.lowerCase(path.getFileName().toString()); 158 if (name.endsWith(".gz") || name.endsWith(".z")) { 159 try { 160 final boolean zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes 161 byteChannel.position(0); // reset to start of file 162 if (zipped) stream = new GZIPInputStream(stream); 163 } catch (IOException e) { 164 stream.close(); // error during our first read; close the stream and cascade close byteChannel 165 throw e; 166 } 167 } 168 return ControllableInputStream.wrap(stream, 0); 169 } 170 171 /** 172 * Parses a Document from an input steam. 173 * @param in input stream to parse. The stream will be closed after reading. 174 * @param charsetName character set of input (optional) 175 * @param baseUri base URI of document, to resolve relative links against 176 * @return Document 177 * @throws IOException on IO error 178 */ 179 public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException { 180 return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, Parser.htmlParser()); 181 } 182 183 /** 184 * Parses a Document from an input steam, using the provided Parser. 185 * @param in input stream to parse. The stream will be closed after reading. 186 * @param charsetName character set of input (optional) 187 * @param baseUri base URI of document, to resolve relative links against 188 * @param parser alternate {@link Parser#xmlParser() parser} to use. 189 * @return Document 190 * @throws IOException on IO error 191 */ 192 public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 193 return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, parser); 194 } 195 196 /** 197 * Writes the input stream to the output stream. Doesn't close them. 198 * @param in input stream to read from 199 * @param out output stream to write to 200 * @throws IOException on IO error 201 */ 202 static void crossStreams(final InputStream in, final OutputStream out) throws IOException { 203 final byte[] buffer = new byte[DefaultBufferSize]; 204 int len; 205 while ((len = in.read(buffer)) != -1) { 206 out.write(buffer, 0, len); 207 } 208 } 209 210 /** A struct to return a detected charset, and a document (if fully read). */ 211 static class CharsetDoc { 212 Charset charset; 213 InputStream input; 214 @Nullable Document doc; 215 216 CharsetDoc(Charset charset, @Nullable Document doc, InputStream input) { 217 this.charset = charset; 218 this.input = input; 219 this.doc = doc; 220 } 221 } 222 223 static Document parseInputStream(@Nullable ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 224 if (input == null) return new Document(baseUri); // empty body 225 226 final Document doc; 227 CharsetDoc charsetDoc = null; 228 try { 229 charsetDoc = detectCharset(input, charsetName, baseUri, parser); 230 doc = parseInputStream(charsetDoc, baseUri, parser); 231 } finally { 232 if (charsetDoc != null) 233 charsetDoc.input.close(); 234 } 235 return doc; 236 } 237 238 private static final Evaluator metaCharset = Selector.evaluatorOf("meta[http-equiv=content-type], meta[charset]"); 239 240 static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 241 Document doc = null; 242 // read the start of the stream and look for a BOM or meta charset: 243 // look for BOM - overrides any other header or input 244 String bomCharset = detectCharsetFromBom(input); // resets / consumes appropriately 245 if (bomCharset != null) 246 charsetName = bomCharset; 247 248 if (charsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8 249 int origMax = input.max(); 250 input.max(firstReadBufferSize); 251 input.resetFullyRead(); // clear any pre-read (e.g., BOM) state before capped sniff 252 input.mark(firstReadBufferSize); 253 input.allowClose(false); // ignores closes during parse, in case we need to rewind 254 try (Reader reader = new SimpleStreamReader(input, UTF_8)) { // input is currently capped to firstReadBufferSize 255 doc = parser.parseInput(reader, baseUri); 256 input.reset(); 257 input.max(origMax); // reset for a full read if required 258 } catch (UncheckedIOException e) { 259 throw e.getCause(); 260 } finally { 261 input.allowClose(true); 262 } 263 264 // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312"> 265 Elements metaElements = doc.select(metaCharset); 266 String foundCharset = null; // if not found, will keep utf-8 as best attempt 267 for (Element meta : metaElements) { 268 if (meta.hasAttr("http-equiv")) 269 foundCharset = getCharsetFromContentType(meta.attr("content")); 270 if (foundCharset == null && meta.hasAttr("charset")) 271 foundCharset = meta.attr("charset"); 272 if (foundCharset != null) 273 break; 274 } 275 276 // look for <?xml encoding='ISO-8859-1'?> 277 if (foundCharset == null && doc.childNodeSize() > 0) { 278 Node first = doc.childNode(0); 279 XmlDeclaration decl = null; 280 if (first instanceof XmlDeclaration) 281 decl = (XmlDeclaration) first; 282 else if (first instanceof Comment) { 283 Comment comment = (Comment) first; 284 if (comment.isXmlDeclaration()) 285 decl = comment.asXmlDeclaration(); 286 } 287 if (decl != null && decl.name().equalsIgnoreCase("xml")) { 288 foundCharset = decl.attr("encoding"); 289 } 290 } 291 foundCharset = validateCharset(foundCharset); 292 if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case-insensitive check here to match how validate works) 293 foundCharset = foundCharset.trim().replaceAll("[\"']", ""); 294 charsetName = foundCharset; 295 doc = null; 296 } else if (input.baseReadFully()) { // if we have read fully, and the charset was correct, keep that current parse 297 input.close(); // the parser tried to close it 298 } else { 299 doc = null; 300 } 301 } else { // specified by content type header (or by user on file load) 302 Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML"); 303 } 304 305 // finally: prepare the return struct 306 if (charsetName == null) 307 charsetName = defaultCharsetName; 308 Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName); 309 return new CharsetDoc(charset, doc, input); 310 } 311 312 static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException { 313 // if doc != null it was fully parsed during charset detection; so just return that 314 if (charsetDoc.doc != null) 315 return charsetDoc.doc; 316 317 final InputStream input = charsetDoc.input; 318 Validate.notNull(input); 319 final Document doc; 320 final Charset charset = charsetDoc.charset; 321 try (Reader reader = new SimpleStreamReader(input, charset)) { 322 try { 323 doc = parser.parseInput(reader, baseUri); 324 } catch (UncheckedIOException e) { 325 // io exception when parsing (not seen before because reading the stream as we go) 326 throw e.getCause(); 327 } 328 doc.outputSettings().charset(charset); 329 if (!charset.canEncode()) { 330 // some charsets can read but not encode; switch to an encodable charset and update the meta el 331 doc.charset(UTF_8); 332 } 333 } 334 return doc; 335 } 336 337 /** 338 * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this 339 * method is executing on. The data read until being interrupted will be available. 340 * @param inStream the input stream to read from 341 * @param maxSize the maximum size in bytes to read from the stream. Set to 0 to be unlimited. 342 * @return the filled byte buffer 343 * @throws IOException if an exception occurs whilst reading from the input stream. 344 */ 345 public static ByteBuffer readToByteBuffer(InputStream inStream, int maxSize) throws IOException { 346 return ControllableInputStream.readToByteBuffer(inStream, maxSize); 347 } 348 349 static ByteBuffer emptyByteBuffer() { 350 return ByteBuffer.allocate(0); 351 } 352 353 /** 354 * Parse out a charset from a content type header. If the charset is not supported, returns null (so the default 355 * will kick in.) 356 * @param contentType e.g. "text/html; charset=EUC-JP" 357 * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased. 358 */ 359 static @Nullable String getCharsetFromContentType(@Nullable String contentType) { 360 if (contentType == null) return null; 361 Matcher m = charsetPattern.matcher(contentType); 362 if (m.find()) { 363 String charset = m.group(1).trim(); 364 charset = charset.replace("charset=", ""); 365 return validateCharset(charset); 366 } 367 return null; 368 } 369 370 private @Nullable static String validateCharset(@Nullable String cs) { 371 if (cs == null || cs.length() == 0) return null; 372 cs = cs.trim().replaceAll("[\"']", ""); 373 try { 374 if (Charset.isSupported(cs)) return cs; 375 cs = cs.toUpperCase(Locale.ENGLISH); 376 if (Charset.isSupported(cs)) return cs; 377 } catch (IllegalCharsetNameException e) { 378 // if all this charset matching fails.... we just take the default 379 } 380 return null; 381 } 382 383 /** 384 * Creates a random string, suitable for use as a mime boundary 385 */ 386 static String mimeBoundary() { 387 final StringBuilder mime = StringUtil.borrowBuilder(); 388 final Random rand = new Random(); 389 for (int i = 0; i < boundaryLength; i++) { 390 mime.append(mimeBoundaryChars[rand.nextInt(mimeBoundaryChars.length)]); 391 } 392 return StringUtil.releaseBuilder(mime); 393 } 394 395 private static @Nullable String detectCharsetFromBom(ControllableInputStream input) throws IOException { 396 byte[] bom = new byte[4]; 397 input.mark(bom.length); 398 //noinspection ResultOfMethodCallIgnored 399 input.read(bom, 0, 4); 400 input.reset(); 401 402 // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here 403 if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE 404 bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE 405 return "UTF-32"; // and I hope it's on your system 406 } else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE 407 bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) { 408 return "UTF-16"; // in all Javas 409 } else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) { 410 input.read(bom, 0, 3); // consume the UTF-8 BOM 411 return "UTF-8"; // in all Javas 412 } 413 return null; 414 } 415}