001package org.jsoup.helper;
002
003import org.jsoup.Connection;
004import org.jsoup.internal.ControllableInputStream;
005import org.jsoup.internal.Normalizer;
006import org.jsoup.internal.SimpleStreamReader;
007import org.jsoup.internal.StringUtil;
008import org.jsoup.nodes.Comment;
009import org.jsoup.nodes.Document;
010import org.jsoup.nodes.Element;
011import org.jsoup.nodes.Node;
012import org.jsoup.nodes.XmlDeclaration;
013import org.jsoup.parser.Parser;
014import org.jsoup.parser.StreamParser;
015import org.jsoup.select.Elements;
016import org.jsoup.select.Evaluator;
017import org.jsoup.select.Selector;
018import org.jspecify.annotations.Nullable;
019
020import java.io.File;
021import java.io.IOException;
022import java.io.InputStream;
023import java.io.OutputStream;
024import java.io.Reader;
025import java.io.UncheckedIOException;
026import java.nio.ByteBuffer;
027import java.nio.channels.Channels;
028import java.nio.channels.SeekableByteChannel;
029import java.nio.charset.Charset;
030import java.nio.charset.IllegalCharsetNameException;
031import java.nio.file.Files;
032import java.nio.file.Path;
033import java.util.Locale;
034import java.util.Random;
035import java.util.regex.Matcher;
036import java.util.regex.Pattern;
037import java.util.zip.GZIPInputStream;
038
039import static org.jsoup.internal.SharedConstants.DefaultBufferSize;
040
041/**
042 * Internal static utilities for handling data.
043 *
044 */
045@SuppressWarnings("CharsetObjectCanBeUsed")
046public final class DataUtil {
047    private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*(?:[\"'])?([^\\s,;\"']*)");
048    public static final Charset UTF_8 = Charset.forName("UTF-8"); // Don't use StandardCharsets, as those only appear in Android API 19, and we target 10.
049    static final String defaultCharsetName = UTF_8.name(); // used if not found in header or meta charset
050    private static final int firstReadBufferSize = 1024 * 5;
051    private static final char[] mimeBoundaryChars =
052            "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray();
053    static final int boundaryLength = 32;
054
055    private DataUtil() {}
056
057    /**
058     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
059     * are supported in addition to uncompressed files.
060     *
061     * @param file file to load
062     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
063     *     the file will always override this setting.
064     * @param baseUri base URI of document, to resolve relative links against
065     * @return Document
066     * @throws IOException on IO error
067     */
068    public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException {
069        return load(file.toPath(), charsetName, baseUri);
070    }
071
072    /**
073     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
074     * are supported in addition to uncompressed files.
075     *
076     * @param file file to load
077     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
078     *     the file will always override this setting.
079     * @param baseUri base URI of document, to resolve relative links against
080     * @param parser alternate {@link Parser#xmlParser() parser} to use.
081
082     * @return Document
083     * @throws IOException on IO error
084     * @since 1.14.2
085     */
086    public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
087        return load(file.toPath(), charsetName, baseUri, parser);
088    }
089
090    /**
091     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
092     * are supported in addition to uncompressed files.
093     *
094     * @param path file to load
095     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
096     *     the file will always override this setting.
097     * @param baseUri base URI of document, to resolve relative links against
098     * @return Document
099     * @throws IOException on IO error
100     */
101    public static Document load(Path path, @Nullable String charsetName, String baseUri) throws IOException {
102        return load(path, charsetName, baseUri, Parser.htmlParser());
103    }
104
105    /**
106     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
107     * are supported in addition to uncompressed files.
108     *
109     * @param path file to load
110     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
111     * the file will always override this setting.
112     * @param baseUri base URI of document, to resolve relative links against
113     * @param parser alternate {@link Parser#xmlParser() parser} to use.
114
115     * @return Document
116     * @throws IOException on IO error
117     * @since 1.17.2
118     */
119    public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
120        return parseInputStream(openStream(path), charsetName, baseUri, parser);
121    }
122
123    /**
124     * Returns a {@link StreamParser} that will parse the supplied file progressively.
125     * Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
126     * are supported in addition to uncompressed files.
127     *
128     * @param path file to load
129     * @param charset (optional) character set of input; specify {@code null} to attempt to autodetect from metadata.
130     * A BOM in the file will always override this setting.
131     * @param baseUri base URI of document, to resolve relative links against
132     * @param parser underlying HTML or XML parser to use.
133
134     * @return Document
135     * @throws IOException on IO error
136     * @since 1.18.2
137     * @see Connection.Response#streamParser()
138     */
139    public static StreamParser streamParser(Path path, @Nullable Charset charset, String baseUri, Parser parser) throws IOException {
140        StreamParser streamer = new StreamParser(parser);
141        String charsetName = charset != null? charset.name() : null;
142        try {
143            DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharsetForStreamParser(openStream(path), charsetName, baseUri, parser);
144            Reader reader = new SimpleStreamReader(charsetDoc.input, charsetDoc.charset);
145            streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
146        } catch (IOException e) {
147            streamer.close();
148            throw e;
149        }
150        return streamer;
151    }
152
153    /** Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. */
154    private static ControllableInputStream openStream(Path path) throws IOException {
155        final SeekableByteChannel byteChannel = Files.newByteChannel(path);
156        InputStream stream = Channels.newInputStream(byteChannel);
157        String name = Normalizer.lowerCase(path.getFileName().toString());
158        if (name.endsWith(".gz") || name.endsWith(".z")) {
159            try {
160                final boolean zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes
161                byteChannel.position(0); // reset to start of file
162                if (zipped) stream = new GZIPInputStream(stream);
163            } catch (IOException e) {
164                stream.close(); // error during our first read; close the stream and cascade close byteChannel
165                throw e;
166            }
167        }
168        return ControllableInputStream.wrap(stream, 0);
169    }
170
171    /**
172     * Parses a Document from an input steam.
173     * @param in input stream to parse. The stream will be closed after reading.
174     * @param charsetName character set of input (optional)
175     * @param baseUri base URI of document, to resolve relative links against
176     * @return Document
177     * @throws IOException on IO error
178     */
179    public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException {
180        return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, Parser.htmlParser());
181    }
182
183    /**
184     * Parses a Document from an input steam, using the provided Parser.
185     * @param in input stream to parse. The stream will be closed after reading.
186     * @param charsetName character set of input (optional)
187     * @param baseUri base URI of document, to resolve relative links against
188     * @param parser alternate {@link Parser#xmlParser() parser} to use.
189     * @return Document
190     * @throws IOException on IO error
191     */
192    public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
193        return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, parser);
194    }
195
196    /**
197     * Writes the input stream to the output stream. Doesn't close them.
198     * @param in input stream to read from
199     * @param out output stream to write to
200     * @throws IOException on IO error
201     */
202    static void crossStreams(final InputStream in, final OutputStream out) throws IOException {
203        final byte[] buffer = new byte[DefaultBufferSize];
204        int len;
205        while ((len = in.read(buffer)) != -1) {
206            out.write(buffer, 0, len);
207        }
208    }
209
210    /** A struct to return a detected charset, and a document (if fully read). */
211    static class CharsetDoc {
212        Charset charset;
213        InputStream input;
214        @Nullable Document doc;
215
216        CharsetDoc(Charset charset, @Nullable Document doc, InputStream input) {
217            this.charset = charset;
218            this.input = input;
219            this.doc = doc;
220        }
221    }
222
223    static Document parseInputStream(@Nullable ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
224        if (input == null) return new Document(baseUri); // empty body
225
226        final Document doc;
227        CharsetDoc charsetDoc = null;
228        try {
229            charsetDoc = detectCharset(input, charsetName, baseUri, parser);
230            doc = parseInputStream(charsetDoc, baseUri, parser);
231        } finally {
232            if (charsetDoc != null)
233                charsetDoc.input.close();
234        }
235        return doc;
236    }
237
238    private static final Evaluator metaCharset = Selector.evaluatorOf("meta[http-equiv=content-type], meta[charset]");
239
240    /** Detects charset for a regular parse, and may reuse a fully sniffed document. */
241    static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
242        return detectCharset(input, charsetName, baseUri, parser, true);
243    }
244
245    /** Detects charset for a stream parse, and leaves the input readable for subsequent parsing. */
246    static CharsetDoc detectCharsetForStreamParser(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
247        return detectCharset(input, charsetName, baseUri, parser, false);
248    }
249
250    /** Shared charset detection worker; regular parse can reuse a fully sniffed doc, stream parse cannot. */
251    private static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser, boolean reuseDocIfFullyRead) throws IOException {
252        Document doc = null;
253        // read the start of the stream and look for a BOM or meta charset:
254        // look for BOM - overrides any other header or input
255        String bomCharset = detectCharsetFromBom(input); // resets / consumes appropriately
256        if (bomCharset != null)
257            charsetName = bomCharset;
258
259        if (charsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8
260            int origMax = input.max();
261            input.max(firstReadBufferSize);
262            input.resetFullyRead(); // clear any pre-read (e.g., BOM) state before capped sniff
263            input.mark(firstReadBufferSize);
264            input.allowClose(false); // ignores closes during parse, in case we need to rewind
265            try (Reader reader = new SimpleStreamReader(input, UTF_8)) { // input is currently capped to firstReadBufferSize
266                doc = parser.parseInput(reader, baseUri);
267                input.reset();
268                input.max(origMax); // reset for a full read if required
269            } catch (UncheckedIOException e) {
270                throw e.getCause();
271            } finally {
272                input.allowClose(true);
273            }
274
275            // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
276            Elements metaElements = doc.select(metaCharset);
277            String foundCharset = null; // if not found, will keep utf-8 as best attempt
278            for (Element meta : metaElements) {
279                if (meta.hasAttr("http-equiv"))
280                    foundCharset = getCharsetFromContentType(meta.attr("content"));
281                if (foundCharset == null && meta.hasAttr("charset"))
282                    foundCharset = meta.attr("charset");
283                if (foundCharset != null)
284                    break;
285            }
286
287            // look for <?xml encoding='ISO-8859-1'?>
288            if (foundCharset == null && doc.childNodeSize() > 0) {
289                Node first = doc.childNode(0);
290                XmlDeclaration decl = null;
291                if (first instanceof XmlDeclaration)
292                    decl = (XmlDeclaration) first;
293                else if (first instanceof Comment) {
294                    Comment comment = (Comment) first;
295                    if (comment.isXmlDeclaration())
296                        decl = comment.asXmlDeclaration();
297                }
298                if (decl != null && decl.name().equalsIgnoreCase("xml")) {
299                    foundCharset = decl.attr("encoding");
300                }
301            }
302            foundCharset = validateCharset(foundCharset);
303            if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case-insensitive check here to match how validate works)
304                foundCharset = foundCharset.trim().replaceAll("[\"']", "");
305                charsetName = foundCharset;
306                doc = null;
307            } else if (reuseDocIfFullyRead && input.baseReadFully()) { // keep the current parse if the caller can use a fully read doc
308                input.close(); // the parser tried to close it
309            } else {
310                doc = null;
311            }
312        } else { // specified by content type header (or by user on file load)
313            Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
314        }
315
316        // finally: prepare the return struct
317        if (charsetName == null)
318            charsetName = defaultCharsetName;
319        Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName);
320        return new CharsetDoc(charset, doc, input);
321    }
322
323    static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException {
324        // if doc != null it was fully parsed during charset detection; so just return that
325        if (charsetDoc.doc != null)
326            return charsetDoc.doc;
327
328        final InputStream input = charsetDoc.input;
329        Validate.notNull(input);
330        final Document doc;
331        final Charset charset = charsetDoc.charset;
332        try (Reader reader = new SimpleStreamReader(input, charset)) {
333            try {
334                doc = parser.parseInput(reader, baseUri);
335            } catch (UncheckedIOException e) {
336                // io exception when parsing (not seen before because reading the stream as we go)
337                throw e.getCause();
338            }
339            doc.outputSettings().charset(charset);
340            if (!charset.canEncode()) {
341                // some charsets can read but not encode; switch to an encodable charset and update the meta el
342                doc.charset(UTF_8);
343            }
344        }
345        return doc;
346    }
347
348    /**
349     * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this
350     * method is executing on. The data read until being interrupted will be available.
351     * @param inStream the input stream to read from
352     * @param maxSize the maximum size in bytes to read from the stream. Set to 0 to be unlimited.
353     * @return the filled byte buffer
354     * @throws IOException if an exception occurs whilst reading from the input stream.
355     */
356    public static ByteBuffer readToByteBuffer(InputStream inStream, int maxSize) throws IOException {
357        return ControllableInputStream.readToByteBuffer(inStream, maxSize);
358    }
359
360    static ByteBuffer emptyByteBuffer() {
361        return ByteBuffer.allocate(0);
362    }
363
364    /**
365     * Parse out a charset from a content type header. If the charset is not supported, returns null (so the default
366     * will kick in.)
367     * @param contentType e.g. "text/html; charset=EUC-JP"
368     * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased.
369     */
370    static @Nullable String getCharsetFromContentType(@Nullable String contentType) {
371        if (contentType == null) return null;
372        Matcher m = charsetPattern.matcher(contentType);
373        if (m.find()) {
374            String charset = m.group(1).trim();
375            charset = charset.replace("charset=", "");
376            return validateCharset(charset);
377        }
378        return null;
379    }
380
381    private @Nullable static String validateCharset(@Nullable String cs) {
382        if (cs == null || cs.length() == 0) return null;
383        cs = cs.trim().replaceAll("[\"']", "");
384        try {
385            if (Charset.isSupported(cs)) return cs;
386            cs = cs.toUpperCase(Locale.ENGLISH);
387            if (Charset.isSupported(cs)) return cs;
388        } catch (IllegalCharsetNameException e) {
389            // if all this charset matching fails.... we just take the default
390        }
391        return null;
392    }
393
394    /**
395     * Creates a random string, suitable for use as a mime boundary
396     */
397    static String mimeBoundary() {
398        final StringBuilder mime = StringUtil.borrowBuilder();
399        final Random rand = new Random();
400        for (int i = 0; i < boundaryLength; i++) {
401            mime.append(mimeBoundaryChars[rand.nextInt(mimeBoundaryChars.length)]);
402        }
403        return StringUtil.releaseBuilder(mime);
404    }
405
406    private static @Nullable String detectCharsetFromBom(ControllableInputStream input) throws IOException {
407        byte[] bom = new byte[4];
408        input.mark(bom.length);
409        //noinspection ResultOfMethodCallIgnored
410        input.read(bom, 0, 4);
411        input.reset();
412
413        // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
414        if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE
415            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE
416            return "UTF-32"; // and I hope it's on your system
417        } else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE
418            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
419            return "UTF-16"; // in all Javas
420        } else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
421            input.read(bom, 0, 3); // consume the UTF-8 BOM
422            return "UTF-8"; // in all Javas
423        }
424        return null;
425    }
426}