001package org.jsoup.helper;
002
003import org.jsoup.Connection;
004import org.jsoup.internal.ControllableInputStream;
005import org.jsoup.internal.Normalizer;
006import org.jsoup.internal.SimpleStreamReader;
007import org.jsoup.internal.StringUtil;
008import org.jsoup.nodes.Comment;
009import org.jsoup.nodes.Document;
010import org.jsoup.nodes.Element;
011import org.jsoup.nodes.Node;
012import org.jsoup.nodes.XmlDeclaration;
013import org.jsoup.parser.Parser;
014import org.jsoup.parser.StreamParser;
015import org.jsoup.select.Elements;
016import org.jsoup.select.Evaluator;
017import org.jsoup.select.Selector;
018import org.jspecify.annotations.Nullable;
019
020import java.io.File;
021import java.io.IOException;
022import java.io.InputStream;
023import java.io.OutputStream;
024import java.io.Reader;
025import java.io.UncheckedIOException;
026import java.nio.ByteBuffer;
027import java.nio.channels.Channels;
028import java.nio.channels.SeekableByteChannel;
029import java.nio.charset.Charset;
030import java.nio.charset.IllegalCharsetNameException;
031import java.nio.file.Files;
032import java.nio.file.Path;
033import java.util.Locale;
034import java.util.Random;
035import java.util.regex.Matcher;
036import java.util.regex.Pattern;
037import java.util.zip.GZIPInputStream;
038
039import static org.jsoup.internal.SharedConstants.DefaultBufferSize;
040
041/**
042 * Internal static utilities for handling data.
043 *
044 */
045@SuppressWarnings("CharsetObjectCanBeUsed")
046public final class DataUtil {
    /** Matches a {@code charset=} parameter (case-insensitive, optionally quoted); group 1 captures the charset name. */
    private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*(?:[\"'])?([^\\s,;\"']*)");
    /** The UTF-8 charset, used as the default throughout this class. */
    public static final Charset UTF_8 = Charset.forName("UTF-8"); // Don't use StandardCharsets, as those only appear in Android API 19, and we target 10.
    static final String defaultCharsetName = UTF_8.name(); // used if not found in header or meta charset
    /** Cap, in bytes, applied to the input while {@code detectCharset} sniffs the content for a declared charset. */
    private static final int firstReadBufferSize = 1024 * 5;
    /** Alphabet that {@code mimeBoundary()} picks its characters from. */
    private static final char[] mimeBoundaryChars =
            "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray();
    /** Number of characters in a boundary produced by {@code mimeBoundary()}. */
    static final int boundaryLength = 32;

    /** Not instantiable: this class only exposes static members. */
    private DataUtil() {}
056
057    /**
058     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
059     * are supported in addition to uncompressed files.
060     *
061     * @param file file to load
062     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
063     *     the file will always override this setting.
064     * @param baseUri base URI of document, to resolve relative links against
065     * @return Document
066     * @throws IOException on IO error
067     */
068    public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException {
069        return load(file.toPath(), charsetName, baseUri);
070    }
071
072    /**
073     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
074     * are supported in addition to uncompressed files.
075     *
076     * @param file file to load
077     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
078     *     the file will always override this setting.
079     * @param baseUri base URI of document, to resolve relative links against
080     * @param parser alternate {@link Parser#xmlParser() parser} to use.
081
082     * @return Document
083     * @throws IOException on IO error
084     * @since 1.14.2
085     */
086    public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
087        return load(file.toPath(), charsetName, baseUri, parser);
088    }
089
090    /**
091     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
092     * are supported in addition to uncompressed files.
093     *
094     * @param path file to load
095     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
096     *     the file will always override this setting.
097     * @param baseUri base URI of document, to resolve relative links against
098     * @return Document
099     * @throws IOException on IO error
100     */
101    public static Document load(Path path, @Nullable String charsetName, String baseUri) throws IOException {
102        return load(path, charsetName, baseUri, Parser.htmlParser());
103    }
104
    /**
     * Loads and parses a file to a Document, using the supplied parser. Files that are compressed with gzip (and end
     * in {@code .gz} or {@code .z}) are supported in addition to uncompressed files.
     *
     * @param path file to load
     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
     *     the file will always override this setting.
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser alternate {@link Parser#xmlParser() parser} to use.
     * @return Document
     * @throws IOException on IO error
     * @since 1.17.2
     */
    public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        // opens the (possibly gzipped) file and hands the stream to the parse, which closes it
        return parseInputStream(openStream(path), charsetName, baseUri, parser);
    }
122
123    /**
124     * Returns a {@link StreamParser} that will parse the supplied file progressively.
125     * Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
126     * are supported in addition to uncompressed files.
127     *
128     * @param path file to load
129     * @param charset (optional) character set of input; specify {@code null} to attempt to autodetect from metadata.
130     * A BOM in the file will always override this setting.
131     * @param baseUri base URI of document, to resolve relative links against
132     * @param parser underlying HTML or XML parser to use.
133
134     * @return Document
135     * @throws IOException on IO error
136     * @since 1.18.2
137     * @see Connection.Response#streamParser()
138     */
139    public static StreamParser streamParser(Path path, @Nullable Charset charset, String baseUri, Parser parser) throws IOException {
140        StreamParser streamer = new StreamParser(parser);
141        String charsetName = charset != null? charset.name() : null;
142        try {
143            DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(openStream(path), charsetName, baseUri, parser);
144            Reader reader = new SimpleStreamReader(charsetDoc.input, charsetDoc.charset);
145            streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
146        } catch (IOException e) {
147            streamer.close();
148            throw e;
149        }
150        return streamer;
151    }
152
153    /** Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. */
154    private static ControllableInputStream openStream(Path path) throws IOException {
155        final SeekableByteChannel byteChannel = Files.newByteChannel(path);
156        InputStream stream = Channels.newInputStream(byteChannel);
157        String name = Normalizer.lowerCase(path.getFileName().toString());
158        if (name.endsWith(".gz") || name.endsWith(".z")) {
159            try {
160                final boolean zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes
161                byteChannel.position(0); // reset to start of file
162                if (zipped) stream = new GZIPInputStream(stream);
163            } catch (IOException e) {
164                stream.close(); // error during our first read; close the stream and cascade close byteChannel
165                throw e;
166            }
167        }
168        return ControllableInputStream.wrap(stream, 0);
169    }
170
171    /**
172     * Parses a Document from an input steam.
173     * @param in input stream to parse. The stream will be closed after reading.
174     * @param charsetName character set of input (optional)
175     * @param baseUri base URI of document, to resolve relative links against
176     * @return Document
177     * @throws IOException on IO error
178     */
179    public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException {
180        return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, Parser.htmlParser());
181    }
182
183    /**
184     * Parses a Document from an input steam, using the provided Parser.
185     * @param in input stream to parse. The stream will be closed after reading.
186     * @param charsetName character set of input (optional)
187     * @param baseUri base URI of document, to resolve relative links against
188     * @param parser alternate {@link Parser#xmlParser() parser} to use.
189     * @return Document
190     * @throws IOException on IO error
191     */
192    public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
193        return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, parser);
194    }
195
196    /**
197     * Writes the input stream to the output stream. Doesn't close them.
198     * @param in input stream to read from
199     * @param out output stream to write to
200     * @throws IOException on IO error
201     */
202    static void crossStreams(final InputStream in, final OutputStream out) throws IOException {
203        final byte[] buffer = new byte[DefaultBufferSize];
204        int len;
205        while ((len = in.read(buffer)) != -1) {
206            out.write(buffer, 0, len);
207        }
208    }
209
210    /** A struct to return a detected charset, and a document (if fully read). */
211    static class CharsetDoc {
212        Charset charset;
213        InputStream input;
214        @Nullable Document doc;
215
216        CharsetDoc(Charset charset, @Nullable Document doc, InputStream input) {
217            this.charset = charset;
218            this.input = input;
219            this.doc = doc;
220        }
221    }
222
223    static Document parseInputStream(@Nullable ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
224        if (input == null) return new Document(baseUri); // empty body
225
226        final Document doc;
227        CharsetDoc charsetDoc = null;
228        try {
229            charsetDoc = detectCharset(input, charsetName, baseUri, parser);
230            doc = parseInputStream(charsetDoc, baseUri, parser);
231        } finally {
232            if (charsetDoc != null)
233                charsetDoc.input.close();
234        }
235        return doc;
236    }
237
    /** Selects the meta elements that may declare a charset: {@code http-equiv=content-type} or a {@code charset} attribute. */
    private static final Evaluator metaCharset = Selector.evaluatorOf("meta[http-equiv=content-type], meta[charset]");
239
    /**
     * Works out which charset the input should be decoded with. Precedence: a BOM at the start of the stream, then
     * the supplied {@code charsetName}, then a charset declared in the content itself (a {@code meta} element or an
     * XML declaration, found by parsing the capped start of the input as UTF-8), and finally UTF-8.
     *
     * @param input the stream to inspect; it is marked, read, and reset here, and is handed back in the result
     * @param charsetName charset supplied by the caller or content type header, or {@code null} to sniff the content
     * @param baseUri base URI used for the sniffing parse
     * @param parser parser used for the sniffing parse
     * @return the charset to use and the input, plus the already parsed Document when the sniffing parse read the
     *     whole input and no different charset was declared (otherwise the doc is {@code null})
     * @throws IOException on IO error, including the cause of an UncheckedIOException raised by the sniffing parse
     */
    static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        Document doc = null;
        // read the start of the stream and look for a BOM or meta charset:
        // look for BOM - overrides any other header or input
        String bomCharset = detectCharsetFromBom(input); // resets / consumes appropriately
        if (bomCharset != null)
            charsetName = bomCharset;

        if (charsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8
            int origMax = input.max();
            input.max(firstReadBufferSize);
            input.resetFullyRead(); // clear any pre-read (e.g., BOM) state before capped sniff
            input.mark(firstReadBufferSize);
            input.allowClose(false); // ignores closes during parse, in case we need to rewind
            try (Reader reader = new SimpleStreamReader(input, UTF_8)) { // input is currently capped to firstReadBufferSize
                doc = parser.parseInput(reader, baseUri);
                input.reset();
                input.max(origMax); // reset for a full read if required
            } catch (UncheckedIOException e) {
                throw e.getCause();
            } finally {
                input.allowClose(true);
            }

            // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
            Elements metaElements = doc.select(metaCharset);
            String foundCharset = null; // if not found, will keep utf-8 as best attempt
            for (Element meta : metaElements) {
                if (meta.hasAttr("http-equiv"))
                    foundCharset = getCharsetFromContentType(meta.attr("content"));
                if (foundCharset == null && meta.hasAttr("charset"))
                    foundCharset = meta.attr("charset");
                if (foundCharset != null) // the first meta element that yields a charset wins
                    break;
            }

            // look for <?xml encoding='ISO-8859-1'?>
            if (foundCharset == null && doc.childNodeSize() > 0) {
                Node first = doc.childNode(0);
                XmlDeclaration decl = null;
                if (first instanceof XmlDeclaration)
                    decl = (XmlDeclaration) first;
                else if (first instanceof Comment) {
                    Comment comment = (Comment) first;
                    if (comment.isXmlDeclaration())
                        decl = comment.asXmlDeclaration();
                }
                if (decl != null && decl.name().equalsIgnoreCase("xml")) {
                    foundCharset = decl.attr("encoding");
                }
            }
            foundCharset = validateCharset(foundCharset); // null when nothing was found or the name is not supported
            if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case-insensitive check here to match how validate works)
                foundCharset = foundCharset.trim().replaceAll("[\"']", "");
                charsetName = foundCharset;
                doc = null;
            } else if (input.baseReadFully()) { // if we have read fully, and the charset was correct, keep that current parse
                input.close(); // the parser tried to close it
            } else {
                // the sniffing parse did not cover the whole input, so drop it; the caller parses the input in full
                doc = null;
            }
        } else { // specified by content type header (or by user on file load)
            Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
        }

        // finally: prepare the return struct
        if (charsetName == null)
            charsetName = defaultCharsetName;
        // a caller-supplied name is not validated above, so Charset.forName may reject it here
        Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName);
        return new CharsetDoc(charset, doc, input);
    }
311
312    static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException {
313        // if doc != null it was fully parsed during charset detection; so just return that
314        if (charsetDoc.doc != null)
315            return charsetDoc.doc;
316
317        final InputStream input = charsetDoc.input;
318        Validate.notNull(input);
319        final Document doc;
320        final Charset charset = charsetDoc.charset;
321        try (Reader reader = new SimpleStreamReader(input, charset)) {
322            try {
323                doc = parser.parseInput(reader, baseUri);
324            } catch (UncheckedIOException e) {
325                // io exception when parsing (not seen before because reading the stream as we go)
326                throw e.getCause();
327            }
328            doc.outputSettings().charset(charset);
329            if (!charset.canEncode()) {
330                // some charsets can read but not encode; switch to an encodable charset and update the meta el
331                doc.charset(UTF_8);
332            }
333        }
334        return doc;
335    }
336
337    /**
338     * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this
339     * method is executing on. The data read until being interrupted will be available.
340     * @param inStream the input stream to read from
341     * @param maxSize the maximum size in bytes to read from the stream. Set to 0 to be unlimited.
342     * @return the filled byte buffer
343     * @throws IOException if an exception occurs whilst reading from the input stream.
344     */
345    public static ByteBuffer readToByteBuffer(InputStream inStream, int maxSize) throws IOException {
346        return ControllableInputStream.readToByteBuffer(inStream, maxSize);
347    }
348
349    static ByteBuffer emptyByteBuffer() {
350        return ByteBuffer.allocate(0);
351    }
352
353    /**
354     * Parse out a charset from a content type header. If the charset is not supported, returns null (so the default
355     * will kick in.)
356     * @param contentType e.g. "text/html; charset=EUC-JP"
357     * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased.
358     */
359    static @Nullable String getCharsetFromContentType(@Nullable String contentType) {
360        if (contentType == null) return null;
361        Matcher m = charsetPattern.matcher(contentType);
362        if (m.find()) {
363            String charset = m.group(1).trim();
364            charset = charset.replace("charset=", "");
365            return validateCharset(charset);
366        }
367        return null;
368    }
369
370    private @Nullable static String validateCharset(@Nullable String cs) {
371        if (cs == null || cs.length() == 0) return null;
372        cs = cs.trim().replaceAll("[\"']", "");
373        try {
374            if (Charset.isSupported(cs)) return cs;
375            cs = cs.toUpperCase(Locale.ENGLISH);
376            if (Charset.isSupported(cs)) return cs;
377        } catch (IllegalCharsetNameException e) {
378            // if all this charset matching fails.... we just take the default
379        }
380        return null;
381    }
382
383    /**
384     * Creates a random string, suitable for use as a mime boundary
385     */
386    static String mimeBoundary() {
387        final StringBuilder mime = StringUtil.borrowBuilder();
388        final Random rand = new Random();
389        for (int i = 0; i < boundaryLength; i++) {
390            mime.append(mimeBoundaryChars[rand.nextInt(mimeBoundaryChars.length)]);
391        }
392        return StringUtil.releaseBuilder(mime);
393    }
394
395    private static @Nullable String detectCharsetFromBom(ControllableInputStream input) throws IOException {
396        byte[] bom = new byte[4];
397        input.mark(bom.length);
398        //noinspection ResultOfMethodCallIgnored
399        input.read(bom, 0, 4);
400        input.reset();
401
402        // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
403        if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE
404            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE
405            return "UTF-32"; // and I hope it's on your system
406        } else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE
407            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
408            return "UTF-16"; // in all Javas
409        } else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
410            input.read(bom, 0, 3); // consume the UTF-8 BOM
411            return "UTF-8"; // in all Javas
412        }
413        return null;
414    }
415}