001package org.jsoup.nodes;
002
003import org.jsoup.helper.DataUtil;
004import org.jsoup.internal.QuietAppendable;
005import org.jsoup.internal.StringUtil;
006import org.jsoup.helper.Validate;
007import org.jsoup.nodes.Document.OutputSettings;
008import org.jsoup.parser.CharacterReader;
009import org.jsoup.parser.Parser;
010
011import java.nio.charset.Charset;
012import java.nio.charset.CharsetEncoder;
013import java.util.ArrayList;
014import java.util.Arrays;
015import java.util.Collections;
016import java.util.HashMap;
017
018import static org.jsoup.nodes.Entities.EscapeMode.base;
019import static org.jsoup.nodes.Entities.EscapeMode.extended;
020
021/**
022 * HTML entities, and escape routines. Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C
023 * HTML named character references</a>.
024 */
025public class Entities {
026    // constants for escape options:
027    static final int ForText = 0x1;
028    static final int ForAttribute = 0x2;
029    static final int Normalise = 0x4;
030    static final int TrimLeading = 0x8;
031    static final int TrimTrailing = 0x10;
032
033    private static final int empty = -1;
034    private static final String emptyName = "";
035    static final int codepointRadix = 36;
036    private static final char[] codeDelims = {',', ';'};
037    private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references
038
039    private static final int BaseCount = 106;
040    private static final ArrayList<String> baseSorted = new ArrayList<>(BaseCount); // names sorted longest first, for prefix matching
041
042    public enum EscapeMode {
043        /**
044         * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only.
045         */
046        xhtml(EntitiesData.xmlPoints, 4),
047        /**
048         * Default HTML output entities.
049         */
050        base(EntitiesData.basePoints, 106),
051        /**
052         * Complete HTML entities.
053         */
054        extended(EntitiesData.fullPoints, 2125);
055
056        static {
057            // sort the base names by length, for prefix matching
058            Collections.addAll(baseSorted, base.nameKeys);
059            baseSorted.sort((a, b) -> b.length() - a.length());
060        }
061
062        // table of named references to their codepoints. sorted so we can binary search. built by BuildEntities.
063        private String[] nameKeys;
064        private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints.
065
066        // table of codepoints to named entities.
067        private int[] codeKeys; // we don't support multicodepoints to single named value currently
068        private String[] nameVals;
069
070        EscapeMode(String file, int size) {
071            load(this, file, size);
072        }
073
074        int codepointForName(final String name) {
075            int index = Arrays.binarySearch(nameKeys, name);
076            return index >= 0 ? codeVals[index] : empty;
077        }
078
079        String nameForCodepoint(final int codepoint) {
080            final int index = Arrays.binarySearch(codeKeys, codepoint);
081            if (index >= 0) {
082                // the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower
083                // (and binary search for same item with multi results is undefined
084                return (index < nameVals.length - 1 && codeKeys[index + 1] == codepoint) ?
085                    nameVals[index + 1] : nameVals[index];
086            }
087            return emptyName;
088        }
089    }
090
091    private Entities() {
092    }
093
094    /**
095     * Check if the input is a known named entity
096     *
097     * @param name the possible entity name (e.g. "lt" or "amp")
098     * @return true if a known named entity
099     */
100    public static boolean isNamedEntity(final String name) {
101        return extended.codepointForName(name) != empty;
102    }
103
104    /**
105     * Check if the input is a known named entity in the base entity set.
106     *
107     * @param name the possible entity name (e.g. "lt" or "amp")
108     * @return true if a known named entity in the base set
109     * @see #isNamedEntity(String)
110     */
111    public static boolean isBaseNamedEntity(final String name) {
112        return base.codepointForName(name) != empty;
113    }
114
115    /**
116     * Get the character(s) represented by the named entity
117     *
118     * @param name entity (e.g. "lt" or "amp")
119     * @return the string value of the character(s) represented by this entity, or "" if not defined
120     */
121    public static String getByName(String name) {
122        String val = multipoints.get(name);
123        if (val != null)
124            return val;
125        int codepoint = extended.codepointForName(name);
126        if (codepoint != empty)
127            return new String(new int[]{codepoint}, 0, 1);
128        return emptyName;
129    }
130
131    public static int codepointsForName(final String name, final int[] codepoints) {
132        String val = multipoints.get(name);
133        if (val != null) {
134            codepoints[0] = val.codePointAt(0);
135            codepoints[1] = val.codePointAt(1);
136            return 2;
137        }
138        int codepoint = extended.codepointForName(name);
139        if (codepoint != empty) {
140            codepoints[0] = codepoint;
141            return 1;
142        }
143        return 0;
144    }
145
146    /**
147     Finds the longest base named entity that is a prefix of the input. That is, input "notit" would return "not".
148
149     @return longest entity name that is a prefix of the input, or "" if no entity matches
150     */
151    public static String findPrefix(String input) {
152        for (String name : baseSorted) {
153            if (input.startsWith(name)) return name;
154        }
155        return emptyName;
156        // if perf critical, could look at using a Trie vs a scan
157    }
158
159    /**
160     HTML escape an input string. That is, {@code <} is returned as {@code &lt;}. The escaped string is suitable for use
161     both in attributes and in text data.
162     @param data the un-escaped string to escape
163     @param out the output settings to use. This configures the character set escaped against (that is, if a
164     character is supported in the output character set, it doesn't have to be escaped), and also HTML or XML
165     settings.
166     @return the escaped string
167     */
168    public static String escape(String data, OutputSettings out) {
169        return escapeString(data, out.escapeMode(), out.charset());
170    }
171
172    /**
173     HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is
174     returned as {@code &lt;}. The escaped string is suitable for use both in attributes and in text data.
175     @param data the un-escaped string to escape
176     @return the escaped string
177     @see #escape(String, OutputSettings)
178     */
179    public static String escape(String data) {
180        return escapeString(data, base, DataUtil.UTF_8);
181    }
182
183    private static String escapeString(String data, EscapeMode escapeMode, Charset charset) {
184        if (data == null) return "";
185        StringBuilder sb = StringUtil.borrowBuilder();
186        doEscape(data, QuietAppendable.wrap(sb), escapeMode, charset, ForText | ForAttribute);
187        return StringUtil.releaseBuilder(sb);
188    }
189
190    static void escape(QuietAppendable accum, String data, OutputSettings out, int options) {
191        doEscape(data, accum, out.escapeMode(), out.charset(), options);
192    }
193
194    private static void doEscape(String data, QuietAppendable accum, EscapeMode mode, Charset charset, int options) {
195        final CoreCharset coreCharset = CoreCharset.byName(charset.name());
196        final CharsetEncoder fallback = encoderFor(charset);
197        final int length = data.length();
198
199        int codePoint;
200        boolean lastWasWhite = false;
201        boolean reachedNonWhite = false;
202        boolean skipped = false;
203        for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
204            codePoint = data.codePointAt(offset);
205
206            if ((options & Normalise) != 0) {
207                if (StringUtil.isWhitespace(codePoint)) {
208                    if ((options & TrimLeading) != 0 && !reachedNonWhite) continue;
209                    if (lastWasWhite) continue;
210                    if ((options & TrimTrailing) != 0) {
211                        skipped = true;
212                        continue;
213                    }
214                    accum.append(' ');
215                    lastWasWhite = true;
216                    continue;
217                } else {
218                    lastWasWhite = false;
219                    reachedNonWhite = true;
220                    if (skipped) {
221                        accum.append(' '); // wasn't the end, so need to place a normalized space
222                        skipped = false;
223                    }
224                }
225            }
226            appendEscaped(codePoint, accum, options, mode, coreCharset, fallback);
227        }
228    }
229
230    private static void appendEscaped(int codePoint, QuietAppendable accum, int options, EscapeMode escapeMode,
231        CoreCharset coreCharset, CharsetEncoder fallback) {
232        // specific character range for xml 1.0; drop (not encode) if so
233        if (EscapeMode.xhtml == escapeMode && !isValidXmlChar(codePoint)) {
234            return;
235        }
236
237        // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
238        final char c = (char) codePoint;
239        if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
240            // html specific and required escapes:
241            switch (c) {
242                case '&':
243                    accum.append("&amp;");
244                    break;
245                case 0xA0:
246                    appendNbsp(accum, escapeMode);
247                    break;
248                case '<':
249                    accum.append("&lt;");
250                    break;
251                case '>':
252                    accum.append("&gt;");
253                    break;
254                case '"':
255                    if ((options & ForAttribute) != 0) accum.append("&quot;");
256                    else accum.append(c);
257                    break;
258                case '\'':
259                    // special case for the Entities.escape(string) method when we are maximally escaping. Otherwise, because we output attributes in "", there's no need to escape.
260                    appendApos(accum, options, escapeMode);
261                    break;
262                // we escape ascii control <x20 (other than tab, line-feed, carriage return) for XML compliance (required) and HTML ease of reading (not required) - https://www.w3.org/TR/xml/#charsets
263                case 0x9:
264                case 0xA:
265                case 0xD:
266                    accum.append(c);
267                    break;
268                default:
269                    if (c < 0x20 || !canEncode(coreCharset, c, fallback)) appendEncoded(accum, escapeMode, codePoint);
270                    else accum.append(c);
271            }
272        } else {
273            if (canEncode(coreCharset, c, fallback)) {
274                // reads into charBuf - we go through these steps to avoid GC objects as much as possible (would be a new String and a new char[2] for each character)
275                char[] chars = charBuf.get();
276                int len = Character.toChars(codePoint, chars, 0);
277                accum.append(chars, 0, len);
278            } else {
279                appendEncoded(accum, escapeMode, codePoint);
280            }
281        }
282    }
283
284    private static final ThreadLocal<char[]> charBuf = ThreadLocal.withInitial(() -> new char[2]);
285
286    private static void appendNbsp(QuietAppendable accum, EscapeMode escapeMode) {
287        if (escapeMode != EscapeMode.xhtml) accum.append("&nbsp;");
288        else accum.append("&#xa0;");
289    }
290
291    private static void appendApos(QuietAppendable accum, int options, EscapeMode escapeMode) {
292        if ((options & ForAttribute) != 0 && (options & ForText) != 0) {
293            if (escapeMode == EscapeMode.xhtml) accum.append("&#x27;");
294            else accum.append("&apos;");
295        } else {
296            accum.append('\'');
297        }
298    }
299
300    private static void appendEncoded(QuietAppendable accum, EscapeMode escapeMode, int codePoint) {
301        final String name = escapeMode.nameForCodepoint(codePoint);
302        if (!emptyName.equals(name)) // ok for identity check
303            accum.append('&').append(name).append(';');
304        else
305            accum.append("&#x").append(Integer.toHexString(codePoint)).append(';');
306    }
307
308    /**
309     * Un-escape an HTML escaped string. That is, {@code &lt;} is returned as {@code <}.
310     *
311     * @param string the HTML string to un-escape
312     * @return the unescaped string
313     */
314    public static String unescape(String string) {
315        return unescape(string, false);
316    }
317
318    /**
319     * Unescape the input string.
320     *
321     * @param string to un-HTML-escape
322     * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
323     * @return unescaped string
324     */
325    static String unescape(String string, boolean strict) {
326        return Parser.unescapeEntities(string, strict);
327    }
328
329    /*
330     * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean.
331     * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF,
332     * performance may be bad. We can add more encoders for common character sets that are impacted by performance
333     * issues on Android if required.
334     *
335     * Benchmarks:     *
336     * OLD toHtml() impl v New (fastpath) in millis
337     * Wiki: 1895, 16
338     * CNN: 6378, 55
339     * Alterslash: 3013, 28
340     * Jsoup: 167, 2
341     */
342    private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) {
343        // todo add more charset tests if impacted by Android's bad perf in canEncode
344        switch (charset) {
345            case ascii:
346                return c < 0x80;
347            case utf:
348                return !(c >= Character.MIN_SURROGATE && c < (Character.MAX_SURROGATE + 1)); // !Character.isSurrogate(c); but not in Android 10 desugar
349            default:
350                return fallback.canEncode(c);
351        }
352    }
353
354    private static boolean isValidXmlChar(int codePoint) {
355        // https://www.w3.org/TR/2006/REC-xml-20060816/Overview.html#charsets
356        // Char    ::=          #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]  any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
357        return (codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD || (codePoint >= 0x20 && codePoint <= 0xD7FF)
358            || (codePoint >= 0xE000 && codePoint <= 0xFFFD) || (codePoint >= 0x10000 && codePoint <= 0x10FFFF));
359    }
360
361    enum CoreCharset {
362        ascii, utf, fallback;
363
364        static CoreCharset byName(final String name) {
365            if (name.equals("US-ASCII"))
366                return ascii;
367            if (name.startsWith("UTF-")) // covers UTF-8, UTF-16, et al
368                return utf;
369            return fallback;
370        }
371    }
372
373    // cache the last used fallback encoder to save recreating on every use
374    private static final ThreadLocal<CharsetEncoder> LocalEncoder = new ThreadLocal<>();
375    private static CharsetEncoder encoderFor(Charset charset) {
376        CharsetEncoder encoder = LocalEncoder.get();
377        if (encoder == null || !encoder.charset().equals(charset)) {
378            encoder = charset.newEncoder();
379            LocalEncoder.set(encoder);
380        }
381        return encoder;
382    }
383
384    private static void load(EscapeMode e, String pointsData, int size) {
385        e.nameKeys = new String[size];
386        e.codeVals = new int[size];
387        e.codeKeys = new int[size];
388        e.nameVals = new String[size];
389
390        int i = 0;
391        try (CharacterReader reader = new CharacterReader(pointsData)) {
392            while (!reader.isEmpty()) {
393                // NotNestedLessLess=10913,824;1887&
394
395                final String name = reader.consumeTo('=');
396                reader.advance();
397                final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix);
398                final char codeDelim = reader.current();
399                reader.advance();
400                final int cp2;
401                if (codeDelim == ',') {
402                    cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix);
403                    reader.advance();
404                } else {
405                    cp2 = empty;
406                }
407                final String indexS = reader.consumeTo('&');
408                final int index = Integer.parseInt(indexS, codepointRadix);
409                reader.advance();
410
411                e.nameKeys[i] = name;
412                e.codeVals[i] = cp1;
413                e.codeKeys[index] = cp1;
414                e.nameVals[index] = name;
415
416                if (cp2 != empty) {
417                    multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2));
418                }
419                i++;
420            }
421
422            Validate.isTrue(i == size, "Unexpected count of entities loaded");
423        }
424    }
425}