package org.jsoup.nodes;

import org.jsoup.helper.DataUtil;
import org.jsoup.internal.QuietAppendable;
import org.jsoup.internal.StringUtil;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Document.OutputSettings;
import org.jsoup.parser.CharacterReader;
import org.jsoup.parser.Parser;

import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;

import static org.jsoup.nodes.Entities.EscapeMode.base;
import static org.jsoup.nodes.Entities.EscapeMode.extended;

/**
 * HTML entities, and escape routines. Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C
 * HTML named character references</a>.
 */
public class Entities {
    // constants for escape options:
    static final int ForText = 0x1;
    static final int ForAttribute = 0x2;
    static final int Normalise = 0x4;
    static final int TrimLeading = 0x8;
    static final int TrimTrailing = 0x10;

    private static final int empty = -1;
    private static final String emptyName = "";
    static final int codepointRadix = 36;
    private static final char[] codeDelims = {',', ';'};
    private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references

    private static final int BaseCount = 106;
    private static final ArrayList<String> baseSorted = new ArrayList<>(BaseCount); // names sorted longest first, for prefix matching

    public enum EscapeMode {
        /**
         * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only.
         */
        xhtml(EntitiesData.xmlPoints, 4),
        /**
         * Default HTML output entities.
         */
        base(EntitiesData.basePoints, 106),
        /**
         * Complete HTML entities.
         */
        extended(EntitiesData.fullPoints, 2125);

        static {
            // sort the base names by length (longest first), for prefix matching
            Collections.addAll(baseSorted, base.nameKeys);
            baseSorted.sort((a, b) -> Integer.compare(b.length(), a.length()));
        }

        // table of named references to their codepoints. sorted so we can binary search. built by BuildEntities.
        private String[] nameKeys;
        private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints.

        // table of codepoints to named entities.
        private int[] codeKeys; // we don't support multicodepoints to single named value currently
        private String[] nameVals;

        EscapeMode(String file, int size) {
            load(this, file, size);
        }

        /**
         * Looks up the (first) codepoint for an entity name in this mode's table.
         *
         * @param name the entity name, without the leading {@code &} or trailing {@code ;}
         * @return the codepoint, or {@code -1} if the name is not in this table
         */
        int codepointForName(final String name) {
            int index = Arrays.binarySearch(nameKeys, name);
            return index >= 0 ? codeVals[index] : empty;
        }

        /**
         * Looks up the entity name for a codepoint in this mode's table.
         *
         * @param codepoint the codepoint to find a name for
         * @return the entity name, or "" if this table has no name for the codepoint
         */
        String nameForCodepoint(final int codepoint) {
            final int index = Arrays.binarySearch(codeKeys, codepoint);
            if (index >= 0) {
                // the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower
                // (and binary search for same item with multi results is undefined)
                return (index < nameVals.length - 1 && codeKeys[index + 1] == codepoint) ?
                    nameVals[index + 1] : nameVals[index];
            }
            return emptyName;
        }
    }

    private Entities() {
    }

    /**
     * Check if the input is a known named entity
     *
     * @param name the possible entity name (e.g. "lt" or "amp")
     * @return true if a known named entity
     */
    public static boolean isNamedEntity(final String name) {
        return extended.codepointForName(name) != empty;
    }

    /**
     * Check if the input is a known named entity in the base entity set.
     *
     * @param name the possible entity name (e.g. "lt" or "amp")
     * @return true if a known named entity in the base set
     * @see #isNamedEntity(String)
     */
    public static boolean isBaseNamedEntity(final String name) {
        return base.codepointForName(name) != empty;
    }

    /**
     * Get the character(s) represented by the named entity
     *
     * @param name entity (e.g. "lt" or "amp")
     * @return the string value of the character(s) represented by this entity, or "" if not defined
     */
    public static String getByName(String name) {
        String val = multipoints.get(name);
        if (val != null)
            return val;
        int codepoint = extended.codepointForName(name);
        if (codepoint != empty)
            return new String(new int[]{codepoint}, 0, 1);
        return emptyName;
    }

    /**
     * Get the codepoint(s) represented by the named entity, written into the supplied array.
     *
     * @param name entity (e.g. "lt" or "amp")
     * @param codepoints receives the result; index 0 (and index 1 for a two-codepoint entity) is written, so the array
     * must have a length of at least 2
     * @return the number of codepoints written: 2, 1, or 0 if the name is not a known entity
     */
    public static int codepointsForName(final String name, final int[] codepoints) {
        String val = multipoints.get(name);
        if (val != null) {
            final int first = val.codePointAt(0);
            codepoints[0] = first;
            // step over the whole first codepoint, so a supplementary first codepoint can't yield its low surrogate here
            codepoints[1] = val.codePointAt(Character.charCount(first));
            return 2;
        }
        int codepoint = extended.codepointForName(name);
        if (codepoint != empty) {
            codepoints[0] = codepoint;
            return 1;
        }
        return 0;
    }

    /**
     Finds the longest base named entity that is a prefix of the input. That is, input "notit" would return "not".

     @param input the text to match against the base entity names
     @return longest entity name that is a prefix of the input, or "" if no entity matches
     */
    public static String findPrefix(String input) {
        for (String name : baseSorted) {
            if (input.startsWith(name)) return name;
        }
        return emptyName;
        // if perf critical, could look at using a Trie vs a scan
    }

    /**
     HTML escape an input string. That is, {@code <} is returned as {@code &lt;}. The escaped string is suitable for use
     both in attributes and in text data.
     @param data the un-escaped string to escape
     @param out the output settings to use. This configures the character set escaped against (that is, if a
     character is supported in the output character set, it doesn't have to be escaped), and also HTML or XML
     settings.
     @return the escaped string
     */
    public static String escape(String data, OutputSettings out) {
        return escapeString(data, out.escapeMode(), out.charset());
    }

    /**
     HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is
     returned as {@code &lt;}. The escaped string is suitable for use both in attributes and in text data.
     @param data the un-escaped string to escape
     @return the escaped string
     @see #escape(String, OutputSettings)
     */
    public static String escape(String data) {
        return escapeString(data, base, DataUtil.UTF_8);
    }

    /**
     Escapes the input for use in both text and attributes. A null input is returned as "".
     */
    private static String escapeString(String data, EscapeMode escapeMode, Charset charset) {
        if (data == null) return "";
        StringBuilder sb = StringUtil.borrowBuilder();
        doEscape(data, QuietAppendable.wrap(sb), escapeMode, charset, ForText | ForAttribute);
        return StringUtil.releaseBuilder(sb);
    }

    /**
     Escapes the input into the accumulator, using the escape mode and charset of the output settings.
     @param options a bit set of the ForText, ForAttribute, Normalise, TrimLeading, and TrimTrailing constants
     */
    static void escape(QuietAppendable accum, String data, OutputSettings out, int options) {
        doEscape(data, accum, out.escapeMode(), out.charset(), options);
    }

    /**
     Walks the input one codepoint at a time, optionally normalising whitespace, and appends the escaped form of each
     retained codepoint to the accumulator.
     */
    private static void doEscape(String data, QuietAppendable accum, EscapeMode mode, Charset charset, int options) {
        final CoreCharset coreCharset = CoreCharset.byName(charset.name());
        final CharsetEncoder fallback = encoderFor(charset);
        final int length = data.length();

        int codePoint; // declared outside the loop as the increment needs the width of the codepoint just read
        boolean lastWasWhite = false;
        boolean reachedNonWhite = false;
        boolean skipped = false; // a whitespace run was held back (TrimTrailing); emitted only if more content follows
        for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
            codePoint = data.codePointAt(offset);

            if ((options & Normalise) != 0) {
                if (StringUtil.isWhitespace(codePoint)) {
                    if ((options & TrimLeading) != 0 && !reachedNonWhite) continue;
                    if (lastWasWhite) continue;
                    if ((options & TrimTrailing) != 0) {
                        skipped = true;
                        continue;
                    }
                    accum.append(' ');
                    lastWasWhite = true;
                    continue;
                } else {
                    lastWasWhite = false;
                    reachedNonWhite = true;
                    if (skipped) {
                        accum.append(' '); // wasn't the end, so need to place a normalized space
                        skipped = false;
                    }
                }
            }
            appendEscaped(codePoint, accum, options, mode, coreCharset, fallback);
        }
    }

    /**
     Appends one codepoint to the accumulator: as a required escape, as itself if the output charset can encode it, or
     otherwise as a named or numeric character reference.
     */
    private static void appendEscaped(int codePoint, QuietAppendable accum, int options, EscapeMode escapeMode,
        CoreCharset coreCharset, CharsetEncoder fallback) {
        // specific character range for xml 1.0; drop (not encode) if so
        if (EscapeMode.xhtml == escapeMode && !isValidXmlChar(codePoint)) {
            return;
        }

        // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
        if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
            final char c = (char) codePoint; // lossless: the codepoint is in the BMP
            // html specific and required escapes:
            switch (c) {
                case '&':
                    accum.append("&amp;");
                    break;
                case 0xA0:
                    appendNbsp(accum, escapeMode);
                    break;
                case '<':
                    accum.append("&lt;");
                    break;
                case '>':
                    accum.append("&gt;");
                    break;
                case '"':
                    if ((options & ForAttribute) != 0) accum.append("&quot;");
                    else accum.append(c);
                    break;
                case '\'':
                    // special case for the Entities.escape(string) method when we are maximally escaping. Otherwise, because we output attributes in "", there's no need to escape.
                    appendApos(accum, options, escapeMode);
                    break;
                // we escape ascii control <x20 (other than tab, line-feed, carriage return) for XML compliance (required) and HTML ease of reading (not required) - https://www.w3.org/TR/xml/#charsets
                case 0x9:
                case 0xA:
                case 0xD:
                    accum.append(c);
                    break;
                default:
                    if (c < 0x20 || !canEncode(coreCharset, c, fallback)) appendEncoded(accum, escapeMode, codePoint);
                    else accum.append(c);
            }
        } else {
            // test the whole codepoint: casting it to a char would keep only its low 16 bits, which is an unrelated character
            if (canEncodeSupplementary(coreCharset, codePoint, fallback)) {
                // reads into charBuf - we go through these steps to avoid GC objects as much as possible (would be a new String and a new char[2] for each character)
                char[] chars = charBuf.get();
                int len = Character.toChars(codePoint, chars, 0);
                accum.append(chars, 0, len);
            } else {
                appendEncoded(accum, escapeMode, codePoint);
            }
        }
    }

    // per-thread scratch buffer for the surrogate pair of a supplementary codepoint
    private static final ThreadLocal<char[]> charBuf = ThreadLocal.withInitial(() -> new char[2]);

    /**
     Appends a no-break space: as the named entity, or as a numeric reference in xhtml mode (which has no nbsp entity).
     */
    private static void appendNbsp(QuietAppendable accum, EscapeMode escapeMode) {
        if (escapeMode != EscapeMode.xhtml) accum.append("&nbsp;");
        else accum.append("&#xa0;");
    }

    /**
     Appends an apostrophe, escaped only when both the ForText and ForAttribute options are set. The xhtml mode has no
     apos entity, so gets a numeric reference.
     */
    private static void appendApos(QuietAppendable accum, int options, EscapeMode escapeMode) {
        if ((options & ForAttribute) != 0 && (options & ForText) != 0) {
            if (escapeMode == EscapeMode.xhtml) accum.append("&#x27;");
            else accum.append("&apos;");
        } else {
            accum.append('\'');
        }
    }

    /**
     Appends the codepoint as a character reference: named if the escape mode has a name for it, otherwise hex numeric.
     */
    private static void appendEncoded(QuietAppendable accum, EscapeMode escapeMode, int codePoint) {
        final String name = escapeMode.nameForCodepoint(codePoint);
        if (!emptyName.equals(name)) // nameForCodepoint returns emptyName when there is no named entity
            accum.append('&').append(name).append(';');
        else
            accum.append("&#x").append(Integer.toHexString(codePoint)).append(';');
    }

    /**
     * Un-escape an HTML escaped string. That is, {@code &lt;} is returned as {@code <}.
     *
     * @param string the HTML string to un-escape
     * @return the unescaped string
     */
    public static String unescape(String string) {
        return unescape(string, false);
    }

    /**
     * Unescape the input string.
     *
     * @param string to un-HTML-escape
     * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
     * @return unescaped string
     */
    static String unescape(String string, boolean strict) {
        return Parser.unescapeEntities(string, strict);
    }

    /*
     * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean.
     * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF,
     * performance may be bad. We can add more encoders for common character sets that are impacted by performance
     * issues on Android if required.
     *
     * Benchmarks:     *
     * OLD toHtml() impl v New (fastpath) in millis
     * Wiki: 1895, 16
     * CNN: 6378, 55
     * Alterslash: 3013, 28
     * Jsoup: 167, 2
     */
    private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) {
        // todo add more charset tests if impacted by Android's bad perf in canEncode
        switch (charset) {
            case ascii:
                return c < 0x80;
            case utf:
                return !(c >= Character.MIN_SURROGATE && c < (Character.MAX_SURROGATE + 1)); // !Character.isSurrogate(c); but not in Android 10 desugar
            default:
                return fallback.canEncode(c);
        }
    }

    /**
     The supplementary codepoint (above U+FFFF) counterpart of canEncode: never encodable in ASCII, always in UTF, and
     otherwise decided by the fallback encoder against the complete surrogate pair.
     */
    private static boolean canEncodeSupplementary(final CoreCharset charset, final int codePoint, final CharsetEncoder fallback) {
        switch (charset) {
            case ascii:
                return false;
            case utf:
                return true;
            default:
                return fallback.canEncode(new String(Character.toChars(codePoint)));
        }
    }

    private static boolean isValidXmlChar(int codePoint) {
        // https://www.w3.org/TR/2006/REC-xml-20060816/Overview.html#charsets
        // Char    ::=     #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]  any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
        return (codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD || (codePoint >= 0x20 && codePoint <= 0xD7FF)
            || (codePoint >= 0xE000 && codePoint <= 0xFFFD) || (codePoint >= 0x10000 && codePoint <= 0x10FFFF));
    }

    enum CoreCharset {
        ascii, utf, fallback;

        /**
         Maps a charset name to the fast-path category used when testing if a character can be encoded.
         */
        static CoreCharset byName(final String name) {
            if (name.equals("US-ASCII"))
                return ascii;
            if (name.startsWith("UTF-")) // covers UTF-8, UTF-16, et al
                return utf;
            return fallback;
        }
    }

    // cache the last used fallback encoder to save recreating on every use
    private static final ThreadLocal<CharsetEncoder> LocalEncoder = new ThreadLocal<>();
    private static CharsetEncoder encoderFor(Charset charset) {
        CharsetEncoder encoder = LocalEncoder.get();
        if (encoder == null || !encoder.charset().equals(charset)) {
            encoder = charset.newEncoder();
            LocalEncoder.set(encoder);
        }
        return encoder;
    }

    /**
     Populates the lookup tables of an escape mode from its packed points data, and registers any two-codepoint
     entities into multipoints.
     @param e the escape mode to populate
     @param pointsData entries in the form {@code name=cp1[,cp2];index&}, with numbers in base 36
     @param size the expected number of entries; validated once the data has been read
     */
    private static void load(EscapeMode e, String pointsData, int size) {
        e.nameKeys = new String[size];
        e.codeVals = new int[size];
        e.codeKeys = new int[size];
        e.nameVals = new String[size];

        int i = 0;
        try (CharacterReader reader = new CharacterReader(pointsData)) {
            while (!reader.isEmpty()) {
                // NotNestedLessLess=10913,824;1887&

                final String name = reader.consumeTo('=');
                reader.advance();
                final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix);
                final char codeDelim = reader.current();
                reader.advance();
                final int cp2;
                if (codeDelim == ',') {
                    cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix);
                    reader.advance();
                } else {
                    cp2 = empty;
                }
                final String indexS = reader.consumeTo('&');
                final int index = Integer.parseInt(indexS, codepointRadix);
                reader.advance();

                // entries arrive in name order, so the name tables fill sequentially; the codepoint tables are placed at
                // the position given by the entry's index field
                e.nameKeys[i] = name;
                e.codeVals[i] = cp1;
                e.codeKeys[index] = cp1;
                e.nameVals[index] = name;

                if (cp2 != empty) {
                    multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2));
                }
                i++;
            }

            Validate.isTrue(i == size, "Unexpected count of entities loaded");
        }
    }
}