001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.internal.SharedConstants; 005import org.jspecify.annotations.Nullable; 006 007import java.util.ArrayList; 008import java.util.HashMap; 009import java.util.Map; 010import java.util.Objects; 011import java.util.function.Consumer; 012 013import static org.jsoup.parser.Parser.NamespaceHtml; 014import static org.jsoup.parser.Parser.NamespaceMathml; 015import static org.jsoup.parser.Parser.NamespaceSvg; 016 017/** 018 A TagSet controls the {@link Tag} configuration for a Document's parse, and its serialization. It contains the initial 019 defaults, and after the parse, any additionally discovered tags. 020 021 @see Parser#tagSet(TagSet) 022 @since 1.20.1 023 */ 024public class TagSet { 025 static final TagSet HtmlTagSet = initHtmlDefault(); 026 027 private final Map<String, Map<String, Tag>> tags = new HashMap<>(); // namespace -> tag name -> Tag 028 private final @Nullable TagSet source; // internal fallback for lazy tag copies 029 private @Nullable ArrayList<Consumer<Tag>> customizers; // optional onNewTag tag customizer 030 031 /** 032 Returns a mutable copy of the default HTML tag set. 033 */ 034 public static TagSet Html() { 035 return new TagSet(HtmlTagSet, null); 036 } 037 038 private TagSet(@Nullable TagSet source, @Nullable ArrayList<Consumer<Tag>> customizers) { 039 this.source = source; 040 this.customizers = customizers; 041 } 042 043 public TagSet() { 044 this(null, null); 045 } 046 047 /** 048 Creates a new TagSet by copying the current tags and customizers from the provided source TagSet. Changes made to 049 one TagSet will not affect the other. 050 @param template the TagSet to copy 051 */ 052 public TagSet(TagSet template) { 053 this(template.source, copyCustomizers(template)); 054 // copy tags eagerly; any lazy pull-through should come only from the root source (which would be the HTML defaults), not the template itself. 055 // that way the template tagset is not mutated when we do read through 056 if (template.tags.isEmpty()) return; 057 058 for (Map.Entry<String, Map<String, Tag>> namespaceEntry : template.tags.entrySet()) { 059 Map<String, Tag> nsTags = new HashMap<>(namespaceEntry.getValue().size()); 060 for (Map.Entry<String, Tag> tagEntry : namespaceEntry.getValue().entrySet()) { 061 nsTags.put(tagEntry.getKey(), tagEntry.getValue().clone()); 062 } 063 tags.put(namespaceEntry.getKey(), nsTags); 064 } 065 } 066 067 private static @Nullable ArrayList<Consumer<Tag>> copyCustomizers(TagSet base) { 068 if (base.customizers == null) return null; 069 return new ArrayList<>(base.customizers); 070 } 071 072 /** 073 Insert a tag into this TagSet. If the tag already exists, it is replaced. 074 <p>Tags explicitly added like this are considered to be known tags (vs those that are dynamically created via 075 .valueOf() if not already in the set.</p> 076 077 @param tag the tag to add 078 @return this TagSet 079 */ 080 public TagSet add(Tag tag) { 081 tag.set(Tag.Known); 082 doAdd(tag); 083 return this; 084 } 085 086 /** Adds the tag, but does not set defined. Used in .valueOf */ 087 private void doAdd(Tag tag) { 088 if (customizers != null) { 089 for (Consumer<Tag> customizer : customizers) { 090 customizer.accept(tag); 091 } 092 } 093 094 tags.computeIfAbsent(tag.namespace, ns -> new HashMap<>()) 095 .put(tag.tagName, tag); 096 } 097 098 /** 099 Get an existing Tag from this TagSet by tagName and namespace. The tag name is not normalized, to support mixed 100 instances. 101 102 @param tagName the case-sensitive tag name 103 @param namespace the namespace 104 @return the tag, or null if not found 105 */ 106 public @Nullable Tag get(String tagName, String namespace) { 107 Validate.notNull(tagName); 108 Validate.notNull(namespace); 109 110 // get from our tags 111 Map<String, Tag> nsTags = tags.get(namespace); 112 if (nsTags != null) { 113 Tag tag = nsTags.get(tagName); 114 if (tag != null) { 115 return tag; 116 } 117 } 118 119 // not found; clone on demand from source if exists 120 if (source != null) { 121 Tag tag = source.get(tagName, namespace); 122 if (tag != null) { 123 Tag copy = tag.clone(); 124 doAdd(copy); 125 return copy; 126 } 127 } 128 129 return null; 130 } 131 132 /** 133 Tag.valueOf with the normalName via the token.normalName, to save redundant lower-casing passes. 134 Provide a null normalName unless we already have one; will be normalized if required from tagName. 135 */ 136 Tag valueOf(String tagName, @Nullable String normalName, String namespace, boolean preserveTagCase) { 137 Validate.notNull(tagName); 138 Validate.notNull(namespace); 139 tagName = tagName.trim(); 140 Validate.notEmpty(tagName); 141 Tag tag = get(tagName, namespace); 142 if (tag != null) return tag; 143 144 // not found by tagName, try by normal 145 if (normalName == null) normalName = ParseSettings.normalName(tagName); 146 tagName = preserveTagCase ? tagName : normalName; 147 tag = get(normalName, namespace); 148 if (tag != null) { 149 if (preserveTagCase && !tagName.equals(normalName)) { 150 tag = tag.clone(); // copy so that the name update doesn't reset all instances 151 tag.tagName = tagName; 152 doAdd(tag); 153 } 154 return tag; 155 } 156 157 // not defined: return a new one 158 tag = new Tag(tagName, normalName, namespace); 159 doAdd(tag); 160 161 return tag; 162 } 163 164 /** 165 Get a Tag by name from this TagSet. If not previously defined (unknown), returns a new tag. 166 <p>New tags will be added to this TagSet.</p> 167 168 @param tagName Name of tag, e.g. "p". 169 @param namespace the namespace for the tag. 170 @param settings used to control tag name sensitivity 171 @return The tag, either defined or new generic. 172 */ 173 public Tag valueOf(String tagName, String namespace, ParseSettings settings) { 174 return valueOf(tagName, null, namespace, settings.preserveTagCase()); 175 } 176 177 /** 178 Get a Tag by name from this TagSet. If not previously defined (unknown), returns a new tag. 179 <p>New tags will be added to this TagSet.</p> 180 181 @param tagName Name of tag, e.g. "p". <b>Case-sensitive</b>. 182 @param namespace the namespace for the tag. 183 @return The tag, either defined or new generic. 184 @see #valueOf(String tagName, String namespace, ParseSettings settings) 185 */ 186 public Tag valueOf(String tagName, String namespace) { 187 return valueOf(tagName, namespace, ParseSettings.preserveCase); 188 } 189 190 /** 191 Register a callback to customize each {@link Tag} as it's added to this TagSet. 192 <p>Customizers are invoked once per Tag, when they are added (explicitly or via the valueOf methods).</p> 193 194 <p>For example, to allow all unknown tags to be self-closing during when parsing as HTML:</p> 195 <pre><code> 196 Parser parser = Parser.htmlParser(); 197 parser.tagSet().onNewTag(tag -> { 198 if (!tag.isKnownTag()) 199 tag.set(Tag.SelfClose); 200 }); 201 202 Document doc = Jsoup.parse(html, parser); 203 </code></pre> 204 205 @param customizer a {@code Consumer<Tag>} that will be called for each newly added or cloned Tag; callers can 206 inspect and modify the Tag's state (e.g. set options) 207 @return this TagSet, to allow method chaining 208 @since 1.21.0 209 */ 210 public TagSet onNewTag(Consumer<Tag> customizer) { 211 Validate.notNull(customizer); 212 if (customizers == null) 213 customizers = new ArrayList<>(); 214 customizers.add(customizer); 215 return this; 216 } 217 218 @Override 219 public boolean equals(Object o) { 220 if (!(o instanceof TagSet)) return false; 221 TagSet tagSet = (TagSet) o; 222 return Objects.equals(tags, tagSet.tags); 223 } 224 225 @Override 226 public int hashCode() { 227 return Objects.hashCode(tags); 228 } 229 230 // Default HTML initialization 231 232 /** 233 Initialize the default HTML tag set. 234 */ 235 static TagSet initHtmlDefault() { 236 String[] blockTags = { 237 "html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link", "title", "frame", 238 "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5", 239 "h6", "button", 240 "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins", 241 "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th", 242 "td", "video", "audio", "canvas", "details", "menu", "plaintext", "template", "article", "main", 243 "center", "template", 244 "dir", "applet", "marquee", "listing", // deprecated but still known / special handling 245 "#root" // the outer Document 246 }; 247 String[] inlineTags = { 248 "object", "base", "font", "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", "samp", "kbd", 249 "var", "cite", "abbr", "time", "acronym", "mark", "ruby", "rt", "rp", "rtc", "a", "img", "wbr", "map", 250 "q", 251 "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "optgroup", 252 "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track", 253 "summary", "command", "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track", 254 "data", "bdi", "s", "strike", "nobr", 255 "rb", // deprecated but still known / special handling 256 }; 257 String[] inlineContainers = { // can only contain inline; aka phrasing content 258 "title", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", "td", "script", "style", 259 "ins", "del", "s", "button" 260 }; 261 String[] voidTags = { 262 "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command", 263 "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track" 264 }; 265 String[] preserveWhitespaceTags = { 266 "pre", "plaintext", "title", "textarea", "script" 267 }; 268 String[] rcdataTags = { "title", "textarea" }; 269 String[] dataTags = { "iframe", "noembed", "noframes", "script", "style", "xmp" }; 270 String[] formSubmitTags = SharedConstants.FormSubmitTags; 271 String[] blockMathTags = {"math"}; 272 String[] inlineMathTags = {"mi", "mo", "msup", "mn", "mtext"}; 273 String[] blockSvgTags = {"svg", "femerge", "femergenode"}; // note these are LC versions, but actually preserve case 274 String[] inlineSvgTags = {"text"}; 275 String[] dataSvgTags = {"script"}; 276 277 return new TagSet() 278 .setupTags(NamespaceHtml, blockTags, tag -> tag.set(Tag.Block)) 279 .setupTags(NamespaceHtml, inlineTags, tag -> tag.set(0)) 280 .setupTags(NamespaceHtml, inlineContainers, tag -> tag.set(Tag.InlineContainer)) 281 .setupTags(NamespaceHtml, voidTags, tag -> tag.set(Tag.Void)) 282 .setupTags(NamespaceHtml, preserveWhitespaceTags, tag -> tag.set(Tag.PreserveWhitespace)) 283 .setupTags(NamespaceHtml, rcdataTags, tag -> tag.set(Tag.RcData)) 284 .setupTags(NamespaceHtml, dataTags, tag -> tag.set(Tag.Data)) 285 .setupTags(NamespaceHtml, formSubmitTags, tag -> tag.set(Tag.FormSubmittable)) 286 .setupTags(NamespaceMathml, blockMathTags, tag -> tag.set(Tag.Block)) 287 .setupTags(NamespaceMathml, inlineMathTags, tag -> tag.set(0)) 288 .setupTags(NamespaceSvg, blockSvgTags, tag -> tag.set(Tag.Block)) 289 .setupTags(NamespaceSvg, inlineSvgTags, tag -> tag.set(0)) 290 .setupTags(NamespaceSvg, dataSvgTags, tag -> tag.set(Tag.Data)) 291 ; 292 } 293 294 private TagSet setupTags(String namespace, String[] tagNames, Consumer<Tag> tagModifier) { 295 for (String tagName : tagNames) { 296 Tag tag = get(tagName, namespace); 297 if (tag == null) { 298 tag = new Tag(tagName, tagName, namespace); // normal name is already normal here 299 tag.options = 0; // clear defaults 300 add(tag); 301 } 302 tagModifier.accept(tag); 303 } 304 return this; 305 } 306}