001package org.jsoup.parser;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.internal.SharedConstants;
005import org.jspecify.annotations.Nullable;
006
007import java.util.ArrayList;
008import java.util.HashMap;
009import java.util.Map;
010import java.util.Objects;
011import java.util.function.Consumer;
012
013import static org.jsoup.parser.Parser.NamespaceHtml;
014import static org.jsoup.parser.Parser.NamespaceMathml;
015import static org.jsoup.parser.Parser.NamespaceSvg;
016
017/**
018 A TagSet controls the {@link Tag} configuration for a Document's parse, and its serialization. It contains the initial
019 defaults, and after the parse, any additionally discovered tags.
020
021 @see Parser#tagSet(TagSet)
022 @since 1.20.1
023 */
024public class TagSet {
025    static final TagSet HtmlTagSet = initHtmlDefault();
026
027    private final Map<String, Map<String, Tag>> tags = new HashMap<>(); // namespace -> tag name -> Tag
028    private final @Nullable TagSet source; // internal fallback for lazy tag copies
029    private @Nullable ArrayList<Consumer<Tag>> customizers; // optional onNewTag tag customizer
030
031    /**
032     Returns a mutable copy of the default HTML tag set.
033     */
034    public static TagSet Html() {
035        return new TagSet(HtmlTagSet, null);
036    }
037
038    private TagSet(@Nullable TagSet source, @Nullable ArrayList<Consumer<Tag>> customizers) {
039        this.source = source;
040        this.customizers = customizers;
041    }
042
043    public TagSet() {
044        this(null, null);
045    }
046
047    /**
048     Creates a new TagSet by copying the current tags and customizers from the provided source TagSet. Changes made to
049     one TagSet will not affect the other.
050     @param template the TagSet to copy
051     */
052    public TagSet(TagSet template) {
053        this(template.source, copyCustomizers(template));
054        // copy tags eagerly; any lazy pull-through should come only from the root source (which would be the HTML defaults), not the template itself.
055        // that way the template tagset is not mutated when we do read through
056        if (template.tags.isEmpty()) return;
057
058        for (Map.Entry<String, Map<String, Tag>> namespaceEntry : template.tags.entrySet()) {
059            Map<String, Tag> nsTags = new HashMap<>(namespaceEntry.getValue().size());
060            for (Map.Entry<String, Tag> tagEntry : namespaceEntry.getValue().entrySet()) {
061                nsTags.put(tagEntry.getKey(), tagEntry.getValue().clone());
062            }
063            tags.put(namespaceEntry.getKey(), nsTags);
064        }
065    }
066
067    private static @Nullable ArrayList<Consumer<Tag>> copyCustomizers(TagSet base) {
068        if (base.customizers == null) return null;
069        return new ArrayList<>(base.customizers);
070    }
071
072    /**
073     Insert a tag into this TagSet. If the tag already exists, it is replaced.
074     <p>Tags explicitly added like this are considered to be known tags (vs those that are dynamically created via
075     .valueOf() if not already in the set.</p>
076
077     @param tag the tag to add
078     @return this TagSet
079     */
080    public TagSet add(Tag tag) {
081        tag.set(Tag.Known);
082        doAdd(tag);
083        return this;
084    }
085
086    /** Adds the tag, but does not set defined. Used in .valueOf */
087    private void doAdd(Tag tag) {
088        if (customizers != null) {
089            for (Consumer<Tag> customizer : customizers) {
090                customizer.accept(tag);
091            }
092        }
093
094        tags.computeIfAbsent(tag.namespace, ns -> new HashMap<>())
095            .put(tag.tagName, tag);
096    }
097
098    /**
099     Get an existing Tag from this TagSet by tagName and namespace. The tag name is not normalized, to support mixed
100     instances.
101
102     @param tagName the case-sensitive tag name
103     @param namespace the namespace
104     @return the tag, or null if not found
105     */
106    public @Nullable Tag get(String tagName, String namespace) {
107        Validate.notNull(tagName);
108        Validate.notNull(namespace);
109
110        // get from our tags
111        Map<String, Tag> nsTags = tags.get(namespace);
112        if (nsTags != null) {
113            Tag tag = nsTags.get(tagName);
114            if (tag != null) {
115                return tag;
116            }
117        }
118
119        // not found; clone on demand from source if exists
120        if (source != null) {
121            Tag tag = source.get(tagName, namespace);
122            if (tag != null) {
123                Tag copy = tag.clone();
124                doAdd(copy);
125                return copy;
126            }
127        }
128
129        return null;
130    }
131
132    /**
133     Tag.valueOf with the normalName via the token.normalName, to save redundant lower-casing passes.
134     Provide a null normalName unless we already have one; will be normalized if required from tagName.
135     */
136    Tag valueOf(String tagName, @Nullable String normalName, String namespace, boolean preserveTagCase) {
137        Validate.notNull(tagName);
138        Validate.notNull(namespace);
139        tagName = tagName.trim();
140        Validate.notEmpty(tagName);
141        Tag tag = get(tagName, namespace);
142        if (tag != null) return tag;
143
144        // not found by tagName, try by normal
145        if (normalName == null) normalName = ParseSettings.normalName(tagName);
146        tagName = preserveTagCase ? tagName : normalName;
147        tag = get(normalName, namespace);
148        if (tag != null) {
149            if (preserveTagCase && !tagName.equals(normalName)) {
150                tag = tag.clone(); // copy so that the name update doesn't reset all instances
151                tag.tagName = tagName;
152                doAdd(tag);
153            }
154            return tag;
155        }
156
157        // not defined: return a new one
158        tag = new Tag(tagName, normalName, namespace);
159        doAdd(tag);
160
161        return tag;
162    }
163
164    /**
165     Get a Tag by name from this TagSet. If not previously defined (unknown), returns a new tag.
166     <p>New tags will be added to this TagSet.</p>
167
168     @param tagName Name of tag, e.g. "p".
169     @param namespace the namespace for the tag.
170     @param settings used to control tag name sensitivity
171     @return The tag, either defined or new generic.
172     */
173    public Tag valueOf(String tagName, String namespace, ParseSettings settings) {
174        return valueOf(tagName, null, namespace, settings.preserveTagCase());
175    }
176
177    /**
178     Get a Tag by name from this TagSet. If not previously defined (unknown), returns a new tag.
179     <p>New tags will be added to this TagSet.</p>
180
181     @param tagName Name of tag, e.g. "p". <b>Case-sensitive</b>.
182     @param namespace the namespace for the tag.
183     @return The tag, either defined or new generic.
184     @see #valueOf(String tagName, String namespace, ParseSettings settings)
185     */
186    public Tag valueOf(String tagName, String namespace) {
187        return valueOf(tagName, namespace, ParseSettings.preserveCase);
188    }
189
190    /**
191     Register a callback to customize each {@link Tag} as it's added to this TagSet.
192     <p>Customizers are invoked once per Tag, when they are added (explicitly or via the valueOf methods).</p>
193
194     <p>For example, to allow all unknown tags to be self-closing during when parsing as HTML:</p>
195     <pre><code>
196     Parser parser = Parser.htmlParser();
197     parser.tagSet().onNewTag(tag -> {
198     if (!tag.isKnownTag())
199        tag.set(Tag.SelfClose);
200     });
201
202     Document doc = Jsoup.parse(html, parser);
203     </code></pre>
204
205     @param customizer a {@code Consumer<Tag>} that will be called for each newly added or cloned Tag; callers can
206     inspect and modify the Tag's state (e.g. set options)
207     @return this TagSet, to allow method chaining
208     @since 1.21.0
209     */
210    public TagSet onNewTag(Consumer<Tag> customizer) {
211        Validate.notNull(customizer);
212        if (customizers == null)
213            customizers = new ArrayList<>();
214        customizers.add(customizer);
215        return this;
216    }
217
218    @Override
219    public boolean equals(Object o) {
220        if (!(o instanceof TagSet)) return false;
221        TagSet tagSet = (TagSet) o;
222        return Objects.equals(tags, tagSet.tags);
223    }
224
225    @Override
226    public int hashCode() {
227        return Objects.hashCode(tags);
228    }
229
230    // Default HTML initialization
231
232    /**
233     Initialize the default HTML tag set.
234     */
235    static TagSet initHtmlDefault() {
236        String[] blockTags = {
237            "html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link", "title", "frame",
238            "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5",
239            "h6", "button",
240            "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins",
241            "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th",
242            "td", "video", "audio", "canvas", "details", "menu", "plaintext", "template", "article", "main",
243            "center", "template",
244            "dir", "applet", "marquee", "listing", // deprecated but still known / special handling
245            "#root" // the outer Document
246        };
247        String[] inlineTags = {
248            "object", "base", "font", "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", "samp", "kbd",
249            "var", "cite", "abbr", "time", "acronym", "mark", "ruby", "rt", "rp", "rtc", "a", "img", "wbr", "map",
250            "q",
251            "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "optgroup",
252            "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track",
253            "summary", "command", "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track",
254            "data", "bdi", "s", "strike", "nobr",
255            "rb", // deprecated but still known / special handling
256        };
257        String[] inlineContainers = { // can only contain inline; aka phrasing content
258            "title", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", "td", "script", "style",
259            "ins", "del", "s", "button"
260        };
261        String[] voidTags = {
262            "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command",
263            "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track"
264        };
265        String[] preserveWhitespaceTags = {
266            "pre", "plaintext", "title", "textarea", "script"
267        };
268        String[] rcdataTags = { "title", "textarea" };
269        String[] dataTags = { "iframe", "noembed", "noframes", "script", "style", "xmp" };
270        String[] formSubmitTags = SharedConstants.FormSubmitTags;
271        String[] blockMathTags = {"math"};
272        String[] inlineMathTags = {"mi", "mo", "msup", "mn", "mtext"};
273        String[] blockSvgTags = {"svg", "femerge", "femergenode"}; // note these are LC versions, but actually preserve case
274        String[] inlineSvgTags = {"text"};
275        String[] dataSvgTags = {"script"};
276
277        return new TagSet()
278            .setupTags(NamespaceHtml, blockTags, tag -> tag.set(Tag.Block))
279            .setupTags(NamespaceHtml, inlineTags, tag -> tag.set(0))
280            .setupTags(NamespaceHtml, inlineContainers, tag -> tag.set(Tag.InlineContainer))
281            .setupTags(NamespaceHtml, voidTags, tag -> tag.set(Tag.Void))
282            .setupTags(NamespaceHtml, preserveWhitespaceTags, tag -> tag.set(Tag.PreserveWhitespace))
283            .setupTags(NamespaceHtml, rcdataTags, tag -> tag.set(Tag.RcData))
284            .setupTags(NamespaceHtml, dataTags, tag -> tag.set(Tag.Data))
285            .setupTags(NamespaceHtml, formSubmitTags, tag -> tag.set(Tag.FormSubmittable))
286            .setupTags(NamespaceMathml, blockMathTags, tag -> tag.set(Tag.Block))
287            .setupTags(NamespaceMathml, inlineMathTags, tag -> tag.set(0))
288            .setupTags(NamespaceSvg, blockSvgTags, tag -> tag.set(Tag.Block))
289            .setupTags(NamespaceSvg, inlineSvgTags, tag -> tag.set(0))
290            .setupTags(NamespaceSvg, dataSvgTags, tag -> tag.set(Tag.Data))
291            ;
292    }
293
294    private TagSet setupTags(String namespace, String[] tagNames, Consumer<Tag> tagModifier) {
295        for (String tagName : tagNames) {
296            Tag tag = get(tagName, namespace);
297            if (tag == null) {
298                tag = new Tag(tagName, tagName, namespace); // normal name is already normal here
299                tag.options = 0; // clear defaults
300                add(tag);
301            }
302            tagModifier.accept(tag);
303        }
304        return this;
305    }
306}