001package org.jsoup.parser;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.internal.SharedConstants;
005import org.jsoup.nodes.Attribute;
006import org.jsoup.nodes.Attributes;
007import org.jsoup.nodes.CDataNode;
008import org.jsoup.nodes.Comment;
009import org.jsoup.nodes.DataNode;
010import org.jsoup.nodes.Document;
011import org.jsoup.nodes.DocumentType;
012import org.jsoup.nodes.Element;
013import org.jsoup.nodes.Entities;
014import org.jsoup.nodes.LeafNode;
015import org.jsoup.nodes.Node;
016import org.jsoup.nodes.TextNode;
017import org.jsoup.nodes.XmlDeclaration;
018import org.jsoup.select.Elements;
019import org.jspecify.annotations.Nullable;
020
021import java.io.Reader;
022import java.io.StringReader;
023import java.util.ArrayDeque;
024import java.util.HashMap;
025import java.util.List;
026import java.util.Map;
027
028import static org.jsoup.parser.Parser.NamespaceXml;
029
030/**
031 * Use the {@code XmlTreeBuilder} when you want to parse XML without any of the HTML DOM rules being applied to the
032 * document.
033 * <p>Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}</p>
034 *
035 * @author Jonathan Hedley
036 */
037public class XmlTreeBuilder extends TreeBuilder {
038    static final String XmlnsKey = "xmlns";
039    static final String XmlnsPrefix = "xmlns:";
040    private final ArrayDeque<HashMap<String, String>> namespacesStack = new ArrayDeque<>(); // stack of namespaces, prefix => urn
041
042    @Override ParseSettings defaultSettings() {
043        return ParseSettings.preserveCase;
044    }
045
046    @Override
047    protected void initialiseParse(Reader input, String baseUri, Parser parser) {
048        super.initialiseParse(input, baseUri, parser);
049        doc.outputSettings()
050            .syntax(Document.OutputSettings.Syntax.xml)
051            .escapeMode(Entities.EscapeMode.xhtml)
052            .prettyPrint(false); // as XML, we don't understand what whitespace is significant or not
053
054        namespacesStack.clear();
055        HashMap<String, String> ns = new HashMap<>();
056        ns.put("xml", NamespaceXml);
057        ns.put("", NamespaceXml);
058        namespacesStack.push(ns);
059    }
060
061    @Override
062    void initialiseParseFragment(@Nullable Element context) {
063        super.initialiseParseFragment(context);
064        if (context == null) return;
065
066        // transition to the tag's text state if available
067        TokeniserState textState = context.tag().textState();
068        if (textState != null) tokeniser.transition(textState);
069
070        // reconstitute the namespace stack by traversing the element and its parents (top down)
071        Elements chain = context.parents();
072        chain.add(0, context);
073        for (int i = chain.size() - 1; i >= 0; i--) {
074            Element el = chain.get(i);
075            HashMap<String, String> namespaces = new HashMap<>(namespacesStack.peek());
076            namespacesStack.push(namespaces);
077            if (el.attributesSize() > 0) {
078                processNamespaces(el.attributes(), namespaces);
079            }
080        }
081    }
082
083    Document parse(Reader input, String baseUri) {
084        return parse(input, baseUri, new Parser(this));
085    }
086
087    Document parse(String input, String baseUri) {
088        return parse(new StringReader(input), baseUri, new Parser(this));
089    }
090
091    @Override List<Node> completeParseFragment() {
092        return doc.childNodes();
093    }
094
095    @Override
096    XmlTreeBuilder newInstance() {
097        return new XmlTreeBuilder();
098    }
099
100    @Override public String defaultNamespace() {
101        return NamespaceXml;
102    }
103
104    @Override
105    TagSet defaultTagSet() {
106        return new TagSet(); // an empty tagset
107    }
108
109    @Override
110    int defaultMaxDepth() {
111        return Integer.MAX_VALUE;
112    }
113
114    @Override
115    protected boolean process(Token token) {
116        currentToken = token;
117
118        // start tag, end tag, doctype, xmldecl, comment, character, eof
119        switch (token.type) {
120            case StartTag:
121                insertElementFor(token.asStartTag());
122                break;
123            case EndTag:
124                popStackToClose(token.asEndTag());
125                break;
126            case Comment:
127                insertCommentFor(token.asComment());
128                break;
129            case Character:
130                insertCharacterFor(token.asCharacter());
131                break;
132            case Doctype:
133                insertDoctypeFor(token.asDoctype());
134                break;
135            case XmlDecl:
136                insertXmlDeclarationFor(token.asXmlDecl());
137                break;
138            case EOF: // could put some normalisation here if desired
139                break;
140            default:
141                Validate.fail("Unexpected token type: " + token.type);
142        }
143        return true;
144    }
145
146    void insertElementFor(Token.StartTag startTag) {
147        // handle namespace for tag
148        HashMap<String, String> namespaces = new HashMap<>(namespacesStack.peek());
149        namespacesStack.push(namespaces);
150
151        Attributes attributes = startTag.attributes;
152        if (attributes != null) {
153            settings.normalizeAttributes(attributes);
154            attributes.deduplicate(settings);
155            processNamespaces(attributes, namespaces);
156            applyNamespacesToAttributes(attributes, namespaces);
157        }
158
159        enforceStackDepthLimit();
160
161        String tagName = startTag.tagName.value();
162        String ns = resolveNamespace(tagName, namespaces);
163        Tag tag = tagFor(tagName, startTag.normalName, ns, settings);
164        Element el = new Element(tag, null, attributes);
165        currentElement().appendChild(el);
166        push(el);
167
168        if (startTag.isSelfClosing()) {
169            tag.setSeenSelfClose();
170            pop(); // push & pop ensures onNodeInserted & onNodeClosed
171        } else if (tag.isEmpty()) {
172            pop(); // custom defined void tag
173        } else {
174            TokeniserState textState = tag.textState();
175            if (textState != null) tokeniser.transition(textState);
176        }
177    }
178
179    private static void processNamespaces(Attributes attributes, HashMap<String, String> namespaces) {
180        // process attributes for namespaces (xmlns, xmlns:)
181        for (Attribute attr : attributes) {
182            String key = attr.getKey();
183            String value = attr.getValue();
184            if (key.equals(XmlnsKey)) {
185                namespaces.put("", value); // new default for this level
186            } else if (key.startsWith(XmlnsPrefix)) {
187                String nsPrefix = key.substring(XmlnsPrefix.length());
188                namespaces.put(nsPrefix, value);
189            }
190        }
191    }
192
193    private static void applyNamespacesToAttributes(Attributes attributes, HashMap<String, String> namespaces) {
194        // second pass, apply namespace to attributes. Collects them first then adds (as userData is an attribute)
195        Map<String, String> attrPrefix = new HashMap<>();
196        for (Attribute attr: attributes) {
197            String prefix = attr.prefix();
198            if (!prefix.isEmpty()) {
199                if (prefix.equals(XmlnsKey)) continue;
200                String ns = namespaces.get(prefix);
201                if (ns != null) attrPrefix.put(SharedConstants.XmlnsAttr + prefix, ns);
202            }
203        }
204        for (Map.Entry<String, String> entry : attrPrefix.entrySet())
205            attributes.userData(entry.getKey(), entry.getValue());
206    }
207
208    private static String resolveNamespace(String tagName, HashMap<String, String> namespaces) {
209        String ns = namespaces.get("");
210        int pos = tagName.indexOf(':');
211        if (pos > 0) {
212            String prefix = tagName.substring(0, pos);
213            if (namespaces.containsKey(prefix))
214                ns = namespaces.get(prefix);
215        }
216        return ns;
217    }
218
219    void insertLeafNode(LeafNode node) {
220        currentElement().appendChild(node);
221        onNodeInserted(node);
222    }
223
224    void insertCommentFor(Token.Comment commentToken) {
225        Comment comment = new Comment(commentToken.getData());
226        insertLeafNode(comment);
227    }
228
229    void insertCharacterFor(Token.Character token) {
230        final String data = token.getData();
231        LeafNode node;
232        if      (token.isCData())                       node = new CDataNode(data);
233        else if (currentElement().tag().is(Tag.Data))   node = new DataNode(data);
234        else                                            node = new TextNode(data);
235        insertLeafNode(node);
236    }
237
238    void insertDoctypeFor(Token.Doctype token) {
239        DocumentType doctypeNode = new DocumentType(settings.normalizeTag(token.getName()), token.getPublicIdentifier(), token.getSystemIdentifier());
240        doctypeNode.setPubSysKey(token.getPubSysKey());
241        insertLeafNode(doctypeNode);
242    }
243
244    void insertXmlDeclarationFor(Token.XmlDecl token) {
245        XmlDeclaration decl = new XmlDeclaration(token.name(), token.isDeclaration);
246        if (token.attributes != null) decl.attributes().addAll(token.attributes);
247        insertLeafNode(decl);
248    }
249
250    @Override
251    Element pop() {
252        namespacesStack.pop();
253        return super.pop();
254    }
255
256    /**
257     * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not
258     * found, skips.
259     *
260     * @param endTag tag to close
261     */
262    protected void popStackToClose(Token.EndTag endTag) {
263        // like in HtmlTreeBuilder - don't scan up forever for very (artificially) deeply nested stacks
264        String elName = settings.normalizeTag(endTag.name());
265        Element firstFound = null;
266
267        final int bottom = stack.size() - 1;
268        final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0;
269
270        for (int pos = stack.size() -1; pos >= upper; pos--) {
271            Element next = stack.get(pos);
272            if (next.nodeName().equals(elName)) {
273                firstFound = next;
274                break;
275            }
276        }
277        if (firstFound == null)
278            return; // not found, skip
279
280        for (int pos = stack.size() -1; pos >= 0; pos--) {
281            Element next = pop();
282            if (next == firstFound) {
283                break;
284            }
285        }
286    }
287    private static final int maxQueueDepth = 256; // an arbitrary tension point between real XML and crafted pain
288}