001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.internal.SharedConstants; 005import org.jsoup.nodes.Attribute; 006import org.jsoup.nodes.Attributes; 007import org.jsoup.nodes.CDataNode; 008import org.jsoup.nodes.Comment; 009import org.jsoup.nodes.DataNode; 010import org.jsoup.nodes.Document; 011import org.jsoup.nodes.DocumentType; 012import org.jsoup.nodes.Element; 013import org.jsoup.nodes.Entities; 014import org.jsoup.nodes.LeafNode; 015import org.jsoup.nodes.Node; 016import org.jsoup.nodes.TextNode; 017import org.jsoup.nodes.XmlDeclaration; 018import org.jsoup.select.Elements; 019import org.jspecify.annotations.Nullable; 020 021import java.io.Reader; 022import java.io.StringReader; 023import java.util.ArrayDeque; 024import java.util.HashMap; 025import java.util.List; 026import java.util.Map; 027 028import static org.jsoup.parser.Parser.NamespaceXml; 029 030/** 031 * Use the {@code XmlTreeBuilder} when you want to parse XML without any of the HTML DOM rules being applied to the 032 * document. 033 * <p>Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}</p> 034 * 035 * @author Jonathan Hedley 036 */ 037public class XmlTreeBuilder extends TreeBuilder { 038 static final String XmlnsKey = "xmlns"; 039 static final String XmlnsPrefix = "xmlns:"; 040 private final ArrayDeque<HashMap<String, String>> namespacesStack = new ArrayDeque<>(); // stack of namespaces, prefix => urn 041 042 @Override ParseSettings defaultSettings() { 043 return ParseSettings.preserveCase; 044 } 045 046 @Override 047 protected void initialiseParse(Reader input, String baseUri, Parser parser) { 048 super.initialiseParse(input, baseUri, parser); 049 doc.outputSettings() 050 .syntax(Document.OutputSettings.Syntax.xml) 051 .escapeMode(Entities.EscapeMode.xhtml) 052 .prettyPrint(false); // as XML, we don't understand what whitespace is significant or not 053 054 namespacesStack.clear(); 055 HashMap<String, String> ns = new HashMap<>(); 056 ns.put("xml", NamespaceXml); 057 ns.put("", NamespaceXml); 058 namespacesStack.push(ns); 059 } 060 061 @Override 062 void initialiseParseFragment(@Nullable Element context) { 063 super.initialiseParseFragment(context); 064 if (context == null) return; 065 066 // transition to the tag's text state if available 067 TokeniserState textState = context.tag().textState(); 068 if (textState != null) tokeniser.transition(textState); 069 070 // reconstitute the namespace stack by traversing the element and its parents (top down) 071 Elements chain = context.parents(); 072 chain.add(0, context); 073 for (int i = chain.size() - 1; i >= 0; i--) { 074 Element el = chain.get(i); 075 HashMap<String, String> namespaces = new HashMap<>(namespacesStack.peek()); 076 namespacesStack.push(namespaces); 077 if (el.attributesSize() > 0) { 078 processNamespaces(el.attributes(), namespaces); 079 } 080 } 081 } 082 083 Document parse(Reader input, String baseUri) { 084 return parse(input, baseUri, new Parser(this)); 085 } 086 087 Document parse(String input, String baseUri) { 088 return parse(new StringReader(input), baseUri, new Parser(this)); 089 } 090 091 @Override List<Node> completeParseFragment() { 092 return doc.childNodes(); 093 } 094 095 @Override 096 XmlTreeBuilder newInstance() { 097 return new XmlTreeBuilder(); 098 } 099 100 @Override public String defaultNamespace() { 101 return NamespaceXml; 102 } 103 104 @Override 105 TagSet defaultTagSet() { 106 return new TagSet(); // an empty tagset 107 } 108 109 @Override 110 int defaultMaxDepth() { 111 return Integer.MAX_VALUE; 112 } 113 114 @Override 115 protected boolean process(Token token) { 116 currentToken = token; 117 118 // start tag, end tag, doctype, xmldecl, comment, character, eof 119 switch (token.type) { 120 case StartTag: 121 insertElementFor(token.asStartTag()); 122 break; 123 case EndTag: 124 popStackToClose(token.asEndTag()); 125 break; 126 case Comment: 127 insertCommentFor(token.asComment()); 128 break; 129 case Character: 130 insertCharacterFor(token.asCharacter()); 131 break; 132 case Doctype: 133 insertDoctypeFor(token.asDoctype()); 134 break; 135 case XmlDecl: 136 insertXmlDeclarationFor(token.asXmlDecl()); 137 break; 138 case EOF: // could put some normalisation here if desired 139 break; 140 default: 141 Validate.fail("Unexpected token type: " + token.type); 142 } 143 return true; 144 } 145 146 void insertElementFor(Token.StartTag startTag) { 147 // handle namespace for tag 148 HashMap<String, String> namespaces = new HashMap<>(namespacesStack.peek()); 149 namespacesStack.push(namespaces); 150 151 Attributes attributes = startTag.attributes; 152 if (attributes != null) { 153 settings.normalizeAttributes(attributes); 154 attributes.deduplicate(settings); 155 processNamespaces(attributes, namespaces); 156 applyNamespacesToAttributes(attributes, namespaces); 157 } 158 159 enforceStackDepthLimit(); 160 161 String tagName = startTag.tagName.value(); 162 String ns = resolveNamespace(tagName, namespaces); 163 Tag tag = tagFor(tagName, startTag.normalName, ns, settings); 164 Element el = new Element(tag, null, attributes); 165 currentElement().appendChild(el); 166 push(el); 167 168 if (startTag.isSelfClosing()) { 169 tag.setSeenSelfClose(); 170 pop(); // push & pop ensures onNodeInserted & onNodeClosed 171 } else if (tag.isEmpty()) { 172 pop(); // custom defined void tag 173 } else { 174 TokeniserState textState = tag.textState(); 175 if (textState != null) tokeniser.transition(textState); 176 } 177 } 178 179 private static void processNamespaces(Attributes attributes, HashMap<String, String> namespaces) { 180 // process attributes for namespaces (xmlns, xmlns:) 181 for (Attribute attr : attributes) { 182 String key = attr.getKey(); 183 String value = attr.getValue(); 184 if (key.equals(XmlnsKey)) { 185 namespaces.put("", value); // new default for this level 186 } else if (key.startsWith(XmlnsPrefix)) { 187 String nsPrefix = key.substring(XmlnsPrefix.length()); 188 namespaces.put(nsPrefix, value); 189 } 190 } 191 } 192 193 private static void applyNamespacesToAttributes(Attributes attributes, HashMap<String, String> namespaces) { 194 // second pass, apply namespace to attributes. Collects them first then adds (as userData is an attribute) 195 Map<String, String> attrPrefix = new HashMap<>(); 196 for (Attribute attr: attributes) { 197 String prefix = attr.prefix(); 198 if (!prefix.isEmpty()) { 199 if (prefix.equals(XmlnsKey)) continue; 200 String ns = namespaces.get(prefix); 201 if (ns != null) attrPrefix.put(SharedConstants.XmlnsAttr + prefix, ns); 202 } 203 } 204 for (Map.Entry<String, String> entry : attrPrefix.entrySet()) 205 attributes.userData(entry.getKey(), entry.getValue()); 206 } 207 208 private static String resolveNamespace(String tagName, HashMap<String, String> namespaces) { 209 String ns = namespaces.get(""); 210 int pos = tagName.indexOf(':'); 211 if (pos > 0) { 212 String prefix = tagName.substring(0, pos); 213 if (namespaces.containsKey(prefix)) 214 ns = namespaces.get(prefix); 215 } 216 return ns; 217 } 218 219 void insertLeafNode(LeafNode node) { 220 currentElement().appendChild(node); 221 onNodeInserted(node); 222 } 223 224 void insertCommentFor(Token.Comment commentToken) { 225 Comment comment = new Comment(commentToken.getData()); 226 insertLeafNode(comment); 227 } 228 229 void insertCharacterFor(Token.Character token) { 230 final String data = token.getData(); 231 LeafNode node; 232 if (token.isCData()) node = new CDataNode(data); 233 else if (currentElement().tag().is(Tag.Data)) node = new DataNode(data); 234 else node = new TextNode(data); 235 insertLeafNode(node); 236 } 237 238 void insertDoctypeFor(Token.Doctype token) { 239 DocumentType doctypeNode = new DocumentType(settings.normalizeTag(token.getName()), token.getPublicIdentifier(), token.getSystemIdentifier()); 240 doctypeNode.setPubSysKey(token.getPubSysKey()); 241 insertLeafNode(doctypeNode); 242 } 243 244 void insertXmlDeclarationFor(Token.XmlDecl token) { 245 XmlDeclaration decl = new XmlDeclaration(token.name(), token.isDeclaration); 246 if (token.attributes != null) decl.attributes().addAll(token.attributes); 247 insertLeafNode(decl); 248 } 249 250 @Override 251 Element pop() { 252 namespacesStack.pop(); 253 return super.pop(); 254 } 255 256 /** 257 * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not 258 * found, skips. 259 * 260 * @param endTag tag to close 261 */ 262 protected void popStackToClose(Token.EndTag endTag) { 263 // like in HtmlTreeBuilder - don't scan up forever for very (artificially) deeply nested stacks 264 String elName = settings.normalizeTag(endTag.name()); 265 Element firstFound = null; 266 267 final int bottom = stack.size() - 1; 268 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 269 270 for (int pos = stack.size() -1; pos >= upper; pos--) { 271 Element next = stack.get(pos); 272 if (next.nodeName().equals(elName)) { 273 firstFound = next; 274 break; 275 } 276 } 277 if (firstFound == null) 278 return; // not found, skip 279 280 for (int pos = stack.size() -1; pos >= 0; pos--) { 281 Element next = pop(); 282 if (next == firstFound) { 283 break; 284 } 285 } 286 } 287 private static final int maxQueueDepth = 256; // an arbitrary tension point between real XML and crafted pain 288}