package org.jsoup.helper;

import org.jsoup.internal.Normalizer;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Attribute;
import org.jsoup.parser.HtmlTreeBuilder;
import org.jsoup.select.NodeVisitor;
import org.jsoup.select.Selector;
import org.w3c.dom.Comment;
import org.w3c.dom.DOMException;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.jspecify.annotations.Nullable;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import javax.xml.xpath.XPathFactoryConfigurationException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import static javax.xml.transform.OutputKeys.METHOD;
import static org.jsoup.nodes.Document.OutputSettings.Syntax;

/**
 * Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document},
 * for integration with toolsets that use the W3C DOM.
 */
public class W3CDom {
    /** For W3C Documents created by this class, this property is set on each node to link back to the original jsoup node. */
    public static final String SourceProperty = "jsoupSource";
    private static final String ContextProperty = "jsoupContextSource"; // tracks the jsoup context element on w3c doc
    private static final String ContextNodeProperty = "jsoupContextNode"; // the w3c node used as the creating context

    /**
     To get support for XPath versions &gt; 1, set this property to the classname of an alternate XPathFactory
     implementation. (For e.g. {@code net.sf.saxon.xpath.XPathFactoryImpl}).
     */
    public static final String XPathFactoryProperty = "javax.xml.xpath.XPathFactory:jsoup";

    protected DocumentBuilderFactory factory;
    private boolean namespaceAware = true; // false when using selectXpath, for user's query convenience

    /**
     Creates a converter backed by a new, namespace aware {@link DocumentBuilderFactory}.
     */
    public W3CDom() {
        factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(true);
    }

    /**
     Returns if this W3C DOM is namespace aware. By default, this will be {@code true}, but is disabled for simplicity
     when using XPath selectors in {@link org.jsoup.nodes.Element#selectXpath(String)}.
     @return the current namespace aware setting.
     */
    public boolean namespaceAware() {
        return namespaceAware;
    }

    /**
     Update the namespace aware setting. This impacts the factory that is used to create W3C nodes from jsoup nodes.
     <p>For HTML documents, controls if the document will be in the default {@code http://www.w3.org/1999/xhtml}
     namespace if otherwise unset.</p>
     @param namespaceAware the updated setting
     @return this W3CDom, for chaining.
     */
    public W3CDom namespaceAware(boolean namespaceAware) {
        this.namespaceAware = namespaceAware;
        factory.setNamespaceAware(namespaceAware);
        return this;
    }

    /**
     * Converts a jsoup DOM to a W3C DOM, using a new {@code W3CDom} with its default settings.
     *
     * @param in jsoup Document
     * @return W3C Document
     */
    public static Document convert(org.jsoup.nodes.Document in) {
        return new W3CDom().fromJsoup(in);
    }

    /**
     * Serialize a W3C document to a String. Provide Properties to define output settings including if HTML or XML. If
     * you don't provide the properties ({@code null}), the output will be auto-detected based on the content of the
     * document.
     * <p>If the document has a doctype, its public and system IDs (when not blank) are set as output properties after
     * the supplied properties are applied. A {@code html} doctype with neither ID is emitted with the system ID
     * {@code about:legacy-compat}.</p>
     *
     * @param doc Document
     * @param properties (optional/nullable) the output properties to use. See {@link
     *     Transformer#setOutputProperties(Properties)} and {@link OutputKeys}
     * @return Document as string
     * @throws IllegalStateException if the transformer could not be created or the transformation failed
     * @see #OutputHtml
     * @see #OutputXml
     * @see OutputKeys#ENCODING
     * @see OutputKeys#OMIT_XML_DECLARATION
     * @see OutputKeys#STANDALONE
     * @see OutputKeys#DOCTYPE_PUBLIC
     * @see OutputKeys#CDATA_SECTION_ELEMENTS
     * @see OutputKeys#INDENT
     * @see OutputKeys#MEDIA_TYPE
     */
    public static String asString(Document doc, @Nullable Map<String, String> properties) {
        try {
            DOMSource domSource = new DOMSource(doc);
            StringWriter writer = new StringWriter();
            StreamResult result = new StreamResult(writer);
            TransformerFactory tf = TransformerFactory.newInstance();
            Transformer transformer = tf.newTransformer();
            if (properties != null) {
                transformer.setOutputProperties(propertiesFromMap(properties));
            }

            DocumentType doctype = doc.getDoctype();
            if (doctype != null) {
                boolean hasPublicId = !StringUtil.isBlank(doctype.getPublicId());
                boolean hasSystemId = !StringUtil.isBlank(doctype.getSystemId());

                if (hasPublicId) {
                    transformer.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC, doctype.getPublicId());
                }
                if (hasSystemId) {
                    transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, doctype.getSystemId());
                } else if (doctype.getName().equalsIgnoreCase("html") && !hasPublicId) {
                    // handle <!doctype html> for legacy dom: no public or system ID, so supply a system ID
                    transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, "about:legacy-compat");
                }
            }

            transformer.transform(domSource, result);
            return writer.toString();
        } catch (TransformerException e) {
            throw new IllegalStateException(e);
        }
    }

    /**
     Copies the supplied map into a new {@link Properties} object.
     @param map the entries to copy
     @return a new Properties holding all entries of the map
     */
    static Properties propertiesFromMap(Map<String, String> map) {
        Properties props = new Properties();
        props.putAll(map);
        return props;
    }

    /** Canned default for HTML output. */
    public static HashMap<String, String> OutputHtml() {
        return methodMap("html");
    }

    /** Canned default for XML output. */
    public static HashMap<String, String> OutputXml() {
        return methodMap("xml");
    }

    /** Creates a new, mutable properties map holding only the {@link OutputKeys#METHOD} entry. */
    private static HashMap<String, String> methodMap(String method) {
        HashMap<String, String> map = new HashMap<>();
        map.put(METHOD, method);
        return map;
    }

    /**
     * Convert a jsoup Document to a W3C Document. The created nodes will link back to the original
     * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not
     * flow to the other).
     *
     * @param in jsoup doc
     * @return a W3C DOM Document representing the jsoup Document or Element contents.
     */
    public Document fromJsoup(org.jsoup.nodes.Document in) {
        // just method API backcompat
        return fromJsoup((org.jsoup.nodes.Element) in);
    }

    /**
     * Convert a jsoup DOM to a W3C Document. The created nodes will link back to the original
     * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not
     * flow to the other). The input Element is used as a context node, but the whole surrounding jsoup Document is
     * converted. (If you just want a subtree converted, use {@link #convert(org.jsoup.nodes.Element, Document)}.)
     *
     * @param in jsoup element or doc
     * @return a W3C DOM Document representing the jsoup Document or Element contents.
     * @throws IllegalStateException if a W3C DocumentBuilder could not be created from the factory
     * @see #sourceNodes(NodeList, Class)
     * @see #contextNode(Document)
     */
    public Document fromJsoup(org.jsoup.nodes.Element in) {
        Validate.notNull(in);
        DocumentBuilder builder;
        try {
            builder = factory.newDocumentBuilder();
            DOMImplementation impl = builder.getDOMImplementation();
            Document out = builder.newDocument();
            org.jsoup.nodes.Document inDoc = in.ownerDocument();
            org.jsoup.nodes.DocumentType doctype = inDoc != null ? inDoc.documentType() : null;
            if (doctype != null) {
                try {
                    org.w3c.dom.DocumentType documentType =
                        impl.createDocumentType(doctype.name(), doctype.publicId(), doctype.systemId());
                    out.appendChild(documentType);
                } catch (DOMException ignored) {
                    // invalid / empty doctype dropped
                }
            }
            out.setXmlStandalone(true);
            // if in is Document, use the root element, not the wrapping document, as the context:
            org.jsoup.nodes.Element context = (in instanceof org.jsoup.nodes.Document) ? in.firstElementChild() : in;
            // stashed on the output doc so that the W3CBuilder can find it and record the matching w3c node
            out.setUserData(ContextProperty, context, null);
            convert(inDoc != null ? inDoc : in, out);
            return out;
        } catch (ParserConfigurationException e) {
            throw new IllegalStateException(e);
        }
    }

    /**
     * Converts a jsoup document into the provided W3C Document. If required, you can set options on the output
     * document before converting.
     *
     * @param in jsoup doc
     * @param out w3c doc
     * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element)
     */
    public void convert(org.jsoup.nodes.Document in, Document out) {
        // just provides method API backcompat
        convert((org.jsoup.nodes.Element) in, out);
    }

    /**
     * Converts a jsoup element into the provided W3C Document. If required, you can set options on the output
     * document before converting.
     * <p>If the element has an owner document, that document's location (when not blank) is set as the output's
     * document URI, and its output syntax is used when coercing attribute names.</p>
     *
     * @param in jsoup element
     * @param out w3c doc
     * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element)
     */
    public void convert(org.jsoup.nodes.Element in, Document out) {
        W3CBuilder builder = new W3CBuilder(out);
        builder.namespaceAware = namespaceAware;
        org.jsoup.nodes.Document inDoc = in.ownerDocument();
        if (inDoc != null) {
            if (!StringUtil.isBlank(inDoc.location())) {
                out.setDocumentURI(inDoc.location());
            }
            builder.syntax = inDoc.outputSettings().syntax();
        }
        // skip the #root node if a Document
        org.jsoup.nodes.Element rootEl = in instanceof org.jsoup.nodes.Document ? in.firstElementChild() : in;
        assert rootEl != null;
        builder.traverse(rootEl);
    }

    /**
     Evaluate an XPath query against the supplied document, and return the results.
     @param xpath an XPath query
     @param doc the document to evaluate against
     @return the matched nodes
     */
    public NodeList selectXpath(String xpath, Document doc) {
        return selectXpath(xpath, (Node) doc);
    }

    /**
     Evaluate an XPath query against the supplied context node, and return the results.
     <p>If the system property {@link #XPathFactoryProperty} is set, the XPathFactory registered for the {@code jsoup}
     object model is used; otherwise the default XPathFactory is used.</p>
     @param xpath an XPath query
     @param contextNode the context node to evaluate against
     @return the matched nodes
     @throws Selector.SelectorParseException if the XPath factory could not be configured, or the query could not be
     compiled or evaluated
     */
    public NodeList selectXpath(String xpath, Node contextNode) {
        Validate.notEmptyParam(xpath, "xpath");
        Validate.notNullParam(contextNode, "contextNode");

        NodeList nodeList;
        try {
            // if there is a configured XPath factory, use that instead of the Java base impl:
            String property = System.getProperty(XPathFactoryProperty);
            final XPathFactory xPathFactory = property != null ?
                XPathFactory.newInstance("jsoup") :
                XPathFactory.newInstance();

            XPathExpression expression = xPathFactory.newXPath().compile(xpath);
            nodeList = (NodeList) expression.evaluate(contextNode, XPathConstants.NODESET); // love the strong typing here /s
            Validate.notNull(nodeList);
        } catch (XPathExpressionException | XPathFactoryConfigurationException e) {
            throw new Selector.SelectorParseException(
                e, "Could not evaluate XPath query [%s]: %s", xpath, e.getMessage());
        }
        return nodeList;
    }

    /**
     Retrieves the original jsoup DOM nodes from a nodelist created by this converter. W3C nodes whose
     {@link #SourceProperty} user data is not an instance of the requested type are skipped.
     @param nodeList the W3C nodes to get the original jsoup nodes from
     @param nodeType the jsoup node type to retrieve (e.g. Element, DataNode, etc)
     @param <T> node type
     @return a list of the original nodes
     */
    public <T extends org.jsoup.nodes.Node> List<T> sourceNodes(NodeList nodeList, Class<T> nodeType) {
        Validate.notNull(nodeList);
        Validate.notNull(nodeType);
        List<T> nodes = new ArrayList<>(nodeList.getLength());

        for (int i = 0; i < nodeList.getLength(); i++) {
            org.w3c.dom.Node node = nodeList.item(i);
            Object source = node.getUserData(W3CDom.SourceProperty);
            if (nodeType.isInstance(source)) {
                nodes.add(nodeType.cast(source));
            }
        }

        return nodes;
    }

    /**
     For a Document created by {@link #fromJsoup(org.jsoup.nodes.Element)}, retrieves the W3C context node.
     @param wDoc Document created by this class
     @return the corresponding W3C Node to the jsoup Element that was used as the creating context. Will be
     {@code null} if no context node was recorded on the document.
     */
    public Node contextNode(Document wDoc) {
        return (Node) wDoc.getUserData(ContextNodeProperty);
    }

    /**
     * Serialize a W3C document that was created by {@link #fromJsoup(org.jsoup.nodes.Element)} to a String.
     * The output format will be XML or HTML depending on the content of the doc.
     *
     * @param doc Document
     * @return Document as string
     * @see W3CDom#asString(Document, Map)
     */
    public String asString(Document doc) {
        return asString(doc, null);
    }

    /**
     * Implements the conversion by walking the input.
     */
    protected static class W3CBuilder implements NodeVisitor {
        private static final String undefinedNs = "undefined"; // placeholder namespace for undeclared attribute prefixes

        private final Document doc;
        private boolean namespaceAware = true;
        private Node dest; // the W3C node that converted nodes are currently being appended to
        private Syntax syntax = Syntax.xml; // the syntax (to coerce attributes to). From the input doc if available.
        /*@Nullable*/ private final org.jsoup.nodes.Element contextElement; // todo - unsure why this can't be marked nullable?

        /**
         Creates a builder that appends converted nodes into the supplied W3C document.
         @param doc the output W3C document
         */
        public W3CBuilder(Document doc) {
            this.doc = doc;
            dest = doc;
            // Track the context jsoup Element, so we can save the corresponding w3c element
            contextElement = (org.jsoup.nodes.Element) doc.getUserData(ContextProperty);
        }

        /**
         Converts the jsoup node to its W3C equivalent and appends it to the current destination. Elements become the
         new destination, so that their children are appended within them.
         */
        @Override
        public void head(org.jsoup.nodes.Node source, int depth) {
            if (source instanceof org.jsoup.nodes.Element) {
                org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;
                String namespace = namespaceAware ? sourceEl.tag().namespace() : null;
                String tagName = Normalizer.xmlSafeTagName(sourceEl.tagName());
                try {
                    // use an empty namespace if none is present but the tag name has a prefix
                    String imputedNamespace = namespace == null && tagName.contains(":") ? "" : namespace;
                    Element el = doc.createElementNS(imputedNamespace, tagName);
                    copyAttributes(sourceEl, el);
                    append(el, sourceEl);
                    if (sourceEl == contextElement) {
                        doc.setUserData(ContextNodeProperty, el, null);
                    }
                    dest = el; // descend
                } catch (DOMException e) {
                    // If the Normalize didn't get it XML / W3C safe, inserts as plain text. Note that dest is left
                    // unchanged here (no descend); tail() accounts for that.
                    append(doc.createTextNode("<" + tagName + ">"), sourceEl);
                }
            } else if (source instanceof org.jsoup.nodes.TextNode) {
                org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
                Text text = doc.createTextNode(sourceText.getWholeText());
                append(text, sourceText);
            } else if (source instanceof org.jsoup.nodes.Comment) {
                org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
                Comment comment = doc.createComment(sourceComment.getData());
                append(comment, sourceComment);
            } else if (source instanceof org.jsoup.nodes.DataNode) {
                org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
                Text node = doc.createTextNode(sourceData.getWholeData());
                append(node, sourceData);
            } else {
                // unhandled. note that doctype is not handled here - rather it is used in the initial doc creation
            }
        }

        /** Links the W3C node back to its jsoup source, and appends it to the current destination. */
        private void append(Node append, org.jsoup.nodes.Node source) {
            append.setUserData(SourceProperty, source, null);
            dest.appendChild(append);
        }

        /**
         Moves the destination back up to the parent once an element's children have been converted.
         */
        @Override
        public void tail(org.jsoup.nodes.Node source, int depth) {
            // Only undescend if head() actually descended into a W3C element created for this source. If head() fell
            // back to a text node (because the element could not be created), dest was never moved, and popping here
            // would attach the following siblings one level too high.
            if (source instanceof org.jsoup.nodes.Element
                && dest.getUserData(SourceProperty) == source
                && dest.getParentNode() instanceof Element) {
                dest = dest.getParentNode(); // undescend
            }
        }

        /**
         Copies each attribute of the jsoup element to the W3C element. If an attribute is rejected using the current
         syntax, it is retried with its key coerced for XML syntax.
         */
        private void copyAttributes(org.jsoup.nodes.Element jEl, Element wEl) {
            for (Attribute attribute : jEl.attributes()) {
                try {
                    setAttribute(jEl, wEl, attribute, syntax);
                } catch (DOMException e) {
                    if (syntax != Syntax.xml) {
                        setAttribute(jEl, wEl, attribute, Syntax.xml);
                    }
                }
            }
        }

        /**
         Sets a single attribute on the W3C element, using a key made valid for the given syntax. Attributes for which
         no valid key can be produced are skipped.
         */
        private void setAttribute(org.jsoup.nodes.Element jEl, Element wEl, Attribute attribute, Syntax syntax) throws DOMException {
            String key = Attribute.getValidKey(attribute.getKey(), syntax);
            if (key == null) {
                return;
            }

            String namespace = attribute.namespace();
            if (namespaceAware && !namespace.isEmpty()) {
                wEl.setAttributeNS(namespace, key, attribute.getValue());
            } else {
                wEl.setAttribute(key, attribute.getValue());
            }
            maybeAddUndeclaredNs(namespace, key, jEl, wEl);
        }

        /**
         Add a namespace declaration for an attribute with a prefix if it is not already present. Ensures that attributes
         with prefixes have the corresponding namespace declared, E.g. attribute "v-bind:foo" gets another attribute
         "xmlns:v-bind='undefined'". So that the asString() transformation pass is valid.
         If the parser was HTML we don't have a discovered namespace but we are trying to coerce it, so walk up the
         element stack and find it.
         */
        private void maybeAddUndeclaredNs(String namespace, String attrKey, org.jsoup.nodes.Element jEl, Element wEl) {
            if (!namespaceAware || !namespace.isEmpty()) return;
            int pos = attrKey.indexOf(':');
            if (pos == -1) return; // not prefixed, nothing to declare

            // prefixed but no namespace defined during parse, add a fake so that w3c serialization doesn't blow up
            String prefix = attrKey.substring(0, pos);
            if (prefix.equals("xmlns")) return;

            org.jsoup.nodes.Document jDoc = jEl.ownerDocument();
            if (jDoc != null && jDoc.parser().getTreeBuilder() instanceof HtmlTreeBuilder) {
                // try walking up the stack and seeing if there is a namespace declared for this prefix (and that we
                // didn't parse because HTML)
                for (org.jsoup.nodes.Element el = jEl; el != null; el = el.parent()) {
                    String ns = el.attr("xmlns:" + prefix);
                    if (!ns.isEmpty()) {
                        // found it, set it
                        wEl.setAttributeNS(ns, attrKey, jEl.attr(attrKey));
                        return;
                    }
                }
            }

            // otherwise, put in a fake one
            wEl.setAttribute("xmlns:" + prefix, undefinedNs);
        }
    }

}