001package org.jsoup.helper;
002
003import org.jsoup.internal.Normalizer;
004import org.jsoup.internal.StringUtil;
005import org.jsoup.nodes.Attribute;
006import org.jsoup.parser.HtmlTreeBuilder;
007import org.jsoup.select.NodeVisitor;
008import org.jsoup.select.Selector;
009import org.w3c.dom.Comment;
010import org.w3c.dom.DOMException;
011import org.w3c.dom.DOMImplementation;
012import org.w3c.dom.Document;
013import org.w3c.dom.DocumentType;
014import org.w3c.dom.Element;
015import org.w3c.dom.Node;
016import org.w3c.dom.NodeList;
017import org.w3c.dom.Text;
018import org.jspecify.annotations.Nullable;
019
020import javax.xml.parsers.DocumentBuilder;
021import javax.xml.parsers.DocumentBuilderFactory;
022import javax.xml.parsers.ParserConfigurationException;
023import javax.xml.transform.OutputKeys;
024import javax.xml.transform.Transformer;
025import javax.xml.transform.TransformerException;
026import javax.xml.transform.TransformerFactory;
027import javax.xml.transform.dom.DOMSource;
028import javax.xml.transform.stream.StreamResult;
029import javax.xml.xpath.XPathConstants;
030import javax.xml.xpath.XPathExpression;
031import javax.xml.xpath.XPathExpressionException;
032import javax.xml.xpath.XPathFactory;
033import javax.xml.xpath.XPathFactoryConfigurationException;
034import java.io.StringWriter;
035import java.util.ArrayList;
036import java.util.HashMap;
037import java.util.List;
038import java.util.Map;
039import java.util.Properties;
040
041import static javax.xml.transform.OutputKeys.METHOD;
042import static org.jsoup.nodes.Document.OutputSettings.Syntax;
043
044/**
045 * Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document},
046 * for integration with toolsets that use the W3C DOM.
047 */
048public class W3CDom {
049    /** For W3C Documents created by this class, this property is set on each node to link back to the original jsoup node. */
050    public static final String SourceProperty = "jsoupSource";
051    private static final String ContextProperty = "jsoupContextSource"; // tracks the jsoup context element on w3c doc
052    private static final String ContextNodeProperty = "jsoupContextNode"; // the w3c node used as the creating context
053
054    /**
055     To get support for XPath versions > 1, set this property to the classname of an alternate XPathFactory
056     implementation. (For e.g. {@code net.sf.saxon.xpath.XPathFactoryImpl}).
057     */
058    public static final String XPathFactoryProperty = "javax.xml.xpath.XPathFactory:jsoup";
059
060    protected DocumentBuilderFactory factory;
061    private boolean namespaceAware = true; // false when using selectXpath, for user's query convenience
062
063    public W3CDom() {
064        factory = DocumentBuilderFactory.newInstance();
065        factory.setNamespaceAware(true);
066    }
067
068    /**
069     Returns if this W3C DOM is namespace aware. By default, this will be {@code true}, but is disabled for simplicity
070     when using XPath selectors in {@link org.jsoup.nodes.Element#selectXpath(String)}.
071     @return the current namespace aware setting.
072     */
073    public boolean namespaceAware() {
074        return namespaceAware;
075    }
076
077    /**
078     Update the namespace aware setting. This impacts the factory that is used to create W3C nodes from jsoup nodes.
079     <p>For HTML documents, controls if the document will be in the default {@code http://www.w3.org/1999/xhtml}
080     namespace if otherwise unset.</p>.
081     @param namespaceAware the updated setting
082     @return this W3CDom, for chaining.
083     */
084    public W3CDom namespaceAware(boolean namespaceAware) {
085        this.namespaceAware = namespaceAware;
086        factory.setNamespaceAware(namespaceAware);
087        return this;
088    }
089
090    /**
091     * Converts a jsoup DOM to a W3C DOM.
092     *
093     * @param in jsoup Document
094     * @return W3C Document
095     */
096    public static Document convert(org.jsoup.nodes.Document in) {
097        return (new W3CDom().fromJsoup(in));
098    }
099
100    /**
101     * Serialize a W3C document to a String. Provide Properties to define output settings including if HTML or XML. If
102     * you don't provide the properties ({@code null}), the output will be auto-detected based on the content of the
103     * document.
104     *
105     * @param doc Document
106     * @param properties (optional/nullable) the output properties to use. See {@link
107     *     Transformer#setOutputProperties(Properties)} and {@link OutputKeys}
108     * @return Document as string
109     * @see #OutputHtml
110     * @see #OutputXml
111     * @see OutputKeys#ENCODING
112     * @see OutputKeys#OMIT_XML_DECLARATION
113     * @see OutputKeys#STANDALONE
114     * @see OutputKeys#DOCTYPE_PUBLIC
115     * @see OutputKeys#CDATA_SECTION_ELEMENTS
116     * @see OutputKeys#INDENT
117     * @see OutputKeys#MEDIA_TYPE
118     */
119    public static String asString(Document doc, @Nullable Map<String, String> properties) {
120        try {
121            DOMSource domSource = new DOMSource(doc);
122            StringWriter writer = new StringWriter();
123            StreamResult result = new StreamResult(writer);
124            TransformerFactory tf = TransformerFactory.newInstance();
125            Transformer transformer = tf.newTransformer();
126            if (properties != null)
127                transformer.setOutputProperties(propertiesFromMap(properties));
128
129            if (doc.getDoctype() != null) {
130                DocumentType doctype = doc.getDoctype();
131                if (!StringUtil.isBlank(doctype.getPublicId()))
132                    transformer.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC, doctype.getPublicId());
133                if (!StringUtil.isBlank(doctype.getSystemId()))
134                    transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, doctype.getSystemId());
135                    // handle <!doctype html> for legacy dom.
136                else if (doctype.getName().equalsIgnoreCase("html")
137                    && StringUtil.isBlank(doctype.getPublicId())
138                    && StringUtil.isBlank(doctype.getSystemId()))
139                    transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, "about:legacy-compat");
140            }
141
142            transformer.transform(domSource, result);
143            return writer.toString();
144
145        } catch (TransformerException e) {
146            throw new IllegalStateException(e);
147        }
148    }
149
150    static Properties propertiesFromMap(Map<String, String> map) {
151        Properties props = new Properties();
152        props.putAll(map);
153        return props;
154    }
155
156    /** Canned default for HTML output. */
157    public static HashMap<String, String> OutputHtml() {
158        return methodMap("html");
159    }
160
161    /** Canned default for XML output. */
162    public static HashMap<String, String> OutputXml() {
163        return methodMap("xml");
164    }
165
166    private static HashMap<String, String> methodMap(String method) {
167        HashMap<String, String> map = new HashMap<>();
168        map.put(METHOD, method);
169        return map;
170    }
171
172    /**
173     * Convert a jsoup Document to a W3C Document. The created nodes will link back to the original
174     * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not
175     * flow to the other).
176     *
177     * @param in jsoup doc
178     * @return a W3C DOM Document representing the jsoup Document or Element contents.
179     */
180    public Document fromJsoup(org.jsoup.nodes.Document in) {
181        // just method API backcompat
182        return fromJsoup((org.jsoup.nodes.Element) in);
183    }
184
185    /**
186     * Convert a jsoup DOM to a W3C Document. The created nodes will link back to the original
187     * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not
188     * flow to the other). The input Element is used as a context node, but the whole surrounding jsoup Document is
189     * converted. (If you just want a subtree converted, use {@link #convert(org.jsoup.nodes.Element, Document)}.)
190     *
191     * @param in jsoup element or doc
192     * @return a W3C DOM Document representing the jsoup Document or Element contents.
193     * @see #sourceNodes(NodeList, Class)
194     * @see #contextNode(Document)
195     */
196    public Document fromJsoup(org.jsoup.nodes.Element in) {
197        Validate.notNull(in);
198        DocumentBuilder builder;
199        try {
200            builder = factory.newDocumentBuilder();
201            DOMImplementation impl = builder.getDOMImplementation();
202            Document out = builder.newDocument();
203            org.jsoup.nodes.Document inDoc = in.ownerDocument();
204            org.jsoup.nodes.DocumentType doctype = inDoc != null ? inDoc.documentType() : null;
205            if (doctype != null) {
206                try {
207                    org.w3c.dom.DocumentType documentType = impl.createDocumentType(doctype.name(), doctype.publicId(), doctype.systemId());
208                    out.appendChild(documentType);
209                } catch (DOMException ignored) {
210                    // invalid / empty doctype dropped
211                }
212            }
213            out.setXmlStandalone(true);
214            // if in is Document, use the root element, not the wrapping document, as the context:
215            org.jsoup.nodes.Element context = (in instanceof org.jsoup.nodes.Document) ? in.firstElementChild() : in;
216            out.setUserData(ContextProperty, context, null);
217            convert(inDoc != null ? inDoc : in, out);
218            return out;
219        } catch (ParserConfigurationException e) {
220            throw new IllegalStateException(e);
221        }
222    }
223
224    /**
225     * Converts a jsoup document into the provided W3C Document. If required, you can set options on the output
226     * document before converting.
227     *
228     * @param in jsoup doc
229     * @param out w3c doc
230     * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element)
231     */
232    public void convert(org.jsoup.nodes.Document in, Document out) {
233        // just provides method API backcompat
234        convert((org.jsoup.nodes.Element) in, out);
235    }
236
237    /**
238     * Converts a jsoup element into the provided W3C Document. If required, you can set options on the output
239     * document before converting.
240     *
241     * @param in jsoup element
242     * @param out w3c doc
243     * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element)
244     */
245    public void convert(org.jsoup.nodes.Element in, Document out) {
246        W3CBuilder builder = new W3CBuilder(out);
247        builder.namespaceAware = namespaceAware;
248        org.jsoup.nodes.Document inDoc = in.ownerDocument();
249        if (inDoc != null) {
250            if (!StringUtil.isBlank(inDoc.location())) {
251                out.setDocumentURI(inDoc.location());
252            }
253            builder.syntax = inDoc.outputSettings().syntax();
254        }
255        org.jsoup.nodes.Element rootEl = in instanceof org.jsoup.nodes.Document ? in.firstElementChild() : in; // skip the #root node if a Document
256        assert rootEl != null;
257        builder.traverse(rootEl);
258    }
259
260    /**
261     Evaluate an XPath query against the supplied document, and return the results.
262     @param xpath an XPath query
263     @param doc the document to evaluate against
264     @return the matches nodes
265     */
266    public NodeList selectXpath(String xpath, Document doc) {
267        return selectXpath(xpath, (Node) doc);
268    }
269
270    /**
271     Evaluate an XPath query against the supplied context node, and return the results.
272     @param xpath an XPath query
273     @param contextNode the context node to evaluate against
274     @return the matches nodes
275     */
276    public NodeList selectXpath(String xpath, Node contextNode) {
277        Validate.notEmptyParam(xpath, "xpath");
278        Validate.notNullParam(contextNode, "contextNode");
279
280        NodeList nodeList;
281        try {
282            // if there is a configured XPath factory, use that instead of the Java base impl:
283            String property = System.getProperty(XPathFactoryProperty);
284            final XPathFactory xPathFactory = property != null ?
285                XPathFactory.newInstance("jsoup") :
286                XPathFactory.newInstance();
287
288            XPathExpression expression = xPathFactory.newXPath().compile(xpath);
289            nodeList = (NodeList) expression.evaluate(contextNode, XPathConstants.NODESET); // love the strong typing here /s
290            Validate.notNull(nodeList);
291        } catch (XPathExpressionException | XPathFactoryConfigurationException e) {
292            throw new Selector.SelectorParseException(
293                e, "Could not evaluate XPath query [%s]: %s", xpath, e.getMessage());
294        }
295        return nodeList;
296    }
297
298    /**
299     Retrieves the original jsoup DOM nodes from a nodelist created by this convertor.
300     @param nodeList the W3C nodes to get the original jsoup nodes from
301     @param nodeType the jsoup node type to retrieve (e.g. Element, DataNode, etc)
302     @param <T> node type
303     @return a list of the original nodes
304     */
305    public <T extends org.jsoup.nodes.Node> List<T> sourceNodes(NodeList nodeList, Class<T> nodeType) {
306        Validate.notNull(nodeList);
307        Validate.notNull(nodeType);
308        List<T> nodes = new ArrayList<>(nodeList.getLength());
309
310        for (int i = 0; i < nodeList.getLength(); i++) {
311            org.w3c.dom.Node node = nodeList.item(i);
312            Object source = node.getUserData(W3CDom.SourceProperty);
313            if (nodeType.isInstance(source))
314                nodes.add(nodeType.cast(source));
315        }
316
317        return nodes;
318    }
319
320    /**
321     For a Document created by {@link #fromJsoup(org.jsoup.nodes.Element)}, retrieves the W3C context node.
322     @param wDoc Document created by this class
323     @return the corresponding W3C Node to the jsoup Element that was used as the creating context.
324     */
325    public Node contextNode(Document wDoc) {
326        return (Node) wDoc.getUserData(ContextNodeProperty);
327    }
328
329    /**
330     * Serialize a W3C document that was created by {@link #fromJsoup(org.jsoup.nodes.Element)} to a String.
331     * The output format will be XML or HTML depending on the content of the doc.
332     *
333     * @param doc Document
334     * @return Document as string
335     * @see W3CDom#asString(Document, Map)
336     */
337    public String asString(Document doc) {
338        return asString(doc, null);
339    }
340
341    /**
342     * Implements the conversion by walking the input.
343     */
344    protected static class W3CBuilder implements NodeVisitor {
345        private final Document doc;
346        private boolean namespaceAware = true;
347        private Node dest;
348        private Syntax syntax = Syntax.xml; // the syntax (to coerce attributes to). From the input doc if available.
349        /*@Nullable*/ private final org.jsoup.nodes.Element contextElement; // todo - unsure why this can't be marked nullable?
350
351        public W3CBuilder(Document doc) {
352            this.doc = doc;
353            dest = doc;
354            contextElement = (org.jsoup.nodes.Element) doc.getUserData(ContextProperty); // Track the context jsoup Element, so we can save the corresponding w3c element
355        }
356
357        @Override
358        public void head(org.jsoup.nodes.Node source, int depth) {
359            if (source instanceof org.jsoup.nodes.Element) {
360                org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;
361                String namespace = namespaceAware ? sourceEl.tag().namespace() : null;
362                String tagName = Normalizer.xmlSafeTagName(sourceEl.tagName());
363                try {
364                    // use an empty namespace if none is present but the tag name has a prefix
365                    String imputedNamespace = namespace == null && tagName.contains(":") ? "" : namespace;
366                    Element el = doc.createElementNS(imputedNamespace, tagName);
367                    copyAttributes(sourceEl, el);
368                    append(el, sourceEl);
369                    if (sourceEl == contextElement)
370                        doc.setUserData(ContextNodeProperty, el, null);
371                    dest = el; // descend
372                } catch (DOMException e) {
373                    // If the Normalize didn't get it XML / W3C safe, inserts as plain text
374                    append(doc.createTextNode("<" + tagName + ">"), sourceEl);
375                }
376            } else if (source instanceof org.jsoup.nodes.TextNode) {
377                org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
378                Text text = doc.createTextNode(sourceText.getWholeText());
379                append(text, sourceText);
380            } else if (source instanceof org.jsoup.nodes.Comment) {
381                org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
382                Comment comment = doc.createComment(sourceComment.getData());
383                append(comment, sourceComment);
384            } else if (source instanceof org.jsoup.nodes.DataNode) {
385                org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
386                Text node = doc.createTextNode(sourceData.getWholeData());
387                append(node, sourceData);
388            } else {
389                // unhandled. note that doctype is not handled here - rather it is used in the initial doc creation
390            }
391        }
392
393        private void append(Node append, org.jsoup.nodes.Node source) {
394            append.setUserData(SourceProperty, source, null);
395            dest.appendChild(append);
396        }
397
398        @Override
399        public void tail(org.jsoup.nodes.Node source, int depth) {
400            if (source instanceof org.jsoup.nodes.Element && dest.getParentNode() instanceof Element) {
401                dest = dest.getParentNode(); // undescend
402            }
403        }
404
405        private void copyAttributes(org.jsoup.nodes.Element jEl, Element wEl) {
406            for (Attribute attribute : jEl.attributes()) {
407                try {
408                    setAttribute(jEl, wEl, attribute, syntax);
409                } catch (DOMException e) {
410                    if (syntax != Syntax.xml)
411                        setAttribute(jEl, wEl, attribute, Syntax.xml);
412                }
413            }
414        }
415
416        private void setAttribute(org.jsoup.nodes.Element jEl, Element wEl, Attribute attribute, Syntax syntax) throws DOMException {
417            String key = Attribute.getValidKey(attribute.getKey(), syntax);
418            if (key != null) {
419                String namespace = attribute.namespace();
420                if (namespaceAware && !namespace.isEmpty())
421                    wEl.setAttributeNS(namespace, key, attribute.getValue());
422                else
423                    wEl.setAttribute(key, attribute.getValue());
424                maybeAddUndeclaredNs(namespace, key, jEl, wEl);
425            }
426        }
427
428        /**
429         Add a namespace declaration for an attribute with a prefix if it is not already present. Ensures that attributes
430         with prefixes have the corresponding namespace declared, E.g. attribute "v-bind:foo" gets another attribute
431         "xmlns:v-bind='undefined'. So that the asString() transformation pass is valid.
432         If the parser was HTML we don't have a discovered namespace but we are trying to coerce it, so walk up the
433         element stack and find it.
434         */
435        private void maybeAddUndeclaredNs(String namespace, String attrKey, org.jsoup.nodes.Element jEl, Element wEl) {
436            if (!namespaceAware || !namespace.isEmpty()) return;
437            int pos = attrKey.indexOf(':');
438            if (pos != -1) { // prefixed but no namespace defined during parse, add a fake so that w3c serialization doesn't blow up
439                String prefix = attrKey.substring(0, pos);
440                if (prefix.equals("xmlns")) return;
441                org.jsoup.nodes.Document doc = jEl.ownerDocument();
442                if (doc != null && doc.parser().getTreeBuilder() instanceof HtmlTreeBuilder) {
443                    // try walking up the stack and seeing if there is a namespace declared for this prefix (and that we didn't parse because HTML)
444                    for (org.jsoup.nodes.Element el = jEl; el != null; el = el.parent()) {
445                        String ns = el.attr("xmlns:" + prefix);
446                        if (!ns.isEmpty()) {
447                            namespace = ns;
448                            // found it, set it
449                            wEl.setAttributeNS(namespace, attrKey, jEl.attr(attrKey));
450                            return;
451                        }
452                    }
453                }
454
455                // otherwise, put in a fake one
456                wEl.setAttribute("xmlns:" + prefix, undefinedNs);
457            }
458        }
459        private static final String undefinedNs = "undefined";
460    }
461
462}