001package org.jsoup.safety;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.nodes.Attribute;
005import org.jsoup.nodes.Attributes;
006import org.jsoup.nodes.DataNode;
007import org.jsoup.nodes.Document;
008import org.jsoup.nodes.Element;
009import org.jsoup.nodes.Node;
010import org.jsoup.nodes.TextNode;
011import org.jsoup.parser.ParseErrorList;
012import org.jsoup.parser.Parser;
013import org.jsoup.select.NodeVisitor;
014
015import java.util.List;
016
017import static org.jsoup.internal.SharedConstants.DummyUri;
018
019/**
020 The {@link Safelist}-based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
021 that you are expecting; no junk, and no cross-site scripting attacks!
022 <p>
023 The HTML cleaner parses the input as HTML and then runs it through a safelist, so the output HTML can only contain
024 HTML that is allowed by the safelist.
025 </p>
026 <p>
027 It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the
028 canned safelists only allow body-contained tags.
029 </p>
030 <p>
031 Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}.
032 </p>
033 <p>
034 A Cleaner may be reused across multiple documents and shared across concurrent threads once its {@link Safelist} has
035 been configured. The cleaner uses the supplied safelist directly, so later safelist changes affect later cleaning
036 calls. If you need a variant of an existing configuration, use {@link Safelist#Safelist(Safelist)} to make a copy.
037 </p>
038 */
039public class Cleaner {
040    private final Safelist safelist;
041
042    /**
043     Create a new cleaner, that sanitizes documents using the supplied safelist.
044     @param safelist safe-list to clean with
045     */
046    public Cleaner(Safelist safelist) {
047        Validate.notNull(safelist);
048        this.safelist = safelist;
049    }
050
051    /**
052     Creates a new, clean document, from the original dirty document, containing only elements allowed by the safelist.
053     The original document is not modified. Only elements from the dirty document's <code>body</code> are used. The
054     OutputSettings of the original document are cloned into the clean document.
055     @param dirtyDocument Untrusted base document to clean.
056     @return cleaned document.
057     */
058    public Document clean(Document dirtyDocument) {
059        Validate.notNull(dirtyDocument);
060
061        Document clean = Document.createShell(dirtyDocument.baseUri());
062        copySafeNodes(dirtyDocument.body(), clean.body());
063        clean.outputSettings(dirtyDocument.outputSettings().clone());
064
065        return clean;
066    }
067
068    /**
069     Determines if the input document's <b>body</b> is valid, against the safelist. It is considered valid if all the
070     tags and attributes in the input HTML are allowed by the safelist, and that there is no content in the
071     <code>head</code>.
072     <p>
073     This method is intended to be used in a user interface as a validator for user input. Note that regardless of the
074     output of this method, the input document <b>must always</b> be normalized using a method such as
075     {@link #clean(Document)}, and the result of that method used to store or serialize the document before later reuse
076     such as presentation to end users. This ensures that enforced attributes are set correctly, and that any
077     differences between how a given browser and how jsoup parses the input HTML are normalized.
078     </p>
079     <p>Example:
080     <pre>{@code
081     Document inputDoc = Jsoup.parse(inputHtml);
082     Cleaner cleaner = new Cleaner(Safelist.relaxed());
083     boolean isValid = cleaner.isValid(inputDoc);
084     Document normalizedDoc = cleaner.clean(inputDoc);
085     }</pre>
086     </p>
087     @param dirtyDocument document to test
088     @return true if no tags or attributes need to be removed; false if they do
089     */
090    public boolean isValid(Document dirtyDocument) {
091        Validate.notNull(dirtyDocument);
092
093        Document clean = Document.createShell(dirtyDocument.baseUri());
094        int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body());
095        return numDiscarded == 0
096            && dirtyDocument.head().childNodes().isEmpty(); // because we only look at the body, but we start from a shell, make sure there's nothing in the head
097    }
098
099    /**
100     Determines if the input document's <b>body HTML</b> is valid, against the safelist. It is considered valid if all
101     the tags and attributes in the input HTML are allowed by the safelist.
102     <p>
103     This method is intended to be used in a user interface as a validator for user input. Note that regardless of the
104     output of this method, the input document <b>must always</b> be normalized using a method such as
105     {@link #clean(Document)}, and the result of that method used to store or serialize the document before later reuse
106     such as presentation to end users. This ensures that enforced attributes are set correctly, and that any
107     differences between how a given browser and how jsoup parses the input HTML are normalized.
108     </p>
109     <p>Example:
110     <pre>{@code
111     Document inputDoc = Jsoup.parse(inputHtml);
112     Cleaner cleaner = new Cleaner(Safelist.relaxed());
113     boolean isValid = cleaner.isValidBodyHtml(inputHtml);
114     Document normalizedDoc = cleaner.clean(inputDoc);
115     }</pre>
116     </p>
117     @param bodyHtml HTML fragment to test
118     @return true if no tags or attributes need to be removed; false if they do
119     */
120    public boolean isValidBodyHtml(String bodyHtml) {
121        String baseUri = (safelist.preserveRelativeLinks()) ? DummyUri : ""; // fake base URI to allow relative URLs to remain valid
122        Document clean = Document.createShell(baseUri);
123        Document dirty = Document.createShell(baseUri);
124        ParseErrorList errorList = ParseErrorList.tracking(1);
125        List<Node> nodes = Parser.parseFragment(bodyHtml, dirty.body(), baseUri, errorList);
126        dirty.body().insertChildren(0, nodes);
127        int numDiscarded = copySafeNodes(dirty.body(), clean.body());
128        return numDiscarded == 0 && errorList.isEmpty();
129    }
130
131    /**
132     Iterates the input and copies trusted nodes (tags, attributes, text) into the destination.
133     */
134    private final class CleaningVisitor implements NodeVisitor {
135        private int numDiscarded = 0;
136        private final Element root;
137        private Element destination; // current element to append nodes to
138
139        private CleaningVisitor(Element root, Element destination) {
140            this.root = root;
141            this.destination = destination;
142        }
143
144        @Override public void head(Node source, int depth) {
145            if (source instanceof Element) {
146                Element sourceEl = (Element) source;
147
148                if (safelist.isSafeTag(sourceEl.normalName())) { // safe, clone and copy safe attrs
149                    ElementMeta meta = createSafeElement(sourceEl);
150                    Element destChild = meta.el;
151                    destination.appendChild(destChild);
152
153                    numDiscarded += meta.numAttribsDiscarded;
154                    destination = destChild;
155                } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.
156                    numDiscarded++;
157                }
158            } else if (source instanceof TextNode) {
159                TextNode sourceText = (TextNode) source;
160                TextNode destText = new TextNode(sourceText.getWholeText());
161                destination.appendChild(destText);
162            } else if (source instanceof DataNode && safelist.isSafeTag(source.parent().normalName())) {
163                DataNode sourceData = (DataNode) source;
164                DataNode destData = new DataNode(sourceData.getWholeData());
165                destination.appendChild(destData);
166            } else { // else, we don't care about comments, xml proc instructions, etc
167                numDiscarded++;
168            }
169        }
170
171        @Override public void tail(Node source, int depth) {
172            if (source instanceof Element && safelist.isSafeTag(source.normalName())) {
173                destination = destination.parent(); // would have descended, so pop destination stack
174            }
175        }
176    }
177
178    private int copySafeNodes(Element source, Element dest) {
179        CleaningVisitor cleaningVisitor = new CleaningVisitor(source, dest);
180        cleaningVisitor.traverse(source);
181        return cleaningVisitor.numDiscarded;
182    }
183
184    private ElementMeta createSafeElement(Element sourceEl) {
185        Element dest = sourceEl.shallowClone(); // reuses tag, clones attributes and preserves any user data
186        String sourceTag = sourceEl.tagName();
187        Attributes destAttrs = dest.attributes();
188        dest.clearAttributes(); // clear all non-internal attributes, ready for safe copy
189
190        int numDiscarded = 0;
191        Attributes sourceAttrs = sourceEl.attributes();
192        for (Attribute sourceAttr : sourceAttrs) {
193            if (safelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) { // will keep this attr
194                String key = sourceAttr.getKey();
195                String value = sourceAttr.getValue();
196
197                if (safelist.shouldAbsUrl(sourceTag, key)) { // configured to make absolute urls for this key (href)
198                    value = sourceEl.absUrl(key);
199                    if (value.isEmpty()) // could not be made abs; leave as-is to allow custom unknown protocols
200                        value = sourceAttr.getValue();
201                }
202                destAttrs.put(key, value);
203            } else
204                numDiscarded++;
205        }
206
207
208        Attributes enforcedAttrs = safelist.getEnforcedAttributes(sourceTag);
209        // special case for <a href rel=nofollow>, only apply to external links:
210        if (sourceEl.nameIs("a") && enforcedAttrs.get("rel").equals("nofollow")) {
211            String href = sourceEl.absUrl("href");
212            String sourceBase = sourceEl.baseUri();
213            if (!href.isEmpty() && !sourceBase.isEmpty() && href.startsWith(sourceBase)) { // same site, so don't set the nofollow
214                enforcedAttrs.remove("rel");
215            }
216        }
217
218        // apply enforced attributes case-insensitively, so a preserved-case source attr is canonicalized to the enforced key
219        for (Attribute enforcedAttr : enforcedAttrs) {
220            destAttrs.removeIgnoreCase(enforcedAttr.getKey());
221            destAttrs.put(enforcedAttr.getKey(), enforcedAttr.getValue());
222        }
223        dest.attributes().addAll(destAttrs); // re-attach, if removed in clear
224        return new ElementMeta(dest, numDiscarded);
225    }
226
227    private static class ElementMeta {
228        Element el;
229        int numAttribsDiscarded;
230
231        ElementMeta(Element el, int numAttribsDiscarded) {
232            this.el = el;
233            this.numAttribsDiscarded = numAttribsDiscarded;
234        }
235    }
236
237}