001package org.jsoup.safety;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.nodes.Attribute;
005import org.jsoup.nodes.Attributes;
006import org.jsoup.nodes.DataNode;
007import org.jsoup.nodes.Document;
008import org.jsoup.nodes.Element;
009import org.jsoup.nodes.Node;
010import org.jsoup.nodes.TextNode;
011import org.jsoup.parser.ParseErrorList;
012import org.jsoup.parser.Parser;
013import org.jsoup.select.NodeVisitor;
014
015import java.util.List;
016
017import static org.jsoup.internal.SharedConstants.DummyUri;
018
019/**
020 The safelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
021 that you are expecting; no junk, and no cross-site scripting attacks!
022 <p>
023 The HTML cleaner parses the input as HTML and then runs it through a safe-list, so the output HTML can only contain
024 HTML that is allowed by the safelist.
025 </p>
026 <p>
027 It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the
028 canned safe-lists only allow body contained tags.
029 </p>
030 <p>
031 Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}.
032 </p>
033 */
034public class Cleaner {
035    private final Safelist safelist;
036
037    /**
038     Create a new cleaner, that sanitizes documents using the supplied safelist.
039     @param safelist safe-list to clean with
040     */
041    public Cleaner(Safelist safelist) {
042        Validate.notNull(safelist);
043        this.safelist = safelist;
044    }
045
046    /**
047     Creates a new, clean document, from the original dirty document, containing only elements allowed by the safelist.
048     The original document is not modified. Only elements from the dirty document's <code>body</code> are used. The
049     OutputSettings of the original document are cloned into the clean document.
050     @param dirtyDocument Untrusted base document to clean.
051     @return cleaned document.
052     */
053    public Document clean(Document dirtyDocument) {
054        Validate.notNull(dirtyDocument);
055
056        Document clean = Document.createShell(dirtyDocument.baseUri());
057        copySafeNodes(dirtyDocument.body(), clean.body());
058        clean.outputSettings(dirtyDocument.outputSettings().clone());
059
060        return clean;
061    }
062
063    /**
064     Determines if the input document's <b>body</b> is valid, against the safelist. It is considered valid if all the
065     tags and attributes in the input HTML are allowed by the safelist, and that there is no content in the
066     <code>head</code>.
067     <p>
068     This method is intended to be used in a user interface as a validator for user input. Note that regardless of the
069     output of this method, the input document <b>must always</b> be normalized using a method such as
070     {@link #clean(Document)}, and the result of that method used to store or serialize the document before later reuse
071     such as presentation to end users. This ensures that enforced attributes are set correctly, and that any
072     differences between how a given browser and how jsoup parses the input HTML are normalized.
073     </p>
074     <p>Example:
075     <pre>{@code
076     Document inputDoc = Jsoup.parse(inputHtml);
077     Cleaner cleaner = new Cleaner(Safelist.relaxed());
078     boolean isValid = cleaner.isValid(inputDoc);
079     Document normalizedDoc = cleaner.clean(inputDoc);
080     }</pre>
081     </p>
082     @param dirtyDocument document to test
083     @return true if no tags or attributes need to be removed; false if they do
084     */
085    public boolean isValid(Document dirtyDocument) {
086        Validate.notNull(dirtyDocument);
087
088        Document clean = Document.createShell(dirtyDocument.baseUri());
089        int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body());
090        return numDiscarded == 0
091            && dirtyDocument.head().childNodes().isEmpty(); // because we only look at the body, but we start from a shell, make sure there's nothing in the head
092    }
093
094    /**
095     Determines if the input document's <b>body HTML</b> is valid, against the safelist. It is considered valid if all
096     the tags and attributes in the input HTML are allowed by the safelist.
097     <p>
098     This method is intended to be used in a user interface as a validator for user input. Note that regardless of the
099     output of this method, the input document <b>must always</b> be normalized using a method such as
100     {@link #clean(Document)}, and the result of that method used to store or serialize the document before later reuse
101     such as presentation to end users. This ensures that enforced attributes are set correctly, and that any
102     differences between how a given browser and how jsoup parses the input HTML are normalized.
103     </p>
104     <p>Example:
105     <pre>{@code
106     Document inputDoc = Jsoup.parse(inputHtml);
107     Cleaner cleaner = new Cleaner(Safelist.relaxed());
108     boolean isValid = cleaner.isValidBodyHtml(inputHtml);
109     Document normalizedDoc = cleaner.clean(inputDoc);
110     }</pre>
111     </p>
112     @param bodyHtml HTML fragment to test
113     @return true if no tags or attributes need to be removed; false if they do
114     */
115    public boolean isValidBodyHtml(String bodyHtml) {
116        String baseUri = (safelist.preserveRelativeLinks()) ? DummyUri : ""; // fake base URI to allow relative URLs to remain valid
117        Document clean = Document.createShell(baseUri);
118        Document dirty = Document.createShell(baseUri);
119        ParseErrorList errorList = ParseErrorList.tracking(1);
120        List<Node> nodes = Parser.parseFragment(bodyHtml, dirty.body(), baseUri, errorList);
121        dirty.body().insertChildren(0, nodes);
122        int numDiscarded = copySafeNodes(dirty.body(), clean.body());
123        return numDiscarded == 0 && errorList.isEmpty();
124    }
125
126    /**
127     Iterates the input and copies trusted nodes (tags, attributes, text) into the destination.
128     */
129    private final class CleaningVisitor implements NodeVisitor {
130        private int numDiscarded = 0;
131        private final Element root;
132        private Element destination; // current element to append nodes to
133
134        private CleaningVisitor(Element root, Element destination) {
135            this.root = root;
136            this.destination = destination;
137        }
138
139        @Override public void head(Node source, int depth) {
140            if (source instanceof Element) {
141                Element sourceEl = (Element) source;
142
143                if (safelist.isSafeTag(sourceEl.normalName())) { // safe, clone and copy safe attrs
144                    ElementMeta meta = createSafeElement(sourceEl);
145                    Element destChild = meta.el;
146                    destination.appendChild(destChild);
147
148                    numDiscarded += meta.numAttribsDiscarded;
149                    destination = destChild;
150                } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.
151                    numDiscarded++;
152                }
153            } else if (source instanceof TextNode) {
154                TextNode sourceText = (TextNode) source;
155                TextNode destText = new TextNode(sourceText.getWholeText());
156                destination.appendChild(destText);
157            } else if (source instanceof DataNode && safelist.isSafeTag(source.parent().normalName())) {
158                DataNode sourceData = (DataNode) source;
159                DataNode destData = new DataNode(sourceData.getWholeData());
160                destination.appendChild(destData);
161            } else { // else, we don't care about comments, xml proc instructions, etc
162                numDiscarded++;
163            }
164        }
165
166        @Override public void tail(Node source, int depth) {
167            if (source instanceof Element && safelist.isSafeTag(source.normalName())) {
168                destination = destination.parent(); // would have descended, so pop destination stack
169            }
170        }
171    }
172
173    private int copySafeNodes(Element source, Element dest) {
174        CleaningVisitor cleaningVisitor = new CleaningVisitor(source, dest);
175        cleaningVisitor.traverse(source);
176        return cleaningVisitor.numDiscarded;
177    }
178
179    private ElementMeta createSafeElement(Element sourceEl) {
180        Element dest = sourceEl.shallowClone(); // reuses tag, clones attributes and preserves any user data
181        String sourceTag = sourceEl.tagName();
182        Attributes destAttrs = dest.attributes();
183        dest.clearAttributes(); // clear all non-internal attributes, ready for safe copy
184
185        int numDiscarded = 0;
186        Attributes sourceAttrs = sourceEl.attributes();
187        for (Attribute sourceAttr : sourceAttrs) {
188            if (safelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr))
189                destAttrs.put(sourceAttr);
190            else
191                numDiscarded++;
192        }
193
194
195        Attributes enforcedAttrs = safelist.getEnforcedAttributes(sourceTag);
196        // special case for <a href rel=nofollow>, only apply to external links:
197        if (sourceEl.nameIs("a") && enforcedAttrs.get("rel").equals("nofollow")) {
198            String href = sourceEl.absUrl("href");
199            String sourceBase = sourceEl.baseUri();
200            if (!href.isEmpty() && !sourceBase.isEmpty() && href.startsWith(sourceBase)) { // same site, so don't set the nofollow
201                enforcedAttrs.remove("rel");
202            }
203        }
204
205        destAttrs.addAll(enforcedAttrs);
206        dest.attributes().addAll(destAttrs); // re-attach, if removed in clear
207        return new ElementMeta(dest, numDiscarded);
208    }
209
210    private static class ElementMeta {
211        Element el;
212        int numAttribsDiscarded;
213
214        ElementMeta(Element el, int numAttribsDiscarded) {
215            this.el = el;
216            this.numAttribsDiscarded = numAttribsDiscarded;
217        }
218    }
219
220}