package org.jsoup.safety;

import org.jsoup.helper.Validate;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.ParseErrorList;
import org.jsoup.parser.Parser;
import org.jsoup.select.NodeVisitor;

import java.util.List;

import static org.jsoup.internal.SharedConstants.DummyUri;

/**
 A safelist driven HTML cleaner. Given untrusted HTML, it produces output that holds only the elements and attributes
 the configured {@link Safelist} permits.
 <p>
 Cleaning works on an already parsed document: the body of the input is walked, and every node the safelist accepts is
 copied into a fresh document. Nodes and attributes that are not accepted are left out and counted.
 </p>
 <p>
 Only the input's {@code body} is read by the clean and validate methods, so input is expected to be a body fragment.
 </p>
 <p>
 Most callers will not need to use this class directly; see the {@code clean} methods in {@link org.jsoup.Jsoup}.
 </p>
 */
public class Cleaner {
    private final Safelist safelist;

    /**
     Builds a cleaner that filters documents through the given safelist.
     @param safelist the safelist that decides which tags and attributes are kept; must not be null
     */
    public Cleaner(Safelist safelist) {
        Validate.notNull(safelist);
        this.safelist = safelist;
    }

    /**
     Produces a new document holding only the safelisted content of the dirty document's {@code body}. Safe nodes are
     copied (not moved) into a new shell document that shares the dirty document's base URI, and a clone of the dirty
     document's OutputSettings is applied to the result.
     @param dirtyDocument untrusted document to clean; must not be null
     @return the new, cleaned document
     */
    public Document clean(Document dirtyDocument) {
        Validate.notNull(dirtyDocument);

        Document cleanDoc = Document.createShell(dirtyDocument.baseUri());
        Element dirtyBody = dirtyDocument.body();
        Element cleanBody = cleanDoc.body();
        copySafeNodes(dirtyBody, cleanBody);
        cleanDoc.outputSettings(dirtyDocument.outputSettings().clone());

        return cleanDoc;
    }

    /**
     Tests whether the input document's <b>body</b> passes the safelist untouched. The document is valid when cleaning
     its body would discard nothing (no tags, attributes, or other nodes), and its {@code head} has no child nodes.
     <p>
     This is meant for validating user input in a user interface. Whatever this method returns, the input <b>must
     still</b> be normalized with a method such as {@link #clean(Document)} and only that result stored, serialized, or
     shown to end users; cleaning is what applies enforced attributes and normalizes the markup.
     </p>
     <p>Example:</p>
     <pre>{@code
     Document inputDoc = Jsoup.parse(inputHtml);
     Cleaner cleaner = new Cleaner(Safelist.relaxed());
     boolean isValid = cleaner.isValid(inputDoc);
     Document normalizedDoc = cleaner.clean(inputDoc);
     }</pre>
     @param dirtyDocument document to test; must not be null
     @return true if nothing would need to be removed; false otherwise
     */
    public boolean isValid(Document dirtyDocument) {
        Validate.notNull(dirtyDocument);

        Document scratch = Document.createShell(dirtyDocument.baseUri());
        if (copySafeNodes(dirtyDocument.body(), scratch.body()) != 0) {
            return false;
        }
        // only the body was walked, and the copy started from an empty shell, so check separately that the head is empty
        return dirtyDocument.head().childNodes().isEmpty();
    }

    /**
     Tests whether a <b>body HTML</b> fragment passes the safelist untouched. The fragment is parsed in the context of
     a {@code body} element with parse error tracking enabled; it is valid when cleaning would discard nothing and the
     parser recorded no errors.
     <p>
     This is meant for validating user input in a user interface. Whatever this method returns, the input <b>must
     still</b> be normalized with a method such as {@link #clean(Document)} and only that result stored, serialized, or
     shown to end users; cleaning is what applies enforced attributes and normalizes the markup.
     </p>
     <p>Example:</p>
     <pre>{@code
     Document inputDoc = Jsoup.parse(inputHtml);
     Cleaner cleaner = new Cleaner(Safelist.relaxed());
     boolean isValid = cleaner.isValidBodyHtml(inputHtml);
     Document normalizedDoc = cleaner.clean(inputDoc);
     }</pre>
     @param bodyHtml HTML fragment to test
     @return true if nothing would need to be removed and no parse error was tracked; false otherwise
     */
    public boolean isValidBodyHtml(String bodyHtml) {
        final String baseUri;
        if (safelist.preserveRelativeLinks()) {
            baseUri = DummyUri; // fake base URI, so that relative URLs can still be treated as valid
        } else {
            baseUri = "";
        }

        Document cleanDoc = Document.createShell(baseUri);
        Document dirtyDoc = Document.createShell(baseUri);
        ParseErrorList parseErrors = ParseErrorList.tracking(1);
        List<Node> fragment = Parser.parseFragment(bodyHtml, dirtyDoc.body(), baseUri, parseErrors);
        dirtyDoc.body().insertChildren(0, fragment);

        if (copySafeNodes(dirtyDoc.body(), cleanDoc.body()) != 0) {
            return false;
        }
        return parseErrors.isEmpty();
    }

    /**
     Visitor that walks the source tree and copies the trusted nodes (tags, attributes, text) into the destination,
     keeping a count of everything it had to leave out.
     */
    private final class CleaningVisitor implements NodeVisitor {
        private int numDiscarded = 0;
        private final Element root;
        private Element destination; // the element that copied nodes are currently appended to

        private CleaningVisitor(Element root, Element destination) {
            this.root = root;
            this.destination = destination;
        }

        @Override public void head(Node source, int depth) {
            if (source instanceof Element) {
                enterElement((Element) source);
                return;
            }

            if (source instanceof TextNode) {
                String text = ((TextNode) source).getWholeText();
                destination.appendChild(new TextNode(text));
                return;
            }

            if (source instanceof DataNode && safelist.isSafeTag(source.parent().normalName())) {
                String data = ((DataNode) source).getWholeData();
                destination.appendChild(new DataNode(data));
                return;
            }

            // anything else (comments, xml processing instructions, data under an unsafe parent, etc) is dropped
            numDiscarded++;
        }

        /**
         Handles the start of an element: a safelisted element is copied with its safe attributes and becomes the new
         destination; any other element is skipped and counted, except for the traversal root which is never counted.
         */
        private void enterElement(Element sourceEl) {
            if (!safelist.isSafeTag(sourceEl.normalName())) {
                if (sourceEl != root) { // the root itself does not count against the discarded total
                    numDiscarded++;
                }
                return;
            }

            ElementMeta copied = createSafeElement(sourceEl);
            Element destChild = copied.el;
            destination.appendChild(destChild);

            numDiscarded += copied.numAttribsDiscarded;
            destination = destChild; // descend: children of the source are appended to the copy
        }

        @Override public void tail(Node source, int depth) {
            boolean descended = source instanceof Element && safelist.isSafeTag(source.normalName());
            if (descended) {
                destination = destination.parent(); // leaving a copied element, so step the destination back up
            }
        }
    }

    /**
     Copies every safelisted node found under source into dest.
     @return the number of nodes and attributes that were not copied
     */
    private int copySafeNodes(Element source, Element dest) {
        CleaningVisitor visitor = new CleaningVisitor(source, dest);
        visitor.traverse(source);
        return visitor.numDiscarded;
    }

    /**
     Creates the cleaned counterpart of a safelisted source element: a shallow clone that carries only the source
     attributes accepted by the safelist, plus the attributes the safelist enforces for that tag.
     @return the new element, paired with the number of source attributes that were rejected
     */
    private ElementMeta createSafeElement(Element sourceEl) {
        Element dest = sourceEl.shallowClone(); // reuses tag, clones attributes and preserves any user data
        String sourceTag = sourceEl.tagName();
        Attributes destAttrs = dest.attributes();
        dest.clearAttributes(); // clear all non-internal attributes, ready for safe copy

        int rejected = 0;
        for (Attribute sourceAttr : sourceEl.attributes()) {
            if (!safelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) {
                rejected++;
                continue;
            }
            destAttrs.put(sourceAttr);
        }

        Attributes enforcedAttrs = safelist.getEnforcedAttributes(sourceTag);
        // special case for <a href rel=nofollow>: the enforced nofollow only applies to external links
        if (sourceEl.nameIs("a") && enforcedAttrs.get("rel").equals("nofollow") && isSameSiteLink(sourceEl)) {
            enforcedAttrs.remove("rel");
        }

        destAttrs.addAll(enforcedAttrs);
        dest.attributes().addAll(destAttrs); // re-attach, if removed in clear
        return new ElementMeta(dest, rejected);
    }

    /**
     Reports whether the element's absolute href is non-empty and starts with the element's (non-empty) base URI.
     */
    private static boolean isSameSiteLink(Element sourceEl) {
        String href = sourceEl.absUrl("href");
        String sourceBase = sourceEl.baseUri();
        return !(href.isEmpty() || sourceBase.isEmpty()) && href.startsWith(sourceBase);
    }

    /**
     Pairs a cleaned element with the count of attributes that were discarded while creating it.
     */
    private static class ElementMeta {
        final Element el;
        final int numAttribsDiscarded;

        ElementMeta(Element el, int numAttribsDiscarded) {
            this.el = el;
            this.numAttribsDiscarded = numAttribsDiscarded;
        }
    }

}