001package org.jsoup.safety; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.nodes.Attribute; 005import org.jsoup.nodes.Attributes; 006import org.jsoup.nodes.DataNode; 007import org.jsoup.nodes.Document; 008import org.jsoup.nodes.Element; 009import org.jsoup.nodes.Node; 010import org.jsoup.nodes.TextNode; 011import org.jsoup.parser.ParseErrorList; 012import org.jsoup.parser.Parser; 013import org.jsoup.select.NodeVisitor; 014 015import java.util.List; 016 017import static org.jsoup.internal.SharedConstants.DummyUri; 018 019/** 020 The {@link Safelist}-based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes 021 that you are expecting; no junk, and no cross-site scripting attacks! 022 <p> 023 The HTML cleaner parses the input as HTML and then runs it through a safelist, so the output HTML can only contain 024 HTML that is allowed by the safelist. 025 </p> 026 <p> 027 It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the 028 canned safelists only allow body-contained tags. 029 </p> 030 <p> 031 Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}. 032 </p> 033 <p> 034 A Cleaner may be reused across multiple documents and shared across concurrent threads once its {@link Safelist} has 035 been configured. The cleaner uses the supplied safelist directly, so later safelist changes affect later cleaning 036 calls. If you need a variant of an existing configuration, use {@link Safelist#Safelist(Safelist)} to make a copy. 037 </p> 038 */ 039public class Cleaner { 040 private final Safelist safelist; 041 042 /** 043 Create a new cleaner, that sanitizes documents using the supplied safelist. 044 @param safelist safe-list to clean with 045 */ 046 public Cleaner(Safelist safelist) { 047 Validate.notNull(safelist); 048 this.safelist = safelist; 049 } 050 051 /** 052 Creates a new, clean document, from the original dirty document, containing only elements allowed by the safelist. 053 The original document is not modified. Only elements from the dirty document's <code>body</code> are used. The 054 OutputSettings of the original document are cloned into the clean document. 055 @param dirtyDocument Untrusted base document to clean. 056 @return cleaned document. 057 */ 058 public Document clean(Document dirtyDocument) { 059 Validate.notNull(dirtyDocument); 060 061 Document clean = Document.createShell(dirtyDocument.baseUri()); 062 copySafeNodes(dirtyDocument.body(), clean.body()); 063 clean.outputSettings(dirtyDocument.outputSettings().clone()); 064 065 return clean; 066 } 067 068 /** 069 Determines if the input document's <b>body</b> is valid, against the safelist. It is considered valid if all the 070 tags and attributes in the input HTML are allowed by the safelist, and that there is no content in the 071 <code>head</code>. 072 <p> 073 This method is intended to be used in a user interface as a validator for user input. Note that regardless of the 074 output of this method, the input document <b>must always</b> be normalized using a method such as 075 {@link #clean(Document)}, and the result of that method used to store or serialize the document before later reuse 076 such as presentation to end users. This ensures that enforced attributes are set correctly, and that any 077 differences between how a given browser and how jsoup parses the input HTML are normalized. 078 </p> 079 <p>Example: 080 <pre>{@code 081 Document inputDoc = Jsoup.parse(inputHtml); 082 Cleaner cleaner = new Cleaner(Safelist.relaxed()); 083 boolean isValid = cleaner.isValid(inputDoc); 084 Document normalizedDoc = cleaner.clean(inputDoc); 085 }</pre> 086 </p> 087 @param dirtyDocument document to test 088 @return true if no tags or attributes need to be removed; false if they do 089 */ 090 public boolean isValid(Document dirtyDocument) { 091 Validate.notNull(dirtyDocument); 092 093 Document clean = Document.createShell(dirtyDocument.baseUri()); 094 int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body()); 095 return numDiscarded == 0 096 && dirtyDocument.head().childNodes().isEmpty(); // because we only look at the body, but we start from a shell, make sure there's nothing in the head 097 } 098 099 /** 100 Determines if the input document's <b>body HTML</b> is valid, against the safelist. It is considered valid if all 101 the tags and attributes in the input HTML are allowed by the safelist. 102 <p> 103 This method is intended to be used in a user interface as a validator for user input. Note that regardless of the 104 output of this method, the input document <b>must always</b> be normalized using a method such as 105 {@link #clean(Document)}, and the result of that method used to store or serialize the document before later reuse 106 such as presentation to end users. This ensures that enforced attributes are set correctly, and that any 107 differences between how a given browser and how jsoup parses the input HTML are normalized. 108 </p> 109 <p>Example: 110 <pre>{@code 111 Document inputDoc = Jsoup.parse(inputHtml); 112 Cleaner cleaner = new Cleaner(Safelist.relaxed()); 113 boolean isValid = cleaner.isValidBodyHtml(inputHtml); 114 Document normalizedDoc = cleaner.clean(inputDoc); 115 }</pre> 116 </p> 117 @param bodyHtml HTML fragment to test 118 @return true if no tags or attributes need to be removed; false if they do 119 */ 120 public boolean isValidBodyHtml(String bodyHtml) { 121 String baseUri = (safelist.preserveRelativeLinks()) ? DummyUri : ""; // fake base URI to allow relative URLs to remain valid 122 Document clean = Document.createShell(baseUri); 123 Document dirty = Document.createShell(baseUri); 124 ParseErrorList errorList = ParseErrorList.tracking(1); 125 List<Node> nodes = Parser.parseFragment(bodyHtml, dirty.body(), baseUri, errorList); 126 dirty.body().insertChildren(0, nodes); 127 int numDiscarded = copySafeNodes(dirty.body(), clean.body()); 128 return numDiscarded == 0 && errorList.isEmpty(); 129 } 130 131 /** 132 Iterates the input and copies trusted nodes (tags, attributes, text) into the destination. 133 */ 134 private final class CleaningVisitor implements NodeVisitor { 135 private int numDiscarded = 0; 136 private final Element root; 137 private Element destination; // current element to append nodes to 138 139 private CleaningVisitor(Element root, Element destination) { 140 this.root = root; 141 this.destination = destination; 142 } 143 144 @Override public void head(Node source, int depth) { 145 if (source instanceof Element) { 146 Element sourceEl = (Element) source; 147 148 if (safelist.isSafeTag(sourceEl.normalName())) { // safe, clone and copy safe attrs 149 ElementMeta meta = createSafeElement(sourceEl); 150 Element destChild = meta.el; 151 destination.appendChild(destChild); 152 153 numDiscarded += meta.numAttribsDiscarded; 154 destination = destChild; 155 } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded. 156 numDiscarded++; 157 } 158 } else if (source instanceof TextNode) { 159 TextNode sourceText = (TextNode) source; 160 TextNode destText = new TextNode(sourceText.getWholeText()); 161 destination.appendChild(destText); 162 } else if (source instanceof DataNode && safelist.isSafeTag(source.parent().normalName())) { 163 DataNode sourceData = (DataNode) source; 164 DataNode destData = new DataNode(sourceData.getWholeData()); 165 destination.appendChild(destData); 166 } else { // else, we don't care about comments, xml proc instructions, etc 167 numDiscarded++; 168 } 169 } 170 171 @Override public void tail(Node source, int depth) { 172 if (source instanceof Element && safelist.isSafeTag(source.normalName())) { 173 destination = destination.parent(); // would have descended, so pop destination stack 174 } 175 } 176 } 177 178 private int copySafeNodes(Element source, Element dest) { 179 CleaningVisitor cleaningVisitor = new CleaningVisitor(source, dest); 180 cleaningVisitor.traverse(source); 181 return cleaningVisitor.numDiscarded; 182 } 183 184 private ElementMeta createSafeElement(Element sourceEl) { 185 Element dest = sourceEl.shallowClone(); // reuses tag, clones attributes and preserves any user data 186 String sourceTag = sourceEl.tagName(); 187 Attributes destAttrs = dest.attributes(); 188 dest.clearAttributes(); // clear all non-internal attributes, ready for safe copy 189 190 int numDiscarded = 0; 191 Attributes sourceAttrs = sourceEl.attributes(); 192 for (Attribute sourceAttr : sourceAttrs) { 193 if (safelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) { // will keep this attr 194 String key = sourceAttr.getKey(); 195 String value = sourceAttr.getValue(); 196 197 if (safelist.shouldAbsUrl(sourceTag, key)) { // configured to make absolute urls for this key (href) 198 value = sourceEl.absUrl(key); 199 if (value.isEmpty()) // could not be made abs; leave as-is to allow custom unknown protocols 200 value = sourceAttr.getValue(); 201 } 202 destAttrs.put(key, value); 203 } else 204 numDiscarded++; 205 } 206 207 208 Attributes enforcedAttrs = safelist.getEnforcedAttributes(sourceTag); 209 // special case for <a href rel=nofollow>, only apply to external links: 210 if (sourceEl.nameIs("a") && enforcedAttrs.get("rel").equals("nofollow")) { 211 String href = sourceEl.absUrl("href"); 212 String sourceBase = sourceEl.baseUri(); 213 if (!href.isEmpty() && !sourceBase.isEmpty() && href.startsWith(sourceBase)) { // same site, so don't set the nofollow 214 enforcedAttrs.remove("rel"); 215 } 216 } 217 218 // apply enforced attributes case-insensitively, so a preserved-case source attr is canonicalized to the enforced key 219 for (Attribute enforcedAttr : enforcedAttrs) { 220 destAttrs.removeIgnoreCase(enforcedAttr.getKey()); 221 destAttrs.put(enforcedAttr.getKey(), enforcedAttr.getValue()); 222 } 223 dest.attributes().addAll(destAttrs); // re-attach, if removed in clear 224 return new ElementMeta(dest, numDiscarded); 225 } 226 227 private static class ElementMeta { 228 Element el; 229 int numAttribsDiscarded; 230 231 ElementMeta(Element el, int numAttribsDiscarded) { 232 this.el = el; 233 this.numAttribsDiscarded = numAttribsDiscarded; 234 } 235 } 236 237}