001package org.jsoup.helper; 002 003import org.jsoup.internal.SharedConstants; 004 005import java.lang.reflect.InvocationTargetException; 006import java.lang.reflect.Method; 007import java.util.regex.Pattern; 008import java.util.regex.PatternSyntaxException; 009 010/** 011 A regular expression abstraction. Allows jsoup to optionally use the re2j regular expression engine (linear time) 012 instead of the JDK's backtracking regex implementation. 013 014 <p>If the {@code com.google.re2j} library is found on the classpath, by default it will be used. You can override this 015 by setting {@code -Djsoup.useRe2j=false} to explicitly disable, and use the JDK regex engine.</p> 016 017 <p>(Currently this a simplified implementation for jsoup's specific use; can extend as required.)</p> 018 */ 019public class Regex { 020 private static final boolean hasRe2j = hasRe2j(); 021 022 private final Pattern jdkPattern; 023 024 Regex(Pattern jdkPattern) { 025 this.jdkPattern = jdkPattern; 026 } 027 028 /** 029 Compile a regex, using re2j if enabled and available; otherwise JDK regex. 030 031 @param regex the regex to compile 032 @return the compiled regex 033 @throws ValidationException if the regex is invalid 034 */ 035 public static Regex compile(String regex) { 036 if (usingRe2j()) { 037 return Re2jRegex.compile(regex); 038 } 039 040 try { 041 return new Regex(Pattern.compile(regex)); 042 } catch (PatternSyntaxException e) { 043 throw new ValidationException("Pattern syntax error: " + e.getMessage()); 044 } 045 } 046 047 /** Wraps an existing JDK Pattern (for API compat); doesn't switch */ 048 public static Regex fromPattern(Pattern pattern) { 049 return new Regex(pattern); 050 } 051 052 /** 053 Checks if re2j is available (on classpath) and enabled (via system property). 054 @return true if re2j is available and enabled 055 */ 056 public static boolean usingRe2j() { 057 return hasRe2j && wantsRe2j(); 058 } 059 060 static boolean wantsRe2j() { 061 return Boolean.parseBoolean(System.getProperty(SharedConstants.UseRe2j, "true")); 062 } 063 064 static void wantsRe2j(boolean use) { 065 System.setProperty(SharedConstants.UseRe2j, Boolean.toString(use)); 066 } 067 068 static boolean hasRe2j() { 069 try { 070 Class<?> re2 = Class.forName("com.google.re2j.Pattern", false, Regex.class.getClassLoader()); // check if re2j is in classpath 071 try { 072 // if it is, and we are on JVM9+, we need to dork around with modules, because re2j doesn't publish a module name. 073 // done via reflection so we can still run on JVM 8. 074 // todo remove if re2j publishes as a module 075 Class<?> moduleCls = Class.forName("java.lang.Module"); 076 Method getModule = Class.class.getMethod("getModule"); 077 Object jsoupMod = getModule.invoke(Regex.class); 078 Object re2Mod = getModule.invoke(re2); 079 boolean reads = (boolean) moduleCls.getMethod("canRead", moduleCls).invoke(jsoupMod, re2Mod); 080 if (!reads) moduleCls.getMethod("addReads", moduleCls).invoke(jsoupMod, re2Mod); 081 } catch (ClassNotFoundException ignore) { 082 // jvm8 - no Module class; so we can use as-is 083 } 084 return true; 085 } catch (ClassNotFoundException e) { 086 return false; // no re2j 087 } catch (ReflectiveOperationException e) { 088 // unexpectedly couldn’t wire modules on 9+; return false to avoid IllegalAccessError later 089 System.err.println("Warning: (bug? please report) couldn't access re2j from jsoup due to modules: " + e); 090 return false; 091 } 092 } 093 094 public Matcher matcher(CharSequence input) { 095 return new JdkMatcher(jdkPattern.matcher(input)); 096 } 097 098 @Override 099 public String toString() { 100 return jdkPattern.toString(); 101 } 102 103 public interface Matcher { 104 boolean find(); 105 } 106 107 private static final class JdkMatcher implements Matcher { 108 private final java.util.regex.Matcher delegate; 109 110 JdkMatcher(java.util.regex.Matcher delegate) { 111 this.delegate = delegate; 112 } 113 114 @Override 115 public boolean find() { 116 return delegate.find(); 117 } 118 } 119}