001//////////////////////////////////////////////////////////////////////////////// 002// checkstyle: Checks Java source code for adherence to a set of rules. 003// Copyright (C) 2001-2016 the original author or authors. 004// 005// This library is free software; you can redistribute it and/or 006// modify it under the terms of the GNU Lesser General Public 007// License as published by the Free Software Foundation; either 008// version 2.1 of the License, or (at your option) any later version. 009// 010// This library is distributed in the hope that it will be useful, 011// but WITHOUT ANY WARRANTY; without even the implied warranty of 012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 013// Lesser General Public License for more details. 014// 015// You should have received a copy of the GNU Lesser General Public 016// License along with this library; if not, write to the Free Software 017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 018//////////////////////////////////////////////////////////////////////////////// 019 020package com.puppycrawl.tools.checkstyle.checks; 021 022import java.util.regex.Matcher; 023import java.util.regex.Pattern; 024 025import com.puppycrawl.tools.checkstyle.api.AbstractCheck; 026import com.puppycrawl.tools.checkstyle.api.DetailAST; 027import com.puppycrawl.tools.checkstyle.api.TokenTypes; 028 029/** 030 * <p> 031 * Restrict using <a href = 032 * "http://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3"> 033 * Unicode escapes</a> (e.g. \u221e). 034 * It is possible to allow using escapes for 035 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters"> 036 * non-printable(control) characters</a>. 037 * Also, this check can be configured to allow using escapes 038 * if trail comment is present. By the option it is possible to 039 * allow using escapes if literal contains only them. By the option it 040 * is possible to allow using escapes for space literals. 
 * </p>
 * <p>
 * Examples of using Unicode:</p>
 * <pre>
 * String unitAbbrev = "μs"; //Best: perfectly clear even without a comment.
 * String unitAbbrev = "\u03bcs"; //Poor: the reader has no idea what this is.
 * </pre>
 * <p>
 * An example of how to configure the check is:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
 * </pre>
 * <p>
 * An example of a non-printable (control) character:
 * </p>
 * <pre>
 * return '\ufeff' + content; // byte order mark
 * </pre>
 * <p>
 * An example of how to configure the check to allow using escapes
 * for non-printable (control) characters:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>
 * Example of using escapes with a trailing comment:
 * </p>
 * <pre>
 * String unitAbbrev = "\u03bcs"; // Greek letter mu, "s"
 * </pre>
 * <p>An example of how to configure the check to allow using escapes
 * if a trailing comment is present:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowByTailComment" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>Example of using escapes when the literal contains only escapes:
 * </p>
 * <pre>
 * String unitAbbrev = "\u03bc\u03bc\u03bc";
 * </pre>
 * <p>An example of how to configure the check to allow escapes
 * if the literal contains only escapes:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>An example of how to configure the check to allow non-printable escapes:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 *
 * @author maxvetrenko
 *
 */
107public class AvoidEscapedUnicodeCharactersCheck 108 extends AbstractCheck { 109 /** 110 * A key is pointing to the warning message text in "messages.properties" 111 * file. 112 */ 113 public static final String MSG_KEY = "forbid.escaped.unicode.char"; 114 115 /** Regular expression for Unicode chars. */ 116 private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}"); 117 118 /** Regular expression Unicode control characters. */ 119 private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\(u|U)" 120 + "(00[0-1][0-1A-Fa-f]|00[8-9][0-9A-Fa-f]|034(f|F)|070(f|F)" 121 + "|180(e|E)|200[b-fB-F]|202[b-eB-E]|206[0-4a-fA-F]" 122 + "|[fF]{3}[9a-bA-B]|[fF][eE][fF]{2})"); 123 124 /** Regular expression for trail comment. */ 125 private static final Pattern COMMENT_REGEXP = Pattern.compile(";[ ]*//+" 126 + "[a-zA-Z0-9 ]*|;[ ]*/[*]+[a-zA-Z0-9 ]*"); 127 128 /** Regular expression for all escaped chars. */ 129 private static final Pattern ALL_ESCAPED_CHARS = 130 Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}" 131 + "||\\\\b|\\\\t|\\\\n|\\\\f|\\\\r|\\\\|\"|\')+$"); 132 133 /** Regular expression for non-printable unicode chars. 
*/ 134 private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u1680|\\\\u2028" 135 + "|\\\\u2029|\\\\u205(f|F)|\\\\u3000|\\\\u2007|\\\\u2000|\\\\u200(a|A)" 136 + "|\\\\u007(F|f)|\\\\u009(f|F)|\\\\u(f|F){4}|\\\\u007(F|f)|\\\\u00(a|A)(d|D)" 137 + "|\\\\u0600|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)" 138 + "|\\\\u2000|\\\\u2028|\\\\u205(f|F)|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069" 139 + "|\\\\u206(a|A)|\\\\u(d|D)800|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9" 140 + "|\\\\u(f|F){3}(a|A)|\\\\u0020|\\\\u00(a|A)0|\\\\u00(a|A)(d|D)|\\\\u0604" 141 + "|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)|\\\\u200(f|F)" 142 + "|\\\\u202(f|F)|\\\\u2064|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069|\\\\u206(f|F)" 143 + "|\\\\u(f|F)8(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9|\\\\u(f|F){3}(b|B)" 144 + "|\\\\u05(d|D)0|\\\\u05(f|F)3|\\\\u0600|\\\\u0750|\\\\u0(e|E)00|\\\\u1(e|E)00" 145 + "|\\\\u2100|\\\\u(f|F)(b|B)50|\\\\u(f|F)(e|E)70|\\\\u(F|f){2}61|\\\\u04(f|F)9" 146 + "|\\\\u05(b|B)(e|E)|\\\\u05(e|E)(a|A)|\\\\u05(f|F)4|\\\\u06(f|F){2}" 147 + "|\\\\u077(f|F)|\\\\u0(e|E)7(f|F)|\\\\u20(a|A)(f|F)|\\\\u213(a|A)|\\\\u0000" 148 + "|\\\\u(f|F)(d|D)(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){2}(d|D)(c|C)" 149 + "|\\\\u2002|\\\\u0085|\\\\u200(a|A)|\\\\u2005|\\\\u2000|\\\\u2029|\\\\u000(B|b)" 150 + "|\\\\u2008|\\\\u2003|\\\\u205(f|F)|\\\\u1680|\\\\u0009|\\\\u0020|\\\\u2006" 151 + "|\\\\u2001|\\\\u202(f|F)|\\\\u00(a|A)0|\\\\u000(c|C)|\\\\u2009|\\\\u2004|\\\\u2028" 152 + "|\\\\u2028|\\\\u2007|\\\\u2004|\\\\u2028|\\\\u2007|\\\\u2025" 153 + "|\\\\u(f|F){2}0(e|E)|\\\\u(f|F){2}61"); 154 155 /** Allow use escapes for non-printable(control) characters. */ 156 private boolean allowEscapesForControlCharacters; 157 158 /** Allow use escapes if trail comment is present. */ 159 private boolean allowByTailComment; 160 161 /** Allow if all characters in literal are escaped. 
*/ 162 private boolean allowIfAllCharactersEscaped; 163 164 /** Allow escapes for space literals. */ 165 private boolean allowNonPrintableEscapes; 166 167 /** 168 * Set allowIfAllCharactersEscaped. 169 * @param allow user's value. 170 */ 171 public final void setAllowEscapesForControlCharacters(boolean allow) { 172 allowEscapesForControlCharacters = allow; 173 } 174 175 /** 176 * Set allowByTailComment. 177 * @param allow user's value. 178 */ 179 public final void setAllowByTailComment(boolean allow) { 180 allowByTailComment = allow; 181 } 182 183 /** 184 * Set allowIfAllCharactersEscaped. 185 * @param allow user's value. 186 */ 187 public final void setAllowIfAllCharactersEscaped(boolean allow) { 188 allowIfAllCharactersEscaped = allow; 189 } 190 191 /** 192 * Set allowSpaceEscapes. 193 * @param allow user's value. 194 */ 195 public final void setAllowNonPrintableEscapes(boolean allow) { 196 allowNonPrintableEscapes = allow; 197 } 198 199 @Override 200 public int[] getDefaultTokens() { 201 return getAcceptableTokens(); 202 } 203 204 @Override 205 public int[] getAcceptableTokens() { 206 return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL}; 207 } 208 209 @Override 210 public int[] getRequiredTokens() { 211 return getAcceptableTokens(); 212 } 213 214 @Override 215 public void visitToken(DetailAST ast) { 216 217 final String literal = ast.getText(); 218 219 if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast) 220 || isAllCharactersEscaped(literal) 221 || allowEscapesForControlCharacters 222 && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL) 223 || allowNonPrintableEscapes 224 && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) { 225 log(ast.getLineNo(), MSG_KEY); 226 } 227 } 228 229 /** 230 * Checks if literal has Unicode chars. 231 * @param literal String literal. 232 * @return true if literal has Unicode chars. 
233 */ 234 private static boolean hasUnicodeChar(String literal) { 235 return UNICODE_REGEXP.matcher(literal).find(); 236 } 237 238 /** 239 * Check if String literal contains Unicode control chars. 240 * @param literal String literal. 241 * @param pattern RegExp for valid characters. 242 * @return true, if String literal contains Unicode control chars. 243 */ 244 private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) { 245 final int unicodeMatchesCounter = 246 countMatches(UNICODE_REGEXP, literal); 247 final int unicodeValidMatchesCounter = 248 countMatches(pattern, literal); 249 return unicodeMatchesCounter - unicodeValidMatchesCounter == 0; 250 } 251 252 /** 253 * Check if trail comment is present after ast token. 254 * @param ast current token. 255 * @return true if trail comment is present after ast token. 256 */ 257 private boolean hasTrailComment(DetailAST ast) { 258 final DetailAST variableDef = getVariableDef(ast); 259 DetailAST semi; 260 261 if (variableDef == null) { 262 semi = getSemi(ast); 263 } 264 else { 265 semi = variableDef.getNextSibling(); 266 267 if (semi.getType() != TokenTypes.SEMI) { 268 semi = variableDef.getLastChild(); 269 } 270 } 271 272 boolean result = false; 273 if (semi != null) { 274 final int lineNo = semi.getLineNo(); 275 final String currentLine = getLine(lineNo - 1); 276 277 if (COMMENT_REGEXP.matcher(currentLine).find()) { 278 result = true; 279 } 280 } 281 282 return result; 283 } 284 285 /** 286 * Count regexp matches into String literal. 287 * @param pattern pattern. 288 * @param target String literal. 289 * @return count of regexp matches. 290 */ 291 private static int countMatches(Pattern pattern, String target) { 292 int matcherCounter = 0; 293 final Matcher matcher = pattern.matcher(target); 294 while (matcher.find()) { 295 matcherCounter++; 296 } 297 return matcherCounter; 298 } 299 300 /** 301 * Get variable definition. 302 * @param ast current token. 303 * @return variable definition. 
304 */ 305 private static DetailAST getVariableDef(DetailAST ast) { 306 DetailAST result = ast.getParent(); 307 while (result != null 308 && result.getType() != TokenTypes.VARIABLE_DEF) { 309 result = result.getParent(); 310 } 311 return result; 312 } 313 314 /** 315 * Get semi token. 316 * @param ast current token. 317 * @return semi token or null. 318 */ 319 private static DetailAST getSemi(DetailAST ast) { 320 DetailAST result = ast.getParent(); 321 while (result != null 322 && result.getLastChild().getType() != TokenTypes.SEMI) { 323 result = result.getParent(); 324 } 325 if (result != null) { 326 result = result.getLastChild(); 327 } 328 return result; 329 } 330 331 /** 332 * Checks if all characters in String literal is escaped. 333 * @param literal current literal. 334 * @return true if all characters in String literal is escaped. 335 */ 336 private boolean isAllCharactersEscaped(String literal) { 337 return allowIfAllCharactersEscaped 338 && ALL_ESCAPED_CHARS.matcher(literal.substring(1, 339 literal.length() - 1)).find(); 340 } 341}