////////////////////////////////////////////////////////////////////////////////
// checkstyle: Checks Java source code for adherence to a set of rules.
// Copyright (C) 2001-2020 the original author or authors.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
////////////////////////////////////////////////////////////////////////////////

package com.puppycrawl.tools.checkstyle.checks;

import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
import com.puppycrawl.tools.checkstyle.api.DetailAST;
import com.puppycrawl.tools.checkstyle.api.TextBlock;
import com.puppycrawl.tools.checkstyle.api.TokenTypes;
import com.puppycrawl.tools.checkstyle.utils.CommonUtil;

/**
 * <p>
 * Restricts using
 * <a href = "https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3">
 * Unicode escapes</a>
 * (such as <code>&#92;u221e</code>). It is possible to allow using escapes for
 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
 * non-printable, control characters</a>.
 * Also, this check can be configured to allow using escapes
 * if trail comment is present. By the option it is possible to
 * allow using escapes if literal contains only them.
 * </p>
 * <ul>
 * <li>
 * Property {@code allowEscapesForControlCharacters} - Allow use escapes for
 * non-printable, control characters.
 * Type is {@code boolean}.
 * Default value is {@code false}.
 * </li>
 * <li>
 * Property {@code allowByTailComment} - Allow use escapes if trail comment is present.
 * Type is {@code boolean}.
 * Default value is {@code false}.
 * </li>
 * <li>
 * Property {@code allowIfAllCharactersEscaped} - Allow if all characters in literal are escaped.
 * Type is {@code boolean}.
 * Default value is {@code false}.
 * </li>
 * <li>
 * Property {@code allowNonPrintableEscapes} - Allow use escapes for
 * non-printable, whitespace characters.
 * Type is {@code boolean}.
 * Default value is {@code false}.
 * </li>
 * </ul>
 * <p>
 * To configure the check:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
 * </pre>
 * <p>
 * Examples of using Unicode:</p>
 * <pre>
 * String unitAbbrev = "μs";      // Best: perfectly clear even without a comment.
 * String unitAbbrev = "&#92;u03bcs"; // Poor: the reader has no idea what this is.
 * </pre>
 * <p>
 * An example of non-printable, control characters.
 * </p>
 * <pre>
 * return '&#92;ufeff' + content; // byte order mark
 * </pre>
 * <p>
 * An example of how to configure the check to allow using escapes
 * for non-printable, control characters:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *   &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>
 * Example of using escapes with trail comment:
 * </p>
 * <pre>
 * String unitAbbrev = "&#92;u03bcs"; // Greek letter mu, "s"
 * String textBlockUnitAbbrev = """
 *          &#92;u03bcs"""; // Greek letter mu, "s"
 * </pre>
 * <p>An example of how to configure the check to allow using escapes
 * if trail comment is present:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *   &lt;property name="allowByTailComment" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>Example of using escapes if literal contains only them:
 * </p>
 * <pre>
 * String unitAbbrev = "&#92;u03bc&#92;u03bc&#92;u03bc";
 * </pre>
 * <p>An example of how to configure the check to allow escapes
 * if literal contains only them:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *   &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>An example of how to configure the check to allow using escapes
 * for non-printable, whitespace characters:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *   &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>
 * Parent is {@code com.puppycrawl.tools.checkstyle.TreeWalker}
 * </p>
 * <p>
 * Violation Message Keys:
 * </p>
 * <ul>
 * <li>
 * {@code forbid.escaped.unicode.char}
 * </li>
 * </ul>
 *
 * @since 5.8
 */
@FileStatefulCheck
public class AvoidEscapedUnicodeCharactersCheck
    extends AbstractCheck {

    /**
     * A key is pointing to the warning message text in "messages.properties"
     * file.
     */
    public static final String MSG_KEY = "forbid.escaped.unicode.char";

    /** Regular expression for Unicode chars. */
    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}");

    /**
     * Regular expression Unicode control characters.
     *
     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
     *     Appendix:Control characters</a>
     */
    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\[uU]"
            + "(00[0-1][0-9A-Fa-f]"
            + "|00[8-9][0-9A-Fa-f]"
            + "|00[aA][dD]"
            + "|034[fF]"
            + "|070[fF]"
            + "|180[eE]"
            + "|200[b-fB-F]"
            + "|202[a-eA-E]"
            + "|206[0-4a-fA-F]"
            + "|[fF]{3}[9a-bA-B]"
            + "|[fF][eE][fF]{2})");

    /**
     * Regular expression for all escaped chars.
     * See "EscapeSequence" at
     * https://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.10.6
     *
     * <p>The pattern is applied to the complete token text. Quote characters are
     * accepted so that the delimiters of string and char literals do not need to be
     * removed first, and the linebreak matcher accepts the raw line terminators that
     * occur inside text block content.
     */
    private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}"
            + "|\""
            + "|'"
            + "|\\\\"
            + "|\\\\b"
            + "|\\\\f"
            + "|\\\\n"
            + "|\\R"
            + "|\\\\r"
            + "|\\\\t"
            + ")+$");

    /** Regular expression for escaped backslash. */
    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");

    /** Regular expression for non-printable unicode chars. */
    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000"
            + "|\\\\u0009"
            + "|\\\\u000[bB]"
            + "|\\\\u000[cC]"
            + "|\\\\u0020"
            + "|\\\\u007[fF]"
            + "|\\\\u0085"
            + "|\\\\u009[fF]"
            + "|\\\\u00[aA]0"
            + "|\\\\u00[aA][dD]"
            + "|\\\\u04[fF]9"
            + "|\\\\u05[bB][eE]"
            + "|\\\\u05[dD]0"
            + "|\\\\u05[eE][aA]"
            + "|\\\\u05[fF]3"
            + "|\\\\u05[fF]4"
            + "|\\\\u0600"
            + "|\\\\u0604"
            + "|\\\\u061[cC]"
            + "|\\\\u06[dD]{2}"
            + "|\\\\u06[fF]{2}"
            + "|\\\\u070[fF]"
            + "|\\\\u0750"
            + "|\\\\u077[fF]"
            + "|\\\\u0[eE]00"
            + "|\\\\u0[eE]7[fF]"
            + "|\\\\u1680"
            + "|\\\\u180[eE]"
            + "|\\\\u1[eE]00"
            + "|\\\\u2000"
            + "|\\\\u2001"
            + "|\\\\u2002"
            + "|\\\\u2003"
            + "|\\\\u2004"
            + "|\\\\u2005"
            + "|\\\\u2006"
            + "|\\\\u2007"
            + "|\\\\u2008"
            + "|\\\\u2009"
            + "|\\\\u200[aA]"
            + "|\\\\u200[fF]"
            + "|\\\\u2025"
            + "|\\\\u2028"
            + "|\\\\u2029"
            + "|\\\\u202[fF]"
            + "|\\\\u205[fF]"
            + "|\\\\u2064"
            + "|\\\\u2066"
            + "|\\\\u2067"
            + "|\\\\u2068"
            + "|\\\\u2069"
            + "|\\\\u206[aA]"
            + "|\\\\u206[fF]"
            + "|\\\\u20[aA][fF]"
            + "|\\\\u2100"
            + "|\\\\u213[aA]"
            + "|\\\\u3000"
            + "|\\\\u[dD]800"
            + "|\\\\u[fF]8[fF]{2}"
            + "|\\\\u[fF][bB]50"
            + "|\\\\u[fF][dD][fF]{2}"
            + "|\\\\u[fF][eE]70"
            + "|\\\\u[fF][eE][fF]{2}"
            + "|\\\\u[fF]{2}0[eE]"
            + "|\\\\u[fF]{2}61"
            + "|\\\\u[fF]{2}[dD][cC]"
            + "|\\\\u[fF]{3}9"
            + "|\\\\u[fF]{3}[aA]"
            + "|\\\\u[fF]{3}[bB]"
            + "|\\\\u[fF]{4}");

    /** Cpp style comments. */
    private Map<Integer, TextBlock> singlelineComments;
    /** C style comments. */
    private Map<Integer, List<TextBlock>> blockComments;

    /** Allow use escapes for non-printable, control characters. */
    private boolean allowEscapesForControlCharacters;

    /** Allow use escapes if trail comment is present. */
    private boolean allowByTailComment;

    /** Allow if all characters in literal are escaped. */
    private boolean allowIfAllCharactersEscaped;

    /** Allow use escapes for non-printable, whitespace characters. */
    private boolean allowNonPrintableEscapes;

    /**
     * Setter to allow use escapes for non-printable, control characters.
     *
     * @param allow user's value.
     */
    public final void setAllowEscapesForControlCharacters(boolean allow) {
        allowEscapesForControlCharacters = allow;
    }

    /**
     * Setter to allow use escapes if trail comment is present.
     *
     * @param allow user's value.
     */
    public final void setAllowByTailComment(boolean allow) {
        allowByTailComment = allow;
    }

    /**
     * Setter to allow if all characters in literal are escaped.
     *
     * @param allow user's value.
     */
    public final void setAllowIfAllCharactersEscaped(boolean allow) {
        allowIfAllCharactersEscaped = allow;
    }

    /**
     * Setter to allow use escapes for non-printable, whitespace characters.
     *
     * @param allow user's value.
     */
    public final void setAllowNonPrintableEscapes(boolean allow) {
        allowNonPrintableEscapes = allow;
    }

    @Override
    public int[] getDefaultTokens() {
        return getRequiredTokens();
    }

    @Override
    public int[] getAcceptableTokens() {
        return getRequiredTokens();
    }

    @Override
    public int[] getRequiredTokens() {
        return new int[] {
            TokenTypes.STRING_LITERAL,
            TokenTypes.CHAR_LITERAL,
            TokenTypes.TEXT_BLOCK_CONTENT,
        };
    }

    @Override
    public void beginTree(DetailAST rootAST) {
        // Comment positions are looked up per token in hasTrailComment.
        singlelineComments = getFileContents().getSingleLineComments();
        blockComments = getFileContents().getBlockComments();
    }

    @Override
    public void visitToken(DetailAST ast) {
        final String literal = ast.getText();

        // A violation is logged when the literal contains a Unicode escape and
        // none of the enabled exemptions applies to it.
        if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
                || isAllCharactersEscaped(literal)
                || allowEscapesForControlCharacters
                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
                || allowNonPrintableEscapes
                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
            log(ast, MSG_KEY);
        }
    }

    /**
     * Checks if literal has Unicode chars.
     *
     * @param literal String literal.
     * @return true if literal has Unicode chars.
     */
    private static boolean hasUnicodeChar(String literal) {
        return UNICODE_REGEXP.matcher(removeEscapedBackslashes(literal)).find();
    }

    /**
     * Checks whether every Unicode escape found in the literal also matches the
     * given pattern of permitted escapes.
     *
     * <p>Escaped backslashes are removed before counting, the same way
     * {@link #hasUnicodeChar(String)} does it, so that an escaped backslash directly
     * followed by the letter u and four hex digits is not counted as a Unicode escape.
     *
     * @param literal String literal.
     * @param pattern RegExp for valid characters.
     * @return true, if the number of Unicode escapes in the literal equals the number
     *     of matches of {@code pattern}.
     */
    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
        final String withoutEscapedBackslashes = removeEscapedBackslashes(literal);
        final int unicodeMatchesCounter =
                countMatches(UNICODE_REGEXP, withoutEscapedBackslashes);
        final int unicodeValidMatchesCounter =
                countMatches(pattern, withoutEscapedBackslashes);
        return unicodeMatchesCounter == unicodeValidMatchesCounter;
    }

    /**
     * Removes every escaped backslash (a pair of backslashes) from the literal.
     *
     * @param literal String literal.
     * @return the literal text without escaped backslashes.
     */
    private static String removeEscapedBackslashes(String literal) {
        return ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
    }

    /**
     * Check if trail comment is present after ast token.
     *
     * @param ast current token.
     * @return true if trail comment is present after ast token.
     */
    private boolean hasTrailComment(DetailAST ast) {
        int lineNo = ast.getLineNo();

        // Since the trailing comment in the case of text blocks must follow the """ delimiter,
        // we need to look for it after TEXT_BLOCK_LITERAL_END.
        if (ast.getType() == TokenTypes.TEXT_BLOCK_CONTENT) {
            lineNo = ast.getNextSibling().getLineNo();
        }
        boolean result = false;
        if (singlelineComments.containsKey(lineNo)) {
            result = true;
        }
        else {
            final List<TextBlock> commentList = blockComments.get(lineNo);
            if (commentList != null) {
                // Only the last block comment starting on the line is examined.
                final TextBlock comment = commentList.get(commentList.size() - 1);
                final String line = getLines()[lineNo - 1];
                result = isTrailingBlockComment(comment, line);
            }
        }
        return result;
    }

    /**
     * Whether the C style comment is trailing.
     *
     * @param comment the comment to check.
     * @param line the line where the comment starts.
     * @return true if the comment spans more than one line, or if nothing but
     *     whitespace follows it on its line.
     */
    private static boolean isTrailingBlockComment(TextBlock comment, String line) {
        return comment.getText().length != 1
            || CommonUtil.isBlank(line.substring(comment.getEndColNo() + 1));
    }

    /**
     * Count regexp matches into String literal.
     *
     * @param pattern pattern.
     * @param target String literal.
     * @return count of regexp matches.
     */
    private static int countMatches(Pattern pattern, String target) {
        int matcherCounter = 0;
        final Matcher matcher = pattern.matcher(target);
        while (matcher.find()) {
            matcherCounter++;
        }
        return matcherCounter;
    }

    /**
     * Checks if all characters in String literal is escaped.
     *
     * <p>The whole token text is matched. Text block content carries no delimiters,
     * so cutting off the first and the last character would drop real content there;
     * the quotes surrounding string and char literals are accepted by the pattern itself.
     *
     * @param literal current literal.
     * @return true if all characters in String literal is escaped.
     */
    private boolean isAllCharactersEscaped(String literal) {
        return allowIfAllCharactersEscaped
                && ALL_ESCAPED_CHARS.matcher(literal).find();
    }

}