001//////////////////////////////////////////////////////////////////////////////// 002// checkstyle: Checks Java source code for adherence to a set of rules. 003// Copyright (C) 2001-2021 the original author or authors. 004// 005// This library is free software; you can redistribute it and/or 006// modify it under the terms of the GNU Lesser General Public 007// License as published by the Free Software Foundation; either 008// version 2.1 of the License, or (at your option) any later version. 009// 010// This library is distributed in the hope that it will be useful, 011// but WITHOUT ANY WARRANTY; without even the implied warranty of 012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 013// Lesser General Public License for more details. 014// 015// You should have received a copy of the GNU Lesser General Public 016// License along with this library; if not, write to the Free Software 017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 018//////////////////////////////////////////////////////////////////////////////// 019 020package com.puppycrawl.tools.checkstyle.checks; 021 022import java.util.List; 023import java.util.Map; 024import java.util.regex.Matcher; 025import java.util.regex.Pattern; 026 027import com.puppycrawl.tools.checkstyle.FileStatefulCheck; 028import com.puppycrawl.tools.checkstyle.api.AbstractCheck; 029import com.puppycrawl.tools.checkstyle.api.DetailAST; 030import com.puppycrawl.tools.checkstyle.api.TextBlock; 031import com.puppycrawl.tools.checkstyle.api.TokenTypes; 032import com.puppycrawl.tools.checkstyle.utils.CheckUtil; 033import com.puppycrawl.tools.checkstyle.utils.CommonUtil; 034 035/** 036 * <p> 037 * Restricts using 038 * <a href = "https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3"> 039 * Unicode escapes</a> 040 * (such as \u221e). It is possible to allow using escapes for 041 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters"> 042 * non-printable, control characters</a>. 043 * Also, this check can be configured to allow using escapes 044 * if trail comment is present. By the option it is possible to 045 * allow using escapes if literal contains only them. 046 * </p> 047 * <ul> 048 * <li> 049 * Property {@code allowEscapesForControlCharacters} - Allow use escapes for 050 * non-printable, control characters. 051 * Type is {@code boolean}. 052 * Default value is {@code false}. 053 * </li> 054 * <li> 055 * Property {@code allowByTailComment} - Allow use escapes if trail comment is present. 056 * Type is {@code boolean}. 057 * Default value is {@code false}. 058 * </li> 059 * <li> 060 * Property {@code allowIfAllCharactersEscaped} - Allow if all characters in literal are escaped. 061 * Type is {@code boolean}. 062 * Default value is {@code false}. 063 * </li> 064 * <li> 065 * Property {@code allowNonPrintableEscapes} - Allow use escapes for 066 * non-printable, whitespace characters. 067 * Type is {@code boolean}. 068 * Default value is {@code false}. 069 * </li> 070 * </ul> 071 * <p> 072 * To configure the check: 073 * </p> 074 * <pre> 075 * <module name="AvoidEscapedUnicodeCharacters"/> 076 * </pre> 077 * <p> 078 * Examples of using Unicode:</p> 079 * <pre> 080 * String unitAbbrev = "μs"; // OK, perfectly clear even without a comment. 081 * String unitAbbrev = "\u03bcs";// violation, the reader has no idea what this is. 082 * return '\ufeff' + content; // OK, an example of non-printable, 083 * // control characters (byte order mark). 084 * </pre> 085 * <p> 086 * An example of how to configure the check to allow using escapes 087 * for non-printable, control characters: 088 * </p> 089 * <pre> 090 * <module name="AvoidEscapedUnicodeCharacters"> 091 * <property name="allowEscapesForControlCharacters" value="true"/> 092 * </module> 093 * </pre> 094 * <p> 095 * Example of using escapes for non-printable, control characters: 096 * </p> 097 * <pre> 098 * String unitAbbrev = "μs"; // OK, a normal String 099 * String unitAbbrev = "\u03bcs"; // violation, "\u03bcs" is a printable character. 100 * return '\ufeff' + content; // OK, non-printable control character. 101 * </pre> 102 * <p> 103 * An example of how to configure the check to allow using escapes 104 * if trail comment is present: 105 * </p> 106 * <pre> 107 * <module name="AvoidEscapedUnicodeCharacters"> 108 * <property name="allowByTailComment" value="true"/> 109 * </module> 110 * </pre> 111 * <p>Example of using escapes if trail comment is present: 112 * </p> 113 * <pre> 114 * String unitAbbrev = "μs"; // OK, a normal String 115 * String unitAbbrev = "\u03bcs"; // OK, Greek letter mu, "s" 116 * return '\ufeff' + content; 117 * // -----^--------------------- violation, comment is not used within same line. 118 * </pre> 119 * <p> 120 * An example of how to configure the check to allow if 121 * all characters in literal are escaped. 122 * </p> 123 * <pre> 124 * <module name="AvoidEscapedUnicodeCharacters"> 125 * <property name="allowIfAllCharactersEscaped" value="true"/> 126 * </module> 127 * </pre> 128 * <p>Example of using escapes if all characters in literal are escaped:</p> 129 * <pre> 130 * String unitAbbrev = "μs"; // OK, a normal String 131 * String unitAbbrev = "\u03bcs"; // violation, not all characters are escaped ('s'). 132 * String unitAbbrev = "\u03bc\u03bc\u03bc"; // OK 133 * String unitAbbrev = "\u03bc\u03bcs";// violation, not all characters are escaped ('s'). 134 * return '\ufeff' + content; // OK, all control characters are escaped 135 * </pre> 136 * <p>An example of how to configure the check to allow using escapes 137 * for non-printable whitespace characters: 138 * </p> 139 * <pre> 140 * <module name="AvoidEscapedUnicodeCharacters"> 141 * <property name="allowNonPrintableEscapes" value="true"/> 142 * </module> 143 * </pre> 144 * <p>Example of using escapes for non-printable whitespace characters:</p> 145 * <pre> 146 * String unitAbbrev = "μs"; // OK, a normal String 147 * String unitAbbrev1 = "\u03bcs"; // violation, printable escape character. 148 * String unitAbbrev2 = "\u03bc\u03bc\u03bc"; // violation, printable escape character. 149 * String unitAbbrev3 = "\u03bc\u03bcs";// violation, printable escape character. 150 * return '\ufeff' + content; // OK, non-printable escape character. 151 * </pre> 152 * <p> 153 * Parent is {@code com.puppycrawl.tools.checkstyle.TreeWalker} 154 * </p> 155 * <p> 156 * Violation Message Keys: 157 * </p> 158 * <ul> 159 * <li> 160 * {@code forbid.escaped.unicode.char} 161 * </li> 162 * </ul> 163 * 164 * @since 5.8 165 */ 166@FileStatefulCheck 167public class AvoidEscapedUnicodeCharactersCheck 168 extends AbstractCheck { 169 170 /** 171 * A key is pointing to the warning message text in "messages.properties" 172 * file. 173 */ 174 public static final String MSG_KEY = "forbid.escaped.unicode.char"; 175 176 /** Regular expression for Unicode chars. */ 177 private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u+[a-fA-F0-9]{4}"); 178 179 /** 180 * Regular expression Unicode control characters. 181 * 182 * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters"> 183 * Appendix:Control characters</a> 184 */ 185 private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\u+" 186 + "(00[0-1][0-9A-Fa-f]" 187 + "|00[8-9][0-9A-Fa-f]" 188 + "|00[aA][dD]" 189 + "|034[fF]" 190 + "|070[fF]" 191 + "|180[eE]" 192 + "|200[b-fB-F]" 193 + "|202[a-eA-E]" 194 + "|206[0-4a-fA-F]" 195 + "|[fF]{3}[9a-bA-B]" 196 + "|[fF][eE][fF]{2})"); 197 198 /** 199 * Regular expression for all escaped chars. 200 * See "EscapeSequence" at 201 * https://docs.oracle.com/javase/specs/jls/se15/html/jls-3.html#jls-3.10.7 202 */ 203 private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^(" 204 + UNICODE_REGEXP.pattern() 205 + "|\"" 206 + "|'" 207 + "|\\\\" 208 + "|\\\\b" 209 + "|\\\\f" 210 + "|\\\\n" 211 + "|\\R" 212 + "|\\\\r" 213 + "|\\\\s" 214 + "|\\\\t" 215 + ")+$"); 216 217 /** Regular expression for escaped backslash. */ 218 private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\"); 219 220 /** Regular expression for non-printable unicode chars. */ 221 private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000" 222 + "|\\\\u0009" 223 + "|\\\\u000[bB]" 224 + "|\\\\u000[cC]" 225 + "|\\\\u0020" 226 + "|\\\\u007[fF]" 227 + "|\\\\u0085" 228 + "|\\\\u009[fF]" 229 + "|\\\\u00[aA]0" 230 + "|\\\\u00[aA][dD]" 231 + "|\\\\u04[fF]9" 232 + "|\\\\u05[bB][eE]" 233 + "|\\\\u05[dD]0" 234 + "|\\\\u05[eE][aA]" 235 + "|\\\\u05[fF]3" 236 + "|\\\\u05[fF]4" 237 + "|\\\\u0600" 238 + "|\\\\u0604" 239 + "|\\\\u061[cC]" 240 + "|\\\\u06[dD]{2}" 241 + "|\\\\u06[fF]{2}" 242 + "|\\\\u070[fF]" 243 + "|\\\\u0750" 244 + "|\\\\u077[fF]" 245 + "|\\\\u0[eE]00" 246 + "|\\\\u0[eE]7[fF]" 247 + "|\\\\u1680" 248 + "|\\\\u180[eE]" 249 + "|\\\\u1[eE]00" 250 + "|\\\\u2000" 251 + "|\\\\u2001" 252 + "|\\\\u2002" 253 + "|\\\\u2003" 254 + "|\\\\u2004" 255 + "|\\\\u2005" 256 + "|\\\\u2006" 257 + "|\\\\u2007" 258 + "|\\\\u2008" 259 + "|\\\\u2009" 260 + "|\\\\u200[aA]" 261 + "|\\\\u200[fF]" 262 + "|\\\\u2025" 263 + "|\\\\u2028" 264 + "|\\\\u2029" 265 + "|\\\\u202[fF]" 266 + "|\\\\u205[fF]" 267 + "|\\\\u2064" 268 + "|\\\\u2066" 269 + "|\\\\u2067" 270 + "|\\\\u2068" 271 + "|\\\\u2069" 272 + "|\\\\u206[aA]" 273 + "|\\\\u206[fF]" 274 + "|\\\\u20[aA][fF]" 275 + "|\\\\u2100" 276 + "|\\\\u213[aA]" 277 + "|\\\\u3000" 278 + "|\\\\u[dD]800" 279 + "|\\\\u[fF]8[fF]{2}" 280 + "|\\\\u[fF][bB]50" 281 + "|\\\\u[fF][dD][fF]{2}" 282 + "|\\\\u[fF][eE]70" 283 + "|\\\\u[fF][eE][fF]{2}" 284 + "|\\\\u[fF]{2}0[eE]" 285 + "|\\\\u[fF]{2}61" 286 + "|\\\\u[fF]{2}[dD][cC]" 287 + "|\\\\u[fF]{3}9" 288 + "|\\\\u[fF]{3}[aA]" 289 + "|\\\\u[fF]{3}[bB]" 290 + "|\\\\u[fF]{4}"); 291 292 /** Cpp style comments. */ 293 private Map<Integer, TextBlock> singlelineComments; 294 /** C style comments. */ 295 private Map<Integer, List<TextBlock>> blockComments; 296 297 /** Allow use escapes for non-printable, control characters. */ 298 private boolean allowEscapesForControlCharacters; 299 300 /** Allow use escapes if trail comment is present. */ 301 private boolean allowByTailComment; 302 303 /** Allow if all characters in literal are escaped. */ 304 private boolean allowIfAllCharactersEscaped; 305 306 /** Allow use escapes for non-printable, whitespace characters. */ 307 private boolean allowNonPrintableEscapes; 308 309 /** 310 * Setter to allow use escapes for non-printable, control characters. 311 * 312 * @param allow user's value. 313 */ 314 public final void setAllowEscapesForControlCharacters(boolean allow) { 315 allowEscapesForControlCharacters = allow; 316 } 317 318 /** 319 * Setter to allow use escapes if trail comment is present. 320 * 321 * @param allow user's value. 322 */ 323 public final void setAllowByTailComment(boolean allow) { 324 allowByTailComment = allow; 325 } 326 327 /** 328 * Setter to allow if all characters in literal are escaped. 329 * 330 * @param allow user's value. 331 */ 332 public final void setAllowIfAllCharactersEscaped(boolean allow) { 333 allowIfAllCharactersEscaped = allow; 334 } 335 336 /** 337 * Setter to allow use escapes for non-printable, whitespace characters. 338 * 339 * @param allow user's value. 340 */ 341 public final void setAllowNonPrintableEscapes(boolean allow) { 342 allowNonPrintableEscapes = allow; 343 } 344 345 @Override 346 public int[] getDefaultTokens() { 347 return getRequiredTokens(); 348 } 349 350 @Override 351 public int[] getAcceptableTokens() { 352 return getRequiredTokens(); 353 } 354 355 @Override 356 public int[] getRequiredTokens() { 357 return new int[] { 358 TokenTypes.STRING_LITERAL, 359 TokenTypes.CHAR_LITERAL, 360 TokenTypes.TEXT_BLOCK_CONTENT, 361 }; 362 } 363 364 @Override 365 public void beginTree(DetailAST rootAST) { 366 singlelineComments = getFileContents().getSingleLineComments(); 367 blockComments = getFileContents().getBlockComments(); 368 } 369 370 @Override 371 public void visitToken(DetailAST ast) { 372 final String literal = 373 CheckUtil.stripIndentAndInitialNewLineFromTextBlock(ast.getText()); 374 375 if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast) 376 || isAllCharactersEscaped(literal) 377 || allowEscapesForControlCharacters 378 && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL) 379 || allowNonPrintableEscapes 380 && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) { 381 log(ast, MSG_KEY); 382 } 383 } 384 385 /** 386 * Checks if literal has Unicode chars. 387 * 388 * @param literal String literal. 389 * @return true if literal has Unicode chars. 390 */ 391 private static boolean hasUnicodeChar(String literal) { 392 final String literalWithoutEscapedBackslashes = 393 ESCAPED_BACKSLASH.matcher(literal).replaceAll(""); 394 return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find(); 395 } 396 397 /** 398 * Check if String literal contains Unicode control chars. 399 * 400 * @param literal String literal. 401 * @param pattern RegExp for valid characters. 402 * @return true, if String literal contains Unicode control chars. 403 */ 404 private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) { 405 final int unicodeMatchesCounter = 406 countMatches(UNICODE_REGEXP, literal); 407 final int unicodeValidMatchesCounter = 408 countMatches(pattern, literal); 409 return unicodeMatchesCounter - unicodeValidMatchesCounter == 0; 410 } 411 412 /** 413 * Check if trail comment is present after ast token. 414 * 415 * @param ast current token. 416 * @return true if trail comment is present after ast token. 417 */ 418 private boolean hasTrailComment(DetailAST ast) { 419 int lineNo = ast.getLineNo(); 420 421 // Since the trailing comment in the case of text blocks must follow the """ delimiter, 422 // we need to look for it after TEXT_BLOCK_LITERAL_END. 423 if (ast.getType() == TokenTypes.TEXT_BLOCK_CONTENT) { 424 lineNo = ast.getNextSibling().getLineNo(); 425 } 426 boolean result = false; 427 if (singlelineComments.containsKey(lineNo)) { 428 result = true; 429 } 430 else { 431 final List<TextBlock> commentList = blockComments.get(lineNo); 432 if (commentList != null) { 433 final TextBlock comment = commentList.get(commentList.size() - 1); 434 final String line = getLines()[lineNo - 1]; 435 result = isTrailingBlockComment(comment, line); 436 } 437 } 438 return result; 439 } 440 441 /** 442 * Whether the C style comment is trailing. 443 * 444 * @param comment the comment to check. 445 * @param line the line where the comment starts. 446 * @return true if the comment is trailing. 447 */ 448 private static boolean isTrailingBlockComment(TextBlock comment, String line) { 449 return comment.getText().length != 1 450 || CommonUtil.isBlank(line.substring(comment.getEndColNo() + 1)); 451 } 452 453 /** 454 * Count regexp matches into String literal. 455 * 456 * @param pattern pattern. 457 * @param target String literal. 458 * @return count of regexp matches. 459 */ 460 private static int countMatches(Pattern pattern, String target) { 461 int matcherCounter = 0; 462 final Matcher matcher = pattern.matcher(target); 463 while (matcher.find()) { 464 matcherCounter++; 465 } 466 return matcherCounter; 467 } 468 469 /** 470 * Checks if all characters in String literal is escaped. 471 * 472 * @param literal current literal. 473 * @return true if all characters in String literal is escaped. 474 */ 475 private boolean isAllCharactersEscaped(String literal) { 476 return allowIfAllCharactersEscaped 477 && ALL_ESCAPED_CHARS.matcher(literal).find(); 478 } 479 480}