001////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code for adherence to a set of rules.
003// Copyright (C) 2001-2020 the original author or authors.
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018////////////////////////////////////////////////////////////////////////////////
019
020package com.puppycrawl.tools.checkstyle.checks;
021
022import java.util.List;
023import java.util.Map;
024import java.util.regex.Matcher;
025import java.util.regex.Pattern;
026
027import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
028import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
029import com.puppycrawl.tools.checkstyle.api.DetailAST;
030import com.puppycrawl.tools.checkstyle.api.TextBlock;
031import com.puppycrawl.tools.checkstyle.api.TokenTypes;
032import com.puppycrawl.tools.checkstyle.utils.CommonUtil;
033
034/**
035 * <p>
036 * Restricts using
037 * <a href = "https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3">
038 * Unicode escapes</a>
039 * (such as &#92;u221e). It is possible to allow using escapes for
040 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
041 * non-printable, control characters</a>.
042 * Also, this check can be configured to allow using escapes
043 * if trail comment is present. By the option it is possible to
044 * allow using escapes if literal contains only them.
045 * </p>
046 * <ul>
047 * <li>
048 * Property {@code allowEscapesForControlCharacters} - Allow use escapes for
049 * non-printable, control characters.
050 * Type is {@code boolean}.
051 * Default value is {@code false}.
052 * </li>
053 * <li>
054 * Property {@code allowByTailComment} - Allow use escapes if trail comment is present.
055 * Type is {@code boolean}.
056 * Default value is {@code false}.
057 * </li>
058 * <li>
059 * Property {@code allowIfAllCharactersEscaped} - Allow if all characters in literal are escaped.
060 * Type is {@code boolean}.
061 * Default value is {@code false}.
062 * </li>
063 * <li>
064 * Property {@code allowNonPrintableEscapes} - Allow use escapes for
065 * non-printable, whitespace characters.
066 * Type is {@code boolean}.
067 * Default value is {@code false}.
068 * </li>
069 * </ul>
070 * <p>
071 * To configure the check:
072 * </p>
073 * <pre>
074 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
075 * </pre>
076 * <p>
077 * Examples of using Unicode:</p>
078 * <pre>
079 * String unitAbbrev = "μs";      // Best: perfectly clear even without a comment.
080 * String unitAbbrev = "&#92;u03bcs"; // Poor: the reader has no idea what this is.
081 * </pre>
082 * <p>
083 * An example of non-printable, control characters.
084 * </p>
085 * <pre>
086 * return '&#92;ufeff' + content; // byte order mark
087 * </pre>
088 * <p>
089 * An example of how to configure the check to allow using escapes
090 * for non-printable, control characters:
091 * </p>
092 * <pre>
093 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
094 *   &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
095 * &lt;/module&gt;
096 * </pre>
097 * <p>
098 * Example of using escapes with trail comment:
099 * </p>
100 * <pre>
101 * String unitAbbrev = "&#92;u03bcs"; // Greek letter mu, "s"
102 * String textBlockUnitAbbrev = """
103 *          &#92;u03bcs"""; // Greek letter mu, "s"
104 * </pre>
105 * <p>An example of how to configure the check to allow using escapes
106 * if trail comment is present:
107 * </p>
108 * <pre>
109 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
110 *   &lt;property name="allowByTailComment" value="true"/&gt;
111 * &lt;/module&gt;
112 * </pre>
113 * <p>Example of using escapes if literal contains only them:
114 * </p>
115 * <pre>
116 * String unitAbbrev = "&#92;u03bc&#92;u03bc&#92;u03bc";
117 * </pre>
118 * <p>An example of how to configure the check to allow escapes
119 * if literal contains only them:
120 * </p>
121 * <pre>
122 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
123 *   &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
124 * &lt;/module&gt;
125 * </pre>
126 * <p>An example of how to configure the check to allow using escapes
127 * for non-printable, whitespace characters:
128 * </p>
129 * <pre>
130 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
131 *   &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
132 * &lt;/module&gt;
133 * </pre>
134 * <p>
135 * Parent is {@code com.puppycrawl.tools.checkstyle.TreeWalker}
136 * </p>
137 * <p>
138 * Violation Message Keys:
139 * </p>
140 * <ul>
141 * <li>
142 * {@code forbid.escaped.unicode.char}
143 * </li>
144 * </ul>
145 *
146 * @since 5.8
147 */
148@FileStatefulCheck
149public class AvoidEscapedUnicodeCharactersCheck
150    extends AbstractCheck {
151
152    /**
153     * A key is pointing to the warning message text in "messages.properties"
154     * file.
155     */
156    public static final String MSG_KEY = "forbid.escaped.unicode.char";
157
158    /** Regular expression for Unicode chars. */
159    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}");
160
161    /**
162     * Regular expression Unicode control characters.
163     *
164     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
165     *     Appendix:Control characters</a>
166     */
167    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\[uU]"
168            + "(00[0-1][0-9A-Fa-f]"
169            + "|00[8-9][0-9A-Fa-f]"
170            + "|00[aA][dD]"
171            + "|034[fF]"
172            + "|070[fF]"
173            + "|180[eE]"
174            + "|200[b-fB-F]"
175            + "|202[a-eA-E]"
176            + "|206[0-4a-fA-F]"
177            + "|[fF]{3}[9a-bA-B]"
178            + "|[fF][eE][fF]{2})");
179
180    /**
181     * Regular expression for all escaped chars.
182     * See "EscapeSequence" at
183     * https://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.10.6
184     */
185    private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}"
186            + "|\""
187            + "|'"
188            + "|\\\\"
189            + "|\\\\b"
190            + "|\\\\f"
191            + "|\\\\n"
192            + "|\\\\r"
193            + "|\\\\t"
194            + ")+$");
195
196    /** Regular expression for escaped backslash. */
197    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
198
199    /** Regular expression for non-printable unicode chars. */
200    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000"
201            + "|\\\\u0009"
202            + "|\\\\u000[bB]"
203            + "|\\\\u000[cC]"
204            + "|\\\\u0020"
205            + "|\\\\u007[fF]"
206            + "|\\\\u0085"
207            + "|\\\\u009[fF]"
208            + "|\\\\u00[aA]0"
209            + "|\\\\u00[aA][dD]"
210            + "|\\\\u04[fF]9"
211            + "|\\\\u05[bB][eE]"
212            + "|\\\\u05[dD]0"
213            + "|\\\\u05[eE][aA]"
214            + "|\\\\u05[fF]3"
215            + "|\\\\u05[fF]4"
216            + "|\\\\u0600"
217            + "|\\\\u0604"
218            + "|\\\\u061[cC]"
219            + "|\\\\u06[dD]{2}"
220            + "|\\\\u06[fF]{2}"
221            + "|\\\\u070[fF]"
222            + "|\\\\u0750"
223            + "|\\\\u077[fF]"
224            + "|\\\\u0[eE]00"
225            + "|\\\\u0[eE]7[fF]"
226            + "|\\\\u1680"
227            + "|\\\\u180[eE]"
228            + "|\\\\u1[eE]00"
229            + "|\\\\u2000"
230            + "|\\\\u2001"
231            + "|\\\\u2002"
232            + "|\\\\u2003"
233            + "|\\\\u2004"
234            + "|\\\\u2005"
235            + "|\\\\u2006"
236            + "|\\\\u2007"
237            + "|\\\\u2008"
238            + "|\\\\u2009"
239            + "|\\\\u200[aA]"
240            + "|\\\\u200[fF]"
241            + "|\\\\u2025"
242            + "|\\\\u2028"
243            + "|\\\\u2029"
244            + "|\\\\u202[fF]"
245            + "|\\\\u205[fF]"
246            + "|\\\\u2064"
247            + "|\\\\u2066"
248            + "|\\\\u2067"
249            + "|\\\\u2068"
250            + "|\\\\u2069"
251            + "|\\\\u206[aA]"
252            + "|\\\\u206[fF]"
253            + "|\\\\u20[aA][fF]"
254            + "|\\\\u2100"
255            + "|\\\\u213[aA]"
256            + "|\\\\u3000"
257            + "|\\\\u[dD]800"
258            + "|\\\\u[fF]8[fF]{2}"
259            + "|\\\\u[fF][bB]50"
260            + "|\\\\u[fF][dD][fF]{2}"
261            + "|\\\\u[fF][eE]70"
262            + "|\\\\u[fF][eE][fF]{2}"
263            + "|\\\\u[fF]{2}0[eE]"
264            + "|\\\\u[fF]{2}61"
265            + "|\\\\u[fF]{2}[dD][cC]"
266            + "|\\\\u[fF]{3}9"
267            + "|\\\\u[fF]{3}[aA]"
268            + "|\\\\u[fF]{3}[bB]"
269            + "|\\\\u[fF]{4}");
270
271    /** Cpp style comments. */
272    private Map<Integer, TextBlock> singlelineComments;
273    /** C style comments. */
274    private Map<Integer, List<TextBlock>> blockComments;
275
276    /** Allow use escapes for non-printable, control characters. */
277    private boolean allowEscapesForControlCharacters;
278
279    /** Allow use escapes if trail comment is present. */
280    private boolean allowByTailComment;
281
282    /** Allow if all characters in literal are escaped. */
283    private boolean allowIfAllCharactersEscaped;
284
285    /** Allow use escapes for non-printable, whitespace characters. */
286    private boolean allowNonPrintableEscapes;
287
288    /**
289     * Setter to allow use escapes for non-printable, control characters.
290     *
291     * @param allow user's value.
292     */
293    public final void setAllowEscapesForControlCharacters(boolean allow) {
294        allowEscapesForControlCharacters = allow;
295    }
296
297    /**
298     * Setter to allow use escapes if trail comment is present.
299     *
300     * @param allow user's value.
301     */
302    public final void setAllowByTailComment(boolean allow) {
303        allowByTailComment = allow;
304    }
305
306    /**
307     * Setter to allow if all characters in literal are escaped.
308     *
309     * @param allow user's value.
310     */
311    public final void setAllowIfAllCharactersEscaped(boolean allow) {
312        allowIfAllCharactersEscaped = allow;
313    }
314
315    /**
316     * Setter to allow use escapes for non-printable, whitespace characters.
317     *
318     * @param allow user's value.
319     */
320    public final void setAllowNonPrintableEscapes(boolean allow) {
321        allowNonPrintableEscapes = allow;
322    }
323
324    @Override
325    public int[] getDefaultTokens() {
326        return getRequiredTokens();
327    }
328
329    @Override
330    public int[] getAcceptableTokens() {
331        return getRequiredTokens();
332    }
333
334    @Override
335    public int[] getRequiredTokens() {
336        return new int[] {
337            TokenTypes.STRING_LITERAL,
338            TokenTypes.CHAR_LITERAL,
339            TokenTypes.TEXT_BLOCK_CONTENT,
340        };
341    }
342
343    @Override
344    public void beginTree(DetailAST rootAST) {
345        singlelineComments = getFileContents().getSingleLineComments();
346        blockComments = getFileContents().getBlockComments();
347    }
348
349    @Override
350    public void visitToken(DetailAST ast) {
351        final String literal = ast.getText();
352
353        if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
354                || isAllCharactersEscaped(literal)
355                || allowEscapesForControlCharacters
356                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
357                || allowNonPrintableEscapes
358                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
359            log(ast, MSG_KEY);
360        }
361    }
362
363    /**
364     * Checks if literal has Unicode chars.
365     *
366     * @param literal String literal.
367     * @return true if literal has Unicode chars.
368     */
369    private static boolean hasUnicodeChar(String literal) {
370        final String literalWithoutEscapedBackslashes =
371                ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
372        return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
373    }
374
375    /**
376     * Check if String literal contains Unicode control chars.
377     *
378     * @param literal String literal.
379     * @param pattern RegExp for valid characters.
380     * @return true, if String literal contains Unicode control chars.
381     */
382    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
383        final int unicodeMatchesCounter =
384                countMatches(UNICODE_REGEXP, literal);
385        final int unicodeValidMatchesCounter =
386                countMatches(pattern, literal);
387        return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
388    }
389
390    /**
391     * Check if trail comment is present after ast token.
392     *
393     * @param ast current token.
394     * @return true if trail comment is present after ast token.
395     */
396    private boolean hasTrailComment(DetailAST ast) {
397        int lineNo = ast.getLineNo();
398
399        // Since the trailing comment in the case of text blocks must follow the """ delimiter,
400        // we need to look for it after TEXT_BLOCK_LITERAL_END.
401        if (ast.getType() == TokenTypes.TEXT_BLOCK_CONTENT) {
402            lineNo = ast.getNextSibling().getLineNo();
403        }
404        boolean result = false;
405        if (singlelineComments.containsKey(lineNo)) {
406            result = true;
407        }
408        else {
409            final List<TextBlock> commentList = blockComments.get(lineNo);
410            if (commentList != null) {
411                final TextBlock comment = commentList.get(commentList.size() - 1);
412                final String line = getLines()[lineNo - 1];
413                result = isTrailingBlockComment(comment, line);
414            }
415        }
416        return result;
417    }
418
419    /**
420     * Whether the C style comment is trailing.
421     *
422     * @param comment the comment to check.
423     * @param line the line where the comment starts.
424     * @return true if the comment is trailing.
425     */
426    private static boolean isTrailingBlockComment(TextBlock comment, String line) {
427        return comment.getText().length != 1
428            || CommonUtil.isBlank(line.substring(comment.getEndColNo() + 1));
429    }
430
431    /**
432     * Count regexp matches into String literal.
433     *
434     * @param pattern pattern.
435     * @param target String literal.
436     * @return count of regexp matches.
437     */
438    private static int countMatches(Pattern pattern, String target) {
439        int matcherCounter = 0;
440        final Matcher matcher = pattern.matcher(target);
441        while (matcher.find()) {
442            matcherCounter++;
443        }
444        return matcherCounter;
445    }
446
447    /**
448     * Checks if all characters in String literal is escaped.
449     *
450     * @param literal current literal.
451     * @return true if all characters in String literal is escaped.
452     */
453    private boolean isAllCharactersEscaped(String literal) {
454        return allowIfAllCharactersEscaped
455                && ALL_ESCAPED_CHARS.matcher(literal.substring(1,
456                        literal.length() - 1)).find();
457    }
458
459}