001////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code for adherence to a set of rules.
003// Copyright (C) 2001-2016 the original author or authors.
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018////////////////////////////////////////////////////////////////////////////////
019
020package com.puppycrawl.tools.checkstyle.checks;
021
022import java.util.regex.Matcher;
023import java.util.regex.Pattern;
024
025import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
026import com.puppycrawl.tools.checkstyle.api.DetailAST;
027import com.puppycrawl.tools.checkstyle.api.TokenTypes;
028
029/**
030 * <p>
031 * Restrict using <a href =
032 * "http://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3">
033 * Unicode escapes</a> (e.g. \u221e).
034 * It is possible to allow using escapes for
035 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
036 * non-printable(control) characters</a>.
037 * Also, this check can be configured to allow using escapes
038 * if trail comment is present. By the option it is possible to
039 * allow using escapes if literal contains only them. By the option it
040 * is possible to allow using escapes for space literals.
041 * </p>
042 * <p>
043 * Examples of using Unicode:</p>
044 * <pre>
045 * String unitAbbrev = "μs"; //Best: perfectly clear even without a comment.
046 * String unitAbbrev = "\u03bcs"; //Poor: the reader has no idea what this is.
047 * </pre>
048 * <p>
049 * An example of how to configure the check is:
050 * </p>
051 * <pre>
052 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
053 * </pre>
054 * <p>
055 * An example of non-printable(control) characters.
056 * </p>
057 * <pre>
058 * return '\ufeff' + content; // byte order mark
059 * </pre>
060 * <p>
061 * An example of how to configure the check to allow using escapes
062 * for non-printable(control) characters:
063 * </p>
064 * <pre>
065 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
066 *     &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
067 * &lt;/module&gt;
068 * </pre>
069 * <p>
070 * Example of using escapes with trail comment:
071 * </p>
072 * <pre>
073 * String unitAbbrev = "\u03bcs"; // Greek letter mu, "s"
074 * </pre>
075 * <p>An example of how to configure the check to allow using escapes
076 * if trail comment is present:
077 * </p>
078 * <pre>
079 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
080 *     &lt;property name="allowByTailComment" value="true"/&gt;
081 * &lt;/module&gt;
082 * </pre>
083 * <p>Example of using escapes if literal contains only them:
084 * </p>
085 * <pre>
086 * String unitAbbrev = "\u03bc\u03bc\u03bc";
087 * </pre>
088 * <p>An example of how to configure the check to allow escapes
089 * if literal contains only them:
090 * </p>
091 * <pre>
092 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
093 *    &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
094 * &lt;/module&gt;
095 * </pre>
096 * <p>An example of how to configure the check to allow non-printable escapes:
097 * </p>
098 * <pre>
099 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
100 *    &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
101 * &lt;/module&gt;
102 * </pre>
103 *
104 * @author maxvetrenko
105 *
106 */
107public class AvoidEscapedUnicodeCharactersCheck
108    extends AbstractCheck {
109    /**
110     * A key is pointing to the warning message text in "messages.properties"
111     * file.
112     */
113    public static final String MSG_KEY = "forbid.escaped.unicode.char";
114
115    /** Regular expression for Unicode chars. */
116    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}");
117
118    /** Regular expression Unicode control characters. */
119    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\(u|U)"
120            + "(00[0-1][0-1A-Fa-f]|00[8-9][0-9A-Fa-f]|034(f|F)|070(f|F)"
121            + "|180(e|E)|200[b-fB-F]|202[b-eB-E]|206[0-4a-fA-F]"
122            + "|[fF]{3}[9a-bA-B]|[fF][eE][fF]{2})");
123
124    /** Regular expression for trail comment. */
125    private static final Pattern COMMENT_REGEXP = Pattern.compile(";[ ]*//+"
126            + "[a-zA-Z0-9 ]*|;[ ]*/[*]+[a-zA-Z0-9 ]*");
127
128    /** Regular expression for all escaped chars. */
129    private static final Pattern ALL_ESCAPED_CHARS =
130            Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}"
131                    + "||\\\\b|\\\\t|\\\\n|\\\\f|\\\\r|\\\\|\"|\')+$");
132
133    /** Regular expression for non-printable unicode chars. */
134    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u1680|\\\\u2028"
135            + "|\\\\u2029|\\\\u205(f|F)|\\\\u3000|\\\\u2007|\\\\u2000|\\\\u200(a|A)"
136            + "|\\\\u007(F|f)|\\\\u009(f|F)|\\\\u(f|F){4}|\\\\u007(F|f)|\\\\u00(a|A)(d|D)"
137            + "|\\\\u0600|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)"
138            + "|\\\\u2000|\\\\u2028|\\\\u205(f|F)|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069"
139            + "|\\\\u206(a|A)|\\\\u(d|D)800|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9"
140            + "|\\\\u(f|F){3}(a|A)|\\\\u0020|\\\\u00(a|A)0|\\\\u00(a|A)(d|D)|\\\\u0604"
141            + "|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)|\\\\u200(f|F)"
142            + "|\\\\u202(f|F)|\\\\u2064|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069|\\\\u206(f|F)"
143            + "|\\\\u(f|F)8(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9|\\\\u(f|F){3}(b|B)"
144            + "|\\\\u05(d|D)0|\\\\u05(f|F)3|\\\\u0600|\\\\u0750|\\\\u0(e|E)00|\\\\u1(e|E)00"
145            + "|\\\\u2100|\\\\u(f|F)(b|B)50|\\\\u(f|F)(e|E)70|\\\\u(F|f){2}61|\\\\u04(f|F)9"
146            + "|\\\\u05(b|B)(e|E)|\\\\u05(e|E)(a|A)|\\\\u05(f|F)4|\\\\u06(f|F){2}"
147            + "|\\\\u077(f|F)|\\\\u0(e|E)7(f|F)|\\\\u20(a|A)(f|F)|\\\\u213(a|A)|\\\\u0000"
148            + "|\\\\u(f|F)(d|D)(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){2}(d|D)(c|C)"
149            + "|\\\\u2002|\\\\u0085|\\\\u200(a|A)|\\\\u2005|\\\\u2000|\\\\u2029|\\\\u000(B|b)"
150            + "|\\\\u2008|\\\\u2003|\\\\u205(f|F)|\\\\u1680|\\\\u0009|\\\\u0020|\\\\u2006"
151            + "|\\\\u2001|\\\\u202(f|F)|\\\\u00(a|A)0|\\\\u000(c|C)|\\\\u2009|\\\\u2004|\\\\u2028"
152            + "|\\\\u2028|\\\\u2007|\\\\u2004|\\\\u2028|\\\\u2007|\\\\u2025"
153            + "|\\\\u(f|F){2}0(e|E)|\\\\u(f|F){2}61");
154
155    /** Allow use escapes for non-printable(control) characters.  */
156    private boolean allowEscapesForControlCharacters;
157
158    /** Allow use escapes if trail comment is present. */
159    private boolean allowByTailComment;
160
161    /** Allow if all characters in literal are escaped. */
162    private boolean allowIfAllCharactersEscaped;
163
164    /** Allow escapes for space literals. */
165    private boolean allowNonPrintableEscapes;
166
167    /**
168     * Set allowIfAllCharactersEscaped.
169     * @param allow user's value.
170     */
171    public final void setAllowEscapesForControlCharacters(boolean allow) {
172        allowEscapesForControlCharacters = allow;
173    }
174
175    /**
176     * Set allowByTailComment.
177     * @param allow user's value.
178     */
179    public final void setAllowByTailComment(boolean allow) {
180        allowByTailComment = allow;
181    }
182
183    /**
184     * Set allowIfAllCharactersEscaped.
185     * @param allow user's value.
186     */
187    public final void setAllowIfAllCharactersEscaped(boolean allow) {
188        allowIfAllCharactersEscaped = allow;
189    }
190
191    /**
192     * Set allowSpaceEscapes.
193     * @param allow user's value.
194     */
195    public final void setAllowNonPrintableEscapes(boolean allow) {
196        allowNonPrintableEscapes = allow;
197    }
198
199    @Override
200    public int[] getDefaultTokens() {
201        return getAcceptableTokens();
202    }
203
204    @Override
205    public int[] getAcceptableTokens() {
206        return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL};
207    }
208
209    @Override
210    public int[] getRequiredTokens() {
211        return getAcceptableTokens();
212    }
213
214    @Override
215    public void visitToken(DetailAST ast) {
216
217        final String literal = ast.getText();
218
219        if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
220                || isAllCharactersEscaped(literal)
221                || allowEscapesForControlCharacters
222                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
223                || allowNonPrintableEscapes
224                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
225            log(ast.getLineNo(), MSG_KEY);
226        }
227    }
228
229    /**
230     * Checks if literal has Unicode chars.
231     * @param literal String literal.
232     * @return true if literal has Unicode chars.
233     */
234    private static boolean hasUnicodeChar(String literal) {
235        return UNICODE_REGEXP.matcher(literal).find();
236    }
237
238    /**
239     * Check if String literal contains Unicode control chars.
240     * @param literal String literal.
241     * @param pattern RegExp for valid characters.
242     * @return true, if String literal contains Unicode control chars.
243     */
244    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
245        final int unicodeMatchesCounter =
246                countMatches(UNICODE_REGEXP, literal);
247        final int unicodeValidMatchesCounter =
248                countMatches(pattern, literal);
249        return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
250    }
251
252    /**
253     * Check if trail comment is present after ast token.
254     * @param ast current token.
255     * @return true if trail comment is present after ast token.
256     */
257    private boolean hasTrailComment(DetailAST ast) {
258        final DetailAST variableDef = getVariableDef(ast);
259        DetailAST semi;
260
261        if (variableDef == null) {
262            semi = getSemi(ast);
263        }
264        else {
265            semi = variableDef.getNextSibling();
266
267            if (semi.getType() != TokenTypes.SEMI) {
268                semi = variableDef.getLastChild();
269            }
270        }
271
272        boolean result = false;
273        if (semi != null) {
274            final int lineNo = semi.getLineNo();
275            final String currentLine = getLine(lineNo - 1);
276
277            if (COMMENT_REGEXP.matcher(currentLine).find()) {
278                result = true;
279            }
280        }
281
282        return result;
283    }
284
285    /**
286     * Count regexp matches into String literal.
287     * @param pattern pattern.
288     * @param target String literal.
289     * @return count of regexp matches.
290     */
291    private static int countMatches(Pattern pattern, String target) {
292        int matcherCounter = 0;
293        final Matcher matcher = pattern.matcher(target);
294        while (matcher.find()) {
295            matcherCounter++;
296        }
297        return matcherCounter;
298    }
299
300    /**
301     * Get variable definition.
302     * @param ast current token.
303     * @return variable definition.
304     */
305    private static DetailAST getVariableDef(DetailAST ast) {
306        DetailAST result = ast.getParent();
307        while (result != null
308                && result.getType() != TokenTypes.VARIABLE_DEF) {
309            result = result.getParent();
310        }
311        return result;
312    }
313
314    /**
315     * Get semi token.
316     * @param ast current token.
317     * @return semi token or null.
318     */
319    private static DetailAST getSemi(DetailAST ast) {
320        DetailAST result = ast.getParent();
321        while (result != null
322                && result.getLastChild().getType() != TokenTypes.SEMI) {
323            result = result.getParent();
324        }
325        if (result != null) {
326            result = result.getLastChild();
327        }
328        return result;
329    }
330
331    /**
332     * Checks if all characters in String literal is escaped.
333     * @param literal current literal.
334     * @return true if all characters in String literal is escaped.
335     */
336    private boolean isAllCharactersEscaped(String literal) {
337        return allowIfAllCharactersEscaped
338                && ALL_ESCAPED_CHARS.matcher(literal.substring(1,
339                        literal.length() - 1)).find();
340    }
341}