diff options
-rw-r--r-- | include/clang/Basic/DiagnosticLexKinds.td | 3 | ||||
-rw-r--r-- | lib/Lex/Lexer.cpp | 21 | ||||
-rw-r--r-- | test/Lexer/unicode.c | 7 |
3 files changed, 28 insertions, 3 deletions
diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td index 1c960711bc..8cf6d7e7c0 100644 --- a/include/clang/Basic/DiagnosticLexKinds.td +++ b/include/clang/Basic/DiagnosticLexKinds.td @@ -122,6 +122,9 @@ def ext_unicode_whitespace : ExtWarn< def warn_utf8_symbol_homoglyph : Warning< "treating Unicode character <U+%0> as identifier character rather than " "as '%1' symbol">, InGroup<DiagGroup<"unicode-homoglyph">>; +def warn_utf8_symbol_zero_width : Warning< + "identifier contains Unicode character <U+%0> that is invisible in " + "some environments">, InGroup<DiagGroup<"unicode-zero-width">>; def err_hex_escape_no_digits : Error< "\\%0 used with no following hex digits">; diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp index e8588a771a..6a69bb4974 100644 --- a/lib/Lex/Lexer.cpp +++ b/lib/Lex/Lexer.cpp @@ -1510,8 +1510,17 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, bool operator<(HomoglyphPair R) const { return Character < R.Character; } }; static constexpr HomoglyphPair SortedHomoglyphs[] = { + {U'\u00ad', 0}, // SOFT HYPHEN {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK {U'\u037e', ';'}, // GREEK QUESTION MARK + {U'\u200b', 0}, // ZERO WIDTH SPACE + {U'\u200c', 0}, // ZERO WIDTH NON-JOINER + {U'\u200d', 0}, // ZERO WIDTH JOINER + {U'\u2060', 0}, // WORD JOINER + {U'\u2061', 0}, // FUNCTION APPLICATION + {U'\u2062', 0}, // INVISIBLE TIMES + {U'\u2063', 0}, // INVISIBLE SEPARATOR + {U'\u2064', 0}, // INVISIBLE PLUS {U'\u2212', '-'}, // MINUS SIGN {U'\u2215', '/'}, // DIVISION SLASH {U'\u2216', '\\'}, // SET MINUS @@ -1521,6 +1530,7 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, {U'\u2236', ':'}, // RATIO {U'\u223c', '~'}, // TILDE OPERATOR {U'\ua789', ':'}, // MODIFIER LETTER COLON + {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN @@ -1560,9 +1570,14 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, llvm::raw_svector_ostream CharOS(CharBuf); llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4); } - const char LooksLikeStr[] = {Homoglyph->LooksLike, 0}; - Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph) - << Range << CharBuf << LooksLikeStr; + if (Homoglyph->LooksLike) { + const char LooksLikeStr[] = {Homoglyph->LooksLike, 0}; + Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph) + << Range << CharBuf << LooksLikeStr; + } else { + Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width) + << Range << CharBuf; + } } } diff --git a/test/Lexer/unicode.c b/test/Lexer/unicode.c index 30e353fa79..bebab82988 100644 --- a/test/Lexer/unicode.c +++ b/test/Lexer/unicode.c @@ -38,3 +38,10 @@ int n; = 3; // expected-warning {{treating Unicode character <U+037E> as identi int *n꞉꞉v = &n;; // expected-warning 2{{treating Unicode character <U+A789> as identifier character rather than as ':' symbol}} // expected-warning@-1 {{treating Unicode character <U+037E> as identifier character rather than as ';' symbol}} int v=[=](auto){return~x;}(); // expected-warning 12{{treating Unicode character}} + +int xx; +// expected-warning@-1 {{identifier contains Unicode character <U+2060> that is invisible in some environments}} +// expected-warning@-2 {{identifier contains Unicode character <U+FEFF> that is invisible in some environments}} +// expected-warning@-3 {{identifier contains Unicode character <U+200D> that is invisible in some environments}} +int foobar = 0; // expected-warning {{identifier contains Unicode character <U+200B> that is invisible in some environments}} +int x = foobar; // expected-error {{undeclared identifier}} |