summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/clang/Basic/DiagnosticLexKinds.td3
-rw-r--r--lib/Lex/Lexer.cpp21
-rw-r--r--test/Lexer/unicode.c7
3 files changed, 28 insertions, 3 deletions
diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td
index 1c960711bc..8cf6d7e7c0 100644
--- a/include/clang/Basic/DiagnosticLexKinds.td
+++ b/include/clang/Basic/DiagnosticLexKinds.td
@@ -122,6 +122,9 @@ def ext_unicode_whitespace : ExtWarn<
def warn_utf8_symbol_homoglyph : Warning<
"treating Unicode character <U+%0> as identifier character rather than "
"as '%1' symbol">, InGroup<DiagGroup<"unicode-homoglyph">>;
+def warn_utf8_symbol_zero_width : Warning<
+ "identifier contains Unicode character <U+%0> that is invisible in "
+ "some environments">, InGroup<DiagGroup<"unicode-zero-width">>;
def err_hex_escape_no_digits : Error<
"\\%0 used with no following hex digits">;
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index e8588a771a..6a69bb4974 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -1510,8 +1510,17 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
bool operator<(HomoglyphPair R) const { return Character < R.Character; }
};
static constexpr HomoglyphPair SortedHomoglyphs[] = {
+ {U'\u00ad', 0}, // SOFT HYPHEN
{U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
{U'\u037e', ';'}, // GREEK QUESTION MARK
+ {U'\u200b', 0}, // ZERO WIDTH SPACE
+ {U'\u200c', 0}, // ZERO WIDTH NON-JOINER
+ {U'\u200d', 0}, // ZERO WIDTH JOINER
+ {U'\u2060', 0}, // WORD JOINER
+ {U'\u2061', 0}, // FUNCTION APPLICATION
+ {U'\u2062', 0}, // INVISIBLE TIMES
+ {U'\u2063', 0}, // INVISIBLE SEPARATOR
+ {U'\u2064', 0}, // INVISIBLE PLUS
{U'\u2212', '-'}, // MINUS SIGN
{U'\u2215', '/'}, // DIVISION SLASH
{U'\u2216', '\\'}, // SET MINUS
@@ -1521,6 +1530,7 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
{U'\u2236', ':'}, // RATIO
{U'\u223c', '~'}, // TILDE OPERATOR
{U'\ua789', ':'}, // MODIFIER LETTER COLON
+ {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE
{U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
{U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
{U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
@@ -1560,9 +1570,14 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
llvm::raw_svector_ostream CharOS(CharBuf);
llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
}
- const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
- Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
- << Range << CharBuf << LooksLikeStr;
+ if (Homoglyph->LooksLike) {
+ const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
+ Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
+ << Range << CharBuf << LooksLikeStr;
+ } else {
+ Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
+ << Range << CharBuf;
+ }
}
}
diff --git a/test/Lexer/unicode.c b/test/Lexer/unicode.c
index 30e353fa79..bebab82988 100644
--- a/test/Lexer/unicode.c
+++ b/test/Lexer/unicode.c
@@ -38,3 +38,10 @@ int n; = 3; // expected-warning {{treating Unicode character <U+037E> as identi
int *n꞉꞉v = &n;; // expected-warning 2{{treating Unicode character <U+A789> as identifier character rather than as ':' symbol}}
// expected-warning@-1 {{treating Unicode character <U+037E> as identifier character rather than as ';' symbol}}
int v=[=](auto){return~x;}(); // expected-warning 12{{treating Unicode character}}
+
+int ⁠xx‍;
+// expected-warning@-1 {{identifier contains Unicode character <U+2060> that is invisible in some environments}}
+// expected-warning@-2 {{identifier contains Unicode character <U+FEFF> that is invisible in some environments}}
+// expected-warning@-3 {{identifier contains Unicode character <U+200D> that is invisible in some environments}}
+int foo​bar = 0; // expected-warning {{identifier contains Unicode character <U+200B> that is invisible in some environments}}
+int x = foobar; // expected-error {{undeclared identifier}}