summaryrefslogtreecommitdiffstats
path: root/lib/Lex/Lexer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Lex/Lexer.cpp')
-rw-r--r--lib/Lex/Lexer.cpp281
1 files changed, 176 insertions, 105 deletions
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index 6025a66751..5132d0e62c 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -19,6 +19,7 @@
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/LiteralSupport.h"
#include "clang/Lex/Preprocessor.h"
+#include "clang/Lex/PreprocessorOptions.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/Compiler.h"
@@ -43,6 +44,8 @@ using namespace clang;
/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
+ if (isAnnotation())
+ return false;
if (IdentifierInfo *II = getIdentifierInfo())
return II->getObjCKeywordID() == objcKey;
return false;
@@ -50,6 +53,8 @@ bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
/// getObjCKeywordID - Return the ObjC keyword kind.
tok::ObjCKeywordKind Token::getObjCKeywordID() const {
+ if (isAnnotation())
+ return tok::objc_not_keyword;
IdentifierInfo *specId = getIdentifierInfo();
return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
}
@@ -452,6 +457,25 @@ bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
return false;
}
+/// Returns the pointer that points to the beginning of line that contains
+/// the given offset, or null if the offset if invalid.
+static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
+ const char *BufStart = Buffer.data();
+ if (Offset >= Buffer.size())
+ return nullptr;
+
+ const char *LexStart = BufStart + Offset;
+ for (; LexStart != BufStart; --LexStart) {
+ if (isVerticalWhitespace(LexStart[0]) &&
+ !Lexer::isNewLineEscaped(BufStart, LexStart)) {
+ // LexStart should point at first character of logical line.
+ ++LexStart;
+ break;
+ }
+ }
+ return LexStart;
+}
+
static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
const SourceManager &SM,
const LangOptions &LangOpts) {
@@ -459,7 +483,7 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
if (LocInfo.first.isInvalid())
return Loc;
-
+
bool Invalid = false;
StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
if (Invalid)
@@ -467,47 +491,35 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
// Back up from the current location until we hit the beginning of a line
// (or the buffer). We'll relex from that point.
- const char *BufStart = Buffer.data();
- if (LocInfo.second >= Buffer.size())
+ const char *StrData = Buffer.data() + LocInfo.second;
+ const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
+ if (!LexStart || LexStart == StrData)
return Loc;
-
- const char *StrData = BufStart+LocInfo.second;
- if (StrData[0] == '\n' || StrData[0] == '\r')
- return Loc;
-
- const char *LexStart = StrData;
- while (LexStart != BufStart) {
- if (LexStart[0] == '\n' || LexStart[0] == '\r') {
- ++LexStart;
- break;
- }
- --LexStart;
- }
-
// Create a lexer starting at the beginning of this token.
SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
- Lexer TheLexer(LexerStartLoc, LangOpts, BufStart, LexStart, Buffer.end());
+ Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
+ Buffer.end());
TheLexer.SetCommentRetentionState(true);
-
+
// Lex tokens until we find the token that contains the source location.
Token TheTok;
do {
TheLexer.LexFromRawLexer(TheTok);
-
+
if (TheLexer.getBufferLocation() > StrData) {
// Lexing this token has taken the lexer past the source location we're
// looking for. If the current token encompasses our source location,
// return the beginning of that token.
if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
return TheTok.getLocation();
-
+
// We ended up skipping over the source location entirely, which means
// that it points into whitespace. We're done here.
break;
}
} while (TheTok.getKind() != tok::eof);
-
+
// We've passed our source location; just return the original source location.
return Loc;
}
@@ -515,36 +527,34 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
const SourceManager &SM,
const LangOptions &LangOpts) {
- if (Loc.isFileID())
- return getBeginningOfFileToken(Loc, SM, LangOpts);
-
- if (!SM.isMacroArgExpansion(Loc))
- return Loc;
+ if (Loc.isFileID())
+ return getBeginningOfFileToken(Loc, SM, LangOpts);
- SourceLocation FileLoc = SM.getSpellingLoc(Loc);
- SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
- std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
- std::pair<FileID, unsigned> BeginFileLocInfo
- = SM.getDecomposedLoc(BeginFileLoc);
- assert(FileLocInfo.first == BeginFileLocInfo.first &&
- FileLocInfo.second >= BeginFileLocInfo.second);
- return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
+ if (!SM.isMacroArgExpansion(Loc))
+ return Loc;
+
+ SourceLocation FileLoc = SM.getSpellingLoc(Loc);
+ SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
+ std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
+ std::pair<FileID, unsigned> BeginFileLocInfo =
+ SM.getDecomposedLoc(BeginFileLoc);
+ assert(FileLocInfo.first == BeginFileLocInfo.first &&
+ FileLocInfo.second >= BeginFileLocInfo.second);
+ return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
}
namespace {
enum PreambleDirectiveKind {
PDK_Skipped,
- PDK_StartIf,
- PDK_EndIf,
PDK_Unknown
};
} // end anonymous namespace
-std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
- const LangOptions &LangOpts,
- unsigned MaxLines) {
+PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
+ const LangOptions &LangOpts,
+ unsigned MaxLines) {
// Create a lexer starting at the beginning of the file. Note that we use a
// "fake" file source location at offset 1 so that the lexer will track our
// position within the file.
@@ -554,13 +564,8 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
Buffer.end());
TheLexer.SetCommentRetentionState(true);
- // StartLoc will differ from FileLoc if there is a BOM that was skipped.
- SourceLocation StartLoc = TheLexer.getSourceLocation();
-
bool InPreprocessorDirective = false;
Token TheTok;
- Token IfStartTok;
- unsigned IfCount = 0;
SourceLocation ActiveCommentLoc;
unsigned MaxLineOffset = 0;
@@ -643,33 +648,18 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
.Case("sccs", PDK_Skipped)
.Case("assert", PDK_Skipped)
.Case("unassert", PDK_Skipped)
- .Case("if", PDK_StartIf)
- .Case("ifdef", PDK_StartIf)
- .Case("ifndef", PDK_StartIf)
+ .Case("if", PDK_Skipped)
+ .Case("ifdef", PDK_Skipped)
+ .Case("ifndef", PDK_Skipped)
.Case("elif", PDK_Skipped)
.Case("else", PDK_Skipped)
- .Case("endif", PDK_EndIf)
+ .Case("endif", PDK_Skipped)
.Default(PDK_Unknown);
switch (PDK) {
case PDK_Skipped:
continue;
- case PDK_StartIf:
- if (IfCount == 0)
- IfStartTok = HashTok;
-
- ++IfCount;
- continue;
-
- case PDK_EndIf:
- // Mismatched #endif. The preamble ends here.
- if (IfCount == 0)
- break;
-
- --IfCount;
- continue;
-
case PDK_Unknown:
// We don't know what this directive is; stop at the '#'.
break;
@@ -690,16 +680,13 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
} while (true);
SourceLocation End;
- if (IfCount)
- End = IfStartTok.getLocation();
- else if (ActiveCommentLoc.isValid())
+ if (ActiveCommentLoc.isValid())
End = ActiveCommentLoc; // don't truncate a decl comment.
else
End = TheTok.getLocation();
- return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(),
- IfCount? IfStartTok.isAtStartOfLine()
- : TheTok.isAtStartOfLine());
+ return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
+ TheTok.isAtStartOfLine());
}
/// AdvanceToTokenCharacter - Given a location that specifies the start of a
@@ -1038,6 +1025,47 @@ bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
return isIdentifierBody(c, LangOpts.DollarIdents);
}
+bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
+ assert(isVerticalWhitespace(Str[0]));
+ if (Str - 1 < BufferStart)
+ return false;
+
+ if ((Str[0] == '\n' && Str[-1] == '\r') ||
+ (Str[0] == '\r' && Str[-1] == '\n')) {
+ if (Str - 2 < BufferStart)
+ return false;
+ --Str;
+ }
+ --Str;
+
+ // Rewind to first non-space character:
+ while (Str > BufferStart && isHorizontalWhitespace(*Str))
+ --Str;
+
+ return *Str == '\\';
+}
+
+StringRef Lexer::getIndentationForLine(SourceLocation Loc,
+ const SourceManager &SM) {
+ if (Loc.isInvalid() || Loc.isMacroID())
+ return "";
+ std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
+ if (LocInfo.first.isInvalid())
+ return "";
+ bool Invalid = false;
+ StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
+ if (Invalid)
+ return "";
+ const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
+ if (!Line)
+ return "";
+ StringRef Rest = Buffer.substr(Line - Buffer.data());
+ size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
+ return NumWhitespaceChars == StringRef::npos
+ ? ""
+ : Rest.take_front(NumWhitespaceChars);
+}
+
//===----------------------------------------------------------------------===//
// Diagnostics forwarding code.
//===----------------------------------------------------------------------===//
@@ -1171,6 +1199,8 @@ const char *Lexer::SkipEscapedNewLines(const char *P) {
// If not a trigraph for escape, bail out.
if (P[1] != '?' || P[2] != '/')
return P;
+ // FIXME: Take LangOpts into account; the language might not
+ // support trigraphs.
AfterEscape = P+3;
} else {
return P;
@@ -1282,12 +1312,6 @@ Slash:
Size += EscapedNewLineSize;
Ptr += EscapedNewLineSize;
- // If the char that we finally got was a \n, then we must have had
- // something like \<newline><newline>. We don't want to consume the
- // second newline.
- if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0')
- return ' ';
-
// Use slow version to accumulate a correct size field.
return getCharAndSizeSlow(Ptr, Size, Tok);
}
@@ -1338,12 +1362,6 @@ Slash:
Size += EscapedNewLineSize;
Ptr += EscapedNewLineSize;
- // If the char that we finally got was a \n, then we must have had
- // something like \<newline><newline>. We don't want to consume the
- // second newline.
- if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0')
- return ' ';
-
// Use slow version to accumulate a correct size field.
return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
}
@@ -1373,9 +1391,9 @@ Slash:
// Helper methods for lexing.
//===----------------------------------------------------------------------===//
-/// \brief Routine that indiscriminately skips bytes in the source file.
-void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) {
- BufferPtr += Bytes;
+/// \brief Routine that indiscriminately sets the offset into the source file.
+void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
+ BufferPtr = BufferStart + Offset;
if (BufferPtr > BufferEnd)
BufferPtr = BufferEnd;
// FIXME: What exactly does the StartOfLine bit mean? There are two
@@ -2070,8 +2088,11 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
// Scan over the body of the comment. The common case, when scanning, is that
// the comment contains normal ascii characters with nothing interesting in
// them. As such, optimize for this case with the inner loop.
+ //
+ // This loop terminates with CurPtr pointing at the newline (or end of buffer)
+ // character that ends the line comment.
char C;
- do {
+ while (true) {
C = *CurPtr;
// Skip over characters in the fast loop.
while (C != 0 && // Potentially EOF.
@@ -2088,10 +2109,12 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
HasSpace = true;
}
- if (*EscapePtr == '\\') // Escaped newline.
+ if (*EscapePtr == '\\')
+ // Escaped newline.
CurPtr = EscapePtr;
else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
- EscapePtr[-2] == '?') // Trigraph-escaped newline.
+ EscapePtr[-2] == '?' && LangOpts.Trigraphs)
+ // Trigraph-escaped newline.
CurPtr = EscapePtr-2;
else
break; // This is a newline, we're done.
@@ -2121,7 +2144,8 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
// If we read multiple characters, and one of those characters was a \r or
// \n, then we had an escaped newline within the comment. Emit diagnostic
// unless the next line is also a // comment.
- if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
+ if (CurPtr != OldPtr + 1 && C != '/' &&
+ (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
for (; OldPtr != CurPtr; ++OldPtr)
if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
// Okay, we found a // comment that ends in a newline, if the next
@@ -2140,9 +2164,9 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
}
}
- if (CurPtr == BufferEnd+1) {
- --CurPtr;
- break;
+ if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
+ --CurPtr;
+ break;
}
if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
@@ -2150,8 +2174,7 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
cutOffLexing();
return false;
}
-
- } while (C != '\n' && C != '\r');
+ }
// Found but did not consume the newline. Notify comment handlers about the
// comment unless we're in a #if 0 block.
@@ -2490,6 +2513,7 @@ void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
break;
}
// FALL THROUGH.
+ LLVM_FALLTHROUGH;
case '\r':
case '\n':
// Okay, we found the end of the line. First, back up past the \0, \r, \n.
@@ -2540,6 +2564,11 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
return true;
}
+ if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
+ PP->setRecordedPreambleConditionalStack(ConditionalStack);
+ ConditionalStack.clear();
+ }
+
// Issue diagnostics for unterminated #if and missing newline.
// If we are in a #if directive, emit an error.
@@ -2722,6 +2751,37 @@ bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
return false;
}
+static const char *findPlaceholderEnd(const char *CurPtr,
+ const char *BufferEnd) {
+ if (CurPtr == BufferEnd)
+ return nullptr;
+ BufferEnd -= 1; // Scan until the second last character.
+ for (; CurPtr != BufferEnd; ++CurPtr) {
+ if (CurPtr[0] == '#' && CurPtr[1] == '>')
+ return CurPtr + 2;
+ }
+ return nullptr;
+}
+
+bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
+ assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
+ if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
+ return false;
+ const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
+ if (!End)
+ return false;
+ const char *Start = CurPtr - 1;
+ if (!LangOpts.AllowEditorPlaceholders)
+ Diag(Start, diag::err_placeholder_in_source);
+ Result.startToken();
+ FormTokenWithChars(Result, End, tok::raw_identifier);
+ Result.setRawIdentifierData(Start);
+ PP->LookUpIdentifierInfo(Result);
+ Result.setFlag(Token::IsEditorPlaceholder);
+ BufferPtr = End;
+ return true;
+}
+
bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
if (PP && PP->isCodeCompletionEnabled()) {
SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
@@ -3009,8 +3069,11 @@ LexNextToken:
Kind = tok::unknown;
break;
- case '\n':
case '\r':
+ if (CurPtr[0] == '\n')
+ Char = getAndAdvanceChar(CurPtr, Result);
+ LLVM_FALLTHROUGH;
+ case '\n':
// If we are inside a preprocessor directive and we see the end of line,
// we know we are done with the directive, so return an EOD token.
if (ParsingPreprocessorDirective) {
@@ -3203,6 +3266,7 @@ LexNextToken:
return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
tok::wide_char_constant);
// FALL THROUGH, treating L like the start of an identifier.
+ LLVM_FALLTHROUGH;
// C99 6.4.2: Identifiers.
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
@@ -3479,6 +3543,9 @@ LexNextToken:
} else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
Kind = tok::l_brace;
+ } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
+ lexEditorPlaceholder(Result, CurPtr)) {
+ return true;
} else {
Kind = tok::less;
}
@@ -3545,7 +3612,9 @@ LexNextToken:
if (LangOpts.Digraphs && Char == '>') {
Kind = tok::r_square; // ':>' -> ']'
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
- } else if (LangOpts.CPlusPlus && Char == ':') {
+ } else if ((LangOpts.CPlusPlus ||
+ LangOpts.DoubleSquareBracketAttributes) &&
+ Char == ':') {
Kind = tok::coloncolon;
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
} else {
@@ -3603,17 +3672,19 @@ LexNextToken:
// UCNs (C99 6.4.3, C++11 [lex.charset]p2)
case '\\':
- if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
- if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
- if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
- return true; // KeepWhitespaceMode
+ if (!LangOpts.AsmPreprocessor) {
+ if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
+ if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
+ if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
+ return true; // KeepWhitespaceMode
+
+ // We only saw whitespace, so just try again with this lexer.
+ // (We manually eliminate the tail call to avoid recursion.)
+ goto LexNextToken;
+ }
- // We only saw whitespace, so just try again with this lexer.
- // (We manually eliminate the tail call to avoid recursion.)
- goto LexNextToken;
+ return LexUnicode(Result, CodePoint, CurPtr);
}
-
- return LexUnicode(Result, CodePoint, CurPtr);
}
Kind = tok::unknown;