summaryrefslogtreecommitdiffstats
path: root/lib/Analysis/CloneDetection.cpp
diff options
context:
space:
mode:
authorArtem Dergachev <artem.dergachev@gmail.com>2016-08-20 17:35:53 +0000
committerArtem Dergachev <artem.dergachev@gmail.com>2016-08-20 17:35:53 +0000
commit656204ffb45bbf056101265d3ae4811638184c17 (patch)
tree6c74d3a37cad5bdf24df1d8d6d36e8cce3ae7ceb /lib/Analysis/CloneDetection.cpp
parent9c27c6bd260a3e4d531a2ebbcff339c4600da04f (diff)
[analyzer] Use faster hashing (MD5) in CloneDetector.
This replaces the old approach of fingerprinting every AST node into a string, which avoided collisions and was simple to implement, but turned out to be extremely ineffective with respect to both performance and memory. The collisions are now dealt with in a separate pass, which no longer causes performance problems because collisions are rare. Patch by Raphael Isemann! Differential Revision: https://reviews.llvm.org/D22515 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@279378 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Analysis/CloneDetection.cpp')
-rw-r--r--lib/Analysis/CloneDetection.cpp253
1 files changed, 189 insertions, 64 deletions
diff --git a/lib/Analysis/CloneDetection.cpp b/lib/Analysis/CloneDetection.cpp
index 9bba611d0a..c331ddc25b 100644
--- a/lib/Analysis/CloneDetection.cpp
+++ b/lib/Analysis/CloneDetection.cpp
@@ -19,6 +19,7 @@
#include "clang/AST/StmtVisitor.h"
#include "clang/Lex/Lexer.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/MD5.h"
#include "llvm/Support/raw_ostream.h"
using namespace clang;
@@ -279,47 +280,35 @@ namespace {
/// This class defines what a code clone is: If it collects for two statements
/// the same data, then those two statements are considered to be clones of each
/// other.
-class StmtDataCollector : public ConstStmtVisitor<StmtDataCollector> {
+///
+/// All collected data is forwarded to the given data consumer of the type T.
+/// The data consumer class needs to provide a member method with the signature:
+/// update(StringRef Str)
+template <typename T>
+class StmtDataCollector : public ConstStmtVisitor<StmtDataCollector<T>> {
ASTContext &Context;
- std::vector<CloneDetector::DataPiece> &CollectedData;
+ /// \brief The data sink to which all data is forwarded.
+ T &DataConsumer;
public:
/// \brief Collects data of the given Stmt.
/// \param S The given statement.
/// \param Context The ASTContext of S.
- /// \param D The given data vector to which all collected data is appended.
- StmtDataCollector(const Stmt *S, ASTContext &Context,
- std::vector<CloneDetector::DataPiece> &D)
- : Context(Context), CollectedData(D) {
- Visit(S);
+ /// \param D The data sink to which all data is forwarded.
+ StmtDataCollector(const Stmt *S, ASTContext &Context, T &DataConsumer)
+ : Context(Context), DataConsumer(DataConsumer) {
+ this->Visit(S);
}
// Below are utility methods for appending different data to the vector.
void addData(CloneDetector::DataPiece Integer) {
- CollectedData.push_back(Integer);
+ DataConsumer.update(
+ StringRef(reinterpret_cast<char *>(&Integer), sizeof(Integer)));
}
- // FIXME: The functions below add long strings to the data vector which are
- // probably not good for performance. Replace the strings with pointer values
- // or a some other unique integer.
-
- void addData(llvm::StringRef Str) {
- if (Str.empty())
- return;
-
- const size_t OldSize = CollectedData.size();
-
- const size_t PieceSize = sizeof(CloneDetector::DataPiece);
- // Calculate how many vector units we need to accomodate all string bytes.
- size_t RoundedUpPieceNumber = (Str.size() + PieceSize - 1) / PieceSize;
- // Allocate space for the string in the data vector.
- CollectedData.resize(CollectedData.size() + RoundedUpPieceNumber);
-
- // Copy the string to the allocated space at the end of the vector.
- std::memcpy(CollectedData.data() + OldSize, Str.data(), Str.size());
- }
+ void addData(llvm::StringRef Str) { DataConsumer.update(Str); }
void addData(const QualType &QT) { addData(QT.getAsString()); }
@@ -484,8 +473,10 @@ class CloneSignatureGenerator {
// Create an empty signature that will be filled in this method.
CloneDetector::CloneSignature Signature;
- // Collect all relevant data from S and put it into the empty signature.
- StmtDataCollector(S, Context, Signature.Data);
+ llvm::MD5 Hash;
+
+ // Collect all relevant data from S and hash it.
+ StmtDataCollector<llvm::MD5>(S, Context, Hash);
// Look up what macros expanded into the current statement.
std::string StartMacroStack = getMacroStack(S->getLocStart(), Context);
@@ -523,7 +514,9 @@ class CloneSignatureGenerator {
auto ChildSignature = generateSignatures(Child, StartMacroStack);
// Add the collected data to the signature of the current statement.
- Signature.add(ChildSignature);
+ Signature.Complexity += ChildSignature.Complexity;
+ Hash.update(StringRef(reinterpret_cast<char *>(&ChildSignature.Hash),
+ sizeof(ChildSignature.Hash)));
// If the current statement is a CompoundStatement, we need to store the
// signature for the generation of the sub-sequences.
@@ -536,6 +529,14 @@ class CloneSignatureGenerator {
if (CS)
handleSubSequences(CS, ChildSignatures);
+ // Create the final hash code for the current signature.
+ llvm::MD5::MD5Result HashResult;
+ Hash.final(HashResult);
+
+ // Copy as much of the generated hash code to the signature's hash code.
+ std::memcpy(&Signature.Hash, &HashResult,
+ std::min(sizeof(Signature.Hash), sizeof(HashResult)));
+
// Save the signature for the current statement in the CloneDetector object.
CD.add(StmtSequence(S, Context), Signature);
@@ -564,11 +565,24 @@ class CloneSignatureGenerator {
// Create an empty signature and add the signatures of all selected
// child statements to it.
CloneDetector::CloneSignature SubSignature;
+ llvm::MD5 SubHash;
for (unsigned i = Pos; i < Pos + Length; ++i) {
- SubSignature.add(ChildSignatures[i]);
+ SubSignature.Complexity += ChildSignatures[i].Complexity;
+ size_t ChildHash = ChildSignatures[i].Hash;
+
+ SubHash.update(StringRef(reinterpret_cast<char *>(&ChildHash),
+ sizeof(ChildHash)));
}
+ // Create the final hash code for the current signature.
+ llvm::MD5::MD5Result HashResult;
+ SubHash.final(HashResult);
+
+ // Copy as much of the generated hash code to the signature's hash code.
+ std::memcpy(&SubSignature.Hash, &HashResult,
+ std::min(sizeof(SubSignature.Hash), sizeof(HashResult)));
+
// Save the signature together with the information about what children
// sequence we selected.
CD.add(StmtSequence(CS, Context, Pos, Pos + Length), SubSignature);
@@ -594,25 +608,7 @@ void CloneDetector::analyzeCodeBody(const Decl *D) {
void CloneDetector::add(const StmtSequence &S,
const CloneSignature &Signature) {
- // StringMap only works with StringRefs, so we create one for our data vector.
- auto &Data = Signature.Data;
- StringRef DataRef = StringRef(reinterpret_cast<const char *>(Data.data()),
- Data.size() * sizeof(unsigned));
-
- // Search with the help of the signature if we already have encountered a
- // clone of the given StmtSequence.
- auto I = CloneGroupIndexes.find(DataRef);
- if (I == CloneGroupIndexes.end()) {
- // We haven't found an existing clone group, so we create a new clone group
- // for this StmtSequence and store the index of it in our search map.
- CloneGroupIndexes[DataRef] = CloneGroups.size();
- CloneGroups.emplace_back(S, Signature.Complexity);
- return;
- }
-
- // We have found an existing clone group and can expand it with the given
- // StmtSequence.
- CloneGroups[I->getValue()].Sequences.push_back(S);
+ Sequences.push_back(std::make_pair(Signature, S));
}
namespace {
@@ -645,14 +641,66 @@ bool containsGroup(CloneDetector::CloneGroup &Group,
}
} // end anonymous namespace
+namespace {
+/// \brief Wrapper around FoldingSetNodeID that it can be used as the template
+/// argument of the StmtDataCollector.
+class FoldingSetNodeIDWrapper {
+
+ llvm::FoldingSetNodeID &FS;
+
+public:
+ FoldingSetNodeIDWrapper(llvm::FoldingSetNodeID &FS) : FS(FS) {}
+
+ void update(StringRef Str) { FS.AddString(Str); }
+};
+} // end anonymous namespace
+
+/// \brief Writes the relevant data from all statements and child statements
+/// in the given StmtSequence into the given FoldingSetNodeID.
+static void CollectStmtSequenceData(const StmtSequence &Sequence,
+ FoldingSetNodeIDWrapper &OutputData) {
+ for (const Stmt *S : Sequence) {
+ StmtDataCollector<FoldingSetNodeIDWrapper>(S, Sequence.getASTContext(),
+ OutputData);
+
+ for (const Stmt *Child : S->children()) {
+ if (!Child)
+ continue;
+
+ CollectStmtSequenceData(StmtSequence(Child, Sequence.getASTContext()),
+ OutputData);
+ }
+ }
+}
+
+/// \brief Returns true if both sequences are clones of each other.
+static bool areSequencesClones(const StmtSequence &LHS,
+ const StmtSequence &RHS) {
+ // We collect the data from all statements in the sequence as we did before
+ // when generating a hash value for each sequence. But this time we don't
+ // hash the collected data and compare the whole data set instead. This
+ // prevents any false-positives due to hash code collisions.
+ llvm::FoldingSetNodeID DataLHS, DataRHS;
+ FoldingSetNodeIDWrapper LHSWrapper(DataLHS);
+ FoldingSetNodeIDWrapper RHSWrapper(DataRHS);
+
+ CollectStmtSequenceData(LHS, LHSWrapper);
+ CollectStmtSequenceData(RHS, RHSWrapper);
+
+ return DataLHS == DataRHS;
+}
+
/// \brief Finds all actual clone groups in a single group of presumed clones.
-/// \param Result Output parameter to which all found groups are added. Every
-/// clone in a group that was added this way follows the same
-/// variable pattern as the other clones in its group.
-/// \param Group A group of clones. The clones are allowed to have a different
-/// variable pattern.
+/// \param Result Output parameter to which all found groups are added.
+/// \param Group A group of presumed clones. The clones are allowed to have a
+/// different variable pattern and may not be actual clones of each
+/// other.
+/// \param CheckVariablePatterns If true, every clone in a group that was added
+/// to the output follows the same variable pattern as the other
+/// clones in its group.
static void createCloneGroups(std::vector<CloneDetector::CloneGroup> &Result,
- const CloneDetector::CloneGroup &Group) {
+ const CloneDetector::CloneGroup &Group,
+ bool CheckVariablePattern) {
// We remove the Sequences one by one, so a list is more appropriate.
std::list<StmtSequence> UnassignedSequences(Group.Sequences.begin(),
Group.Sequences.end());
@@ -664,7 +712,7 @@ static void createCloneGroups(std::vector<CloneDetector::CloneGroup> &Result,
StmtSequence Prototype = UnassignedSequences.front();
UnassignedSequences.pop_front();
- CloneDetector::CloneGroup FilteredGroup(Prototype, Group.Complexity);
+ CloneDetector::CloneGroup FilteredGroup(Prototype, Group.Signature);
// Analyze the variable pattern of the prototype. Every other StmtSequence
// needs to have the same pattern to get into the new clone group.
@@ -674,12 +722,27 @@ static void createCloneGroups(std::vector<CloneDetector::CloneGroup> &Result,
// and assign them to our new clone group.
auto I = UnassignedSequences.begin(), E = UnassignedSequences.end();
while (I != E) {
+ // If the sequence doesn't fit to the prototype, we have encountered
+ // an unintended hash code collision and we skip it.
+ if (!areSequencesClones(Prototype, *I)) {
+ ++I;
+ continue;
+ }
- if (VariablePattern(*I).countPatternDifferences(PrototypeFeatures) == 0) {
+ // If we weren't asked to check for a matching variable pattern in clone
+ // groups we can add the sequence now to the new clone group.
+ // If we were asked to check for matching variable pattern, we first have
+ // to check that there are no differences between the two patterns and
+ // only proceed if they match.
+ if (!CheckVariablePattern ||
+ VariablePattern(*I).countPatternDifferences(PrototypeFeatures) == 0) {
FilteredGroup.Sequences.push_back(*I);
I = UnassignedSequences.erase(I);
continue;
}
+
+ // We didn't found a matching variable pattern, so we continue with the
+ // next sequence.
++I;
}
@@ -694,14 +757,76 @@ static void createCloneGroups(std::vector<CloneDetector::CloneGroup> &Result,
void CloneDetector::findClones(std::vector<CloneGroup> &Result,
unsigned MinGroupComplexity,
bool CheckPatterns) {
+ // A shortcut (and necessary for the for-loop later in this function).
+ if (Sequences.empty())
+ return;
+
+ // We need to search for groups of StmtSequences with the same hash code to
+ // create our initial clone groups. By sorting all known StmtSequences by
+ // their hash value we make sure that StmtSequences with the same hash code
+ // are grouped together in the Sequences vector.
+ // Note: We stable sort here because the StmtSequences are added in the order
+ // in which they appear in the source file. We want to preserve that order
+ // because we also want to report them in that order in the CloneChecker.
+ std::stable_sort(Sequences.begin(), Sequences.end(),
+ [](std::pair<CloneSignature, StmtSequence> LHS,
+ std::pair<CloneSignature, StmtSequence> RHS) {
+ return LHS.first.Hash < RHS.first.Hash;
+ });
+
+ std::vector<CloneGroup> CloneGroups;
+
+ // Check for each CloneSignature if its successor has the same hash value.
+ // We don't check the last CloneSignature as it has no successor.
+ // Note: The 'size - 1' in the condition is safe because we check for an empty
+ // Sequences vector at the beginning of this function.
+ for (unsigned i = 0; i < Sequences.size() - 1; ++i) {
+ const auto Current = Sequences[i];
+ const auto Next = Sequences[i + 1];
+
+ if (Current.first.Hash != Next.first.Hash)
+ continue;
+
+ // It's likely that we just found an sequence of CloneSignatures that
+ // represent a CloneGroup, so we create a new group and start checking and
+ // adding the CloneSignatures in this sequence.
+ CloneGroup Group;
+ Group.Signature = Current.first;
+
+ for (; i < Sequences.size(); ++i) {
+ const auto &Signature = Sequences[i];
+
+ // A different hash value means we have reached the end of the sequence.
+ if (Current.first.Hash != Signature.first.Hash) {
+ // The current Signature could be the start of a new CloneGroup. So we
+ // decrement i so that we visit it again in the outer loop.
+ // Note: i can never be 0 at this point because we are just comparing
+ // the hash of the Current CloneSignature with itself in the 'if' above.
+ assert(i != 0);
+ --i;
+ break;
+ }
+
+ // Skip CloneSignatures that won't pass the complexity requirement.
+ if (Signature.first.Complexity < MinGroupComplexity)
+ continue;
+
+ Group.Sequences.push_back(Signature.second);
+ }
+
+ // There is a chance that we haven't found more than two fitting
+ // CloneSignature because not enough CloneSignatures passed the complexity
+ // requirement. As a CloneGroup with less than two members makes no sense,
+ // we ignore this CloneGroup and won't add it to the result.
+ if (!Group.isValid())
+ continue;
+
+ CloneGroups.push_back(Group);
+ }
+
// Add every valid clone group that fulfills the complexity requirement.
for (const CloneGroup &Group : CloneGroups) {
- if (Group.isValid() && Group.Complexity >= MinGroupComplexity) {
- if (CheckPatterns)
- createCloneGroups(Result, Group);
- else
- Result.push_back(Group);
- }
+ createCloneGroups(Result, Group, CheckPatterns);
}
std::vector<unsigned> IndexesToRemove;