diff options
author | Artem Dergachev <artem.dergachev@gmail.com> | 2016-07-26 18:13:12 +0000 |
---|---|---|
committer | Artem Dergachev <artem.dergachev@gmail.com> | 2016-07-26 18:13:12 +0000 |
commit | 07b4e2f4072d6277db401b3990e8977632f5d55a (patch) | |
tree | 1c0ccd1ce94dc144627f9f8461eafc314b740529 /lib/Analysis/CloneDetection.cpp | |
parent | cefd54750c4216d515b019df09eccd7c13c7612c (diff) |
[analyzer] Add basic capabilities to detect source code clones.
This patch adds the CloneDetector class which allows searching source code
for clones.
For every statement or group of statements within a compound statement,
CloneDetector computes a hash value, and finds clones by detecting
identical hash values.
This initial patch only provides a simple hashing mechanism
that hashes the kind of each sub-statement.
This patch also adds CloneChecker - a simple static analyzer checker
that uses CloneDetector to report copy-pasted code.
Patch by Raphael Isemann!
Differential Revision: https://reviews.llvm.org/D20795
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@276782 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Analysis/CloneDetection.cpp')
-rw-r--r-- | lib/Analysis/CloneDetection.cpp | 277 |
1 files changed, 277 insertions, 0 deletions
diff --git a/lib/Analysis/CloneDetection.cpp b/lib/Analysis/CloneDetection.cpp new file mode 100644 index 0000000000..0d3926ebd3 --- /dev/null +++ b/lib/Analysis/CloneDetection.cpp @@ -0,0 +1,277 @@ +//===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// This file implements classes for searching and anlyzing source code clones. +/// +//===----------------------------------------------------------------------===// + +#include "clang/Analysis/CloneDetection.h" + +#include "clang/AST/ASTContext.h" +#include "clang/AST/RecursiveASTVisitor.h" +#include "clang/AST/Stmt.h" +#include "llvm/ADT/StringRef.h" + +using namespace clang; + +StmtSequence::StmtSequence(const CompoundStmt *Stmt, ASTContext &Context, + unsigned StartIndex, unsigned EndIndex) + : S(Stmt), Context(&Context), StartIndex(StartIndex), EndIndex(EndIndex) { + assert(Stmt && "Stmt must not be a nullptr"); + assert(StartIndex < EndIndex && "Given array should not be empty"); + assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt"); +} + +StmtSequence::StmtSequence(const Stmt *Stmt, ASTContext &Context) + : S(Stmt), Context(&Context), StartIndex(0), EndIndex(0) {} + +StmtSequence::StmtSequence() + : S(nullptr), Context(nullptr), StartIndex(0), EndIndex(0) {} + +bool StmtSequence::contains(const StmtSequence &Other) const { + // If both sequences reside in different translation units, they can never + // contain each other. + if (Context != Other.Context) + return false; + + const SourceManager &SM = Context->getSourceManager(); + + // Otherwise check if the start and end locations of the current sequence + // surround the other sequence. + bool StartIsInBounds = + SM.isBeforeInTranslationUnit(getStartLoc(), Other.getStartLoc()) || + getStartLoc() == Other.getStartLoc(); + if (!StartIsInBounds) + return false; + + bool EndIsInBounds = + SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) || + Other.getEndLoc() == getEndLoc(); + return EndIsInBounds; +} + +StmtSequence::iterator StmtSequence::begin() const { + if (!holdsSequence()) { + return &S; + } + auto CS = cast<CompoundStmt>(S); + return CS->body_begin() + StartIndex; +} + +StmtSequence::iterator StmtSequence::end() const { + if (!holdsSequence()) { + return &S + 1; + } + auto CS = cast<CompoundStmt>(S); + return CS->body_begin() + EndIndex; +} + +SourceLocation StmtSequence::getStartLoc() const { + return front()->getLocStart(); +} + +SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); } + +namespace { +/// Generates CloneSignatures for a set of statements and stores the results in +/// a CloneDetector object. +class CloneSignatureGenerator { + + CloneDetector &CD; + ASTContext &Context; + + /// \brief Generates CloneSignatures for all statements in the given statement + /// tree and stores them in the CloneDetector. + /// + /// \param S The root of the given statement tree. + /// \return The CloneSignature of the root statement. + CloneDetector::CloneSignature generateSignatures(const Stmt *S) { + // Create an empty signature that will be filled in this method. + CloneDetector::CloneSignature Signature; + + // The only relevant data for now is the class of the statement. + // TODO: Collect statement class specific data. + Signature.Data.push_back(S->getStmtClass()); + + // Storage for the signatures of the direct child statements. This is only + // needed if the current statement is a CompoundStmt. + std::vector<CloneDetector::CloneSignature> ChildSignatures; + const CompoundStmt *CS = dyn_cast<const CompoundStmt>(S); + + // The signature of a statement includes the signatures of its children. + // Therefore we create the signatures for every child and add them to the + // current signature. + for (const Stmt *Child : S->children()) { + // Some statements like 'if' can have nullptr children that we will skip. + if (!Child) + continue; + + // Recursive call to create the signature of the child statement. This + // will also create and store all clone groups in this child statement. + auto ChildSignature = generateSignatures(Child); + + // Add the collected data to the signature of the current statement. + Signature.add(ChildSignature); + + // If the current statement is a CompoundStatement, we need to store the + // signature for the generation of the sub-sequences. + if (CS) + ChildSignatures.push_back(ChildSignature); + } + + // If the current statement is a CompoundStmt, we also need to create the + // clone groups from the sub-sequences inside the children. + if (CS) + handleSubSequences(CS, ChildSignatures); + + // Save the signature for the current statement in the CloneDetector object. + CD.add(StmtSequence(S, Context), Signature); + + return Signature; + } + + /// \brief Adds all possible sub-sequences in the child array of the given + /// CompoundStmt to the CloneDetector. + /// \param CS The given CompoundStmt. + /// \param ChildSignatures A list of calculated signatures for each child in + /// the given CompoundStmt. + void handleSubSequences( + const CompoundStmt *CS, + const std::vector<CloneDetector::CloneSignature> &ChildSignatures) { + + // FIXME: This function has quadratic runtime right now. Check if skipping + // this function for too long CompoundStmts is an option. + + // The length of the sub-sequence. We don't need to handle sequences with + // the length 1 as they are already handled in CollectData(). + for (unsigned Length = 2; Length <= CS->size(); ++Length) { + // The start index in the body of the CompoundStmt. We increase the + // position until the end of the sub-sequence reaches the end of the + // CompoundStmt body. + for (unsigned Pos = 0; Pos <= CS->size() - Length; ++Pos) { + // Create an empty signature and add the signatures of all selected + // child statements to it. + CloneDetector::CloneSignature SubSignature; + + for (unsigned i = Pos; i < Pos + Length; ++i) { + SubSignature.add(ChildSignatures[i]); + } + + // Save the signature together with the information about what children + // sequence we selected. + CD.add(StmtSequence(CS, Context, Pos, Pos + Length), SubSignature); + } + } + } + +public: + explicit CloneSignatureGenerator(CloneDetector &CD, ASTContext &Context) + : CD(CD), Context(Context) {} + + /// \brief Generates signatures for all statements in the given function body. + void consumeCodeBody(const Stmt *S) { generateSignatures(S); } +}; +} // end anonymous namespace + +void CloneDetector::analyzeCodeBody(const Decl *D) { + assert(D); + assert(D->hasBody()); + CloneSignatureGenerator Generator(*this, D->getASTContext()); + Generator.consumeCodeBody(D->getBody()); +} + +void CloneDetector::add(const StmtSequence &S, + const CloneSignature &Signature) { + // StringMap only works with StringRefs, so we create one for our data vector. + auto &Data = Signature.Data; + StringRef DataRef = StringRef(reinterpret_cast<const char *>(Data.data()), + Data.size() * sizeof(unsigned)); + + // Search with the help of the signature if we already have encountered a + // clone of the given StmtSequence. + auto I = CloneGroupIndexes.find(DataRef); + if (I == CloneGroupIndexes.end()) { + // We haven't found an existing clone group, so we create a new clone group + // for this StmtSequence and store the index of it in our search map. + CloneGroupIndexes[DataRef] = CloneGroups.size(); + CloneGroups.emplace_back(S, Signature.Complexity); + return; + } + + // We have found an existing clone group and can expand it with the given + // StmtSequence. + CloneGroups[I->getValue()].Sequences.push_back(S); +} + +namespace { +/// \brief Returns true if and only if \p Stmt contains at least one other +/// sequence in the \p Group. +bool containsAnyInGroup(StmtSequence &Stmt, + CloneDetector::CloneGroup &Group) { + for (StmtSequence &GroupStmt : Group.Sequences) { + if (Stmt.contains(GroupStmt)) + return true; + } + return false; +} + +/// \brief Returns true if and only if all sequences in \p OtherGroup are +/// contained by a sequence in \p Group. +bool containsGroup(CloneDetector::CloneGroup &Group, + CloneDetector::CloneGroup &OtherGroup) { + // We have less sequences in the current group than we have in the other, + // so we will never fulfill the requirement for returning true. This is only + // possible because we know that a sequence in Group can contain at most + // one sequence in OtherGroup. + if (Group.Sequences.size() < OtherGroup.Sequences.size()) + return false; + + for (StmtSequence &Stmt : Group.Sequences) { + if (!containsAnyInGroup(Stmt, OtherGroup)) + return false; + } + return true; +} +} // end anonymous namespace + +void CloneDetector::findClones(std::vector<CloneGroup> &Result, + unsigned MinGroupComplexity) { + // Add every valid clone group that fulfills the complexity requirement. + for (const CloneGroup &Group : CloneGroups) { + if (Group.isValid() && Group.Complexity >= MinGroupComplexity) { + Result.push_back(Group); + } + } + + std::vector<unsigned> IndexesToRemove; + + // Compare every group in the result with the rest. If one groups contains + // another group, we only need to return the bigger group. + // Note: This doesn't scale well, so if possible avoid calling any heavy + // function from this loop to minimize the performance impact. + for (unsigned i = 0; i < Result.size(); ++i) { + for (unsigned j = 0; j < Result.size(); ++j) { + // Don't compare a group with itself. + if (i == j) + continue; + + if (containsGroup(Result[j], Result[i])) { + IndexesToRemove.push_back(i); + break; + } + } + } + + // Erasing a list of indexes from the vector should be done with decreasing + // indexes. As IndexesToRemove is constructed with increasing values, we just + // reverse iterate over it to get the desired order. + for (auto I = IndexesToRemove.rbegin(); I != IndexesToRemove.rend(); ++I) { + Result.erase(Result.begin() + *I); + } +} |