summaryrefslogtreecommitdiffstats
path: root/lib/Analysis/CloneDetection.cpp
diff options
context:
space:
mode:
authorArtem Dergachev <artem.dergachev@gmail.com>2016-07-26 18:13:12 +0000
committerArtem Dergachev <artem.dergachev@gmail.com>2016-07-26 18:13:12 +0000
commit07b4e2f4072d6277db401b3990e8977632f5d55a (patch)
tree1c0ccd1ce94dc144627f9f8461eafc314b740529 /lib/Analysis/CloneDetection.cpp
parentcefd54750c4216d515b019df09eccd7c13c7612c (diff)
[analyzer] Add basic capabilities to detect source code clones.
This patch adds the CloneDetector class which allows searching source code for clones. For every statement or group of statements within a compound statement, CloneDetector computes a hash value, and finds clones by detecting identical hash values. This initial patch only provides a simple hashing mechanism that hashes the kind of each sub-statement. This patch also adds CloneChecker - a simple static analyzer checker that uses CloneDetector to report copy-pasted code. Patch by Raphael Isemann! Differential Revision: https://reviews.llvm.org/D20795 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@276782 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Analysis/CloneDetection.cpp')
-rw-r--r--lib/Analysis/CloneDetection.cpp277
1 files changed, 277 insertions, 0 deletions
diff --git a/lib/Analysis/CloneDetection.cpp b/lib/Analysis/CloneDetection.cpp
new file mode 100644
index 0000000000..0d3926ebd3
--- /dev/null
+++ b/lib/Analysis/CloneDetection.cpp
@@ -0,0 +1,277 @@
+//===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file implements classes for searching and anlyzing source code clones.
+///
+//===----------------------------------------------------------------------===//
+
+#include "clang/Analysis/CloneDetection.h"
+
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/AST/Stmt.h"
+#include "llvm/ADT/StringRef.h"
+
+using namespace clang;
+
+StmtSequence::StmtSequence(const CompoundStmt *Stmt, ASTContext &Context,
+ unsigned StartIndex, unsigned EndIndex)
+ : S(Stmt), Context(&Context), StartIndex(StartIndex), EndIndex(EndIndex) {
+ assert(Stmt && "Stmt must not be a nullptr");
+ assert(StartIndex < EndIndex && "Given array should not be empty");
+ assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt");
+}
+
+StmtSequence::StmtSequence(const Stmt *Stmt, ASTContext &Context)
+ : S(Stmt), Context(&Context), StartIndex(0), EndIndex(0) {}
+
+StmtSequence::StmtSequence()
+ : S(nullptr), Context(nullptr), StartIndex(0), EndIndex(0) {}
+
+bool StmtSequence::contains(const StmtSequence &Other) const {
+ // If both sequences reside in different translation units, they can never
+ // contain each other.
+ if (Context != Other.Context)
+ return false;
+
+ const SourceManager &SM = Context->getSourceManager();
+
+ // Otherwise check if the start and end locations of the current sequence
+ // surround the other sequence.
+ bool StartIsInBounds =
+ SM.isBeforeInTranslationUnit(getStartLoc(), Other.getStartLoc()) ||
+ getStartLoc() == Other.getStartLoc();
+ if (!StartIsInBounds)
+ return false;
+
+ bool EndIsInBounds =
+ SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) ||
+ Other.getEndLoc() == getEndLoc();
+ return EndIsInBounds;
+}
+
+StmtSequence::iterator StmtSequence::begin() const {
+ if (!holdsSequence()) {
+ return &S;
+ }
+ auto CS = cast<CompoundStmt>(S);
+ return CS->body_begin() + StartIndex;
+}
+
+StmtSequence::iterator StmtSequence::end() const {
+ if (!holdsSequence()) {
+ return &S + 1;
+ }
+ auto CS = cast<CompoundStmt>(S);
+ return CS->body_begin() + EndIndex;
+}
+
+SourceLocation StmtSequence::getStartLoc() const {
+ return front()->getLocStart();
+}
+
+SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); }
+
+namespace {
+/// Generates CloneSignatures for a set of statements and stores the results in
+/// a CloneDetector object.
+class CloneSignatureGenerator {
+
+ CloneDetector &CD;
+ ASTContext &Context;
+
+ /// \brief Generates CloneSignatures for all statements in the given statement
+ /// tree and stores them in the CloneDetector.
+ ///
+ /// \param S The root of the given statement tree.
+ /// \return The CloneSignature of the root statement.
+ CloneDetector::CloneSignature generateSignatures(const Stmt *S) {
+ // Create an empty signature that will be filled in this method.
+ CloneDetector::CloneSignature Signature;
+
+ // The only relevant data for now is the class of the statement.
+ // TODO: Collect statement class specific data.
+ Signature.Data.push_back(S->getStmtClass());
+
+ // Storage for the signatures of the direct child statements. This is only
+ // needed if the current statement is a CompoundStmt.
+ std::vector<CloneDetector::CloneSignature> ChildSignatures;
+ const CompoundStmt *CS = dyn_cast<const CompoundStmt>(S);
+
+ // The signature of a statement includes the signatures of its children.
+ // Therefore we create the signatures for every child and add them to the
+ // current signature.
+ for (const Stmt *Child : S->children()) {
+ // Some statements like 'if' can have nullptr children that we will skip.
+ if (!Child)
+ continue;
+
+ // Recursive call to create the signature of the child statement. This
+ // will also create and store all clone groups in this child statement.
+ auto ChildSignature = generateSignatures(Child);
+
+ // Add the collected data to the signature of the current statement.
+ Signature.add(ChildSignature);
+
+ // If the current statement is a CompoundStatement, we need to store the
+ // signature for the generation of the sub-sequences.
+ if (CS)
+ ChildSignatures.push_back(ChildSignature);
+ }
+
+ // If the current statement is a CompoundStmt, we also need to create the
+ // clone groups from the sub-sequences inside the children.
+ if (CS)
+ handleSubSequences(CS, ChildSignatures);
+
+ // Save the signature for the current statement in the CloneDetector object.
+ CD.add(StmtSequence(S, Context), Signature);
+
+ return Signature;
+ }
+
+ /// \brief Adds all possible sub-sequences in the child array of the given
+ /// CompoundStmt to the CloneDetector.
+ /// \param CS The given CompoundStmt.
+ /// \param ChildSignatures A list of calculated signatures for each child in
+ /// the given CompoundStmt.
+ void handleSubSequences(
+ const CompoundStmt *CS,
+ const std::vector<CloneDetector::CloneSignature> &ChildSignatures) {
+
+ // FIXME: This function has quadratic runtime right now. Check if skipping
+ // this function for too long CompoundStmts is an option.
+
+ // The length of the sub-sequence. We don't need to handle sequences with
+ // the length 1 as they are already handled in CollectData().
+ for (unsigned Length = 2; Length <= CS->size(); ++Length) {
+ // The start index in the body of the CompoundStmt. We increase the
+ // position until the end of the sub-sequence reaches the end of the
+ // CompoundStmt body.
+ for (unsigned Pos = 0; Pos <= CS->size() - Length; ++Pos) {
+ // Create an empty signature and add the signatures of all selected
+ // child statements to it.
+ CloneDetector::CloneSignature SubSignature;
+
+ for (unsigned i = Pos; i < Pos + Length; ++i) {
+ SubSignature.add(ChildSignatures[i]);
+ }
+
+ // Save the signature together with the information about what children
+ // sequence we selected.
+ CD.add(StmtSequence(CS, Context, Pos, Pos + Length), SubSignature);
+ }
+ }
+ }
+
+public:
+ explicit CloneSignatureGenerator(CloneDetector &CD, ASTContext &Context)
+ : CD(CD), Context(Context) {}
+
+ /// \brief Generates signatures for all statements in the given function body.
+ void consumeCodeBody(const Stmt *S) { generateSignatures(S); }
+};
+} // end anonymous namespace
+
+void CloneDetector::analyzeCodeBody(const Decl *D) {
+ assert(D);
+ assert(D->hasBody());
+ CloneSignatureGenerator Generator(*this, D->getASTContext());
+ Generator.consumeCodeBody(D->getBody());
+}
+
+void CloneDetector::add(const StmtSequence &S,
+ const CloneSignature &Signature) {
+ // StringMap only works with StringRefs, so we create one for our data vector.
+ auto &Data = Signature.Data;
+ StringRef DataRef = StringRef(reinterpret_cast<const char *>(Data.data()),
+ Data.size() * sizeof(unsigned));
+
+ // Search with the help of the signature if we already have encountered a
+ // clone of the given StmtSequence.
+ auto I = CloneGroupIndexes.find(DataRef);
+ if (I == CloneGroupIndexes.end()) {
+ // We haven't found an existing clone group, so we create a new clone group
+ // for this StmtSequence and store the index of it in our search map.
+ CloneGroupIndexes[DataRef] = CloneGroups.size();
+ CloneGroups.emplace_back(S, Signature.Complexity);
+ return;
+ }
+
+ // We have found an existing clone group and can expand it with the given
+ // StmtSequence.
+ CloneGroups[I->getValue()].Sequences.push_back(S);
+}
+
+namespace {
+/// \brief Returns true if and only if \p Stmt contains at least one other
+/// sequence in the \p Group.
+bool containsAnyInGroup(StmtSequence &Stmt,
+ CloneDetector::CloneGroup &Group) {
+ for (StmtSequence &GroupStmt : Group.Sequences) {
+ if (Stmt.contains(GroupStmt))
+ return true;
+ }
+ return false;
+}
+
+/// \brief Returns true if and only if all sequences in \p OtherGroup are
+/// contained by a sequence in \p Group.
+bool containsGroup(CloneDetector::CloneGroup &Group,
+ CloneDetector::CloneGroup &OtherGroup) {
+ // We have less sequences in the current group than we have in the other,
+ // so we will never fulfill the requirement for returning true. This is only
+ // possible because we know that a sequence in Group can contain at most
+ // one sequence in OtherGroup.
+ if (Group.Sequences.size() < OtherGroup.Sequences.size())
+ return false;
+
+ for (StmtSequence &Stmt : Group.Sequences) {
+ if (!containsAnyInGroup(Stmt, OtherGroup))
+ return false;
+ }
+ return true;
+}
+} // end anonymous namespace
+
+void CloneDetector::findClones(std::vector<CloneGroup> &Result,
+ unsigned MinGroupComplexity) {
+ // Add every valid clone group that fulfills the complexity requirement.
+ for (const CloneGroup &Group : CloneGroups) {
+ if (Group.isValid() && Group.Complexity >= MinGroupComplexity) {
+ Result.push_back(Group);
+ }
+ }
+
+ std::vector<unsigned> IndexesToRemove;
+
+ // Compare every group in the result with the rest. If one groups contains
+ // another group, we only need to return the bigger group.
+ // Note: This doesn't scale well, so if possible avoid calling any heavy
+ // function from this loop to minimize the performance impact.
+ for (unsigned i = 0; i < Result.size(); ++i) {
+ for (unsigned j = 0; j < Result.size(); ++j) {
+ // Don't compare a group with itself.
+ if (i == j)
+ continue;
+
+ if (containsGroup(Result[j], Result[i])) {
+ IndexesToRemove.push_back(i);
+ break;
+ }
+ }
+ }
+
+ // Erasing a list of indexes from the vector should be done with decreasing
+ // indexes. As IndexesToRemove is constructed with increasing values, we just
+ // reverse iterate over it to get the desired order.
+ for (auto I = IndexesToRemove.rbegin(); I != IndexesToRemove.rend(); ++I) {
+ Result.erase(Result.begin() + *I);
+ }
+}