[analyzer] Add basic capabilities to detect source code clones.

This patch adds the CloneDetector class which allows searching source code for clones. For every statement or group of statements within a compound statement, CloneDetector computes a hash value, and finds clones by detecting identical hash values. This initial patch only provides a simple hashing mechanism that hashes the kind of each sub-statement. This patch also adds CloneChecker - a simple static analyzer checker that uses CloneDetector to report copy-pasted code. Patch by Raphael Isemann! Differential Revision: https://reviews.llvm.org/D20795 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@276782 91177308-0d34-0410-b5e6-96231b3b80d8
author: Artem Dergachev <artem.dergachev@gmail.com> 2016-07-26 18:13:12 +0000
committer: Artem Dergachev <artem.dergachev@gmail.com> 2016-07-26 18:13:12 +0000
commit: 07b4e2f4072d6277db401b3990e8977632f5d55a (patch)
tree: 1c0ccd1ce94dc144627f9f8461eafc314b740529 /lib/Analysis/CloneDetection.cpp
parent: cefd54750c4216d515b019df09eccd7c13c7612c (diff)
1 files changed, 277 insertions, 0 deletions
diff --git a/lib/Analysis/CloneDetection.cpp b/lib/Analysis/CloneDetection.cpp
new file mode 100644
index 0000000000..0d3926ebd3
--- /dev/null
+++ b/lib/Analysis/CloneDetection.cpp
@@ -0,0 +1,277 @@
+//===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+///  This file implements classes for searching and anlyzing source code clones.
+///
+//===----------------------------------------------------------------------===//
+
+#include "clang/Analysis/CloneDetection.h"
+
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/AST/Stmt.h"
+#include "llvm/ADT/StringRef.h"
+
+using namespace clang;
+
+StmtSequence::StmtSequence(const CompoundStmt *Stmt, ASTContext &Context,
+                           unsigned StartIndex, unsigned EndIndex)
+    : S(Stmt), Context(&Context), StartIndex(StartIndex), EndIndex(EndIndex) {
+  assert(Stmt && "Stmt must not be a nullptr");
+  assert(StartIndex < EndIndex && "Given array should not be empty");
+  assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt");
+}
+
+StmtSequence::StmtSequence(const Stmt *Stmt, ASTContext &Context)
+    : S(Stmt), Context(&Context), StartIndex(0), EndIndex(0) {}
+
+StmtSequence::StmtSequence()
+    : S(nullptr), Context(nullptr), StartIndex(0), EndIndex(0) {}
+
+bool StmtSequence::contains(const StmtSequence &Other) const {
+  // If both sequences reside in different translation units, they can never
+  // contain each other.
+  if (Context != Other.Context)
+    return false;
+
+  const SourceManager &SM = Context->getSourceManager();
+
+  // Otherwise check if the start and end locations of the current sequence
+  // surround the other sequence.
+  bool StartIsInBounds =
+      SM.isBeforeInTranslationUnit(getStartLoc(), Other.getStartLoc()) ||
+      getStartLoc() == Other.getStartLoc();
+  if (!StartIsInBounds)
+    return false;
+
+  bool EndIsInBounds =
+      SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) ||
+      Other.getEndLoc() == getEndLoc();
+  return EndIsInBounds;
+}
+
+StmtSequence::iterator StmtSequence::begin() const {
+  if (!holdsSequence()) {
+    return &S;
+  }
+  auto CS = cast<CompoundStmt>(S);
+  return CS->body_begin() + StartIndex;
+}
+
+StmtSequence::iterator StmtSequence::end() const {
+  if (!holdsSequence()) {
+    return &S + 1;
+  }
+  auto CS = cast<CompoundStmt>(S);
+  return CS->body_begin() + EndIndex;
+}
+
+SourceLocation StmtSequence::getStartLoc() const {
+  return front()->getLocStart();
+}
+
+SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); }
+
+namespace {
+/// Generates CloneSignatures for a set of statements and stores the results in
+/// a CloneDetector object.
+class CloneSignatureGenerator {
+
+  CloneDetector &CD;
+  ASTContext &Context;
+
+  /// \brief Generates CloneSignatures for all statements in the given statement
+  /// tree and stores them in the CloneDetector.
+  ///
+  /// \param S The root of the given statement tree.
+  /// \return The CloneSignature of the root statement.
+  CloneDetector::CloneSignature generateSignatures(const Stmt *S) {
+    // Create an empty signature that will be filled in this method.
+    CloneDetector::CloneSignature Signature;
+
+    // The only relevant data for now is the class of the statement.
+    // TODO: Collect statement class specific data.
+    Signature.Data.push_back(S->getStmtClass());
+
+    // Storage for the signatures of the direct child statements. This is only
+    // needed if the current statement is a CompoundStmt.
+    std::vector<CloneDetector::CloneSignature> ChildSignatures;
+    const CompoundStmt *CS = dyn_cast<const CompoundStmt>(S);
+
+    // The signature of a statement includes the signatures of its children.
+    // Therefore we create the signatures for every child and add them to the
+    // current signature.
+    for (const Stmt *Child : S->children()) {
+      // Some statements like 'if' can have nullptr children that we will skip.
+      if (!Child)
+        continue;
+
+      // Recursive call to create the signature of the child statement. This
+      // will also create and store all clone groups in this child statement.
+      auto ChildSignature = generateSignatures(Child);
+
+      // Add the collected data to the signature of the current statement.
+      Signature.add(ChildSignature);
+
+      // If the current statement is a CompoundStatement, we need to store the
+      // signature for the generation of the sub-sequences.
+      if (CS)
+        ChildSignatures.push_back(ChildSignature);
+    }
+
+    // If the current statement is a CompoundStmt, we also need to create the
+    // clone groups from the sub-sequences inside the children.
+    if (CS)
+      handleSubSequences(CS, ChildSignatures);
+
+    // Save the signature for the current statement in the CloneDetector object.
+    CD.add(StmtSequence(S, Context), Signature);
+
+    return Signature;
+  }
+
+  /// \brief Adds all possible sub-sequences in the child array of the given
+  ///        CompoundStmt to the CloneDetector.
+  /// \param CS The given CompoundStmt.
+  /// \param ChildSignatures A list of calculated signatures for each child in
+  ///                        the given CompoundStmt.
+  void handleSubSequences(
+      const CompoundStmt *CS,
+      const std::vector<CloneDetector::CloneSignature> &ChildSignatures) {
+
+    // FIXME: This function has quadratic runtime right now. Check if skipping
+    // this function for too long CompoundStmts is an option.
+
+    // The length of the sub-sequence. We don't need to handle sequences with
+    // the length 1 as they are already handled in CollectData().
+    for (unsigned Length = 2; Length <= CS->size(); ++Length) {
+      // The start index in the body of the CompoundStmt. We increase the
+      // position until the end of the sub-sequence reaches the end of the
+      // CompoundStmt body.
+      for (unsigned Pos = 0; Pos <= CS->size() - Length; ++Pos) {
+        // Create an empty signature and add the signatures of all selected
+        // child statements to it.
+        CloneDetector::CloneSignature SubSignature;
+
+        for (unsigned i = Pos; i < Pos + Length; ++i) {
+          SubSignature.add(ChildSignatures[i]);
+        }
+
+        // Save the signature together with the information about what children
+        // sequence we selected.
+        CD.add(StmtSequence(CS, Context, Pos, Pos + Length), SubSignature);
+      }
+    }
+  }
+
+public:
+  explicit CloneSignatureGenerator(CloneDetector &CD, ASTContext &Context)
+      : CD(CD), Context(Context) {}
+
+  /// \brief Generates signatures for all statements in the given function body.
+  void consumeCodeBody(const Stmt *S) { generateSignatures(S); }
+};
+} // end anonymous namespace
+
+void CloneDetector::analyzeCodeBody(const Decl *D) {
+  assert(D);
+  assert(D->hasBody());
+  CloneSignatureGenerator Generator(*this, D->getASTContext());
+  Generator.consumeCodeBody(D->getBody());
+}
+
+void CloneDetector::add(const StmtSequence &S,
+                        const CloneSignature &Signature) {
+  // StringMap only works with StringRefs, so we create one for our data vector.
+  auto &Data = Signature.Data;
+  StringRef DataRef = StringRef(reinterpret_cast<const char *>(Data.data()),
+                                Data.size() * sizeof(unsigned));
+
+  // Search with the help of the signature if we already have encountered a
+  // clone of the given StmtSequence.
+  auto I = CloneGroupIndexes.find(DataRef);
+  if (I == CloneGroupIndexes.end()) {
+    // We haven't found an existing clone group, so we create a new clone group
+    // for this StmtSequence and store the index of it in our search map.
+    CloneGroupIndexes[DataRef] = CloneGroups.size();
+    CloneGroups.emplace_back(S, Signature.Complexity);
+    return;
+  }
+
+  // We have found an existing clone group and can expand it with the given
+  // StmtSequence.
+  CloneGroups[I->getValue()].Sequences.push_back(S);
+}
+
+namespace {
+/// \brief Returns true if and only if \p Stmt contains at least one other
+/// sequence in the \p Group.
+bool containsAnyInGroup(StmtSequence &Stmt,
+                            CloneDetector::CloneGroup &Group) {
+  for (StmtSequence &GroupStmt : Group.Sequences) {
+    if (Stmt.contains(GroupStmt))
+      return true;
+  }
+  return false;
+}
+
+/// \brief Returns true if and only if all sequences in \p OtherGroup are
+/// contained by a sequence in \p Group.
+bool containsGroup(CloneDetector::CloneGroup &Group,
+                   CloneDetector::CloneGroup &OtherGroup) {
+  // We have less sequences in the current group than we have in the other,
+  // so we will never fulfill the requirement for returning true. This is only
+  // possible because we know that a sequence in Group can contain at most
+  // one sequence in OtherGroup.
+  if (Group.Sequences.size() < OtherGroup.Sequences.size())
+    return false;
+
+  for (StmtSequence &Stmt : Group.Sequences) {
+    if (!containsAnyInGroup(Stmt, OtherGroup))
+      return false;
+  }
+  return true;
+}
+} // end anonymous namespace
+
+void CloneDetector::findClones(std::vector<CloneGroup> &Result,
+                               unsigned MinGroupComplexity) {
+  // Add every valid clone group that fulfills the complexity requirement.
+  for (const CloneGroup &Group : CloneGroups) {
+    if (Group.isValid() && Group.Complexity >= MinGroupComplexity) {
+      Result.push_back(Group);
+    }
+  }
+
+  std::vector<unsigned> IndexesToRemove;
+
+  // Compare every group in the result with the rest. If one groups contains
+  // another group, we only need to return the bigger group.
+  // Note: This doesn't scale well, so if possible avoid calling any heavy
+  // function from this loop to minimize the performance impact.
+  for (unsigned i = 0; i < Result.size(); ++i) {
+    for (unsigned j = 0; j < Result.size(); ++j) {
+      // Don't compare a group with itself.
+      if (i == j)
+        continue;
+
+      if (containsGroup(Result[j], Result[i])) {
+        IndexesToRemove.push_back(i);
+        break;
+      }
+    }
+  }
+
+  // Erasing a list of indexes from the vector should be done with decreasing
+  // indexes. As IndexesToRemove is constructed with increasing values, we just
+  // reverse iterate over it to get the desired order.
+  for (auto I = IndexesToRemove.rbegin(); I != IndexesToRemove.rend(); ++I) {
+    Result.erase(Result.begin() + *I);
+  }
+}
author	Artem Dergachev <artem.dergachev@gmail.com>	2016-07-26 18:13:12 +0000
committer	Artem Dergachev <artem.dergachev@gmail.com>	2016-07-26 18:13:12 +0000
commit	07b4e2f4072d6277db401b3990e8977632f5d55a (patch)
tree	1c0ccd1ce94dc144627f9f8461eafc314b740529 /lib/Analysis/CloneDetection.cpp
parent	cefd54750c4216d515b019df09eccd7c13c7612c (diff)