diff options
author | Artem Dergachev <artem.dergachev@gmail.com> | 2016-08-18 12:29:41 +0000 |
---|---|---|
committer | Artem Dergachev <artem.dergachev@gmail.com> | 2016-08-18 12:29:41 +0000 |
commit | ac4644401fe7e7305aed433541d6274203a69b97 (patch) | |
tree | d8e5d36b8019fcada8d21bfe83b6caf2fa6e08a7 /lib/Analysis/CloneDetection.cpp | |
parent | 9d09f35654f5b4bbbf94a97bf9156f885301d6f5 (diff) |
[analyzer] Teach CloneDetector to find clones that look like copy-paste errors.
The original clone checker tries to find copy-pasted code that is exactly
identical to the original code, up to minor details.
As an example, if the copy-pasted code has all references to variable 'a'
replaced with references to variable 'b', it is still considered to be
an exact clone.
The new check finds copy-pasted code in which exactly one variable seems
out of place compared to the original code, which likely indicates
a copy-paste error (a variable was forgotten to be renamed in one place).
Patch by Raphael Isemann!
Differential Revision: https://reviews.llvm.org/D23314
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@279056 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Analysis/CloneDetection.cpp')
-rw-r--r-- | lib/Analysis/CloneDetection.cpp | 143 |
1 files changed, 123 insertions, 20 deletions
diff --git a/lib/Analysis/CloneDetection.cpp b/lib/Analysis/CloneDetection.cpp index 27815f30ae..a26409f950 100644 --- a/lib/Analysis/CloneDetection.cpp +++ b/lib/Analysis/CloneDetection.cpp @@ -88,8 +88,11 @@ class VariablePattern { struct VariableOccurence { /// The index of the associated VarDecl in the Variables vector. size_t KindID; + /// The source range in the code where the variable was referenced. + SourceRange Range; - VariableOccurence(size_t KindID) : KindID(KindID) {} + VariableOccurence(size_t KindID, SourceRange Range) + : KindID(KindID), Range(Range) {} }; /// All occurences of referenced variables in the order of appearance. @@ -100,19 +103,20 @@ class VariablePattern { /// \brief Adds a new variable referenced to this pattern. /// \param VarDecl The declaration of the variable that is referenced. - void addVariableOccurence(const VarDecl *VarDecl) { + /// \param Range The SourceRange where this variable is referenced. + void addVariableOccurence(const VarDecl *VarDecl, SourceRange Range) { // First check if we already reference this variable for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) { if (Variables[KindIndex] == VarDecl) { // If yes, add a new occurence that points to the existing entry in // the Variables vector. - Occurences.emplace_back(KindIndex); + Occurences.emplace_back(KindIndex, Range); return; } } // If this variable wasn't already referenced, add it to the list of // referenced variables and add a occurence that points to this new entry. - Occurences.emplace_back(Variables.size()); + Occurences.emplace_back(Variables.size(), Range); Variables.push_back(VarDecl); } @@ -127,7 +131,7 @@ class VariablePattern { // Check if S is a reference to a variable. If yes, add it to the pattern. if (auto D = dyn_cast<DeclRefExpr>(S)) { if (auto VD = dyn_cast<VarDecl>(D->getDecl()->getCanonicalDecl())) - addVariableOccurence(VD); + addVariableOccurence(VD, D->getSourceRange()); } // Recursively check all children of the given statement. @@ -144,33 +148,93 @@ public: addVariables(S); } - /// \brief Compares this pattern with the given one. + /// \brief Counts the differences between this pattern and the given one. /// \param Other The given VariablePattern to compare with. - /// \return Returns true if and only if the references variables in this - /// object follow the same pattern than the ones in the given - /// VariablePattern. + /// \param FirstMismatch Output parameter that will be filled with information + /// about the first difference between the two patterns. This parameter + /// can be a nullptr, in which case it will be ignored. + /// \return Returns the number of differences between the pattern this object + /// is following and the given VariablePattern. /// - /// For example, the following statements all have the same pattern: + /// For example, the following statements all have the same pattern and this + /// function would return zero: /// /// if (a < b) return a; return b; /// if (x < y) return x; return y; /// if (u2 < u1) return u2; return u1; /// - /// but the following statement has a different pattern (note the changed - /// variables in the return statements). + /// But the following statement has a different pattern (note the changed + /// variables in the return statements) and would have two differences when + /// compared with one of the statements above. /// /// if (a < b) return b; return a; /// /// This function should only be called if the related statements of the given /// pattern and the statements of this objects are clones of each other. - bool comparePattern(const VariablePattern &Other) { + unsigned countPatternDifferences( + const VariablePattern &Other, + CloneDetector::SuspiciousClonePair *FirstMismatch = nullptr) { + unsigned NumberOfDifferences = 0; + assert(Other.Occurences.size() == Occurences.size()); for (unsigned i = 0; i < Occurences.size(); ++i) { - if (Occurences[i].KindID != Other.Occurences[i].KindID) { - return false; - } + auto ThisOccurence = Occurences[i]; + auto OtherOccurence = Other.Occurences[i]; + if (ThisOccurence.KindID == OtherOccurence.KindID) + continue; + + ++NumberOfDifferences; + + // If FirstMismatch is not a nullptr, we need to store information about + // the first difference between the two patterns. + if (FirstMismatch == nullptr) + continue; + + // Only proceed if we just found the first difference as we only store + // information about the first difference. + if (NumberOfDifferences != 1) + continue; + + const VarDecl *FirstSuggestion = nullptr; + // If there is a variable available in the list of referenced variables + // which wouldn't break the pattern if it is used in place of the + // current variable, we provide this variable as the suggested fix. + if (OtherOccurence.KindID < Variables.size()) + FirstSuggestion = Variables[OtherOccurence.KindID]; + + // Store information about the first clone. + FirstMismatch->FirstCloneInfo = + CloneDetector::SuspiciousClonePair::SuspiciousCloneInfo( + Variables[ThisOccurence.KindID], ThisOccurence.Range, + FirstSuggestion); + + // Same as above but with the other clone. We do this for both clones as + // we don't know which clone is the one containing the unintended + // pattern error. + const VarDecl *SecondSuggestion = nullptr; + if (ThisOccurence.KindID < Other.Variables.size()) + SecondSuggestion = Other.Variables[ThisOccurence.KindID]; + + // Store information about the second clone. + FirstMismatch->SecondCloneInfo = + CloneDetector::SuspiciousClonePair::SuspiciousCloneInfo( + Variables[ThisOccurence.KindID], OtherOccurence.Range, + SecondSuggestion); + + // SuspiciousClonePair guarantees that the first clone always has a + // suggested variable associated with it. As we know that one of the two + // clones in the pair always has suggestion, we swap the two clones + // in case the first clone has no suggested variable which means that + // the second clone has a suggested variable and should be first. + if (!FirstMismatch->FirstCloneInfo.Suggestion) + std::swap(FirstMismatch->FirstCloneInfo, + FirstMismatch->SecondCloneInfo); + + // This ensures that we always have at least one suggestion in a pair. + assert(FirstMismatch->FirstCloneInfo.Suggestion); } - return true; + + return NumberOfDifferences; } }; } @@ -529,7 +593,8 @@ static void createCloneGroups(std::vector<CloneDetector::CloneGroup> &Result, // and assign them to our new clone group. auto I = UnassignedSequences.begin(), E = UnassignedSequences.end(); while (I != E) { - if (VariablePattern(*I).comparePattern(PrototypeFeatures)) { + + if (VariablePattern(*I).countPatternDifferences(PrototypeFeatures) == 0) { FilteredGroup.Sequences.push_back(*I); I = UnassignedSequences.erase(I); continue; @@ -546,11 +611,15 @@ static void createCloneGroups(std::vector<CloneDetector::CloneGroup> &Result, } void CloneDetector::findClones(std::vector<CloneGroup> &Result, - unsigned MinGroupComplexity) { + unsigned MinGroupComplexity, + bool CheckPatterns) { // Add every valid clone group that fulfills the complexity requirement. for (const CloneGroup &Group : CloneGroups) { if (Group.isValid() && Group.Complexity >= MinGroupComplexity) { - createCloneGroups(Result, Group); + if (CheckPatterns) + createCloneGroups(Result, Group); + else + Result.push_back(Group); } } @@ -580,3 +649,37 @@ void CloneDetector::findClones(std::vector<CloneGroup> &Result, Result.erase(Result.begin() + *I); } } + +void CloneDetector::findSuspiciousClones( + std::vector<CloneDetector::SuspiciousClonePair> &Result, + unsigned MinGroupComplexity) { + std::vector<CloneGroup> Clones; + // Reuse the normal search for clones but specify that the clone groups don't + // need to have a common referenced variable pattern so that we can manually + // search for the kind of pattern errors this function is supposed to find. + findClones(Clones, MinGroupComplexity, false); + + for (const CloneGroup &Group : Clones) { + for (unsigned i = 0; i < Group.Sequences.size(); ++i) { + VariablePattern PatternA(Group.Sequences[i]); + + for (unsigned j = i + 1; j < Group.Sequences.size(); ++j) { + VariablePattern PatternB(Group.Sequences[j]); + + CloneDetector::SuspiciousClonePair ClonePair; + // For now, we only report clones which break the variable pattern just + // once because multiple differences in a pattern are an indicator that + // those differences are maybe intended (e.g. because it's actually + // a different algorithm). + // TODO: In very big clones even multiple variables can be unintended, + // so replacing this number with a percentage could better handle such + // cases. On the other hand it could increase the false-positive rate + // for all clones if the percentage is too high. + if (PatternA.countPatternDifferences(PatternB, &ClonePair) == 1) { + Result.push_back(ClonePair); + break; + } + } + } + } +} |