[clangd] FuzzyMatch exposes an API for its word segmentation. NFC

Summary: This is intended to be used for indexing, e.g. in D49417 Reviewers: ioeric, omtcyfz Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, cfe-commits Differential Revision: https://reviews.llvm.org/D49540 git-svn-id: https://llvm.org/svn/llvm-project/clang-tools-extra/trunk@337527 91177308-0d34-0410-b5e6-96231b3b80d8
author: Sam McCall <sam.mccall@gmail.com> 2018-07-20 08:01:37 +0000
committer: Sam McCall <sam.mccall@gmail.com> 2018-07-20 08:01:37 +0000
commit: b45d073374e56ff7a4f16e1ed08e902fd64888d6 (patch)
tree: 078006a2ad2c8e442c8fa493de3509c8f33d3afc
parent: b720ecd98c931e792e7e5f93befb4c1e47c53e68 (diff)
3 files changed, 81 insertions, 45 deletions
diff --git a/clangd/FuzzyMatch.cpp b/clangd/FuzzyMatch.cpp
index b3d58d55..b81020af 100644
--- a/clangd/FuzzyMatch.cpp
+++ b/clangd/FuzzyMatch.cpp
@@ -87,8 +87,8 @@ FuzzyMatcher::FuzzyMatcher(StringRef Pattern)
     for (int W = 0; W < P; ++W)
       for (Action A : {Miss, Match})
         Scores[P][W][A] = {AwfulScore, Miss};
-  if (PatN > 0)
-    calculateRoles(Pat, PatRole, PatTypeSet, PatN);
+  PatTypeSet =
+      calculateRoles(StringRef(Pat, PatN), makeMutableArrayRef(PatRole, PatN));
 }
 
 Optional<float> FuzzyMatcher::match(StringRef Word) {
@@ -110,25 +110,6 @@ Optional<float> FuzzyMatcher::match(StringRef Word) {
   return Score;
 }
 
-// Segmentation of words and patterns.
-// A name like "fooBar_baz" consists of several parts foo, bar, baz.
-// Aligning segmentation of word and pattern improves the fuzzy-match.
-// For example: [lol] matches "LaughingOutLoud" better than "LionPopulation"
-//
-// First we classify each character into types (uppercase, lowercase, etc).
-// Then we look at the sequence: e.g. [upper, lower] is the start of a segment.
-
-// We only distinguish the types of characters that affect segmentation.
-// It's not obvious how to segment digits, we treat them as lowercase letters.
-// As we don't decode UTF-8, we treat bytes over 127 as lowercase too.
-// This means we require exact (case-sensitive) match.
-enum FuzzyMatcher::CharType : unsigned char {
-  Empty = 0,       // Before-the-start and after-the-end (and control chars).
-  Lower = 1,       // Lowercase letters, digits, and non-ASCII bytes.
-  Upper = 2,       // Uppercase letters.
-  Punctuation = 3, // ASCII punctuation (including Space)
-};
-
 // We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte.
 // The top 6 bits of the char select the byte, the bottom 2 select the offset.
 // e.g. 'q' = 010100 01 = byte 28 (55), bits 3-2 (01) -> Lower.
@@ -147,17 +128,6 @@ constexpr static uint8_t CharTypes[] = {
     0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
 };
 
-// Each character's Role is the Head or Tail of a segment, or a Separator.
-// e.g. XMLHttpRequest_Async
-//      +--+---+------ +----
-//      ^Head   ^Tail ^Separator
-enum FuzzyMatcher::CharRole : unsigned char {
-  Unknown = 0,   // Stray control characters or impossible states.
-  Tail = 1,      // Part of a word segment, but not the first character.
-  Head = 2,      // The first character of a word segment.
-  Separator = 3, // Punctuation characters that separate word segments.
-};
-
 // The Role can be determined from the Type of a character and its neighbors:
 //
 //   Example  | Chars | Type | Role
@@ -183,26 +153,28 @@ constexpr static uint8_t CharRoles[] = {
 template <typename T> static T packedLookup(const uint8_t *Data, int I) {
   return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3);
 }
-void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int &TypeSet,
-                                  int N) {
-  assert(N > 0);
+CharTypeSet calculateRoles(StringRef Text, MutableArrayRef<CharRole> Roles) {
+  assert(Text.size() == Roles.size());
+  if (Text.size() == 0)
+    return 0;
   CharType Type = packedLookup<CharType>(CharTypes, Text[0]);
-  TypeSet = 1 << Type;
+  CharTypeSet TypeSet = 1 << Type;
   // Types holds a sliding window of (Prev, Curr, Next) types.
   // Initial value is (Empty, Empty, type of Text[0]).
   int Types = Type;
   // Rotate slides in the type of the next character.
   auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
-  for (int I = 0; I < N - 1; ++I) {
+  for (unsigned I = 0; I < Text.size() - 1; ++I) {
     // For each character, rotate in the next, and look up the role.
     Type = packedLookup<CharType>(CharTypes, Text[I + 1]);
     TypeSet |= 1 << Type;
     Rotate(Type);
-    *Out++ = packedLookup<CharRole>(CharRoles, Types);
+    Roles[I] = packedLookup<CharRole>(CharRoles, Types);
   }
   // For the last character, the "next character" is Empty.
   Rotate(Empty);
-  *Out++ = packedLookup<CharRole>(CharRoles, Types);
+  Roles[Text.size() - 1] = packedLookup<CharRole>(CharRoles, Types);
+  return TypeSet;
 }
 
 // Sets up the data structures matching Word.
@@ -228,7 +200,8 @@ bool FuzzyMatcher::init(StringRef NewWord) {
   // FIXME: some words are hard to tokenize algorithmically.
   // e.g. vsprintf is V S Print F, and should match [pri] but not [int].
   // We could add a tokenization dictionary for common stdlib names.
-  calculateRoles(Word, WordRole, WordTypeSet, WordN);
+  WordTypeSet = calculateRoles(StringRef(Word, WordN),
+                               makeMutableArrayRef(WordRole, WordN));
   return true;
 }
 
diff --git a/clangd/FuzzyMatch.h b/clangd/FuzzyMatch.h
index 552ab78e..f0c7e722 100644
--- a/clangd/FuzzyMatch.h
+++ b/clangd/FuzzyMatch.h
@@ -16,6 +16,7 @@
 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
@@ -24,6 +25,48 @@
 namespace clang {
 namespace clangd {
 
+// Utilities for word segmentation.
+// FuzzyMatcher already incorporates this logic, so most users don't need this.
+//
+// A name like "fooBar_baz" consists of several parts foo, bar, baz.
+// Aligning segmentation of word and pattern improves the fuzzy-match.
+// For example: [lol] matches "LaughingOutLoud" better than "LionPopulation"
+//
+// First we classify each character into types (uppercase, lowercase, etc).
+// Then we look at the sequence: e.g. [upper, lower] is the start of a segment.
+
+// We distinguish the types of characters that affect segmentation.
+// It's not obvious how to segment digits, we treat them as lowercase letters.
+// As we don't decode UTF-8, we treat bytes over 127 as lowercase too.
+// This means we require exact (case-sensitive) match for those characters.
+enum CharType : unsigned char {
+  Empty = 0,       // Before-the-start and after-the-end (and control chars).
+  Lower = 1,       // Lowercase letters, digits, and non-ASCII bytes.
+  Upper = 2,       // Uppercase letters.
+  Punctuation = 3, // ASCII punctuation (including Space)
+};
+// A CharTypeSet is a bitfield representing all the character types in a word.
+// Its bits are 1<<Empty, 1<<Lower, etc.
+using CharTypeSet = unsigned char;
+
+// Each character's Role is the Head or Tail of a segment, or a Separator.
+// e.g. XMLHttpRequest_Async
+//      +--+---+------ +----
+//      ^Head   ^Tail ^Separator
+enum CharRole : unsigned char {
+  Unknown = 0,   // Stray control characters or impossible states.
+  Tail = 1,      // Part of a word segment, but not the first character.
+  Head = 2,      // The first character of a word segment.
+  Separator = 3, // Punctuation characters that separate word segments.
+};
+
+// Compute segmentation of Text.
+// Character roles are stored in Roles (Roles.size() must equal Text.size()).
+// The set of character types encountered is returned, this may inform
+// heuristics for dealing with poorly-segmented identifiers like "strndup".
+CharTypeSet calculateRoles(llvm::StringRef Text,
+                           llvm::MutableArrayRef<CharRole> Roles);
+
 // A matcher capable of matching and scoring strings against a single pattern.
 // It's optimized for matching against many strings - match() does not allocate.
 class FuzzyMatcher {
@@ -48,8 +91,6 @@ public:
 private:
   // We truncate the pattern and the word to bound the cost of matching.
   constexpr static int MaxPat = 63, MaxWord = 127;
-  enum CharRole : unsigned char; // For segmentation.
-  enum CharType : unsigned char; // For segmentation.
   // Action describes how a word character was matched to the pattern.
   // It should be an enum, but this causes bitfield problems:
   //   - for MSVC the enum type must be explicitly unsigned for correctness
@@ -60,7 +101,6 @@ private:
 
   bool init(llvm::StringRef Word);
   void buildGraph();
-  void calculateRoles(const char *Text, CharRole *Out, int &Types, int N);
   bool allowMatch(int P, int W, Action Last) const;
   int skipPenalty(int W, Action Last) const;
   int matchBonus(int P, int W, Action Last) const;
@@ -70,7 +110,7 @@ private:
   int PatN;                 // Length
   char LowPat[MaxPat];      // Pattern in lowercase
   CharRole PatRole[MaxPat]; // Pattern segmentation info
-  int PatTypeSet;           // Bitmask of 1<<CharType for all Pattern characters
+  CharTypeSet PatTypeSet;   // Bitmask of 1<<CharType for all Pattern characters
   float ScoreScale;         // Normalizes scores for the pattern length.
 
   // Word data is initialized on each call to match(), mostly by init().
@@ -78,7 +118,7 @@ private:
   int WordN;                  // Length
   char LowWord[MaxWord];      // Word in lowercase
   CharRole WordRole[MaxWord]; // Word segmentation info
-  int WordTypeSet;            // Bitmask of 1<<CharType for all Word characters
+  CharTypeSet WordTypeSet;    // Bitmask of 1<<CharType for all Word characters
   bool WordContainsPattern;   // Simple substring check
 
   // Cumulative best-match score table.
diff --git a/unittests/clangd/FuzzyMatchTests.cpp b/unittests/clangd/FuzzyMatchTests.cpp
index 4e3de6b9..c03f8f3d 100644
--- a/unittests/clangd/FuzzyMatchTests.cpp
+++ b/unittests/clangd/FuzzyMatchTests.cpp
@@ -273,6 +273,29 @@ TEST(FuzzyMatch, Scoring) {
   EXPECT_THAT("Abs", matches("[abs]", 2.f));
 }
 
+// Returns pretty-printed segmentation of Text.
+// e.g. std::basic_string --> +--  +---- +-----
+std::string segment(StringRef Text) {
+  std::vector<CharRole> Roles(Text.size());
+  calculateRoles(Text, Roles);
+  std::string Printed;
+  for (unsigned I = 0; I < Text.size(); ++I)
+    Printed.push_back("?-+ "[static_cast<unsigned>(Roles[I])]);
+  return Printed;
+}
+
+// this is a no-op hack so clang-format will vertically align our testcases.
+StringRef returns(StringRef Text) { return Text; }
+
+TEST(FuzzyMatch, Segmentation) {
+  EXPECT_THAT(segment("std::basic_string"), //
+              returns("+--  +---- +-----"));
+  EXPECT_THAT(segment("XMLHttpRequest"), //
+              returns("+--+---+------"));
+  EXPECT_THAT(segment("t3h PeNgU1N oF d00m!!!!!!!!"), //
+              returns("+-- +-+-+-+ ++ +---        "));
+}
+
 } // namespace
 } // namespace clangd
 } // namespace clang
author	Sam McCall <sam.mccall@gmail.com>	2018-07-20 08:01:37 +0000
committer	Sam McCall <sam.mccall@gmail.com>	2018-07-20 08:01:37 +0000
commit	b45d073374e56ff7a4f16e1ed08e902fd64888d6 (patch)
tree	078006a2ad2c8e442c8fa493de3509c8f33d3afc
parent	b720ecd98c931e792e7e5f93befb4c1e47c53e68 (diff)