summaryrefslogtreecommitdiffstats
path: root/chromium/base/i18n/streaming_utf8_validator_perftest.cc
blob: ac2eb0820bac01a3a172fb89236d92a9763562e9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// All data that is passed through a WebSocket with type "Text" needs to be
// validated as UTF8. Since this is done on the IO thread, it needs to be
// reasonably fast.

// We are only interested in the performance on valid UTF8. Invalid UTF8 will
// result in a connection failure, so is unlikely to become a source of
// performance issues.

#include "base/i18n/streaming_utf8_validator.h"

#include <string>

#include "base/basictypes.h"
#include "base/bind.h"
#include "base/callback.h"
#include "base/strings/string_util.h"
#include "base/strings/stringprintf.h"
#include "base/test/perf_time_logger.h"
#include "testing/gtest/include/gtest/gtest.h"

namespace base {
namespace {

// We want to test ranges of valid UTF-8 sequences. These ranges are inclusive.
// They are intended to be large enough that the validator needs to do
// meaningful work while being in some sense "realistic" (eg. control characters
// are not included).
const char kOneByteSeqRangeStart[] = " ";  // U+0020
const char kOneByteSeqRangeEnd[] = "~";    // U+007E

const char kTwoByteSeqRangeStart[] = "\xc2\xa0";  // U+00A0 non-breaking space
const char kTwoByteSeqRangeEnd[] = "\xc9\x8f";    // U+024F small y with stroke

const char kThreeByteSeqRangeStart[] = "\xe3\x81\x82";  // U+3042 Hiragana "a"
const char kThreeByteSeqRangeEnd[] = "\xe9\xbf\x83";    // U+9FC3 "to blink"

const char kFourByteSeqRangeStart[] = "\xf0\xa0\x80\x8b";  // U+2000B
const char kFourByteSeqRangeEnd[] = "\xf0\xaa\x9a\xb2";    // U+2A6B2

// The different lengths of strings to test.
const size_t kTestLengths[] = {1, 32, 256, 32768, 1 << 20};

// Simplest possible byte-at-a-time validator, to provide a baseline
// for comparison. This is only tried on 1-byte UTF-8 sequences, as
// the results will not be meaningful with sequences containing
// top-bit-set bytes.
bool IsString7Bit(const std::string& s) {
  for (std::string::const_iterator it = s.begin(); it != s.end(); ++it) {
    if (*it & 0x80)
      return false;
  }
  return true;
}

// Assumes that |previous| is a valid UTF-8 sequence, and attempts to return
// the next one. Is just barely smart enough to iterate through the ranges
// defined about.
std::string NextUtf8Sequence(const std::string& previous) {
  DCHECK(StreamingUtf8Validator::Validate(previous));
  std::string next = previous;
  for (int i = static_cast<int>(previous.length() - 1); i >= 0; --i) {
    // All bytes in a UTF-8 sequence except the first one are
    // constrained to the range 0x80 to 0xbf, inclusive. When we
    // increment past 0xbf, we carry into the previous byte.
    if (i > 0 && next[i] == '\xbf') {
      next[i] = '\x80';
      continue;  // carry
    }
    ++next[i];
    break;  // no carry
  }
  DCHECK(StreamingUtf8Validator::Validate(next))
      << "Result \"" << next << "\" failed validation";
  return next;
}

typedef bool (*TestTargetType)(const std::string&);

// Run fuction |target| over |test_string| |times| times, and report the results
// using |description|.
bool RunTest(const std::string& description,
             TestTargetType target,
             const std::string& test_string,
             int times) {
  base::PerfTimeLogger timer(description.c_str());
  bool result = true;
  for (int i = 0; i < times; ++i) {
    result = target(test_string) && result;
  }
  timer.Done();
  return result;
}

// Construct a string by repeating |input| enough times to equal or exceed
// |length|.
std::string ConstructRepeatedTestString(const std::string& input,
                                        size_t length) {
  std::string output = input;
  while (output.length() * 2 < length) {
    output += output;
  }
  if (output.length() < length) {
    output += ConstructRepeatedTestString(input, length - output.length());
  }
  return output;
}

// Construct a string by expanding the range of UTF-8 sequences
// between |input_start| and |input_end|, inclusive, and then
// repeating the resulting string until it equals or exceeds |length|
// bytes. |input_start| and |input_end| must be valid UTF-8
// sequences.
std::string ConstructRangedTestString(const std::string& input_start,
                                      const std::string& input_end,
                                      size_t length) {
  std::string output = input_start;
  std::string input = input_start;
  while (output.length() < length && input != input_end) {
    input = NextUtf8Sequence(input);
    output += input;
  }
  if (output.length() < length) {
    output = ConstructRepeatedTestString(output, length);
  }
  return output;
}

struct TestFunctionDescription {
  TestTargetType function;
  const char* function_name;
};

// IsString7Bit is intentionally placed last so it can be excluded easily.
const TestFunctionDescription kTestFunctions[] = {
    {&StreamingUtf8Validator::Validate, "StreamingUtf8Validator"},
    {&IsStringUTF8, "IsStringUTF8"}, {&IsString7Bit, "IsString7Bit"}};

// Construct a test string from |construct_test_string| for each of the lengths
// in |kTestLengths| in turn. For each string, run each test in |test_functions|
// for a number of iterations such that the total number of bytes validated
// is around 16MB.
void RunSomeTests(
    const char format[],
    base::Callback<std::string(size_t length)> construct_test_string,
    const TestFunctionDescription* test_functions,
    size_t test_count) {
  for (size_t i = 0; i < arraysize(kTestLengths); ++i) {
    const size_t length = kTestLengths[i];
    const std::string test_string = construct_test_string.Run(length);
    const int real_length = static_cast<int>(test_string.length());
    const int times = (1 << 24) / real_length;
    for (size_t test_index = 0; test_index < test_count; ++test_index) {
      EXPECT_TRUE(RunTest(StringPrintf(format,
                                       test_functions[test_index].function_name,
                                       real_length,
                                       times),
                          test_functions[test_index].function,
                          test_string,
                          times));
    }
  }
}

TEST(StreamingUtf8ValidatorPerfTest, OneByteRepeated) {
  RunSomeTests("%s: bytes=1 repeated length=%d repeat=%d",
               base::Bind(ConstructRepeatedTestString, kOneByteSeqRangeStart),
               kTestFunctions,
               3);
}

TEST(StreamingUtf8ValidatorPerfTest, OneByteRange) {
  RunSomeTests("%s: bytes=1 ranged length=%d repeat=%d",
               base::Bind(ConstructRangedTestString,
                          kOneByteSeqRangeStart,
                          kOneByteSeqRangeEnd),
               kTestFunctions,
               3);
}

TEST(StreamingUtf8ValidatorPerfTest, TwoByteRepeated) {
  RunSomeTests("%s: bytes=2 repeated length=%d repeat=%d",
               base::Bind(ConstructRepeatedTestString, kTwoByteSeqRangeStart),
               kTestFunctions,
               2);
}

TEST(StreamingUtf8ValidatorPerfTest, TwoByteRange) {
  RunSomeTests("%s: bytes=2 ranged length=%d repeat=%d",
               base::Bind(ConstructRangedTestString,
                          kTwoByteSeqRangeStart,
                          kTwoByteSeqRangeEnd),
               kTestFunctions,
               2);
}

TEST(StreamingUtf8ValidatorPerfTest, ThreeByteRepeated) {
  RunSomeTests(
      "%s: bytes=3 repeated length=%d repeat=%d",
      base::Bind(ConstructRepeatedTestString, kThreeByteSeqRangeStart),
      kTestFunctions,
      2);
}

TEST(StreamingUtf8ValidatorPerfTest, ThreeByteRange) {
  RunSomeTests("%s: bytes=3 ranged length=%d repeat=%d",
               base::Bind(ConstructRangedTestString,
                          kThreeByteSeqRangeStart,
                          kThreeByteSeqRangeEnd),
               kTestFunctions,
               2);
}

TEST(StreamingUtf8ValidatorPerfTest, FourByteRepeated) {
  RunSomeTests("%s: bytes=4 repeated length=%d repeat=%d",
               base::Bind(ConstructRepeatedTestString, kFourByteSeqRangeStart),
               kTestFunctions,
               2);
}

TEST(StreamingUtf8ValidatorPerfTest, FourByteRange) {
  RunSomeTests("%s: bytes=4 ranged length=%d repeat=%d",
               base::Bind(ConstructRangedTestString,
                          kFourByteSeqRangeStart,
                          kFourByteSeqRangeEnd),
               kTestFunctions,
               2);
}

}  // namespace
}  // namespace base