From 0e7100d8c08b74cbe879ac1ce604b98915e39689 Mon Sep 17 00:00:00 2001 From: Marc Mutz Date: Tue, 29 Jul 2014 01:07:48 +0200 Subject: QTimeZone: optimize QTimeZonePrivate::isValidId() This function is used in the named timezone ctor and was using QByteArray::split(), followed by size checks and a linear scan for invalid chars per section. The use of split() resulted in a lot of memory allocations and, unsurprisingly, bad performance. The new code just performs one linear scan through the byte array, calculating section sizes on the fly. Benchmark results (with the test data in tst_QTimeZone::isValidId_data()) show typical speedups of ~10x for valid IDs: RESULT : tst_QTimeZone::isValidId_bench():"minimal middle": - 0.00036 msecs per iteration (total: 95, iterations: 262144) + 0.000035 msecs per iteration (total: 74, iterations: 2097152) Even in the sweet-spot case of the old code---a space character anywhere in the string, checked for before the split---the new code is anywhere between slightly faster and not much slower: RESULT : tst_QTimeZone::isValidId_bench():"invalid char ' ' front": - 0.000011 msecs per iteration (total: 94, iterations: 8388608) + 0.000010 msecs per iteration (total: 86, iterations: 8388608) RESULT : tst_QTimeZone::isValidId_bench():"invalid char ' ' middle": - 0.000014 msecs per iteration (total: 62, iterations: 4194304) + 0.000016 msecs per iteration (total: 69, iterations: 4194304) RESULT : tst_QTimeZone::isValidId_bench():"invalid char ' ' back": - 0.000018 msecs per iteration (total: 79, iterations: 4194304) + 0.000023 msecs per iteration (total: 98, iterations: 4194304) This is not surprising, as the space character was singled out for a fast-exit check before. For any other invalid character, the new version is anywhere from 15x to 35x faster: RESULT : tst_QTimeZone::isValidId_bench():"invalid char ? front": - 0.00034 msecs per iteration (total: 91, iterations: 262144) + 0.000010 msecs per iteration (total: 87, iterations: 8388608) RESULT : tst_QTimeZone::isValidId_bench():"invalid char ? middle": - 0.00036 msecs per iteration (total: 96, iterations: 262144) + 0.000016 msecs per iteration (total: 68, iterations: 4194304) RESULT : tst_QTimeZone::isValidId_bench():"invalid char ? back": - 0.00035 msecs per iteration (total: 94, iterations: 262144) + 0.000021 msecs per iteration (total: 92, iterations: 4194304) If there was a deeper reason to single out the space character, that fast-exit path can easily be restored. This function is often used in conjunction with availableTimeZoneIds(), which currently vastly dominates the runtime of the function calling both, but I'll add another optimization for the common use-case of just checking for a time-zone's existence in a subsequent commit. Change-Id: Ife1d096fcd39464083ea464c23e49ad98fabf345 Reviewed-by: Thiago Macieira --- src/corelib/tools/qtimezoneprivate.cpp | 47 +++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 18 deletions(-) (limited to 'src') diff --git a/src/corelib/tools/qtimezoneprivate.cpp b/src/corelib/tools/qtimezoneprivate.cpp index 4658b91dd3..0575b5c999 100644 --- a/src/corelib/tools/qtimezoneprivate.cpp +++ b/src/corelib/tools/qtimezoneprivate.cpp @@ -446,32 +446,43 @@ QTimeZone::OffsetData QTimeZonePrivate::toOffsetData(const QTimeZonePrivate::Dat bool QTimeZonePrivate::isValidId(const QByteArray &ianaId) { // Rules for defining TZ/IANA names as per ftp://ftp.iana.org/tz/code/Theory - // * Use only valid POSIX file name components - // * Within a file name component, use only ASCII letters, `.', `-' and `_'. - // * Do not use digits - // * A file name component must not exceed 14 characters or start with `-' - // Aliases such as "Etc/GMT+7" and "SystemV/EST5EDT" are valid so we need to accept digits - if (ianaId.contains(' ')) - return false; - QList parts = ianaId.split('/'); - foreach (const QByteArray &part, parts) { - if (part.size() > 14 || part.size() < 1) - return false; - if (part.at(0) == '-') - return false; - for (int i = 0; i < part.size(); ++i) { - QChar ch = part.at(i); - if (!(ch >= 'a' && ch <= 'z') + // 1. Use only valid POSIX file name components + // 2. Within a file name component, use only ASCII letters, `.', `-' and `_'. + // 3. Do not use digits + // 4. A file name component must not exceed 14 characters or start with `-' + // Aliases such as "Etc/GMT+7" and "SystemV/EST5EDT" are valid so we need to accept digits, ':', and '+'. + + // The following would be preferable if QRegExp would work on QByteArrays directly: + // const QRegExp rx(QStringLiteral("[a-z0-9:+._][a-z0-9:+._-]{,13}(?:/[a-z0-9:+._][a-z0-9:+._-]{,13})*"), + // Qt::CaseInsensitive); + // return rx.exactMatch(ianaId); + + // hand-rolled version: + const int MinSectionLength = 1; + const int MaxSectionLength = 14; + int sectionLength = 0; + for (const char *it = ianaId.begin(), * const end = ianaId.end(); it != end; ++it, ++sectionLength) { + const char ch = *it; + if (ch == '/') { + if (sectionLength < MinSectionLength || sectionLength > MaxSectionLength) + return false; // violates (4) + sectionLength = -1; + } else if (ch == '-') { + if (sectionLength == 0) + return false; // violates (4) + } else if (!(ch >= 'a' && ch <= 'z') && !(ch >= 'A' && ch <= 'Z') && !(ch == '_') && !(ch >= '0' && ch <= '9') && !(ch == '-') && !(ch == '+') && !(ch == ':') - && !(ch == '.')) - return false; + && !(ch == '.')) { + return false; // violates (2) } } + if (sectionLength < MinSectionLength || sectionLength > MaxSectionLength) + return false; // violates (4) return true; } -- cgit v1.2.3