diff options
Diffstat (limited to 'src/corelib/tools/qhash.cpp')
-rw-r--r-- | src/corelib/tools/qhash.cpp | 317 |
1 files changed, 249 insertions, 68 deletions
diff --git a/src/corelib/tools/qhash.cpp b/src/corelib/tools/qhash.cpp index 62fc6c5d86..20202a4896 100644 --- a/src/corelib/tools/qhash.cpp +++ b/src/corelib/tools/qhash.cpp @@ -1,6 +1,7 @@ /**************************************************************************** ** ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies). +** Copyright (C) 2012 Giuseppe D'Angelo <dangelog@gmail.com>. ** Contact: http://www.qt-project.org/ ** ** This file is part of the QtCore module of the Qt Toolkit. @@ -39,6 +40,13 @@ ** ****************************************************************************/ +// for rand_s, _CRT_RAND_S must be #defined before #including stdlib.h. +// put it at the beginning so some indirect inclusion doesn't break it +#ifndef _CRT_RAND_S +#define _CRT_RAND_S +#endif +#include <stdlib.h> + #include "qhash.h" #ifdef truncate @@ -47,67 +55,82 @@ #include <qbitarray.h> #include <qstring.h> -#include <stdlib.h> -#ifdef QT_QHASH_DEBUG -#include <qstring.h> -#endif +#include <qglobal.h> +#include <qbytearray.h> +#include <qdatetime.h> +#include <qbasicatomic.h> -QT_BEGIN_NAMESPACE +#ifndef QT_BOOTSTRAPPED +#include <qcoreapplication.h> +#endif // QT_BOOTSTRAPPED + +#ifdef Q_OS_UNIX +#include <stdio.h> +#include "private/qcore_unix_p.h" +#endif // Q_OS_UNIX +#include <limits.h> -// ### Qt 5: see tests/benchmarks/corelib/tools/qhash/qhash_string.cpp -// Hashing of the whole string is a waste of cycles. +QT_BEGIN_NAMESPACE /* - These functions are based on Peter J. Weinberger's hash function - (from the Dragon Book). The constant 24 in the original function - was replaced with 23 to produce fewer collisions on input such as - "a", "aa", "aaa", "aaaa", ... + The Java's hashing algorithm for strings is a variation of D. J. Bernstein + hashing algorithm appeared here http://cr.yp.to/cdb/cdb.txt + and informally known as DJB33XX - DJB's 33 Times Xor. + Java uses DJB31XA, that is, 31 Times Add. + + The original algorithm was a loop around + (h << 5) + h ^ c + (which is indeed h*33 ^ c); it was then changed to + (h << 5) - h ^ c + (so h*31^c: DJB31XX), and the XOR changed to a sum: + (h << 5) - h + c + (DJB31XA), which can save some assembly instructions. + + Still, we can avoid writing the multiplication as "(h << 5) - h" + -- the compiler will turn it into a shift and an addition anyway + (for instance, gcc 4.4 does that even at -O0). */ -static uint hash(const uchar *p, int n) +static inline uint hash(const uchar *p, int len, uint seed) { - uint h = 0; + uint h = seed; + + for (int i = 0; i < len; ++i) + h = 31 * h + p[i]; - while (n--) { - h = (h << 4) + *p++; - h ^= (h & 0xf0000000) >> 23; - h &= 0x0fffffff; - } return h; } -static uint hash(const QChar *p, int n) +static inline uint hash(const QChar *p, int len, uint seed) { - uint h = 0; + uint h = seed; + + for (int i = 0; i < len; ++i) + h = 31 * h + p[i].unicode(); - while (n--) { - h = (h << 4) + (*p++).unicode(); - h ^= (h & 0xf0000000) >> 23; - h &= 0x0fffffff; - } return h; } -uint qHash(const QByteArray &key) +uint qHash(const QByteArray &key, uint seed) { - return hash(reinterpret_cast<const uchar *>(key.constData()), key.size()); + return hash(reinterpret_cast<const uchar *>(key.constData()), key.size(), seed); } -uint qHash(const QString &key) +uint qHash(const QString &key, uint seed) { - return hash(key.unicode(), key.size()); + return hash(key.unicode(), key.size(), seed); } -uint qHash(const QStringRef &key) +uint qHash(const QStringRef &key, uint seed) { - return hash(key.unicode(), key.size()); + return hash(key.unicode(), key.size(), seed); } -uint qHash(const QBitArray &bitArray) +uint qHash(const QBitArray &bitArray, uint seed) { int m = bitArray.d.size() - 1; - uint result = hash(reinterpret_cast<const uchar *>(bitArray.d.constData()), qMax(0, m)); + uint result = hash(reinterpret_cast<const uchar *>(bitArray.d.constData()), qMax(0, m), seed); // deal with the last 0 to 7 bits manually, because we can't trust that // the padding is initialized to 0 in bitArray.d @@ -117,6 +140,117 @@ uint qHash(const QBitArray &bitArray) return result; } +uint qHash(const QLatin1String &key, uint seed) +{ + return hash(reinterpret_cast<const uchar *>(key.data()), key.size(), seed); +} + +/*! + \internal + + Creates the QHash random seed from various sources. + In order of decreasing precedence: + - under Unix, it attemps to read from /dev/urandom; + - under Unix, it attemps to read from /dev/random; + - under Windows, it attempts to use rand_s; + - as a general fallback, the application's PID, a timestamp and the + address of a stack-local variable are used. +*/ +static uint qt_create_qhash_seed() +{ + uint seed = 0; + +#ifdef Q_OS_UNIX + int randomfd = qt_safe_open("/dev/urandom", O_RDONLY); + if (randomfd == -1) + randomfd = qt_safe_open("/dev/random", O_RDONLY | O_NONBLOCK); + if (randomfd != -1) { + if (qt_safe_read(randomfd, reinterpret_cast<char *>(&seed), sizeof(seed)) == sizeof(seed)) { + qt_safe_close(randomfd); + return seed; + } + qt_safe_close(randomfd); + } +#endif // Q_OS_UNIX + +#if defined(Q_OS_WIN32) && !defined(Q_CC_GNU) + errno_t err; + err = rand_s(&seed); + if (err == 0) + return seed; +#endif // Q_OS_WIN32 + + // general fallback: initialize from the current timestamp, pid, + // and address of a stack-local variable + quint64 timestamp = QDateTime::currentMSecsSinceEpoch(); + seed ^= timestamp; + seed ^= (timestamp >> 32); + +#ifndef QT_BOOTSTRAPPED + quint64 pid = QCoreApplication::applicationPid(); + seed ^= pid; + seed ^= (pid >> 32); +#endif // QT_BOOTSTRAPPED + + quintptr seedPtr = reinterpret_cast<quintptr>(&seed); + seed ^= seedPtr; +#if QT_POINTER_SIZE == 8 + seed ^= (seedPtr >> 32); +#endif + + return seed; +} + +/* + The QHash seed itself. +*/ +Q_CORE_EXPORT QBasicAtomicInt qt_qhash_seed = Q_BASIC_ATOMIC_INITIALIZER(-1); + +/*! + \internal + + Seed == -1 means it that it was not initialized yet. + + We let qt_create_qhash_seed return any unsigned integer, + but convert it to signed in order to initialize the seed. + + We don't actually care about the fact that different calls to + qt_create_qhash_seed() might return different values, + as long as in the end everyone uses the very same value. +*/ +static void qt_initialize_qhash_seed() +{ + if (qt_qhash_seed.load() == -1) { + int x(qt_create_qhash_seed() & INT_MAX); + qt_qhash_seed.testAndSetRelaxed(-1, x); + } +} + +/*! + \internal + + Private copy of the implementation of the Qt 4 qHash algorithm for strings, + to be used wherever the result is somehow stored or reused across multiple + Qt versions. The public qHash implementation can change at any time, + therefore one must not rely on the fact that it will always give the same + results. + + This function must *never* change its results. +*/ +uint qt_hash(const QString &key) +{ + const QChar *p = key.unicode(); + int n = key.size(); + uint h = 0; + + while (n--) { + h = (h << 4) + (*p++).unicode(); + h ^= (h & 0xf0000000) >> 23; + h &= 0x0fffffff; + } + return h; +} + /* The prime_deltas array is a table of selected prime values, even though it doesn't look like one. The primes we are using are 1, @@ -166,7 +300,7 @@ static int countBits(int hint) const int MinNumBits = 4; const QHashData QHashData::shared_null = { - 0, 0, Q_REFCOUNT_INITIALIZER(-1), 0, 0, MinNumBits, 0, 0, true, false, 0 + 0, 0, Q_REFCOUNT_INITIALIZE_STATIC, 0, 0, MinNumBits, 0, 0, 0, true, false, 0 }; void *QHashData::allocateNode(int nodeAlign) @@ -193,15 +327,18 @@ QHashData *QHashData::detach_helper(void (*node_duplicate)(Node *, void *), QHashData *d; Node *e; }; + if (this == &shared_null) + qt_initialize_qhash_seed(); d = new QHashData; d->fakeNext = 0; d->buckets = 0; - d->ref = 1; + d->ref.initializeOwned(); d->size = size; d->nodeSize = nodeSize; d->userNumBits = userNumBits; d->numBits = numBits; d->numBuckets = numBuckets; + d->seed = uint(qt_qhash_seed.load()); d->sharable = true; d->strictAlignment = nodeAlign > 8; d->reserved = 0; @@ -511,17 +648,17 @@ void QHashData::checkSanity() Returns the hash value for the \a key. */ -/*! \fn uint qHash(const QByteArray &key) - \fn uint qHash(const QBitArray &key) - \relates QHash +/*! \fn uint qHash(const QByteArray &key, uint seed = 0) + \fn uint qHash(const QBitArray &key, uint seed = 0) + \fn uint qHash(const QString &key, uint seed = 0) + \fn uint qHash(const QStringRef &key, uint seed = 0) + \fn uint qHash(const QLatin1String &key, uint seed = 0) - Returns the hash value for the \a key. -*/ - -/*! \fn uint qHash(const QString &key) \relates QHash + \since 5.0 - Returns the hash value for the \a key. + Returns the hash value for the \a key, using \a seed to + seed the calculation. */ /*! \fn uint qHash(const T *key) @@ -553,8 +690,7 @@ void QHashData::checkSanity() key. With QHash, the items are arbitrarily ordered. \li The key type of a QMap must provide operator<(). The key type of a QHash must provide operator==() and a global - hash function called qHash() (see the related non-member - functions). + hash function called qHash() (see \l{qHash}). \endlist Here's an example QHash with QString keys and \c int values: @@ -599,6 +735,15 @@ void QHashData::checkSanity() To avoid this problem, replace \c hash[i] with \c hash.value(i) in the code above. + Internally, QHash uses a hash table to perform lookups. Unlike Qt + 3's \c QDict class, which needed to be initialized with a prime + number, QHash's hash table automatically grows and shrinks to + provide fast lookups without wasting too much memory. You can + still control the size of the hash table by calling reserve() if + you already know approximately how many items the QHash will + contain, but this isn't necessary to obtain good performance. You + can also call capacity() to retrieve the hash table's size. + If you want to navigate through all the (key, value) pairs stored in a QHash, you can use an iterator. QHash provides both \l{Java-style iterators} (QHashIterator and QMutableHashIterator) @@ -648,21 +793,15 @@ void QHashData::checkSanity() QHash's key and value data types must be \l{assignable data types}. You cannot, for example, store a QWidget as a value; - instead, store a QWidget *. In addition, QHash's key type must - provide operator==(), and there must also be a global qHash() - function that returns a hash value for an argument of the key's - type. - - Here's a list of the C++ and Qt types that can serve as keys in a - QHash: any integer type (char, unsigned long, etc.), any pointer - type, QChar, QString, and QByteArray. For all of these, the \c - <QHash> header defines a qHash() function that computes an - adequate hash value. If you want to use other types as the key, - make sure that you provide operator==() and a qHash() - implementation. + instead, store a QWidget *. - Example: - \snippet doc/src/snippets/code/src_corelib_tools_qhash.cpp 13 + \target qHash + \section2 The qHash() hashing function + + A QHash's key type has additional requirements other than being an + assignable data type: it must provide operator==(), and there must also be + a global qHash() function that returns a hash value for an argument of the + key's type. The qHash() function computes a numeric value based on a key. It can use any algorithm imaginable, as long as it always returns @@ -672,19 +811,61 @@ void QHashData::checkSanity() attempt to return different hash values for different keys to the largest extent possible. + For a key type \c{K}, the qHash function must have one of these signatures: + + \code + uint qHash(K key); + uint qHash(const K &key); + + uint qHash(K key, uint seed); + uint qHash(const K &key, uint seed); + \endcode + + The two-arguments overloads take an unsigned integer that should be used to + seed the calculation of the hash function. This seed is provided by QHash + in order to prevent a family of \l{algorithmic complexity attacks}. If both + a one-argument and a two-arguments overload are defined for a key type, + the latter is used by QHash (note that you can simply define a + two-arguments version, and use a default value for the seed parameter). + + Here's a partial list of the C++ and Qt types that can serve as keys in a + QHash: any integer type (char, unsigned long, etc.), any pointer type, + QChar, QString, and QByteArray. For all of these, the \c <QHash> header + defines a qHash() function that computes an adequate hash value. Many other + Qt classes also declare a qHash overload for their type; please refer to + the documentation of each class. + + If you want to use other types as the key, make sure that you provide + operator==() and a qHash() implementation. + + Example: + \snippet doc/src/snippets/code/src_corelib_tools_qhash.cpp 13 + In the example above, we've relied on Qt's global qHash(const - QString &) to give us a hash value for the employee's name, and + QString &, uint) to give us a hash value for the employee's name, and XOR'ed this with the day they were born to help produce unique hashes for people with the same name. - Internally, QHash uses a hash table to perform lookups. Unlike Qt - 3's \c QDict class, which needed to be initialized with a prime - number, QHash's hash table automatically grows and shrinks to - provide fast lookups without wasting too much memory. You can - still control the size of the hash table by calling reserve() if - you already know approximately how many items the QHash will - contain, but this isn't necessary to obtain good performance. You - can also call capacity() to retrieve the hash table's size. + Note that the implementation of the qHash() overloads offered by Qt + may change at any time. You \b{must not} rely on the fact that qHash() + will give the same results (for the same inputs) across different Qt + versions. + + \section2 Algorithmic complexity attacks + + All hash tables are vulnerable to a particular class of denial of service + attacks, in which the attacker carefully pre-computes a set of different + keys that are going to be hashed in the same bucket of a hash table (or + even have the very same hash value). The attack aims at getting the + worst-case algorithmic behavior (O(n) instead of amortized O(1), see + \l{Algorithmic Complexity} for the details) when the data is fed into the + table. + + In order to avoid this worst-case behavior, the calculation of the hash + value done by qHash() can be salted by a random seed, that nullifies the + attack's extent. This seed is automatically generated by QHash once per + process, and then passed by QHash as the second argument of the + two-arguments overload of the qHash() function. \sa QHashIterator, QMutableHashIterator, QMap, QSet */ |