1 files changed, 249 insertions, 68 deletions
diff --git a/src/corelib/tools/qhash.cpp b/src/corelib/tools/qhash.cpp
index 62fc6c5d86..20202a4896 100644
--- a/src/corelib/tools/qhash.cpp
+++ b/src/corelib/tools/qhash.cpp
@@ -1,6 +1,7 @@
 /****************************************************************************
 **
 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
+** Copyright (C) 2012 Giuseppe D'Angelo <dangelog@gmail.com>.
 ** Contact: http://www.qt-project.org/
 **
 ** This file is part of the QtCore module of the Qt Toolkit.
@@ -39,6 +40,13 @@
 **
 ****************************************************************************/
 
+// for rand_s, _CRT_RAND_S must be #defined before #including stdlib.h.
+// put it at the beginning so some indirect inclusion doesn't break it
+#ifndef _CRT_RAND_S
+#define _CRT_RAND_S
+#endif
+#include <stdlib.h>
+
 #include "qhash.h"
 
 #ifdef truncate
@@ -47,67 +55,82 @@
 
 #include <qbitarray.h>
 #include <qstring.h>
-#include <stdlib.h>
-#ifdef QT_QHASH_DEBUG
-#include <qstring.h>
-#endif
+#include <qglobal.h>
+#include <qbytearray.h>
+#include <qdatetime.h>
+#include <qbasicatomic.h>
 
-QT_BEGIN_NAMESPACE
+#ifndef QT_BOOTSTRAPPED
+#include <qcoreapplication.h>
+#endif // QT_BOOTSTRAPPED
+
+#ifdef Q_OS_UNIX
+#include <stdio.h>
+#include "private/qcore_unix_p.h"
+#endif // Q_OS_UNIX
 
+#include <limits.h>
 
-// ### Qt 5: see tests/benchmarks/corelib/tools/qhash/qhash_string.cpp
-// Hashing of the whole string is a waste of cycles.
+QT_BEGIN_NAMESPACE
 
 /*
-    These functions are based on Peter J. Weinberger's hash function
-    (from the Dragon Book). The constant 24 in the original function
-    was replaced with 23 to produce fewer collisions on input such as
-    "a", "aa", "aaa", "aaaa", ...
+    The Java's hashing algorithm for strings is a variation of D. J. Bernstein
+    hashing algorithm appeared here http://cr.yp.to/cdb/cdb.txt
+    and informally known as DJB33XX - DJB's 33 Times Xor.
+    Java uses DJB31XA, that is, 31 Times Add.
+
+    The original algorithm was a loop around
+        (h << 5) + h ^ c
+    (which is indeed h*33 ^ c); it was then changed to
+        (h << 5) - h ^ c
+    (so h*31^c: DJB31XX), and the XOR changed to a sum:
+        (h << 5) - h + c
+    (DJB31XA), which can save some assembly instructions.
+
+    Still, we can avoid writing the multiplication as "(h << 5) - h"
+    -- the compiler will turn it into a shift and an addition anyway
+    (for instance, gcc 4.4 does that even at -O0).
 */
 
-static uint hash(const uchar *p, int n)
+static inline uint hash(const uchar *p, int len, uint seed)
 {
-    uint h = 0;
+    uint h = seed;
+
+    for (int i = 0; i < len; ++i)
+        h = 31 * h + p[i];
 
-    while (n--) {
-        h = (h << 4) + *p++;
-        h ^= (h & 0xf0000000) >> 23;
-        h &= 0x0fffffff;
-    }
     return h;
 }
 
-static uint hash(const QChar *p, int n)
+static inline uint hash(const QChar *p, int len, uint seed)
 {
-    uint h = 0;
+    uint h = seed;
+
+    for (int i = 0; i < len; ++i)
+        h = 31 * h + p[i].unicode();
 
-    while (n--) {
-        h = (h << 4) + (*p++).unicode();
-        h ^= (h & 0xf0000000) >> 23;
-        h &= 0x0fffffff;
-    }
     return h;
 }
 
-uint qHash(const QByteArray &key)
+uint qHash(const QByteArray &key, uint seed)
 {
-    return hash(reinterpret_cast<const uchar *>(key.constData()), key.size());
+    return hash(reinterpret_cast<const uchar *>(key.constData()), key.size(), seed);
 }
 
-uint qHash(const QString &key)
+uint qHash(const QString &key, uint seed)
 {
-    return hash(key.unicode(), key.size());
+    return hash(key.unicode(), key.size(), seed);
 }
 
-uint qHash(const QStringRef &key)
+uint qHash(const QStringRef &key, uint seed)
 {
-    return hash(key.unicode(), key.size());
+    return hash(key.unicode(), key.size(), seed);
 }
 
-uint qHash(const QBitArray &bitArray)
+uint qHash(const QBitArray &bitArray, uint seed)
 {
     int m = bitArray.d.size() - 1;
-    uint result = hash(reinterpret_cast<const uchar *>(bitArray.d.constData()), qMax(0, m));
+    uint result = hash(reinterpret_cast<const uchar *>(bitArray.d.constData()), qMax(0, m), seed);
 
     // deal with the last 0 to 7 bits manually, because we can't trust that
     // the padding is initialized to 0 in bitArray.d
@@ -117,6 +140,117 @@ uint qHash(const QBitArray &bitArray)
     return result;
 }
 
+uint qHash(const QLatin1String &key, uint seed)
+{
+    return hash(reinterpret_cast<const uchar *>(key.data()), key.size(), seed);
+}
+
+/*!
+    \internal
+
+    Creates the QHash random seed from various sources.
+    In order of decreasing precedence:
+    - under Unix, it attemps to read from /dev/urandom;
+    - under Unix, it attemps to read from /dev/random;
+    - under Windows, it attempts to use rand_s;
+    - as a general fallback, the application's PID, a timestamp and the
+      address of a stack-local variable are used.
+*/
+static uint qt_create_qhash_seed()
+{
+    uint seed = 0;
+
+#ifdef Q_OS_UNIX
+    int randomfd = qt_safe_open("/dev/urandom", O_RDONLY);
+    if (randomfd == -1)
+        randomfd = qt_safe_open("/dev/random", O_RDONLY | O_NONBLOCK);
+    if (randomfd != -1) {
+        if (qt_safe_read(randomfd, reinterpret_cast<char *>(&seed), sizeof(seed)) == sizeof(seed)) {
+            qt_safe_close(randomfd);
+            return seed;
+        }
+        qt_safe_close(randomfd);
+    }
+#endif // Q_OS_UNIX
+
+#if defined(Q_OS_WIN32) && !defined(Q_CC_GNU)
+    errno_t err;
+    err = rand_s(&seed);
+    if (err == 0)
+        return seed;
+#endif // Q_OS_WIN32
+
+    // general fallback: initialize from the current timestamp, pid,
+    // and address of a stack-local variable
+    quint64 timestamp = QDateTime::currentMSecsSinceEpoch();
+    seed ^= timestamp;
+    seed ^= (timestamp >> 32);
+
+#ifndef QT_BOOTSTRAPPED
+    quint64 pid = QCoreApplication::applicationPid();
+    seed ^= pid;
+    seed ^= (pid >> 32);
+#endif // QT_BOOTSTRAPPED
+
+    quintptr seedPtr = reinterpret_cast<quintptr>(&seed);
+    seed ^= seedPtr;
+#if QT_POINTER_SIZE == 8
+    seed ^= (seedPtr >> 32);
+#endif
+
+    return seed;
+}
+
+/*
+    The QHash seed itself.
+*/
+Q_CORE_EXPORT QBasicAtomicInt qt_qhash_seed = Q_BASIC_ATOMIC_INITIALIZER(-1);
+
+/*!
+    \internal
+
+    Seed == -1 means it that it was not initialized yet.
+
+    We let qt_create_qhash_seed return any unsigned integer,
+    but convert it to signed in order to initialize the seed.
+
+    We don't actually care about the fact that different calls to
+    qt_create_qhash_seed() might return different values,
+    as long as in the end everyone uses the very same value.
+*/
+static void qt_initialize_qhash_seed()
+{
+    if (qt_qhash_seed.load() == -1) {
+        int x(qt_create_qhash_seed() & INT_MAX);
+        qt_qhash_seed.testAndSetRelaxed(-1, x);
+    }
+}
+
+/*!
+    \internal
+
+    Private copy of the implementation of the Qt 4 qHash algorithm for strings,
+    to be used wherever the result is somehow stored or reused across multiple
+    Qt versions. The public qHash implementation can change at any time,
+    therefore one must not rely on the fact that it will always give the same
+    results.
+
+    This function must *never* change its results.
+*/
+uint qt_hash(const QString &key)
+{
+    const QChar *p = key.unicode();
+    int n = key.size();
+    uint h = 0;
+
+    while (n--) {
+        h = (h << 4) + (*p++).unicode();
+        h ^= (h & 0xf0000000) >> 23;
+        h &= 0x0fffffff;
+    }
+    return h;
+}
+
 /*
     The prime_deltas array is a table of selected prime values, even
     though it doesn't look like one. The primes we are using are 1,
@@ -166,7 +300,7 @@ static int countBits(int hint)
 const int MinNumBits = 4;
 
 const QHashData QHashData::shared_null = {
-    0, 0, Q_REFCOUNT_INITIALIZER(-1), 0, 0, MinNumBits, 0, 0, true, false, 0
+    0, 0, Q_REFCOUNT_INITIALIZE_STATIC, 0, 0, MinNumBits, 0, 0, 0, true, false, 0
 };
 
 void *QHashData::allocateNode(int nodeAlign)
@@ -193,15 +327,18 @@ QHashData *QHashData::detach_helper(void (*node_duplicate)(Node *, void *),
         QHashData *d;
         Node *e;
     };
+    if (this == &shared_null)
+        qt_initialize_qhash_seed();
     d = new QHashData;
     d->fakeNext = 0;
     d->buckets = 0;
-    d->ref = 1;
+    d->ref.initializeOwned();
     d->size = size;
     d->nodeSize = nodeSize;
     d->userNumBits = userNumBits;
     d->numBits = numBits;
     d->numBuckets = numBuckets;
+    d->seed = uint(qt_qhash_seed.load());
     d->sharable = true;
     d->strictAlignment = nodeAlign > 8;
     d->reserved = 0;
@@ -511,17 +648,17 @@ void QHashData::checkSanity()
     Returns the hash value for the \a key.
 */
 
-/*! \fn uint qHash(const QByteArray &key)
-    \fn uint qHash(const QBitArray &key)
-    \relates QHash
+/*! \fn uint qHash(const QByteArray &key, uint seed = 0)
+    \fn uint qHash(const QBitArray &key, uint seed = 0)
+    \fn uint qHash(const QString &key, uint seed = 0)
+    \fn uint qHash(const QStringRef &key, uint seed = 0)
+    \fn uint qHash(const QLatin1String &key, uint seed = 0)
 
-    Returns the hash value for the \a key.
-*/
-
-/*! \fn uint qHash(const QString &key)
     \relates QHash
+    \since 5.0
 
-    Returns the hash value for the \a key.
+    Returns the hash value for the \a key, using \a seed to
+    seed the calculation.
 */
 
 /*! \fn uint qHash(const T *key)
@@ -553,8 +690,7 @@ void QHashData::checkSanity()
        key. With QHash, the items are arbitrarily ordered.
     \li The key type of a QMap must provide operator<(). The key
        type of a QHash must provide operator==() and a global
-       hash function called qHash() (see the related non-member
-       functions).
+       hash function called qHash() (see \l{qHash}).
     \endlist
 
     Here's an example QHash with QString keys and \c int values:
@@ -599,6 +735,15 @@ void QHashData::checkSanity()
     To avoid this problem, replace \c hash[i] with \c hash.value(i)
     in the code above.
 
+    Internally, QHash uses a hash table to perform lookups. Unlike Qt
+    3's \c QDict class, which needed to be initialized with a prime
+    number, QHash's hash table automatically grows and shrinks to
+    provide fast lookups without wasting too much memory. You can
+    still control the size of the hash table by calling reserve() if
+    you already know approximately how many items the QHash will
+    contain, but this isn't necessary to obtain good performance. You
+    can also call capacity() to retrieve the hash table's size.
+
     If you want to navigate through all the (key, value) pairs stored
     in a QHash, you can use an iterator. QHash provides both
     \l{Java-style iterators} (QHashIterator and QMutableHashIterator)
@@ -648,21 +793,15 @@ void QHashData::checkSanity()
 
     QHash's key and value data types must be \l{assignable data
     types}. You cannot, for example, store a QWidget as a value;
-    instead, store a QWidget *. In addition, QHash's key type must
-    provide operator==(), and there must also be a global qHash()
-    function that returns a hash value for an argument of the key's
-    type.
-
-    Here's a list of the C++ and Qt types that can serve as keys in a
-    QHash: any integer type (char, unsigned long, etc.), any pointer
-    type, QChar, QString, and QByteArray. For all of these, the \c
-    <QHash> header defines a qHash() function that computes an
-    adequate hash value. If you want to use other types as the key,
-    make sure that you provide operator==() and a qHash()
-    implementation.
+    instead, store a QWidget *.
 
-    Example:
-    \snippet doc/src/snippets/code/src_corelib_tools_qhash.cpp 13
+    \target qHash
+    \section2 The qHash() hashing function
+
+    A QHash's key type has additional requirements other than being an
+    assignable data type: it must provide operator==(), and there must also be
+    a global qHash() function that returns a hash value for an argument of the
+    key's type.
 
     The qHash() function computes a numeric value based on a key. It
     can use any algorithm imaginable, as long as it always returns
@@ -672,19 +811,61 @@ void QHashData::checkSanity()
     attempt to return different hash values for different keys to the
     largest extent possible.
 
+    For a key type \c{K}, the qHash function must have one of these signatures:
+
+    \code
+    uint qHash(K key);
+    uint qHash(const K &key);
+
+    uint qHash(K key, uint seed);
+    uint qHash(const K &key, uint seed);
+    \endcode
+
+    The two-arguments overloads take an unsigned integer that should be used to
+    seed the calculation of the hash function. This seed is provided by QHash
+    in order to prevent a family of \l{algorithmic complexity attacks}. If both
+    a one-argument and a two-arguments overload are defined for a key type,
+    the latter is used by QHash (note that you can simply define a
+    two-arguments version, and use a default value for the seed parameter).
+
+    Here's a partial list of the C++ and Qt types that can serve as keys in a
+    QHash: any integer type (char, unsigned long, etc.), any pointer type,
+    QChar, QString, and QByteArray. For all of these, the \c <QHash> header
+    defines a qHash() function that computes an adequate hash value. Many other
+    Qt classes also declare a qHash overload for their type; please refer to
+    the documentation of each class.
+
+    If you want to use other types as the key, make sure that you provide
+    operator==() and a qHash() implementation.
+
+    Example:
+    \snippet doc/src/snippets/code/src_corelib_tools_qhash.cpp 13
+
     In the example above, we've relied on Qt's global qHash(const
-    QString &) to give us a hash value for the employee's name, and
+    QString &, uint) to give us a hash value for the employee's name, and
     XOR'ed this with the day they were born to help produce unique
     hashes for people with the same name.
 
-    Internally, QHash uses a hash table to perform lookups. Unlike Qt
-    3's \c QDict class, which needed to be initialized with a prime
-    number, QHash's hash table automatically grows and shrinks to
-    provide fast lookups without wasting too much memory. You can
-    still control the size of the hash table by calling reserve() if
-    you already know approximately how many items the QHash will
-    contain, but this isn't necessary to obtain good performance. You
-    can also call capacity() to retrieve the hash table's size.
+    Note that the implementation of the qHash() overloads offered by Qt
+    may change at any time. You \b{must not} rely on the fact that qHash()
+    will give the same results (for the same inputs) across different Qt
+    versions.
+
+    \section2 Algorithmic complexity attacks
+
+    All hash tables are vulnerable to a particular class of denial of service
+    attacks, in which the attacker carefully pre-computes a set of different
+    keys that are going to be hashed in the same bucket of a hash table (or
+    even have the very same hash value). The attack aims at getting the
+    worst-case algorithmic behavior (O(n) instead of amortized O(1), see
+    \l{Algorithmic Complexity} for the details) when the data is fed into the
+    table.
+
+    In order to avoid this worst-case behavior, the calculation of the hash
+    value done by qHash() can be salted by a random seed, that nullifies the
+    attack's extent. This seed is automatically generated by QHash once per
+    process, and then passed by QHash as the second argument of the
+    two-arguments overload of the qHash() function.
 
     \sa QHashIterator, QMutableHashIterator, QMap, QSet
 */