From 8cbe52d5811a985e610aa76c3a17a75cd785fb19 Mon Sep 17 00:00:00 2001 From: Giuseppe D'Angelo Date: Thu, 23 Jun 2011 15:26:08 +0200 Subject: Long live QStringIterator! UCS-4 iterator over a QString. Kept private for now so we can still work on the API. Done-with: Thiago Change-Id: I377f8bb1921e591ee3292c08c3e097fb6bc7f0c4 Reviewed-by: Thiago Macieira --- .../code/src_corelib_tools_qstringiterator.cpp | 72 +++++ src/corelib/tools/qstringiterator.qdoc | 328 +++++++++++++++++++++ src/corelib/tools/qstringiterator_p.h | 233 +++++++++++++++ src/corelib/tools/tools.pri | 1 + 4 files changed, 634 insertions(+) create mode 100644 src/corelib/doc/snippets/code/src_corelib_tools_qstringiterator.cpp create mode 100644 src/corelib/tools/qstringiterator.qdoc create mode 100644 src/corelib/tools/qstringiterator_p.h (limited to 'src/corelib') diff --git a/src/corelib/doc/snippets/code/src_corelib_tools_qstringiterator.cpp b/src/corelib/doc/snippets/code/src_corelib_tools_qstringiterator.cpp new file mode 100644 index 0000000000..178c6feb0a --- /dev/null +++ b/src/corelib/doc/snippets/code/src_corelib_tools_qstringiterator.cpp @@ -0,0 +1,72 @@ +/**************************************************************************** +** +** Copyright (C) 2014 Klarälvdalens Datakonsult AB, a KDAB Group company, info@kdab.com, author Giuseppe D'Angelo +** Contact: http://www.qt-project.org/legal +** +** This file is part of the documentation of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:BSD$ +** You may use this file under the terms of the BSD license as follows: +** +** "Redistribution and use in source and binary forms, with or without +** modification, are permitted provided that the following conditions are +** met: +** * Redistributions of source code must retain the above copyright +** notice, this list of conditions and the following disclaimer. 
+** * Redistributions in binary form must reproduce the above copyright +** notice, this list of conditions and the following disclaimer in +** the documentation and/or other materials provided with the +** distribution. +** * Neither the name of Digia Plc and its Subsidiary(-ies) nor the names +** of its contributors may be used to endorse or promote products derived +** from this software without specific prior written permission. +** +** +** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +** "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +** LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +** A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +** OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +** SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +** LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +** DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +** THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +** (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +** OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include +#include +#include + +int main() +{ + +{ +//! [0] +QString string(QStringLiteral("a string")); +QStringIterator i(string); +//! [0] + +//! [1] +// will print 97, 32, 115, 116, etc.; +// that is, the decimal value of the code points in the Unicode string "a string" +while (i.hasNext()) + qDebug() << i.next(); +//! [1] +} + +{ +//! 
[2] +QString string(QStringLiteral("𝄞 is the G clef")); +QStringIterator i(string); +qDebug() << hex << i.next(); // will print 1d11e (U+1D11E, MUSICAL SYMBOL G CLEF) +qDebug() << hex << i.next(); // will print 20 (U+0020, SPACE) +qDebug() << hex << i.next(); // will print 69 (U+0069, LATIN SMALL LETTER I) +//! [2] +} + +} diff --git a/src/corelib/tools/qstringiterator.qdoc b/src/corelib/tools/qstringiterator.qdoc new file mode 100644 index 0000000000..510d5fbccf --- /dev/null +++ b/src/corelib/tools/qstringiterator.qdoc @@ -0,0 +1,328 @@ +/**************************************************************************** +** +** Copyright (C) 2014 Klarälvdalens Datakonsult AB, a KDAB Group company, info@kdab.com, author Giuseppe D'Angelo +** Contact: http://www.qt-project.org/legal +** +** This file is part of the QtCore module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and Digia. For licensing terms and +** conditions see http://qt.digia.com/licensing. For further information +** use the contact form at http://qt.digia.com/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Digia gives you certain additional +** rights. 
These rights are described in the Digia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. +** +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +/*! + \class QStringIterator + \since 5.3 + \inmodule QtCore + \ingroup tools + + \internal + + \brief The QStringIterator class provides a Unicode-aware iterator over QString. + + \reentrant + + QStringIterator is a Java-like, bidirectional, const iterator over the contents of a + QString. Unlike QString's own iterators, which manage the individual UTF-16 code units, + QStringIterator is Unicode-aware: it will transparently handle the \e{surrogate pairs} + that may be present in a QString, and return the individual Unicode code points. + + You can create a QStringIterator that iterates over a given + QString by passing the string to the QStringIterator's constructor: + + \snippet code/src_corelib_tools_qstringiterator.cpp 0 + + A newly created QStringIterator will point before the first position in the + string. It is possible to check whether the iterator can be advanced by + calling hasNext(), and actually advance it (and obtain the next code point) + by calling next(): + + \snippet code/src_corelib_tools_qstringiterator.cpp 1 + + Similarly, the hasPrevious() and previous() functions can be used to iterate backwards. 
+ + The peekNext() and peekPrevious() functions will return the code point + respectively after and behind the iterator's current position, but unlike + next() and previous() they will not move the iterator. + Similarly, the advance() and recede() functions will move the iterator + respectively after and behind the iterator's current position, but they + will not return the code point the iterator has moved through. + + \section1 Unicode handling + + QString and all of its functions work in terms of UTF-16 code units. Unicode code points + that fall outside the Basic Multilingual Plane (U+10000 to U+10FFFF) will therefore + be represented by \e{surrogate pairs} in a QString, that is, a sequence of two + UTF-16 code units that encode a single code point. + + QStringIterator will automatically handle surrogate pairs inside a QString, + and return the correctly decoded code point, while also moving the iterator by + the right number of code units to match the decoded code points. + + For instance: + + \snippet code/src_corelib_tools_qstringiterator.cpp 2 + + If the iterator is not able to decode the next code point (or the previous + one, when iterating backwards), then it will return \c{0xFFFD}, that is, + Unicode's replacement character (see QChar::ReplacementCharacter). + It is possible to make QStringIterator return another value when it encounters + a decoding problem; please refer to each function's documentation for + more details. + + \section1 Unchecked iteration + + It is possible to optimize iterating over a QString's contents by skipping + some checks. This is in general not safe to do, because a QString is allowed + to contain malformed UTF-16 data; however, if we can trust a given QString, + then we can use the optimized \e{unchecked} functions. 
+ + QStringIterator provides the \e{unchecked} counterparts for next(), + peekNext(), advance(), previous(), peekPrevious(), and recede(): + they're called, respectively, + nextUnchecked(), peekNextUnchecked(), advanceUnchecked(), + previousUnchecked(), peekPreviousUnchecked(), recedeUnchecked(). + The counterparts work exactly like the original ones, + but they're faster as they're allowed to make certain assumptions about + the string contents. + + \note please be extremely careful when using QStringIterator's unchecked functions, + as using them on a string containing malformed data leads to undefined behavior. + + \sa QString, QChar +*/ + +/*! + \fn QStringIterator::QStringIterator(const QString &string) + + Constructs an iterator over the contents of \a string. The iterator will point + before the first position in the string. + + The string \a string must remain valid while the iterator is being used. +*/ + +/*! + \fn QStringIterator::QStringIterator(const QChar *begin, const QChar *end) + + Constructs an iterator which iterates over the range from \a begin to \a end. + The iterator will point before \a begin. + + The range from \a begin to \a end must remain valid while the iterator is being used. +*/ + +/*! + \fn QString::const_iterator QStringIterator::position() const + + Returns the current position of the iterator. +*/ + +/*! + \fn void QStringIterator::setPosition(QString::const_iterator position) + + Sets the iterator's current position to \a position, which must be inside + of the iterable range. +*/ + +/*! + \fn bool QStringIterator::hasNext() const + + Returns true if the iterator has not reached the end of the valid iterable range + and therefore can move forward; false otherwise. + + \sa next() +*/ + +/*! + \fn void QStringIterator::advance() + + Advances the iterator by one Unicode code point. + + \note calling this function when the iterator is past the end of the iterable range + leads to undefined behavior. 
+ + \sa next(), hasNext() +*/ + +/*! + \fn void QStringIterator::advanceUnchecked() + + Advances the iterator by one Unicode code point. + + \note calling this function when the iterator is past the end of the iterable range + or on a QString containing malformed UTF-16 data leads to undefined behavior. + + \sa advance(), next(), hasNext() +*/ + +/*! + \fn uint QStringIterator::peekNextUnchecked() const + + Returns the Unicode code point that is immediately after the iterator's current + position. The current position is not changed. + + \note calling this function when the iterator is past the end of the iterable range + or on a QString containing malformed UTF-16 data leads to undefined behavior. + + \sa peekNext(), next(), hasNext() +*/ + +/*! + \fn uint QStringIterator::peekNext(uint invalidAs = QChar::ReplacementCharacter) const + + Returns the Unicode code point that is immediately after the iterator's current + position. The current position is not changed. + + If the iterator is not able to decode the UTF-16 data after the iterator's current + position, this function returns \a invalidAs (by default, QChar::ReplacementCharacter, + which corresponds to \c{U+FFFD}). + + \note calling this function when the iterator is past the end of the iterable range + leads to undefined behavior. + + \sa next(), hasNext() +*/ + +/*! + \fn uint QStringIterator::nextUnchecked() + + Advances the iterator's current position by one Unicode code point, + and returns the Unicode code point that gets pointed by the iterator. + + \note calling this function when the iterator is past the end of the iterable range + or on a QString containing malformed UTF-16 data leads to undefined behavior. + + \sa next(), hasNext() +*/ + +/*! + \fn uint QStringIterator::next(uint invalidAs = QChar::ReplacementCharacter) + + Advances the iterator's current position by one Unicode code point, + and returns the Unicode code point that gets pointed by the iterator. 
+ + If the iterator is not able to decode the UTF-16 data at the iterator's current + position, this function returns \a invalidAs (by default, QChar::ReplacementCharacter, + which corresponds to \c{U+FFFD}). + + \note calling this function when the iterator is past the end of the iterable range + leads to undefined behavior. + + \sa peekNext(), hasNext() +*/ + + +/*! + \fn bool QStringIterator::hasPrevious() const + + Returns true if the iterator is after the beginning of the valid iterable range + and therefore can move backwards; false otherwise. + + \sa previous() +*/ + +/*! + \fn void QStringIterator::recede() + + Moves the iterator back by one Unicode code point. + + \note calling this function when the iterator is before the beginning of the iterable range + leads to undefined behavior. + + \sa previous(), hasPrevious() +*/ + +/*! + \fn void QStringIterator::recedeUnchecked() + + Moves the iterator back by one Unicode code point. + + \note calling this function when the iterator is before the beginning of the iterable range + or on a QString containing malformed UTF-16 data leads to undefined behavior. + + \sa recede(), previous(), hasPrevious() +*/ + +/*! + \fn uint QStringIterator::peekPreviousUnchecked() const + + Returns the Unicode code point that is immediately before the iterator's current + position. The current position is not changed. + + \note calling this function when the iterator is before the beginning of the iterable range + or on a QString containing malformed UTF-16 data leads to undefined behavior. + + \sa previous(), hasPrevious() +*/ + +/*! + \fn uint QStringIterator::peekPrevious(uint invalidAs = QChar::ReplacementCharacter) const + + Returns the Unicode code point that is immediately before the iterator's current + position. The current position is not changed. 
+ + If the iterator is not able to decode the UTF-16 data before the iterator's current + position, this function returns \a invalidAs (by default, QChar::ReplacementCharacter, + which corresponds to \c{U+FFFD}). + + \note calling this function when the iterator is before the beginning of the iterable range + leads to undefined behavior. + + \sa previous(), hasPrevious() +*/ + +/*! + \fn uint QStringIterator::previousUnchecked() + + Moves the iterator's current position back by one Unicode code point, + and returns the Unicode code point that gets pointed by the iterator. + + \note calling this function when the iterator is before the beginning of the iterable range + or on a QString containing malformed UTF-16 data leads to undefined behavior. + + \sa previous(), hasPrevious() +*/ + +/*! + \fn uint QStringIterator::previous(uint invalidAs = QChar::ReplacementCharacter) + + Moves the iterator's current position back by one Unicode code point, + and returns the Unicode code point that gets pointed by the iterator. + + If the iterator is not able to decode the UTF-16 data at the iterator's current + position, this function returns \a invalidAs (by default, QChar::ReplacementCharacter, + which corresponds to \c{U+FFFD}). + + \note calling this function when the iterator is before the beginning of the iterable range + leads to undefined behavior. + + \sa peekPrevious(), hasPrevious() +*/ diff --git a/src/corelib/tools/qstringiterator_p.h b/src/corelib/tools/qstringiterator_p.h new file mode 100644 index 0000000000..c3986f0477 --- /dev/null +++ b/src/corelib/tools/qstringiterator_p.h @@ -0,0 +1,233 @@ +/**************************************************************************** +** +** Copyright (C) 2014 Digia Plc and/or its subsidiary(-ies). +** Copyright (C) 2014 Klarälvdalens Datakonsult AB, a KDAB Group company, info@kdab.com, author Giuseppe D'Angelo +** Contact: http://www.qt-project.org/legal +** +** This file is part of the QtCore module of the Qt Toolkit. 
+** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and Digia. For licensing terms and +** conditions see http://qt.digia.com/licensing. For further information +** use the contact form at http://qt.digia.com/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Digia gives you certain additional +** rights. These rights are described in the Digia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. 
+** +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#ifndef QSTRINGITERATOR_H +#define QSTRINGITERATOR_H + +#include + +QT_BEGIN_NAMESPACE + +class QStringIterator +{ + QString::const_iterator i, pos, e; + +public: + inline explicit QStringIterator(const QString &string) + : i(string.constBegin()), + pos(string.constBegin()), + e(string.constEnd()) + { + } + + inline explicit QStringIterator(const QChar *begin, const QChar *end) + : i(begin), + pos(begin), + e(end) + { + } + + inline QString::const_iterator position() const + { + return pos; + } + + inline void setPosition(QString::const_iterator position) + { + Q_ASSERT_X(i <= position && position <= e, Q_FUNC_INFO, "position out of bounds"); + pos = position; + } + + // forward iteration + + inline bool hasNext() const + { + return pos < e; + } + + inline void advance() + { + Q_ASSERT_X(hasNext(), Q_FUNC_INFO, "iterator hasn't a next item"); + + if (Q_UNLIKELY((pos++)->isHighSurrogate())) { + if (Q_LIKELY(pos != e && pos->isLowSurrogate())) + ++pos; + } + } + + inline void advanceUnchecked() + { + Q_ASSERT_X(hasNext(), Q_FUNC_INFO, "iterator hasn't a next item"); + + if (Q_UNLIKELY((pos++)->isHighSurrogate())) + ++pos; + } + + inline uint peekNextUnchecked() const + { + Q_ASSERT_X(hasNext(), Q_FUNC_INFO, "iterator hasn't a next item"); + + if (Q_UNLIKELY(pos->isHighSurrogate())) + return QChar::surrogateToUcs4(pos[0], pos[1]); + + return pos->unicode(); + } + + inline uint peekNext(uint invalidAs = QChar::ReplacementCharacter) const + { + Q_ASSERT_X(hasNext(), Q_FUNC_INFO, "iterator hasn't a next item"); + + if (Q_UNLIKELY(pos->isSurrogate())) { + if (Q_LIKELY(pos->isHighSurrogate())) { + const QChar *low = pos + 1; + if (Q_LIKELY(low != e && low->isLowSurrogate())) + return QChar::surrogateToUcs4(*pos, *low); + } + return invalidAs; + } + + return pos->unicode(); + } + + inline uint nextUnchecked() + { + Q_ASSERT_X(hasNext(), Q_FUNC_INFO, "iterator 
hasn't a next item"); + + const QChar cur = *pos++; + if (Q_UNLIKELY(cur.isHighSurrogate())) + return QChar::surrogateToUcs4(cur, *pos++); + return cur.unicode(); + } + + inline uint next(uint invalidAs = QChar::ReplacementCharacter) + { + Q_ASSERT_X(hasNext(), Q_FUNC_INFO, "iterator hasn't a next item"); + + const QChar uc = *pos++; + if (Q_UNLIKELY(uc.isSurrogate())) { + if (Q_LIKELY(uc.isHighSurrogate() && pos < e && pos->isLowSurrogate())) + return QChar::surrogateToUcs4(uc, *pos++); + return invalidAs; + } + + return uc.unicode(); + } + + // backwards iteration + + inline bool hasPrevious() const + { + return pos > i; + } + + inline void recede() + { + Q_ASSERT_X(hasPrevious(), Q_FUNC_INFO, "iterator hasn't a previous item"); + + if (Q_UNLIKELY((--pos)->isLowSurrogate())) { + const QChar *high = pos - 1; + if (Q_LIKELY(high != i - 1 && high->isHighSurrogate())) + --pos; + } + } + + inline void recedeUnchecked() + { + Q_ASSERT_X(hasPrevious(), Q_FUNC_INFO, "iterator hasn't a previous item"); + + if (Q_UNLIKELY((--pos)->isLowSurrogate())) + --pos; + } + + inline uint peekPreviousUnchecked() const + { + Q_ASSERT_X(hasPrevious(), Q_FUNC_INFO, "iterator hasn't a previous item"); + + if (Q_UNLIKELY(pos[-1].isLowSurrogate())) + return QChar::surrogateToUcs4(pos[-2], pos[-1]); + return pos[-1].unicode(); + } + + inline uint peekPrevious(uint invalidAs = QChar::ReplacementCharacter) const + { + Q_ASSERT_X(hasPrevious(), Q_FUNC_INFO, "iterator hasn't a previous item"); + + if (Q_UNLIKELY(pos[-1].isSurrogate())) { + if (Q_LIKELY(pos[-1].isLowSurrogate())) { + const QChar *high = pos - 2; + if (Q_LIKELY(high != i - 1 && high->isHighSurrogate())) + return QChar::surrogateToUcs4(*high, pos[-1]); + } + return invalidAs; + } + + return pos[-1].unicode(); + } + + inline uint previousUnchecked() + { + Q_ASSERT_X(hasPrevious(), Q_FUNC_INFO, "iterator hasn't a previous item"); + + const QChar cur = *--pos; + if (Q_UNLIKELY(cur.isLowSurrogate())) + return 
QChar::surrogateToUcs4(*--pos, cur); + return cur.unicode(); + } + + inline uint previous(uint invalidAs = QChar::ReplacementCharacter) + { + Q_ASSERT_X(hasPrevious(), Q_FUNC_INFO, "iterator hasn't a previous item"); + + const QChar uc = *--pos; + if (Q_UNLIKELY(uc.isSurrogate())) { + if (Q_LIKELY(uc.isLowSurrogate() && pos > i && pos[-1].isHighSurrogate())) + return QChar::surrogateToUcs4(*--pos, uc); + return invalidAs; + } + + return uc.unicode(); + } +}; + +QT_END_NAMESPACE + +#endif // QSTRINGITERATOR_H diff --git a/src/corelib/tools/tools.pri b/src/corelib/tools/tools.pri index ba995b047d..4ebd6ccd66 100644 --- a/src/corelib/tools/tools.pri +++ b/src/corelib/tools/tools.pri @@ -56,6 +56,7 @@ HEADERS += \ tools/qstack.h \ tools/qstring.h \ tools/qstringbuilder.h \ + tools/qstringiterator_p.h \ tools/qstringlist.h \ tools/qstringmatcher.h \ tools/qtextboundaryfinder.h \ -- cgit v1.2.3