summaryrefslogtreecommitdiffstats
path: root/src/corelib/text/qstringtokenizer.cpp
blob: 4e8ffc2c8e309b905ad1091155ecb95bebd8efe8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
/****************************************************************************
**
** Copyright (C) 2020 Klarälvdalens Datakonsult AB, a KDAB Group company, info@kdab.com, author Marc Mutz <marc.mutz@kdab.com>
** Contact: http://www.qt.io/licensing/
**
** This file is part of the QtCore module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and The Qt Company. For licensing terms
** and conditions see https://www.qt.io/terms-conditions. For further
** information use the contact form at https://www.qt.io/contact-us.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 3 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL3 included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 3 requirements
** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 2.0 or (at your option) the GNU General
** Public license version 3 or any later version approved by the KDE Free
** Qt Foundation. The licenses are as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
** included in the packaging of this file. Please review the following
** information to ensure the GNU General Public License requirements will
** be met: https://www.gnu.org/licenses/gpl-2.0.html and
** https://www.gnu.org/licenses/gpl-3.0.html.
**
** $QT_END_LICENSE$
**
****************************************************************************/

#include "qstringtokenizer.h"
#include "qstringalgorithms.h"

QT_BEGIN_NAMESPACE

/*!
    \class QStringTokenizer
    \inmodule QtCore
    \since 6.0
    \brief The QStringTokenizer class splits strings into tokens along given separators.
    \reentrant
    \ingroup tools
    \ingroup string-processing

    Splits a string into substrings wherever a given separator occurs,
    returning a (lazily constructed) list of those strings. If the separator does
    not match anywhere in the string, produces a single-element list
    containing this string.  If the separator is empty,
    QStringTokenizer produces an empty string, followed by each of the
    string's characters, followed by another empty string. The two
    enumerations Qt::SplitBehavior and Qt::CaseSensitivity further
    control the output.

    QStringTokenizer drives QStringView::tokenize(), but, at least with a
    recent compiler, you can use it directly, too:

    \code
    for (auto it : QStringTokenizer{string, separator})
        use(*it);
    \endcode

    \note You should never, ever, name the template arguments of a
    QStringTokenizer explicitly.  If you can use C++17 Class Template
    Argument Deduction (CTAD), you may write
    \c{QStringTokenizer{string, separator}} (without template
    arguments).  If you can't use C++17 CTAD, you must use the
    QStringView::split() or QLatin1String::split() member functions
    and store the return value only in \c{auto} variables:

    \code
    auto result = string.split(sep);
    \endcode

    This is because the template arguments of QStringTokenizer have a
    very subtle dependency on the specific string and separator types
    from with which they are constructed, and they don't usually
    correspond to the actual types passed.

    \section1 Lazy Sequences

    QStringTokenizer acts as a so-called lazy sequence, that is, each
    next element is only computed once you ask for it. Lazy sequences
    have the advantage that they only require O(1) memory. They have
    the disadvantage that, at least for QStringTokenizer, they only
    allow forward, not random-access, iteration.

    The intended use-case is that you just plug it into a ranged for loop:

    \code
    for (auto it : QStringTokenizer{string, separator})
        use(*it);
    \endcode

    or a C++20 ranged algorithm:

    \code
    std::ranges::for_each(QStringTokenizer{string, separator},
                          [] (auto token) { use(token); });
    \endcode

    \section1 End Sentinel

    The QStringTokenizer iterators cannot be used with classical STL
    algorithms, because those require iterator/iterator pairs, while
    QStringTokenizer uses sentinels. That is, it uses a different
    type, QStringTokenizer::sentinel, to mark the end of the
    range. This improves performance, because the sentinel is an empty
    type. Sentinels are supported from C++17 (for ranged for)
    and C++20 (for algorithms using the new ranges library).

    \section1 Temporaries

    QStringTokenizer is very carefully designed to avoid dangling
    references. If you construct a tokenizer from a temporary string
    (an rvalue), that argument is stored internally, so the referenced
    data isn't deleted before it is tokenized:

    \code
    auto tok = QStringTokenizer{widget.text(), u','};
    // return value of `widget.text()` is destroyed, but content was moved into `tok`
    for (auto e : tok)
       use(e);
    \endcode

    If you pass named objects (lvalues), then QStringTokenizer does
    not store a copy. You are responsible to keep the named object's
    data around for longer than the tokenizer operates on it:

    \code
    auto text = widget.text();
    auto tok = QStringTokenizer{text, u','};
    text.clear();      // destroy content of `text`
    for (auto e : tok) // ERROR: `tok` references deleted data!
        use(e);
    \endcode

    \sa QStringView::split(), QLatin1String::split(), QRegularExpression
*/

/*!
    \typedef QStringTokenizer::value_type

    Alias for \c{const QStringView} or \c{const QLatin1String},
    depending on the tokenizer's \c Haystack template argument.
*/

/*!
    \typedef QStringTokenizer::difference_type

    Alias for qsizetype.
*/

/*!
    \typedef QStringTokenizer::size_type

    Alias for qsizetype.
*/

/*!
    \typedef QStringTokenizer::reference

    Alias for \c{value_type &}.

    QStringTokenizer does not support mutable references, so this is
    the same as const_reference.
*/

/*!
    \typedef QStringTokenizer::const_reference

    Alias for \c{value_type &}.
*/

/*!
    \typedef QStringTokenizer::pointer

    Alias for \c{value_type *}.

    QStringTokenizer does not support mutable iterators, so this is
    the same as const_pointer.
*/

/*!
    \typedef QStringTokenizer::const_pointer

    Alias for \c{value_type *}.
*/

/*!
    \typedef QStringTokenizer::iterator

    This typedef provides an STL-style const iterator for
    QStringTokenizer.

    QStringTokenizer does not support mutable iterators, so this is
    the same as const_iterator.

    \sa const_iterator
*/

/*!
    \typedef QStringTokenizer::const_iterator

    This typedef provides an STL-style const iterator for
    QStringTokenizer.

    \sa iterator
*/

/*!
    \typedef QStringTokenizer::sentinel

    This typedef provides an STL-style sentinel for
    QStringTokenizer::iterator and QStringTokenizer::const_iterator.

    \sa const_iterator
*/

/*!
    \fn QStringTokenizer(Haystack haystack, String needle, Qt::CaseSensitivity cs, Qt::SplitBehavior sb)
    \fn QStringTokenizer(Haystack haystack, String needle, Qt::SplitBehavior sb, Qt::CaseSensitivity cs)

    Constructs a string tokenizer that splits the string \a haystack
    into substrings wherever \a needle occurs, and allows iteration
    over those strings as they are found. If \a needle does not match
    anywhere in \a haystack, a single element containing \a haystack
    is produced.

    \a cs specifies whether \a needle should be matched case
    sensitively or case insensitively.

    If \a sb is QString::SkipEmptyParts, empty entries don't
    appear in the result. By default, empty entries are included.

    \sa QStringView::split(), QLatin1String::split(), Qt::CaseSensitivity, Qt::SplitBehavior
*/

/*!
    \fn QStringTokenizer::const_iterator QStringTokenizer::begin() const

    Returns a const \l{STL-style iterators}{STL-style iterator}
    pointing to the first token in the list.

    \sa end(), cbegin()
*/

/*!
    \fn QStringTokenizer::const_iterator QStringTokenizer::cbegin() const

    Same as begin().

    \sa cend(), begin()
*/

/*!
    \fn QStringTokenizer::sentinel QStringTokenizer::end() const

    Returns a const \l{STL-style iterators}{STL-style sentinel}
    pointing to the imaginary token after the last token in the list.

    \sa begin(), cend()
*/

/*!
    \fn QStringTokenizer::sentinel QStringTokenizer::cend() const

    Same as end().

    \sa cbegin(), end()
*/

/*!
    \fn QStringTokenizer::toContainer(Container &&c) const &

    Convenience method to convert the lazy sequence into a
    (typically) random-access container.

    This function is only available if \c Container has a \c value_type
    matching this tokenizer's value_type.

    If you pass in a named container (an lvalue), then that container
    is filled, and a reference to it is returned.

    If you pass in a temporary container (an rvalue, incl. the default
    argument), then that container is filled, and returned by value.

    \code
    // assuming tok's value_type is QStringView, then...
    auto tok = QStringTokenizer{~~~};
    // ... rac1 is a QList:
    auto rac1 = tok.toContainer();
    // ... rac2 is std::pmr::vector<QStringView>:
    auto rac2 = tok.toContainer<std::pmr::vector<QStringView>>();
    auto rac3 = QVarLengthArray<QStringView, 12>{};
    // appends the token sequence produced by tok to rac3
    //  and returns a reference to rac3 (which we ignore here):
    tok.toContainer(rac3);
    \endcode

    This gives you maximum flexibility in how you want the sequence to
    be stored.
*/

/*!
    \fn QStringTokenizer::toContainer(Container &&c) const &&
    \overload

    In addition to the constraints on the lvalue-this overload, this
    rvalue-this overload is only available when this QStringTokenizer
    does not store the haystack internally, as this could create a
    container full of dangling references:

    \code
    auto tokens = QStringTokenizer{widget.text(), u','}.toContainer();
    // ERROR: cannot call toContainer() on rvalue
    // 'tokens' references the data of the copy of widget.text()
    // stored inside the QStringTokenizer, which has since been deleted
    \endcode

    To fix, store the QStringTokenizer in a temporary:

    \code
    auto tokenizer = QStringTokenizer{widget.text90, u','};
    auto tokens = tokenizer.toContainer();
    // OK: the copy of widget.text() stored in 'tokenizer' keeps the data
    // referenced by 'tokens' alive.
    \endcode

    You can force this function into existence by passing a view instead:

    \code
    func(QStringTokenizer{QStringView{widget.text()}, u','}.toContainer());
    // OK: compiler keeps widget.text() around until after func() has executed
    \endcode
*/

/*!
    \fn qTokenize(Haystack &&haystack, Needle &&needle, Flags...flags)
    \relates QStringTokenizer
    \since 6.0

    Factory function for QStringTokenizer. You can use this function
    if your compiler doesn't, yet, support C++17 Class Template
    Argument Deduction (CTAD), but we recommend direct use of
    QStringTokenizer with CTAD instead.
*/

QT_END_NAMESPACE