summaryrefslogtreecommitdiffstats
path: root/src/corelib/text
diff options
context:
space:
mode:
authorLars Knoll <lars.knoll@qt.io>2020-04-22 14:39:27 +0200
committerLars Knoll <lars.knoll@qt.io>2020-05-14 07:47:33 +0200
commitcab0d57d1eb02d93774f4e444cb4514400bac0e9 (patch)
tree49f59bd135fcd071367d529b64304ca4e864fdc9 /src/corelib/text
parent542ded462ef637dd48b29ba10845c3afb76ec126 (diff)
Clean up the Flag handling in QStringConverter
IgnoreHeader was a rather badly defined enum, in addition the utf8 and utf16 codecs where handling BOMs somewhat different for stateless decoding. Fix this by introducing explicit flags for writing a bom when encoding and not skipping the initial bom when decoding. Source compatibility for QTextCodec is done with a couple of static constexpr variables. Change-Id: I0b2d94f84c937cec1e0494c16ef448c00382691d Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Diffstat (limited to 'src/corelib/text')
-rw-r--r--src/corelib/text/qstringconverter.cpp93
-rw-r--r--src/corelib/text/qstringconverter.h28
2 files changed, 65 insertions, 56 deletions
diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp
index 5858dc3e0f..fd716da15a 100644
--- a/src/corelib/text/qstringconverter.cpp
+++ b/src/corelib/text/qstringconverter.cpp
@@ -373,6 +373,8 @@ static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end,
}
#endif
+enum { HeaderDone = 1 };
+
QByteArray QUtf8::convertFromUnicode(const QChar *uc, qsizetype len)
{
// create a QByteArray with the worst case scenario size
@@ -405,23 +407,25 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, qsizetype len, QStringConv
uchar replacement = '?';
qsizetype rlen = 3*len;
int surrogate_high = -1;
+ bool writeBom = false;
if (state) {
- if (state->flags & QStringConverter::ConvertInvalidToNull)
+ if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
replacement = 0;
- if (!(state->flags & QStringConverter::IgnoreHeader))
+ if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
rlen += 3;
+ writeBom = true;
+ }
if (state->remainingChars)
surrogate_high = state->state_data[0];
}
-
QByteArray rstr(rlen, Qt::Uninitialized);
uchar *cursor = reinterpret_cast<uchar *>(const_cast<char *>(rstr.constData()));
const ushort *src = reinterpret_cast<const ushort *>(uc);
const ushort *const end = src + len;
int invalid = 0;
- if (state && !(state->flags & QStringConverter::IgnoreHeader)) {
+ if (writeBom) {
// append UTF-8 BOM
*cursor++ = utf8bom[0];
*cursor++ = utf8bom[1];
@@ -459,7 +463,7 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, qsizetype len, QStringConv
rstr.resize(cursor - (const uchar*)rstr.constData());
if (state) {
state->invalidChars += invalid;
- state->flags |= QStringConverter::IgnoreHeader;
+ state->internalState |= HeaderDone;
state->remainingChars = 0;
if (surrogate_high >= 0) {
state->remainingChars = 1;
@@ -545,7 +549,7 @@ QChar *QUtf8::convertToUnicode(QChar *buffer, const char *chars, qsizetype len)
QString QUtf8::convertToUnicode(const char *chars, qsizetype len, QStringConverter::State *state)
{
- bool headerdone = false;
+ bool headerdone = state && state->internalState & HeaderDone;
ushort replacement = QChar::ReplacementCharacter;
int invalid = 0;
int res;
@@ -568,9 +572,9 @@ QString QUtf8::convertToUnicode(const char *chars, qsizetype len, QStringConvert
const uchar *end = src + len;
if (state) {
- if (state->flags & QStringConverter::IgnoreHeader)
+ if (state->flags & QStringConverter::Flag::DontSkipInitialBom)
headerdone = true;
- if (state->flags & QStringConverter::ConvertInvalidToNull)
+ if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
replacement = QChar::Null;
if (state->remainingChars) {
// handle incoming state first
@@ -636,7 +640,7 @@ QString QUtf8::convertToUnicode(const char *chars, qsizetype len, QStringConvert
}
}
- if (!state && res == QUtf8BaseTraits::EndOfString) {
+ if ((!state || state->flags & QStringConverter::Flag::Stateless) && res == QUtf8BaseTraits::EndOfString) {
// unterminated UTF sequence
*dst++ = QChar::ReplacementCharacter;
while (src++ < end)
@@ -647,7 +651,7 @@ QString QUtf8::convertToUnicode(const char *chars, qsizetype len, QStringConvert
if (state) {
state->invalidChars += invalid;
if (headerdone)
- state->flags |= QStringConverter::IgnoreHeader;
+ state->internalState |= HeaderDone;
if (res == QUtf8BaseTraits::EndOfString) {
--src; // unread the byte in ch
state->remainingChars = end - src;
@@ -748,21 +752,20 @@ int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, QLatin1String s)
return (end1 > src1) - (end2 > src2);
}
-QByteArray QUtf16::convertFromUnicode(const QChar *uc, qsizetype len, QStringConverter::State *state, DataEndianness e)
+QByteArray QUtf16::convertFromUnicode(const QChar *uc, qsizetype len, QStringConverter::State *state, DataEndianness endian)
{
- DataEndianness endian = e;
+ bool writeBom = state && !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
qsizetype length = 2*len;
- if (!state || (!(state->flags & QStringConverter::IgnoreHeader))) {
+ if (writeBom)
length += 2;
- }
- if (e == DetectEndianness) {
+
+ if (endian == DetectEndianness)
endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
- }
QByteArray d;
d.resize(length);
char *data = d.data();
- if (!state || !(state->flags & QStringConverter::IgnoreHeader)) {
+ if (writeBom) {
QChar bom(QChar::ByteOrderMark);
if (endian == BigEndianness)
qToBigEndian(bom.unicode(), data);
@@ -777,19 +780,19 @@ QByteArray QUtf16::convertFromUnicode(const QChar *uc, qsizetype len, QStringCon
if (state) {
state->remainingChars = 0;
- state->flags |= QStringConverter::IgnoreHeader;
+ state->internalState |= HeaderDone;
}
return d;
}
-QString QUtf16::convertToUnicode(const char *chars, qsizetype len, QStringConverter::State *state, DataEndianness e)
+QString QUtf16::convertToUnicode(const char *chars, qsizetype len, QStringConverter::State *state, DataEndianness endian)
{
- DataEndianness endian = e;
bool half = false;
uchar buf = 0;
- bool headerdone = false;
+ bool headerdone = state && state->internalState & HeaderDone;
if (state) {
- headerdone = state->flags & QStringConverter::IgnoreHeader;
+ if (state->flags & QStringConverter::Flag::DontSkipInitialBom)
+ headerdone = true;
if (endian == DetectEndianness)
endian = (DataEndianness)state->state_data[Endian];
if (state->remainingChars) {
@@ -844,7 +847,7 @@ QString QUtf16::convertToUnicode(const char *chars, qsizetype len, QStringConver
if (state) {
if (headerdone)
- state->flags |= QStringConverter::IgnoreHeader;
+ state->internalState |= HeaderDone;
state->state_data[Endian] = endian;
if (half) {
state->remainingChars = 1;
@@ -857,20 +860,19 @@ QString QUtf16::convertToUnicode(const char *chars, qsizetype len, QStringConver
return result;
}
-QByteArray QUtf32::convertFromUnicode(const QChar *uc, qsizetype len, QStringConverter::State *state, DataEndianness e)
+QByteArray QUtf32::convertFromUnicode(const QChar *uc, qsizetype len, QStringConverter::State *state, DataEndianness endian)
{
- DataEndianness endian = e;
+ bool writeBom = state && !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
qsizetype length = 4*len;
- if (!state || (!(state->flags & QStringConverter::IgnoreHeader))) {
+ if (writeBom)
length += 4;
- }
- if (e == DetectEndianness) {
+
+ if (endian == DetectEndianness)
endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
- }
QByteArray d(length, Qt::Uninitialized);
char *data = d.data();
- if (!state || !(state->flags & QStringConverter::IgnoreHeader)) {
+ if (writeBom) {
if (endian == BigEndianness) {
data[0] = 0;
data[1] = 0;
@@ -902,22 +904,21 @@ QByteArray QUtf32::convertFromUnicode(const QChar *uc, qsizetype len, QStringCon
if (state) {
state->remainingChars = 0;
- state->flags |= QStringConverter::IgnoreHeader;
+ state->internalState |= HeaderDone;
}
return d;
}
-QString QUtf32::convertToUnicode(const char *chars, qsizetype len, QStringConverter::State *state, DataEndianness e)
+QString QUtf32::convertToUnicode(const char *chars, qsizetype len, QStringConverter::State *state, DataEndianness endian)
{
- DataEndianness endian = e;
uchar tuple[4];
int num = 0;
- bool headerdone = false;
+ bool headerdone = state && state->internalState & HeaderDone;
if (state) {
- headerdone = state->flags & QStringConverter::IgnoreHeader;
- if (endian == DetectEndianness) {
+ if (state->flags & QStringConverter::Flag::DontSkipInitialBom)
+ headerdone = true;
+ if (endian == DetectEndianness)
endian = (DataEndianness)state->state_data[Endian];
- }
num = state->remainingChars;
memcpy(tuple, &state->state_data[Data], 4);
}
@@ -963,7 +964,7 @@ QString QUtf32::convertToUnicode(const char *chars, qsizetype len, QStringConver
if (state) {
if (headerdone)
- state->flags |= QStringConverter::IgnoreHeader;
+ state->internalState |= HeaderDone;
state->state_data[Endian] = endian;
state->remainingChars = num;
memcpy(&state->state_data[Data], tuple, 4);
@@ -1178,10 +1179,17 @@ QByteArray QLocal8Bit::convertFromUnicode(const QChar *ch, qsizetype uclen, QStr
/*!
\enum QStringConverter::Flag
- \value DefaultConversion No flag is set.
+ \value Default Default conversion rules apply.
\value ConvertInvalidToNull If this flag is set, each invalid input
- character is output as a null character.
- \value IgnoreHeader Ignore any Unicode byte-order mark and don't generate any.
+ character is output as a null character. If it is not set,
+ invalid input characters are represented as QChar::ReplacementCharacter
+ if the output encoding can represent that character, otherwise as a question mark.
+ \value WriteBom When converting from a QString to an output encoding, write a QChar::ByteOrderMark as the first
+ character if the output encoding supports this. This is the case for UTF-8, UTF-16 and UTF-32
+ encodings.
+ \value DontSkipInitialBom When converting from an input encoding to a QString the QTextDecoder usually skips an
+ leading QChar::ByteOrderMark. When this flag is set, the byte order mark will not be
+ skipped, but inserted at the start of the created QString.
\value Stateless Ignore possible converter states between different function calls
to encode or decode strings.
@@ -1196,6 +1204,7 @@ void QStringConverter::State::clear()
state_data[0] = state_data[1] = state_data[2] = state_data[3] = 0;
remainingChars = 0;
invalidChars = 0;
+ internalState = 0;
}
static QChar *fromUtf8(QChar *out, const char *in, qsizetype length, QStringConverter::State *state)
@@ -1307,7 +1316,7 @@ static QChar *fromLatin1(QChar *out, const char *chars, qsizetype len, QStringCo
static char *toLatin1(char *out, QStringView in, QStringConverter::State *state)
{
- const char replacement = (state && state->flags & QStringConverter::ConvertInvalidToNull) ? 0 : '?';
+ const char replacement = (state && state->flags & QStringConverter::Flag::ConvertInvalidToNull) ? 0 : '?';
int invalid = 0;
for (qsizetype i = 0; i < in.length(); ++i) {
if (in[i] > QChar(0xff)) {
diff --git a/src/corelib/text/qstringconverter.h b/src/corelib/text/qstringconverter.h
index f2b78508e6..3f657c9935 100644
--- a/src/corelib/text/qstringconverter.h
+++ b/src/corelib/text/qstringconverter.h
@@ -57,16 +57,17 @@ QT_BEGIN_NAMESPACE
class QStringConverterBase
{
public:
- enum Flag {
- DefaultConversion,
- ConvertInvalidToNull = 0x1,
- IgnoreHeader = 0x2,
- Stateless = 0x4
+ enum class Flag {
+ Default = 0,
+ Stateless = 0x1,
+ ConvertInvalidToNull = 0x2,
+ WriteBom = 0x4,
+ DontSkipInitialBom = 0x8
};
Q_DECLARE_FLAGS(Flags, Flag)
struct State {
- constexpr State(Flags f = DefaultConversion)
+ constexpr State(Flags f = Flag::Default)
: flags(f), state_data{0, 0, 0, 0} {}
~State() { clear(); }
State(State &&other)
@@ -91,6 +92,7 @@ public:
Q_CORE_EXPORT void clear();
Flags flags;
+ int internalState = 0;
qsizetype remainingChars = 0;
qsizetype invalidChars = 0;
@@ -166,9 +168,7 @@ protected:
: QStringConverter(i)
{}
public:
- // ### We shouldn't write a BOM by default. Need to resolve this
- // while keeping compat with QTextCodec
- QSTRINGCONVERTER_CONSTEXPR QStringEncoder(Encoding encoding, Flags flags = IgnoreHeader)
+ QSTRINGCONVERTER_CONSTEXPR QStringEncoder(Encoding encoding, Flags flags = Flag::Default)
: QStringConverter(encoding, flags)
{}
@@ -200,13 +200,13 @@ public:
qsizetype requiredSpace(qsizetype inputLength) const
{ return iface->fromUtf16Len(inputLength); }
char *decodeIntoBuffer(char *out, const QChar *in, qsizetype length)
- { return iface->fromUtf16(out, QStringView(in, length), state.flags & Stateless ? nullptr : &state); }
+ { return iface->fromUtf16(out, QStringView(in, length), state.flags & Flag::Stateless ? nullptr : &state); }
QByteArray encode(QStringView in)
{
QByteArray result(iface->fromUtf16Len(in.size()), Qt::Uninitialized);
char *out = result.data();
// ### Fixme: needs to be moved into the conversion methods to honor the other flags
- out = iface->fromUtf16(out, in, state.flags & Stateless ? nullptr : &state);
+ out = iface->fromUtf16(out, in, state.flags & Flag::Stateless ? nullptr : &state);
result.truncate(out - result.constData());
return result;
}
@@ -229,7 +229,7 @@ protected:
: QStringConverter(i)
{}
public:
- QSTRINGCONVERTER_CONSTEXPR QStringDecoder(Encoding encoding, Flags flags = DefaultConversion)
+ QSTRINGCONVERTER_CONSTEXPR QStringDecoder(Encoding encoding, Flags flags = Flag::Default)
: QStringConverter(encoding, flags)
{}
@@ -259,13 +259,13 @@ public:
qsizetype requiredSpace(qsizetype inputLength) const
{ return iface->toUtf16Len(inputLength); }
QChar *decodeIntoBuffer(QChar *out, const char *in, qsizetype length)
- { return iface->toUtf16(out, in, length, state.flags & Stateless ? nullptr : &state); }
+ { return iface->toUtf16(out, in, length, state.flags & Flag::Stateless ? nullptr : &state); }
QString decode(const char *in, qsizetype length)
{
QString result(iface->toUtf16Len(length), Qt::Uninitialized);
QChar *out = result.data();
// ### Fixme: state handling needs to be moved into the conversion methods
- out = iface->toUtf16(out, in, length, state.flags & Stateless ? nullptr : &state);
+ out = iface->toUtf16(out, in, length, state.flags & Flag::Stateless ? nullptr : &state);
result.truncate(out - result.constData());
return result;
}