summaryrefslogtreecommitdiffstats
path: root/src/corelib/codecs
diff options
context:
space:
mode:
author Thiago Macieira <thiago.macieira@intel.com> 2014-04-23 11:18:17 -0700
committer The Qt Project <gerrit-noreply@qt-project.org> 2014-04-24 10:47:49 +0200
commit f56ef579ba5b1d3adda060fa9c0707e37f9f1baa (patch)
tree 59128b7dca8086510bde7e6912ba2a7532361ccb /src/corelib/codecs
parent cae970c68663c320bcce86160d44d857bfc68fd0 (diff)
Restore handling of BOMs in QString::fromUtf8
Commit 8dd47e34b9b96ac27a99cdcf10b8aec506882fc2 removed the handling of the BOMs but did not document it. This brings the behavior back and adds a unit test so we don't break it again.

Discussed-on: http://lists.qt-project.org/pipermail/development/2014-April/016532.html
Change-Id: Ifb7a9a6e5a494622f46b8ab435e1d168b862d952
Reviewed-by: Olivier Goffart <ogoffart@woboq.com>
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
Diffstat (limited to 'src/corelib/codecs')
-rw-r--r-- src/corelib/codecs/qutfcodec.cpp 44
1 files changed, 29 insertions, 15 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index 54312601e4..072cda63aa 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -52,6 +52,8 @@ QT_BEGIN_NAMESPACE
enum { Endian = 0, Data = 1 };
+static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
+
#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
{
@@ -187,9 +189,9 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
int invalid = 0;
if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
// append UTF-8 BOM
- *cursor++ = 0xef;
- *cursor++ = 0xbb;
- *cursor++ = 0xbf;
+ *cursor++ = utf8bom[0];
+ *cursor++ = utf8bom[1];
+ *cursor++ = utf8bom[2];
}
const ushort *nextAscii = src;
@@ -240,19 +242,31 @@ QString QUtf8::convertToUnicode(const char *chars, int len)
const uchar *src = reinterpret_cast<const uchar *>(chars);
const uchar *end = src + len;
- while (src < end) {
- const uchar *nextAscii = end;
- if (simdDecodeAscii(dst, nextAscii, src, end))
- break;
+ // attempt to do a full decoding in SIMD
+ const uchar *nextAscii = end;
+ if (!simdDecodeAscii(dst, nextAscii, src, end)) {
+ // at least one non-ASCII entry
+ // check if we failed to decode the UTF-8 BOM; if so, skip it
+ if (Q_UNLIKELY(src == reinterpret_cast<const uchar *>(chars))
+ && end - src >= 3
+ && Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) {
+ src += 3;
+ }
- do {
- uchar b = *src++;
- int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
- if (res < 0) {
- // decoding error
- *dst++ = QChar::ReplacementCharacter;
- }
- } while (src < nextAscii);
+ while (src < end) {
+ nextAscii = end;
+ if (simdDecodeAscii(dst, nextAscii, src, end))
+ break;
+
+ do {
+ uchar b = *src++;
+ int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
+ if (res < 0) {
+ // decoding error
+ *dst++ = QChar::ReplacementCharacter;
+ }
+ } while (src < nextAscii);
+ }
}
result.truncate(dst - reinterpret_cast<const ushort *>(result.constData()));