summary | refs | log | tree | commit | diff | stats
path: root/Source/WebCore/platform/text/TextCodecUTF8.cpp
diff options
context:
space:
mode:
author Konstantin Tokarev <annulen@yandex.ru> 2017-10-18 16:01:48 +0300
committer Konstantin Tokarev <annulen@yandex.ru> 2017-10-20 09:53:13 +0000
commit f5345badaeefac7c03c8a26dea6b89706871c3fe (patch)
tree bc863fadf9ec8869690fa51f7e7e63ec2ad60294 /Source/WebCore/platform/text/TextCodecUTF8.cpp
parent 778e0c7c940f8a22f9a43290aef378fb707ca088 (diff)
Import WebKit commit e89cea5f01479652674d3c24b2f387eb0987d0a1
Change-Id: Ifc00865ceeb5b83b7990f91af7dbbd1a05df1c30
Reviewed-by: Konstantin Tokarev <annulen@yandex.ru>
Diffstat (limited to 'Source/WebCore/platform/text/TextCodecUTF8.cpp')
-rw-r--r-- Source/WebCore/platform/text/TextCodecUTF8.cpp 156
1 files changed, 81 insertions, 75 deletions
diff --git a/Source/WebCore/platform/text/TextCodecUTF8.cpp b/Source/WebCore/platform/text/TextCodecUTF8.cpp
index 6b32955c1..31f548662 100644
--- a/Source/WebCore/platform/text/TextCodecUTF8.cpp
+++ b/Source/WebCore/platform/text/TextCodecUTF8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
+ * Copyright (C) 2004-2017 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -31,18 +31,10 @@
#include <wtf/text/StringBuffer.h>
#include <wtf/unicode/CharacterNames.h>
-using namespace WTF;
-using namespace WTF::Unicode;
-
namespace WebCore {
const int nonCharacter = -1;
-std::unique_ptr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
-{
- return std::make_unique<TextCodecUTF8>();
-}
-
void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
{
registrar("UTF-8", "UTF-8");
@@ -59,7 +51,9 @@ void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
{
- registrar("UTF-8", create, 0);
+ registrar("UTF-8", [] (const TextEncoding&, const void*) -> std::unique_ptr<TextCodec> {
+ return std::make_unique<TextCodecUTF8>();
+ }, nullptr);
}
static inline int nonASCIISequenceLength(uint8_t firstByte)
@@ -73,11 +67,11 @@ static inline int nonASCIISequenceLength(uint8_t firstByte)
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@@ -85,55 +79,76 @@ static inline int nonASCIISequenceLength(uint8_t firstByte)
return lengths[firstByte];
}
-static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
+static inline int decodeNonASCIISequence(const uint8_t* sequence, int& length)
{
ASSERT(!isASCII(sequence[0]));
if (length == 2) {
+ ASSERT(sequence[0] >= 0xC2);
ASSERT(sequence[0] <= 0xDF);
- if (sequence[0] < 0xC2)
- return nonCharacter;
- if (sequence[1] < 0x80 || sequence[1] > 0xBF)
+ if (sequence[1] < 0x80 || sequence[1] > 0xBF) {
+ length = 1;
return nonCharacter;
+ }
return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
}
if (length == 3) {
- ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
+ ASSERT(sequence[0] >= 0xE0);
+ ASSERT(sequence[0] <= 0xEF);
switch (sequence[0]) {
case 0xE0:
- if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
+ if (sequence[1] < 0xA0 || sequence[1] > 0xBF) {
+ length = 1;
return nonCharacter;
+ }
break;
case 0xED:
- if (sequence[1] < 0x80 || sequence[1] > 0x9F)
+ if (sequence[1] < 0x80 || sequence[1] > 0x9F) {
+ length = 1;
return nonCharacter;
+ }
break;
default:
- if (sequence[1] < 0x80 || sequence[1] > 0xBF)
+ if (sequence[1] < 0x80 || sequence[1] > 0xBF) {
+ length = 1;
return nonCharacter;
+ }
}
- if (sequence[2] < 0x80 || sequence[2] > 0xBF)
+ if (sequence[2] < 0x80 || sequence[2] > 0xBF) {
+ length = 2;
return nonCharacter;
+ }
return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
}
ASSERT(length == 4);
- ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
+ ASSERT(sequence[0] >= 0xF0);
+ ASSERT(sequence[0] <= 0xF4);
switch (sequence[0]) {
case 0xF0:
- if (sequence[1] < 0x90 || sequence[1] > 0xBF)
+ if (sequence[1] < 0x90 || sequence[1] > 0xBF) {
+ length = 1;
return nonCharacter;
+ }
break;
case 0xF4:
- if (sequence[1] < 0x80 || sequence[1] > 0x8F)
+ if (sequence[1] < 0x80 || sequence[1] > 0x8F) {
+ length = 1;
return nonCharacter;
+ }
break;
default:
- if (sequence[1] < 0x80 || sequence[1] > 0xBF)
+ if (sequence[1] < 0x80 || sequence[1] > 0xBF) {
+ length = 1;
return nonCharacter;
+ }
}
- if (sequence[2] < 0x80 || sequence[2] > 0xBF)
+ if (sequence[2] < 0x80 || sequence[2] > 0xBF) {
+ length = 2;
return nonCharacter;
- if (sequence[3] < 0x80 || sequence[3] > 0xBF)
+ }
+ if (sequence[3] < 0x80 || sequence[3] > 0xBF) {
+ length = 3;
return nonCharacter;
+ }
return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
}
@@ -156,18 +171,7 @@ void TextCodecUTF8::consumePartialSequenceByte()
memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
}
-void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)
-{
- sawError = true;
- if (stopOnError)
- return;
- // Each error generates a replacement character and consumes one byte.
- *destination++ = replacementCharacter;
- consumePartialSequenceByte();
-}
-
-template <>
-bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)
+bool TextCodecUTF8::handlePartialSequence(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush)
{
ASSERT(m_partialSequenceSize);
do {
@@ -199,7 +203,7 @@ bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint
m_partialSequenceSize = count;
}
int character = decodeNonASCIISequence(m_partialSequence, count);
- if ((character == nonCharacter) || (character > 0xff))
+ if (character == nonCharacter || character > 0xFF)
return true;
m_partialSequenceSize -= count;
@@ -209,8 +213,7 @@ bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint
return false;
}
-template <>
-bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
+void TextCodecUTF8::handlePartialSequence(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
{
ASSERT(m_partialSequenceSize);
do {
@@ -221,9 +224,11 @@ bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint
}
int count = nonASCIISequenceLength(m_partialSequence[0]);
if (!count) {
- handleError(destination, stopOnError, sawError);
+ sawError = true;
if (stopOnError)
- return false;
+ return;
+ *destination++ = replacementCharacter;
+ consumePartialSequenceByte();
continue;
}
if (count > m_partialSequenceSize) {
@@ -233,12 +238,15 @@ bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint
// add it to the existing partial sequence.
memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
m_partialSequenceSize += end - source;
- return false;
+ return;
}
// An incomplete partial sequence at the end is an error.
- handleError(destination, stopOnError, sawError);
+ sawError = true;
if (stopOnError)
- return false;
+ return;
+ *destination++ = replacementCharacter;
+ m_partialSequenceSize = 0;
+ source = end;
continue;
}
memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
@@ -247,17 +255,18 @@ bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint
}
int character = decodeNonASCIISequence(m_partialSequence, count);
if (character == nonCharacter) {
- handleError(destination, stopOnError, sawError);
+ sawError = true;
if (stopOnError)
- return false;
+ return;
+ *destination++ = replacementCharacter;
+ m_partialSequenceSize -= count;
+ memmove(m_partialSequence, m_partialSequence + count, m_partialSequenceSize);
continue;
}
m_partialSequenceSize -= count;
destination = appendCharacter(destination, character);
} while (m_partialSequenceSize);
-
- return false;
}
String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
@@ -269,7 +278,7 @@ String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool
const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
const uint8_t* end = source + length;
- const uint8_t* alignedEnd = alignToMachineWord(end);
+ const uint8_t* alignedEnd = WTF::alignToMachineWord(end);
LChar* destination = buffer.characters();
do {
@@ -279,7 +288,7 @@ String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool
// in some compilers.
LChar* destinationForHandlePartialSequence = destination;
const uint8_t* sourceForHandlePartialSequence = source;
- if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {
+ if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush)) {
source = sourceForHandlePartialSequence;
goto upConvertTo16Bit;
}
@@ -292,14 +301,14 @@ String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool
while (source < end) {
if (isASCII(*source)) {
// Fast path for ASCII. Most UTF-8 text will be ASCII.
- if (isAlignedToMachineWord(source)) {
+ if (WTF::isAlignedToMachineWord(source)) {
while (source < alignedEnd) {
- MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
- if (!isAllASCII<LChar>(chunk))
+ auto chunk = *reinterpret_cast_ptr<const WTF::MachineWord*>(source);
+ if (!WTF::isAllASCII<LChar>(chunk))
break;
copyASCIIMachineWord(destination, source);
- source += sizeof(MachineWord);
- destination += sizeof(MachineWord);
+ source += sizeof(WTF::MachineWord);
+ destination += sizeof(WTF::MachineWord);
}
if (source == end)
break;
@@ -328,10 +337,10 @@ String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool
sawError = true;
if (stopOnError)
break;
-
+
goto upConvertTo16Bit;
}
- if (character > 0xff)
+ if (character > 0xFF)
goto upConvertTo16Bit;
source += count;
@@ -369,14 +378,14 @@ upConvertTo16Bit:
while (source < end) {
if (isASCII(*source)) {
// Fast path for ASCII. Most UTF-8 text will be ASCII.
- if (isAlignedToMachineWord(source)) {
+ if (WTF::isAlignedToMachineWord(source)) {
while (source < alignedEnd) {
- MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
- if (!isAllASCII<LChar>(chunk))
+ auto chunk = *reinterpret_cast_ptr<const WTF::MachineWord*>(source);
+ if (!WTF::isAllASCII<LChar>(chunk))
break;
copyASCIIMachineWord(destination16, source);
- source += sizeof(MachineWord);
- destination16 += sizeof(MachineWord);
+ source += sizeof(WTF::MachineWord);
+ destination16 += sizeof(WTF::MachineWord);
}
if (source == end)
break;
@@ -405,9 +414,8 @@ upConvertTo16Bit:
sawError = true;
if (stopOnError)
break;
- // Each error generates a replacement character and consumes one byte.
*destination16++ = replacementCharacter;
- ++source;
+ source += count ? count : 1;
continue;
}
source += count;
@@ -427,17 +435,15 @@ CString TextCodecUTF8::encode(const UChar* characters, size_t length, Unencodabl
// Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
if (length > std::numeric_limits<size_t>::max() / 3)
CRASH();
- Vector<uint8_t> bytes(length * 3);
- size_t i = 0;
+ Vector<char, 3000> bytes(length * 3);
size_t bytesWritten = 0;
- while (i < length) {
+ for (size_t i = 0; i < length; ) {
UChar32 character;
U16_NEXT(characters, i, length, character);
U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
}
-
- return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
+ return CString { bytes.data(), bytesWritten };
}
} // namespace WebCore