Import WebKit commit e89cea5f01479652674d3c24b2f387eb0987d0a1

Change-Id: Ifc00865ceeb5b83b7990f91af7dbbd1a05df1c30
Reviewed-by: Konstantin Tokarev <annulen@yandex.ru>
author: Konstantin Tokarev <annulen@yandex.ru> 2017-10-18 16:01:48 +0300
committer: Konstantin Tokarev <annulen@yandex.ru> 2017-10-20 09:53:13 +0000
commit: f5345badaeefac7c03c8a26dea6b89706871c3fe (patch)
tree: bc863fadf9ec8869690fa51f7e7e63ec2ad60294 /Source/WebCore/platform/text/TextCodecUTF8.cpp
parent: 778e0c7c940f8a22f9a43290aef378fb707ca088 (diff)
1 file changed, 81 insertions, 75 deletions
diff --git a/Source/WebCore/platform/text/TextCodecUTF8.cpp b/Source/WebCore/platform/text/TextCodecUTF8.cpp
index 6b32955c1..31f548662 100644
--- a/Source/WebCore/platform/text/TextCodecUTF8.cpp
+++ b/Source/WebCore/platform/text/TextCodecUTF8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
+ * Copyright (C) 2004-2017 Apple Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -31,18 +31,10 @@
 #include <wtf/text/StringBuffer.h>
 #include <wtf/unicode/CharacterNames.h>
 
-using namespace WTF;
-using namespace WTF::Unicode;
-
 namespace WebCore {
 
 const int nonCharacter = -1;
 
-std::unique_ptr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
-{
-    return std::make_unique<TextCodecUTF8>();
-}
-
 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
 {
     registrar("UTF-8", "UTF-8");
@@ -59,7 +51,9 @@ void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
 
 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
 {
-    registrar("UTF-8", create, 0);
+    registrar("UTF-8", [] (const TextEncoding&, const void*) -> std::unique_ptr<TextCodec> {
+        return std::make_unique<TextCodecUTF8>();
+    }, nullptr);
 }
 
 static inline int nonASCIISequenceLength(uint8_t firstByte)
@@ -73,11 +67,11 @@ static inline int nonASCIISequenceLength(uint8_t firstByte)
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
         4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@@ -85,55 +79,76 @@ static inline int nonASCIISequenceLength(uint8_t firstByte)
     return lengths[firstByte];
 }
 
-static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
+static inline int decodeNonASCIISequence(const uint8_t* sequence, int& length)
 {
     ASSERT(!isASCII(sequence[0]));
     if (length == 2) {
+        ASSERT(sequence[0] >= 0xC2);
         ASSERT(sequence[0] <= 0xDF);
-        if (sequence[0] < 0xC2)
-            return nonCharacter;
-        if (sequence[1] < 0x80 || sequence[1] > 0xBF)
+        if (sequence[1] < 0x80 || sequence[1] > 0xBF) {
+            length = 1;
             return nonCharacter;
+        }
         return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
     }
     if (length == 3) {
-        ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
+        ASSERT(sequence[0] >= 0xE0);
+        ASSERT(sequence[0] <= 0xEF);
         switch (sequence[0]) {
         case 0xE0:
-            if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
+            if (sequence[1] < 0xA0 || sequence[1] > 0xBF) {
+                length = 1;
                 return nonCharacter;
+            }
             break;
         case 0xED:
-            if (sequence[1] < 0x80 || sequence[1] > 0x9F)
+            if (sequence[1] < 0x80 || sequence[1] > 0x9F) {
+                length = 1;
                 return nonCharacter;
+            }
             break;
         default:
-            if (sequence[1] < 0x80 || sequence[1] > 0xBF)
+            if (sequence[1] < 0x80 || sequence[1] > 0xBF) {
+                length = 1;
                 return nonCharacter;
+            }
         }
-        if (sequence[2] < 0x80 || sequence[2] > 0xBF)
+        if (sequence[2] < 0x80 || sequence[2] > 0xBF) {
+            length = 2;
             return nonCharacter;
+        }
         return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
     }
     ASSERT(length == 4);
-    ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
+    ASSERT(sequence[0] >= 0xF0);
+    ASSERT(sequence[0] <= 0xF4);
     switch (sequence[0]) {
     case 0xF0:
-        if (sequence[1] < 0x90 || sequence[1] > 0xBF)
+        if (sequence[1] < 0x90 || sequence[1] > 0xBF) {
+            length = 1;
             return nonCharacter;
+        }
         break;
     case 0xF4:
-        if (sequence[1] < 0x80 || sequence[1] > 0x8F)
+        if (sequence[1] < 0x80 || sequence[1] > 0x8F) {
+            length = 1;
             return nonCharacter;
+        }
         break;
     default:
-        if (sequence[1] < 0x80 || sequence[1] > 0xBF)
+        if (sequence[1] < 0x80 || sequence[1] > 0xBF) {
+            length = 1;
             return nonCharacter;
+        }
     }
-    if (sequence[2] < 0x80 || sequence[2] > 0xBF)
+    if (sequence[2] < 0x80 || sequence[2] > 0xBF) {
+        length = 2;
         return nonCharacter;
-    if (sequence[3] < 0x80 || sequence[3] > 0xBF)
+    }
+    if (sequence[3] < 0x80 || sequence[3] > 0xBF) {
+        length = 3;
         return nonCharacter;
+    }
     return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
 }
 
@@ -156,18 +171,7 @@ void TextCodecUTF8::consumePartialSequenceByte()
     memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
 }
 
-void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)
-{
-    sawError = true;
-    if (stopOnError)
-        return;
-    // Each error generates a replacement character and consumes one byte.
-    *destination++ = replacementCharacter;
-    consumePartialSequenceByte();
-}
-
-template <>
-bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)
+bool TextCodecUTF8::handlePartialSequence(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush)
 {
     ASSERT(m_partialSequenceSize);
     do {
@@ -199,7 +203,7 @@ bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint
             m_partialSequenceSize = count;
         }
         int character = decodeNonASCIISequence(m_partialSequence, count);
-        if ((character == nonCharacter) || (character > 0xff))
+        if (character == nonCharacter || character > 0xFF)
             return true;
 
         m_partialSequenceSize -= count;
@@ -209,8 +213,7 @@ bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint
     return false;
 }
 
-template <>
-bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
+void TextCodecUTF8::handlePartialSequence(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
 {
     ASSERT(m_partialSequenceSize);
     do {
@@ -221,9 +224,11 @@ bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint
         }
         int count = nonASCIISequenceLength(m_partialSequence[0]);
         if (!count) {
-            handleError(destination, stopOnError, sawError);
+            sawError = true;
             if (stopOnError)
-                return false;
+                return;
+            *destination++ = replacementCharacter;
+            consumePartialSequenceByte();
             continue;
         }
         if (count > m_partialSequenceSize) {
@@ -233,12 +238,15 @@ bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint
                     // add it to the existing partial sequence.
                     memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
                     m_partialSequenceSize += end - source;
-                    return false;
+                    return;
                 }
                 // An incomplete partial sequence at the end is an error.
-                handleError(destination, stopOnError, sawError);
+                sawError = true;
                 if (stopOnError)
-                    return false;
+                    return;
+                *destination++ = replacementCharacter;
+                m_partialSequenceSize = 0;
+                source = end;
                 continue;
             }
             memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
@@ -247,17 +255,18 @@ bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint
         }
         int character = decodeNonASCIISequence(m_partialSequence, count);
         if (character == nonCharacter) {
-            handleError(destination, stopOnError, sawError);
+            sawError = true;
             if (stopOnError)
-                return false;
+                return;
+            *destination++ = replacementCharacter;
+            m_partialSequenceSize -= count;
+            memmove(m_partialSequence, m_partialSequence + count, m_partialSequenceSize);
             continue;
         }
 
         m_partialSequenceSize -= count;
         destination = appendCharacter(destination, character);
     } while (m_partialSequenceSize);
-
-    return false;
 }
     
 String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
@@ -269,7 +278,7 @@ String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool
 
     const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
     const uint8_t* end = source + length;
-    const uint8_t* alignedEnd = alignToMachineWord(end);
+    const uint8_t* alignedEnd = WTF::alignToMachineWord(end);
     LChar* destination = buffer.characters();
 
     do {
@@ -279,7 +288,7 @@ String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool
             // in some compilers.
             LChar* destinationForHandlePartialSequence = destination;
             const uint8_t* sourceForHandlePartialSequence = source;
-            if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {
+            if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush)) {
                 source = sourceForHandlePartialSequence;
                 goto upConvertTo16Bit;
             }
@@ -292,14 +301,14 @@ String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool
         while (source < end) {
             if (isASCII(*source)) {
                 // Fast path for ASCII. Most UTF-8 text will be ASCII.
-                if (isAlignedToMachineWord(source)) {
+                if (WTF::isAlignedToMachineWord(source)) {
                     while (source < alignedEnd) {
-                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
-                        if (!isAllASCII<LChar>(chunk))
+                        auto chunk = *reinterpret_cast_ptr<const WTF::MachineWord*>(source);
+                        if (!WTF::isAllASCII<LChar>(chunk))
                             break;
                         copyASCIIMachineWord(destination, source);
-                        source += sizeof(MachineWord);
-                        destination += sizeof(MachineWord);
+                        source += sizeof(WTF::MachineWord);
+                        destination += sizeof(WTF::MachineWord);
                     }
                     if (source == end)
                         break;
@@ -328,10 +337,10 @@ String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool
                 sawError = true;
                 if (stopOnError)
                     break;
-                
+
                 goto upConvertTo16Bit;
             }
-            if (character > 0xff)
+            if (character > 0xFF)
                 goto upConvertTo16Bit;
 
             source += count;
@@ -369,14 +378,14 @@ upConvertTo16Bit:
         while (source < end) {
             if (isASCII(*source)) {
                 // Fast path for ASCII. Most UTF-8 text will be ASCII.
-                if (isAlignedToMachineWord(source)) {
+                if (WTF::isAlignedToMachineWord(source)) {
                     while (source < alignedEnd) {
-                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
-                        if (!isAllASCII<LChar>(chunk))
+                        auto chunk = *reinterpret_cast_ptr<const WTF::MachineWord*>(source);
+                        if (!WTF::isAllASCII<LChar>(chunk))
                             break;
                         copyASCIIMachineWord(destination16, source);
-                        source += sizeof(MachineWord);
-                        destination16 += sizeof(MachineWord);
+                        source += sizeof(WTF::MachineWord);
+                        destination16 += sizeof(WTF::MachineWord);
                     }
                     if (source == end)
                         break;
@@ -405,9 +414,8 @@ upConvertTo16Bit:
                 sawError = true;
                 if (stopOnError)
                     break;
-                // Each error generates a replacement character and consumes one byte.
                 *destination16++ = replacementCharacter;
-                ++source;
+                source += count ? count : 1;
                 continue;
             }
             source += count;
@@ -427,17 +435,15 @@ CString TextCodecUTF8::encode(const UChar* characters, size_t length, Unencodabl
     // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
     if (length > std::numeric_limits<size_t>::max() / 3)
         CRASH();
-    Vector<uint8_t> bytes(length * 3);
 
-    size_t i = 0;
+    Vector<char, 3000> bytes(length * 3);
     size_t bytesWritten = 0;
-    while (i < length) {
+    for (size_t i = 0; i < length; ) {
         UChar32 character;
         U16_NEXT(characters, i, length, character);
         U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
     }
-
-    return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
+    return CString { bytes.data(), bytesWritten };
 }
 
 } // namespace WebCore
author	Konstantin Tokarev <annulen@yandex.ru>	2017-10-18 16:01:48 +0300
committer	Konstantin Tokarev <annulen@yandex.ru>	2017-10-20 09:53:13 +0000
commit	f5345badaeefac7c03c8a26dea6b89706871c3fe (patch)
tree	bc863fadf9ec8869690fa51f7e7e63ec2ad60294 /Source/WebCore/platform/text/TextCodecUTF8.cpp
parent	778e0c7c940f8a22f9a43290aef378fb707ca088 (diff)