aboutsummaryrefslogtreecommitdiffstats
path: root/taglib/toolkit
diff options
context:
space:
mode:
authorTsuda Kageyu <tsuda.kageyu@gmail.com>2017-01-27 11:44:56 +0900
committerTsuda Kageyu <tsuda.kageyu@gmail.com>2017-01-27 14:47:55 +0900
commit0c45c63943ec70bcb8734299ca0b86f4f0131c6f (patch)
tree0aeac6ccec8690b0c637e8a4e016c59be053f7bf /taglib/toolkit
parent586c9bd962f0c3f9aa520d7f2f9bf0f7447a9cbf (diff)
Replace unicode.h/unicode.cpp by the UTF8-CPP library.
unicode.h/unicode.cpp are no longer maintained and incompatible with Debian's guideline. UTF8-CPP is maintained on GitHub and published under the Boost Software License which is compatible with either LGPL or MPL and should go along with Debian's guideline.
Diffstat (limited to 'taglib/toolkit')
-rw-r--r--taglib/toolkit/tstring.cpp100
-rw-r--r--taglib/toolkit/unicode.cpp303
-rw-r--r--taglib/toolkit/unicode.h149
-rw-r--r--taglib/toolkit/utf8/checked.h327
-rw-r--r--taglib/toolkit/utf8/core.h329
5 files changed, 673 insertions, 535 deletions
diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp
index 1639d7c5..b154cbae 100644
--- a/taglib/toolkit/tstring.cpp
+++ b/taglib/toolkit/tstring.cpp
@@ -31,16 +31,11 @@
#include <cstdio>
#include <cstring>
-#ifdef _WIN32
-# include <windows.h>
-#else
-# include "unicode.h"
-#endif
-
#include <tdebug.h>
#include <tstringlist.h>
#include <trefcounter.h>
#include <tutils.h>
+#include <utf8/checked.h>
#include "tstring.h"
@@ -48,72 +43,6 @@ namespace
{
using namespace TagLib;
- size_t UTF16toUTF8(const wchar_t *src, size_t srcLength, char *dst, size_t dstLength)
- {
- size_t len = 0;
-
-#ifdef _WIN32
-
- len = ::WideCharToMultiByte(
- CP_UTF8, 0, src, static_cast<int>(srcLength), dst, static_cast<int>(dstLength), NULL, NULL);
-
-#else
-
- using namespace Unicode;
-
- const UTF16 *srcBegin = src;
- const UTF16 *srcEnd = srcBegin + srcLength;
-
- UTF8 *dstBegin = reinterpret_cast<UTF8*>(dst);
- UTF8 *dstEnd = dstBegin + dstLength;
-
- ConversionResult result = ConvertUTF16toUTF8(
- &srcBegin, srcEnd, &dstBegin, dstEnd, lenientConversion);
-
- if(result == conversionOK)
- len = dstBegin - reinterpret_cast<UTF8*>(dst);
-
-#endif
-
- if(len == 0)
- debug("String::UTF16toUTF8() - Unicode conversion error.");
-
- return len;
- }
-
- size_t UTF8toUTF16(const char *src, size_t srcLength, wchar_t *dst, size_t dstLength)
- {
- size_t len = 0;
-
-#ifdef _WIN32
-
- len = ::MultiByteToWideChar(
- CP_UTF8, MB_ERR_INVALID_CHARS, src, static_cast<int>(srcLength), dst, static_cast<int>(dstLength));
-
-#else
-
- using namespace Unicode;
-
- const UTF8 *srcBegin = reinterpret_cast<const UTF8*>(src);
- const UTF8 *srcEnd = srcBegin + srcLength;
-
- UTF16 *dstBegin = dst;
- UTF16 *dstEnd = dstBegin + dstLength;
-
- ConversionResult result = ConvertUTF8toUTF16(
- &srcBegin, srcEnd, &dstBegin, dstEnd, lenientConversion);
-
- if(result == conversionOK)
- len = dstBegin - dst;
-
-#endif
-
- if(len == 0)
- debug("String::UTF8toUTF16() - Unicode conversion error.");
-
- return len;
- }
-
// Returns the native format of std::wstring.
String::Type wcharByteOrder()
{
@@ -139,9 +68,13 @@ namespace
{
data.resize(length);
- if(length > 0) {
- const size_t len = UTF8toUTF16(s, length, &data[0], data.size());
- data.resize(len);
+ try {
+ const std::wstring::iterator dstEnd = utf8::utf8to16(s, s + length, data.begin());
+ data.resize(dstEnd - data.begin());
+ }
+ catch(const utf8::exception &e) {
+ debug(String("String::copyFromUTF8() - UTF8-CPP error: ") + e.what());
+ data.clear();
}
}
@@ -517,19 +450,20 @@ ByteVector String::data(Type t) const
return v;
}
case UTF8:
- if(!d->data.empty())
{
- ByteVector v(size() * 4 + 1, 0);
+ ByteVector v(size() * 4, 0);
- const size_t len = UTF16toUTF8(
- d->data.c_str(), d->data.size(), v.data(), v.size());
- v.resize(static_cast<unsigned int>(len));
+ try {
+ const ByteVector::Iterator dstEnd = utf8::utf16to8(begin(), end(), v.begin());
+ v.resize(static_cast<unsigned int>(dstEnd - v.begin()));
+ }
+ catch(const utf8::exception &e) {
+ debug(String("String::data() - UTF8-CPP error: ") + e.what());
+ v.clear();
+ }
return v;
}
- else {
- return ByteVector();
- }
case UTF16:
{
ByteVector v(2 + size() * 2, 0);
diff --git a/taglib/toolkit/unicode.cpp b/taglib/toolkit/unicode.cpp
deleted file mode 100644
index 1b26977e..00000000
--- a/taglib/toolkit/unicode.cpp
+++ /dev/null
@@ -1,303 +0,0 @@
-/*******************************************************************************
- * *
- * THIS FILE IS INCLUDED IN TAGLIB, BUT IS NOT COPYRIGHTED BY THE TAGLIB *
- * AUTHORS, NOT PART OF THE TAGLIB API AND COULD GO AWAY AT ANY POINT IN TIME. *
- * AS SUCH IT SHOULD BE CONSIERED FOR INTERNAL USE ONLY. *
- * *
- *******************************************************************************/
-
-/*
- * Copyright 2001 Unicode, Inc.
- *
- * Disclaimer
- *
- * This source code is provided as is by Unicode, Inc. No claims are
- * made as to fitness for any particular purpose. No warranties of any
- * kind are expressed or implied. The recipient agrees to determine
- * applicability of information provided. If this file has been
- * purchased on magnetic or optical media from Unicode, Inc., the
- * sole remedy for any claim will be exchange of defective media
- * within 90 days of receipt.
- *
- * Limitations on Rights to Redistribute This Code
- *
- * Unicode, Inc. hereby grants the right to freely use the information
- * supplied in this file in the creation of products supporting the
- * Unicode Standard, and to make copies of this file in any form
- * for internal or external distribution as long as this notice
- * remains attached.
- */
-
-/*
- * This file has been modified by Scott Wheeler <wheeler@kde.org> to remove
- * the UTF32 conversion functions and to place the appropriate functions
- * in their own C++ namespace.
- */
-
-/* ---------------------------------------------------------------------
-
- Conversions between UTF32, UTF-16, and UTF-8. Source code file.
- Author: Mark E. Davis, 1994.
- Rev History: Rick McGowan, fixes & updates May 2001.
- Sept 2001: fixed const & error conditions per
- mods suggested by S. Parent & A. Lillich.
-
- See the header file "ConvertUTF.h" for complete documentation.
-
------------------------------------------------------------------------- */
-
-
-#include "unicode.h"
-#include <stdio.h>
-
-#define UNI_SUR_HIGH_START (UTF32)0xD800
-#define UNI_SUR_HIGH_END (UTF32)0xDBFF
-#define UNI_SUR_LOW_START (UTF32)0xDC00
-#define UNI_SUR_LOW_END (UTF32)0xDFFF
-#define false 0
-#define true 1
-
-namespace Unicode {
-
-static const int halfShift = 10; /* used for shifting by 10 bits */
-
-static const UTF32 halfBase = 0x0010000UL;
-static const UTF32 halfMask = 0x3FFUL;
-
-/*
- * Index into the table below with the first byte of a UTF-8 sequence to
- * get the number of trailing bytes that are supposed to follow it.
- */
-static const char trailingBytesForUTF8[256] = {
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
-};
-
-/*
- * Magic values subtracted from a buffer value during UTF8 conversion.
- * This table contains as many values as there might be trailing bytes
- * in a UTF-8 sequence.
- */
-static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
- 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
-
-/*
- * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
- * into the first byte, depending on how many bytes follow. There are
- * as many entries in this table as there are UTF-8 sequence types.
- * (I.e., one byte sequence, two byte... six byte sequence.)
- */
-static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
-
-/* --------------------------------------------------------------------- */
-
-/* The interface converts a whole buffer to avoid function-call overhead.
- * Constants have been gathered. Loops & conditionals have been removed as
- * much as possible for efficiency, in favor of drop-through switches.
- * (See "Note A" at the bottom of the file for equivalent code.)
- * If your compiler supports it, the "isLegalUTF8" call can be turned
- * into an inline function.
- */
-
-/* --------------------------------------------------------------------- */
-
-ConversionResult ConvertUTF16toUTF8 (
- const UTF16** sourceStart, const UTF16* sourceEnd,
- UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
- ConversionResult result = conversionOK;
- const UTF16* source = *sourceStart;
- UTF8* target = *targetStart;
- while (source < sourceEnd) {
- UTF32 ch;
- unsigned short bytesToWrite = 0;
- const UTF32 byteMask = 0xBF;
- const UTF32 byteMark = 0x80;
- const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
- ch = *source++;
- /* If we have a surrogate pair, convert to UTF32 first. */
- if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) {
- UTF32 ch2 = *source;
- if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
- ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
- + (ch2 - UNI_SUR_LOW_START) + halfBase;
- ++source;
- } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
- --source; /* return to the illegal value itself */
- result = sourceIllegal;
- break;
- }
- } else if ((flags == strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)) {
- --source; /* return to the illegal value itself */
- result = sourceIllegal;
- break;
- }
- /* Figure out how many bytes the result will require */
- if (ch < (UTF32)0x80) { bytesToWrite = 1;
- } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
- } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
- } else if (ch < (UTF32)0x200000) { bytesToWrite = 4;
- } else { bytesToWrite = 2;
- ch = UNI_REPLACEMENT_CHAR;
- }
- // printf("bytes to write = %i\n", bytesToWrite);
- target += bytesToWrite;
- if (target > targetEnd) {
- source = oldSource; /* Back up source pointer! */
- target -= bytesToWrite; result = targetExhausted; break;
- }
- switch (bytesToWrite) { /* note: everything falls through. */
- case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6;
- case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6;
- case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6;
- case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
- }
- target += bytesToWrite;
- }
- *sourceStart = source;
- *targetStart = target;
- return result;
-}
-
-/* --------------------------------------------------------------------- */
-
-/*
- * Utility routine to tell whether a sequence of bytes is legal UTF-8.
- * This must be called with the length pre-determined by the first byte.
- * If not calling this from ConvertUTF8to*, then the length can be set by:
- * length = trailingBytesForUTF8[*source]+1;
- * and the sequence is illegal right away if there aren't that many bytes
- * available.
- * If presented with a length > 4, this returns false. The Unicode
- * definition of UTF-8 goes up to 4-byte sequences.
- */
-
-static Boolean isLegalUTF8(const UTF8 *source, int length) {
- UTF8 a;
- const UTF8 *srcptr = source+length;
- switch (length) {
- default: return false;
- /* Everything else falls through when "true"... */
- case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
- case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
- case 2: if ((a = (*--srcptr)) > 0xBF) return false;
- switch (*source) {
- /* no fall-through in this inner switch */
- case 0xE0: if (a < 0xA0) return false; break;
- case 0xF0: if (a < 0x90) return false; break;
- case 0xF4: if (a > 0x8F) return false; break;
- default: if (a < 0x80) return false;
- }
- case 1: if (*source >= 0x80 && *source < 0xC2) return false;
- if (*source > 0xF4) return false;
- }
- return true;
-}
-
-/* --------------------------------------------------------------------- */
-
-/*
- * Exported function to return whether a UTF-8 sequence is legal or not.
- * This is not used here; it's just exported.
- */
-Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
- int length = trailingBytesForUTF8[*source]+1;
- if (source+length > sourceEnd) {
- return false;
- }
- return isLegalUTF8(source, length);
-}
-
-/* --------------------------------------------------------------------- */
-
-ConversionResult ConvertUTF8toUTF16 (
- const UTF8** sourceStart, const UTF8* sourceEnd,
- UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
- ConversionResult result = conversionOK;
- const UTF8* source = *sourceStart;
- UTF16* target = *targetStart;
- while (source < sourceEnd) {
- UTF32 ch = 0;
- unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
- if (source + extraBytesToRead >= sourceEnd) {
- result = sourceExhausted; break;
- }
- /* Do this check whether lenient or strict */
- if (! isLegalUTF8(source, extraBytesToRead+1)) {
- result = sourceIllegal;
- break;
- }
- /*
- * The cases all fall through. See "Note A" below.
- */
- switch (extraBytesToRead) {
- case 3: ch += *source++; ch <<= 6;
- case 2: ch += *source++; ch <<= 6;
- case 1: ch += *source++; ch <<= 6;
- case 0: ch += *source++;
- }
- ch -= offsetsFromUTF8[extraBytesToRead];
-
- if (target >= targetEnd) {
- source -= (extraBytesToRead+1); /* Back up source pointer! */
- result = targetExhausted; break;
- }
- if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
- if ((flags == strictConversion) && (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) {
- source -= (extraBytesToRead+1); /* return to the illegal value itself */
- result = sourceIllegal;
- break;
- } else {
- *target++ = (UTF16)ch; /* normal case */
- }
- } else if (ch > UNI_MAX_UTF16) {
- if (flags == strictConversion) {
- result = sourceIllegal;
- source -= (extraBytesToRead+1); /* return to the start */
- break; /* Bail out; shouldn't continue */
- } else {
- *target++ = UNI_REPLACEMENT_CHAR;
- }
- } else {
- /* target is a character in range 0xFFFF - 0x10FFFF. */
- if (target + 1 >= targetEnd) {
- source -= (extraBytesToRead+1); /* Back up source pointer! */
- result = targetExhausted; break;
- }
- ch -= halfBase;
- *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
- *target++ = (ch & halfMask) + UNI_SUR_LOW_START;
- }
- }
- *sourceStart = source;
- *targetStart = target;
- return result;
-}
-
-}
-
-/* ---------------------------------------------------------------------
-
- Note A.
- The fall-through switches in UTF-8 reading code save a
- temp variable, some decrements & conditionals. The switches
- are equivalent to the following loop:
- {
- int tmpBytesToRead = extraBytesToRead+1;
- do {
- ch += *source++;
- --tmpBytesToRead;
- if (tmpBytesToRead) ch <<= 6;
- } while (tmpBytesToRead > 0);
- }
- In UTF-8 writing code, the switches on "bytesToWrite" are
- similarly unrolled loops.
-
- --------------------------------------------------------------------- */
-
-
diff --git a/taglib/toolkit/unicode.h b/taglib/toolkit/unicode.h
deleted file mode 100644
index d3a869f0..00000000
--- a/taglib/toolkit/unicode.h
+++ /dev/null
@@ -1,149 +0,0 @@
-#ifndef TAGLIB_UNICODE_H
-#define TAGLIB_UNICODE_H
-
-/*******************************************************************************
- * *
- * THIS FILE IS INCLUDED IN TAGLIB, BUT IS NOT COPYRIGHTED BY THE TAGLIB *
- * AUTHORS, NOT PART OF THE TAGLIB API AND COULD GO AWAY AT ANY POINT IN TIME. *
- * AS SUCH IT SHOULD BE CONSIERED FOR INTERNAL USE ONLY. *
- * *
- *******************************************************************************/
-
-#ifndef DO_NOT_DOCUMENT // tell Doxygen not to document this header
-
-/*
- * Copyright 2001 Unicode, Inc.
- *
- * Disclaimer
- *
- * This source code is provided as is by Unicode, Inc. No claims are
- * made as to fitness for any particular purpose. No warranties of any
- * kind are expressed or implied. The recipient agrees to determine
- * applicability of information provided. If this file has been
- * purchased on magnetic or optical media from Unicode, Inc., the
- * sole remedy for any claim will be exchange of defective media
- * within 90 days of receipt.
- *
- * Limitations on Rights to Redistribute This Code
- *
- * Unicode, Inc. hereby grants the right to freely use the information
- * supplied in this file in the creation of products supporting the
- * Unicode Standard, and to make copies of this file in any form
- * for internal or external distribution as long as this notice
- * remains attached.
- */
-
-/*
- * This file has been modified by Scott Wheeler <wheeler@kde.org> to remove
- * the UTF32 conversion functions and to place the appropriate functions
- * in their own C++ namespace.
- */
-
-/* ---------------------------------------------------------------------
-
- Conversions between UTF32, UTF-16, and UTF-8. Header file.
-
- Several functions are included here, forming a complete set of
- conversions between the three formats. UTF-7 is not included
- here, but is handled in a separate source file.
-
- Each of these routines takes pointers to input buffers and output
- buffers. The input buffers are const.
-
- Each routine converts the text between *sourceStart and sourceEnd,
- putting the result into the buffer between *targetStart and
- targetEnd. Note: the end pointers are *after* the last item: e.g.
- *(sourceEnd - 1) is the last item.
-
- The return result indicates whether the conversion was successful,
- and if not, whether the problem was in the source or target buffers.
- (Only the first encountered problem is indicated.)
-
- After the conversion, *sourceStart and *targetStart are both
- updated to point to the end of last text successfully converted in
- the respective buffers.
-
- Input parameters:
- sourceStart - pointer to a pointer to the source buffer.
- The contents of this are modified on return so that
- it points at the next thing to be converted.
- targetStart - similarly, pointer to pointer to the target buffer.
- sourceEnd, targetEnd - respectively pointers to the ends of the
- two buffers, for overflow checking only.
-
- These conversion functions take a ConversionFlags argument. When this
- flag is set to strict, both irregular sequences and isolated surrogates
- will cause an error. When the flag is set to lenient, both irregular
- sequences and isolated surrogates are converted.
-
- Whether the flag is strict or lenient, all illegal sequences will cause
- an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
- or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
- must check for illegal sequences.
-
- When the flag is set to lenient, characters over 0x10FFFF are converted
- to the replacement character; otherwise (when the flag is set to strict)
- they constitute an error.
-
- Output parameters:
- The value "sourceIllegal" is returned from some routines if the input
- sequence is malformed. When "sourceIllegal" is returned, the source
- value will point to the illegal value that caused the problem. E.g.,
- in UTF-8 when a sequence is malformed, it points to the start of the
- malformed sequence.
-
- Author: Mark E. Davis, 1994.
- Rev History: Rick McGowan, fixes & updates May 2001.
- Fixes & updates, Sept 2001.
-
------------------------------------------------------------------------- */
-
-/* ---------------------------------------------------------------------
- The following 4 definitions are compiler-specific.
- The C standard does not guarantee that wchar_t has at least
- 16 bits, so wchar_t is no less portable than unsigned short!
- All should be unsigned values to avoid sign extension during
- bit mask & shift operations.
------------------------------------------------------------------------- */
-
-/* Some fundamental constants */
-#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
-#define UNI_MAX_BMP (UTF32)0x0000FFFF
-#define UNI_MAX_UTF16 (UTF32)0x0010FFFF
-#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
-
-namespace Unicode {
-
-typedef unsigned long UTF32; /* at least 32 bits */
-typedef wchar_t UTF16; /* TagLib assumes that wchar_t is sufficient for UTF-16. */
-typedef unsigned char UTF8; /* typically 8 bits */
-typedef unsigned char Boolean; /* 0 or 1 */
-
-typedef enum {
- conversionOK = 0, /* conversion successful */
- sourceExhausted = 1, /* partial character in source, but hit end */
- targetExhausted = 2, /* insuff. room in target for conversion */
- sourceIllegal = 3 /* source sequence is illegal/malformed */
-} ConversionResult;
-
-typedef enum {
- strictConversion = 0,
- lenientConversion
-} ConversionFlags;
-
-ConversionResult ConvertUTF8toUTF16 (
- const UTF8** sourceStart, const UTF8* sourceEnd,
- UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
-
-ConversionResult ConvertUTF16toUTF8 (
- const UTF16** sourceStart, const UTF16* sourceEnd,
- UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
-
-Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
-
-} // namespace Unicode
-
-/* --------------------------------------------------------------------- */
-
-#endif
-#endif
diff --git a/taglib/toolkit/utf8/checked.h b/taglib/toolkit/utf8/checked.h
new file mode 100644
index 00000000..13311551
--- /dev/null
+++ b/taglib/toolkit/utf8/checked.h
@@ -0,0 +1,327 @@
+// Copyright 2006 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include "core.h"
+#include <stdexcept>
+
+namespace utf8
+{
+ // Base for the exceptions that may be thrown from the library
+ class exception : public ::std::exception {
+ };
+
+ // Exceptions that may be thrown from the library functions.
+ class invalid_code_point : public exception {
+ uint32_t cp;
+ public:
+ invalid_code_point(uint32_t cp) : cp(cp) {}
+ virtual const char* what() const throw() { return "Invalid code point"; }
+ uint32_t code_point() const {return cp;}
+ };
+
+ class invalid_utf8 : public exception {
+ uint8_t u8;
+ public:
+ invalid_utf8 (uint8_t u) : u8(u) {}
+ virtual const char* what() const throw() { return "Invalid UTF-8"; }
+ uint8_t utf8_octet() const {return u8;}
+ };
+
+ class invalid_utf16 : public exception {
+ uint16_t u16;
+ public:
+ invalid_utf16 (uint16_t u) : u16(u) {}
+ virtual const char* what() const throw() { return "Invalid UTF-16"; }
+ uint16_t utf16_word() const {return u16;}
+ };
+
+ class not_enough_room : public exception {
+ public:
+ virtual const char* what() const throw() { return "Not enough space"; }
+ };
+
+ /// The library API - functions intended to be called by the users
+
+ template <typename octet_iterator>
+ octet_iterator append(uint32_t cp, octet_iterator result)
+ {
+ if (!utf8::internal::is_code_point_valid(cp))
+ throw invalid_code_point(cp);
+
+ if (cp < 0x80) // one octet
+ *(result++) = static_cast<uint8_t>(cp);
+ else if (cp < 0x800) { // two octets
+ *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
+ *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
+ }
+ else if (cp < 0x10000) { // three octets
+ *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
+ *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
+ *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
+ }
+ else { // four octets
+ *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
+ *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
+ *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
+ *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
+ }
+ return result;
+ }
+
+ template <typename octet_iterator, typename output_iterator>
+ output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
+ {
+ while (start != end) {
+ octet_iterator sequence_start = start;
+ internal::utf_error err_code = utf8::internal::validate_next(start, end);
+ switch (err_code) {
+ case internal::UTF8_OK :
+ for (octet_iterator it = sequence_start; it != start; ++it)
+ *out++ = *it;
+ break;
+ case internal::NOT_ENOUGH_ROOM:
+ throw not_enough_room();
+ case internal::INVALID_LEAD:
+ out = utf8::append (replacement, out);
+ ++start;
+ break;
+ case internal::INCOMPLETE_SEQUENCE:
+ case internal::OVERLONG_SEQUENCE:
+ case internal::INVALID_CODE_POINT:
+ out = utf8::append (replacement, out);
+ ++start;
+ // just one replacement mark for the sequence
+ while (start != end && utf8::internal::is_trail(*start))
+ ++start;
+ break;
+ }
+ }
+ return out;
+ }
+
+ template <typename octet_iterator, typename output_iterator>
+ inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
+ {
+ static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd);
+ return utf8::replace_invalid(start, end, out, replacement_marker);
+ }
+
+ template <typename octet_iterator>
+ uint32_t next(octet_iterator& it, octet_iterator end)
+ {
+ uint32_t cp = 0;
+ internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
+ switch (err_code) {
+ case internal::UTF8_OK :
+ break;
+ case internal::NOT_ENOUGH_ROOM :
+ throw not_enough_room();
+ case internal::INVALID_LEAD :
+ case internal::INCOMPLETE_SEQUENCE :
+ case internal::OVERLONG_SEQUENCE :
+ throw invalid_utf8(*it);
+ case internal::INVALID_CODE_POINT :
+ throw invalid_code_point(cp);
+ }
+ return cp;
+ }
+
+ template <typename octet_iterator>
+ uint32_t peek_next(octet_iterator it, octet_iterator end)
+ {
+ return utf8::next(it, end);
+ }
+
+ template <typename octet_iterator>
+ uint32_t prior(octet_iterator& it, octet_iterator start)
+ {
+ // can't do much if it == start
+ if (it == start)
+ throw not_enough_room();
+
+ octet_iterator end = it;
+ // Go back until we hit either a lead octet or start
+ while (utf8::internal::is_trail(*(--it)))
+ if (it == start)
+ throw invalid_utf8(*it); // error - no lead byte in the sequence
+ return utf8::peek_next(it, end);
+ }
+
+ /// Deprecated in versions that include "prior"
+ template <typename octet_iterator>
+ uint32_t previous(octet_iterator& it, octet_iterator pass_start)
+ {
+ octet_iterator end = it;
+ while (utf8::internal::is_trail(*(--it)))
+ if (it == pass_start)
+ throw invalid_utf8(*it); // error - no lead byte in the sequence
+ octet_iterator temp = it;
+ return utf8::next(temp, end);
+ }
+
+ template <typename octet_iterator, typename distance_type>
+ void advance (octet_iterator& it, distance_type n, octet_iterator end)
+ {
+ for (distance_type i = 0; i < n; ++i)
+ utf8::next(it, end);
+ }
+
+ template <typename octet_iterator>
+ typename std::iterator_traits<octet_iterator>::difference_type
+ distance (octet_iterator first, octet_iterator last)
+ {
+ typename std::iterator_traits<octet_iterator>::difference_type dist;
+ for (dist = 0; first < last; ++dist)
+ utf8::next(first, last);
+ return dist;
+ }
+
+ template <typename u16bit_iterator, typename octet_iterator>
+ octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
+ {
+ while (start != end) {
+ uint32_t cp = utf8::internal::mask16(*start++);
+ // Take care of surrogate pairs first
+ if (utf8::internal::is_lead_surrogate(cp)) {
+ if (start != end) {
+ uint32_t trail_surrogate = utf8::internal::mask16(*start++);
+ if (utf8::internal::is_trail_surrogate(trail_surrogate))
+ cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
+ else
+ throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
+ }
+ else
+ throw invalid_utf16(static_cast<uint16_t>(cp));
+
+ }
+ // Lone trail surrogate
+ else if (utf8::internal::is_trail_surrogate(cp))
+ throw invalid_utf16(static_cast<uint16_t>(cp));
+
+ result = utf8::append(cp, result);
+ }
+ return result;
+ }
+
+ template <typename u16bit_iterator, typename octet_iterator>
+ u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
+ {
+ while (start != end) {
+ uint32_t cp = utf8::next(start, end);
+ if (cp > 0xffff) { //make a surrogate pair
+ *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
+ *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
+ }
+ else
+ *result++ = static_cast<uint16_t>(cp);
+ }
+ return result;
+ }
+
+ template <typename octet_iterator, typename u32bit_iterator>
+ octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
+ {
+ while (start != end)
+ result = utf8::append(*(start++), result);
+
+ return result;
+ }
+
+ template <typename octet_iterator, typename u32bit_iterator>
+ u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
+ {
+ while (start != end)
+ (*result++) = utf8::next(start, end);
+
+ return result;
+ }
+
+ // The iterator class
+ template <typename octet_iterator>
+ class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
+ octet_iterator it;
+ octet_iterator range_start;
+ octet_iterator range_end;
+ public:
+ iterator () {}
+ explicit iterator (const octet_iterator& octet_it,
+ const octet_iterator& range_start,
+ const octet_iterator& range_end) :
+ it(octet_it), range_start(range_start), range_end(range_end)
+ {
+ if (it < range_start || it > range_end)
+ throw std::out_of_range("Invalid utf-8 iterator position");
+ }
+ // the default "big three" are OK
+ octet_iterator base () const { return it; }
+ uint32_t operator * () const
+ {
+ octet_iterator temp = it;
+ return utf8::next(temp, range_end);
+ }
+ bool operator == (const iterator& rhs) const
+ {
+ if (range_start != rhs.range_start || range_end != rhs.range_end)
+ throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
+ return (it == rhs.it);
+ }
+ bool operator != (const iterator& rhs) const
+ {
+ return !(operator == (rhs));
+ }
+ iterator& operator ++ ()
+ {
+ utf8::next(it, range_end);
+ return *this;
+ }
+ iterator operator ++ (int)
+ {
+ iterator temp = *this;
+ utf8::next(it, range_end);
+ return temp;
+ }
+ iterator& operator -- ()
+ {
+ utf8::prior(it, range_start);
+ return *this;
+ }
+ iterator operator -- (int)
+ {
+ iterator temp = *this;
+ utf8::prior(it, range_start);
+ return temp;
+ }
+ }; // class iterator
+
+} // namespace utf8
+
+#endif //header guard
+
+
diff --git a/taglib/toolkit/utf8/core.h b/taglib/toolkit/utf8/core.h
new file mode 100644
index 00000000..693d388c
--- /dev/null
+++ b/taglib/toolkit/utf8/core.h
@@ -0,0 +1,329 @@
+// Copyright 2006 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include <iterator>
+
+namespace utf8
+{
+ // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
+ // You may need to change them to match your system.
+ // These typedefs have the same names as ones from cstdint, or boost/cstdint
+ typedef unsigned char uint8_t;
+ typedef unsigned short uint16_t;
+ typedef unsigned int uint32_t;
+
+// Helper code - not intended to be directly called by the library users. May be changed at any time
+namespace internal
+{
+ // Unicode constants
+ // Leading (high) surrogates: 0xd800 - 0xdbff
+ // Trailing (low) surrogates: 0xdc00 - 0xdfff
+ const uint16_t LEAD_SURROGATE_MIN = 0xd800u;
+ const uint16_t LEAD_SURROGATE_MAX = 0xdbffu;
+ const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
+ const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
+ const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10);
+ const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;
+
+ // Maximum valid value for a Unicode code point
+ const uint32_t CODE_POINT_MAX = 0x0010ffffu;
+
+ template<typename octet_type>
+ inline uint8_t mask8(octet_type oc)
+ {
+ return static_cast<uint8_t>(0xff & oc);
+ }
+ template<typename u16_type>
+ inline uint16_t mask16(u16_type oc)
+ {
+ return static_cast<uint16_t>(0xffff & oc);
+ }
+ template<typename octet_type>
+ inline bool is_trail(octet_type oc)
+ {
+ return ((utf8::internal::mask8(oc) >> 6) == 0x2);
+ }
+
+ template <typename u16>
+ inline bool is_lead_surrogate(u16 cp)
+ {
+ return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
+ }
+
+ template <typename u16>
+ inline bool is_trail_surrogate(u16 cp)
+ {
+ return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
+ }
+
+ template <typename u16>
+ inline bool is_surrogate(u16 cp)
+ {
+ return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
+ }
+
+ template <typename u32>
+ inline bool is_code_point_valid(u32 cp)
+ {
+ return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
+ }
+
+ template <typename octet_iterator>
+ inline typename std::iterator_traits<octet_iterator>::difference_type
+ sequence_length(octet_iterator lead_it)
+ {
+ uint8_t lead = utf8::internal::mask8(*lead_it);
+ if (lead < 0x80)
+ return 1;
+ else if ((lead >> 5) == 0x6)
+ return 2;
+ else if ((lead >> 4) == 0xe)
+ return 3;
+ else if ((lead >> 3) == 0x1e)
+ return 4;
+ else
+ return 0;
+ }
+
+ template <typename octet_difference_type>
+ inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
+ {
+ if (cp < 0x80) {
+ if (length != 1)
+ return true;
+ }
+ else if (cp < 0x800) {
+ if (length != 2)
+ return true;
+ }
+ else if (cp < 0x10000) {
+ if (length != 3)
+ return true;
+ }
+
+ return false;
+ }
+
+ enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
+
+ /// Helper for get_sequence_x
+ template <typename octet_iterator>
+ utf_error increase_safely(octet_iterator& it, octet_iterator end)
+ {
+ if (++it == end)
+ return NOT_ENOUGH_ROOM;
+
+ if (!utf8::internal::is_trail(*it))
+ return INCOMPLETE_SEQUENCE;
+
+ return UTF8_OK;
+ }
+
+ #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}
+
+ /// get_sequence_x functions decode utf-8 sequences of the length x
+ template <typename octet_iterator>
+ utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+ {
+ if (it == end)
+ return NOT_ENOUGH_ROOM;
+
+ code_point = utf8::internal::mask8(*it);
+
+ return UTF8_OK;
+ }
+
+ template <typename octet_iterator>
+ utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+ {
+ if (it == end)
+ return NOT_ENOUGH_ROOM;
+
+ code_point = utf8::internal::mask8(*it);
+
+ UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+ code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
+
+ return UTF8_OK;
+ }
+
+ template <typename octet_iterator>
+ utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+ {
+ if (it == end)
+ return NOT_ENOUGH_ROOM;
+
+ code_point = utf8::internal::mask8(*it);
+
+ UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+ code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
+
+ UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+ code_point += (*it) & 0x3f;
+
+ return UTF8_OK;
+ }
+
+ template <typename octet_iterator>
+ utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+ {
+ if (it == end)
+ return NOT_ENOUGH_ROOM;
+
+ code_point = utf8::internal::mask8(*it);
+
+ UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+ code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
+
+ UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+ code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
+
+ UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+ code_point += (*it) & 0x3f;
+
+ return UTF8_OK;
+ }
+
+ #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
+
+ template <typename octet_iterator>
+ utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+ {
+ // Save the original value of it so we can go back in case of failure
+ // Of course, it does not make much sense with i.e. stream iterators
+ octet_iterator original_it = it;
+
+ uint32_t cp = 0;
+ // Determine the sequence length based on the lead octet
+ typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
+ const octet_difference_type length = utf8::internal::sequence_length(it);
+
+ // Get trail octets and calculate the code point
+ utf_error err = UTF8_OK;
+ switch (length) {
+ case 0:
+ return INVALID_LEAD;
+ case 1:
+ err = utf8::internal::get_sequence_1(it, end, cp);
+ break;
+ case 2:
+ err = utf8::internal::get_sequence_2(it, end, cp);
+ break;
+ case 3:
+ err = utf8::internal::get_sequence_3(it, end, cp);
+ break;
+ case 4:
+ err = utf8::internal::get_sequence_4(it, end, cp);
+ break;
+ }
+
+ if (err == UTF8_OK) {
+ // Decoding succeeded. Now, security checks...
+ if (utf8::internal::is_code_point_valid(cp)) {
+ if (!utf8::internal::is_overlong_sequence(cp, length)){
+ // Passed! Return here.
+ code_point = cp;
+ ++it;
+ return UTF8_OK;
+ }
+ else
+ err = OVERLONG_SEQUENCE;
+ }
+ else
+ err = INVALID_CODE_POINT;
+ }
+
+ // Failure branch - restore the original value of the iterator
+ it = original_it;
+ return err;
+ }
+
+ template <typename octet_iterator>
+ inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
+ uint32_t ignored;
+ return utf8::internal::validate_next(it, end, ignored);
+ }
+
+} // namespace internal
+
+ /// The library API - functions intended to be called by the users
+
+ // Byte order mark
+ const uint8_t bom[] = {0xef, 0xbb, 0xbf};
+
+ template <typename octet_iterator>
+ octet_iterator find_invalid(octet_iterator start, octet_iterator end)
+ {
+ octet_iterator result = start;
+ while (result != end) {
+ utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
+ if (err_code != internal::UTF8_OK)
+ return result;
+ }
+ return result;
+ }
+
+ template <typename octet_iterator>
+ inline bool is_valid(octet_iterator start, octet_iterator end)
+ {
+ return (utf8::find_invalid(start, end) == end);
+ }
+
+ template <typename octet_iterator>
+ inline bool starts_with_bom (octet_iterator it, octet_iterator end)
+ {
+ return (
+ ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
+ ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
+ ((it != end) && (utf8::internal::mask8(*it)) == bom[2])
+ );
+ }
+
+ //Deprecated in release 2.3
+ template <typename octet_iterator>
+ inline bool is_bom (octet_iterator it)
+ {
+ return (
+ (utf8::internal::mask8(*it++)) == bom[0] &&
+ (utf8::internal::mask8(*it++)) == bom[1] &&
+ (utf8::internal::mask8(*it)) == bom[2]
+ );
+ }
+} // namespace utf8
+
+#endif // header guard
+
+