#include #include #include #include "emitterutils.h" #include "exp.h" #include "indentation.h" #include "regex_yaml.h" #include "regeximpl.h" #include "stringsource.h" #include "yaml-cpp/binary.h" // IWYU pragma: keep #include "yaml-cpp/null.h" #include "yaml-cpp/ostream_wrapper.h" namespace YAML { namespace Utils { namespace { enum { REPLACEMENT_CHARACTER = 0xFFFD }; bool IsAnchorChar(int ch) { // test for ns-anchor-char switch (ch) { case ',': case '[': case ']': case '{': case '}': // c-flow-indicator case ' ': case '\t': // s-white case 0xFEFF: // c-byte-order-mark case 0xA: case 0xD: // b-char return false; case 0x85: return true; } if (ch < 0x20) { return false; } if (ch < 0x7E) { return true; } if (ch < 0xA0) { return false; } if (ch >= 0xD800 && ch <= 0xDFFF) { return false; } if ((ch & 0xFFFE) == 0xFFFE) { return false; } if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) { return false; } if (ch > 0x10FFFF) { return false; } return true; } int Utf8BytesIndicated(char ch) { int byteVal = static_cast(ch); switch (byteVal >> 4) { case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: return 1; case 12: case 13: return 2; case 14: return 3; case 15: return 4; default: return -1; } } bool IsTrailingByte(char ch) { return (ch & 0xC0) == 0x80; } bool GetNextCodePointAndAdvance(int& codePoint, std::string::const_iterator& first, std::string::const_iterator last) { if (first == last) return false; int nBytes = Utf8BytesIndicated(*first); if (nBytes < 1) { // Bad lead byte ++first; codePoint = REPLACEMENT_CHARACTER; return true; } if (nBytes == 1) { codePoint = *first++; return true; } // Gather bits from trailing bytes codePoint = static_cast(*first) & ~(0xFF << (7 - nBytes)); ++first; --nBytes; for (; nBytes > 0; ++first, --nBytes) { if ((first == last) || !IsTrailingByte(*first)) { codePoint = REPLACEMENT_CHARACTER; break; } codePoint <<= 6; codePoint |= *first & 0x3F; } // Check for illegal code points if (codePoint > 0x10FFFF) codePoint = REPLACEMENT_CHARACTER; else if (codePoint >= 0xD800 && codePoint <= 0xDFFF) codePoint = REPLACEMENT_CHARACTER; else if ((codePoint & 0xFFFE) == 0xFFFE) codePoint = REPLACEMENT_CHARACTER; else if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF) codePoint = REPLACEMENT_CHARACTER; return true; } void WriteCodePoint(ostream_wrapper& out, int codePoint) { if (codePoint < 0 || codePoint > 0x10FFFF) { codePoint = REPLACEMENT_CHARACTER; } if (codePoint <= 0x7F) { out << static_cast(codePoint); } else if (codePoint <= 0x7FF) { out << static_cast(0xC0 | (codePoint >> 6)) << static_cast(0x80 | (codePoint & 0x3F)); } else if (codePoint <= 0xFFFF) { out << static_cast(0xE0 | (codePoint >> 12)) << static_cast(0x80 | ((codePoint >> 6) & 0x3F)) << static_cast(0x80 | (codePoint & 0x3F)); } else { out << static_cast(0xF0 | (codePoint >> 18)) << static_cast(0x80 | ((codePoint >> 12) & 0x3F)) << static_cast(0x80 | ((codePoint >> 6) & 0x3F)) << static_cast(0x80 | (codePoint & 0x3F)); } } bool IsValidPlainScalar(const std::string& str, FlowType::value flowType, bool allowOnlyAscii) { // check against null if (IsNullString(str)) { return false; } // check the start const RegEx& start = (flowType == FlowType::Flow ? Exp::PlainScalarInFlow() : Exp::PlainScalar()); if (!start.Matches(str)) { return false; } // and check the end for plain whitespace (which can't be faithfully kept in a // plain scalar) if (!str.empty() && *str.rbegin() == ' ') { return false; } // then check until something is disallowed static const RegEx& disallowed_flow = Exp::EndScalarInFlow() | (Exp::BlankOrBreak() + Exp::Comment()) | Exp::NotPrintable() | Exp::Utf8_ByteOrderMark() | Exp::Break() | Exp::Tab() | Exp::Ampersand(); static const RegEx& disallowed_block = Exp::EndScalar() | (Exp::BlankOrBreak() + Exp::Comment()) | Exp::NotPrintable() | Exp::Utf8_ByteOrderMark() | Exp::Break() | Exp::Tab() | Exp::Ampersand(); const RegEx& disallowed = flowType == FlowType::Flow ? disallowed_flow : disallowed_block; StringCharSource buffer(str.c_str(), str.size()); while (buffer) { if (disallowed.Matches(buffer)) { return false; } if (allowOnlyAscii && (0x80 <= static_cast(buffer[0]))) { return false; } ++buffer; } return true; } bool IsValidSingleQuotedScalar(const std::string& str, bool escapeNonAscii) { // TODO: check for non-printable characters? return std::none_of(str.begin(), str.end(), [=](char ch) { return (escapeNonAscii && (0x80 <= static_cast(ch))) || (ch == '\n'); }); } bool IsValidLiteralScalar(const std::string& str, FlowType::value flowType, bool escapeNonAscii) { if (flowType == FlowType::Flow) { return false; } // TODO: check for non-printable characters? return std::none_of(str.begin(), str.end(), [=](char ch) { return (escapeNonAscii && (0x80 <= static_cast(ch))); }); } std::pair EncodeUTF16SurrogatePair(int codePoint) { const uint32_t leadOffset = 0xD800 - (0x10000 >> 10); return { leadOffset | (codePoint >> 10), 0xDC00 | (codePoint & 0x3FF), }; } void WriteDoubleQuoteEscapeSequence(ostream_wrapper& out, int codePoint, StringEscaping::value stringEscapingStyle) { static const char hexDigits[] = "0123456789abcdef"; out << "\\"; int digits = 8; if (codePoint < 0xFF && stringEscapingStyle != StringEscaping::JSON) { out << "x"; digits = 2; } else if (codePoint < 0xFFFF) { out << "u"; digits = 4; } else if (stringEscapingStyle != StringEscaping::JSON) { out << "U"; digits = 8; } else { auto surrogatePair = EncodeUTF16SurrogatePair(codePoint); WriteDoubleQuoteEscapeSequence(out, surrogatePair.first, stringEscapingStyle); WriteDoubleQuoteEscapeSequence(out, surrogatePair.second, stringEscapingStyle); return; } // Write digits into the escape sequence for (; digits > 0; --digits) out << hexDigits[(codePoint >> (4 * (digits - 1))) & 0xF]; } bool WriteAliasName(ostream_wrapper& out, const std::string& str) { int codePoint; for (std::string::const_iterator i = str.begin(); GetNextCodePointAndAdvance(codePoint, i, str.end());) { if (!IsAnchorChar(codePoint)) { return false; } WriteCodePoint(out, codePoint); } return true; } } // namespace StringFormat::value ComputeStringFormat(const std::string& str, EMITTER_MANIP strFormat, FlowType::value flowType, bool escapeNonAscii) { switch (strFormat) { case Auto: if (IsValidPlainScalar(str, flowType, escapeNonAscii)) { return StringFormat::Plain; } return StringFormat::DoubleQuoted; case SingleQuoted: if (IsValidSingleQuotedScalar(str, escapeNonAscii)) { return StringFormat::SingleQuoted; } return StringFormat::DoubleQuoted; case DoubleQuoted: return StringFormat::DoubleQuoted; case Literal: if (IsValidLiteralScalar(str, flowType, escapeNonAscii)) { return StringFormat::Literal; } return StringFormat::DoubleQuoted; default: break; } return StringFormat::DoubleQuoted; } bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str) { out << "'"; int codePoint; for (std::string::const_iterator i = str.begin(); GetNextCodePointAndAdvance(codePoint, i, str.end());) { if (codePoint == '\n') { return false; // We can't handle a new line and the attendant indentation // yet } if (codePoint == '\'') { out << "''"; } else { WriteCodePoint(out, codePoint); } } out << "'"; return true; } bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str, StringEscaping::value stringEscaping) { out << "\""; int codePoint; for (std::string::const_iterator i = str.begin(); GetNextCodePointAndAdvance(codePoint, i, str.end());) { switch (codePoint) { case '\"': out << "\\\""; break; case '\\': out << "\\\\"; break; case '\n': out << "\\n"; break; case '\t': out << "\\t"; break; case '\r': out << "\\r"; break; case '\b': out << "\\b"; break; case '\f': out << "\\f"; break; default: if (codePoint < 0x20 || (codePoint >= 0x80 && codePoint <= 0xA0)) { // Control characters and non-breaking space WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping); } else if (codePoint == 0xFEFF) { // Byte order marks (ZWNS) should be // escaped (YAML 1.2, sec. 5.2) WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping); } else if (stringEscaping == StringEscaping::NonAscii && codePoint > 0x7E) { WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping); } else { WriteCodePoint(out, codePoint); } } } out << "\""; return true; } bool WriteLiteralString(ostream_wrapper& out, const std::string& str, std::size_t indent) { out << "|\n"; int codePoint; for (std::string::const_iterator i = str.begin(); GetNextCodePointAndAdvance(codePoint, i, str.end());) { if (codePoint == '\n') { out << "\n"; } else { out<< IndentTo(indent); WriteCodePoint(out, codePoint); } } return true; } bool WriteChar(ostream_wrapper& out, char ch, StringEscaping::value stringEscapingStyle) { if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')) { out << ch; } else if (ch == '\"') { out << R"("\"")"; } else if (ch == '\t') { out << R"("\t")"; } else if (ch == '\n') { out << R"("\n")"; } else if (ch == '\b') { out << R"("\b")"; } else if (ch == '\r') { out << R"("\r")"; } else if (ch == '\f') { out << R"("\f")"; } else if (ch == '\\') { out << R"("\\")"; } else if (0x20 <= ch && ch <= 0x7e) { out << "\"" << ch << "\""; } else { out << "\""; WriteDoubleQuoteEscapeSequence(out, ch, stringEscapingStyle); out << "\""; } return true; } bool WriteComment(ostream_wrapper& out, const std::string& str, std::size_t postCommentIndent) { const std::size_t curIndent = out.col(); out << "#" << Indentation(postCommentIndent); out.set_comment(); int codePoint; for (std::string::const_iterator i = str.begin(); GetNextCodePointAndAdvance(codePoint, i, str.end());) { if (codePoint == '\n') { out << "\n" << IndentTo(curIndent) << "#" << Indentation(postCommentIndent); out.set_comment(); } else { WriteCodePoint(out, codePoint); } } return true; } bool WriteAlias(ostream_wrapper& out, const std::string& str) { out << "*"; return WriteAliasName(out, str); } bool WriteAnchor(ostream_wrapper& out, const std::string& str) { out << "&"; return WriteAliasName(out, str); } bool WriteTag(ostream_wrapper& out, const std::string& str, bool verbatim) { out << (verbatim ? "!<" : "!"); StringCharSource buffer(str.c_str(), str.size()); const RegEx& reValid = verbatim ? Exp::URI() : Exp::Tag(); while (buffer) { int n = reValid.Match(buffer); if (n <= 0) { return false; } while (--n >= 0) { out << buffer[0]; ++buffer; } } if (verbatim) { out << ">"; } return true; } bool WriteTagWithPrefix(ostream_wrapper& out, const std::string& prefix, const std::string& tag) { out << "!"; StringCharSource prefixBuffer(prefix.c_str(), prefix.size()); while (prefixBuffer) { int n = Exp::URI().Match(prefixBuffer); if (n <= 0) { return false; } while (--n >= 0) { out << prefixBuffer[0]; ++prefixBuffer; } } out << "!"; StringCharSource tagBuffer(tag.c_str(), tag.size()); while (tagBuffer) { int n = Exp::Tag().Match(tagBuffer); if (n <= 0) { return false; } while (--n >= 0) { out << tagBuffer[0]; ++tagBuffer; } } return true; } bool WriteBinary(ostream_wrapper& out, const Binary& binary) { WriteDoubleQuotedString(out, EncodeBase64(binary.data(), binary.size()), StringEscaping::None); return true; } } // namespace Utils } // namespace YAML