diff options
Diffstat (limited to 'chromium/third_party/libwebm/source/webvttparser.cc')
-rw-r--r-- | chromium/third_party/libwebm/source/webvttparser.cc | 702 |
1 files changed, 702 insertions, 0 deletions
diff --git a/chromium/third_party/libwebm/source/webvttparser.cc b/chromium/third_party/libwebm/source/webvttparser.cc new file mode 100644 index 00000000000..655252c35f9 --- /dev/null +++ b/chromium/third_party/libwebm/source/webvttparser.cc @@ -0,0 +1,702 @@ +// Copyright (c) 2012 The WebM project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. + +#include "./webvttparser.h" // NOLINT +#include <climits> + +namespace libwebvtt { + +// NOLINT'ing this enum because clang-format puts it in a single line which +// makes it look really unreadable. +enum { + kNUL = '\x00', + kSPACE = ' ', + kTAB = '\x09', + kLF = '\x0A', + kCR = '\x0D' +}; // NOLINT + +Reader::~Reader() {} + +LineReader::~LineReader() {} + +int LineReader::GetLine(std::string* line_ptr) { + if (line_ptr == NULL) + return -1; + + std::string& ln = *line_ptr; + ln.clear(); + + // Consume characters from the stream, until we + // reach end-of-line (or end-of-stream). + + // The WebVTT spec states that lines may be + // terminated in any of these three ways: + // LF + // CR + // CR LF + + // We interrogate each character as we read it from the stream. + // If we detect an end-of-line character, we consume the full + // end-of-line indication, and we're done; otherwise, accumulate + // the character and repeat. + + for (;;) { + char c; + const int e = GetChar(&c); + + if (e < 0) // error + return e; + + if (e > 0) // EOF + return (ln.empty()) ? 1 : 0; + + // We have a character, so we must first determine + // whether we have reached end-of-line. + + if (c == kLF) + return 0; // handle the easy end-of-line case immediately + + if (c == kCR) + break; // handle the hard end-of-line case outside of loop + + if (c == '\xFE' || c == '\xFF') // not UTF-8 + return -1; + + // To defend against pathological or malicious streams, we + // cap the line length at some arbitrarily-large value: + enum { kMaxLineLength = 10000 }; // arbitrary + + if (ln.length() >= kMaxLineLength) + return -1; + + // We don't have an end-of-line character, so accumulate + // the character in our line buffer. + ln.push_back(c); + } + + // We detected a CR. We must interrogate the next character + // in the stream, to determine whether we have a LF (which + // would make it part of this same line). + + char c; + const int e = GetChar(&c); + + if (e < 0) // error + return e; + + if (e > 0) // EOF + return 0; + + // If next character in the stream is not a LF, return it + // to the stream (because it's part of the next line). + if (c != kLF) + UngetChar(c); + + return 0; +} + +Parser::Parser(Reader* r) : reader_(r), unget_(-1) {} + +Parser::~Parser() {} + +int Parser::Init() { + int e = ParseBOM(); + + if (e < 0) // error + return e; + + if (e > 0) // EOF + return -1; + + // Parse "WEBVTT". We read from the stream one character at-a-time, in + // order to defend against non-WebVTT streams (e.g. binary files) that don't + // happen to comprise lines of text demarcated with line terminators. + + const char kId[] = "WEBVTT"; + + for (const char* p = kId; *p; ++p) { + char c; + e = GetChar(&c); + + if (e < 0) // error + return e; + + if (e > 0) // EOF + return -1; + + if (c != *p) + return -1; + } + + std::string line; + + e = GetLine(&line); + + if (e < 0) // error + return e; + + if (e > 0) // EOF + return 0; // weird but valid + + if (!line.empty()) { + // Parse optional characters that follow "WEBVTT" + + const char c = line[0]; + + if (c != kSPACE && c != kTAB) + return -1; + } + + // The WebVTT spec requires that the "WEBVTT" line + // be followed by an empty line (to separate it from + // first cue). + + e = GetLine(&line); + + if (e < 0) // error + return e; + + if (e > 0) // EOF + return 0; // weird but we allow it + + if (!line.empty()) + return -1; + + return 0; // success +} + +int Parser::Parse(Cue* cue) { + if (cue == NULL) + return -1; + + // Parse first non-blank line + + std::string line; + int e; + + for (;;) { + e = GetLine(&line); + + if (e) // EOF is OK here + return e; + + if (!line.empty()) + break; + } + + // A WebVTT cue comprises an optional cue identifier line followed + // by a (non-optional) timings line. You determine whether you have + // a timings line by scanning for the arrow token, the lexeme of which + // may not appear in the cue identifier line. + + const char kArrow[] = "-->"; + std::string::size_type arrow_pos = line.find(kArrow); + + if (arrow_pos != std::string::npos) { + // We found a timings line, which implies that we don't have a cue + // identifier. + + cue->identifier.clear(); + } else { + // We did not find a timings line, so we assume that we have a cue + // identifier line, and then try again to find the cue timings on + // the next line. + + cue->identifier.swap(line); + + e = GetLine(&line); + + if (e < 0) // error + return e; + + if (e > 0) // EOF + return -1; + + arrow_pos = line.find(kArrow); + + if (arrow_pos == std::string::npos) // not a timings line + return -1; + } + + e = ParseTimingsLine(&line, arrow_pos, &cue->start_time, &cue->stop_time, + &cue->settings); + + if (e) // error + return e; + + // The cue payload comprises all the non-empty + // lines that follow the timings line. + + Cue::payload_t& p = cue->payload; + p.clear(); + + for (;;) { + e = GetLine(&line); + + if (e < 0) // error + return e; + + if (line.empty()) + break; + + p.push_back(line); + } + + if (p.empty()) + return -1; + + return 0; // success +} + +int Parser::GetChar(char* c) { + if (unget_ >= 0) { + *c = static_cast<char>(unget_); + unget_ = -1; + return 0; + } + + return reader_->GetChar(c); +} + +void Parser::UngetChar(char c) { unget_ = static_cast<unsigned char>(c); } + +int Parser::ParseBOM() { + // Explanation of UTF-8 BOM: + // http://en.wikipedia.org/wiki/Byte_order_mark + + static const char BOM[] = "\xEF\xBB\xBF"; // UTF-8 BOM + + for (int i = 0; i < 3; ++i) { + char c; + int e = GetChar(&c); + + if (e < 0) // error + return e; + + if (e > 0) // EOF + return 1; + + if (c != BOM[i]) { + if (i == 0) { // we don't have a BOM + UngetChar(c); + return 0; // success + } + + // We started a BOM, so we must finish the BOM. + return -1; // error + } + } + + return 0; // success +} + +int Parser::ParseTimingsLine(std::string* line_ptr, + std::string::size_type arrow_pos, Time* start_time, + Time* stop_time, Cue::settings_t* settings) { + if (line_ptr == NULL) + return -1; + + std::string& line = *line_ptr; + + if (arrow_pos == std::string::npos || arrow_pos >= line.length()) + return -1; + + // Place a NUL character at the start of the arrow token, in + // order to demarcate the start time from remainder of line. + line[arrow_pos] = kNUL; + std::string::size_type idx = 0; + + int e = ParseTime(line, &idx, start_time); + if (e) // error + return e; + + // Detect any junk that follows the start time, + // but precedes the arrow symbol. + + while (char c = line[idx]) { + if (c != kSPACE && c != kTAB) + return -1; + ++idx; + } + + // Place a NUL character at the end of the line, + // so the scanner has a place to stop, and begin + // the scan just beyond the arrow token. + + line.push_back(kNUL); + idx = arrow_pos + 3; + + e = ParseTime(line, &idx, stop_time); + if (e) // error + return e; + + e = ParseSettings(line, idx, settings); + if (e) // error + return e; + + return 0; // success +} + +int Parser::ParseTime(const std::string& line, std::string::size_type* idx_ptr, + Time* time) { + if (idx_ptr == NULL) + return -1; + + std::string::size_type& idx = *idx_ptr; + + if (idx == std::string::npos || idx >= line.length()) + return -1; + + if (time == NULL) + return -1; + + // Consume any whitespace that precedes the timestamp. + + while (char c = line[idx]) { + if (c != kSPACE && c != kTAB) + break; + ++idx; + } + + // WebVTT timestamp syntax comes in three flavors: + // SS[.sss] + // MM:SS[.sss] + // HH:MM:SS[.sss] + + // Parse a generic number value. We don't know which component + // of the time we have yet, until we do more parsing. + + int val = ParseNumber(line, &idx); + + if (val < 0) // error + return val; + + Time& t = *time; + + // The presence of a colon character indicates that we have + // an [HH:]MM:SS style syntax. + + if (line[idx] == ':') { + // We have either HH:MM:SS or MM:SS + + // The value we just parsed is either the hours or minutes. + // It must be followed by another number value (that is + // either minutes or seconds). + + const int first_val = val; + + ++idx; // consume colon + + // Parse second value + + val = ParseNumber(line, &idx); + + if (val < 0) + return val; + + if (val >= 60) // either MM or SS + return -1; + + if (line[idx] == ':') { + // We have HH:MM:SS + + t.hours = first_val; + t.minutes = val; // vetted above + + ++idx; // consume MM:SS colon + + // We have parsed the hours and minutes. + // We must now parse the seconds. + + val = ParseNumber(line, &idx); + + if (val < 0) + return val; + + if (val >= 60) // SS part of HH:MM:SS + return -1; + + t.seconds = val; + } else { + // We have MM:SS + + // The implication here is that the hour value was omitted + // from the timestamp (because it was 0). + + if (first_val >= 60) // minutes + return -1; + + t.hours = 0; + t.minutes = first_val; + t.seconds = val; // vetted above + } + } else { + // We have SS (only) + + // The time is expressed as total number of seconds, + // so the seconds value has no upper bound. + + t.seconds = val; + + // Convert SS to HH:MM:SS + + t.minutes = t.seconds / 60; + t.seconds -= t.minutes * 60; + + t.hours = t.minutes / 60; + t.minutes -= t.hours * 60; + } + + // We have parsed the hours, minutes, and seconds. + // We must now parse the milliseconds. + + char c = line[idx]; + + // TODO(matthewjheaney): one option here is to slightly relax the + // syntax rules for WebVTT timestamps, to permit the comma character + // to also be used as the seconds/milliseconds separator. This + // would handle streams that use localization conventions for + // countries in Western Europe. For now we obey the rules specified + // in the WebVTT spec (allow "full stop" only). + + const bool have_milliseconds = (c == '.'); + + if (!have_milliseconds) { + t.milliseconds = 0; + } else { + ++idx; // consume FULL STOP + + val = ParseNumber(line, &idx); + + if (val < 0) + return val; + + if (val >= 1000) + return -1; + + if (val < 10) + t.milliseconds = val * 100; + else if (val < 100) + t.milliseconds = val * 10; + else + t.milliseconds = val; + } + + // We have parsed the time proper. We must check for any + // junk that immediately follows the time specifier. + + c = line[idx]; + + if (c != kNUL && c != kSPACE && c != kTAB) + return -1; + + return 0; // success +} + +int Parser::ParseSettings(const std::string& line, std::string::size_type idx, + Cue::settings_t* settings) { + settings->clear(); + + if (idx == std::string::npos || idx >= line.length()) + return -1; + + for (;;) { + // We must parse a line comprising a sequence of 0 or more + // NAME:VALUE pairs, separated by whitespace. The line iself is + // terminated with a NUL char (indicating end-of-line). + + for (;;) { + const char c = line[idx]; + + if (c == kNUL) // end-of-line + return 0; // success + + if (c != kSPACE && c != kTAB) + break; + + ++idx; // consume whitespace + } + + // We have consumed the whitespace, and have not yet reached + // end-of-line, so there is something on the line for us to parse. + + settings->push_back(Setting()); + Setting& s = settings->back(); + + // Parse the NAME part of the settings pair. + + for (;;) { + const char c = line[idx]; + + if (c == ':') // we have reached end of NAME part + break; + + if (c == kNUL || c == kSPACE || c == kTAB) + return -1; + + s.name.push_back(c); + + ++idx; + } + + if (s.name.empty()) + return -1; + + ++idx; // consume colon + + // Parse the VALUE part of the settings pair. + + for (;;) { + const char c = line[idx]; + + if (c == kNUL || c == kSPACE || c == kTAB) + break; + + if (c == ':') // suspicious when part of VALUE + return -1; // TODO(matthewjheaney): verify this behavior + + s.value.push_back(c); + + ++idx; + } + + if (s.value.empty()) + return -1; + } +} + +int Parser::ParseNumber(const std::string& line, + std::string::size_type* idx_ptr) { + if (idx_ptr == NULL) + return -1; + + std::string::size_type& idx = *idx_ptr; + + if (idx == std::string::npos || idx >= line.length()) + return -1; + + if (!isdigit(line[idx])) + return -1; + + int result = 0; + + while (isdigit(line[idx])) { + const char c = line[idx]; + const int i = c - '0'; + + if (result > INT_MAX / 10) + return -1; + + result *= 10; + + if (result > INT_MAX - i) + return -1; + + result += i; + + ++idx; + } + + return result; +} + +bool Time::operator==(const Time& rhs) const { + if (hours != rhs.hours) + return false; + + if (minutes != rhs.minutes) + return false; + + if (seconds != rhs.seconds) + return false; + + return (milliseconds == rhs.milliseconds); +} + +bool Time::operator<(const Time& rhs) const { + if (hours < rhs.hours) + return true; + + if (hours > rhs.hours) + return false; + + if (minutes < rhs.minutes) + return true; + + if (minutes > rhs.minutes) + return false; + + if (seconds < rhs.seconds) + return true; + + if (seconds > rhs.seconds) + return false; + + return (milliseconds < rhs.milliseconds); +} + +bool Time::operator>(const Time& rhs) const { return rhs.operator<(*this); } + +bool Time::operator<=(const Time& rhs) const { return !this->operator>(rhs); } + +bool Time::operator>=(const Time& rhs) const { return !this->operator<(rhs); } + +presentation_t Time::presentation() const { + const presentation_t h = 1000LL * 3600LL * presentation_t(hours); + const presentation_t m = 1000LL * 60LL * presentation_t(minutes); + const presentation_t s = 1000LL * presentation_t(seconds); + const presentation_t result = h + m + s + milliseconds; + return result; +} + +Time& Time::presentation(presentation_t d) { + if (d < 0) { // error + hours = 0; + minutes = 0; + seconds = 0; + milliseconds = 0; + + return *this; + } + + seconds = static_cast<int>(d / 1000); + milliseconds = static_cast<int>(d - 1000 * seconds); + + minutes = seconds / 60; + seconds -= 60 * minutes; + + hours = minutes / 60; + minutes -= 60 * hours; + + return *this; +} + +Time& Time::operator+=(presentation_t rhs) { + const presentation_t d = this->presentation(); + const presentation_t dd = d + rhs; + this->presentation(dd); + return *this; +} + +Time Time::operator+(presentation_t d) const { + Time t(*this); + t += d; + return t; +} + +Time& Time::operator-=(presentation_t d) { return this->operator+=(-d); } + +presentation_t Time::operator-(const Time& t) const { + const presentation_t rhs = t.presentation(); + const presentation_t lhs = this->presentation(); + const presentation_t result = lhs - rhs; + return result; +} + +} // namespace libwebvtt |