summaryrefslogtreecommitdiffstats
path: root/chromium/third_party/libwebm/source/webvttparser.cc
diff options
context:
space:
mode:
Diffstat (limited to 'chromium/third_party/libwebm/source/webvttparser.cc')
-rw-r--r-- chromium/third_party/libwebm/source/webvttparser.cc 702
1 files changed, 702 insertions, 0 deletions
diff --git a/chromium/third_party/libwebm/source/webvttparser.cc b/chromium/third_party/libwebm/source/webvttparser.cc
new file mode 100644
index 00000000000..655252c35f9
--- /dev/null
+++ b/chromium/third_party/libwebm/source/webvttparser.cc
@@ -0,0 +1,702 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include "./webvttparser.h" // NOLINT
+#include <climits>
+
+namespace libwebvtt {
+
+// Character codes that the parser compares against: the NUL sentinel,
+// the two intra-line whitespace characters, and the two line-terminator
+// bytes. The enum carries a NOLINT marker because clang-format would
+// otherwise collapse it onto one hard-to-read line.
+enum {
+  kNUL = '\0',
+  kSPACE = '\x20',
+  kTAB = '\t',
+  kLF = '\n',
+  kCR = '\r'
+};  // NOLINT
+
+// Out-of-line destructor definition; the body is intentionally empty.
+Reader::~Reader() {}
+
+// Out-of-line destructor definition; the body is intentionally empty.
+LineReader::~LineReader() {}
+
+// Reads one line of text from the stream into *line_ptr, without its
+// terminator. The WebVTT spec permits three terminators: LF, CR, and
+// CR LF; all three are recognized here.
+//
+// Returns:
+//   0   a line was read (possibly empty, possibly ended by end-of-stream)
+//   1   end-of-stream was reached before any character was accumulated
+//  <0   line_ptr was NULL, the reader reported an error, a byte that
+//       cannot occur in UTF-8 (0xFE/0xFF) was seen, or the line was too
+//       long
+int LineReader::GetLine(std::string* line_ptr) {
+  if (line_ptr == NULL)
+    return -1;
+
+  // Defend against pathological or malicious streams by capping the
+  // number of characters buffered for one line (the value is arbitrary).
+  enum { kMaxLineLength = 10000 };
+
+  std::string& text = *line_ptr;
+  text.clear();
+
+  char ch;
+  int status;
+
+  // Keep pulling characters for as long as the reader delivers them.
+  while ((status = GetChar(&ch)) == 0) {
+    if (ch == kLF)  // bare LF terminator
+      return 0;
+
+    if (ch == kCR) {
+      // Either a bare CR or the first half of CR LF: look at the next
+      // character to find out which.
+      status = GetChar(&ch);
+
+      if (status < 0)  // error
+        return status;
+
+      // A character other than LF belongs to the following line, so
+      // hand it back to the stream. (At end-of-stream there is nothing
+      // to hand back.)
+      if (status == 0 && ch != kLF)
+        UngetChar(ch);
+
+      return 0;
+    }
+
+    if (ch == '\xFE' || ch == '\xFF')  // not UTF-8
+      return -1;
+
+    if (text.length() >= kMaxLineLength)
+      return -1;
+
+    // An ordinary character: append it to the line being built.
+    text.push_back(ch);
+  }
+
+  if (status < 0)  // error
+    return status;
+
+  // End-of-stream: report EOF only when nothing at all was read.
+  return text.empty() ? 1 : 0;
+}
+
+// Stores the reader pointer and starts with no pushed-back character:
+// unget_ is set to -1, and GetChar() only replays unget_ when it is >= 0.
+Parser::Parser(Reader* r) : reader_(r), unget_(-1) {}
+
+// Empty destructor body; nothing is released here.
+Parser::~Parser() {}
+
+// Consumes the WebVTT file header: an optional UTF-8 BOM, the "WEBVTT"
+// signature line, and the blank line that separates the header from the
+// first cue.
+//
+// Returns 0 on success and a negative value on error. End-of-stream
+// before the signature is complete is an error; end-of-stream after it
+// is accepted.
+int Parser::Init() {
+  int status = ParseBOM();
+
+  if (status != 0)  // a reader error is passed through; EOF becomes -1
+    return (status < 0) ? status : -1;
+
+  // Match the signature one character at a time rather than by line, so
+  // that non-WebVTT input (e.g. a binary file with no line terminators)
+  // is rejected as early as possible.
+  const char kSignature[] = "WEBVTT";
+  const unsigned int kSignatureLength = sizeof(kSignature) - 1;
+
+  for (unsigned int i = 0; i < kSignatureLength; ++i) {
+    char ch;
+    status = GetChar(&ch);
+
+    if (status < 0)  // error
+      return status;
+
+    if (status > 0 || ch != kSignature[i])  // EOF or mismatch
+      return -1;
+  }
+
+  // Whatever remains of the signature line.
+  std::string rest;
+  status = GetLine(&rest);
+
+  if (status != 0)  // error, or EOF (weird but valid)
+    return (status < 0) ? status : 0;
+
+  // Optional text may follow "WEBVTT", but only after a space or tab.
+  if (!rest.empty() && rest[0] != kSPACE && rest[0] != kTAB)
+    return -1;
+
+  // The WebVTT spec requires an empty line after the signature line, to
+  // separate it from the first cue.
+  status = GetLine(&rest);
+
+  if (status != 0)  // error, or EOF (weird but we allow it)
+    return (status < 0) ? status : 0;
+
+  return rest.empty() ? 0 : -1;
+}
+
+// Parses the next cue from the stream into *cue: an optional identifier
+// line, the mandatory timings line, and one or more payload lines.
+//
+// Returns 0 when a cue was parsed, a positive value when end-of-stream
+// was reached before any cue text was found, and a negative value on
+// error (including a cue with no payload).
+int Parser::Parse(Cue* cue) {
+  if (cue == NULL)
+    return -1;
+
+  // The arrow token identifies the timings line; its lexeme may not
+  // appear in a cue identifier line.
+  const char kArrow[] = "-->";
+
+  std::string text;
+  int status;
+
+  // Skip blank lines until the cue's first line is found. Any non-zero
+  // status ends the search; EOF is OK here.
+  do {
+    status = GetLine(&text);
+
+    if (status != 0)
+      return status;
+  } while (text.empty());
+
+  std::string::size_type arrow_at = text.find(kArrow);
+
+  if (arrow_at == std::string::npos) {
+    // No arrow on this line, so it is taken to be the cue identifier and
+    // the timings are expected on the line that follows.
+    cue->identifier.swap(text);
+
+    status = GetLine(&text);
+
+    if (status < 0)  // error
+      return status;
+
+    if (status > 0)  // EOF
+      return -1;
+
+    arrow_at = text.find(kArrow);
+
+    if (arrow_at == std::string::npos)  // not a timings line
+      return -1;
+  } else {
+    // The first line is already the timings line: no identifier.
+    cue->identifier.clear();
+  }
+
+  status = ParseTimingsLine(&text, arrow_at, &cue->start_time,
+                            &cue->stop_time, &cue->settings);
+
+  if (status != 0)  // error
+    return status;
+
+  // The payload is every non-empty line that follows the timings line.
+  Cue::payload_t& body = cue->payload;
+  body.clear();
+
+  while ((status = GetLine(&text)) >= 0 && !text.empty())
+    body.push_back(text);
+
+  if (status < 0)  // error
+    return status;
+
+  return body.empty() ? -1 : 0;
+}
+
+// Delivers the next character in *c. A character previously handed back
+// through UngetChar() is replayed first (returning 0); otherwise the
+// request is forwarded to the underlying reader and its status returned.
+int Parser::GetChar(char* c) {
+  if (unget_ < 0)  // nothing pushed back
+    return reader_->GetChar(c);
+
+  *c = static_cast<char>(unget_);
+  unget_ = -1;  // the pushed-back slot is now free again
+
+  return 0;
+}
+
+// Hands one character back to the stream so that the next GetChar()
+// returns it. The value is stored as an unsigned char so that it is
+// never negative (a negative unget_ means "nothing pushed back").
+void Parser::UngetChar(char c) {
+  const unsigned char byte = static_cast<unsigned char>(c);
+  unget_ = byte;
+}
+
+// Consumes an optional UTF-8 byte order mark at the current position.
+// See http://en.wikipedia.org/wiki/Byte_order_mark
+//
+// Returns 0 when a complete BOM was consumed or when no BOM is present
+// (the character read is pushed back), 1 on end-of-stream, and a
+// negative value on a reader error or a partially-present BOM.
+int Parser::ParseBOM() {
+  static const char kBom[] = "\xEF\xBB\xBF";  // UTF-8 BOM
+
+  char ch;
+  int status = GetChar(&ch);
+
+  if (status < 0)  // error
+    return status;
+
+  if (status > 0)  // EOF
+    return 1;
+
+  if (ch != kBom[0]) {
+    // The stream does not start with a BOM; leave the character in
+    // place for the caller.
+    UngetChar(ch);
+    return 0;  // success
+  }
+
+  // A BOM has been started, so the remaining two bytes must follow.
+  for (int i = 1; i < 3; ++i) {
+    status = GetChar(&ch);
+
+    if (status < 0)  // error
+      return status;
+
+    if (status > 0)  // EOF
+      return 1;
+
+    if (ch != kBom[i])
+      return -1;  // error
+  }
+
+  return 0;  // success
+}
+
+// Parses a cue timings line of the form
+//   START --> STOP [NAME:VALUE ...]
+// arrow_pos is the offset of the "-->" token within *line_ptr. The line
+// is modified in place: NUL sentinels are written at the start of the
+// arrow and appended at the end so the scanners have a place to stop.
+//
+// Returns 0 on success and a non-zero value on error.
+int Parser::ParseTimingsLine(std::string* line_ptr,
+                             std::string::size_type arrow_pos, Time* start_time,
+                             Time* stop_time, Cue::settings_t* settings) {
+  if (line_ptr == NULL)
+    return -1;
+
+  std::string& text = *line_ptr;
+
+  if (arrow_pos == std::string::npos || arrow_pos >= text.length())
+    return -1;
+
+  // Overwrite the first character of the arrow with NUL so that the
+  // start time is demarcated from the remainder of the line.
+  text[arrow_pos] = kNUL;
+
+  std::string::size_type pos = 0;
+  int status = ParseTime(text, &pos, start_time);
+
+  if (status != 0)  // error
+    return status;
+
+  // Only whitespace may sit between the start time and the arrow.
+  for (; text[pos] != kNUL; ++pos) {
+    if (text[pos] != kSPACE && text[pos] != kTAB)
+      return -1;
+  }
+
+  // Terminate the line with NUL, then resume scanning just beyond the
+  // three-character arrow token.
+  text.push_back(kNUL);
+  pos = arrow_pos + 3;
+
+  status = ParseTime(text, &pos, stop_time);
+
+  if (status != 0)  // error
+    return status;
+
+  // Whatever follows the stop time is the (optional) settings list.
+  return ParseSettings(text, pos, settings);
+}
+
+// Parses a WebVTT timestamp from |line| starting at *idx_ptr and stores
+// the result in *time. Leading spaces and tabs are skipped. Three
+// flavors are accepted:
+//   SS[.sss]
+//   MM:SS[.sss]
+//   HH:MM:SS[.sss]
+//
+// The timestamp must be followed by NUL, space, or tab. On success
+// *idx_ptr is left at that following character.
+//
+// Returns 0 on success and a negative value on error.
+int Parser::ParseTime(const std::string& line, std::string::size_type* idx_ptr,
+                      Time* time) {
+  if (idx_ptr == NULL)
+    return -1;
+
+  std::string::size_type& idx = *idx_ptr;
+
+  if (idx == std::string::npos || idx >= line.length())
+    return -1;
+
+  if (time == NULL)
+    return -1;
+
+  // Consume any whitespace that precedes the timestamp.
+
+  while (char c = line[idx]) {
+    if (c != kSPACE && c != kTAB)
+      break;
+    ++idx;
+  }
+
+  // Parse a generic number value. We don't know which component
+  // of the time we have yet, until we do more parsing.
+
+  int val = ParseNumber(line, &idx);
+
+  if (val < 0)  // error
+    return val;
+
+  Time& t = *time;
+
+  // The presence of a colon character indicates that we have
+  // an [HH:]MM:SS style syntax.
+
+  if (line[idx] == ':') {
+    // We have either HH:MM:SS or MM:SS
+
+    // The value we just parsed is either the hours or minutes.
+    // It must be followed by another number value (that is
+    // either minutes or seconds).
+
+    const int first_val = val;
+
+    ++idx;  // consume colon
+
+    // Parse second value
+
+    val = ParseNumber(line, &idx);
+
+    if (val < 0)
+      return val;
+
+    if (val >= 60)  // either MM or SS
+      return -1;
+
+    if (line[idx] == ':') {
+      // We have HH:MM:SS
+
+      t.hours = first_val;
+      t.minutes = val;  // vetted above
+
+      ++idx;  // consume MM:SS colon
+
+      // We have parsed the hours and minutes.
+      // We must now parse the seconds.
+
+      val = ParseNumber(line, &idx);
+
+      if (val < 0)
+        return val;
+
+      if (val >= 60)  // SS part of HH:MM:SS
+        return -1;
+
+      t.seconds = val;
+    } else {
+      // We have MM:SS
+
+      // The implication here is that the hour value was omitted
+      // from the timestamp (because it was 0).
+
+      if (first_val >= 60)  // minutes
+        return -1;
+
+      t.hours = 0;
+      t.minutes = first_val;
+      t.seconds = val;  // vetted above
+    }
+  } else {
+    // We have SS (only)
+
+    // The time is expressed as total number of seconds,
+    // so the seconds value has no upper bound.
+
+    t.seconds = val;
+
+    // Convert SS to HH:MM:SS
+
+    t.minutes = t.seconds / 60;
+    t.seconds -= t.minutes * 60;
+
+    t.hours = t.minutes / 60;
+    t.minutes -= t.hours * 60;
+  }
+
+  // We have parsed the hours, minutes, and seconds.
+  // We must now parse the milliseconds.
+
+  char c = line[idx];
+
+  // TODO(matthewjheaney): one option here is to slightly relax the
+  // syntax rules for WebVTT timestamps, to permit the comma character
+  // to also be used as the seconds/milliseconds separator.  This
+  // would handle streams that use localization conventions for
+  // countries in Western Europe.  For now we obey the rules specified
+  // in the WebVTT spec (allow "full stop" only).
+
+  const bool have_milliseconds = (c == '.');
+
+  if (!have_milliseconds) {
+    t.milliseconds = 0;
+  } else {
+    ++idx;  // consume FULL STOP
+
+    const std::string::size_type frac_begin = idx;
+
+    val = ParseNumber(line, &idx);
+
+    if (val < 0)
+      return val;
+
+    // The fraction is a decimal fraction of a second, so its weight
+    // depends on how many digits were written, not on the numeric value:
+    // ".5" is 500 ms, ".05" is 50 ms and ".005" is 5 ms. Scaling by the
+    // value alone would lose the leading zeros.
+    const std::string::size_type digit_count = idx - frac_begin;
+
+    // The WebVTT spec calls for exactly three digits; fewer are
+    // tolerated and padded on the right, more are rejected.
+    if (digit_count > 3)
+      return -1;
+
+    for (std::string::size_type n = digit_count; n < 3; ++n)
+      val *= 10;
+
+    t.milliseconds = val;
+  }
+
+  // We have parsed the time proper.  We must check for any
+  // junk that immediately follows the time specifier.
+
+  c = line[idx];
+
+  if (c != kNUL && c != kSPACE && c != kTAB)
+    return -1;
+
+  return 0;  // success
+}
+
+// Parses the cue settings that follow the stop time on a timings line.
+// Scanning starts at |idx| and stops at the first NUL character, which
+// marks end-of-line. The settings are zero or more NAME:VALUE pairs
+// separated by spaces or tabs; each pair is appended to *settings, which
+// is cleared first.
+//
+// Returns 0 on success and -1 on error (NULL settings, idx out of range,
+// an empty NAME or VALUE, a NAME with no colon, or a colon inside a
+// VALUE). On error *settings may hold the pairs parsed so far.
+int Parser::ParseSettings(const std::string& line, std::string::size_type idx,
+                          Cue::settings_t* settings) {
+  // Validate the out-parameter before touching it, as the other parsing
+  // routines in this file do for their pointer arguments.
+  if (settings == NULL)
+    return -1;
+
+  settings->clear();
+
+  if (idx == std::string::npos || idx >= line.length())
+    return -1;
+
+  for (;;) {
+    // We must parse a line comprising a sequence of 0 or more
+    // NAME:VALUE pairs, separated by whitespace.  The line itself is
+    // terminated with a NUL char (indicating end-of-line).
+
+    for (;;) {
+      const char c = line[idx];
+
+      if (c == kNUL)  // end-of-line
+        return 0;  // success
+
+      if (c != kSPACE && c != kTAB)
+        break;
+
+      ++idx;  // consume whitespace
+    }
+
+    // We have consumed the whitespace, and have not yet reached
+    // end-of-line, so there is something on the line for us to parse.
+
+    settings->push_back(Setting());
+    Setting& s = settings->back();
+
+    // Parse the NAME part of the settings pair.
+
+    for (;;) {
+      const char c = line[idx];
+
+      if (c == ':')  // we have reached end of NAME part
+        break;
+
+      if (c == kNUL || c == kSPACE || c == kTAB)
+        return -1;
+
+      s.name.push_back(c);
+
+      ++idx;
+    }
+
+    if (s.name.empty())
+      return -1;
+
+    ++idx;  // consume colon
+
+    // Parse the VALUE part of the settings pair.
+
+    for (;;) {
+      const char c = line[idx];
+
+      if (c == kNUL || c == kSPACE || c == kTAB)
+        break;
+
+      if (c == ':')  // suspicious when part of VALUE
+        return -1;  // TODO(matthewjheaney): verify this behavior
+
+      s.value.push_back(c);
+
+      ++idx;
+    }
+
+    if (s.value.empty())
+      return -1;
+  }
+}
+
+// Parses a run of ASCII decimal digits from |line| starting at *idx_ptr
+// and returns its value. On success *idx_ptr is left at the first
+// character after the digits.
+//
+// Returns -1 when idx_ptr is NULL, the index is out of range, there is
+// no digit at the starting position, or the value does not fit in an
+// int. When no digit is found *idx_ptr is left unchanged.
+int Parser::ParseNumber(const std::string& line,
+                        std::string::size_type* idx_ptr) {
+  if (idx_ptr == NULL)
+    return -1;
+
+  std::string::size_type& idx = *idx_ptr;
+
+  if (idx == std::string::npos || idx >= line.length())
+    return -1;
+
+  const std::string::size_type first = idx;
+  int result = 0;
+
+  // Digits are recognized with an explicit ASCII range test rather than
+  // isdigit(). Passing a plain char to isdigit() is undefined behavior
+  // when the value is negative, which happens for every byte >= 0x80 of
+  // UTF-8 text on platforms where char is signed.
+  while (idx < line.length() && line[idx] >= '0' && line[idx] <= '9') {
+    const int digit = line[idx] - '0';
+
+    // Check each step before performing it, so that the signed
+    // arithmetic can never overflow.
+    if (result > INT_MAX / 10)
+      return -1;
+
+    result *= 10;
+
+    if (result > INT_MAX - digit)
+      return -1;
+
+    result += digit;
+
+    ++idx;
+  }
+
+  if (idx == first)  // no digits at all
+    return -1;
+
+  return result;
+}
+
+// Two times are equal when all four components match.
+bool Time::operator==(const Time& rhs) const {
+  return hours == rhs.hours && minutes == rhs.minutes &&
+         seconds == rhs.seconds && milliseconds == rhs.milliseconds;
+}
+
+// Lexicographic ordering on (hours, minutes, seconds, milliseconds):
+// the first component that differs decides the result.
+bool Time::operator<(const Time& rhs) const {
+  if (hours != rhs.hours)
+    return hours < rhs.hours;
+
+  if (minutes != rhs.minutes)
+    return minutes < rhs.minutes;
+
+  if (seconds != rhs.seconds)
+    return seconds < rhs.seconds;
+
+  return milliseconds < rhs.milliseconds;
+}
+
+// The remaining relational operators are all derived from operator<.
+
+// a > b  is  b < a.
+bool Time::operator>(const Time& rhs) const {
+  return rhs < *this;
+}
+
+// a <= b  is  !(a > b).
+bool Time::operator<=(const Time& rhs) const {
+  return !(*this > rhs);
+}
+
+// a >= b  is  !(a < b).
+bool Time::operator>=(const Time& rhs) const {
+  return !(*this < rhs);
+}
+
+// Returns this time as a single count of milliseconds:
+//   hours * 3600000 + minutes * 60000 + seconds * 1000 + milliseconds
+// Each component is widened to presentation_t before it is scaled.
+presentation_t Time::presentation() const {
+  const presentation_t from_hours = presentation_t(hours) * 3600LL * 1000LL;
+  const presentation_t from_minutes = presentation_t(minutes) * 60LL * 1000LL;
+  const presentation_t from_seconds = presentation_t(seconds) * 1000LL;
+
+  return from_hours + from_minutes + from_seconds + milliseconds;
+}
+
+// Sets this time from |d|, a count of milliseconds, by splitting it into
+// hours, minutes, seconds and milliseconds. A negative |d| is treated as
+// an error and resets every component to 0.
+//
+// Returns *this.
+Time& Time::presentation(presentation_t d) {
+  if (d < 0) {  // error
+    hours = 0;
+    minutes = 0;
+    seconds = 0;
+    milliseconds = 0;
+
+    return *this;
+  }
+
+  // Do the whole decomposition in presentation_t arithmetic and narrow
+  // only the already-reduced components. Narrowing d / 1000 to int first
+  // and then computing 1000 * seconds in int overflows once d exceeds
+  // roughly 2.1e9 ms (about 24.8 days).
+  const presentation_t total_seconds = d / 1000;
+  const presentation_t total_minutes = total_seconds / 60;
+
+  milliseconds = static_cast<int>(d % 1000);     // 0..999
+  seconds = static_cast<int>(total_seconds % 60);  // 0..59
+  minutes = static_cast<int>(total_minutes % 60);  // 0..59
+
+  // NOTE(review): hours is the only component without an upper bound;
+  // this cast assumes the hour count fits in the member's type.
+  hours = static_cast<int>(total_minutes / 60);
+
+  return *this;
+}
+
+// Advances this time by |rhs| milliseconds: convert to a millisecond
+// count, add, and convert back. Returns *this.
+Time& Time::operator+=(presentation_t rhs) {
+  const presentation_t shifted = presentation() + rhs;
+  return presentation(shifted);
+}
+
+// Returns a copy of this time advanced by |d| milliseconds; *this is
+// left untouched.
+Time Time::operator+(presentation_t d) const {
+  return Time(*this) += d;
+}
+
+// Moves this time back by |d| milliseconds, implemented as the addition
+// of the negated offset. Returns *this.
+Time& Time::operator-=(presentation_t d) {
+  return *this += -d;
+}
+
+// Returns the signed distance, in milliseconds, from |t| to this time.
+presentation_t Time::operator-(const Time& t) const {
+  return presentation() - t.presentation();
+}
+
+} // namespace libwebvtt