summary | refs | log | tree | commit | diff | stats
path: root/src/libs/3rdparty/libarchive/archive_read_support_format_warc.c
diff options
context:
space:
mode:
author: Arttu Tarkiainen <arttu.tarkiainen@qt.io> 2021-06-11 09:24:32 +0300
committer: Arttu Tarkiainen <arttu.tarkiainen@qt.io> 2021-09-10 09:29:55 +0300
commit: bfa8427531cf871b7423c252f522b7bfe75c7edd (patch)
tree: 6e7cc2011990d04e06cba39bf3bb636ff0adc8bd /src/libs/3rdparty/libarchive/archive_read_support_format_warc.c
parent: 2767b0c96a0dbe713bc2d1e346d10f6787556d5b (diff)
Add sources and qmake project files for libarchive
- Included from upstream source archive distribution: * Sources from "libarchive/" directory, excluding manpages, tests and build files * "COPYING" from archive root * Configuration headers in "3rdparty/libarchive/config/*" are pre-generated from "build/cmake/config.h.in" - Add project files for qmake, document usage of library in "3rdparty/libarchive/qt_attribution.json". - Update build instructions for Coin. - Support for libarchive can be enabled or disabled with the "libarchive" configuration feature. - Update "Getting Started" page in documentation. Change-Id: I2c2312600b3c6ede4925625d29953dcebaa48b98 Reviewed-by: Qt CI Bot <qt_ci_bot@qt-project.org> Reviewed-by: Katja Marttila <katja.marttila@qt.io>
Diffstat (limited to 'src/libs/3rdparty/libarchive/archive_read_support_format_warc.c')
-rw-r--r-- src/libs/3rdparty/libarchive/archive_read_support_format_warc.c | 848
1 files changed, 848 insertions, 0 deletions
diff --git a/src/libs/3rdparty/libarchive/archive_read_support_format_warc.c b/src/libs/3rdparty/libarchive/archive_read_support_format_warc.c
new file mode 100644
index 000000000..27329962d
--- /dev/null
+++ b/src/libs/3rdparty/libarchive/archive_read_support_format_warc.c
@@ -0,0 +1,848 @@
+/*-
+ * Copyright (c) 2014 Sebastian Freundt
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "archive_platform.h"
+__FBSDID("$FreeBSD$");
+
+/**
+ * WARC is standardised by ISO TC46/SC4/WG12 and currently available as
+ * ISO 28500:2009.
+ * For the purposes of this file we used the final draft from:
+ * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf
+ *
+ * Todo:
+ * [ ] real-world warcs can contain resources at endpoints ending in /
+ * e.g. http://bibnum.bnf.fr/warc/
+ * if you're lucky their response contains a Content-Location: header
+ * pointing to a unix-compliant filename, in the example above it's
+ * Content-Location: http://bibnum.bnf.fr/warc/index.html
+ * however, that's not mandated and github for example doesn't follow
+ * this convention.
+ * We need a set of archive options to control what to do with
+ * entries like these, at the moment care is taken to skip them.
+ *
+ **/
+
+#ifdef HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#ifdef HAVE_ERRNO_H
+#include <errno.h>
+#endif
+#ifdef HAVE_STDLIB_H
+#include <stdlib.h>
+#endif
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+#ifdef HAVE_LIMITS_H
+#include <limits.h>
+#endif
+#ifdef HAVE_CTYPE_H
+#include <ctype.h>
+#endif
+#ifdef HAVE_TIME_H
+#include <time.h>
+#endif
+
+#include "archive.h"
+#include "archive_entry.h"
+#include "archive_private.h"
+#include "archive_read_private.h"
+
+typedef enum { /* record kinds; _warc_rdtyp() classifies the WARC-Type header into these */
+ WT_NONE, /* header missing, unterminated, or any value other than resource/response */
+ /* warcinfo (never returned by _warc_rdtyp() in this file) */
+ WT_INFO,
+ /* metadata (never returned by _warc_rdtyp() in this file) */
+ WT_META,
+ /* resource; together with WT_RSP the only kind turned into an entry */
+ WT_RSRC,
+ /* request, unsupported */
+ WT_REQ,
+ /* response; handled like WT_RSRC by _warc_rdhdr() */
+ WT_RSP,
+ /* revisit, unsupported */
+ WT_RVIS,
+ /* conversion, unsupported */
+ WT_CONV,
+ /* continuation, unsupported at the moment */
+ WT_CONT,
+ /* invalid type, also the number of valid enumerators before it */
+ LAST_WT
+} warc_type_t;
+
+typedef struct { /* read-only, length-delimited string; _warc_rduri() returns one pointing into the header buffer */
+ size_t len; /* number of bytes at str; 0 means "no string" */
+ const char *str; /* first byte, not necessarily NUL-terminated; NULL when len is 0 */
+} warc_string_t;
+
+typedef struct { /* growable heap buffer, used as the file name pool in struct warc_s */
+ size_t len; /* allocated size of str in bytes (grown in multiples of 64 by _warc_rdhdr()) */
+ char *str; /* realloc()'d storage, released in _warc_cleanup() */
+} warc_strbuf_t;
+
+struct warc_s { /* per-archive reader state, allocated zeroed in archive_read_support_format_warc() */
+ /* content length of the current record, taken from its Content-Length header */
+ size_t cntlen;
+ /* number of content bytes already handed out by _warc_read() */
+ size_t cntoff;
+ /* size of the last block handed out, consumed at the start of the next _warc_read() */
+ size_t unconsumed;
+
+ /* string pool holding the NUL-terminated path name of the current entry */
+ warc_strbuf_t pool;
+ /* version of the previous record, encoded as major * 10000 + minor * 100 */
+ unsigned int pver;
+ /* stringified format name, "WARC/<major>.<minor>", refreshed when the version changes */
+ struct archive_string sver;
+};
+
+static int _warc_bid(struct archive_read *a, int); /* format bidder */
+static int _warc_cleanup(struct archive_read *a); /* releases struct warc_s */
+static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*); /* hands out content blocks */
+static int _warc_skip(struct archive_read *a); /* skips a record's content and its separator */
+static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e); /* reads the next record header */
+
+/* private routines: parsers for individual parts of a record header */
+static unsigned int _warc_rdver(const char *buf, size_t bsz); /* "WARC/x.y" version line */
+static unsigned int _warc_rdtyp(const char *buf, size_t bsz); /* WARC-Type header */
+static warc_string_t _warc_rduri(const char *buf, size_t bsz); /* WARC-Target-URI header */
+static ssize_t _warc_rdlen(const char *buf, size_t bsz); /* Content-Length header */
+static time_t _warc_rdrtm(const char *buf, size_t bsz); /* WARC-Date header */
+static time_t _warc_rdmtm(const char *buf, size_t bsz); /* Last-Modified header */
+static const char *_warc_find_eoh(const char *buf, size_t bsz); /* end of header, just past CRLF CRLF */
+static const char *_warc_find_eol(const char *buf, size_t bsz); /* next CRLF */
+
+/*
+ * Enable WARC support on a read archive.
+ *
+ * Allocates a zeroed struct warc_s and registers the bid, header, data,
+ * skip and cleanup callbacks under the name "warc".
+ *
+ * Returns ARCHIVE_OK on success, ARCHIVE_FATAL when the state cannot be
+ * allocated, or whatever __archive_read_register_format() reported.
+ */
+int
+archive_read_support_format_warc(struct archive *_a)
+{
+	struct archive_read *reader = (struct archive_read *)_a;
+	struct warc_s *state;
+	int rc;
+
+	archive_check_magic(_a, ARCHIVE_READ_MAGIC,
+	    ARCHIVE_STATE_NEW, "archive_read_support_format_warc");
+
+	state = calloc(1, sizeof(*state));
+	if (state == NULL) {
+		archive_set_error(&reader->archive, ENOMEM,
+		    "Can't allocate warc data");
+		return (ARCHIVE_FATAL);
+	}
+
+	rc = __archive_read_register_format(
+	    reader, state, "warc",
+	    _warc_bid, NULL, _warc_rdhdr, _warc_read,
+	    _warc_skip, NULL, _warc_cleanup, NULL, NULL);
+	if (rc == ARCHIVE_OK)
+		return (ARCHIVE_OK);
+
+	/* registration did not succeed, release the state again */
+	free(state);
+	return (rc);
+}
+
+/*
+ * Release everything owned by the WARC reader state: the file name
+ * pool, the stringified version and the state object itself.
+ * Always returns ARCHIVE_OK.
+ */
+static int
+_warc_cleanup(struct archive_read *a)
+{
+	struct warc_s *state = a->format->data;
+
+	/* the pool buffer only exists once it has been sized */
+	if (state->pool.len != 0U)
+		free(state->pool.str);
+
+	archive_string_free(&state->sver);
+	free(state);
+	a->format->data = NULL;
+
+	return (ARCHIVE_OK);
+}
+
+/*
+ * Bid on the stream: peek at the first 12 bytes and check whether they
+ * form a WARC version line for a version between 0.12 and 1.0.
+ *
+ * Returns 64 when they do, -1 otherwise (including when fewer than
+ * 12 bytes are available).  BEST_BID is not used.
+ */
+static int
+_warc_bid(struct archive_read *a, int best_bid)
+{
+	const char *probe;
+	ssize_t avail;
+	unsigned int version;
+
+	(void)best_bid; /* UNUSED */
+
+	/* the first line of the file should already be a record */
+	probe = __archive_read_ahead(a, 12U, &avail);
+	if (probe == NULL || avail < 12) {
+		/* nothing to look at, or less than our 12 byte magic cookie */
+		return -1;
+	}
+
+	/* only WARC 0.12 up to 1.0 is supported */
+	version = _warc_rdver(probe, avail);
+	if (version >= 1200U && version <= 10000U) {
+		/* be confident */
+		return (64);
+	}
+	return -1;
+}
+
+/*
+ * Read the next WARC record header and fill in ENTRY from it.
+ *
+ * Only "resource" and "response" records with a usable target URI that
+ * does not end in '/' become entries.  Every other record is skipped
+ * together with its content and the next record is examined instead.
+ *
+ * Returns ARCHIVE_OK with ENTRY populated, ARCHIVE_EOF when read-ahead
+ * yields no buffer, or ARCHIVE_FATAL with the archive error set when the
+ * header is malformed, the version is unsupported, a mandatory field is
+ * bad, or the file name pool cannot be grown.
+ */
+static int
+_warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
+{
+#define HDR_PROBE_LEN (12U)
+	struct warc_s *w = a->format->data;
+	unsigned int ver;
+	const char *buf;
+	ssize_t nrd;
+	const char *eoh;
+	/* for the file name, saves some strndup()'ing */
+	warc_string_t fnam;
+	/* warc record type, not that we really use it a lot */
+	warc_type_t ftyp;
+	/* content-length+error monad */
+	ssize_t cntlen;
+	/* record time is the WARC-Date time we reinterpret it as ctime */
+	time_t rtime;
+	/* mtime is the Last-Modified time which will be the entry's mtime */
+	time_t mtime;
+
+start_over:
+	/* just use read_ahead() they keep track of unconsumed
+	 * bits and bobs for us; no need to put an extra shift in
+	 * and reproduce that functionality here */
+	buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd);
+
+	if (nrd < 0) {
+		/* no good */
+		archive_set_error(
+		    &a->archive, ARCHIVE_ERRNO_MISC,
+		    "Bad record header");
+		return (ARCHIVE_FATAL);
+	} else if (buf == NULL) {
+		/* there should be room for at least WARC/bla\r\n
+		 * must be EOF therefore */
+		return (ARCHIVE_EOF);
+	}
+	/* looks good so far, try and find the end of the header now */
+	eoh = _warc_find_eoh(buf, nrd);
+	if (eoh == NULL) {
+		/* still no good, the header end might be beyond the
+		 * probe we've requested, but then again who'd cram
+		 * so much stuff into the header *and* be 28500-compliant */
+		archive_set_error(
+		    &a->archive, ARCHIVE_ERRNO_MISC,
+		    "Bad record header");
+		return (ARCHIVE_FATAL);
+	}
+	ver = _warc_rdver(buf, eoh - buf);
+	/* we currently support WARC 0.12 to 1.0 */
+	if (ver == 0U) {
+		archive_set_error(
+		    &a->archive, ARCHIVE_ERRNO_MISC,
+		    "Invalid record version");
+		return (ARCHIVE_FATAL);
+	} else if (ver < 1200U || ver > 10000U) {
+		archive_set_error(
+		    &a->archive, ARCHIVE_ERRNO_MISC,
+		    "Unsupported record version: %u.%u",
+		    ver / 10000, (ver % 10000) / 100);
+		return (ARCHIVE_FATAL);
+	}
+	cntlen = _warc_rdlen(buf, eoh - buf);
+	if (cntlen < 0) {
+		/* nightmare! the specs say content-length is mandatory
+		 * so I don't feel overly bad stopping the reader here */
+		archive_set_error(
+		    &a->archive, EINVAL,
+		    "Bad content length");
+		return (ARCHIVE_FATAL);
+	}
+	rtime = _warc_rdrtm(buf, eoh - buf);
+	if (rtime == (time_t)-1) {
+		/* record time is mandatory as per WARC/1.0,
+		 * so just barf here, fast and loud */
+		archive_set_error(
+		    &a->archive, EINVAL,
+		    "Bad record time");
+		return (ARCHIVE_FATAL);
+	}
+
+	/* let the world know we're a WARC archive */
+	a->archive.archive_format = ARCHIVE_FORMAT_WARC;
+	if (ver != w->pver) {
+		/* stringify this entry's version */
+		archive_string_sprintf(&w->sver,
+		    "WARC/%u.%u", ver / 10000, (ver % 10000) / 100);
+		/* remember the version */
+		w->pver = ver;
+	}
+	/* start off with the type */
+	ftyp = _warc_rdtyp(buf, eoh - buf);
+	/* and let future calls know about the content */
+	w->cntlen = cntlen;
+	w->cntoff = 0U;
+	mtime = 0;/* Avoid compiling error on some platform. */
+
+	switch (ftyp) {
+	case WT_RSRC:
+	case WT_RSP:
+		/* only try and read the filename in the cases that are
+		 * guaranteed to have one */
+		fnam = _warc_rduri(buf, eoh - buf);
+		/* check the last character in the URI to avoid creating
+		 * directory endpoints as files, see Todo above */
+		if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') {
+			/* break here for now */
+			fnam.len = 0U;
+			fnam.str = NULL;
+			break;
+		}
+		/* bang to our string pool, so we save a
+		 * malloc()+free() roundtrip */
+		if (fnam.len + 1U > w->pool.len) {
+			/* round up to the next multiple of 64 that also
+			 * leaves room for the terminating NUL */
+			const size_t want = ((fnam.len + 64U) / 64U) * 64U;
+			/* go through a temporary so that a failed realloc()
+			 * neither leaks the old buffer nor leaves the pool
+			 * claiming a size it does not have */
+			char *grown = realloc(w->pool.str, want);
+
+			if (grown == NULL) {
+				archive_set_error(
+				    &a->archive, ENOMEM,
+				    "Can't allocate warc data");
+				return (ARCHIVE_FATAL);
+			}
+			w->pool.str = grown;
+			w->pool.len = want;
+		}
+		memcpy(w->pool.str, fnam.str, fnam.len);
+		w->pool.str[fnam.len] = '\0';
+		/* let no one else know about the pool, it's a secret, shhh */
+		fnam.str = w->pool.str;
+
+		/* snarf mtime or deduce from rtime
+		 * this is a custom header added by our writer, it's quite
+		 * hard to believe anyone else would go through with it
+		 * (apart from being part of some http responses of course) */
+		if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) {
+			mtime = rtime;
+		}
+		break;
+	case WT_NONE:
+	case WT_INFO:
+	case WT_META:
+	case WT_REQ:
+	case WT_RVIS:
+	case WT_CONV:
+	case WT_CONT:
+	case LAST_WT:
+	default:
+		fnam.len = 0U;
+		fnam.str = NULL;
+		break;
+	}
+
+	/* now eat some of those delicious buffer bits */
+	__archive_read_consume(a, eoh - buf);
+
+	switch (ftyp) {
+	case WT_RSRC:
+	case WT_RSP:
+		if (fnam.len > 0U) {
+			/* populate entry object */
+			archive_entry_set_filetype(entry, AE_IFREG);
+			archive_entry_copy_pathname(entry, fnam.str);
+			archive_entry_set_size(entry, cntlen);
+			archive_entry_set_perm(entry, 0644);
+			/* rtime is the new ctime, mtime stays mtime */
+			archive_entry_set_ctime(entry, rtime, 0L);
+			archive_entry_set_mtime(entry, mtime, 0L);
+			break;
+		}
+		/* FALLTHROUGH */
+	case WT_NONE:
+	case WT_INFO:
+	case WT_META:
+	case WT_REQ:
+	case WT_RVIS:
+	case WT_CONV:
+	case WT_CONT:
+	case LAST_WT:
+	default:
+		/* consume the content and start over */
+		_warc_skip(a);
+		goto start_over;
+	}
+	return (ARCHIVE_OK);
+}
+
+/*
+ * Hand out the next block of the current record's content.
+ *
+ * On ARCHIVE_OK, *BUF points at *BSZ bytes located at content offset
+ * *OFF.  The block is only marked as unconsumed here and is consumed at
+ * the start of the following call.  A block never extends past the
+ * record's content length.
+ *
+ * Returns ARCHIVE_EOF (with *BUF NULL, *BSZ 0 and *OFF just past the
+ * record separator) once all content was handed out or no more input
+ * is available, and the negative read-ahead result on a read error.
+ */
+static int
+_warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
+{
+	struct warc_s *w = a->format->data;
+	const char *chunk;
+	ssize_t got;
+
+	if (w->cntoff < w->cntlen) {
+		size_t left;
+
+		/* release what the previous call handed out */
+		if (w->unconsumed) {
+			__archive_read_consume(a, w->unconsumed);
+			w->unconsumed = 0U;
+		}
+
+		chunk = __archive_read_ahead(a, 1U, &got);
+		if (got < 0) {
+			/* big catastrophe */
+			*bsz = 0U;
+			return (int)got;
+		}
+		if (got > 0) {
+			/* clamp to content-length */
+			left = w->cntlen - w->cntoff;
+			if ((size_t)got > left)
+				got = left;
+
+			*off = w->cntoff;
+			*bsz = got;
+			*buf = chunk;
+
+			w->cntoff += got;
+			w->unconsumed = (size_t)got;
+			return (ARCHIVE_OK);
+		}
+		/* got == 0: nothing left to read, report end of data */
+	}
+
+	/* it's our lucky day, no work, we can leave early */
+	*buf = NULL;
+	*bsz = 0U;
+	*off = w->cntoff + 4U/*for \r\n\r\n separator*/;
+	w->unconsumed = 0U;
+	return (ARCHIVE_EOF);
+}
+
+/*
+ * Skip the current record's content plus the 4 byte record separator
+ * and reset the content bookkeeping.  Always returns ARCHIVE_OK.
+ *
+ * NOTE(review): the full content length is consumed regardless of
+ * w->cntoff -- confirm this is what is wanted after _warc_read() has
+ * already handed out part of the content.
+ */
+static int
+_warc_skip(struct archive_read *a)
+{
+	struct warc_s *state = a->format->data;
+	const size_t separator = 4U;	/* \r\n\r\n */
+
+	__archive_read_consume(a, state->cntlen + separator);
+	state->cntoff = 0U;
+	state->cntlen = 0U;
+	return (ARCHIVE_OK);
+}
+
+
+/* private routines */
+/* Strip the const qualifier from C by round-tripping it through uintptr_t. */
+static void*
+deconst(const void *c)
+{
+	const uintptr_t bits = (uintptr_t)c;
+
+	return (void *)bits;
+}
+
+/*
+ * Find the first occurrence of NEEDLE (NEEDLESIZE bytes) within HAY
+ * (HAYSIZE bytes); neither has to be NUL-terminated.
+ *
+ * Returns a pointer to the match inside HAY, HAY itself for an empty
+ * needle, or NULL when there is no match.
+ */
+static char*
+xmemmem(const char *hay, const size_t haysize,
+    const char *needle, const size_t needlesize)
+{
+	const char *const hay_end = hay + haysize;
+	const char *const needle_end = needle + needlesize;
+	const char *h;
+	const char *n;
+	const char *window;
+	unsigned int hay_xor;
+	unsigned int needle_xor;
+	unsigned int all_equal;
+
+	/* an empty needle is defined to be found anywhere in haystack */
+	if (needlesize == 0UL)
+		return deconst(hay);
+
+	/* jump to the first byte of haystack that equals *NEEDLE */
+	hay = memchr(hay, *needle, haysize);
+	if (hay == NULL)
+		return NULL;
+
+	/* First bytes of haystack and needle are the same now.  Fold the
+	 * remaining needle bytes and the equally long prefix of haystack
+	 * into one XOR checksum each, remembering on the way whether the
+	 * two ranges are byte-wise identical. */
+	hay_xor = *hay;
+	needle_xor = *hay;
+	all_equal = 1U;
+	h = hay + 1U;
+	n = needle + 1U;
+	while (h < hay_end && n < needle_end) {
+		hay_xor ^= *h;
+		needle_xor ^= *n;
+		all_equal &= *h == *n;
+		h++;
+		n++;
+	}
+
+	/* H now references the (NEEDLESIZE + 1)-th byte. */
+	if (n < needle_end) {
+		/* what is left of haystack is shorter than needle */
+		return NULL;
+	}
+	if (all_equal) {
+		/* found a match right at the candidate */
+		return deconst(hay);
+	}
+
+	/* slide the window over the rest of haystack, updating the
+	 * checksum by dropping the leading and adding the trailing byte */
+	window = hay;
+	while (h < hay_end) {
+		hay_xor ^= *window++;
+		hay_xor ^= *h;
+
+		/* With equal checksums it is enough to compare just
+		 * NEEDLESIZE - 1 bytes for equality;
+		 * WINDOW is by design < H, so no need for range checks */
+		if (hay_xor == needle_xor &&
+		    memcmp(window, needle, needlesize - 1U) == 0) {
+			return deconst(window);
+		}
+		h++;
+	}
+	return NULL;
+}
+
+/*
+ * Parse an unsigned decimal number from STR that must lie within
+ * [LLIM, ULIM].  At most as many digits are read as ULIM has (but at
+ * least two), and reading stops early once another digit would be
+ * bound to exceed ULIM.
+ *
+ * *EP is set to the first character that was not read.
+ * Returns the number, -1 if STR does not start with a digit, or -2 if
+ * the number read is outside the limits.
+ */
+static int
+strtoi_lim(const char *str, const char **ep, int llim, int ulim)
+{
+	const char *cur = str;
+	int value = 0;
+	/* loses one decimal digit per digit read, 0 means no digits left */
+	int budget = ulim > 10 ? ulim : 10;
+
+	while (value * 10 <= ulim && budget != 0 &&
+	    *cur >= '0' && *cur <= '9') {
+		value = value * 10 + (*cur - '0');
+		cur++;
+		budget /= 10;
+	}
+
+	*ep = (const char*)cur;
+	if (cur == str) {
+		/* not a single digit */
+		return -1;
+	}
+	if (value < llim || value > ulim) {
+		/* out of range */
+		return -2;
+	}
+	return value;
+}
+
+/*
+ * Convert broken-down UTC time T into seconds since the epoch.
+ *
+ * Uses timegm() or _mkgmtime64() when the build configuration says they
+ * exist.  Otherwise mktime() is called for its side effect of filling in
+ * tm_yday and the result is computed by hand.
+ *
+ * Returns (time_t)-1 if mktime() fails in the fallback branch.
+ */
+static time_t
+time_from_tm(struct tm *t)
+{
+#if HAVE_TIMEGM
+	/* Use platform timegm() if available. */
+	return (timegm(t));
+#elif HAVE__MKGMTIME64
+	return (_mkgmtime64(t));
+#else
+	/* Else use direct calculation using POSIX assumptions. */
+	/* First, fix up tm_yday based on the year/month/day. */
+	if (mktime(t) == (time_t)-1)
+		return ((time_t)-1);
+	/* Then we can compute timegm() from first principles.
+	 * Every term is widened to time_t before multiplying: in plain
+	 * int the year term alone overflows from 2039 onwards. */
+	return ((time_t)t->tm_sec
+	    + (time_t)t->tm_min * 60
+	    + (time_t)t->tm_hour * 3600
+	    + (time_t)t->tm_yday * 86400
+	    + (time_t)(t->tm_year - 70) * 31536000
+	    + (time_t)((t->tm_year - 69) / 4) * 86400
+	    - (time_t)((t->tm_year - 1) / 100) * 86400
+	    + (time_t)((t->tm_year + 299) / 400) * 86400);
+#endif
+}
+
+/*
+ * Like strptime() but strictly for ISO 8601 Zulu strings of the form
+ * YYYY-MM-DDThh:mm:ssZ, with leading blanks and tabs skipped.
+ *
+ * If ENDPTR is not NULL, *ENDPTR is set to where parsing stopped: just
+ * past the 'Z' on success, somewhere inside the string on failure.
+ * Returns the UTC time stamp, or (time_t)-1 if S is not well-formed.
+ */
+static time_t
+xstrpisotime(const char *s, char **endptr)
+{
+	struct tm parts;
+	time_t stamp = (time_t)-1;
+
+	/* make sure the broken-down time is clean */
+	memset(&parts, 0, sizeof(parts));
+
+	/* as a courtesy to our callers, and since this is a non-standard
+	 * routine, we skip leading whitespace */
+	while (*s == ' ' || *s == '\t')
+		++s;
+
+	/* single pass, left early on the first malformed field */
+	do {
+		/* year */
+		parts.tm_year = strtoi_lim(s, &s, 1583, 4095);
+		if (parts.tm_year < 0 || *s++ != '-')
+			break;
+		/* month */
+		parts.tm_mon = strtoi_lim(s, &s, 1, 12);
+		if (parts.tm_mon < 0 || *s++ != '-')
+			break;
+		/* day-of-month */
+		parts.tm_mday = strtoi_lim(s, &s, 1, 31);
+		if (parts.tm_mday < 0 || *s++ != 'T')
+			break;
+		/* hour */
+		parts.tm_hour = strtoi_lim(s, &s, 0, 23);
+		if (parts.tm_hour < 0 || *s++ != ':')
+			break;
+		/* minute */
+		parts.tm_min = strtoi_lim(s, &s, 0, 59);
+		if (parts.tm_min < 0 || *s++ != ':')
+			break;
+		/* second, 60 is accepted as well */
+		parts.tm_sec = strtoi_lim(s, &s, 0, 60);
+		if (parts.tm_sec < 0 || *s++ != 'Z')
+			break;
+
+		/* struct tm counts years from 1900 and months from 0 */
+		parts.tm_year -= 1900;
+		parts.tm_mon--;
+
+		/* now convert to a unix stamp using UTC */
+		stamp = time_from_tm(&parts);
+	} while (0);
+
+	if (endptr != NULL)
+		*endptr = deconst(s);
+	return stamp;
+}
+
+/*
+ * Parse the "WARC/<major>.<minor>" version line at the start of BUF.
+ *
+ * The major version is a single digit, the minor version one or two
+ * digits.  Versions from 0.12 on must be followed by CRLF, older ones
+ * by a space or a tab.
+ *
+ * Returns major * 10000 + minor * 100 (a two digit minor "ab" counts
+ * as a * 1000 + b * 100), or 0 if BUF is shorter than 12 bytes or
+ * does not hold a valid version line.
+ */
+static unsigned int
+_warc_rdver(const char *buf, size_t bsz)
+{
+	static const char magic[] = "WARC/";
+	const size_t magic_len = sizeof(magic) - 1U;
+	const char *num;
+	const char *tail;
+	unsigned int ver;
+	unsigned int extra_digit;
+
+	if (bsz < 12 || memcmp(buf, magic, magic_len) != 0) {
+		/* buffer too small or invalid magic */
+		return 0U;
+	}
+	/* looks good so far, read the version number for a laugh */
+	num = buf + magic_len;
+
+	if (!isdigit((unsigned char)num[0U]) || num[1U] != '.' ||
+	    !isdigit((unsigned char)num[2U])) {
+		/* not of the form <digit>.<digit> */
+		return 0U;
+	}
+
+	/* we support a maximum of 2 digits in the minor version */
+	extra_digit = isdigit((unsigned char)num[3U]) ? 1U : 0U;
+
+	ver = (num[0U] - '0') * 10000U;
+	if (extra_digit) {
+		ver += (num[2U] - '0') * 1000U;
+		ver += (num[3U] - '0') * 100U;
+	} else {
+		ver += (num[2U] - '0') * 100U;
+	}
+
+	/*
+	 * WARC below version 0.12 has a space-separated header
+	 * WARC 0.12 and above terminates the version with a CRLF
+	 */
+	tail = num + 3U + extra_digit;
+	if (ver >= 1200U) {
+		if (memcmp(tail, "\r\n", 2U) != 0)
+			return 0U;
+	} else if (*tail != ' ' && *tail != '\t') {
+		return 0U;
+	}
+	return ver;
+}
+
+/*
+ * Classify the record by its WARC-Type header within BUF.
+ *
+ * Returns WT_RSRC for "resource", WT_RSP for "response" and WT_NONE
+ * for everything else, including a missing or unterminated header.
+ */
+static unsigned int
+_warc_rdtyp(const char *buf, size_t bsz)
+{
+	static const char key[] = "\r\nWARC-Type:";
+	const size_t key_len = sizeof(key) - 1U;
+	const char *cur;
+	const char *line_end;
+
+	cur = xmemmem(buf, bsz, key, key_len);
+	if (cur == NULL) {
+		/* no such header */
+		return WT_NONE;
+	}
+	cur += key_len;
+
+	line_end = _warc_find_eol(cur, buf + bsz - cur);
+	if (line_end == NULL) {
+		/* no end of line */
+		return WT_NONE;
+	}
+
+	/* overread whitespace */
+	while (cur < line_end && (*cur == ' ' || *cur == '\t'))
+		++cur;
+
+	/* both supported type names happen to be 8 characters long */
+	if (cur + 8U != line_end)
+		return WT_NONE;
+	if (memcmp(cur, "resource", 8U) == 0)
+		return WT_RSRC;
+	if (memcmp(cur, "response", 8U) == 0)
+		return WT_RSP;
+	return WT_NONE;
+}
+
+/*
+ * Extract the path part of the WARC-Target-URI header within BUF.
+ *
+ * For file URIs the result starts right after "://", for http and ftp
+ * URIs right after the first '/' that follows the host part.  The
+ * result points into BUF and extends to the end of the header line.
+ *
+ * Returns {0, NULL} if the header is missing or unterminated, holds no
+ * "://", contains whitespace, or uses any other scheme.
+ */
+static warc_string_t
+_warc_rduri(const char *buf, size_t bsz)
+{
+	static const char key[] = "\r\nWARC-Target-URI:";
+	const size_t key_len = sizeof(key) - 1U;
+	const char *val;
+	const char *path;
+	const char *line_end;
+	const char *scan;
+	warc_string_t res = {0U, NULL};
+
+	val = xmemmem(buf, bsz, key, key_len);
+	if (val == NULL) {
+		/* no such header */
+		return res;
+	}
+	val += key_len;
+
+	line_end = _warc_find_eol(val, buf + bsz - val);
+	if (line_end == NULL) {
+		/* no end of line */
+		return res;
+	}
+
+	/* overread whitespace */
+	while (val < line_end && (*val == ' ' || *val == '\t'))
+		++val;
+
+	/* overread URL designators */
+	path = xmemmem(val, line_end - val, "://", 3U);
+	if (path == NULL) {
+		/* not touching that! */
+		return res;
+	}
+
+	/* spaces inside uri are not allowed, CRLF should follow */
+	scan = val;
+	while (scan < line_end) {
+		if (isspace((unsigned char)*scan))
+			return res;
+		scan++;
+	}
+
+	/* there must be at least space for ftp */
+	if (path < (val + 3U))
+		return res;
+
+	/* step over the :// */
+	path += 3U;
+
+	/* now then, inspect the URI */
+	if (memcmp(val, "file", 4U) == 0) {
+		/* perfect, nothing left to do here */
+		;
+	} else if (memcmp(val, "http", 4U) == 0 ||
+	    memcmp(val, "ftp", 3U) == 0) {
+		/* overread domain, and the first / */
+		while (path < line_end) {
+			if (*path++ == '/')
+				break;
+		}
+	} else {
+		/* not sure what to do? best to bugger off */
+		return res;
+	}
+
+	res.str = path;
+	res.len = line_end - path;
+	return res;
+}
+
+/*
+ * Read the Content-Length header within BUF.
+ *
+ * The value must consist of decimal digits only, optionally preceded
+ * by blanks or tabs, and must extend up to the end of the line.
+ *
+ * Returns the content length, or -1 if the header is missing,
+ * unterminated, malformed or rejected by strtol().
+ *
+ * NOTE(review): the value is parsed into a long; where long is 32 bits
+ * wide, lengths of 2 GiB and more would be rejected -- confirm that is
+ * acceptable.
+ */
+static ssize_t
+_warc_rdlen(const char *buf, size_t bsz)
+{
+	static const char key[] = "\r\nContent-Length:";
+	const size_t key_len = sizeof(key) - 1U;
+	const char *cur;
+	const char *line_end;
+	char *stop = NULL;
+	long int value;
+
+	cur = xmemmem(buf, bsz, key, key_len);
+	if (cur == NULL) {
+		/* no such header */
+		return -1;
+	}
+	cur += key_len;
+
+	line_end = _warc_find_eol(cur, buf + bsz - cur);
+	if (line_end == NULL) {
+		/* no end of line */
+		return -1;
+	}
+
+	/* skip leading whitespace */
+	for (; cur < line_end; cur++) {
+		if (*cur != ' ' && *cur != '\t')
+			break;
+	}
+	/* there must be at least one digit */
+	if (!isdigit((unsigned char)*cur))
+		return -1;
+
+	errno = 0;
+	value = strtol(cur, &stop, 10);
+	if (errno != 0 || stop != line_end) {
+		/* line must end here */
+		return -1;
+	}
+
+	return (size_t)value;
+}
+
+/*
+ * Read the record time from the WARC-Date header within BUF.
+ *
+ * Returns the time stamp, or (time_t)-1 if the header is missing,
+ * unterminated, or its value is not an ISO 8601 Zulu time that extends
+ * up to the end of the line.
+ */
+static time_t
+_warc_rdrtm(const char *buf, size_t bsz)
+{
+	static const char key[] = "\r\nWARC-Date:";
+	const size_t key_len = sizeof(key) - 1U;
+	const char *val;
+	const char *line_end;
+	char *stop = NULL;
+	time_t stamp;
+
+	val = xmemmem(buf, bsz, key, key_len);
+	if (val == NULL) {
+		/* no such header */
+		return (time_t)-1;
+	}
+	val += key_len;
+
+	line_end = _warc_find_eol(val, buf + bsz - val);
+	if (line_end == NULL) {
+		/* no end of line */
+		return (time_t)-1;
+	}
+
+	/* xstrpisotime() kindly overreads whitespace for us, so use that */
+	stamp = xstrpisotime(val, &stop);
+
+	/* line must end right after the time stamp */
+	return (stop == line_end) ? stamp : (time_t)-1;
+}
+
+/*
+ * Read the modification time from the Last-Modified header within BUF.
+ *
+ * Returns the time stamp, or (time_t)-1 if the header is missing,
+ * unterminated, or its value is not an ISO 8601 Zulu time that extends
+ * up to the end of the line.
+ */
+static time_t
+_warc_rdmtm(const char *buf, size_t bsz)
+{
+	static const char key[] = "\r\nLast-Modified:";
+	const size_t key_len = sizeof(key) - 1U;
+	const char *val;
+	const char *line_end;
+	char *stop = NULL;
+	time_t stamp;
+
+	val = xmemmem(buf, bsz, key, key_len);
+	if (val == NULL) {
+		/* no such header */
+		return (time_t)-1;
+	}
+	val += key_len;
+
+	line_end = _warc_find_eol(val, buf + bsz - val);
+	if (line_end == NULL) {
+		/* no end of line */
+		return (time_t)-1;
+	}
+
+	/* xstrpisotime() kindly overreads whitespace for us, so use that */
+	stamp = xstrpisotime(val, &stop);
+
+	/* line must end right after the time stamp */
+	return (stop == line_end) ? stamp : (time_t)-1;
+}
+
+/*
+ * Locate the end of the record header within BUF.
+ * Returns a pointer just past the first CRLF CRLF sequence, or NULL if
+ * there is none.
+ */
+static const char*
+_warc_find_eoh(const char *buf, size_t bsz)
+{
+	static const char marker[] = "\r\n\r\n";
+	const size_t marker_len = sizeof(marker) - 1U;
+	const char *found = xmemmem(buf, bsz, marker, marker_len);
+
+	return (found == NULL) ? NULL : found + marker_len;
+}
+
+/*
+ * Locate the end of the current line within BUF.
+ * Returns a pointer to the first CRLF sequence, or NULL if there is
+ * none.
+ */
+static const char*
+_warc_find_eol(const char *buf, size_t bsz)
+{
+	static const char marker[] = "\r\n";
+	const size_t marker_len = sizeof(marker) - 1U;
+
+	return xmemmem(buf, bsz, marker, marker_len);
+}
+/* archive_read_support_format_warc.c ends here */