diff options
Diffstat (limited to 'src/libs/3rdparty/libarchive/archive_write_set_format_warc.c')
-rw-r--r-- | src/libs/3rdparty/libarchive/archive_write_set_format_warc.c | 444 |
1 files changed, 444 insertions, 0 deletions
diff --git a/src/libs/3rdparty/libarchive/archive_write_set_format_warc.c b/src/libs/3rdparty/libarchive/archive_write_set_format_warc.c new file mode 100644 index 000000000..0ef003e2f --- /dev/null +++ b/src/libs/3rdparty/libarchive/archive_write_set_format_warc.c @@ -0,0 +1,444 @@ +/*- + * Copyright (c) 2014 Sebastian Freundt + * Author: Sebastian Freundt <devel@fresse.org> + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "archive_platform.h" +__FBSDID("$FreeBSD$"); + +#ifdef HAVE_ERRNO_H +#include <errno.h> +#endif +#include <stdio.h> +#ifdef HAVE_STDLIB_H +#include <stdlib.h> +#endif +#ifdef HAVE_STRING_H +#include <string.h> +#endif +#ifdef HAVE_TIME_H +#include <time.h> +#endif + +#include "archive.h" +#include "archive_entry.h" +#include "archive_entry_locale.h" +#include "archive_private.h" +#include "archive_random_private.h" +#include "archive_write_private.h" +#include "archive_write_set_format_private.h" + +struct warc_s { + unsigned int omit_warcinfo:1; + + time_t now; + mode_t typ; + unsigned int rng; + /* populated size */ + uint64_t populz; +}; + +static const char warcinfo[] = + "software: libarchive/" ARCHIVE_VERSION_ONLY_STRING "\r\n" + "format: WARC file version 1.0\r\n"; + +typedef enum { + WT_NONE, + /* warcinfo */ + WT_INFO, + /* metadata */ + WT_META, + /* resource */ + WT_RSRC, + /* request, unsupported */ + WT_REQ, + /* response, unsupported */ + WT_RSP, + /* revisit, unsupported */ + WT_RVIS, + /* conversion, unsupported */ + WT_CONV, + /* continuation, unsupported at the moment */ + WT_CONT, + /* invalid type */ + LAST_WT +} warc_type_t; + +typedef struct { + warc_type_t type; + const char *tgturi; + const char *recid; + time_t rtime; + time_t mtime; + const char *cnttyp; + uint64_t cntlen; +} warc_essential_hdr_t; + +typedef struct { + unsigned int u[4U]; +} warc_uuid_t; + +static int _warc_options(struct archive_write*, const char *key, const char *v); +static int _warc_header(struct archive_write *a, struct archive_entry *entry); +static ssize_t _warc_data(struct archive_write *a, const void *buf, size_t sz); +static int _warc_finish_entry(struct archive_write *a); +static int _warc_close(struct archive_write *a); +static int _warc_free(struct archive_write *a); + +/* private routines */ +static ssize_t _popul_ehdr(struct archive_string *t, size_t z, warc_essential_hdr_t); +static int _gen_uuid(warc_uuid_t *tgt); + + +/* + * Set output format to ISO 28500 (aka WARC) format. + */ +int +archive_write_set_format_warc(struct archive *_a) +{ + struct archive_write *a = (struct archive_write *)_a; + struct warc_s *w; + + archive_check_magic(_a, ARCHIVE_WRITE_MAGIC, + ARCHIVE_STATE_NEW, "archive_write_set_format_warc"); + + /* If another format was already registered, unregister it. */ + if (a->format_free != NULL) { + (a->format_free)(a); + } + + w = malloc(sizeof(*w)); + if (w == NULL) { + archive_set_error(&a->archive, ENOMEM, + "Can't allocate warc data"); + return (ARCHIVE_FATAL); + } + /* by default we're emitting a file wide header */ + w->omit_warcinfo = 0U; + /* obtain current time for date fields */ + w->now = time(NULL); + /* reset file type info */ + w->typ = 0; + /* also initialise our rng */ + w->rng = (unsigned int)w->now; + + a->format_data = w; + a->format_name = "WARC/1.0"; + a->format_options = _warc_options; + a->format_write_header = _warc_header; + a->format_write_data = _warc_data; + a->format_close = _warc_close; + a->format_free = _warc_free; + a->format_finish_entry = _warc_finish_entry; + a->archive.archive_format = ARCHIVE_FORMAT_WARC; + a->archive.archive_format_name = "WARC/1.0"; + return (ARCHIVE_OK); +} + + +/* archive methods */ +static int +_warc_options(struct archive_write *a, const char *key, const char *val) +{ + struct warc_s *w = a->format_data; + + if (strcmp(key, "omit-warcinfo") == 0) { + if (val == NULL || strcmp(val, "true") == 0) { + /* great */ + w->omit_warcinfo = 1U; + return (ARCHIVE_OK); + } + } + + /* Note: The "warn" return is just to inform the options + * supervisor that we didn't handle it. It will generate + * a suitable error if no one used this option. */ + return (ARCHIVE_WARN); +} + +static int +_warc_header(struct archive_write *a, struct archive_entry *entry) +{ + struct warc_s *w = a->format_data; + struct archive_string hdr; +#define MAX_HDR_SIZE 512 + + /* check whether warcinfo record needs outputting */ + if (!w->omit_warcinfo) { + ssize_t r; + warc_essential_hdr_t wi = { + WT_INFO, + /*uri*/NULL, + /*urn*/NULL, + /*rtm*/0, + /*mtm*/0, + /*cty*/"application/warc-fields", + /*len*/sizeof(warcinfo) - 1U, + }; + wi.rtime = w->now; + wi.mtime = w->now; + + archive_string_init(&hdr); + r = _popul_ehdr(&hdr, MAX_HDR_SIZE, wi); + if (r >= 0) { + /* jackpot! */ + /* now also use HDR buffer for the actual warcinfo */ + archive_strncat(&hdr, warcinfo, sizeof(warcinfo) -1); + + /* append end-of-record indicator */ + archive_strncat(&hdr, "\r\n\r\n", 4); + + /* write to output stream */ + __archive_write_output(a, hdr.s, archive_strlen(&hdr)); + } + /* indicate we're done with file header writing */ + w->omit_warcinfo = 1U; + archive_string_free(&hdr); + } + + if (archive_entry_pathname(entry) == NULL) { + archive_set_error(&a->archive, EINVAL, + "Invalid filename"); + return (ARCHIVE_WARN); + } + + w->typ = archive_entry_filetype(entry); + w->populz = 0U; + if (w->typ == AE_IFREG) { + warc_essential_hdr_t rh = { + WT_RSRC, + /*uri*/NULL, + /*urn*/NULL, + /*rtm*/0, + /*mtm*/0, + /*cty*/NULL, + /*len*/0, + }; + ssize_t r; + rh.tgturi = archive_entry_pathname(entry); + rh.rtime = w->now; + rh.mtime = archive_entry_mtime(entry); + rh.cntlen = (size_t)archive_entry_size(entry); + + archive_string_init(&hdr); + r = _popul_ehdr(&hdr, MAX_HDR_SIZE, rh); + if (r < 0) { + /* don't bother */ + archive_set_error( + &a->archive, + ARCHIVE_ERRNO_FILE_FORMAT, + "cannot archive file"); + return (ARCHIVE_WARN); + } + /* otherwise append to output stream */ + __archive_write_output(a, hdr.s, r); + /* and let subsequent calls to _data() know about the size */ + w->populz = rh.cntlen; + archive_string_free(&hdr); + return (ARCHIVE_OK); + } + /* just resort to erroring as per Tim's advice */ + __archive_write_entry_filetype_unsupported( + &a->archive, entry, "WARC"); + return (ARCHIVE_FAILED); +} + +static ssize_t +_warc_data(struct archive_write *a, const void *buf, size_t len) +{ + struct warc_s *w = a->format_data; + + if (w->typ == AE_IFREG) { + int rc; + + /* never write more bytes than announced */ + if (len > w->populz) { + len = (size_t)w->populz; + } + + /* now then, out we put the whole shebang */ + rc = __archive_write_output(a, buf, len); + if (rc != ARCHIVE_OK) { + return rc; + } + } + return len; +} + +static int +_warc_finish_entry(struct archive_write *a) +{ + static const char _eor[] = "\r\n\r\n"; + struct warc_s *w = a->format_data; + + if (w->typ == AE_IFREG) { + int rc = __archive_write_output(a, _eor, sizeof(_eor) - 1U); + + if (rc != ARCHIVE_OK) { + return rc; + } + } + /* reset type info */ + w->typ = 0; + return (ARCHIVE_OK); +} + +static int +_warc_close(struct archive_write *a) +{ + (void)a; /* UNUSED */ + return (ARCHIVE_OK); +} + +static int +_warc_free(struct archive_write *a) +{ + struct warc_s *w = a->format_data; + + free(w); + a->format_data = NULL; + return (ARCHIVE_OK); +} + + +/* private routines */ +static void +xstrftime(struct archive_string *as, const char *fmt, time_t t) +{ +/** like strftime(3) but for time_t objects */ + struct tm *rt; +#if defined(HAVE_GMTIME_R) || defined(HAVE_GMTIME_S) + struct tm timeHere; +#endif + char strtime[100]; + size_t len; + +#if defined(HAVE_GMTIME_S) + rt = gmtime_s(&timeHere, &t) ? NULL : &timeHere; +#elif defined(HAVE_GMTIME_R) + rt = gmtime_r(&t, &timeHere); +#else + rt = gmtime(&t); +#endif + if (!rt) + return; + /* leave the hard yacker to our role model strftime() */ + len = strftime(strtime, sizeof(strtime)-1, fmt, rt); + archive_strncat(as, strtime, len); +} + +static ssize_t +_popul_ehdr(struct archive_string *tgt, size_t tsz, warc_essential_hdr_t hdr) +{ + static const char _ver[] = "WARC/1.0\r\n"; + static const char * const _typ[LAST_WT] = { + NULL, "warcinfo", "metadata", "resource", NULL + }; + char std_uuid[48U]; + + if (hdr.type == WT_NONE || hdr.type > WT_RSRC) { + /* brilliant, how exactly did we get here? */ + return -1; + } + + archive_strcpy(tgt, _ver); + + archive_string_sprintf(tgt, "WARC-Type: %s\r\n", _typ[hdr.type]); + + if (hdr.tgturi != NULL) { + /* check if there's a xyz:// */ + static const char _uri[] = ""; + static const char _fil[] = "file://"; + const char *u; + char *chk = strchr(hdr.tgturi, ':'); + + if (chk != NULL && chk[1U] == '/' && chk[2U] == '/') { + /* yep, it's definitely a URI */ + u = _uri; + } else { + /* hm, best to prepend file:// then */ + u = _fil; + } + archive_string_sprintf(tgt, + "WARC-Target-URI: %s%s\r\n", u, hdr.tgturi); + } + + /* record time is usually when the http is sent off, + * just treat the archive writing as such for a moment */ + xstrftime(tgt, "WARC-Date: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.rtime); + + /* while we're at it, record the mtime */ + xstrftime(tgt, "Last-Modified: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.mtime); + + if (hdr.recid == NULL) { + /* generate one, grrrr */ + warc_uuid_t u; + + _gen_uuid(&u); + /* Unfortunately, archive_string_sprintf does not + * handle the minimum number following '%'. + * So we have to use snprintf function here instead + * of archive_string_snprintf function. */ +#if defined(_WIN32) && !defined(__CYGWIN__) && !( defined(_MSC_VER) && _MSC_VER >= 1900) +#define snprintf _snprintf +#endif + snprintf( + std_uuid, sizeof(std_uuid), + "<urn:uuid:%08x-%04x-%04x-%04x-%04x%08x>", + u.u[0U], + u.u[1U] >> 16U, u.u[1U] & 0xffffU, + u.u[2U] >> 16U, u.u[2U] & 0xffffU, + u.u[3U]); + hdr.recid = std_uuid; + } + + /* record-id is mandatory, fingers crossed we won't fail */ + archive_string_sprintf(tgt, "WARC-Record-ID: %s\r\n", hdr.recid); + + if (hdr.cnttyp != NULL) { + archive_string_sprintf(tgt, "Content-Type: %s\r\n", hdr.cnttyp); + } + + /* next one is mandatory */ + archive_string_sprintf(tgt, "Content-Length: %ju\r\n", (uintmax_t)hdr.cntlen); + /**/ + archive_strncat(tgt, "\r\n", 2); + + return (archive_strlen(tgt) >= tsz)? -1: (ssize_t)archive_strlen(tgt); +} + +static int +_gen_uuid(warc_uuid_t *tgt) +{ + archive_random(tgt->u, sizeof(tgt->u)); + /* obey uuid version 4 rules */ + tgt->u[1U] &= 0xffff0fffU; + tgt->u[1U] |= 0x4000U; + tgt->u[2U] &= 0x3fffffffU; + tgt->u[2U] |= 0x80000000U; + return 0; +} + +/* archive_write_set_format_warc.c ends here */ |