summary | refs | log | tree | commit | diff | stats
path: root/src/3rdparty/btree/src/btree.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/3rdparty/btree/src/btree.cpp')
-rw-r--r-- src/3rdparty/btree/src/btree.cpp 4181
1 files changed, 0 insertions, 4181 deletions
diff --git a/src/3rdparty/btree/src/btree.cpp b/src/3rdparty/btree/src/btree.cpp
deleted file mode 100644
index 5e7aec1..0000000
--- a/src/3rdparty/btree/src/btree.cpp
+++ /dev/null
@@ -1,4181 +0,0 @@
-/* $OpenBSD: btree.c,v 1.30 2010/09/01 12:13:21 martinh Exp $ */
-
-/*
- * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include "btree.h"
-#include "btree_p.h"
-
-//#define ENABLE_BIG_KEYS
-
-#ifdef ENABLE_BIG_KEYS
-#warning "Big keys may cause unforseen circumstances. Avoid for now."
-#endif
-
-/* DEBUG is forcibly undefined here, so in this file DPRINTF always takes the
- * empty "#else" expansion below. */
-#undef DEBUG
-
-/* DPRINTF(fmt, ...): printf-style trace written to stderr, prefixed with
- * "<function>:<line>: " and terminated with a newline. Expands to an empty
- * statement when DEBUG is not defined. */
-#ifdef DEBUG
-# define DPRINTF(...) do { fprintf(stderr, "%s:%d: ", __func__, __LINE__); \
- fprintf(stderr, __VA_ARGS__); \
- fprintf(stderr, "\n"); } while (0)
-#else
-# define DPRINTF(...) do { } while (0)
-#endif
-
-/* EPRINTF(fmt, ...): same output format as DPRINTF but enabled by default;
- * define NO_ERROR_MESSAGES to compile error messages out. */
-#ifndef NO_ERROR_MESSAGES
-# define EPRINTF(...) do { fprintf(stderr, "%s:%d: ", __func__, __LINE__); \
- fprintf(stderr, __VA_ARGS__); \
- fprintf(stderr, "\n"); } while (0)
-#else
-# define EPRINTF(...) do { } while (0)
-#endif
-
-static struct mpage *mpage_lookup(struct btree *bt, pgno_t pgno);
-static void mpage_add(struct btree *bt, struct mpage *mp);
-static void mpage_free(struct mpage *mp);
-static void mpage_del(struct btree *bt, struct mpage *mp);
-static void mpage_flush(struct btree *bt);
-static struct mpage *mpage_copy(struct btree *bt, struct mpage *mp);
-static void mpage_prune(struct btree *bt);
-static void mpage_dirty(struct btree *bt, struct mpage *mp);
-static struct mpage *mpage_touch(struct btree *bt, struct mpage *mp);
-static int mpage_cmp(struct mpage *a, struct mpage *b);
-
-RB_PROTOTYPE(page_cache, mpage, entry, mpage_cmp);
-RB_GENERATE(page_cache, mpage, entry, mpage_cmp);
-
-static int btree_read_page(struct btree *bt, pgno_t pgno,
- struct page *page);
-static struct mpage *btree_get_mpage(struct btree *bt, pgno_t pgno);
-/* Search mode passed as the searchType argument of btree_search_page_root()
- * and btree_search_page().
- * NOTE(review): judging by the names, SearchKey looks up the supplied key
- * while SearchFirst/SearchLast position on the first/last entry -- confirm
- * against the search routines, which are not visible here. */
-enum SearchType {
- SearchKey=0,
- SearchFirst=1,
- SearchLast=2,
-};
-static int btree_search_page_root(struct btree *bt,
- struct mpage *root, struct btval *key,
- struct cursor *cursor, enum SearchType searchType, int modify,
- struct mpage **mpp);
-static int btree_search_page(struct btree *bt,
- struct btree_txn *txn, struct btval *key,
- struct cursor *cursor, enum SearchType searchType, int modify,
- struct mpage **mpp);
-
-static int btree_write_header(struct btree *bt, int fd);
-static int btree_read_header(struct btree *bt);
-static int btree_is_meta_page(struct btree *bt, struct page *p);
-static int btree_read_meta(struct btree *bt, pgno_t *p_next);
-static int btree_read_meta_with_tag(struct btree *bt, unsigned int tag, pgno_t *p_root);
-static int btree_write_meta(struct btree *bt, pgno_t root,
- unsigned int flags, uint32_t tag);
-static void btree_ref(struct btree *bt);
-
-static struct node *btree_search_node(struct btree *bt, struct mpage *mp,
- struct btval *key, int *exactp, unsigned int *kip);
-static int btree_add_node(struct btree *bt, struct mpage *mp,
- indx_t indx, struct btval *key, struct btval *data,
- pgno_t pgno, uint8_t flags);
-static void btree_del_node(struct btree *bt, struct mpage *mp,
- indx_t indx);
-static int btree_read_data(struct btree *bt, struct mpage *mp,
- struct node *leaf, struct btval *data);
-
-static int btree_rebalance(struct btree *bt, struct mpage *mp);
-static int btree_update_key(struct btree *bt, struct mpage *mp,
- indx_t indx, struct btval *key);
-static int btree_adjust_prefix(struct btree *bt,
- struct mpage *src, int delta);
-static int btree_move_node(struct btree *bt, struct mpage *src,
- indx_t srcindx, struct mpage *dst, indx_t dstindx);
-static int btree_merge(struct btree *bt, struct mpage *src,
- struct mpage *dst);
-static int btree_split(struct btree *bt, struct mpage **mpp,
- unsigned int *newindxp, struct btval *newkey,
- struct btval *newdata, pgno_t newpgno);
-static struct mpage *btree_new_page(struct btree *bt, uint32_t flags);
-static int btree_write_overflow_data(struct btree *bt,
- struct page *p, struct btval *data);
-
-static void cursor_pop_page(struct cursor *cursor);
-static struct ppage *cursor_push_page(struct cursor *cursor,
- struct mpage *mp);
-
-static int bt_set_key(struct btree *bt, struct mpage *mp,
- struct node *node, struct btval *key);
-static int btree_sibling(struct cursor *cursor, int move_right, int rightmost);
-static int btree_cursor_next(struct cursor *cursor,
- struct btval *key, struct btval *data);
-static int btree_cursor_prev(struct cursor *cursor,
- struct btval *key, struct btval *data);
-static int btree_cursor_set(struct cursor *cursor,
- struct btval *key, struct btval *data, int *exactp);
-static int btree_cursor_first(struct cursor *cursor,
- struct btval *key, struct btval *data);
-
-static void bt_reduce_separator(struct btree *bt, struct node *min,
- struct btval *sep);
-static void remove_prefix(struct btree *bt, struct btval *key,
- size_t pfxlen);
-static void expand_prefix(struct btree *bt, struct mpage *mp,
- indx_t indx, struct btkey *expkey);
-static void concat_prefix(struct btree *bt, char *pfxstr, size_t pfxlen,
- char *keystr, size_t keylen,char *dest, size_t *size);
-static void common_prefix(struct btree *bt, struct btkey *min,
- struct btkey *max, struct btkey *pfx);
-static void find_common_prefix(struct btree *bt, struct mpage *mp);
-
-static size_t bt_leaf_size(struct btree *bt, struct mpage *mp, struct btval *key,
- struct btval *data);
-static int bt_is_overflow(struct btree *bt, struct mpage *mp, size_t ksize,
- size_t dsize);
-static size_t bt_branch_size(struct btree *bt, struct btval *key);
-
-static pgno_t btree_compact_tree(struct btree *bt, pgno_t pgno,
- struct btree *btc);
-
-static int memncmp(const void *s1, size_t n1,
- const void *s2, size_t n2, void *);
-static int memnrcmp(const void *s1, size_t n1,
- const void *s2, size_t n2, void *);
-
-static uint32_t calculate_crc32(const char *begin, const char *end);
-static uint32_t calculate_checksum(struct btree *bt, const struct page *p);
-static int verify_checksum(struct btree *bt, const struct page *page);
-
-struct btree *btree_open_empty_copy(struct btree *bt);
-int btree_clear(btree **bt);
-int btree_replace(struct btree *bt, struct btree *btw);
-
-/*
- * Checksum primitive over the byte range [begin, end).
- *
- * This is not a textbook CRC-32: the first four bytes of the range are loaded
- * as one native-endian 32-bit word whose complement seeds the register, and
- * every following byte is folded in through the reflected CRC-32 table. The
- * return value is the complement of the final register.
- *
- * Returns 0 when the range holds fewer than four bytes.
- *
- * The seed word is fetched with memcpy() instead of dereferencing a casted
- * uint32_t pointer, so an unaligned `begin` is safe and no aliasing rule is
- * broken. Processing the remaining bytes in one loop yields exactly the same
- * values as the former word-at-a-time loop plus tail loop.
- */
-static uint32_t
-calculate_crc32(const char *begin, const char *end)
-{
-    /* code derived from 32-bit CRC calculation by Gary S. Brown - Copyright (C) 1986. */
-    static const uint32_t crctable[256] = {
-        0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
-        0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
-        0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
-        0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
-        0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
-        0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
-        0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
-        0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
-        0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
-        0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
-        0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
-        0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
-        0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
-        0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
-        0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
-        0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
-        0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
-        0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
-        0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
-        0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
-        0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
-        0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
-        0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
-        0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
-        0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
-        0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
-        0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
-        0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
-        0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
-        0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
-        0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
-        0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
-    };
-    uint32_t crc;
-    const char *p;
-
-    /* Need at least one full seed word (also covers end < begin). */
-    if (end - begin < (ptrdiff_t)sizeof(crc))
-        return 0;
-
-    /* Seed: complement of the first word, read without alignment demands. */
-    memcpy(&crc, begin, sizeof(crc));
-    crc = ~crc;
-
-    /* Fold in every remaining byte; only the low 8 bits select the entry. */
-    for (p = begin + sizeof(crc); p != end; p++)
-        crc = (crc >> 8) ^ crctable[(crc ^ (unsigned char)*p) & 0x000000ff];
-
-    return ~crc;
-}
-
-/*
- * Compute the checksum for page `p` of tree `bt`.
- *
- * The covered range always starts immediately after the page's own checksum
- * field; how far it extends depends on the page type found in p->flags:
- *   P_HEAD            up to the end of the embedded struct bt_head
- *   P_META            up to the end of the embedded struct bt_meta
- *   P_BRANCH / P_LEAF header up to `lower`, XORed with `upper` up to the
- *                     page end (the gap between lower and upper is skipped)
- *   P_OVERFLOW        the whole rest of the page
- *
- * Returns 0 when BT_NOPGCHECKSUM is set on the tree or the page type is not
- * recognised (the latter is reported through EPRINTF).
- */
-static uint32_t
-calculate_checksum(struct btree *bt, const struct page *p)
-{
-    assert(p && bt);
-
-    if (F_ISSET(bt->flags, BT_NOPGCHECKSUM))
-        return 0;
-
-    const char *base = (const char *)p;
-    const char *page_end = base + bt->head.psize;
-    const uint32_t skip = offsetof(page, checksum) + sizeof(p->checksum);
-    const char *covered = base + skip;
-
-    DPRINTF("calculating checksum for page %u, flags %x", p->pgno, p->flags);
-
-    if (F_ISSET(p->flags, P_HEAD))
-        return calculate_crc32(covered, base + PAGEHDRSZ + sizeof(struct bt_head));
-
-    if (F_ISSET(p->flags, P_META))
-        return calculate_crc32(covered, base + PAGEHDRSZ + sizeof(struct bt_meta));
-
-    if (F_ISSET(p->flags, P_BRANCH) || F_ISSET(p->flags, P_LEAF)) {
-        /* Clamp the bounds so a corrupt page cannot push us off the page. */
-        indx_t lo = MAX(PAGEHDRSZ, p->lower);
-        indx_t hi = MIN(bt->head.psize, p->upper);
-        if (lo > hi)
-            lo = hi;
-        uint32_t head_sum = calculate_crc32(covered, base + lo);
-        uint32_t tail_sum = calculate_crc32(base + hi, page_end);
-        return head_sum ^ tail_sum;
-    }
-
-    if (F_ISSET(p->flags, P_OVERFLOW))
-        return calculate_crc32(covered, page_end);
-
-    EPRINTF("unknown page type, flags = %x", p->flags);
-    return 0;
-}
-
-/*
- * Recompute the checksum of page `p` and compare it with the value stored in
- * the page header. Returns BT_SUCCESS on a match (or when the tree has
- * BT_NOPGCHECKSUM set), BT_FAIL otherwise.
- */
-static int
-verify_checksum(struct btree *bt, const struct page *p)
-{
-    assert(bt && p);
-
-    if (F_ISSET(bt->flags, BT_NOPGCHECKSUM))
-        return BT_SUCCESS;
-
-    DPRINTF("verifying checksum for page %u", p->pgno);
-
-    const uint32_t computed = calculate_checksum(bt, p);
-    if (computed == p->checksum)
-        return BT_SUCCESS;
-
-    DPRINTF("checksum for page %u doesn't match: expected %x got %x", p->pgno, p->checksum, computed);
-    return BT_FAIL;
-}
-
-/*
- * Compare two byte strings of possibly different lengths, front to back.
- * The shared leading part decides; if it is identical the shorter string
- * sorts first (-1 / 1), and equal lengths compare equal. The trailing
- * context pointer is ignored.
- */
-static int
-memncmp(const void *s1, size_t n1, const void *s2, size_t n2, void *)
-{
-    const size_t shared = (n1 < n2) ? n1 : n2;
-    const int diff = memcmp(s1, s2, shared);
-
-    if (diff != 0)
-        return diff;
-    if (n1 == n2)
-        return 0;
-    return (n1 < n2) ? -1 : 1;
-}
-
-/*
- * Compare two byte strings back to front (last byte first), as used for
- * reverse-key trees.
- *
- * Returns the difference of the first differing byte pair (as unsigned
- * chars). If one string is exhausted before a difference is found, the
- * shorter one sorts first: -1 if s1 is shorter, 1 if s2 is shorter, 0 if
- * both have the same length. The trailing context pointer is ignored.
- *
- * Lengths are tracked with indices instead of comparing the two cursors with
- * each other: the previous `p1 == p2` test compared pointers into different
- * buffers and wrongly reported equality when s2 was an overlapping suffix of
- * s1 (for example s2 == s1 + 1).
- */
-static int
-memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2, void *)
-{
-    const unsigned char *b1 = (const unsigned char *)s1;
-    const unsigned char *b2 = (const unsigned char *)s2;
-
-    while (n1 > 0 && n2 > 0) {
-        const unsigned char c1 = b1[--n1];
-        const unsigned char c2 = b2[--n2];
-        if (c1 != c2)
-            return c1 - c2;
-    }
-
-    /* At least one side ran out with every compared byte equal. */
-    if (n1 == 0)
-        return n2 == 0 ? 0 : -1;
-    return 1;
-}
-
-/*
- * Install a custom key comparison callback on `bt`, together with the opaque
- * `context` pointer that is handed back to the callback on every call.
- */
-void
-btree_set_cmp(struct btree *bt, bt_cmp_func cmp, void *context)
-{
-    bt->context = context;
-    bt->cmp = cmp;
-}
-
-/*
- * Compare values `a` and `b` with the tree's installed comparison callback
- * and return its result. bt->cmp is called unconditionally.
- */
-int
-btree_cmp(struct btree *bt, const struct btval *a, const struct btval *b)
-{
-    const char *lhs = (const char *)a->data;
-    const char *rhs = (const char *)b->data;
-
-    return bt->cmp(lhs, a->size, rhs, b->size, bt->context);
-}
-
-/*
- * Store in `pfx` the longest run of bytes shared by keys `min` and `max`.
- *
- * For BT_REVERSEKEY trees the shared run is taken from the end of the keys,
- * otherwise from the start. The result is empty (pfx->len = 0) when either
- * key is empty or when the tree uses a custom comparison function.
- */
-static void
-common_prefix(struct btree *bt, struct btkey *min, struct btkey *max,
-    struct btkey *pfx)
-{
-    size_t matched = 0;
-    size_t limit;
-    const char *shared;
-
-    if (bt->cmp || min->len == 0 || max->len == 0) {
-        pfx->len = 0;
-        return;
-    }
-
-    /* No more bytes can match than the shorter key holds. */
-    limit = (min->len < max->len) ? min->len : max->len;
-
-    if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
-        /* Walk both keys backwards from their last byte. */
-        while (matched < limit &&
-            min->str[min->len - 1 - matched] == max->str[max->len - 1 - matched])
-            matched++;
-        shared = max->str + max->len - matched;
-    } else {
-        while (matched < limit && min->str[matched] == max->str[matched])
-            matched++;
-        shared = max->str;
-    }
-
-    assert(matched <= (int)sizeof(pfx->str));
-    pfx->len = matched;
-    bcopy(shared, pfx->str, matched);
-}
-
-/*
- * Strip `pfxlen` bytes of page prefix from `key` in place.
- *
- * Does nothing when pfxlen is zero or the tree has a custom comparison
- * function. The key size always shrinks; the data pointer only advances for
- * normal trees -- with BT_REVERSEKEY it is left where it was.
- */
-static void
-remove_prefix(struct btree *bt, struct btval *key, size_t pfxlen)
-{
-    assert(bt);
-
-    if (bt->cmp != NULL || pfxlen == 0)
-        return;
-
-    DPRINTF("removing %zu bytes of prefix from key [%.*s]", pfxlen,
-        (int)key->size, (char *)key->data);
-    assert(pfxlen <= key->size);
-
-    key->size -= pfxlen;
-    if (F_ISSET(bt->flags, BT_REVERSEKEY))
-        return;
-    key->data = (char *)key->data + pfxlen;
-}
-
-/*
- * Rebuild the full key of node `indx` on page `mp` into `expkey` by joining
- * the page prefix with the key bytes stored in the node. The node's key
- * length is cut down so that prefix plus key does not exceed MAXPFXSIZE.
- */
-static void
-expand_prefix(struct btree *bt, struct mpage *mp, indx_t indx,
-    struct btkey *expkey)
-{
-    struct node *node = NODEPTR(mp, indx);
-    size_t keylen;
-
-    if ((node->ksize + mp->prefix.len) > MAXPFXSIZE)
-        keylen = MAXPFXSIZE - mp->prefix.len;
-    else
-        keylen = node->ksize;
-
-    /* concat_prefix receives the buffer capacity through expkey->len. */
-    expkey->len = sizeof(expkey->str);
-    concat_prefix(bt, mp->prefix.str, mp->prefix.len,
-        NODEKEY(node), keylen, expkey->str, &expkey->len);
-}
-
-/*
- * Compare `key1` with `key2`.
- *
- * With a custom comparison function installed, both keys are passed to it
- * unchanged. Otherwise the first pfx->len bytes of key1 are left out of the
- * comparison: for reverse-key trees by shortening key1, for normal trees by
- * also skipping over them at the front.
- */
-static int
-bt_cmp(struct btree *bt, const struct btval *key1, const struct btval *key2,
-    struct btkey *pfx)
-{
-    if (bt->cmp)
-        return bt->cmp((const char*)key1->data, key1->size, (const char*)key2->data, key2->size, bt->context);
-
-    if (F_ISSET(bt->flags, BT_REVERSEKEY))
-        return memnrcmp(key1->data, key1->size - pfx->len,
-            key2->data, key2->size, 0);
-
-    return memncmp((char *)key1->data + pfx->len, key1->size - pfx->len,
-        key2->data, key2->size, 0);
-}
-
-/*
- * Release whatever `btv` holds and zero it: drops one reference on the
- * backing page (if any), frees the data buffer when btv owns it, then clears
- * the whole structure. A NULL `btv` is ignored.
- */
-void
-btval_reset(struct btval *btv)
-{
-    if (!btv)
-        return;
-
-    if (btv->mp)
-        btv->mp->ref--;
-
-    if (btv->free_data) {
-        assert(btv->data);
-        free(btv->data);
-    }
-
-    bzero(btv, sizeof(*btv));
-}
-
-/* Take one more reference on the page backing `btv`; returns the new count. */
-int btval_ref(struct btval *btv)
-{
-    assert(btv);
-    assert(btv->mp);
-
-    btv->mp->ref += 1;
-    return btv->mp->ref;
-}
-
-/* Drop one reference on the page backing `btv`; returns the new count. */
-int btval_deref(struct btval *btv)
-{
-    assert(btv);
-    assert(btv->mp);
-
-    btv->mp->ref -= 1;
-    return btv->mp->ref;
-}
-
-/* Ordering function for the page cache tree: orders pages by page number. */
-static int
-mpage_cmp(struct mpage *a, struct mpage *b)
-{
-    /* (greater) - (less) yields 1, -1 or 0 without any subtraction overflow. */
-    return (a->pgno > b->pgno) - (a->pgno < b->pgno);
-}
-
-/*
- * Look up page `pgno` in the page cache. On a hit the hit counter is bumped
- * and the page is moved to the tail of the LRU queue. Returns NULL on a miss.
- */
-static struct mpage *
-mpage_lookup(struct btree *bt, pgno_t pgno)
-{
-    struct mpage probe;     /* only pgno is consulted by mpage_cmp */
-    struct mpage *hit;
-
-    probe.pgno = pgno;
-    hit = RB_FIND(page_cache, bt->page_cache, &probe);
-    if (!hit)
-        return NULL;
-
-    bt->stat.hits++;
-    /* Update LRU queue. Move page to the end. */
-    TAILQ_REMOVE(bt->lru_queue, hit, lru_next);
-    TAILQ_INSERT_TAIL(bt->lru_queue, hit, lru_next);
-    return hit;
-}
-
-/*
- * Insert `mp` into the page cache and append it to the LRU queue.
- *
- * The page number must not already be cached; this is asserted. The insert
- * itself is performed outside of assert() so that it still happens when the
- * file is compiled with NDEBUG (assert() then discards its argument).
- */
-static void
-mpage_add(struct btree *bt, struct mpage *mp)
-{
-    struct mpage *clash = RB_INSERT(page_cache, bt->page_cache, mp);
-
-    assert(clash == NULL);
-    (void)clash;    /* silence "unused" when assert() compiles away */
-
-    DPRINTF("mpage_add: mp=%p pgno=%d", mp, mp->pgno);
-    bt->stat.cache_size++;
-    TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next);
-}
-
-/* Free a memory page and the page buffer it owns. NULL is accepted. */
-static void
-mpage_free(struct mpage *mp)
-{
-    if (mp == NULL)
-        return;
-
-    free(mp->page);
-    free(mp);
-}
-
-/*
- * Remove `mp` from the page cache and from the LRU queue. The page itself is
- * not freed.
- *
- * The removal is performed outside of assert() so that it still happens when
- * the file is compiled with NDEBUG (assert() then discards its argument).
- */
-static void
-mpage_del(struct btree *bt, struct mpage *mp)
-{
-    struct mpage *removed = RB_REMOVE(page_cache, bt->page_cache, mp);
-
-    assert(removed == mp);
-    (void)removed;  /* silence "unused" when assert() compiles away */
-
-    DPRINTF("mpage_del: mp=%p pgno=%d", mp, mp->pgno);
-    assert(bt->stat.cache_size > 0);
-    bt->stat.cache_size--;
-    TAILQ_REMOVE(bt->lru_queue, mp, lru_next);
-}
-
-/* Empty the page cache: remove and free every cached page. */
-static void
-mpage_flush(struct btree *bt)
-{
-    for (;;) {
-        struct mpage *lowest = RB_MIN(page_cache, bt->page_cache);
-        if (lowest == NULL)
-            break;
-        mpage_del(bt, lowest);
-        mpage_free(lowest);
-    }
-}
-
-/*
- * Make a private duplicate of `mp`: page contents, prefix, parent link and
- * page number are copied; every other field of the duplicate starts out
- * zeroed (calloc). Returns NULL if memory cannot be allocated.
- */
-static struct mpage *
-mpage_copy(struct btree *bt, struct mpage *mp)
-{
-    struct mpage *clone;
-
-    clone = (mpage *)calloc(1, sizeof(*clone));
-    if (clone == NULL)
-        return NULL;
-
-    clone->page = (page *)malloc(bt->head.psize);
-    if (clone->page == NULL) {
-        free(clone);
-        return NULL;
-    }
-
-    bcopy(mp->page, clone->page, bt->head.psize);
-    bcopy(&mp->prefix, &clone->prefix, sizeof(mp->prefix));
-    clone->pgno = mp->pgno;
-    clone->parent = mp->parent;
-    clone->parent_index = mp->parent_index;
-
-    return clone;
-}
-
-/* Walk the LRU queue from its oldest entry and evict pages until the cache
- * is no larger than stat.max_cache. Dirty pages and pages that still have
- * references (cursors, returned key/data) are skipped.
- */
-static void
-mpage_prune(struct btree *bt)
-{
-    struct mpage *victim = TAILQ_FIRST(bt->lru_queue);
-
-    while (victim != NULL && bt->stat.cache_size > bt->stat.max_cache) {
-        /* Fetch the successor first: the victim may be freed below. */
-        struct mpage *following = TAILQ_NEXT(victim, lru_next);
-
-        if (!victim->dirty && victim->ref <= 0) {
-            mpage_del(bt, victim);
-            mpage_free(victim);
-        }
-        victim = following;
-    }
-}
-
-/* Flag `mp` as dirty and append it to the write transaction's dirty queue.
- * A page that is already dirty is left alone. Requires an open write
- * transaction on `bt`.
- */
-static void
-mpage_dirty(struct btree *bt, struct mpage *mp)
-{
-    assert(bt != NULL);
-    assert(bt->txn != NULL);
-
-    if (mp->dirty)
-        return;
-
-    mp->dirty = 1;
-    SIMPLEQ_INSERT_TAIL(bt->txn->dirty_queue, mp, next);
-}
-
-/* Prepare a page for modification inside the current write transaction.
- *
- * A page that is already dirty is returned unchanged. Otherwise the page is
- * given the transaction's next free page number, marked dirty and (re)added
- * to the cache under that number. If the page is still referenced, a copy is
- * made and touched instead of the original. When the page has a parent, the
- * parent's node is updated to point at the new page number.
- *
- * Returns the page to modify, or NULL if the copy could not be allocated.
- */
-static struct mpage *
-mpage_touch(struct btree *bt, struct mpage *mp)
-{
-    assert(bt != NULL);
-    assert(bt->txn != NULL);
-    assert(mp != NULL);
-
-    if (mp->dirty)
-        return mp;
-
-    DPRINTF("touching page %u -> %u", mp->pgno, bt->txn->next_pgno);
-    if (mp->ref != 0) {
-        /* Someone still holds the old version: work on a copy. */
-        mp = mpage_copy(bt, mp);
-        if (mp == NULL)
-            return NULL;
-    } else {
-        mpage_del(bt, mp);
-    }
-
-    mp->pgno = mp->page->pgno = bt->txn->next_pgno++;
-    mpage_dirty(bt, mp);
-    mpage_add(bt, mp);
-
-    /* Update the page number to new touched page. */
-    if (mp->parent != NULL)
-        NODEPGNO(NODEPTR(mp->parent, mp->parent_index)) = mp->pgno;
-
-    return mp;
-}
-
-/*
- * Read page `pgno` from the tree's file into `page` and validate it.
- *
- * Returns BT_SUCCESS, or BT_FAIL with errno set:
- *   ENOENT  the read returned 0 bytes (page does not exist)
- *   EINVAL  short read, page number in the page header differs from `pgno`,
- *           or the checksum does not verify
- *   other   whatever pread() reported
- *
- * The short-read message goes through EPRINTF like every other error in this
- * file (so NO_ERROR_MESSAGES silences it), and errno is captured before any
- * logging so that stdio cannot overwrite the value the caller will inspect.
- */
-static int
-btree_read_page(struct btree *bt, pgno_t pgno, struct page *page)
-{
-    ssize_t rc;
-    int saved_errno;
-
-    DPRINTF("reading page %u", pgno);
-    bt->stat.reads++;
-    rc = pread(bt->fd, page, bt->head.psize, (off_t)pgno*bt->head.psize);
-    if (rc == 0) {
-        DPRINTF("page %u doesn't exist", pgno);
-        errno = ENOENT;
-        return BT_FAIL;
-    }
-    if (rc != (ssize_t)bt->head.psize) {
-        /* rc > 0: partial page; rc < 0: pread() failed and set errno. */
-        saved_errno = (rc > 0) ? EINVAL : errno;
-        EPRINTF("short pread rc=%zd psize=%u", rc, (unsigned int)bt->head.psize);
-        DPRINTF("read: %s", strerror(saved_errno));
-        errno = saved_errno;
-        return BT_FAIL;
-    }
-
-    if (page->pgno != pgno) {
-        EPRINTF("page numbers don't match: %u != %u", pgno, page->pgno);
-        errno = EINVAL;
-        return BT_FAIL;
-    }
-
-    if (verify_checksum(bt, page) != 0) {
-        EPRINTF("checksum error for page %u", pgno);
-        errno = EINVAL;
-        return BT_FAIL;
-    }
-
-    DPRINTF("page %u has flags 0x%X", pgno, page->flags);
-
-    return BT_SUCCESS;
-}
-
-/*
- * Flush the tree to stable storage.
- *
- * Without BT_NOSYNC this is a plain fsync() whose result is returned.
- * With BT_NOSYNC, and only if BT_USEMARKER is set and the current meta page
- * does not carry BT_MARKER yet, the file is fsync()ed and a copy of the
- * current meta page is written with BT_MARKER (plus BT_TOMBSTONE if the
- * current meta has it). That path fails if a write transaction is open.
- *
- * Returns 0 / BT_SUCCESS on success, BT_FAIL (or fsync()'s result) on error.
- */
-int
-btree_sync(struct btree *bt)
-{
-    unsigned int marker_flags;
-
-    if (!F_ISSET(bt->flags, BT_NOSYNC))
-        return fsync(bt->fd);
-
-    /* Nothing to do unless markers are wanted and the last meta lacks one. */
-    if (!F_ISSET(bt->flags, BT_USEMARKER) || F_ISSET(bt->meta.flags, BT_MARKER))
-        return 0;
-
-    marker_flags = BT_MARKER;
-    /* If we're closing a dead btree then add the tombstone flag */
-    if (F_ISSET(bt->meta.flags, BT_TOMBSTONE))
-        marker_flags |= BT_TOMBSTONE;
-
-    /* we want to use marker and the last meta page doesn't have it */
-    /* put a copy of the last meta page but this time with a marker */
-    if (bt->txn) {
-        EPRINTF("btree_sync while in transaction is not a good idea");
-        return BT_FAIL;
-    }
-    if (fsync(bt->fd) != 0)
-        return BT_FAIL;
-
-    bt->txn = btree_txn_begin(bt, 0);
-    if (bt->txn == 0)
-        return BT_FAIL;
-
-    if (btree_write_meta(bt, bt->meta.root, marker_flags, bt->meta.tag) == BT_FAIL) {
-        btree_txn_abort(bt->txn);
-        return BT_FAIL;
-    }
-    btree_txn_abort(bt->txn);
-    return BT_SUCCESS;
-}
-
-
-/*
- * Start a transaction on `bt`.
- *
- * rdonly != 0: a read-only transaction; no lock is taken.
- * rdonly == 0: a write transaction. Only one may exist per tree (EBUSY
- *              otherwise); it owns a dirty-page queue and holds an exclusive,
- *              non-blocking flock() on the file (EBUSY if the lock is taken).
- *
- * In both cases the tree gains a reference and the latest meta page is read
- * to obtain the root page, tag and (for the transaction) the next free page
- * number. Returns the transaction, or NULL on failure.
- */
-struct btree_txn *
-btree_txn_begin(struct btree *bt, int rdonly)
-{
-    struct btree_txn *txn;
-
-    if (bt->txn != NULL && !rdonly) {
-        DPRINTF("write transaction already begun");
-        errno = EBUSY;
-        return NULL;
-    }
-
-    txn = (btree_txn *)calloc(1, sizeof(*txn));
-    if (txn == NULL) {
-        DPRINTF("calloc: %s", strerror(errno));
-        return NULL;
-    }
-
-    if (!rdonly) {
-        txn->dirty_queue = (dirty_queue *)calloc(1, sizeof(*txn->dirty_queue));
-        if (txn->dirty_queue == NULL) {
-            free(txn);
-            return NULL;
-        }
-        SIMPLEQ_INIT(txn->dirty_queue);
-
-        DPRINTF("taking write lock on txn %p, bt %p", txn, bt);
-        if (flock(bt->fd, LOCK_EX | LOCK_NB) != 0) {
-            EPRINTF("flock: %s", strerror(errno));
-            errno = EBUSY;
-            free(txn->dirty_queue);
-            free(txn);
-            return NULL;
-        }
-        bt->txn = txn;
-    } else {
-        txn->flags |= BT_TXN_RDONLY;
-        DPRINTF("taking read lock on txn %p, bt %p", txn, bt);
-    }
-
-    txn->bt = bt;
-    btree_ref(bt);
-
-    /* From here on btree_txn_abort() undoes everything set up above. */
-    if (btree_read_meta(bt, &txn->next_pgno) != BT_SUCCESS) {
-        btree_txn_abort(txn);
-        return NULL;
-    }
-
-    txn->root = bt->meta.root;
-    txn->tag = bt->meta.tag;
-    DPRINTF("begin transaction on btree %p, root page %u (tag %d)", bt, txn->root, txn->tag);
-
-    return txn;
-}
-
-/*
- * Start a read-only transaction rooted at the meta page that carries `tag`.
- * Returns NULL if no such meta page can be read or memory runs out. On
- * success the tree gains a reference.
- */
-struct btree_txn *
-btree_txn_begin_with_tag(struct btree *bt, unsigned int tag)
-{
-    pgno_t tagged_root;
-    struct btree_txn *txn;
-
-    if (btree_read_meta_with_tag(bt, tag, &tagged_root) != BT_SUCCESS)
-        return NULL;
-
-    txn = (btree_txn *)calloc(1, sizeof(*txn));
-    if (txn == NULL) {
-        DPRINTF("calloc: %s", strerror(errno));
-        return NULL;
-    }
-
-    txn->bt = bt;
-    txn->root = tagged_root;
-    txn->tag = tag;
-    txn->flags = BT_TXN_RDONLY;
-    btree_ref(bt);
-
-    DPRINTF("begin transaction on btree %p, root page %u (tag %d)", bt, txn->root, txn->tag);
-
-    return txn;
-}
-
-/*
- * End a transaction without writing anything and free it. NULL is ignored.
- *
- * For a write transaction every page on the dirty queue is removed from the
- * cache and freed, the tree's current-transaction pointer is cleared, the
- * file lock is released and the queue itself is freed. In all cases the
- * tree reference taken at begin time is given back through btree_close().
- */
-void
-btree_txn_abort(struct btree_txn *txn)
-{
-    struct btree *bt;
-    struct mpage *dirty;
-
-    if (txn == NULL)
-        return;
-
-    bt = txn->bt;
-    DPRINTF("abort transaction on btree %p, root page %u", bt, txn->root);
-
-    if (!F_ISSET(txn->flags, BT_TXN_RDONLY)) {
-        /* Discard all dirty pages.
-         */
-        while (!SIMPLEQ_EMPTY(txn->dirty_queue)) {
-            dirty = SIMPLEQ_FIRST(txn->dirty_queue);
-            assert(dirty->ref == 0); /* cursors should be closed */
-            mpage_del(bt, dirty);
-            SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next);
-            mpage_free(dirty);
-        }
-
-        DPRINTF("releasing write lock on txn %p", txn);
-        bt->txn = NULL;
-        if (flock(bt->fd, LOCK_UN) != 0) {
-            DPRINTF("failed to unlock fd %d: %s",
-                bt->fd, strerror(errno));
-        }
-        free(txn->dirty_queue);
-    }
-
-    btree_close(bt);
-    free(txn);
-}
-
-/* Returns 1 if `txn` is a read-only transaction, 0 otherwise. */
-int
-btree_txn_is_read(struct btree_txn *txn)
-{
-    assert(txn);
-    if (txn->flags & BT_TXN_RDONLY)
-        return 1;
-    return 0;
-}
-
-int
-btree_txn_is_error(struct btree_txn *txn)
-{
- assert(txn);
- return txn->flags & BT_TXN_ERROR ? 1 : 0;
-}
-
-/* Returns the tag recorded in `txn` when the transaction was begun. */
-unsigned int
-btree_txn_get_tag(struct btree_txn *txn)
-{
-    assert(txn != 0);
-
-    const unsigned int tag = txn->tag;
-    return tag;
-}
-
-/*
- * Commit write transaction `txn`: write all dirty pages, then a new meta
- * page carrying `tag`.
- *
- * The transaction is always released (btree_txn_abort) before returning,
- * on success as well as on failure, so the caller must not use `txn` again.
- *
- * Fails with EPERM for a read-only transaction and with EINVAL if `txn` is
- * not the tree's current write transaction or has its error flag set.
- * If nothing is dirty and `tag` equals bt->stat.tag, no meta page is written.
- * Dirty pages are written with writev() in batches of up to BT_COMMIT_PAGES
- * at the file descriptor's current offset.
- * NOTE(review): this relies on the offset being at end-of-file -- confirm
- * against the callers that position the descriptor.
- * The file is fsync()ed before the meta page is written, and the meta page
- * gets BT_MARKER, unless BT_NOSYNC is set and BT_FORCE_MARKER is not given in
- * `flags`.
- *
- * Returns BT_SUCCESS or BT_FAIL.
- */
-int
-btree_txn_commit(struct btree_txn *txn, unsigned int tag, unsigned int flags)
-{
-    int n, done;
-    int needfsync;
-    ssize_t rc;
-    off_t size;
-    struct mpage *mp;
-    struct btree *bt;
-    struct iovec iov[BT_COMMIT_PAGES];
-    unsigned long num_dirty = 0;
-    unsigned long num_dirty_branches = 0;
-    unsigned long num_dirty_leaves = 0;
-    unsigned long num_dirty_overflows = 0;
-
-    assert(txn != NULL);
-    assert(txn->bt != NULL);
-
-    bt = txn->bt;
-    /* Computed only after the asserts: it dereferences txn. */
-    needfsync = !F_ISSET(bt->flags, BT_NOSYNC) || F_ISSET(flags, BT_FORCE_MARKER);
-
-    if (F_ISSET(txn->flags, BT_TXN_RDONLY)) {
-        DPRINTF("attempt to commit read-only transaction");
-        btree_txn_abort(txn);
-        errno = EPERM;
-        return BT_FAIL;
-    }
-
-    if (txn != bt->txn) {
-        EPRINTF("attempt to commit unknown transaction");
-        btree_txn_abort(txn);
-        errno = EINVAL;
-        return BT_FAIL;
-    }
-
-    if (F_ISSET(txn->flags, BT_TXN_ERROR)) {
-        EPRINTF("error flag is set, can't commit");
-        btree_txn_abort(txn);
-        errno = EINVAL;
-        return BT_FAIL;
-    }
-
-    if (SIMPLEQ_EMPTY(txn->dirty_queue)) {
-        if (bt->stat.tag != tag) {
-            /* No page changed, but the tag did: still write a meta page. */
-            goto done;
-        } else {
-            mpage_prune(bt);
-            btree_txn_abort(txn);
-            return BT_SUCCESS;
-        }
-    }
-
-    if (F_ISSET(bt->flags, BT_FIXPADDING)) {
-        size = lseek(bt->fd, 0, SEEK_END);
-        if (size == -1) {
-            /* Without this check -1 would feed the arithmetic below and
-             * ftruncate() would cut the file down to a single page. */
-            DPRINTF("lseek: %s", strerror(errno));
-            btree_txn_abort(txn);
-            return BT_FAIL;
-        }
-        size += bt->head.psize - (size % bt->head.psize);
-        DPRINTF("extending to multiple of page size: %llu", (long long unsigned)size);
-        if (ftruncate(bt->fd, size) != 0) {
-            DPRINTF("ftruncate: %s", strerror(errno));
-            btree_txn_abort(txn);
-            return BT_FAIL;
-        }
-        bt->flags &= ~BT_FIXPADDING;
-    }
-
-    DPRINTF("committing transaction on btree %p, root page %u",
-        bt, txn->root);
-
-    /* Commit up to BT_COMMIT_PAGES dirty pages to disk until done.
-     */
-    do {
-        n = 0;
-        done = 1;
-        SIMPLEQ_FOREACH(mp, txn->dirty_queue, next) {
-            mp->page->checksum = calculate_checksum(bt, mp->page);
-            iov[n].iov_len = bt->head.psize;
-            iov[n].iov_base = mp->page;
-            if (IS_BRANCH(mp))
-                num_dirty_branches++;
-            else if (IS_LEAF(mp))
-                num_dirty_leaves++;
-            else
-                num_dirty_overflows++;
-            DPRINTF("commiting page %u == %u with checksum %x", mp->pgno, mp->page->pgno, mp->page->checksum);
-            if (++n >= BT_COMMIT_PAGES) {
-                /* Batch is full; more pages may follow in the next round. */
-                done = 0;
-                break;
-            }
-        }
-
-        if (n == 0)
-            break;
-
-        num_dirty += n;
-
-        DPRINTF("commiting %u dirty pages", n);
-        bt->stat.writes += n;
-        rc = writev(bt->fd, iov, n);
-        if (rc != (ssize_t)bt->head.psize*n) {
-            if (rc > 0) {
-                DPRINTF("short write, filesystem full?");
-            } else {
-                DPRINTF("writev: %s", strerror(errno));
-            }
-            btree_txn_abort(txn);
-            return BT_FAIL;
-        }
-
-        /* Remove the dirty flag from the written pages.
-         */
-        while (!SIMPLEQ_EMPTY(txn->dirty_queue)) {
-            mp = SIMPLEQ_FIRST(txn->dirty_queue);
-            mp->dirty = 0;
-            SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next);
-            if (--n == 0)
-                break;
-        }
-    } while (!done);
-    if (num_dirty > bt->stat.max_cache) {
-        fprintf(stderr, "large transaction: \t %lu B %lu L %lu O dirty \t %d B %d L %d O live pages %s\n",
-            num_dirty_branches, num_dirty_leaves, num_dirty_overflows,
-            bt->meta.branch_pages, bt->meta.leaf_pages, bt->meta.overflow_pages,
-            bt->path);
-    }
-done:
-    if (needfsync) {
-        /* Data pages must be durable before the meta page points at them. */
-        if (fsync(bt->fd) != 0) {
-            btree_txn_abort(txn);
-            return BT_FAIL;
-        }
-    }
-    if (btree_write_meta(bt, txn->root,
-        needfsync ? BT_MARKER : 0,
-        tag) != BT_SUCCESS) {
-        btree_txn_abort(txn);
-        return BT_FAIL;
-    }
-
-    mpage_prune(bt);
-    btree_txn_abort(txn);
-
-    return BT_SUCCESS;
-}
-
-/*
- * Create the header page and write it to `fd` at the current file offset.
- *
- * The page size is the file system's preferred I/O block size as reported by
- * fstat(), but never less than PAGESIZE. The freshly built header (magic,
- * version, page size, maximum key size) is also copied into bt->head.
- *
- * Returns BT_SUCCESS, -1 if the page buffer cannot be allocated, or BT_FAIL
- * if the write is short or fails.
- */
-static int
-btree_write_header(struct btree *bt, int fd)
-{
-    struct stat sb;
-    struct page *hdrpage;
-    struct bt_head *head;
-    unsigned int psize = PAGESIZE;
-    ssize_t written;
-
-    DPRINTF("writing header page");
-    assert(bt != NULL);
-
-    /* Ask stat for optimal blocksize for I/O but
-       don't use smaller than the initial page size */
-    if (fstat(fd, &sb) == 0 && sb.st_blksize > PAGESIZE)
-        psize = sb.st_blksize;
-
-    hdrpage = (page *)calloc(1, psize);
-    if (hdrpage == NULL)
-        return -1;
-    hdrpage->flags = P_HEAD;
-
-    head = (bt_head *)METADATA(hdrpage);
-    head->magic = BT_MAGIC;
-    head->version = BT_VERSION;
-    head->psize = psize;
-    head->ksize = MAXKEYSIZE;
-    bcopy(head, &bt->head, sizeof(*head));
-
-    /* The checksum depends on bt->head.psize, so it comes after the copy. */
-    hdrpage->checksum = calculate_checksum(bt, hdrpage);
-    DPRINTF("writing page %u with checksum %x", hdrpage->pgno, hdrpage->checksum);
-    bt->stat.writes++;
-    written = write(fd, hdrpage, bt->head.psize);
-    free(hdrpage);
-
-    if (written == (ssize_t)bt->head.psize)
-        return BT_SUCCESS;
-
-    if (written > 0)
-        DPRINTF("short write, filesystem full?");
-    return BT_FAIL;
-}
-
-/*
- * Read and validate the header page (page 0) and copy it into bt->head.
- *
- * The first PAGESIZE bytes are read because the real page size is only known
- * once the header has been parsed. The page must be a P_HEAD page with the
- * expected magic, version and maximum key size, and its checksum must verify.
- *
- * Returns 0 on success and -1 on failure, with errno set to ENOENT for an
- * empty file and EINVAL for a short read or an invalid header.
- *
- * Fixes relative to the previous revision:
- *  - for files whose page size differs from PAGESIZE, the header is re-read
- *    into the buffer that is actually checksummed (it used to be read into
- *    `page` while the checksum ran over uninitialised heap memory);
- *  - the allocation of that buffer is checked;
- *  - the version / key size messages report the values found in the file
- *    instead of the not yet populated bt->head.
- */
-static int
-btree_read_header(struct btree *bt)
-{
-    char page[PAGESIZE];
-    struct page *p = 0;
-    struct page *pcheck = 0;
-    struct bt_head *h = 0;
-    ssize_t rc;
-    assert(bt != NULL);
-
-    /* We don't know the page size yet, so use a minimum value.
-     */
-
-    bt->stat.reads++;
-    if ((rc = pread(bt->fd, page, PAGESIZE, 0)) == 0) {
-        errno = ENOENT;
-        goto fail;
-    } else if ((size_t)rc != PAGESIZE) {
-        EPRINTF("read: %s", strerror(errno));
-        if (rc > 0)
-            errno = EINVAL;
-        goto fail;
-    }
-
-    p = (struct page *)page;
-
-    if (!F_ISSET(p->flags, P_HEAD)) {
-        EPRINTF("page %d not a header page", p->pgno);
-        errno = EINVAL;
-        goto fail;
-    }
-
-    h = (bt_head *)METADATA(p);
-    if (h->magic != BT_MAGIC) {
-        EPRINTF("header has invalid magic");
-        errno = EINVAL;
-        goto fail;
-    }
-
-    if (h->version != BT_VERSION) {
-        EPRINTF("database is version %u, expected version %u",
-            h->version, BT_VERSION);
-        errno = EINVAL;
-        goto fail;
-    }
-
-    if (h->ksize != MAXKEYSIZE) {
-        EPRINTF("database uses max key size %u, expected max key size %u",
-            h->ksize, MAXKEYSIZE);
-        errno = EINVAL;
-        goto fail;
-    }
-
-    /* verify_checksum() consults bt->head, so populate it first. */
-    bcopy(h, &bt->head, sizeof(*h));
-
-    if (bt->head.psize == PAGESIZE) {
-        pcheck = p;
-    } else {
-        /* Only the page header plus bt_head take part in a P_HEAD checksum. */
-        const size_t pheadsz = PAGEHDRSZ + sizeof(bt_head);
-        pcheck = (struct page *)malloc(pheadsz);
-        if (pcheck == NULL) {
-            EPRINTF("malloc: %s", strerror(errno));
-            goto fail;
-        }
-        bt->stat.reads++;
-        rc = pread(bt->fd, pcheck, pheadsz, 0);
-        if (rc != (ssize_t)pheadsz) {
-            EPRINTF("pread failed to get data to verify checksum");
-            if (rc >= 0)
-                errno = EINVAL;
-            goto fail;
-        }
-    }
-
-    if (verify_checksum(bt, pcheck) != 0) {
-        EPRINTF("checksum fail");
-        errno = EINVAL;
-        goto fail;
-    } else {
-        if (pcheck != p)
-            free(pcheck);
-    }
-
-    DPRINTF("btree_read_header: magic = %x", bt->head.magic);
-    DPRINTF("btree_read_header: version = %d", bt->head.version);
-    DPRINTF("btree_read_header: flags = %d", bt->head.flags);
-    DPRINTF("btree_read_header: psize = %d", bt->head.psize);
-    DPRINTF("btree_read_header: ksize = %d", bt->head.ksize);
-
-    return 0;
-fail:
-    if (pcheck && pcheck != p)
-        free(pcheck);
-    return -1;
-}
-
-/*
- * Write a new meta page recording `root`, `flags` and `tag`.
- *
- * Requires an open write transaction (bt->txn). A fresh P_META page is
- * obtained from btree_new_page(), the in-memory bt->meta is updated (link to
- * the previous meta page, root, flags, timestamp, revision counter, tag),
- * copied into the page and written with write() at the file descriptor's
- * current offset. bt->size is refreshed from the file afterwards.
- *
- * Returns BT_SUCCESS; -1 if no page could be allocated; BT_FAIL if the write
- * is short or fails. Note that bt->meta has already been modified when a
- * write failure is reported.
- */
-static int
-btree_write_meta(struct btree *bt, pgno_t root, unsigned int flags, uint32_t tag)
-{
- struct mpage *mp, *prev_mp;
- struct bt_meta *meta;
- ssize_t rc;
-
- DPRINTF("writing meta page for root page %u with flags %d and tag %d", root, flags, tag);
-
- assert(bt != NULL);
- assert(bt->txn != NULL);
-
- if ((mp = btree_new_page(bt, P_META)) == NULL)
- return -1;
-
- // get prev meta mpage from cache
- // (looked up before bt->meta.pgno is overwritten below)
- prev_mp = mpage_lookup(bt, bt->meta.pgno);
-
- bt->meta.prev_meta = bt->meta.pgno;
- bt->meta.pgno = mp->pgno;
- bt->meta.root = root;
- bt->meta.flags = flags;
- bt->meta.created_at = time(0);
- bt->meta.revisions++;
- bt->meta.tag = tag;
-
- /* The digest over the first METAHASHLEN bytes of the meta data is only
-  * produced when page checksums are disabled (BT_NOPGCHECKSUM). */
- if (F_ISSET(bt->flags, BT_NOPGCHECKSUM)) {
- bt->hasher->reset();
- bt->hasher->addData((const char *)&bt->meta, METAHASHLEN);
- QByteArray result = bt->hasher->result();
- memcpy(bt->meta.hash, result.constData(), result.size());
- }
-
- /* Copy the meta data changes to the new meta page. */
- meta = METADATA(mp->page);
- bcopy(&bt->meta, meta, sizeof(*meta));
-
- mp->page->checksum = calculate_checksum(bt, mp->page);
- DPRINTF("writing page %u with checksum %x, digest %.*s", mp->page->pgno, mp->page->checksum, SHA_DIGEST_LENGTH, meta->hash);
-
- bt->stat.writes++;
- rc = write(bt->fd, mp->page, bt->head.psize);
- mp->dirty = 0;
- /* NOTE(review): this pops the head of the dirty queue, which presumes the
-  * new meta page is the only queued entry (queued by btree_new_page) --
-  * confirm against btree_new_page and the callers. */
- SIMPLEQ_REMOVE_HEAD(bt->txn->dirty_queue, next);
- /* The superseded meta page is dropped from the cache, even if the write
-  * above failed. */
- if (prev_mp) {
- mpage_del(bt, prev_mp);
- mpage_free(prev_mp);
- }
- if (rc != (ssize_t)bt->head.psize) {
- if (rc > 0)
- DPRINTF("short write, filesystem full?");
- return BT_FAIL;
- }
-
- /* A failing lseek is not treated as an error; the size is reset to 0. */
- if ((bt->size = lseek(bt->fd, 0, SEEK_END)) == -1) {
- DPRINTF("failed to update file size: %s", strerror(errno));
- bt->size = 0;
- }
- return BT_SUCCESS;
-}
-
-/* Returns 1 if page p is a valid meta page, 0 otherwise.
- *
- * A page is rejected (with errno set to EINVAL) when it lacks the P_META
- * flag, when its recorded root is neither P_INVALID nor smaller than the
- * page's own number, or - only if BT_NOPGCHECKSUM is set on the btree - when
- * the digest stored in the meta data does not match a freshly computed one.
- */
-static int
-btree_is_meta_page(struct btree *bt, struct page *p)
-{
-    struct bt_meta *candidate = METADATA(p);
-    unsigned char digest[SHA_DIGEST_LENGTH];
-    int valid = 1;
-
-    if (!F_ISSET(p->flags, P_META)) {
-        DPRINTF("page %d not a meta page", p->pgno);
-        valid = 0;
-    } else if (candidate->root != P_INVALID && candidate->root >= p->pgno) {
-        EPRINTF("page %d points to an invalid root page", p->pgno);
-        valid = 0;
-    } else if (F_ISSET(bt->flags, BT_NOPGCHECKSUM)) {
-        bt->hasher->reset();
-        bt->hasher->addData((const char *)candidate, METAHASHLEN);
-        QByteArray computed = bt->hasher->result();
-        memcpy(digest, computed.constData(), computed.size());
-
-        if (bcmp(digest, candidate->hash, SHA_DIGEST_LENGTH) != 0) {
-            EPRINTF("page %d has an invalid digest %.*s", p->pgno, SHA_DIGEST_LENGTH, candidate->hash);
-            valid = 0;
-        }
-    }
-
-    if (!valid)
-        errno = EINVAL;
-    return valid;
-}
-
-static int /* Refresh bt->meta from the most recent usable meta page on disk.
- *
- * The current file size is obtained with lseek(). A file that contains only
- * the header is treated as new (success, *p_next = 1). If the size equals the
- * cached bt->size, the current bt->meta is kept. Otherwise pages are scanned
- * backwards from the last page until btree_is_meta_page() accepts one, and
- * that page's meta data is copied into bt->meta.
- *
- * p_next, when non-NULL, receives the next page number; it is overwritten
- * with P_INVALID on the paths that reach the fail label.
- * Returns BT_SUCCESS or BT_FAIL. A meta page flagged BT_TOMBSTONE yields
- * BT_FAIL with errno = ESTALE.
- */
-btree_read_meta(struct btree *bt, pgno_t *p_next)
-{
- struct mpage *mp;
- struct bt_meta *meta;
- pgno_t meta_pgno, next_pgno, rest_pgno;
- off_t size;
- off_t bt_prev_sz = bt->size; /* size known before this call; only gates the final error message */
-
- assert(bt != NULL);
-
- if ((size = lseek(bt->fd, 0, SEEK_END)) == -1) {
- fprintf(stderr, "failed to lseek errno=%d\n", errno);
- goto fail;
- }
-
- DPRINTF("btree_read_meta: size = %llu", (long long unsigned)size);
-
- if (size < bt->size) {
- EPRINTF("file has shrunk!");
- errno = EIO;
- goto fail;
- }
-
- if ((uint32_t)size == bt->head.psize) { /* there is only the header */
- if (p_next != NULL)
- *p_next = 1;
- return BT_SUCCESS; /* new file */
- }
-
- next_pgno = size / bt->head.psize;
- if (next_pgno == 0) { /* file is smaller than a single page */
- DPRINTF("corrupt file");
- fprintf(stderr, "corrupt file\n");
- errno = EIO;
- goto fail;
- }
-
- meta_pgno = next_pgno - 1; /* start the scan at the last complete page */
-
- if (size % bt->head.psize != 0) {
- DPRINTF("filesize not a multiple of the page size!");
- bt->flags |= BT_FIXPADDING;
- next_pgno++; /* skip over the trailing partial page */
- }
-
- if (p_next != NULL)
- *p_next = next_pgno;
-
- if (size == bt->size) {
- DPRINTF("size unchanged, keeping current meta page");
- if (F_ISSET(bt->meta.flags, BT_TOMBSTONE)) {
- DPRINTF("file is dead");
- errno = ESTALE;
- return BT_FAIL;
- } else
- return BT_SUCCESS;
- }
- bt->size = size;
-
- while (meta_pgno > 0) { /* page 0 is never examined by this loop */
- mp = btree_get_mpage(bt, meta_pgno); // TODO: Add page type to get_mpage, early out (avoid checksum checks)
- if (mp && btree_is_meta_page(bt, mp->page)) {
- meta = METADATA(mp->page);
- DPRINTF("flags = 0x%x", meta->flags);
- if (F_ISSET(meta->flags, BT_TOMBSTONE)) {
- DPRINTF("file is dead");
- errno = ESTALE;
- return BT_FAIL;
- } else if (F_ISSET(bt->flags, BT_USEMARKER) && !F_ISSET(meta->flags, BT_MARKER)) {
- DPRINTF("found a meta page %d but without marker, skipping...", meta_pgno);
- /* dont skip if pages up to last marked meta are ok */
- if (!F_ISSET(bt->flags, BT_NOPGCHECKSUM)) {
- rest_pgno = meta_pgno - 1;
- while ((mp = btree_get_mpage(bt, rest_pgno)) != NULL) { /* walk back while pages can be loaded; mp is reused here */
- if (rest_pgno == 0 || (btree_is_meta_page(bt, mp->page) && F_ISSET(METADATA(mp->page)->flags, BT_MARKER))) {
- bcopy(meta, &bt->meta, sizeof(bt->meta)); /* accept the unmarked meta page found above */
- return BT_SUCCESS;
- }
- rest_pgno--;
- if (mp) {
- mpage_del(bt, mp);
- mpage_free(mp);
- mp = 0;
- }
- }
- }
- } else {
- /* Make copy of last meta page. */
- bcopy(meta, &bt->meta, sizeof(bt->meta));
- return BT_SUCCESS;
- }
- }
- if (mp) { /* rejected candidate: drop it from the cache again */
- mpage_del(bt, mp);
- mpage_free(mp);
- }
- --meta_pgno; /* scan backwards to first valid meta page */
- }
-
- errno = EIO;
- if (bt_prev_sz)
- EPRINTF("failed somehow errno=%d\n", errno);
-fail:
- if (p_next != NULL)
- *p_next = P_INVALID;
- return BT_FAIL;
-}
-
-/*
- * Find the root page recorded by the newest meta page that carries `tag`.
- *
- * The current meta page (refreshed through btree_read_meta()) is checked
- * first; after that the prev_meta chain is followed page by page. On success
- * the root page number is stored in *p_root and BT_SUCCESS is returned.
- * BT_FAIL is returned when the tag is not found, when a page cannot be read,
- * or when the chain is corrupt.
- *
- * Each prev_meta link must point to a lower page number than the page that
- * holds it (btree_write_meta() always links a newly allocated page to the
- * previous meta page). A link that violates this is treated as corruption,
- * so a damaged file cannot make the walk loop forever.
- */
-static int
-btree_read_meta_with_tag(struct btree *bt, unsigned int tag, pgno_t *p_root)
-{
-    pgno_t pgno;
-    struct page *p;
-    struct bt_meta *meta;
-    int rc = BT_FAIL;
-
-    assert(bt != NULL);
-    assert(p_root != NULL);
-
-    if (btree_read_meta(bt, NULL) != BT_SUCCESS)
-        return BT_FAIL;
-
-    if (bt->meta.tag == tag) {
-        *p_root = bt->meta.root;
-        return BT_SUCCESS;
-    }
-
-    /* Scratch buffer for one page; nothing to release if this fails. */
-    if ((p = (page *)malloc(bt->head.psize)) == NULL)
-        return BT_FAIL;
-
-    pgno = bt->meta.prev_meta;
-    while (pgno != P_INVALID) {
-        if (btree_read_page(bt, pgno, p) != BT_SUCCESS)
-            break;
-        if (!F_ISSET(p->flags, P_META)) {
-            EPRINTF("corrupted meta page chain detected (page %d flags %d)", pgno, p->flags);
-            break;
-        }
-        if (!btree_is_meta_page(bt, p)) {
-            EPRINTF("corrupted meta page found (page %d flags %d)", pgno, p->flags);
-            break;
-        }
-        meta = METADATA(p);
-        if (meta->tag == tag) {
-            *p_root = meta->root;
-            rc = BT_SUCCESS;
-            break;
-        }
-        /* The chain must strictly descend, otherwise it could cycle. */
-        if (meta->prev_meta != P_INVALID && meta->prev_meta >= pgno) {
-            EPRINTF("corrupted meta page chain detected (page %d links to page %d)", pgno, meta->prev_meta);
-            errno = EINVAL;
-            break;
-        }
-        pgno = meta->prev_meta;
-    }
-
-    free(p);
-    return rc;
-}
-
-/*
- * Create a btree handle on top of an already opened file descriptor.
- *
- * O_APPEND is forced on the descriptor, the handle and its page cache / LRU
- * queue are allocated, and the file header and newest meta page are read. A
- * file without a header (btree_read_header() fails with errno == ENOENT) gets
- * a new header written. If no valid meta page is found the file is cleared
- * through btree_clear(), unless BT_RDONLY is set.
- *
- * `path` is copied into the handle. Returns the new handle (reference count
- * 1) or NULL on failure. The descriptor is NOT closed on failure; that is
- * left to the caller.
- */
-struct btree *
-btree_open_fd(const char *path, int fd, unsigned int flags)
-{
-    struct btree *bt = NULL;
-    int fl;
-    int rc;
-    int saved_errno;
-
-    if (path == NULL) {
-        errno = EINVAL;
-        return NULL;
-    }
-
-    /* A failed F_GETFL must not be fed back into F_SETFL as a flag set. */
-    if ((fl = fcntl(fd, F_GETFL, 0)) == -1) {
-        EPRINTF( "fcntl fd=%d rc=%d errno=%d\n", fd, fl, errno);
-        return NULL;
-    }
-    if ((rc = fcntl(fd, F_SETFL, fl | O_APPEND)) == -1) {
-        EPRINTF( "fcntl fd=%d rc=%d errno=%d\n", fd, rc, errno);
-        return NULL;
-    }
-
-    bt = (struct btree *)calloc(1, sizeof(*bt));
-    if (!bt) {
-        EPRINTF("failed to allocate memory for btree");
-        goto fail;
-    }
-
-    /* Only touch the handle once the allocation is known to have worked. */
-    bt->hasher = new QCryptographicHash(QCryptographicHash::Sha1);
-
-    bt->fd = fd;
-    bt->flags = flags;
-    bt->flags &= ~BT_FIXPADDING;
-    bt->ref = 1;
-    bt->meta.pgno = P_INVALID;
-    bt->meta.root = P_INVALID;
-    bt->meta.prev_meta = P_INVALID;
-
-    bt->path = (char *)malloc(strlen(path) + 1);
-    if (bt->path == NULL) {
-        EPRINTF("failed to allocate memory for path");
-        goto fail;
-    }
-    strcpy(bt->path, path);
-
-    if ((bt->page_cache = (struct page_cache *)calloc(1, sizeof(*bt->page_cache))) == NULL)
-        goto fail;
-    bt->stat.max_cache = BT_MAXCACHE_DEF;
-    RB_INIT(bt->page_cache);
-
-    if ((bt->lru_queue = (lru_queue *)calloc(1, sizeof(*bt->lru_queue))) == NULL) {
-        EPRINTF("failed to allocate lru_queue");
-        goto fail;
-    }
-    TAILQ_INIT(bt->lru_queue);
-
-    if (btree_read_header(bt) != 0) {
-        if (errno != ENOENT) {
-            EPRINTF("failed to read header");
-            goto fail;
-        }
-        DPRINTF("new database");
-        btree_write_header(bt, bt->fd);
-    }
-
-    if (btree_read_meta(bt, NULL) != 0) {
-        DPRINTF("valid meta not found. Clearing file");
-        if (F_ISSET(bt->flags, BT_RDONLY) || btree_clear(&bt) != BT_SUCCESS) {
-            EPRINTF("failed to read meta");
-            goto fail;
-        }
-    }
-
-    DPRINTF("opened database version %u, pagesize %u",
-        bt->head.version, bt->head.psize);
-    DPRINTF("timestamp: %s", ctime((const time_t *)&bt->meta.created_at));
-    DPRINTF("depth: %u", bt->meta.depth);
-    DPRINTF("entries: %llu", (long long unsigned)bt->meta.entries);
-    DPRINTF("revisions: %u", bt->meta.revisions);
-    DPRINTF("branch pages: %u", bt->meta.branch_pages);
-    DPRINTF("leaf pages: %u", bt->meta.leaf_pages);
-    DPRINTF("overflow pages: %u", bt->meta.overflow_pages);
-    DPRINTF("root: %u", bt->meta.root);
-    DPRINTF("previous meta page: %u", bt->meta.prev_meta);
-
-    return bt;
-
-fail:
-    /* Keep the errno of the original failure visible to the caller. */
-    saved_errno = errno;
-    EPRINTF("%s: fail errno=%d\n", path, errno);
-    if (bt) {
-        delete bt->hasher;
-        free(bt->lru_queue);
-        free(bt->page_cache);
-        free(bt->path);
-    }
-    free(bt);
-    errno = saved_errno;
-    return NULL;
-}
-
-/*
- * Replace the file behind *bt with an empty btree.
- *
- * An empty copy is created next to the original, renamed over it through
- * btree_replace(), and the old handle is closed. On success *bt points to the
- * handle of the new, empty tree and BT_SUCCESS is returned. On failure *bt is
- * left untouched, the temporary handle is released again, and BT_FAIL is
- * returned.
- */
-int
-btree_clear(btree **bt)
-{
-    struct btree *btc;
-    int saved_errno;
-
-    assert(bt && *bt);
-
-    btc = btree_open_empty_copy(*bt);
-
-    if (!btc) {
-        EPRINTF("failed to open new file");
-        return BT_FAIL;
-    }
-
-    if (btree_replace(*bt, btc) != BT_SUCCESS) {
-        EPRINTF("failed to replace");
-        /*
-         * Do not leak the temporary tree: remove its file (a no-op if
-         * the rename already happened) and drop the handle, keeping
-         * the errno of the failed replace.
-         */
-        saved_errno = errno;
-        unlink(btc->path);
-        btree_close(btc);
-        errno = saved_errno;
-        return BT_FAIL;
-    }
-
-    /* The copy now lives under the original name. */
-    strcpy(btc->path, (*bt)->path);
-    btree_close(*bt);
-    *bt = btc;
-    return BT_SUCCESS;
-}
-
-/*
- * Rename the file of `btw` over the file of `bt` and mark `bt` as dead.
- *
- * Runs inside a transaction on `bt` that is always aborted before returning.
- * Returns BT_SUCCESS when both the rename and the tombstone write succeeded,
- * BT_FAIL otherwise.
- */
-int
-btree_replace(btree *bt, btree *btw)
-{
-    struct btree_txn *txn;
-    int result = BT_FAIL;
-
-    assert(bt && btw);
-
-    txn = btree_txn_begin(bt, 0);
-    if (txn == NULL)
-        return BT_FAIL;
-
-    DPRINTF("replacing %s with %s", bt->path, btw->path);
-
-    /* Write a "tombstone" meta page so other processes can pick up
-     * the change and re-open the file.
-     */
-    if (rename(btw->path, bt->path) == 0 &&
-        btree_write_meta(bt, P_INVALID, BT_TOMBSTONE, 0) == BT_SUCCESS)
-        result = BT_SUCCESS;
-
-    btree_txn_abort(txn);
-    return result;
-}
-
-/*
- * Open a new, empty btree in a temporary file next to `bt`.
- *
- * The file name is bt->path followed by ".copy.XXXXXX", created with
- * mkstemp(). The new tree is opened with the same flags as `bt`.
- * Returns the new handle, or NULL on failure (errno = EINVAL if bt has no
- * path). On failure nothing is leaked: the descriptor is closed and a file
- * that was created is removed.
- */
-struct btree*
-btree_open_empty_copy(struct btree *bt)
-{
-    char *copy_path = NULL;
-    const char copy_ext[] = ".copy.XXXXXX";
-    struct btree *btc = NULL;
-    int fd = -1;
-
-    assert(bt != NULL);
-
-    DPRINTF("creating empty copy of btree %p with path %s", bt, bt->path);
-
-    if (bt->path == NULL) {
-        errno = EINVAL;
-        return 0;
-    }
-
-    copy_path = (char*)malloc(strlen(bt->path) + strlen(copy_ext) + 1);
-    if (copy_path == NULL) {
-        EPRINTF("failed to allocate memory for copy path");
-        return 0;
-    }
-    strcpy(copy_path, bt->path);
-    strcat(copy_path, copy_ext);
-
-    fd = mkstemp(copy_path);
-    if (fd == -1) {
-        EPRINTF("failed to get fd for empty copy");
-        /* No file was created, so there is nothing to unlink. */
-        free(copy_path);
-        return 0;
-    }
-
-    if ((btc = btree_open_fd(copy_path, fd, bt->flags)) == NULL) {
-        /* btree_open_fd() leaves the descriptor to its caller. */
-        close(fd);
-        unlink(copy_path);
-        free(copy_path);
-        return 0;
-    }
-    DPRINTF("opened empty btree %p", btc);
-
-    free(copy_path);
-    return btc;
-}
-
-
-/*
- * Open (and, unless BT_RDONLY is given, create if necessary) the btree file
- * at `path`. `mode` is passed on to open(). Returns the btree handle, or
- * NULL if the file could not be opened or is not usable as a btree; in the
- * latter case the descriptor is closed again.
- */
-struct btree *
-btree_open(const char *path, unsigned int flags, mode_t mode)
-{
-    int fd;
-    int oflags = F_ISSET(flags, BT_RDONLY)
-        ? O_RDONLY
-        : (O_RDWR | O_CREAT | O_APPEND);
-    struct btree *bt;
-
-    fd = open(path, oflags, mode);
-    if (fd == -1)
-        return NULL;
-
-    bt = btree_open_fd(path, fd, flags);
-    if (bt == NULL) {
-        close(fd);
-        return bt;
-    }
-
-    DPRINTF("opened btree %p", bt);
-    return bt;
-}
-
-int btree_get_fd(struct btree *bt)
-{
-    /* Expose the file descriptor stored in the handle. */
-    const int descriptor = bt->fd;
-
-    return descriptor;
-}
-
-/* Take one more reference on the btree handle. */
-static void
-btree_ref(struct btree *bt)
-{
-    bt->ref += 1;
-    DPRINTF("ref is now %d on btree %p", bt->ref, bt);
-}
-
-/*
- * Drop one reference on the btree. When the caller holds the last reference
- * the tree is synced first, then the descriptor is closed, the page cache is
- * flushed and every resource owned by the handle is released. A NULL handle
- * is ignored.
- */
-void
-btree_close(struct btree *bt)
-{
-    if (!bt)
-        return;
-
-    /* Sync only when this call is about to release the last reference. */
-    if (bt->ref == 1)
-        btree_sync(bt);
-
-    bt->ref--;
-    if (bt->ref != 0) {
-        DPRINTF("ref is now %d on btree %p", bt->ref, bt);
-        return;
-    }
-
-    DPRINTF("ref is zero, closing btree %p:%s", bt, bt->path);
-    close(bt->fd);
-    mpage_flush(bt);
-    delete bt->hasher;
-    free(bt->page_cache);
-    free(bt->lru_queue);
-    free(bt->path);
-    free(bt);
-}
-
-/*
- * Drop one reference on the btree without syncing it. When the count reaches
- * zero the descriptor is closed, the page cache is flushed and every
- * resource owned by the handle is released. A NULL handle is ignored.
- */
-void
-btree_close_nosync(struct btree *bt)
-{
-    if (!bt)
-        return;
-
-    bt->ref--;
-    if (bt->ref != 0) {
-        DPRINTF("ref is now %d on btree %p", bt->ref, bt);
-        return;
-    }
-
-    DPRINTF("ref is zero, closing btree %p:%s", bt, bt->path);
-    close(bt->fd);
-    mpage_flush(bt);
-    delete bt->hasher;
-    free(bt->page_cache);
-    free(bt->lru_queue);
-    free(bt->path);
-    free(bt);
-}
-
-/* Return the transaction pointer stored in the handle (bt->txn). */
-struct btree_txn *
-btree_get_txn(struct btree *bt)
-{
-    struct btree_txn *current;
-
-    assert(bt);
-    current = bt->txn;
-    return current;
-}
-
-/* Search for key within a page (leaf or branch), using binary search.
- * On branch pages the search starts at index 1; index 0 is never compared.
- * Returns the smallest entry larger or equal to the key.
- * If exactp is non-null, stores whether the found entry was an exact match
- * in *exactp (1 or 0).
- * If kip is non-null, stores the index of the found entry in *kip.
- * If no entry larger or equal to the key is found, returns NULL; in that
- * case neither *exactp nor *kip is written.
- */
-static struct node *
-btree_search_node(struct btree *bt, struct mpage *mp, struct btval *key,
- int *exactp, unsigned int *kip)
-{
- unsigned int i = 0;
- int low, high;
- int rc = 0;
- struct node *node;
- struct btval nodekey;
-
- DPRINTF("searching for [%.*s] in %lu keys in %s page %u with prefix [%.*s]",
- key->size, (const char*)key->data,
- NUMKEYS(mp),
- IS_LEAF(mp) ? "leaf" : "branch",
- mp->pgno, (int)mp->prefix.len, (char *)mp->prefix.str);
-
- assert(NUMKEYS(mp) > 0);
-
- bzero(&nodekey, sizeof(nodekey));
-
- low = IS_LEAF(mp) ? 0 : 1; /* branch pages: skip index 0 */
- high = NUMKEYS(mp) - 1;
- while (low <= high) {
- i = (low + high) >> 1; /* midpoint of the remaining range */
- node = NODEPTR(mp, i);
-
- nodekey.size = node->ksize;
- nodekey.data = NODEKEY(node);
-
- rc = bt_cmp(bt, key, &nodekey, &mp->prefix); /* page prefix is passed along with the stored key */
-
- if (IS_LEAF(mp))
- DPRINTF("found leaf index %u [%.*s], rc = %i",
- i, (int)nodekey.size, (char *)nodekey.data, rc);
- else
- DPRINTF("found branch index %u [%.*s -> %u], rc = %i",
- i, (int)node->ksize, (char *)NODEKEY(node),
- node->n_pgno, rc);
-
- if (rc == 0)
- break;
- if (rc > 0)
- low = i + 1;
- else
- high = i - 1;
- }
-
- if (rc > 0) { /* Found entry is less than the key. */
- i++; /* Skip to get the smallest entry larger than key. */
- if (i >= NUMKEYS(mp))
- /* There is no entry larger or equal to the key. */
- return NULL;
- }
- if (exactp)
- *exactp = (rc == 0);
- if (kip) /* Store the key index if requested. */
- *kip = i;
-
- return NODEPTR(mp, i);
-}
-
-/*
- * Remove the topmost entry from the cursor's page stack, drop the reference
- * it held on its mpage and free the stack entry itself.
- */
-static void
-cursor_pop_page(struct cursor *cursor)
-{
-    struct ppage *popped = CURSOR_TOP(cursor);
-
-    CURSOR_POP(cursor);
-    popped->mpage->ref--;
-
-    DPRINTF("popped page %u off cursor %p", popped->mpage->pgno, cursor);
-
-    free(popped);
-}
-
-/*
- * Push `mp` onto the cursor's page stack and take a reference on it.
- * Returns the new (zero-initialised) stack entry, or NULL if it could not be
- * allocated; in that case neither the cursor nor mp is modified.
- */
-static struct ppage *
-cursor_push_page(struct cursor *cursor, struct mpage *mp)
-{
-    struct ppage *entry;
-
-    DPRINTF("pushing page %u on cursor %p", mp->pgno, cursor);
-
-    entry = (struct ppage *)calloc(1, sizeof(struct ppage));
-    if (entry != NULL) {
-        entry->mpage = mp;
-        mp->ref++;
-        CURSOR_PUSH(cursor, entry);
-    }
-    return entry;
-}
-
-/*
- * Return the in-memory page for `pgno`. A cached page is returned as is;
- * otherwise the page is read from the file and added to the cache.
- * Returns NULL on allocation or read failure.
- */
-static struct mpage *
-btree_get_mpage(struct btree *bt, pgno_t pgno)
-{
-    struct mpage *found = mpage_lookup(bt, pgno);
-
-    if (found != NULL) {
-        DPRINTF("returning page %u from cache", pgno);
-        DPRINTF("btree_get_mpage %p", found);
-        return found;
-    }
-
-    /* Cache miss: build a new mpage and fill it from the file. */
-    found = (mpage *)calloc(1, sizeof(*found));
-    if (found == NULL)
-        return NULL;
-
-    found->page = (page *)malloc(bt->head.psize);
-    if (found->page == NULL) {
-        free(found);
-        return NULL;
-    }
-
-    if (btree_read_page(bt, pgno, found->page) != BT_SUCCESS) {
-        mpage_free(found);
-        return NULL;
-    }
-
-    found->pgno = pgno;
-    mpage_add(bt, found);
-
-    DPRINTF("btree_get_mpage %p", found);
-    return found;
-}
-
-/*
- * Join the byte strings s1 (n1 bytes) and s2 (n2 bytes) into cs and store
- * the combined length in *cn. On entry *cn must be at least n1 + n2.
- * With BT_REVERSEKEY set the order is s2 followed by s1, otherwise s1
- * followed by s2.
- */
-static void
-concat_prefix(struct btree *bt, char *s1, size_t n1, char *s2, size_t n2,
-    char *cs, size_t *cn)
-{
-    char *head = s1, *tail = s2;
-    size_t head_len = n1, tail_len = n2;
-
-    assert(*cn >= n1 + n2);
-
-    if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
-        head = s2;
-        head_len = n2;
-        tail = s1;
-        tail_len = n1;
-    }
-
-    bcopy(head, cs, head_len);
-    bcopy(tail, cs + head_len, tail_len);
-    *cn = n1 + n2;
-}
-
-static void /* Recompute mp->prefix from the page's position in the tree.
- * Does nothing when the btree has a custom comparator (bt->cmp != NULL).
- * Otherwise the prefix is reset to length 0; then, if both a lower and an
- * upper bounding separator are found among the ancestors, the prefix common
- * to both is stored in mp->prefix, else the parent's prefix (if there is a
- * parent) is copied.
- */
-find_common_prefix(struct btree *bt, struct mpage *mp)
-{
- if (bt->cmp != NULL)
- return;
-
- indx_t lbound = 0, ubound = 0;
- struct mpage *lp, *up;
- struct btkey lprefix, uprefix;
-
- mp->prefix.len = 0;
-
- lp = mp;
- while (lp->parent != NULL) { /* climb until a page is not the first child of its parent */
- if (lp->parent_index > 0) {
- lbound = lp->parent_index;
- break;
- }
- lp = lp->parent;
- }
-
- up = mp;
- while (up->parent != NULL) { /* climb until a page is not the last child of its parent */
- if (up->parent_index + 1 < (indx_t)NUMKEYS(up->parent)) {
- ubound = up->parent_index + 1;
- break;
- }
- up = up->parent;
- }
-
- if (lp->parent != NULL && up->parent != NULL) { /* both walks stopped below the root */
- expand_prefix(bt, lp->parent, lbound, &lprefix);
- expand_prefix(bt, up->parent, ubound, &uprefix);
- common_prefix(bt, &lprefix, &uprefix, &mp->prefix);
- }
- else if (mp->parent)
- bcopy(&mp->parent->prefix, &mp->prefix, sizeof(mp->prefix));
-
- DPRINTF("found common prefix [%.*s] (len %zu) for page %u",
- (int)mp->prefix.len, mp->prefix.str, mp->prefix.len, mp->pgno);
-}
-
-static int /* Descend from `root` through branch pages down to a leaf page.
- * At each branch the child is chosen by searchType: index 0 for SearchFirst,
- * the last index for SearchLast, otherwise the result of btree_search_node()
- * for `key`. Each visited child gets its parent, parent_index and prefix
- * set. If cursor is non-NULL, root and every visited child are pushed on it
- * and the chosen index is recorded in the parent's stack entry. If modify is
- * set, each child is passed through mpage_touch() after it was pushed.
- * Stores the leaf in *mpp and returns BT_SUCCESS; returns BT_FAIL on any
- * failure, including a non-branch page that is not a leaf.
- */
-btree_search_page_root(struct btree *bt, struct mpage *root, struct btval *key,
- struct cursor *cursor, enum SearchType searchType, int modify, struct mpage **mpp)
-{
- struct mpage *mp, *parent;
-
- if (cursor && cursor_push_page(cursor, root) == NULL)
- return BT_FAIL;
-
- mp = root;
- DPRINTF("searchType=%d isBranch=%d", searchType, IS_BRANCH(mp));
- while (IS_BRANCH(mp)) {
- unsigned int i = 0;
- struct node *node;
-
- DPRINTF("branch page %u has %lu keys", mp->pgno, NUMKEYS(mp));
- assert(NUMKEYS(mp) > 1);
- DPRINTF("found index 0 to page %u", NODEPGNO(NODEPTR(mp, 0)));
-
- if (searchType == SearchFirst) /* Initialize cursor to first page. */
- i = 0;
- else if (searchType == SearchLast) { /* Initialize cursor to last page. */
- i = NUMKEYS(mp) - 1;
- DPRINTF("SearchLast i=%d", i);
- } else {
- int exact;
- node = btree_search_node(bt, mp, key, &exact, &i);
- if (node == NULL) /* key is larger than every separator: take the last child */
- i = NUMKEYS(mp) - 1;
- else if (!exact) { /* separator at i is larger than key: the key belongs to the child before it */
- assert(i > 0);
- i--;
- }
- }
-
- if (key)
- DPRINTF("following index %u for key %.*s",
- i, (int)key->size, (char *)key->data);
- assert(i < NUMKEYS(mp));
- node = NODEPTR(mp, i);
-
- if (cursor)
- CURSOR_TOP(cursor)->ki = i; /* remember which child was taken from this page */
-
- parent = mp;
- if ((mp = btree_get_mpage(bt, NODEPGNO(node))) == NULL)
- return BT_FAIL;
- mp->parent = parent;
- mp->parent_index = i;
- find_common_prefix(bt, mp);
-
- if (cursor && cursor_push_page(cursor, mp) == NULL)
- return BT_FAIL;
-
- if (modify && (mp = mpage_touch(bt, mp)) == NULL) /* mp now refers to whatever mpage_touch() returned */
- return BT_FAIL;
- }
-
- if (!IS_LEAF(mp)) {
- DPRINTF("internal error, index points to a %02X page!?",
- mp->page->flags);
- return BT_FAIL;
- }
-
- DPRINTF("found leaf page %u for key %.*s", mp->pgno,
- key ? (int)key->size : 0, key ? (char *)key->data : NULL);
-
- *mpp = mp;
- return BT_SUCCESS;
-}
-
-/* Locate the leaf page a given key belongs to and store it in *mpp.
- *
- * searchType selects the target: SearchKey looks up `key`, SearchFirst takes
- * the lowest page and SearchLast the highest page.
- * With a transaction, the search starts at the transaction's root page;
- * without one, the meta data is re-read and the last committed root is used.
- * If cursor is non-null, the visited pages are pushed on the cursor stack.
- * If modify is true (only allowed inside a transaction), visited pages are
- * updated with new page numbers.
- * Returns BT_SUCCESS or BT_FAIL; errno is ENOENT for an empty tree and EINVAL
- * for a misuse of txn/modify or a failed transaction.
- */
-static int
-btree_search_page(struct btree *bt, struct btree_txn *txn, struct btval *key,
-    struct cursor *cursor, enum SearchType searchType, int modify, struct mpage **mpp)
-{
-    int status;
-    pgno_t start;
-    struct mpage *root_mp = 0;
-
-    if (txn != NULL) {
-        if (F_ISSET(txn->flags, BT_TXN_ERROR)) {
-            EPRINTF("transaction has failed, must abort");
-            errno = EINVAL;
-            return BT_FAIL;
-        }
-        start = txn->root;
-    } else {
-        /* Can't modify pages outside a transaction. */
-        if (modify) {
-            EPRINTF("cannot modify pages outside a transaction");
-            errno = EINVAL;
-            return BT_FAIL;
-        }
-        status = btree_read_meta(bt, NULL);
-        if (status != BT_SUCCESS)
-            return status;
-        start = bt->meta.root;
-    }
-
-    if (start == P_INVALID) {    /* Tree is empty. */
-        DPRINTF("tree is empty");
-        errno = ENOENT;
-        return BT_FAIL;
-    }
-
-    root_mp = btree_get_mpage(bt, start);
-    if (root_mp == NULL)
-        return BT_FAIL;
-
-    DPRINTF("root page has flags 0x%X mp=%p", root_mp->page->flags, root_mp);
-
-    assert(root_mp->parent == NULL);
-    assert(root_mp->prefix.len == 0);
-
-    /* A clean root is touched first and the transaction follows the result. */
-    if (modify && !root_mp->dirty) {
-        root_mp = mpage_touch(bt, root_mp);
-        if (root_mp == NULL)
-            return BT_FAIL;
-        txn->root = root_mp->pgno;
-    }
-
-    return btree_search_page_root(bt, root_mp, key, cursor, searchType, modify, mpp);
-}
-
-/*
- * Fetch the data belonging to leaf node `leaf` into `data`.
- *
- * `data` is zeroed first. For inline data:
- *   - mp == NULL: the bytes are copied into a malloc'ed buffer
- *     (data->free_data = 1);
- *   - mp != NULL: data->data points straight into the page and a reference
- *     is taken on mp (data->mp = mp, data->free_data = 0).
- * For F_BIGDATA nodes the overflow page chain, whose first page number is
- * stored in the node, is copied into a malloc'ed buffer.
- *
- * Returns BT_SUCCESS or BT_FAIL. On failure `data` is left zeroed, so it owns
- * nothing the caller would have to (or could wrongly) release.
- */
-static int
-btree_read_data(struct btree *bt, struct mpage *mp, struct node *leaf,
-    struct btval *data)
-{
-    struct mpage *omp;        /* overflow mpage */
-    size_t psz;
-    size_t max;
-    size_t sz = 0;
-    pgno_t pgno;
-
-    bzero(data, sizeof(*data));
-    max = bt->head.psize - PAGEHDRSZ;
-
-    if (!F_ISSET(leaf->flags, F_BIGDATA)) {
-        data->size = leaf->n_dsize;
-        if (data->size > 0) {
-            if (mp == NULL) {
-                if ((data->data = malloc(data->size)) == NULL) {
-                    bzero(data, sizeof(*data));
-                    return BT_FAIL;
-                }
-                bcopy(NODEDATA(leaf), data->data, data->size);
-                data->free_data = 1;
-                data->mp = NULL;
-            } else {
-                data->data = NODEDATA(leaf);
-                data->free_data = 0;
-                data->mp = mp;
-                mp->ref++;
-            }
-        }
-        return BT_SUCCESS;
-    }
-
-    /* Read overflow data.
-     */
-    DPRINTF("allocating %u byte for overflow data", leaf->n_dsize);
-    if ((data->data = malloc(leaf->n_dsize)) == NULL)
-        return BT_FAIL;
-    data->size = leaf->n_dsize;
-    data->free_data = 1;
-    data->mp = NULL;
-    bcopy(NODEDATA(leaf), &pgno, sizeof(pgno));
-    for (sz = 0; sz < data->size; ) {
-        omp = btree_get_mpage(bt, pgno);
-        if (omp == NULL || !F_ISSET(omp->page->flags, P_OVERFLOW)) {
-            DPRINTF("read overflow page %u failed", pgno);
-            /*
-             * A page returned by btree_get_mpage() is held by the
-             * page cache, so it must not be freed here. Only our
-             * own buffer is released, and the stale pointer is
-             * wiped so the caller cannot free it a second time.
-             */
-            free(data->data);
-            bzero(data, sizeof(*data));
-            return BT_FAIL;
-        }
-        psz = data->size - sz;
-        if (psz > max)
-            psz = max;
-        bcopy(omp->page->ptrs, (char *)data->data + sz, psz);
-        sz += psz;
-        pgno = omp->page->p_next_pgno;
-    }
-
-    return BT_SUCCESS;
-}
-
-/*
- * Look up `key` and return its value in `data`.
- *
- * Either bt or txn may be NULL, but not both; if both are given, txn must
- * belong to bt. The key size must be between 1 and MAXKEYSIZE.
- * Returns BT_SUCCESS, or BT_FAIL with errno EINVAL (bad arguments) or ENOENT
- * (no exact match for the key); other failures come from the page search.
- */
-int
-btree_txn_get(struct btree *bt, struct btree_txn *txn,
-    struct btval *key, struct btval *data)
-{
-    int status, is_exact;
-    struct node *hit;
-    struct mpage *leaf_page;
-
-    assert(key);
-    assert(data);
-    DPRINTF("===> get key [%.*s]", (int)key->size, (char *)key->data);
-
-    if (bt == NULL) {
-        if (txn == NULL) {
-            errno = EINVAL;
-            return BT_FAIL;
-        }
-        bt = txn->bt;
-    } else if (txn != NULL && txn->bt != bt) {
-        errno = EINVAL;
-        return BT_FAIL;
-    }
-
-    if (key->size == 0 || key->size > MAXKEYSIZE) {
-        errno = EINVAL;
-        return BT_FAIL;
-    }
-
-    status = btree_search_page(bt, txn, key, NULL, SearchKey, 0, &leaf_page);
-    if (status != BT_SUCCESS)
-        return status;
-
-    hit = btree_search_node(bt, leaf_page, key, &is_exact, NULL);
-    if (hit == NULL || !is_exact) {
-        errno = ENOENT;
-        status = BT_FAIL;
-    } else {
-        status = btree_read_data(bt, leaf_page, hit, data);
-    }
-
-    mpage_prune(bt);
-    return status;
-}
-
-/*
- * Replace the page on top of the cursor stack by its left or right sibling.
- *
- * The top page is popped. If the parent has another child in the requested
- * direction its index is adjusted; otherwise the function recurses to move
- * the parent itself to its sibling first. The sibling page is then loaded,
- * linked to its parent and pushed on the cursor.
- *
- * move_right selects the direction. If rightmost is set, the key index of
- * the newly pushed page is set to its last key, otherwise it stays at 0.
- * Returns BT_SUCCESS, or BT_FAIL (errno = ENOENT when the root is reached
- * and there is no sibling).
- */
-static int
-btree_sibling(struct cursor *cursor, int move_right, int rightmost)
-{
-    int rc;
-    struct node *indx;
-    struct ppage *parent, *top;
-    struct mpage *mp;
-
-    top = CURSOR_TOP(cursor);
-    if ((parent = SLIST_NEXT(top, entry)) == NULL) {
-        errno = ENOENT;
-        return BT_FAIL;        /* root has no siblings */
-    }
-
-    DPRINTF("parent page is page %u, index %u",
-        parent->mpage->pgno, parent->ki);
-
-    cursor_pop_page(cursor);
-    if (move_right ? (parent->ki + 1 >= NUMKEYS(parent->mpage))
-        : (parent->ki == 0)) {
-        DPRINTF("no more keys left, moving to %s node of %s sibling",
-            rightmost ? "rightmost" : "leftmost",
-            move_right ? "right" : "left");
-        if ((rc = btree_sibling(cursor, move_right, rightmost)) != BT_SUCCESS)
-            return rc;
-        parent = CURSOR_TOP(cursor);
-    } else {
-        if (move_right)
-            parent->ki++;
-        else
-            parent->ki--;
-        DPRINTF("just moving to %s index key %u",
-            move_right ? "right" : "left", parent->ki);
-    }
-    assert(IS_BRANCH(parent->mpage));
-
-    indx = NODEPTR(parent->mpage, parent->ki);
-    if ((mp = btree_get_mpage(cursor->bt, indx->n_pgno)) == NULL)
-        return BT_FAIL;
-    mp->parent = parent->mpage;
-    mp->parent_index = parent->ki;
-
-    /* cursor_push_page() allocates and can fail; never use a NULL entry. */
-    top = cursor_push_page(cursor, mp);
-    if (top == NULL)
-        return BT_FAIL;
-    find_common_prefix(cursor->bt, mp);
-    if (rightmost)
-        top->ki = NUMKEYS(mp)-1;
-
-    return BT_SUCCESS;
-}
-
-/*
- * Fill `key` with the full key of `node` on page `mp`.
- *
- * If the page has a prefix, the key is rebuilt (prefix and node key joined
- * by concat_prefix()) in a malloc'ed buffer and key->free_data is set to 1.
- * Without a prefix, key->data points into the page, key->mp is set and a
- * reference is taken on mp. A NULL key is accepted and ignored.
- * Returns 0 on success, -1 if the buffer could not be allocated.
- */
-static int
-bt_set_key(struct btree *bt, struct mpage *mp, struct node *node,
-    struct btval *key)
-{
-    if (key == NULL)
-        return 0;
-
-    if (mp->prefix.len > 0) {
-        key->size = node->ksize + mp->prefix.len;
-        key->data = malloc(key->size);
-        if (key->data == NULL)
-            return -1;
-        concat_prefix(bt,
-            mp->prefix.str, mp->prefix.len,
-            NODEKEY(node), node->ksize,
-            (char *)key->data, &key->size);
-        key->free_data = 1;
-        return 0;
-    }
-
-    /* No prefix: hand out the key bytes stored in the page itself. */
-    key->size = node->ksize;
-    key->data = NODEKEY(node);
-    key->free_data = 0;
-    key->mp = mp;
-    mp->ref++;
-    return 0;
-}
-
-/*
- * Advance an initialised cursor to the next entry and return its key and/or
- * data (either pointer may be NULL). When the current leaf is exhausted the
- * cursor moves to the right sibling page; if there is none, the cursor is
- * marked eof. Returns BT_SUCCESS or BT_FAIL (errno = ENOENT if the cursor
- * was already at eof).
- */
-static int
-btree_cursor_next(struct cursor *cursor, struct btval *key, struct btval *data)
-{
-    struct ppage *pos;
-    struct mpage *page_mp;
-    struct node *entry;
-
-    if (cursor->eof) {
-        errno = ENOENT;
-        return BT_FAIL;
-    }
-
-    assert(cursor->initialized);
-
-    pos = CURSOR_TOP(cursor);
-    page_mp = pos->mpage;
-
-    DPRINTF("cursor_next: top page is %u in cursor %p", page_mp->pgno, cursor);
-
-    if (pos->ki + 1 < NUMKEYS(page_mp)) {
-        pos->ki++;
-    } else {
-        DPRINTF("=====> move to next sibling page");
-        if (btree_sibling(cursor, 1, 0) != BT_SUCCESS) {
-            cursor->eof = 1;
-            return BT_FAIL;
-        }
-        pos = CURSOR_TOP(cursor);
-        page_mp = pos->mpage;
-        DPRINTF("next page is %u, key index %u", page_mp->pgno, pos->ki);
-    }
-
-    DPRINTF("==> cursor points to page %u with %lu keys, key index %u",
-        page_mp->pgno, NUMKEYS(page_mp), pos->ki);
-
-    assert(IS_LEAF(page_mp));
-    entry = NODEPTR(page_mp, pos->ki);
-
-    if (data != NULL &&
-        btree_read_data(cursor->bt, page_mp, entry, data) != BT_SUCCESS)
-        return BT_FAIL;
-
-    return bt_set_key(cursor->bt, page_mp, entry, key) != 0
-        ? BT_FAIL : BT_SUCCESS;
-}
-
-/*
- * Move an initialised cursor back to the previous entry and return its key
- * and/or data (either pointer may be NULL). When the cursor is on the first
- * key of its leaf it moves to the last key of the left sibling page; if
- * there is none, the cursor is marked eof. Returns BT_SUCCESS or BT_FAIL
- * (errno = ENOENT if the cursor was already at eof).
- */
-static int
-btree_cursor_prev(struct cursor *cursor, struct btval *key, struct btval *data)
-{
-    struct ppage *pos;
-    struct mpage *page_mp;
-    struct node *entry;
-
-    if (cursor->eof) {
-        errno = ENOENT;
-        return BT_FAIL;
-    }
-
-    assert(cursor->initialized);
-
-    pos = CURSOR_TOP(cursor);
-    page_mp = pos->mpage;
-
-    DPRINTF("top page is %u in cursor %p", page_mp->pgno, cursor);
-
-    if (pos->ki != 0) {
-        pos->ki--;
-    } else {
-        /* Already on the first key of this page. */
-        DPRINTF("=====> move to prev sibling page");
-        if (btree_sibling(cursor, 0, 1) != BT_SUCCESS) {
-            cursor->eof = 1;
-            return BT_FAIL;
-        }
-        pos = CURSOR_TOP(cursor);
-        page_mp = pos->mpage;
-        DPRINTF("next page is %u, key index %u", page_mp->pgno, pos->ki);
-    }
-
-    DPRINTF("==> cursor points to page %u with %lu keys, key index %u",
-        page_mp->pgno, NUMKEYS(page_mp), pos->ki);
-
-    assert(IS_LEAF(page_mp));
-    entry = NODEPTR(page_mp, pos->ki);
-
-    if (data != NULL &&
-        btree_read_data(cursor->bt, page_mp, entry, data) != BT_SUCCESS)
-        return BT_FAIL;
-
-    return bt_set_key(cursor->bt, page_mp, entry, key) != 0
-        ? BT_FAIL : BT_SUCCESS;
-}
-
-/*
- * Position the cursor on `key`, or on the smallest entry larger than it.
- *
- * If exactp is non-NULL only an exact match is accepted; otherwise the call
- * fails with errno = ENOENT. When the leaf holds no entry >= key, the cursor
- * moves to the first entry of the right sibling page. On success the cursor
- * is marked initialised, and `data` (if non-NULL) and `key` are filled from
- * the entry found.
- */
-static int
-btree_cursor_set(struct cursor *cursor, struct btval *key, struct btval *data,
-    int *exactp)
-{
-    int status;
-    struct node *entry;
-    struct mpage *page_mp;
-    struct ppage *pos;
-
-    assert(cursor);
-    assert(key);
-    assert(key->size > 0);
-
-    status = btree_search_page(cursor->bt, cursor->txn, key, cursor, SearchKey, 0, &page_mp);
-    if (status != BT_SUCCESS)
-        return status;
-    assert(IS_LEAF(page_mp));
-
-    pos = CURSOR_TOP(cursor);
-    entry = btree_search_node(cursor->bt, page_mp, key, exactp, &pos->ki);
-    if (exactp != NULL && !*exactp) {
-        /* BT_CURSOR_EXACT specified and not an exact match. */
-        errno = ENOENT;
-        return BT_FAIL;
-    }
-
-    if (entry == NULL) {
-        DPRINTF("===> inexact leaf not found, goto sibling");
-        if (btree_sibling(cursor, 1, 0) != BT_SUCCESS)
-            return BT_FAIL;        /* no entries matched */
-        pos = CURSOR_TOP(cursor);
-        pos->ki = 0;
-        page_mp = pos->mpage;
-        assert(IS_LEAF(page_mp));
-        entry = NODEPTR(page_mp, 0);
-    }
-
-    cursor->initialized = 1;
-    cursor->eof = 0;
-
-    if (data != NULL &&
-        btree_read_data(cursor->bt, page_mp, entry, data) != BT_SUCCESS)
-        return BT_FAIL;
-
-    if (bt_set_key(cursor->bt, page_mp, entry, key) != 0)
-        return BT_FAIL;
-    DPRINTF("==> cursor placed on key %.*s",
-        (int)key->size, (char *)key->data);
-
-    return BT_SUCCESS;
-}
-
-/*
- * Position the cursor on the first entry of the lowest leaf page and return
- * its key and/or data. Marks the cursor initialised and clears eof.
- * Returns BT_SUCCESS, BT_FAIL, or the failure code of the page search.
- */
-static int
-btree_cursor_first(struct cursor *cursor, struct btval *key, struct btval *data)
-{
-    struct mpage *page_mp;
-    struct node *entry;
-    int status;
-
-    status = btree_search_page(cursor->bt, cursor->txn, NULL, cursor, SearchFirst, 0, &page_mp);
-    if (status != BT_SUCCESS)
-        return status;
-    assert(IS_LEAF(page_mp));
-
-    entry = NODEPTR(page_mp, 0);
-    cursor->initialized = 1;
-    cursor->eof = 0;
-
-    if (data != NULL &&
-        btree_read_data(cursor->bt, page_mp, entry, data) != BT_SUCCESS)
-        return BT_FAIL;
-
-    return bt_set_key(cursor->bt, page_mp, entry, key) != 0
-        ? BT_FAIL : BT_SUCCESS;
-}
-
-/*
- * Position the cursor on the last entry of the highest leaf page and return
- * its key and/or data. Marks the cursor initialised and clears eof.
- * Returns BT_SUCCESS, BT_FAIL, or the failure code of the page search.
- */
-static int
-btree_cursor_last(struct cursor *cursor, struct btval *key, struct btval *data)
-{
-    struct mpage *page_mp;
-    struct node *entry;
-    struct ppage *pos;
-    int status;
-
-    status = btree_search_page(cursor->bt, cursor->txn, NULL, cursor, SearchLast, 0, &page_mp);
-    if (status != BT_SUCCESS)
-        return status;
-    assert(IS_LEAF(page_mp));
-
-    /* Point the cursor at the final key of that leaf. */
-    pos = CURSOR_TOP(cursor);
-    pos->ki = NUMKEYS(page_mp)-1;
-    entry = NODEPTR(page_mp, pos->ki);
-    cursor->initialized = 1;
-    cursor->eof = 0;
-
-    if (data != NULL &&
-        btree_read_data(cursor->bt, page_mp, entry, data) != BT_SUCCESS)
-        return BT_FAIL;
-
-    return bt_set_key(cursor->bt, page_mp, entry, key) != 0
-        ? BT_FAIL : BT_SUCCESS;
-}
-
-/*
- * Cursor entry point: position or move `cursor` according to `op` and return
- * the key and/or data of the entry it ends up on.
- *
- * BT_CURSOR / BT_CURSOR_EXACT, BT_FIRST and BT_LAST reposition the cursor and
- * therefore empty its page stack first. BT_NEXT and BT_PREV step an
- * initialised cursor, or start at the first / last entry if it is not yet
- * initialised. For BT_CURSOR / BT_CURSOR_EXACT the key must be non-NULL with
- * a size between 1 and MAXKEYSIZE, otherwise errno is set to EINVAL.
- * The page cache is pruned before returning. Returns BT_SUCCESS or BT_FAIL.
- */
-int
-btree_cursor_get(struct cursor *cursor, struct btval *key, struct btval *data,
-    enum cursor_op op)
-{
-    int status;
-    int exact = 0;
-
-    assert(cursor);
-
-    /* Operations that reposition the cursor start with an empty stack. */
-    if (op == BT_CURSOR || op == BT_CURSOR_EXACT ||
-        op == BT_FIRST || op == BT_LAST) {
-        while (CURSOR_TOP(cursor) != NULL)
-            cursor_pop_page(cursor);
-    }
-
-    switch (op) {
-    case BT_CURSOR:
-    case BT_CURSOR_EXACT:
-        if (key == NULL || key->size == 0 || key->size > MAXKEYSIZE) {
-            errno = EINVAL;
-            status = BT_FAIL;
-        } else {
-            status = btree_cursor_set(cursor, key, data,
-                op == BT_CURSOR_EXACT ? &exact : NULL);
-        }
-        break;
-    case BT_NEXT:
-        status = cursor->initialized
-            ? btree_cursor_next(cursor, key, data)
-            : btree_cursor_first(cursor, key, data);
-        break;
-    case BT_PREV:
-        status = cursor->initialized
-            ? btree_cursor_prev(cursor, key, data)
-            : btree_cursor_last(cursor, key, data);
-        break;
-    case BT_FIRST:
-        status = btree_cursor_first(cursor, key, data);
-        break;
-    case BT_LAST:
-        status = btree_cursor_last(cursor, key, data);
-        break;
-    default:
-        DPRINTF("unhandled/unimplemented cursor operation %u", op);
-        status = BT_FAIL;
-        break;
-    }
-
-    mpage_prune(cursor->bt);
-
-    return status;
-}
-
-/* Return the btree handle stored in the cursor (cursor->bt). */
-struct btree *
-btree_cursor_bt(struct cursor *cursor)
-{
-    struct btree *owner;
-
-    assert(cursor);
-    owner = cursor->bt;
-    return owner;
-}
-
-/* Return the transaction pointer stored in the cursor (cursor->txn). */
-struct btree_txn *
-btree_cursor_txn(struct cursor *cursor)
-{
-    struct btree_txn *owner;
-
-    assert(cursor);
-    owner = cursor->txn;
-    return owner;
-}
-
-/*
- * Allocate a new, zero-filled page of the given type inside the current
- * transaction. The page gets the transaction's next page number, an empty
- * pointer area (lower = PAGEHDRSZ, upper = page size), is added to the page
- * cache and marked dirty. The per-type page counters in bt->meta are
- * updated for branch, leaf and overflow pages.
- * Returns the new mpage, or NULL if memory could not be allocated.
- */
-static struct mpage *
-btree_new_page(struct btree *bt, uint32_t flags)
-{
-    struct mpage *created;
-
-    assert(bt != NULL);
-    assert(bt->txn != NULL);
-
-    DPRINTF("allocating new mpage %u, page size %u, flags %0X",
-        bt->txn->next_pgno, bt->head.psize, flags);
-
-    created = (mpage *)calloc(1, sizeof(*created));
-    if (created == NULL)
-        return NULL;
-
-    /* calloc hands back the page buffer already zero-filled. */
-    created->page = (page *)calloc(1, bt->head.psize);
-    if (created->page == NULL) {
-        free(created);
-        return NULL;
-    }
-
-    created->page->pgno = bt->txn->next_pgno++;
-    created->pgno = created->page->pgno;
-    created->page->flags = flags;
-    created->page->lower = PAGEHDRSZ;
-    created->page->upper = bt->head.psize;
-
-    if (IS_BRANCH(created))
-        bt->meta.branch_pages++;
-    else if (IS_LEAF(created))
-        bt->meta.leaf_pages++;
-    else if (IS_OVERFLOW(created))
-        bt->meta.overflow_pages++;
-
-    mpage_add(bt, created);
-    mpage_dirty(bt, created);
-
-    return created;
-}
-
-/*
- * Number of bytes a key/data pair takes up on leaf page `mp`, including its
- * slot in the pointer array. If the data would go to an overflow page, only
- * a page number is counted in place of the data.
- */
-static size_t
-bt_leaf_size(struct btree *bt, struct mpage *mp, struct btval *key, struct btval *data)
-{
-    size_t total = LEAFSIZE(key, data);
-
-    if (bt_is_overflow(bt, mp, key->size, data->size)) {
-        /* put on overflow page */
-        total -= data->size - sizeof(pgno_t);
-    }
-
-    total += sizeof(indx_t);
-    return total;
-}
-
-/*
- * Decide whether data of `dsize` bytes has to be stored on overflow pages.
- * Returns 1 if so, 0 otherwise.
- *
- * With ENABLE_BIG_KEYS the decision depends on the free space of `mp`: the
- * node (key, data, header and pointer slot) must fit, and an empty page must
- * keep at least MAXKEYSIZE bytes free afterwards. Without it only the data
- * size counts: anything of at least page size / BT_MINKEYS bytes overflows.
- */
-static int
-bt_is_overflow(struct btree *bt, struct mpage *mp, size_t ksize, size_t dsize)
-{
-    assert(bt && mp);
-#ifdef ENABLE_BIG_KEYS
-    const size_t needed = dsize + ksize + NODESIZE + sizeof(indx_t);
-
-    if (needed > SIZELEFT(mp))
-        return 1;
-    if (NUMKEYS(mp) == 0 && (SIZELEFT(mp) - needed) < MAXKEYSIZE)
-        return 1;
-    return 0;
-#else
-    (void)ksize;
-    return (dsize >= bt->head.psize / BT_MINKEYS) ? 1 : 0;
-#endif
-}
-
-/*
- * Number of bytes a branch entry for `key` takes up on a page, including its
- * slot in the pointer array.
- *
- * Keys of at least page size / BT_MINKEYS bytes would belong on an overflow
- * page, but that is not implemented, so such keys are counted in full too.
- */
-static size_t
-bt_branch_size(struct btree *bt, struct btval *key)
-{
-    const size_t entry = INDXSIZE(key);
-
-    /* put on overflow page: not implemented */
-    /* (would be: size -= key->size - sizeof(pgno_t) once entry reaches
-     * bt->head.psize / BT_MINKEYS) */
-    (void)bt;
-
-    return entry + sizeof(indx_t);
-}
-
-/*
- * Copy `data` onto a chain of overflow pages starting with page `p`.
- *
- * Each page takes up to (page size - PAGEHDRSZ) bytes. While more data
- * remains than fits, a new P_OVERFLOW page is allocated and linked through
- * p_next_pgno; the last page of the chain gets a link of 0.
- * Returns BT_SUCCESS, or BT_FAIL if a page could not be allocated.
- */
-static int
-btree_write_overflow_data(struct btree *bt, struct page *p, struct btval *data)
-{
-    const size_t capacity = bt->head.psize - PAGEHDRSZ;
-    size_t written = 0;
-    struct mpage *chained = NULL;
-
-    while (written < data->size) {
-        const size_t remaining = data->size - written;
-        size_t chunk = remaining;
-
-        /* Continue on the page allocated during the previous round. */
-        if (chained != NULL)
-            p = chained->page;
-
-        if (remaining > capacity) {
-            /* need another overflow page */
-            chunk = capacity;
-            chained = btree_new_page(bt, P_OVERFLOW);
-            if (chained == NULL)
-                return BT_FAIL;
-            p->p_next_pgno = chained->pgno;
-            DPRINTF("linking overflow page %u", chained->pgno);
-        } else {
-            p->p_next_pgno = 0;        /* indicates end of list */
-        }
-
-        DPRINTF("copying %zu bytes to overflow page %u", chunk, p->pgno);
-        bcopy((char *)data->data + written, p->ptrs, chunk);
-        written += chunk;
-    }
-
-    return BT_SUCCESS;
-}
-
-/* Insert a node at index indx of page mp.
- * Key prefix should already be stripped.
- * On a leaf page the node stores key and data; data that is too large (or
- * that does not fit the remaining space when ENABLE_BIG_KEYS is off) is
- * written to newly allocated overflow pages and only the first overflow page
- * number is stored in the node. With F_BIGDATA in flags the data is taken to
- * be on an overflow page already and data->data supplies the page number.
- * On a branch page the node stores key and the child page number pgno.
- * Returns BT_SUCCESS, or BT_FAIL if the node does not fit or an overflow
- * page could not be allocated or written.
- */
-static int
-btree_add_node(struct btree *bt, struct mpage *mp, indx_t indx,
- struct btval *key, struct btval *data, pgno_t pgno, uint8_t flags)
-{
- unsigned int i;
- size_t node_size = NODESIZE;
- indx_t ofs;
- struct node *node;
- struct page *p;
- struct mpage *ofp = NULL; /* overflow page */
-
- p = mp->page;
- assert(p->upper >= p->lower);
-
- DPRINTF("add node [%.*s] to %s page %u at index %i, key size %zu",
- key ? (int)key->size : 0, key ? (char *)key->data : NULL,
- IS_LEAF(mp) ? "leaf" : "branch",
- mp->pgno, indx, key ? key->size : 0);
-
- if (key != NULL)
- node_size += key->size;
-
- if (IS_LEAF(mp)) {
- assert(data);
- node_size += data->size;
- if (F_ISSET(flags, F_BIGDATA)) {
- /* Data already on overflow page. */
- node_size -= data->size - sizeof(pgno_t); /* only a page number is stored inline */
-#ifdef ENABLE_BIG_KEYS
- } else if (bt_is_overflow(bt, mp, (key ? key->size : 0), data->size)) {
-#else
- } else if (bt_is_overflow(bt, mp, (key ? key->size : 0), data->size)
- || (node_size + sizeof(indx_t) > SIZELEFT(mp))) {
-#endif
- /* Put data on overflow page. */
- DPRINTF("data size is %zu, put on overflow page",
- data->size);
- node_size -= data->size - sizeof(pgno_t);
- if ((ofp = btree_new_page(bt, P_OVERFLOW)) == NULL)
- return BT_FAIL;
- DPRINTF("allocated overflow page %u", ofp->pgno);
- flags |= F_BIGDATA;
- }
- }
-
- if (node_size + sizeof(indx_t) > SIZELEFT(mp)) { /* node plus its pointer slot must fit */
- DPRINTF("not enough room in page %u, got %lu ptrs",
- mp->pgno, NUMKEYS(mp));
- DPRINTF("upper - lower = %u - %u = %u", p->upper, p->lower,
- p->upper - p->lower);
- DPRINTF("node size = %zu", node_size);
- return BT_FAIL;
- }
-
- /* Move higher pointers up one slot. */
- for (i = NUMKEYS(mp); i > indx; i--)
- p->ptrs[i] = p->ptrs[i - 1];
-
- /* Adjust free space offsets. */
- ofs = p->upper - node_size; /* the node is placed directly below the current upper bound */
- assert(ofs >= p->lower + sizeof(indx_t));
- p->ptrs[indx] = ofs;
- p->upper = ofs;
- p->lower += sizeof(indx_t);
-
- /* Write the node data. */
- node = NODEPTR(mp, indx);
- node->ksize = (key == NULL) ? 0 : key->size;
- node->flags = flags;
- if (IS_LEAF(mp)) {
- node->n_dsize = data->size; /* full data size, even if the data lives on overflow pages */
- } else {
- node->n_pgno = pgno;
- }
-
- if (key)
- bcopy(key->data, NODEKEY(node), key->size);
-
- if (IS_LEAF(mp)) {
- assert(key);
- if (ofp == NULL) {
- if (F_ISSET(flags, F_BIGDATA))
- bcopy(data->data, node->data + key->size,
- sizeof(pgno_t));
- else
- bcopy(data->data, node->data + key->size,
- data->size);
- } else {
- bcopy(&ofp->pgno, node->data + key->size,
- sizeof(pgno_t)); /* store the first overflow page number after the key */
- if (btree_write_overflow_data(bt, ofp->page,
- data) == BT_FAIL)
- return BT_FAIL;
- }
- }
-
- return BT_SUCCESS;
-}
-
-static void
-btree_del_node(struct btree *, struct mpage *mp, indx_t indx)
-{
- unsigned int sz;
- indx_t i, j, numkeys, ptr;
- struct node *node;
- char *base;
-
- DPRINTF("delete node %u on %s page %u", indx,
- IS_LEAF(mp) ? "leaf" : "branch", mp->pgno);
- assert(indx < NUMKEYS(mp));
-
- node = NODEPTR(mp, indx);
- sz = NODESIZE + node->ksize;
- if (IS_LEAF(mp)) {
- if (F_ISSET(node->flags, F_BIGDATA))
- sz += sizeof(pgno_t);
- else
- sz += NODEDSZ(node);
- }
-
- ptr = mp->page->ptrs[indx];
- numkeys = NUMKEYS(mp);
- for (i = j = 0; i < numkeys; i++) {
- if (i != indx) {
- mp->page->ptrs[j] = mp->page->ptrs[i];
- if (mp->page->ptrs[i] < ptr)
- mp->page->ptrs[j] += sz;
- j++;
- }
- }
-
- base = (char *)mp->page + mp->page->upper;
- bcopy(base, base + sz, ptr - mp->page->upper);
-
- mp->page->lower -= sizeof(indx_t);
- mp->page->upper += sz;
-}
-
-struct cursor *
-btree_txn_cursor_open(struct btree *bt, struct btree_txn *txn)
-{
- struct cursor *cursor;
-
- if (bt != NULL && txn != NULL && bt != txn->bt) {
- errno = EINVAL;
- DPRINTF("bt=%p does not belong to txn=%p (txn->bt=%p)", bt, txn, txn->bt);
- return NULL;
- }
-
- if (bt == NULL) {
- if (txn == NULL) {
- errno = EINVAL;
- DPRINTF("bt and txn both null");
- return NULL;
- }
- bt = txn->bt;
- }
-
- if ((cursor = (struct cursor *)calloc(1, sizeof(struct cursor))) != NULL) {
- SLIST_INIT(&cursor->stack);
- cursor->bt = bt;
- cursor->txn = txn;
- btree_ref(bt);
- }
-
- return cursor;
-}
-
-void
-btree_cursor_close(struct cursor *cursor)
-{
- if (cursor != NULL) {
- while (!CURSOR_EMPTY(cursor))
- cursor_pop_page(cursor);
-
- btree_close(cursor->bt);
- free(cursor);
- }
-}
-
-static int
-btree_update_key(struct btree *, struct mpage *mp, indx_t indx,
- struct btval *key)
-{
- indx_t ptr, i, numkeys;
- int delta;
- size_t len;
- struct node *node;
- char *base;
-
- node = NODEPTR(mp, indx);
- ptr = mp->page->ptrs[indx];
- DPRINTF("update key %u (ofs %u) [%.*s] to [%.*s] on page %u",
- indx, ptr,
- (int)node->ksize, (char *)NODEKEY(node),
- (int)key->size, (char *)key->data,
- mp->pgno);
-
- if (key->size != node->ksize) {
- delta = key->size - node->ksize;
- if (delta > 0 && SIZELEFT(mp) < delta) {
- DPRINTF("OUCH! Not enough room, delta = %d", delta);
- return BT_FAIL;
- }
-
- numkeys = NUMKEYS(mp);
- for (i = 0; i < numkeys; i++) {
- if (mp->page->ptrs[i] <= ptr)
- mp->page->ptrs[i] -= delta;
- }
-
- base = (char *)mp->page + mp->page->upper;
- len = ptr - mp->page->upper + NODESIZE;
- bcopy(base, base - delta, len);
- mp->page->upper -= delta;
-
- node = NODEPTR(mp, indx);
- node->ksize = key->size;
- }
-
- bcopy(key->data, NODEKEY(node), key->size);
-
- return BT_SUCCESS;
-}
-
-static int
-btree_adjust_prefix(struct btree *bt, struct mpage *src, int delta)
-{
- indx_t i;
- struct node *node;
- struct btkey tmpkey;
- struct btval key;
-
- DPRINTF("adjusting prefix lengths on page %u with delta %d",
- src->pgno, delta);
- assert(delta != 0);
-
- for (i = 0; i < NUMKEYS(src); i++) {
- node = NODEPTR(src, i);
- tmpkey.len = node->ksize - delta;
- if (delta > 0) {
- if (F_ISSET(bt->flags, BT_REVERSEKEY))
- bcopy(NODEKEY(node), tmpkey.str, tmpkey.len);
- else
- bcopy((char *)NODEKEY(node) + delta, tmpkey.str,
- tmpkey.len);
- } else {
- if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
- bcopy(NODEKEY(node), tmpkey.str, node->ksize);
- bcopy(src->prefix.str, tmpkey.str + node->ksize,
- -delta);
- } else {
- bcopy(src->prefix.str + src->prefix.len + delta,
- tmpkey.str, -delta);
- bcopy(NODEKEY(node), tmpkey.str - delta,
- node->ksize);
- }
- }
- key.size = tmpkey.len;
- key.data = tmpkey.str;
- if (btree_update_key(bt, src, i, &key) != BT_SUCCESS)
- return BT_FAIL;
- }
-
- return BT_SUCCESS;
-}
-
-/* Move a node from src to dst.
- */
-static int
-btree_move_node(struct btree *bt, struct mpage *src, indx_t srcindx,
- struct mpage *dst, indx_t dstindx)
-{
- int rc;
- unsigned int pfxlen, mp_pfxlen = 0;
- struct node *srcnode;
- struct mpage *mp = NULL;
- struct btkey tmpkey, srckey;
- struct btval key, data;
-
- assert(src->parent);
- assert(dst->parent);
-
- srcnode = NODEPTR(src, srcindx);
- DPRINTF("moving %s node %u [%.*s] on page %u to node %u on page %u",
- IS_LEAF(src) ? "leaf" : "branch",
- srcindx,
- (int)srcnode->ksize, (char *)NODEKEY(srcnode),
- src->pgno,
- dstindx, dst->pgno);
-
- find_common_prefix(bt, src);
-
- if (IS_BRANCH(src)) {
- /* Need to check if the page the moved node points to
- * changes prefix.
- */
- if ((mp = btree_get_mpage(bt, NODEPGNO(srcnode))) == NULL)
- return BT_FAIL;
- mp->parent = src;
- mp->parent_index = srcindx;
- find_common_prefix(bt, mp);
- mp_pfxlen = mp->prefix.len;
- }
-
- /* Mark src and dst as dirty. */
- if ((src = mpage_touch(bt, src)) == NULL ||
- (dst = mpage_touch(bt, dst)) == NULL)
- return BT_FAIL;
-
- find_common_prefix(bt, dst);
-
- /* Check if src node has destination page prefix. Otherwise the
- * destination page must expand its prefix on all its nodes.
- */
- srckey.len = srcnode->ksize;
- bcopy(NODEKEY(srcnode), srckey.str, srckey.len);
- common_prefix(bt, &srckey, &dst->prefix, &tmpkey);
- if (tmpkey.len != dst->prefix.len) {
- if (btree_adjust_prefix(bt, dst,
- tmpkey.len - dst->prefix.len) != BT_SUCCESS)
- return BT_FAIL;
- bcopy(&tmpkey, &dst->prefix, sizeof(tmpkey));
- }
-
- if (srcindx == 0 && IS_BRANCH(src)) {
- struct mpage *low;
-
- /* must find the lowest key below src
- */
- assert(btree_search_page_root(bt, src, NULL, NULL, SearchFirst, 0,
- &low) == BT_SUCCESS);
- expand_prefix(bt, low, 0, &srckey);
- DPRINTF("found lowest key [%.*s] on leaf page %u",
- (int)srckey.len, srckey.str, low->pgno);
- } else {
- srckey.len = srcnode->ksize;
- bcopy(NODEKEY(srcnode), srckey.str, srcnode->ksize);
- }
- find_common_prefix(bt, src);
-
- /* expand the prefix */
- tmpkey.len = sizeof(tmpkey.str);
- concat_prefix(bt, src->prefix.str, src->prefix.len,
- srckey.str, srckey.len, tmpkey.str, &tmpkey.len);
-
- /* Add the node to the destination page. Adjust prefix for
- * destination page.
- */
- key.size = tmpkey.len;
- key.data = tmpkey.str;
- remove_prefix(bt, &key, dst->prefix.len);
- data.size = NODEDSZ(srcnode);
- data.data = NODEDATA(srcnode);
- rc = btree_add_node(bt, dst, dstindx, &key, &data, NODEPGNO(srcnode),
- srcnode->flags);
- if (rc != BT_SUCCESS)
- return rc;
-
- /* Delete the node from the source page.
- */
- btree_del_node(bt, src, srcindx);
-
- /* Update the parent separators.
- */
- if (srcindx == 0 && src->parent_index != 0) {
- expand_prefix(bt, src, 0, &tmpkey);
- key.size = tmpkey.len;
- key.data = tmpkey.str;
- remove_prefix(bt, &key, src->parent->prefix.len);
-
- DPRINTF("update separator for source page %u to [%.*s]",
- src->pgno, (int)key.size, (char *)key.data);
- if (btree_update_key(bt, src->parent, src->parent_index,
- &key) != BT_SUCCESS)
- return BT_FAIL;
- }
-
- if (srcindx == 0 && IS_BRANCH(src)) {
- struct btval nullkey;
- nullkey.size = 0;
- assert(btree_update_key(bt, src, 0, &nullkey) == BT_SUCCESS);
- }
-
- if (dstindx == 0 && dst->parent_index != 0) {
- expand_prefix(bt, dst, 0, &tmpkey);
- key.size = tmpkey.len;
- key.data = tmpkey.str;
- remove_prefix(bt, &key, dst->parent->prefix.len);
-
- DPRINTF("update separator for destination page %u to [%.*s]",
- dst->pgno, (int)key.size, (char *)key.data);
- if (btree_update_key(bt, dst->parent, dst->parent_index,
- &key) != BT_SUCCESS)
- return BT_FAIL;
- }
-
- if (dstindx == 0 && IS_BRANCH(dst)) {
- struct btval nullkey;
- nullkey.size = 0;
- assert(btree_update_key(bt, dst, 0, &nullkey) == BT_SUCCESS);
- }
-
- /* We can get a new page prefix here!
- * Must update keys in all nodes of this page!
- */
- pfxlen = src->prefix.len;
- find_common_prefix(bt, src);
- if (src->prefix.len != pfxlen) {
- if (btree_adjust_prefix(bt, src,
- src->prefix.len - pfxlen) != BT_SUCCESS)
- return BT_FAIL;
- }
-
- pfxlen = dst->prefix.len;
- find_common_prefix(bt, dst);
- if (dst->prefix.len != pfxlen) {
- if (btree_adjust_prefix(bt, dst,
- dst->prefix.len - pfxlen) != BT_SUCCESS)
- return BT_FAIL;
- }
-
- if (IS_BRANCH(dst)) {
- assert(mp);
- mp->parent = dst;
- mp->parent_index = dstindx;
- find_common_prefix(bt, mp);
- if (mp->prefix.len != mp_pfxlen) {
- DPRINTF("moved branch node has changed prefix");
- if ((mp = mpage_touch(bt, mp)) == NULL)
- return BT_FAIL;
- if (btree_adjust_prefix(bt, mp,
- mp->prefix.len - mp_pfxlen) != BT_SUCCESS)
- return BT_FAIL;
- }
- }
-
- return BT_SUCCESS;
-}
-
-static int
-btree_merge(struct btree *bt, struct mpage *src, struct mpage *dst)
-{
- int rc;
- indx_t i;
- unsigned int pfxlen;
- struct node *srcnode;
- struct btkey tmpkey, dstpfx;
- struct btval key, data;
-
- DPRINTF("merging page %u and %u", src->pgno, dst->pgno);
-
- assert(src->parent); /* can't merge root page */
- assert(dst->parent);
- assert(bt->txn != NULL);
-
- /* Mark src and dst as dirty. */
- if ((src = mpage_touch(bt, src)) == NULL ||
- (dst = mpage_touch(bt, dst)) == NULL)
- return BT_FAIL;
-
- find_common_prefix(bt, src);
- find_common_prefix(bt, dst);
-
- /* Check if source nodes has destination page prefix. Otherwise
- * the destination page must expand its prefix on all its nodes.
- */
- common_prefix(bt, &src->prefix, &dst->prefix, &dstpfx);
- if (dstpfx.len != dst->prefix.len) {
- if (btree_adjust_prefix(bt, dst,
- dstpfx.len - dst->prefix.len) != BT_SUCCESS)
- return BT_FAIL;
- bcopy(&dstpfx, &dst->prefix, sizeof(dstpfx));
- }
-
- /* Move all nodes from src to dst.
- */
- for (i = 0; i < NUMKEYS(src); i++) {
- srcnode = NODEPTR(src, i);
-
- /* If branch node 0 (implicit key), find the real key.
- */
- if (i == 0 && IS_BRANCH(src)) {
- struct mpage *low;
-
- /* must find the lowest key below src
- */
- assert(btree_search_page_root(bt, src, NULL, NULL, SearchFirst, 0,
- &low) == BT_SUCCESS);
- expand_prefix(bt, low, 0, &tmpkey);
- DPRINTF("found lowest key [%.*s] on leaf page %u",
- (int)tmpkey.len, tmpkey.str, low->pgno);
- } else {
- expand_prefix(bt, src, i, &tmpkey);
- }
-
- key.size = tmpkey.len;
- key.data = tmpkey.str;
-
- remove_prefix(bt, &key, dst->prefix.len);
- data.size = NODEDSZ(srcnode);
- data.data = NODEDATA(srcnode);
- rc = btree_add_node(bt, dst, NUMKEYS(dst), &key,
- &data, NODEPGNO(srcnode), srcnode->flags);
- if (rc != BT_SUCCESS)
- return rc;
- }
-
- DPRINTF("dst page %u now has %lu keys (%.1f%% filled)",
- dst->pgno, NUMKEYS(dst), (float)PAGEFILL(bt, dst) / 10);
-
- /* Unlink the src page from parent.
- */
- btree_del_node(bt, src->parent, src->parent_index);
- if (src->parent_index == 0) {
- key.size = 0;
- if (btree_update_key(bt, src->parent, 0, &key) != BT_SUCCESS)
- return BT_FAIL;
-
- pfxlen = src->prefix.len;
- find_common_prefix(bt, src);
- assert (src->prefix.len == pfxlen);
- }
-
- if (IS_LEAF(src))
- bt->meta.leaf_pages--;
- else
- bt->meta.branch_pages--;
-
- return btree_rebalance(bt, src->parent);
-}
-
-#define FILL_THRESHOLD 250
-
-static int
-btree_rebalance(struct btree *bt, struct mpage *mp)
-{
- struct node *node;
- struct mpage *parent;
- struct mpage *root;
- struct mpage *neighbor = NULL;
- indx_t si = 0, di = 0;
-
- assert(bt != NULL);
- assert(bt->txn != NULL);
- assert(mp != NULL);
-
- DPRINTF("rebalancing %s page %u (has %lu keys, %.1f%% full)",
- IS_LEAF(mp) ? "leaf" : "branch",
- mp->pgno, NUMKEYS(mp), (float)PAGEFILL(bt, mp) / 10);
-
- if (PAGEFILL(bt, mp) >= FILL_THRESHOLD) {
- DPRINTF("no need to rebalance page %u, above fill threshold",
- mp->pgno);
- return BT_SUCCESS;
- }
-
- parent = mp->parent;
-
- if (parent == NULL) {
- if (NUMKEYS(mp) == 0) {
- DPRINTF("tree is completely empty");
- bt->txn->root = P_INVALID;
- bt->meta.depth--;
- bt->meta.leaf_pages--;
- } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) {
- DPRINTF("collapsing root page!");
- bt->txn->root = NODEPGNO(NODEPTR(mp, 0));
- if ((root = btree_get_mpage(bt, bt->txn->root)) == NULL)
- return BT_FAIL;
- root->parent = NULL;
- bt->meta.depth--;
- bt->meta.branch_pages--;
- } else
- DPRINTF("root page doesn't need rebalancing");
- return BT_SUCCESS;
- }
-
- /* The parent (branch page) must have at least 2 pointers,
- * otherwise the tree is invalid.
- */
- assert(NUMKEYS(parent) > 1);
-
- /* Leaf page fill factor is below the threshold.
- * Try to move keys from left or right neighbor, or
- * merge with a neighbor page.
- */
-
- /* Find neighbors.
- */
- if (mp->parent_index == 0) {
- /* We're the leftmost leaf in our parent.
- */
- DPRINTF("reading right neighbor");
- node = NODEPTR(parent, mp->parent_index + 1);
- if ((neighbor = btree_get_mpage(bt, NODEPGNO(node))) == NULL)
- return BT_FAIL;
- neighbor->parent_index = mp->parent_index + 1;
- si = 0;
- di = NUMKEYS(mp);
- } else {
- /* There is at least one neighbor to the left.
- */
- DPRINTF("reading left neighbor");
- node = NODEPTR(parent, mp->parent_index - 1);
- if ((neighbor = btree_get_mpage(bt, NODEPGNO(node))) == NULL)
- return BT_FAIL;
- neighbor->parent_index = mp->parent_index - 1;
- si = NUMKEYS(neighbor) - 1;
- di = 0;
- }
- neighbor->parent = parent;
-
- DPRINTF("found neighbor page %u (%lu keys, %.1f%% full)",
- neighbor->pgno, NUMKEYS(neighbor), (float)PAGEFILL(bt, neighbor) / 10);
-
- // Calculate the size of the node to be moved
- find_common_prefix (bt, neighbor);
- struct btkey oldMpPrefix = mp->prefix;
- find_common_prefix (bt, mp);
-
- node = NODEPTR(neighbor, si);
- size_t siSize = NODESIZE + node->ksize + neighbor->prefix.len;
- siSize += IS_BRANCH(neighbor) ? 0 : NODEDSZ(node);
-
- // Calculate the delta for the destination page prefix
- struct btkey nKey, newPrefix;
- nKey.len = node->ksize;
- bcopy(NODEKEY(node), nKey.str, nKey.len);
- common_prefix(bt, &nKey, &oldMpPrefix, &newPrefix);
- size_t prfxDelta = mp->prefix.len - newPrefix.len;
-
- /* If the neighbor page is above threshold and has at least two
- * keys, move one key from it.
- *
- * Otherwise we should try to merge them, but that might not be
- * possible, even if both are below threshold, as prefix expansion
- * might make keys larger. FIXME: detect this
- */
- if (PAGEFILL(bt, neighbor) >= FILL_THRESHOLD && NUMKEYS(neighbor) >= 2 &&
- NUMKEYS(mp)*prfxDelta + siSize < SIZELEFT(mp)) {
- // Key in parent of the source can change, if
- // we move the node from idx 0
- // We need to make sure that the new key fits into
- // parent page
- bool canUpdate = true;
- if (si == 0 && neighbor->parent_index != 0) {
- expand_prefix(bt, neighbor, 1, &nKey);
- node = NODEPTR(neighbor->parent, neighbor->parent_index);
- int oldLength = node->ksize;
- int newLength = nKey.len - neighbor->parent->prefix.len;
- if (newLength - oldLength > SIZELEFT(neighbor->parent)) canUpdate = false;
- }
- // Key in parent can change if we move into index 0
- // in destination page
- // We need to ensure that the new key fits into
- // parent page
- if (canUpdate && di == 0 && mp->parent_index != 0) {
- expand_prefix(bt, neighbor, si, &nKey);
- node = NODEPTR(mp->parent, mp->parent_index);
- int oldLength = node->ksize;
- int newLength = nKey.len - mp->parent->prefix.len;
- if (newLength - oldLength > SIZELEFT(mp->parent)) canUpdate = false;
- }
- if (canUpdate) {
- return btree_move_node(bt, neighbor, si, mp, di);
- }
- }
- else { /* FIXME: if (has_enough_room()) */
- // Calculate the worst case space requirement
- // This could be improved by calculating the 'prefix difference'
- // Note that worst case free can be negative
- // We are not changing the node at idx 0 so the parent
- // page free space shouldn't be an issue
- int nFree = SIZELEFT(neighbor) - NUMKEYS(neighbor)*neighbor->prefix.len;
- int mpFree = SIZELEFT(mp) - NUMKEYS(mp)*mp->prefix.len;
- int nRequired = 0;
- for (unsigned int i=0; i<NUMKEYS(neighbor); i++) {
- node = NODEPTR(neighbor, i);
- nRequired += NODESIZE + node->ksize;
- nRequired += IS_BRANCH(neighbor) ? 0 : NODEDSZ(node);
- }
- int mpRequired = 0;
- for (unsigned int i=0; i<NUMKEYS(mp); i++) {
- node = NODEPTR(mp, i);
- mpRequired += NODESIZE + node->ksize;
- mpRequired += IS_BRANCH(mp) ? 0 : NODEDSZ(node);
- }
-
- if (mp->parent_index == 0 && mpFree > nRequired)
- return btree_merge(bt, neighbor, mp);
- else if (nFree > mpRequired)
- return btree_merge(bt, mp, neighbor);
- }
- return BT_SUCCESS;
-}
-
-int
-btree_txn_del(struct btree *bt, struct btree_txn *txn,
- struct btval *key, struct btval *data)
-{
- int rc, exact, close_txn = 0;
- unsigned int ki;
- struct node *leaf;
- struct mpage *mp;
-
- DPRINTF("========> delete key %.*s", (int)key->size, (char *)key->data);
-
- assert(key != NULL);
-
- if (bt != NULL && txn != NULL && bt != txn->bt) {
- errno = EINVAL;
- return BT_FAIL;
- }
-
- if (txn != NULL && F_ISSET(txn->flags, BT_TXN_RDONLY)) {
- errno = EINVAL;
- return BT_FAIL;
- }
-
- if (bt == NULL) {
- if (txn == NULL) {
- errno = EINVAL;
- return BT_FAIL;
- }
- bt = txn->bt;
- }
-
- if (key->size == 0 || key->size > MAXKEYSIZE) {
- errno = EINVAL;
- return BT_FAIL;
- }
-
- if (txn == NULL) {
- close_txn = 1;
- if ((txn = btree_txn_begin(bt, 0)) == NULL)
- return BT_FAIL;
- }
-
- if ((rc = btree_search_page(bt, txn, key, NULL, SearchKey, 1, &mp)) != BT_SUCCESS)
- goto done;
-
- leaf = btree_search_node(bt, mp, key, &exact, &ki);
- if (leaf == NULL || !exact) {
- errno = ENOENT;
- rc = BT_FAIL;
- goto done;
- }
-
- if (data && (rc = btree_read_data(bt, NULL, leaf, data)) != BT_SUCCESS)
- goto done;
- btree_del_node(bt, mp, ki);
- bt->meta.entries--;
- rc = btree_rebalance(bt, mp);
- if (rc != BT_SUCCESS)
- txn->flags |= BT_TXN_ERROR;
-
-done:
- if (close_txn) {
- if (rc == BT_SUCCESS)
- rc = btree_txn_commit(txn, 0, 0);
- else
- btree_txn_abort(txn);
- }
- mpage_prune(bt);
- return rc;
-}
-
-/* Reduce the length of the prefix separator <sep> to the minimum length that
- * still makes it uniquely distinguishable from <min>.
- *
- * <min> is guaranteed to be sorted less than <sep>
- *
- * On return, <sep> is modified to the minimum length.
- */
-static void
-bt_reduce_separator(struct btree *bt, struct node *min, struct btval *sep)
-{
- assert(bt);
- size_t n = 0;
- char *p1;
- char *p2;
-
- if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
-
- assert(sep->size > 0);
-
- p1 = (char *)NODEKEY(min) + min->ksize - 1;
- p2 = (char *)sep->data + sep->size - 1;
-
- while (p1 >= (char *)NODEKEY(min) && *p1 == *p2) {
- assert(p2 > (char *)sep->data);
- p1--;
- p2--;
- n++;
- }
-
- sep->data = p2;
- sep->size = n + 1;
- } else {
-
- assert(min->ksize > 0);
- assert(sep->size > 0);
-
- p1 = (char *)NODEKEY(min);
- p2 = (char *)sep->data;
-
- while (*p1 == *p2) {
- p1++;
- p2++;
- n++;
- if (n == min->ksize || n == sep->size)
- break;
- }
-
- sep->size = n + 1;
- }
-
- DPRINTF("reduced separator to [%.*s] > [%.*s]",
- (int)sep->size, (char *)sep->data,
- (int)min->ksize, (char *)NODEKEY(min));
-}
-
-/* Split page <*mpp>, and insert <key,(data|newpgno)> in either left or
- * right sibling, at index <*newindxp> (as if unsplit). Updates *mpp and
- * *newindxp with the actual values after split, ie if *mpp and *newindxp
- * refer to a node in the new right sibling page.
- */
-static int
-btree_split(struct btree *bt, struct mpage **mpp, unsigned int *newindxp,
- struct btval *newkey, struct btval *newdata, pgno_t newpgno)
-{
- uint8_t flags;
- int rc = BT_SUCCESS, ins_new = 0;
- indx_t newindx;
- pgno_t pgno = 0;
- size_t orig_pfx_len, left_pfx_diff, right_pfx_diff, pfx_diff;
- unsigned int i, j, split_indx;
- struct node *node;
- struct mpage *pright, *p, *mp;
- struct btval sepkey, tmpkey, rkey, rdata;
- struct page *copy;
- void *allocated_block = 0;
-
- assert(bt != NULL);
- assert(bt->txn != NULL);
-
- mp = *mpp;
- newindx = *newindxp;
-
- DPRINTF("-----> splitting %s page %u and adding [%.*s] at index %i",
- IS_LEAF(mp) ? "leaf" : "branch", mp->pgno,
- (int)newkey->size, (char *)newkey->data, *newindxp);
- DPRINTF("page %u has prefix [%.*s]", mp->pgno,
- (int)mp->prefix.len, (char *)mp->prefix.str);
- orig_pfx_len = mp->prefix.len;
-
- if (mp->parent == NULL) {
- if ((mp->parent = btree_new_page(bt, P_BRANCH)) == NULL)
- return BT_FAIL;
- mp->parent_index = 0;
- bt->txn->root = mp->parent->pgno;
- DPRINTF("root split! new root = %u", mp->parent->pgno);
- bt->meta.depth++;
-
- /* Add left (implicit) pointer. */
- if (btree_add_node(bt, mp->parent, 0, NULL, NULL,
- mp->pgno, 0) != BT_SUCCESS)
- return BT_FAIL;
- } else {
- DPRINTF("parent branch page is %u", mp->parent->pgno);
- }
-
- /* Create a right sibling. */
- if ((pright = btree_new_page(bt, mp->page->flags)) == NULL)
- return BT_FAIL;
-
- pright->parent = mp->parent;
- pright->parent_index = mp->parent_index + 1;
- DPRINTF("new right sibling: page %u", pright->pgno);
-
- /* Move half of the keys to the right sibling. */
- if ((copy = (page *)malloc(bt->head.psize)) == NULL)
- return BT_FAIL;
- bcopy(mp->page, copy, bt->head.psize);
- assert(mp->ref == 0); /* XXX */
- bzero(&mp->page->ptrs, bt->head.psize - PAGEHDRSZ);
- mp->page->lower = PAGEHDRSZ;
- mp->page->upper = bt->head.psize;
-
- split_indx = NUMKEYSP(copy) / 2 + 1;
- DPRINTF("splitting copy of page %d [split index:%d, newindex:%d]", copy->pgno, split_indx, newindx);
-
- /* First find the separating key between the split pages.
- */
- bzero(&sepkey, sizeof(sepkey));
-
-#ifdef ENABLE_BIG_KEYS
- /* This could happen if there are less than 3 keys in the tree
- */
- if (split_indx >= NUMKEYSP(copy))
- split_indx = NUMKEYSP(copy) - 1;
-#endif
-
- if (newindx == split_indx) {
- sepkey.size = newkey->size;
- sepkey.data = newkey->data;
- remove_prefix(bt, &sepkey, mp->prefix.len);
- } else {
- node = NODEPTRP(copy, split_indx);
- sepkey.size = node->ksize;
- sepkey.data = NODEKEY(node);
- }
-
- if (IS_LEAF(mp) && bt->cmp == NULL) {
- /* Find the smallest separator. */
- /* Ref: Prefix B-trees, R. Bayer, K. Unterauer, 1977 */
- node = NODEPTRP(copy, split_indx - 1);
- bt_reduce_separator(bt, node, &sepkey);
- }
-
- /* Fix separator wrt parent prefix. */
- if (bt->cmp == NULL) {
- DPRINTF("concat prefix [%.*s] to separator [%.*s]",
- mp->prefix.len, mp->prefix.str,
- sepkey.size, (char *)sepkey.data);
- tmpkey.size = mp->prefix.len + sepkey.size;
- tmpkey.data = allocated_block = malloc(tmpkey.size);
- tmpkey.free_data = 0;
- tmpkey.mp = 0;
- concat_prefix(bt, mp->prefix.str, mp->prefix.len,
- (char *)sepkey.data, sepkey.size, (char*)tmpkey.data, &tmpkey.size);
- sepkey = tmpkey;
- }
-
- DPRINTF("separator is [%.*s]", (int)sepkey.size, (char *)sepkey.data);
- DPRINTF("%d bytes left in parent page %d with branch size %d",
- SIZELEFT(pright->parent), pright->parent->pgno,
- bt_branch_size(bt, &sepkey));
-
- /* Copy separator key to the parent.
- */
- if (SIZELEFT(pright->parent) < bt_branch_size(bt, &sepkey)) {
- rc = btree_split(bt, &pright->parent, &pright->parent_index,
- &sepkey, NULL, pright->pgno);
-
- /* Right page might now have changed parent.
- * Check if left page also changed parent.
- */
- if (pright->parent != mp->parent &&
- mp->parent_index >= NUMKEYS(mp->parent)) {
- mp->parent = pright->parent;
- mp->parent_index = pright->parent_index - 1;
- }
- } else {
- DPRINTF("removing %d bytes from seperator [%.*s]",
- pright->parent->prefix.len,
- sepkey.size, sepkey.data);
- remove_prefix(bt, &sepkey, pright->parent->prefix.len);
- DPRINTF("adding sepkey [%.*s] to page %d with reference to page %d",
- sepkey.size, sepkey.data,
- pright->parent->pgno, pright->pgno);
- rc = btree_add_node(bt, pright->parent, pright->parent_index,
- &sepkey, NULL, pright->pgno, 0);
- }
-
- if (rc != BT_SUCCESS) {
- free(copy);
- if (allocated_block) {
- sepkey.data = allocated_block;
- sepkey.free_data = 1;
- btval_reset(&sepkey);
- }
- return BT_FAIL;
- }
-
- /* Update prefix for right and left page, if the parent was split.
- */
- find_common_prefix(bt, pright);
- assert(orig_pfx_len <= pright->prefix.len);
- right_pfx_diff = pright->prefix.len - orig_pfx_len;
- DPRINTF("right page %d prefix length = %d, diff = %d", pright->pgno, pright->prefix.len, right_pfx_diff);
-
- find_common_prefix(bt, mp);
- assert(orig_pfx_len <= mp->prefix.len);
- left_pfx_diff = mp->prefix.len - orig_pfx_len;
- DPRINTF("left page %d prefix length = %d, diff = %d", mp->pgno, mp->prefix.len, left_pfx_diff);
-
- for (i = j = 0; i <= NUMKEYSP(copy); j++) {
- if (i < split_indx) {
- /* Re-insert in left sibling. */
- p = mp;
- pfx_diff = left_pfx_diff;
- } else {
- /* Insert in right sibling. */
- if (i == split_indx) {
- /* Reset insert index for right sibling. */
- j = (i == newindx && ins_new);
- }
- p = pright;
- pfx_diff = right_pfx_diff;
- }
-
- if (i == newindx && !ins_new) {
- /* Insert the original entry that caused the split. */
- rkey.data = newkey->data;
- rkey.size = newkey->size;
- if (IS_LEAF(mp)) {
- rdata.data = newdata->data;
- rdata.size = newdata->size;
- } else
- pgno = newpgno;
- flags = 0;
- pfx_diff = p->prefix.len;
-
- ins_new = 1;
-
- /* Update page and index for the new key. */
- *newindxp = j;
- *mpp = p;
- } else if (i == NUMKEYSP(copy)) {
- break;
- } else {
- node = NODEPTRP(copy, i);
- rkey.data = NODEKEY(node);
- rkey.size = node->ksize;
- if (IS_LEAF(mp)) {
- rdata.data = NODEDATA(node);
- rdata.size = node->n_dsize;
- } else
- pgno = node->n_pgno;
- flags = node->flags;
-
- i++;
- }
-
- if (!IS_LEAF(mp) && j == 0) {
- /* First branch index doesn't need key data. */
- rkey.size = 0;
- } else {
- remove_prefix(bt, &rkey, pfx_diff);
- }
-
- rc = btree_add_node(bt, p, j, &rkey, &rdata, pgno,flags);
- }
-
- free(copy);
- if (allocated_block) {
- sepkey.data = allocated_block;
- sepkey.free_data = 1;
- btval_reset(&sepkey);
- }
- return rc;
-}
-
-int
-btree_txn_put(struct btree *bt, struct btree_txn *txn,
- struct btval *key, struct btval *data, unsigned int flags)
-{
- int rc = BT_SUCCESS, exact, close_txn = 0;
- unsigned int ki;
- struct node *leaf;
- struct mpage *mp;
- struct btval xkey;
-
- assert(key != NULL);
- assert(data != NULL);
-
- if (bt != NULL && txn != NULL && bt != txn->bt) {
- fprintf(stderr, "%s:%d: transaction does not belong to btree\n",
- __FUNCTION__, __LINE__);
- errno = EINVAL;
- return BT_FAIL;
- }
-
- if (txn != NULL && F_ISSET(txn->flags, BT_TXN_RDONLY)) {
- fprintf(stderr, "%s:%d: read-only transaction\n",
- __FUNCTION__, __LINE__);
- errno = EINVAL;
- return BT_FAIL;
- }
-
- if (bt == NULL) {
- if (txn == NULL) {
- fprintf(stderr, "%s:%d: neither transaction nor btree\n",
- __FUNCTION__, __LINE__);
- errno = EINVAL;
- return BT_FAIL;
- }
- bt = txn->bt;
- }
-
- if (key->size == 0 || key->size > MAXKEYSIZE) {
- fprintf(stderr, "%s:%d: bad key size %zu (MAXKEYSIZE %d)\n",
- __FUNCTION__, __LINE__, key->size, MAXKEYSIZE);
- errno = EINVAL;
- return BT_FAIL;
- }
-
- DPRINTF("==> put key %.*s, size %zu, data size %zu",
- (int)key->size, (char *)key->data, key->size, data->size);
-
- if (txn == NULL) {
- close_txn = 1;
- if ((txn = btree_txn_begin(bt, 0)) == NULL)
- return BT_FAIL;
- }
-
- rc = btree_search_page(bt, txn, key, NULL, SearchKey, 1, &mp);
- if (rc == BT_SUCCESS) {
- leaf = btree_search_node(bt, mp, key, &exact, &ki);
- if (leaf && exact) {
- if (F_ISSET(flags, BT_NOOVERWRITE)) {
- DPRINTF("duplicate key %.*s",
- (int)key->size, (char *)key->data);
- fprintf(stderr, "db=%s flags=%x duplicate key %.*s",
- bt->path, flags,
- (int)key->size, (char *)key->data);
- errno = EEXIST;
- rc = BT_FAIL;
- goto done;
- }
- if (!F_ISSET(flags, BT_ALLOWDUPS))
- btree_del_node(bt, mp, ki);
- }
- if (leaf == NULL) { /* append if not found */
- ki = NUMKEYS(mp);
- DPRINTF("appending key at index %i", ki);
- }
- } else if (errno == ENOENT) {
- /* new file, just write a root leaf page */
- DPRINTF("allocating new root leaf page");
- if ((mp = btree_new_page(bt, P_LEAF)) == NULL) {
- rc = BT_FAIL;
- goto done;
- }
- txn->root = mp->pgno;
- bt->meta.depth++;
- ki = 0;
- }
- else
- goto done;
-
- assert(IS_LEAF(mp));
- DPRINTF("there are %lu keys, should insert new key at index %i",
- NUMKEYS(mp), ki);
-
- /* Copy the key pointer as it is modified by the prefix code. The
- * caller might have malloc'ed the data.
- */
- xkey.data = key->data;
- xkey.size = key->size;
-
- if (SIZELEFT(mp) < bt_leaf_size(bt, mp, key, data)) {
- rc = btree_split(bt, &mp, &ki, &xkey, data, P_INVALID);
- } else {
- /* There is room already in this leaf page. */
- remove_prefix(bt, &xkey, mp->prefix.len);
- rc = btree_add_node(bt, mp, ki, &xkey, data, 0, 0);
- }
-
- if (rc != BT_SUCCESS)
- txn->flags |= BT_TXN_ERROR;
- else
- bt->meta.entries++;
-
-done:
- if (close_txn) {
- if (rc == BT_SUCCESS)
- rc = btree_txn_commit(txn, 0, 0);
- else
- btree_txn_abort(txn);
- }
- mpage_prune(bt);
- return rc;
-}
-
-static pgno_t
-btree_compact_tree(struct btree *bt, pgno_t pgno, struct btree *btc)
-{
- ssize_t rc;
- indx_t i;
- pgno_t *pnext, next;
- struct node *node;
- struct page *p;
- struct mpage *mp;
- /* Get the page and make a copy of it.
- */
- if ((mp = btree_get_mpage(bt, pgno)) == NULL)
- return P_INVALID;
- if ((p = (page *)malloc(bt->head.psize)) == NULL)
- return P_INVALID;
- bcopy(mp->page, p, bt->head.psize);
-
- /* Go through all nodes in the (copied) page and update the
- * page pointers.
- */
- if (F_ISSET(p->flags, P_BRANCH)) {
- for (i = 0; i < NUMKEYSP(p); i++) {
- node = NODEPTRP(p, i);
- node->n_pgno = btree_compact_tree(bt, node->n_pgno, btc);
- if (node->n_pgno == P_INVALID) {
- free(p);
- return P_INVALID;
- }
- }
- } else if (F_ISSET(p->flags, P_LEAF)) {
- for (i = 0; i < NUMKEYSP(p); i++) {
- node = NODEPTRP(p, i);
- if (F_ISSET(node->flags, F_BIGDATA)) {
- bcopy(NODEDATA(node), &next, sizeof(next));
- next = btree_compact_tree(bt, next, btc);
- if (next == P_INVALID) {
- free(p);
- return P_INVALID;
- }
- bcopy(&next, NODEDATA(node), sizeof(next));
- }
- }
- } else if (F_ISSET(p->flags, P_OVERFLOW)) {
- pnext = &p->p_next_pgno;
- if (*pnext > 0) {
- *pnext = btree_compact_tree(bt, *pnext, btc);
- if (*pnext == P_INVALID) {
- free(p);
- return P_INVALID;
- }
- }
- } else
- assert(0);
-
- pgno = p->pgno = btc->txn->next_pgno++;
- p->checksum = calculate_checksum(bt, p);
- DPRINTF("writing page %u with checksum %x", p->pgno, p->checksum);
- bt->stat.writes++;
- rc = write(btc->fd, p, bt->head.psize);
- free(p);
- if (rc != (ssize_t)bt->head.psize)
- return P_INVALID;
- mpage_prune(bt);
- return pgno;
-}
-
-/*
- * Compact the btree stored at bt->path.
- *
- * The tree reachable from the current root is copied into a temporary file
- * created with mkstemp() next to the original (".compact.XXXXXX" suffix).
- * The temporary file is then renamed over bt->path and a tombstone meta page
- * is written through `bt`, so other processes can notice the change and
- * re-open the file.
- *
- * Returns 0 on success and BT_FAIL on failure. errno is set to EINVAL when
- * the btree has no path.
- */
-int
-btree_compact(struct btree *bt)
-{
-    char *compact_path = NULL;
-    const char compact_ext[] = ".compact.XXXXXX";
-    struct btree *btc;
-    struct btree_txn *txn, *txnc = NULL;
-    int fd;
-    pgno_t root;
-
-    assert(bt != NULL);
-
-    DPRINTF("compacting btree %p with path %s", bt, bt->path);
-
-    if (bt->path == NULL) {
-        errno = EINVAL;
-        return BT_FAIL;
-    }
-
-    if ((txn = btree_txn_begin(bt, 0)) == NULL)
-        return BT_FAIL;
-
-    compact_path = (char*)malloc(strlen(bt->path) + strlen(compact_ext) + 1);
-    if (compact_path == NULL) {
-        /* Out of memory: nothing has been created yet, so just drop
-         * the transaction instead of dereferencing a NULL path.
-         */
-        btree_txn_abort(txn);
-        return BT_FAIL;
-    }
-    strcpy(compact_path, bt->path);
-    strcat(compact_path, compact_ext);
-
-    /* mkstemp() replaces the XXXXXX template in place. */
-    fd = mkstemp(compact_path);
-    if (fd == -1) {
-        EPRINTF("failed to get fd for compact file");
-        free(compact_path);
-        btree_txn_abort(txn);
-        return BT_FAIL;
-    }
-
-    /* NOTE(review): if btree_open_fd() fails, the `failed` path calls
-     * btree_close(NULL) and btree_txn_abort(NULL) and never closes fd;
-     * confirm those helpers accept NULL and who owns fd on failure.
-     */
-    if ((btc = btree_open_fd(compact_path, fd, bt->flags)) == NULL)
-        goto failed;
-    DPRINTF("opened btree %p for compacting", btc);
-    bcopy(&bt->meta, &btc->meta, sizeof(bt->meta));
-    btc->meta.revisions = 0;
-
-    if ((txnc = btree_txn_begin(btc, 0)) == NULL)
-        goto failed;
-
-    if (bt->meta.root != P_INVALID) {
-        root = btree_compact_tree(bt, bt->meta.root, btc);
-        if (root == P_INVALID)
-            goto failed;
-        /* Flush the data pages before the meta page that refers to them. */
-        fsync(fd);
-        if (btree_write_meta(btc, root, BT_MARKER, bt->meta.tag) != BT_SUCCESS)
-            goto failed;
-    } else {
-        if (btree_write_meta(btc, btc->meta.root, BT_MARKER, bt->meta.tag) != BT_SUCCESS)
-            goto failed;
-    }
-
-    fsync(fd);
-
-    DPRINTF("renaming %s to %s", compact_path, bt->path);
-    if (rename(compact_path, bt->path) != 0)
-        goto failed;
-
-    /* Write a "tombstone" meta page so other processes can pick up
-     * the change and re-open the file.
-     */
-    if (btree_write_meta(bt, P_INVALID, BT_TOMBSTONE, 0) != BT_SUCCESS)
-        goto failed;
-
-    btree_txn_abort(txn);
-    btree_txn_abort(txnc);
-    free(compact_path);
-    btree_close(btc);
-    mpage_prune(bt);
-    return 0;
-
-failed:
-    btree_txn_abort(txn);
-    btree_txn_abort(txnc);
-    unlink(compact_path);
-    free(compact_path);
-    btree_close(btc);
-    mpage_prune(bt);
-    return BT_FAIL;
-}
-
-/* Reverts the last change. Truncates the file at the last root page.
- *
- * The meta data is re-read first; the file is then cut at the byte offset
- * of the root page it names (page size * root page number).
- *
- * Returns -1 if the meta page cannot be read, otherwise the result of
- * ftruncate().
- */
-int
-btree_revert(struct btree *bt)
-{
-    if (btree_read_meta(bt, NULL) != 0)
-        return -1;
-
-    DPRINTF("truncating file at page %u", bt->meta.root);
-    fprintf(stderr, "truncating file at page %u\n", bt->meta.root);
-    /* Widen to off_t before multiplying so the byte offset cannot wrap
-     * in the narrower type of the page size / page number.
-     */
-    return ftruncate(bt->fd, (off_t)bt->head.psize * bt->meta.root);
-}
-
-/* Rollback to the previous meta page. Truncates the file at that meta page.
- *
- * The current meta data is re-read, the page named by its prev_meta field is
- * fetched and verified to be a meta page that records its own page number,
- * and the file is truncated at that page's byte offset. On success bt->size
- * and bt->meta are updated from the previous meta page.
- *
- * Returns BT_SUCCESS on success, -2 if the previous page is not a meta page,
- * -1 on other validation or read failures, or the ftruncate() result if
- * truncation fails.
- */
-int
-btree_rollback(struct btree *bt)
-{
-    struct bt_meta *meta;
-    struct mpage *mp;
-    pgno_t prev_meta_pgno;
-    off_t new_size;
-    int ret;
-
-    if (btree_read_meta(bt, NULL) != 0)
-        return -1;
-
-    prev_meta_pgno = bt->meta.prev_meta;
-    DPRINTF("prev_meta_pgno=%d\n", prev_meta_pgno);
-    if ((mp = btree_get_mpage(bt, prev_meta_pgno)) == NULL) {
-        return -1;
-    }
-    if (btree_is_meta_page(bt, mp->page)) {
-        meta = METADATA(mp->page);
-    } else {
-        fprintf(stderr, "mp->page flags %x\n", mp->page->flags);
-        return -2;
-    }
-
-    if (prev_meta_pgno != meta->pgno) {
-        fprintf(stderr, "read wrong meta page! %d != %d\n", prev_meta_pgno, meta->pgno);
-        return -1;
-    }
-
-    /* Compute the byte offset once, in off_t, so it cannot wrap in the
-     * narrower type of the page size / page number.
-     */
-    new_size = (off_t)bt->head.psize * prev_meta_pgno;
-
-    DPRINTF("truncating file at page %u", prev_meta_pgno);
-    ret = ftruncate(bt->fd, new_size);
-    if (ret != 0) {
-        fprintf(stderr, "ftruncate failed on size %lld\n", (long long)new_size);
-        return ret;
-    }
-    bt->size = new_size;
-    memcpy(&bt->meta, meta, sizeof(struct bt_meta));
-    return BT_SUCCESS;
-}
-
-/*
- * Set the maximum page cache size of `bt`.
- *
- * A cache_size of 0 leaves the limit unchanged. When the requested size is
- * below the current limit the cache is pruned.
- *
- * The new limit is stored before pruning.
- * NOTE(review): this assumes mpage_prune() trims the cache down to
- * bt->stat.max_cache, in which case pruning first (as before) would have
- * trimmed against the old, larger limit - confirm against mpage_prune().
- */
-void
-btree_set_cache_size(struct btree *bt, unsigned int cache_size)
-{
-    int shrinking;
-
-    assert(bt);
-    shrinking = cache_size < bt->stat.max_cache;
-    if (cache_size)
-        bt->stat.max_cache = cache_size;
-    if (shrinking)
-        mpage_prune(bt);
-}
-
-/* Return the flags of `bt` with the BT_FIXPADDING bit masked out. */
-unsigned int
-btree_get_flags(struct btree *bt)
-{
-    unsigned int visible = bt->flags;
-
-    visible &= ~BT_FIXPADDING;
-    return visible;
-}
-
-/* Return the path stored in `bt`; the string is not copied. */
-const char *
-btree_get_path(struct btree *bt)
-{
-    const char *path = bt->path;
-
-    return path;
-}
-
-/*
- * Refresh the statistics record embedded in `bt` from its meta and header
- * data and return a pointer to it. Returns NULL when `bt` is NULL.
- */
-const struct btree_stat *
-btree_stat(struct btree *bt)
-{
-    if (!bt)
-        return NULL;
-
-    /* Values taken from the file header. */
-    bt->stat.psize = bt->head.psize;
-    bt->stat.ksize = bt->head.ksize;
-
-    /* Page counters taken from the meta data. */
-    bt->stat.branch_pages = bt->meta.branch_pages;
-    bt->stat.leaf_pages = bt->meta.leaf_pages;
-    bt->stat.overflow_pages = bt->meta.overflow_pages;
-
-    /* Remaining meta data fields. */
-    bt->stat.depth = bt->meta.depth;
-    bt->stat.entries = bt->meta.entries;
-    bt->stat.revisions = bt->meta.revisions;
-    bt->stat.created_at = bt->meta.created_at;
-    bt->stat.tag = bt->meta.tag;
-
-    return &bt->stat;
-}
-
-/*
- * Render `srclen` bytes from `src` into `dst` as a NUL-terminated string.
- *
- * With tobytes == 0 each byte becomes two upper-case hex digits; otherwise
- * each byte is copied verbatim. Output stops as soon as the next byte would
- * not fit in `dstlen` (one character is always kept for the terminator).
- * `dstlen` must be greater than 1. Returns `dst`.
- */
-const char *
-tohexstr(const char *src, size_t srclen, char* dst, size_t dstlen, int tobytes = 0)
-{
-    static const char digits[] = "0123456789ABCDEF";
-
-    assert(dstlen > 1);
-    int room = dstlen - 1; // keep one slot for the terminating NUL
-    const int step = tobytes ? 1 : 2; // characters emitted per input byte
-    char *out = dst;
-
-    *out = 0;
-    for (size_t n = 0; n < srclen && room >= step; ++n) {
-        if (tobytes) {
-            *out++ = src[n];
-        } else {
-            const unsigned char byte = (unsigned char)src[n];
-            *out++ = digits[byte >> 4];
-            *out++ = digits[byte & 0x0F];
-        }
-        *out = 0; // keep the string terminated after every byte
-        room -= step;
-    }
-    return dst;
-}
-
-/*
- * Format the data of `node` as a hex string in `dst` (at most `dstlen`
- * bytes including the terminator; `dstlen` must be greater than 1).
- *
- * Data stored in the node is formatted directly. For F_BIGDATA nodes the
- * data is first fetched with btree_read_data(); if that fails an error
- * message is placed in `dst` instead. Returns `dst`.
- */
-const char *
-get_node_data(struct btree *bt, struct node *node, char *dst, size_t dstlen)
-{
-    assert(dstlen > 1);
-
-    /* Common case: the data lives in the node itself. */
-    if (!F_ISSET(node->flags, F_BIGDATA))
-        return tohexstr((const char*)NODEDATA(node), NODEDSZ(node), dst, dstlen);
-
-    btval overflow;
-    if (btree_read_data(bt, 0, node, &overflow) != BT_FAIL) {
-        tohexstr((const char *)overflow.data, overflow.size, dst, dstlen);
-        btval_reset(&overflow);
-        return dst;
-    }
-
-    strncpy(dst, "ERROR: could not read overflow page", dstlen);
-    dst[dstlen - 1] = 0; /* strncpy does not terminate on truncation */
-    return dst;
-}
-
-/*
- * Print the page `pgno` of `bt`, and recursively everything below it, to
- * stderr. `depth` selects the indentation: depth + 1 tab characters.
- *
- * Branch pages list their separators and recurse into each child, leaf
- * pages list key/data pairs and follow big-data nodes to their overflow
- * pages, and overflow pages follow their next-page link at the same depth.
- * Returns silently if the page cannot be fetched.
- */
-void
-btree_dump_tree(struct btree *bt, pgno_t pgno, int depth)
-{
-    indx_t i;
-    pgno_t *pnext, next;
-    struct node *node;
-    struct page *p;
-    struct mpage *mp;
-    char indent[32] = {0};
-    const int hexlen = MAXKEYSIZE;
-    char khexstr[hexlen];
-    char dhexstr[hexlen];
-    int ntabs;
-
-    /* Build the indentation, clamped so that a very deep (or corrupt,
-     * cyclic) tree cannot write past the end of `indent`.
-     */
-    ntabs = depth + 1;
-    if (ntabs < 0)
-        ntabs = 0;
-    if (ntabs > (int)sizeof(indent) - 1)
-        ntabs = (int)sizeof(indent) - 1;
-    memset(indent, '\t', (size_t)ntabs);
-
-    /* Get the page.
-     */
-    if ((mp = btree_get_mpage(bt, pgno)) == NULL)
-        return;
-    p = mp->page;
-    if (F_ISSET(p->flags, P_BRANCH)) {
-        fprintf(stderr, "%s", indent);
-        fprintf(stderr, "Branch page %d [bytes-free:%d, num-keys:%zu]\n",
-                pgno,
-                SIZELEFT(mp),
-                NUMKEYSP(p));
-        for (i = 0; i < NUMKEYSP(p); i++) {
-            node = NODEPTRP(p, i);
-            fprintf(stderr, "%s", indent);
-            fprintf(stderr, "-> Node %d points to page %d with seperator [%s]\n",
-                    i,
-                    NODEPGNO(node),
-                    tohexstr((const char*)node->data, node->ksize, khexstr, hexlen));
-            btree_dump_tree(bt, node->n_pgno, depth + 1);
-        }
-    } else if (F_ISSET(p->flags, P_LEAF)) {
-        fprintf(stderr, "%s", indent);
-        fprintf(stderr, "Leaf page %d [bytes-free:%d, num-keys:%zu] with prefix [%.*s]\n",
-                pgno,
-                SIZELEFT(mp),
-                NUMKEYSP(p),
-                (int)mp->prefix.len, mp->prefix.str);
-        for (i = 0; i < NUMKEYSP(p); i++) {
-            node = NODEPTRP(p, i);
-            fprintf(stderr, "%s", indent);
-            fprintf(stderr, "-> Node %d [key:%s, data:%s]\n",
-                    i,
-                    tohexstr((const char*)node->data, node->ksize, khexstr, hexlen),
-                    get_node_data(bt, node, dhexstr, hexlen));
-            if (F_ISSET(node->flags, F_BIGDATA)) {
-                /* The node data holds the first overflow page number. */
-                bcopy(NODEDATA(node), &next, sizeof(next));
-                fprintf(stderr, "%s", indent);
-                fprintf (stderr, "[!] Data size %d is on overflow page %d\n", node->n_dsize, next);
-                btree_dump_tree(bt, next, depth + 1);
-            }
-        }
-    } else if (F_ISSET(p->flags, P_OVERFLOW)) {
-        fprintf(stderr, "%s", indent);
-        pnext = &p->p_next_pgno;
-        if (*pnext > 0)
-            fprintf (stderr, "Overflow page %d -> %d\n", pgno, *pnext);
-        else
-            fprintf (stderr, "Overflow page %d -> NULL\n", pgno);
-        /* Overflow chains are printed at the same indentation level. */
-        if (*pnext > 0)
-            btree_dump_tree(bt, *pnext, depth);
-    } else
-        assert(0);
-}
-
-/*
- * Print a summary line for `bt` followed by a dump of the whole tree to
- * stderr, or a short notice when the tree has no valid root page.
- */
-void
-btree_dump(struct btree *bt)
-{
-    assert(bt != NULL);
-    fprintf(stderr, "btree_dump %s\n", bt->path);
-
-    if (bt->meta.root == P_INVALID) {
-        fprintf(stderr, "Root page invalid.\n");
-    } else {
-        fprintf(stderr, "Root page %d [depth:%d, entries:%" PRIu64 ", leaves:%d, branches:%d, bt-size:%ld, psize:%d]\n",
-                bt->meta.root,
-                bt->meta.depth,
-                bt->meta.entries,
-                bt->meta.leaf_pages,
-                bt->meta.branch_pages,
-                (long)bt->size,
-                bt->head.psize);
-        /* The root is dumped at depth 0. */
-        btree_dump_tree(bt, bt->meta.root, 0);
-    }
-
-    fflush(stderr);
-}
-
-/*
- * Print an in-memory page to stderr: one summary line followed by one line
- * per node with its key and data as hex strings.
- *
- * The METADATA area of the page is first interpreted as a bt_head and its
- * magic, version and key size are checked; on a mismatch an error is
- * reported, errno is set to EINVAL and nothing is printed.
- */
-void
-btree_dump_page_from_memory(struct page *p)
-{
-    const int hexlen = 512;
-    char keybuf[hexlen];
-    char databuf[hexlen];
-    struct bt_head *hdr = (bt_head*)METADATA(p);
-    const char *kind;
-
-    /* Anything not flagged as a branch is labelled "Leaf". */
-    if (F_ISSET(p->flags, P_BRANCH))
-        kind = "Branch";
-    else
-        kind = "Leaf";
-
-    if (hdr->magic != BT_MAGIC) {
-        EPRINTF("header has invalid magic");
-        errno = EINVAL;
-        return;
-    }
-    if (hdr->version != BT_VERSION) {
-        EPRINTF("database is version %u, expected version %u",
-                hdr->version, BT_VERSION);
-        errno = EINVAL;
-        return;
-    }
-    if (hdr->ksize != MAXKEYSIZE) {
-        EPRINTF("database uses max key size %u, expected max key size %u",
-                hdr->ksize, MAXKEYSIZE);
-        errno = EINVAL;
-        return;
-    }
-
-    fprintf(stderr, "* %s page %d with flags %0X offsets [%d -> %d]\n", kind, p->pgno,
-            p->flags, p->lower, p->upper);
-
-    for (indx_t idx = 0; idx < NUMKEYSP(p); idx++) {
-        struct node *entry = NODEPTRP(p, idx);
-
-        fprintf(stderr, "-> Node %d [key:%s, data:%s]\n",
-                idx,
-                tohexstr((const char*)entry->data, entry->ksize, keybuf, hexlen),
-                tohexstr((const char*)NODEDATA(entry), NODEDSZ(entry), databuf, hexlen));
-    }
-}
-
-/*
- * Open `filename` read-only, validate its header page and print the raw
- * fields of page number `pagen` to stderr. Meta pages additionally get
- * their meta fields printed.
- *
- * Returns 1 on success and 0 on any failure (open or read error, invalid
- * header, allocation failure, short read). The file descriptor and the page
- * buffer are released on every path; errno as set by the failing step is
- * preserved across the cleanup.
- */
-int
-btree_dump_page_from_file(const char *filename, unsigned int pagen)
-{
-    int fd;
-    int ret = 0;
-    int saved_errno;
-    ssize_t rc;
-    char header[PAGESIZE];
-    struct page *hp;        /* header page, lives in `header` */
-    struct page *p = NULL;  /* requested page, heap allocated */
-    struct bt_head *h;
-
-    /* open() reports failure with -1; 0 is a valid descriptor. */
-    fd = open(filename, O_RDONLY);
-    if (fd == -1)
-        return 0;
-
-    // read header
-    if ((rc = pread(fd, header, PAGESIZE, 0)) == 0) {
-        errno = ENOENT;
-        goto done;
-    } else if (rc != PAGESIZE) {
-        if (rc > 0)
-            errno = EINVAL;
-        EPRINTF("read: %s", strerror(errno));
-        goto done;
-    }
-
-    hp = (struct page *)header;
-
-    if (!F_ISSET(hp->flags, P_HEAD)) {
-        EPRINTF("page %d not a header page", hp->pgno);
-        errno = EINVAL;
-        goto done;
-    }
-
-    h = (bt_head *)METADATA(hp);
-    if (h->magic != BT_MAGIC) {
-        EPRINTF("header has invalid magic");
-        errno = EINVAL;
-        goto done;
-    }
-
-    if (h->version != BT_VERSION) {
-        EPRINTF("database is version %u, expected version %u",
-                h->version, BT_VERSION);
-        errno = EINVAL;
-        goto done;
-    }
-
-    if (h->ksize != MAXKEYSIZE) {
-        EPRINTF("database uses max key size %u, expected max key size %u",
-                h->ksize, MAXKEYSIZE);
-        errno = EINVAL;
-        goto done;
-    }
-
-    if ((p = (page *)calloc(1, h->psize)) == NULL)
-        goto done;
-    /* Widen to off_t before multiplying so large page numbers do not
-     * wrap the byte offset.
-     */
-    rc = pread(fd, p, h->psize, (off_t)pagen * h->psize);
-    if (rc < 0) {
-        fprintf(stderr, "error reading page %d\n", pagen);
-        goto done;
-    } else if (rc < (ssize_t)h->psize) {
-        fprintf(stderr, "read incomplete page %d (%d != %d)\n", pagen, h->psize, (int32_t)rc);
-        goto done;
-    }
-
-    fprintf(stderr, "page %d:\n", pagen);
-    fprintf(stderr, "\tpgno:%d\n", p->pgno);
-    fprintf(stderr, "\tflags:%d: ", p->flags);
-    if (p->flags & P_BRANCH)
-        fprintf(stderr, "BRANCH ");
-    if (p->flags & P_LEAF)
-        fprintf(stderr, "LEAF ");
-    if (p->flags & P_OVERFLOW)
-        fprintf(stderr, "OVERFLOW ");
-    if (p->flags & P_META)
-        fprintf(stderr, "META ");
-    if (p->flags & P_HEAD)
-        fprintf(stderr, "HEAD");
-    fprintf(stderr, "\n");
-    fprintf(stderr, "\tb.fb_lower:%d\n", p->b.fb.fb_lower);
-    fprintf(stderr, "\tb.fb_upper:%d\n", p->b.fb.fb_upper);
-    fprintf(stderr, "\tb.pb_next_pgno:%d\n", p->b.pb_next_pgno);
-
-    if (p->flags & P_META) {
-        bt_meta *m = METADATA(p);
-        /* Copy into a real time_t: the stored field need not have the
-         * size of time_t, so its address must not be handed to ctime().
-         */
-        time_t created = (time_t)m->created_at;
-        char *st = ctime(&created);
-
-        fprintf(stderr, "\n");
-        fprintf(stderr, "\tmeta->pgno: %d\n", m->pgno);
-        fprintf(stderr, "\tmeta->flags: %d\n", m->flags);
-        fprintf(stderr, "\tmeta->root: %d\n", m->root);
-        fprintf(stderr, "\tmeta->prev_meta: %d\n", m->prev_meta);
-        fprintf(stderr, "\tmeta->created_at: %s", (st ? st : "(null)\n"));
-        fprintf(stderr, "\tmeta->tag: %d\n", m->tag);
-    }
-    ret = 1;
-
-done:
-    /* Single exit: release everything without clobbering errno. */
-    saved_errno = errno;
-    free(p);
-    close(fd);
-    errno = saved_errno;
-    return ret;
-}