summary refs log tree commit diff
path: root/db2/btree
diff options
context:
space:
mode:
Diffstat (limited to 'db2/btree')
-rw-r--r-- db2/btree/bt_close.c 184
-rw-r--r-- db2/btree/bt_compare.c 205
-rw-r--r-- db2/btree/bt_conv.c 83
-rw-r--r-- db2/btree/bt_cursor.c 1577
-rw-r--r-- db2/btree/bt_delete.c 607
-rw-r--r-- db2/btree/bt_open.c 355
-rw-r--r-- db2/btree/bt_page.c 312
-rw-r--r-- db2/btree/bt_put.c 919
-rw-r--r-- db2/btree/bt_rec.c 767
-rw-r--r-- db2/btree/bt_recno.c 1195
-rw-r--r-- db2/btree/bt_rsearch.c 347
-rw-r--r-- db2/btree/bt_search.c 335
-rw-r--r-- db2/btree/bt_split.c 952
-rw-r--r-- db2/btree/bt_stat.c 257
-rw-r--r-- db2/btree/btree.src 137
-rw-r--r-- db2/btree/btree_auto.c 1279
16 files changed, 9511 insertions, 0 deletions
diff --git a/db2/btree/bt_close.c b/db2/btree/bt_close.c
new file mode 100644
index 0000000000..4e80634e86
--- /dev/null
+++ b/db2/btree/bt_close.c
@@ -0,0 +1,184 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)bt_close.c 10.22 (Sleepycat) 8/23/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#include <sys/mman.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+
+static void __bam_upstat __P((DB *dbp));
+
+/*
+ * __bam_close --
+ *	Close a btree.
+ *
+ *	Calls __bam_upstat to update the tree statistics, then frees the
+ *	two scratch buffers (bt_rkey, bt_rdata), the page stack if bt_sp no
+ *	longer points at the built-in bt_stack, and the BTREE structure
+ *	itself.  dbp->internal is cleared.  Always returns 0.
+ *
+ * PUBLIC: int __bam_close __P((DB *));
+ */
+int
+__bam_close(dbp)
+	DB *dbp;
+{
+	BTREE *t;
+
+	DEBUG_LWRITE(dbp, NULL, "bam_close", NULL, NULL, 0);
+
+	t = dbp->internal;
+
+	/* Update tree statistics. */
+	__bam_upstat(dbp);
+
+	/*
+	 * Free any allocated memory.
+	 *
+	 * The length of each scratch buffer is kept in its ulen field: that
+	 * is the field handed to __db_ret/__db_goff together with the data
+	 * pointer everywhere the buffers are used.  Pass ulen for both, so
+	 * that whatever FREE does with the length sees the buffer's real
+	 * size rather than the size of the last item stored in it.
+	 */
+	if (t->bt_rkey.data)
+		FREE(t->bt_rkey.data, t->bt_rkey.ulen);
+	if (t->bt_rdata.data)
+		FREE(t->bt_rdata.data, t->bt_rdata.ulen);
+
+	/* Only release the stack when it is not the one inside the BTREE. */
+	if (t->bt_sp != t->bt_stack)
+		FREE(t->bt_sp, (t->bt_esp - t->bt_sp) * sizeof(EPG));
+
+	FREE(t, sizeof(BTREE));
+	dbp->internal = NULL;
+
+	return (0);
+}
+
+/*
+ * __bam_sync --
+ *	Write the btree's dirty cached pages to the backing file.
+ *
+ *	Returns the flag-check error if flags are invalid, 0 for in-memory
+ *	or read-only trees, and otherwise the result of memp_fsync with
+ *	DB_INCOMPLETE mapped to 0.
+ *
+ * PUBLIC: int __bam_sync __P((DB *, int));
+ */
+int
+__bam_sync(argdbp, flags)
+	DB *argdbp;
+	int flags;
+{
+	DB *dbp;
+	int ret;
+
+	DEBUG_LWRITE(argdbp, NULL, "bam_sync", NULL, NULL, flags);
+
+	/* Reject flag values the sync call does not accept. */
+	ret = __db_syncchk(argdbp, flags);
+	if (ret != 0)
+		return (ret);
+
+	/* A tree that could never be modified has nothing to flush. */
+	if (F_ISSET(argdbp, DB_AM_INMEM | DB_AM_RDONLY))
+		return (0);
+
+	GETHANDLE(argdbp, NULL, &dbp, ret);
+
+	/* Flush the cache; an incomplete flush is reported as success. */
+	ret = memp_fsync(dbp->mpf);
+	if (ret == DB_INCOMPLETE)
+		ret = 0;
+
+	PUTHANDLE(dbp);
+	return (ret);
+}
+
+/*
+ * __bam_upstat --
+ *	Update tree statistics.
+ *
+ *	Adds the statistics accumulated in this handle (t->lstat) to the
+ *	statistics stored on the metadata page.  This is best effort: the
+ *	function returns nothing, and a failure to lock, fetch or log the
+ *	page simply abandons the update.
+ */
+static void
+__bam_upstat(dbp)
+	DB *dbp;
+{
+	BTREE *t;
+	BTMETA *meta;
+	DB_LOCK mlock;
+	db_pgno_t pgno;
+	int flags;
+
+	/*
+	 * We use a no-op log call to log the update of the statistics onto the
+	 * metadata page.  The dbp->close() call isn't transaction protected to
+	 * start with, and I'm not sure what undoing a statistics update means,
+	 * anyway.
+	 */
+	if (F_ISSET(dbp, DB_AM_INMEM | DB_AM_RDONLY))
+		return;
+
+	/* pgno has to name the metadata page before it is locked. */
+	flags = 0;
+	pgno = PGNO_METADATA;
+
+	/* Lock the page. */
+	if (__bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &mlock) != 0)
+		return;
+
+	/*
+	 * Get the page.  If that fails there is no page to hand back to the
+	 * pool (meta may never have been assigned), so only drop the lock.
+	 */
+	if (__bam_pget(dbp, (PAGE **)&meta, &pgno, 0) != 0)
+		goto unlock;
+
+	/* Log the change; if logging fails, return the page unmodified. */
+	if (DB_LOGGING(dbp) &&
+	    __db_noop_log(dbp->dbenv->lg_info, dbp->txn, &LSN(meta), 0) != 0)
+		goto put;
+
+	/* Update the statistics and mark the page as needing a write. */
+	t = dbp->internal;
+	__bam_add_mstat(&t->lstat, &meta->stat);
+	flags = DB_MPOOL_DIRTY;
+
+put:	(void)memp_fput(dbp->mpf, (PAGE *)meta, flags);
+unlock:	(void)__BT_LPUT(dbp, mlock);
+}
diff --git a/db2/btree/bt_compare.c b/db2/btree/bt_compare.c
new file mode 100644
index 0000000000..e802fd24ab
--- /dev/null
+++ b/db2/btree/bt_compare.c
@@ -0,0 +1,205 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)bt_compare.c 10.3 (Sleepycat) 7/19/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+
+/*
+ * __bam_cmp --
+ *	Compare a key to a given record.
+ *
+ *	k1 is the caller's key; e identifies the page (e->page) and index
+ *	(e->indx) of the on-page key it is compared against.  On-page keys
+ *	are compared in place.  Overflow keys are compared by __db_moff when
+ *	the default comparison routine is in use, and otherwise fetched with
+ *	__db_goff into the tree's bt_rdata buffer and passed to the user's
+ *	routine.  The process is aborted if that fetch fails.
+ *
+ * PUBLIC: int __bam_cmp __P((DB *, const DBT *, EPG *));
+ */
+int
+__bam_cmp(dbp, k1, e)
+	DB *dbp;
+	const DBT *k1;
+	EPG *e;
+{
+	BINTERNAL *bi;
+	BKEYDATA *bk;
+	BOVERFLOW *bo;
+	BTREE *t;
+	DBT k2;
+	PAGE *h;
+
+	t = dbp->internal;
+
+	/*
+	 * Returns:
+	 *	< 0 if k1 is < record
+	 *	= 0 if k1 is = record
+	 *	> 0 if k1 is > record
+	 *
+	 * The left-most key on internal pages, at any level of the tree, is
+	 * guaranteed, by the following code, to be less than any user key.
+	 * This saves us from having to update the leftmost key on an internal
+	 * page when the user inserts a new key in the tree smaller than
+	 * anything we've yet seen.
+	 */
+	h = e->page;
+	if (e->indx == 0 &&
+	    h->prev_pgno == PGNO_INVALID && TYPE(h) != P_LBTREE)
+		return (1);
+
+	/*
+	 * Clear k2 on every path.  Previously it was only cleared for
+	 * on-page keys, so an overflow key compared with a user routine
+	 * handed __db_goff a DBT whose fields (flags included) were
+	 * whatever happened to be on the stack.
+	 */
+	memset(&k2, 0, sizeof(k2));
+
+	bo = NULL;
+	if (TYPE(h) == P_LBTREE) {
+		bk = GET_BKEYDATA(h, e->indx);
+		if (bk->type == B_OVERFLOW)
+			bo = (BOVERFLOW *)bk;
+		else {
+			k2.data = bk->data;
+			k2.size = bk->len;
+		}
+	} else {
+		bi = GET_BINTERNAL(h, e->indx);
+		if (bi->type == B_OVERFLOW)
+			bo = (BOVERFLOW *)(bi->data);
+		else {
+			k2.data = bi->data;
+			k2.size = bi->len;
+		}
+	}
+
+	/*
+	 * XXX
+	 * We ignore system errors; the only recoverable one is ENOMEM, and we
+	 * don't want to require that comparison routines handle random errors.
+	 * We don't want to return a valid comparison, either, so we stop.
+	 */
+	if (bo != NULL) {
+		/*
+		 * If using the default comparison routine, use __db_moff(),
+		 * which compares the overflow key a page at a time.
+		 */
+		if (t->bt_compare == __bam_defcmp)
+			return (__db_moff(dbp, k1, bo->pgno));
+
+		/*
+		 * Otherwise, we need a contiguous record so we can hand it
+		 * to the user's routine.
+		 */
+		if (__db_goff(dbp, &k2, bo->tlen,
+		    bo->pgno, &t->bt_rdata.data, &t->bt_rdata.ulen) != 0)
+			abort();
+	}
+	return ((*t->bt_compare)(k1, &k2));
+}
+
+/*
+ * __bam_defcmp --
+ *	Default comparison routine.
+ *
+ *	Compares the two items byte by byte over their common length; if
+ *	that prefix is identical, the shorter item sorts first.  Only the
+ *	sign of the result is meaningful.
+ *
+ * PUBLIC: int __bam_defcmp __P((const DBT *, const DBT *));
+ */
+int
+__bam_defcmp(a, b)
+	const DBT *a, *b;
+{
+	size_t len;
+	u_int8_t *p1, *p2;
+
+	/*
+	 * Returns:
+	 *	< 0 if a is < b
+	 *	= 0 if a is = b
+	 *	> 0 if a is > b
+	 *
+	 * The difference of two bytes always fits in an int, so it is
+	 * returned directly.  The sizes are compared rather than subtracted:
+	 * the difference of two sizes need not fit in the int return type,
+	 * and a truncated difference can come out with the wrong sign.
+	 */
+	len = a->size > b->size ? b->size : a->size;
+	for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2)
+		if (*p1 != *p2)
+			return ((int)*p1 - (int)*p2);
+	if (a->size == b->size)
+		return (0);
+	return (a->size < b->size ? -1 : 1);
+}
+
+/*
+ * __bam_defpfx --
+ *	Default prefix routine.
+ *
+ *	Returns the number of leading bytes needed to tell a and b apart:
+ *	one more than the offset of the first differing byte, or, when one
+ *	item is a prefix of the other, a->size + 1 if a is shorter and
+ *	a->size otherwise.
+ *
+ * PUBLIC: size_t __bam_defpfx __P((const DBT *, const DBT *));
+ */
+size_t
+__bam_defpfx(a, b)
+	const DBT *a, *b;
+{
+	size_t i, shorter;
+	u_int8_t *lhs, *rhs;
+
+	lhs = a->data;
+	rhs = b->data;
+	shorter = a->size > b->size ? b->size : a->size;
+
+	/* Scan the common length for the first byte that differs. */
+	for (i = 0; i < shorter; ++i)
+		if (lhs[i] != rhs[i])
+			return (i + 1);
+
+	/*
+	 * No difference in the common length.  We know that a->size must be
+	 * <= b->size, or they wouldn't be in this order.
+	 */
+	if (a->size < b->size)
+		return (a->size + 1);
+	return (a->size);
+}
diff --git a/db2/btree/bt_conv.c b/db2/btree/bt_conv.c
new file mode 100644
index 0000000000..537e2f98ec
--- /dev/null
+++ b/db2/btree/bt_conv.c
@@ -0,0 +1,83 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)bt_conv.c 10.3 (Sleepycat) 8/9/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_swap.h"
+#include "btree.h"
+
+/*
+ * __bam_pgin, __bam_pgout --
+ *	Convert host-specific page layout to/from the host-independent
+ *	format stored on disk.
+ *
+ *	cookie->data is a DB_PGINFO; when its needswap member is zero the
+ *	page is left untouched and 0 is returned.
+ *
+ * PUBLIC: int __bam_pgin __P((db_pgno_t, void *, DBT *));
+ * PUBLIC: int __bam_pgout __P((db_pgno_t, void *, DBT *));
+ */
+int
+__bam_pgin(pg, pp, cookie)
+	db_pgno_t pg;
+	void *pp;
+	DBT *cookie;
+{
+	DB_PGINFO *info;
+
+	info = (DB_PGINFO *)cookie->data;
+
+	/* No swapping requested: nothing to do. */
+	if (!info->needswap)
+		return (0);
+
+	/* The metadata page is handled by the btree-specific routine. */
+	if (pg == PGNO_METADATA)
+		return (__bam_mswap(pp));
+	return (__db_pgin(pg, pp));
+}
+
+int
+__bam_pgout(pg, pp, cookie)
+	db_pgno_t pg;
+	void *pp;
+	DBT *cookie;
+{
+	DB_PGINFO *info;
+
+	info = (DB_PGINFO *)cookie->data;
+
+	/* No swapping requested: nothing to do. */
+	if (!info->needswap)
+		return (0);
+
+	/* The metadata page is handled by the btree-specific routine. */
+	if (pg == PGNO_METADATA)
+		return (__bam_mswap(pp));
+	return (__db_pgout(pg, pp));
+}
+
+/*
+ * __bam_mswap --
+ *	Swap the bytes on the btree metadata page.
+ *
+ *	Applies SWAP32 ten times to a byte pointer that starts at the
+ *	beginning of the page.  Always returns 0.
+ *
+ * PUBLIC: int __bam_mswap __P((PAGE *));
+ */
+int
+__bam_mswap(pg)
+	PAGE *pg;
+{
+	u_int8_t *p;
+	int remaining;
+
+	/*
+	 * One SWAP32 per 32-bit field, in page order:
+	 *	lsn.file, lsn.offset, pgno, magic, version,
+	 *	pagesize, maxkey, minkey, free, flags
+	 * (SWAP32 is presumed to advance p past the field it swaps.)
+	 */
+	p = (u_int8_t *)pg;
+	for (remaining = 10; remaining > 0; --remaining) {
+		SWAP32(p);
+	}
+	return (0);
+}
diff --git a/db2/btree/bt_cursor.c b/db2/btree/bt_cursor.c
new file mode 100644
index 0000000000..592ec9b3ff
--- /dev/null
+++ b/db2/btree/bt_cursor.c
@@ -0,0 +1,1577 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)bt_cursor.c 10.26 (Sleepycat) 8/24/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+
+static int __bam_c_close __P((DBC *));
+static int __bam_c_del __P((DBC *, int));
+static int __bam_c_first __P((DB *, CURSOR *));
+static int __bam_c_get __P((DBC *, DBT *, DBT *, int));
+static int __bam_c_last __P((DB *, CURSOR *));
+static int __bam_c_next __P((DB *, CURSOR *, int));
+static int __bam_c_physdel __P((DB *, CURSOR *, PAGE *));
+static int __bam_c_prev __P((DB *, CURSOR *));
+static int __bam_c_put __P((DBC *, DBT *, DBT *, int));
+static int __bam_c_rget __P((DB *, CURSOR *, DBT *, DBT *, int));
+static int __bam_c_search __P((DB *, CURSOR *, const DBT *, u_int, int, int *));
+
+/*
+ * Discard the current page/lock held by a cursor.
+ *
+ * Returns (cp)->page to the memory pool with no flags, releases (cp)->lock
+ * through __BT_TLPUT, and resets the two fields to NULL and LOCK_INVALID.
+ * Both return values are ignored.
+ *
+ * The expansion is a bare brace block, not do { } while (0): invoke it
+ * only as a full statement inside braces, never as the unbraced body of
+ * an if that has an else.
+ */
+#undef DISCARD
+#define DISCARD(dbp, cp) { \
+ (void)memp_fput(dbp->mpf, (cp)->page, 0); \
+ (cp)->page = NULL; \
+ (void)__BT_TLPUT((dbp), (cp)->lock); \
+ (cp)->lock = LOCK_INVALID; \
+}
+
+/*
+ * __bam_cursor --
+ *	Interface to the cursor functions.
+ *
+ *	Allocates a DBC and its btree-private CURSOR, wires up the cursor
+ *	methods, links the DBC onto dbp->curs_queue and stores it in *dbcp.
+ *	Returns 0, or ENOMEM if either allocation fails.
+ *
+ * PUBLIC: int __bam_cursor __P((DB *, DB_TXN *, DBC **));
+ */
+int
+__bam_cursor(dbp, txn, dbcp)
+	DB *dbp;
+	DB_TXN *txn;
+	DBC **dbcp;
+{
+	CURSOR *cp;
+	DBC *dbc;
+
+	DEBUG_LWRITE(dbp, txn, "bam_cursor", NULL, NULL, 0);
+
+	/* Both structures start out zero-filled. */
+	dbc = (DBC *)calloc(1, sizeof(DBC));
+	if (dbc == NULL)
+		return (ENOMEM);
+	cp = (CURSOR *)calloc(1, sizeof(CURSOR));
+	if (cp == NULL) {
+		free(dbc);
+		return (ENOMEM);
+	}
+
+	/* The public handle: owner, transaction, and method table. */
+	dbc->dbp = dbp;
+	dbc->txn = txn;
+	dbc->internal = cp;
+	dbc->c_close = __bam_c_close;
+	dbc->c_del = __bam_c_del;
+	dbc->c_get = __bam_c_get;
+	dbc->c_put = __bam_c_put;
+
+	/* The private state: not positioned, no lock held. */
+	cp->dbc = dbc;
+	cp->pgno = PGNO_INVALID;
+	cp->dpgno = PGNO_INVALID;
+	cp->lock = LOCK_INVALID;
+
+	/* All cursor structures hang off the main DB structure. */
+	DB_THREAD_LOCK(dbp);
+	TAILQ_INSERT_HEAD(&dbp->curs_queue, dbc, links);
+	DB_THREAD_UNLOCK(dbp);
+
+	*dbcp = dbc;
+	return (0);
+}
+
+/*
+ * __bam_c_close --
+ * Close a single cursor.
+ *
+ * Performs the physical delete still pending for the cursor, if any,
+ * releases the cursor's lock when the handle has no transaction, unlinks
+ * the DBC from dbp->curs_queue and frees both the CURSOR and the DBC.
+ * Returns the result of the pending delete, or 0 when none was pending.
+ * The DBC must not be used after this call.
+ */
+static int
+__bam_c_close(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ CURSOR *cp;
+ int ret;
+
+ DEBUG_LWRITE(dbc->dbp, dbc->txn, "bam_c_close", NULL, NULL, 0);
+
+ GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret);
+ cp = dbc->internal;
+
+ /* If a cursor key was deleted do the actual deletion. */
+ ret = F_ISSET(cp, C_DELETED) ? __bam_c_physdel(dbp, cp, NULL) : 0;
+
+ /* Discard any lock if we're not inside a transaction. */
+ if (dbp->txn == NULL && cp->lock != LOCK_INVALID)
+ (void)__BT_TLPUT(dbp, cp->lock);
+
+ /* Remove the cursor from the queue. */
+ DB_THREAD_LOCK(dbp);
+ TAILQ_REMOVE(&dbp->curs_queue, dbc, links);
+ DB_THREAD_UNLOCK(dbp);
+
+ /* Discard the structures. */
+ FREE(cp, sizeof(CURSOR));
+ FREE(dbc, sizeof(DBC));
+
+ PUTHANDLE(dbp);
+ return (ret);
+}
+
+/*
+ * __bam_c_del --
+ * Delete using a cursor.
+ *
+ * Marks the item the cursor references as deleted on its page and calls
+ * __bam_ca_delete for that page/index; the record itself stays on the
+ * page (see the comment on the write lock below).  Returns the flag-check
+ * error for invalid arguments, DB_KEYEMPTY when the cursor is already
+ * marked C_DELETED or C_REPLACE, and otherwise 0 or the first error from
+ * locking, fetching, logging or returning the page.
+ */
+static int
+__bam_c_del(dbc, flags)
+ DBC *dbc;
+ int flags;
+{
+ CURSOR *cp;
+ DB *dbp;
+ DB_LOCK lock;
+ PAGE *h;
+ db_pgno_t pgno;
+ db_indx_t indx;
+ int ret;
+
+ DEBUG_LWRITE(dbc->dbp, dbc->txn, "bam_c_del", NULL, NULL, flags);
+
+ cp = dbc->internal;
+
+ /* Check for invalid flags. */
+ if ((ret = __db_cdelchk(dbc->dbp, flags,
+ F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0)
+ return (ret);
+
+ /* If already deleted, return failure. */
+ if (F_ISSET(cp, C_DELETED | C_REPLACE))
+ return (DB_KEYEMPTY);
+
+ GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret);
+
+ /*
+ * We don't physically delete the record until the cursor moves,
+ * so we have to have a long-lived write lock on the page instead
+ * of a long-lived read lock. Note, we have to have a read lock
+ * to even get here, so we simply discard it.
+ *
+ * The write lock is acquired before the old lock is released, and
+ * the cursor's lock and mode are only updated once that succeeds.
+ */
+ if (F_ISSET(dbp, DB_AM_LOCKING) && cp->mode != DB_LOCK_WRITE) {
+ if ((ret = __bam_lget(dbp,
+ 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0)
+ goto err;
+ (void)__BT_TLPUT(dbp, cp->lock);
+ cp->lock = lock;
+ cp->mode = DB_LOCK_WRITE;
+ }
+
+ /*
+ * Acquire the underlying page (which may be different from the above
+ * page because it may be a duplicate page), and set the on-page and
+ * in-cursor delete flags. We don't need to lock it as we've already
+ * write-locked the page leading to it.
+ */
+ if (cp->dpgno == PGNO_INVALID) {
+ pgno = cp->pgno;
+ indx = cp->indx;
+ } else {
+ pgno = cp->dpgno;
+ indx = cp->dindx;
+ }
+
+ if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0)
+ goto err;
+
+ /* Log the change.  On failure the page is returned unmodified. */
+ if (DB_LOGGING(dbp) &&
+ (ret = __bam_cdel_log(dbp->dbenv->lg_info, dbp->txn, &LSN(h),
+ 0, dbp->log_fileid, PGNO(h), &LSN(h), indx)) != 0) {
+ (void)memp_fput(dbp->mpf, h, 0);
+ goto err;
+ }
+
+ /*
+ * Set the intent-to-delete flag on the page and in all cursors.
+ * When the cursor is not on a duplicate page the flag is set on the
+ * item at indx + O_INDX; on a duplicate page it is set at indx.
+ */
+ if (cp->dpgno == PGNO_INVALID)
+ GET_BKEYDATA(h, indx + O_INDX)->deleted = 1;
+ else
+ GET_BKEYDATA(h, indx)->deleted = 1;
+ (void)__bam_ca_delete(dbp, pgno, indx, NULL);
+
+ ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY);
+
+err: PUTHANDLE(dbp);
+ return (ret);
+}
+
+/*
+ * __bam_get --
+ *	Retrieve a key/data pair from the tree.
+ *
+ *	Implemented as a cursor get on a throw-away cursor built on the
+ *	stack: DB_SET_RECNO if that flag is set in flags, DB_SET otherwise.
+ *	Returns 0 or the error from the flag check or the cursor get.
+ *
+ * PUBLIC: int __bam_get __P((DB *, DB_TXN *, DBT *, DBT *, int));
+ */
+int
+__bam_get(argdbp, txn, key, data, flags)
+	DB *argdbp;
+	DB_TXN *txn;
+	DBT *key, *data;
+	int flags;
+{
+	DBC dbc;
+	CURSOR cp;
+	int op, ret;
+
+	DEBUG_LREAD(argdbp, txn, "bam_get", key, NULL, flags);
+
+	/* Validate the arguments before doing any work. */
+	ret = __db_getchk(argdbp, key, data, flags);
+	if (ret != 0)
+		return (ret);
+
+	/* Build the temporary handle and its unpositioned cursor. */
+	memset(&dbc, 0, sizeof(dbc));
+	memset(&cp, 0, sizeof(cp));
+
+	dbc.dbp = argdbp;
+	dbc.txn = txn;
+	dbc.internal = &cp;
+
+	cp.dbc = &dbc;
+	cp.pgno = PGNO_INVALID;
+	cp.dpgno = PGNO_INVALID;
+	cp.lock = LOCK_INVALID;
+
+	/* Look the key up, by record number if the caller asked for that. */
+	op = LF_ISSET(DB_SET_RECNO) ? DB_SET_RECNO : DB_SET;
+	ret = __bam_c_get(&dbc, key, data, op);
+	if (ret != 0)
+		return (ret);
+
+	/* Discard any lock, the cursor didn't really exist. */
+	if (cp.lock != LOCK_INVALID)
+		(void)__BT_TLPUT(argdbp, cp.lock);
+
+	return (0);
+}
+
+/*
+ * __bam_c_get --
+ *	Get using a cursor (btree).
+ *
+ *	Positions the cursor as directed by flags (DB_CURRENT, DB_FIRST,
+ *	DB_LAST, DB_NEXT, DB_PREV, DB_SET, DB_SET_RANGE, DB_SET_RECNO),
+ *	returns the key (for every flag except DB_SET) and the data, and
+ *	then completes any physical delete that was pending at the cursor's
+ *	previous position.  DB_GET_RECNO is handed off to __bam_c_rget.
+ *
+ *	Returns 0 on success.  On any failure the pinned page and new lock
+ *	are released, the cursor is restored to the state it had on entry,
+ *	and the error is returned.
+ */
+static int
+__bam_c_get(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key, *data;
+	int flags;
+{
+	BTREE *t;
+	CURSOR *cp, copy;
+	DB *dbp;
+	PAGE *h;
+	int exact, ret;
+
+	DEBUG_LREAD(dbc->dbp, dbc->txn, "bam_c_get",
+	    flags == DB_SET || flags == DB_SET_RANGE ? key : NULL,
+	    NULL, flags);
+
+	cp = dbc->internal;
+
+	/* Check for invalid flags. */
+	if ((ret = __db_cgetchk(dbc->dbp,
+	    key, data, flags, cp->pgno != PGNO_INVALID)) != 0)
+		return (ret);
+
+	GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret);
+	t = dbp->internal;
+
+	/*
+	 * Break out the code to return a cursor's record number.  It
+	 * has nothing to do with the cursor get code except that it's
+	 * been rammed into the interface.
+	 */
+	if (LF_ISSET(DB_GET_RECNO)) {
+		ret = __bam_c_rget(dbp, cp, key, data, flags);
+		PUTHANDLE(dbp);
+		return (ret);
+	}
+
+	/*
+	 * Initialize the cursor for a new retrieval.  The saved copy is what
+	 * the error path restores, and it also carries the previous lock and
+	 * delete flag that are dealt with once the new position is known.
+	 */
+	copy = *cp;
+	cp->page = NULL;
+	cp->lock = LOCK_INVALID;
+
+	switch (flags) {
+	case DB_CURRENT:
+		/* It's not possible to return a deleted record. */
+		if (F_ISSET(cp, C_DELETED | C_REPLACE)) {
+			PUTHANDLE(dbp);
+			return (DB_KEYEMPTY);
+		}
+
+		/* Get the page with the current item on it. */
+		if ((ret = __bam_pget(dbp, &cp->page, &cp->pgno, 0)) != 0)
+			goto err;
+		break;
+	case DB_NEXT:
+		if (cp->pgno != PGNO_INVALID) {
+			if ((ret = __bam_c_next(dbp, cp, 1)) != 0)
+				goto err;
+			break;
+		}
+		/* An unpositioned cursor starts from the first record. */
+		/* FALLTHROUGH */
+	case DB_FIRST:
+		if ((ret = __bam_c_first(dbp, cp)) != 0)
+			goto err;
+		break;
+	case DB_PREV:
+		if (cp->pgno != PGNO_INVALID) {
+			if ((ret = __bam_c_prev(dbp, cp)) != 0)
+				goto err;
+			break;
+		}
+		/* An unpositioned cursor starts from the last record. */
+		/* FALLTHROUGH */
+	case DB_LAST:
+		if ((ret = __bam_c_last(dbp, cp)) != 0)
+			goto err;
+		break;
+	case DB_SET_RECNO:
+		exact = 1;
+		if ((ret =
+		    __bam_c_search(dbp, cp, key, S_FIND, 1, &exact)) != 0)
+			goto err;
+		break;
+	case DB_SET:
+		exact = 1;
+		if ((ret =
+		    __bam_c_search(dbp, cp, key, S_FIND, 0, &exact)) != 0)
+			goto err;
+		break;
+	case DB_SET_RANGE:
+		exact = 0;
+		if ((ret =
+		    __bam_c_search(dbp, cp, key, S_FIND, 0, &exact)) != 0)
+			goto err;
+		break;
+	}
+
+	/*
+	 * Return the key if the user didn't give us one.  If we've moved to
+	 * a duplicate page, we may no longer have a pointer to the main page,
+	 * so we have to go get it.  We know that it's already read-locked,
+	 * however, so we don't have to acquire a new lock.
+	 */
+	if (flags != DB_SET) {
+		if (cp->dpgno != PGNO_INVALID) {
+			if ((ret = __bam_pget(dbp, &h, &cp->pgno, 0)) != 0)
+				goto err;
+		} else
+			h = cp->page;
+		ret = __db_ret(dbp,
+		    h, cp->indx, key, &t->bt_rkey.data, &t->bt_rkey.ulen);
+		if (cp->dpgno != PGNO_INVALID)
+			(void)memp_fput(dbp->mpf, h, 0);
+		if (ret)
+			goto err;
+	}
+
+	/* Return the data. */
+	if ((ret = __db_ret(dbp, cp->page,
+	    cp->dpgno == PGNO_INVALID ? cp->indx + O_INDX : cp->dindx,
+	    data, &t->bt_rdata.data, &t->bt_rdata.ulen)) != 0)
+		goto err;
+
+	/*
+	 * If the previous cursor record has been deleted, delete it.  The
+	 * returned key isn't a deleted key, so clear the flag.
+	 *
+	 * The result has to be stored in ret: the error path returns ret,
+	 * and without the assignment a failed delete would roll the cursor
+	 * back yet report success (ret is 0 from the __db_ret call above).
+	 */
+	if (F_ISSET(&copy, C_DELETED) &&
+	    (ret = __bam_c_physdel(dbp, &copy, cp->page)) != 0)
+		goto err;
+	F_CLR(cp, C_DELETED | C_REPLACE);
+
+	/* Release the previous lock, if any. */
+	if (copy.lock != LOCK_INVALID)
+		(void)__BT_TLPUT(dbp, copy.lock);
+
+	/* Release the pinned page. */
+	ret = memp_fput(dbp->mpf, cp->page, 0);
+
+	++t->lstat.bt_get;
+
+	/* The block below is only ever entered through the err label. */
+	if (0) {
+err:		if (cp->page != NULL)
+			(void)memp_fput(dbp->mpf, cp->page, 0);
+		if (cp->lock != LOCK_INVALID)
+			(void)__BT_TLPUT(dbp, cp->lock);
+		*cp = copy;
+	}
+
+	PUTHANDLE(dbp);
+	return (ret);
+}
+
+/*
+ * __bam_c_rget --
+ * Return the record number for a cursor.
+ *
+ * Fetches a private copy of the key at the cursor's position, looks that
+ * key up again with __bam_search to obtain its record number, and copies
+ * the record number into data.  The key and flags parameters are not
+ * referenced in the body.  Returns 0 or the first error encountered; the
+ * page fetched here is returned to the pool on every path after the
+ * fetch succeeds.
+ */
+static int
+__bam_c_rget(dbp, cp, key, data, flags)
+ DB *dbp;
+ CURSOR *cp;
+ DBT *key, *data;
+ int flags;
+{
+ BTREE *t;
+ DBT dbt;
+ db_recno_t recno;
+ int exact, ret;
+
+ /* Get the page with the current item on it. */
+ if ((ret = __bam_pget(dbp, &cp->page, &cp->pgno, 0)) != 0)
+ return (ret);
+
+ /*
+ * Get a copy of the key.  dbt is zeroed first, so dbt.data is NULL
+ * on entry to the free() at the err label if nothing gets stored.
+ */
+ memset(&dbt, 0, sizeof(DBT));
+ dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL;
+ if ((ret = __db_ret(dbp, cp->page, cp->indx, &dbt, NULL, NULL)) != 0)
+ goto err;
+
+ exact = 1;
+ if ((ret = __bam_search(dbp, &dbt, S_FIND, 1, &recno, &exact)) != 0)
+ goto err;
+
+ t = dbp->internal;
+ ret = __db_retcopy(data, &recno, sizeof(recno),
+ &t->bt_rdata.data, &t->bt_rdata.ulen, dbp->db_malloc);
+
+ /* Release the stack. */
+ __bam_stkrel(dbp);
+
+err: (void)memp_fput(dbp->mpf, cp->page, 0);
+ free(dbt.data);
+ return (ret);
+}
+
+/*
+ * __bam_c_put --
+ * Put using a cursor.
+ *
+ * For DB_AFTER, DB_BEFORE and DB_CURRENT the item is inserted relative
+ * to the cursor's current position; for DB_KEYFIRST and DB_KEYLAST the
+ * position is found by searching for key.  When __bam_iitem reports
+ * DB_NEEDSPLIT the page is split and the insert is retried from the top.
+ * On success the cursor references the new item and any delete pending
+ * at its previous position is carried out.  On failure the pinned page
+ * and new lock are released and the cursor is restored to its state on
+ * entry.  Returns 0 or the first error encountered.
+ */
+static int
+__bam_c_put(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ int flags;
+{
+ BTREE *t;
+ CURSOR *cp, copy;
+ DB *dbp;
+ DBT dbt;
+ db_indx_t indx;
+ db_pgno_t pgno;
+ int exact, needkey, ret;
+ void *arg;
+
+ DEBUG_LWRITE(dbc->dbp, dbc->txn, "bam_c_put",
+ flags == DB_KEYFIRST || flags == DB_KEYLAST ? key : NULL,
+ data, flags);
+
+ cp = dbc->internal;
+
+ if ((ret = __db_cputchk(dbc->dbp, key, data, flags,
+ F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0)
+ return (ret);
+
+ GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret);
+ t = dbp->internal;
+
+ /* Initialize the cursor for a new retrieval. */
+ copy = *cp;
+ cp->page = NULL;
+ cp->lock = LOCK_INVALID;
+
+ /*
+ * To split, we need a valid key for the page. Since it's a cursor,
+ * we have to build one.
+ *
+ * The if (0) block is skipped on the way in and is only entered by
+ * the goto split statements in the switch below; after the split,
+ * control falls out of the block and the insert is attempted again.
+ * needkey and indx are always assigned before any goto split.
+ */
+ if (0) {
+split: if (needkey) {
+ memset(&dbt, 0, sizeof(DBT));
+ ret = __db_ret(dbp, cp->page, indx,
+ &dbt, &t->bt_rkey.data, &t->bt_rkey.ulen);
+
+ DISCARD(dbp, cp);
+
+ if (ret)
+ goto err;
+ arg = &dbt;
+ } else {
+ (void)__bam_stkrel(dbp);
+ arg = key;
+ }
+ if ((ret = __bam_split(dbp, arg)) != 0)
+ goto err;
+ }
+
+ /* If there's no key supplied, use the cursor. */
+ if (flags == DB_KEYFIRST || flags == DB_KEYLAST)
+ needkey = 0;
+ else {
+ needkey = 1;
+ if (cp->dpgno == PGNO_INVALID) {
+ pgno = cp->pgno;
+ indx = cp->indx;
+ } else {
+ pgno = cp->dpgno;
+ indx = cp->dindx;
+ }
+ /*
+ * Acquire the current page.  The lock is taken on cp->pgno even
+ * when the page fetched is the duplicate page cp->dpgno.
+ */
+ if ((ret = __bam_lget(dbp,
+ 0, cp->pgno, DB_LOCK_WRITE, &cp->lock)) != 0)
+ goto err;
+ if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0)
+ goto err;
+ }
+
+ ret = 0;
+ switch (flags) {
+ case DB_AFTER:
+ case DB_BEFORE:
+ case DB_CURRENT:
+ if ((ret = __bam_iitem(dbp, &cp->page,
+ &indx, key, data, flags, 0)) == DB_NEEDSPLIT)
+ goto split;
+ break;
+ case DB_KEYFIRST:
+ exact = 0;
+ if ((ret =
+ __bam_c_search(dbp, cp, key, S_KEYFIRST, 0, &exact)) != 0)
+ goto err;
+
+ indx = cp->dpgno == PGNO_INVALID ? cp->indx : cp->dindx;
+ if ((ret = __bam_iitem(dbp, &cp->page, &indx, key,
+ data, DB_BEFORE, exact ? 0 : BI_NEWKEY)) == DB_NEEDSPLIT)
+ goto split;
+ if (ret)
+ goto err;
+ break;
+ case DB_KEYLAST:
+ exact = 0;
+ if ((ret =
+ __bam_c_search(dbp, cp, key, S_KEYLAST, 0, &exact)) != 0)
+ goto err;
+
+ indx = cp->dpgno == PGNO_INVALID ? cp->indx : cp->dindx;
+ if ((ret = __bam_iitem(dbp, &cp->page, &indx, key,
+ data, DB_AFTER, exact ? 0 : BI_NEWKEY)) == DB_NEEDSPLIT)
+ goto split;
+ break;
+ }
+ if (ret)
+ goto err;
+
+ /*
+ * Update the cursor to point to the new entry. The new entry was
+ * stored on the current page, because we split pages until it was
+ * possible.
+ */
+ if (cp->dpgno == PGNO_INVALID)
+ cp->indx = indx;
+ else
+ cp->dindx = indx;
+
+ /*
+ * If the previous cursor record has been deleted, delete it. The
+ * returned key isn't a deleted key, so clear the flag.
+ */
+ if (F_ISSET(&copy, C_DELETED) &&
+ (ret = __bam_c_physdel(dbp, &copy, cp->page)) != 0)
+ goto err;
+ F_CLR(cp, C_DELETED | C_REPLACE);
+
+ /* Release the previous lock, if any. */
+ if (copy.lock != LOCK_INVALID)
+ (void)__BT_TLPUT(dbp, copy.lock);
+
+ /* Discard the pinned page. */
+ ret = memp_fput(dbp->mpf, cp->page, 0);
+ if (0) { /* entered only through the err label */
+err: if (cp->page != NULL)
+ (void)memp_fput(dbp->mpf, cp->page, 0);
+ if (cp->lock != LOCK_INVALID)
+ (void)__BT_TLPUT(dbp, cp->lock);
+ *cp = copy;
+ }
+
+ PUTHANDLE(dbp);
+ return (ret);
+}
+
+/*
+ * __bam_c_first --
+ * Return the first record.
+ *
+ * Descends from the root through entry 0 of each internal page, read
+ * locking and fetching each page and discarding it before moving down,
+ * until a leaf is reached; then positions cp on the first entry that is
+ * not marked deleted, stepping into a duplicate page where there is one.
+ * Returns 0 or the first error encountered.  On error the function
+ * returns at once; whatever is held in cp->lock and cp->page at that
+ * point is not released here.
+ */
+static int
+__bam_c_first(dbp, cp)
+ DB *dbp;
+ CURSOR *cp;
+{
+ db_pgno_t pgno;
+ int ret;
+
+ /* Walk down the left-hand side of the tree. */
+ for (pgno = PGNO_ROOT;;) {
+ if ((ret =
+ __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
+ return (ret);
+ if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0)
+ return (ret);
+
+ /* If we find a leaf page, we're done. */
+ if (ISLEAF(cp->page))
+ break;
+
+ pgno = GET_BINTERNAL(cp->page, 0)->pgno;
+ DISCARD(dbp, cp);
+ }
+
+ cp->pgno = cp->page->pgno;
+ cp->indx = 0;
+ cp->dpgno = PGNO_INVALID;
+
+ /* If it's an empty page or a deleted record, go to the next one. */
+ if (NUM_ENT(cp->page) == 0 ||
+ GET_BKEYDATA(cp->page, cp->indx + O_INDX)->deleted)
+ if ((ret = __bam_c_next(dbp, cp, 0)) != 0)
+ return (ret);
+
+ /*
+ * If it's a duplicate reference, go to the first entry.
+ *
+ * NOTE(review): the index passed is O_INDX, where __bam_c_last
+ * passes cp->indx + O_INDX.  The two agree only while cp->indx is
+ * still 0, which need not hold if __bam_c_next ran above -- confirm.
+ */
+ if ((ret = __bam_ovfl_chk(dbp, cp, O_INDX, 0)) != 0)
+ return (ret);
+
+ /* If it's a deleted record, go to the next one. */
+ if (cp->dpgno != PGNO_INVALID &&
+ GET_BKEYDATA(cp->page, cp->dindx)->deleted)
+ if ((ret = __bam_c_next(dbp, cp, 0)) != 0)
+ return (ret);
+ return (0);
+}
+
+/*
+ * __bam_c_last --
+ * Return the last record.
+ *
+ * Descends from the root through the last entry of each internal page,
+ * read locking and fetching each page and discarding it before moving
+ * down, until a leaf is reached; then positions cp on the last entry
+ * that is not marked deleted, stepping into a duplicate page where there
+ * is one.  Returns 0 or the first error encountered.  On error the
+ * function returns at once; whatever is held in cp->lock and cp->page at
+ * that point is not released here.
+ */
+static int
+__bam_c_last(dbp, cp)
+ DB *dbp;
+ CURSOR *cp;
+{
+ db_pgno_t pgno;
+ int ret;
+
+ /* Walk down the right-hand side of the tree. */
+ for (pgno = PGNO_ROOT;;) {
+ if ((ret =
+ __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
+ return (ret);
+ if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0)
+ return (ret);
+
+ /* If we find a leaf page, we're done. */
+ if (ISLEAF(cp->page))
+ break;
+
+ pgno =
+ GET_BINTERNAL(cp->page, NUM_ENT(cp->page) - O_INDX)->pgno;
+ DISCARD(dbp, cp);
+ }
+
+ cp->pgno = cp->page->pgno;
+ cp->indx = NUM_ENT(cp->page) == 0 ? 0 : NUM_ENT(cp->page) - P_INDX; /* an empty page is special-cased so the subtraction cannot go below 0 */
+ cp->dpgno = PGNO_INVALID;
+
+ /* If it's an empty page or a deleted record, go to the previous one. */
+ if (NUM_ENT(cp->page) == 0 ||
+ GET_BKEYDATA(cp->page, cp->indx + O_INDX)->deleted)
+ if ((ret = __bam_c_prev(dbp, cp)) != 0)
+ return (ret);
+
+ /* If it's a duplicate reference, go to the last entry. */
+ if ((ret = __bam_ovfl_chk(dbp, cp, cp->indx + O_INDX, 1)) != 0)
+ return (ret);
+
+ /* If it's a deleted record, go to the previous one. */
+ if (cp->dpgno != PGNO_INVALID &&
+ GET_BKEYDATA(cp->page, cp->dindx)->deleted)
+ if ((ret = __bam_c_prev(dbp, cp)) != 0)
+ return (ret);
+ return (0);
+}
+
+/*
+ * __bam_c_next --
+ * Move to the next record.
+ *
+ * Advances cp to the next entry that is not marked deleted, following
+ * next_pgno links between pages and returning from a chain of duplicate
+ * pages to the leaf page it hangs off.  If initial_move is non-zero the
+ * cursor first steps past its current position; if zero, the current
+ * position is itself the first candidate.  If cp->page is NULL the
+ * current page is read locked and fetched first.  Returns 0 with the
+ * cursor's pgno/indx (or dpgno/dindx) updated, DB_NOTFOUND when the
+ * leaf chain ends, or the first lock/page error.
+ */
+static int
+__bam_c_next(dbp, cp, initial_move)
+ DB *dbp;
+ CURSOR *cp;
+ int initial_move;
+{
+ db_indx_t adjust, indx;
+ db_pgno_t pgno;
+ int ret;
+
+ /*
+ * We're either moving through a page of duplicates or a btree leaf
+ * page.
+ *
+ * adjust is the distance between consecutive entries: P_INDX on a
+ * DB_BTREE leaf page, O_INDX otherwise.
+ */
+ if (cp->dpgno == PGNO_INVALID) {
+ adjust = dbp->type == DB_BTREE ? P_INDX : O_INDX;
+ pgno = cp->pgno;
+ indx = cp->indx;
+ } else {
+ adjust = O_INDX;
+ pgno = cp->dpgno;
+ indx = cp->dindx;
+ }
+ if (cp->page == NULL) {
+ if ((ret =
+ __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
+ return (ret);
+ if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0)
+ return (ret);
+ }
+
+ /*
+ * If at the end of the page, move to a subsequent page.
+ *
+ * !!!
+ * Check for >= NUM_ENT. If we're here as the result of a search that
+ * landed us on NUM_ENT, we'll increment indx before we test.
+ *
+ * !!!
+ * This code handles empty pages and pages with only deleted entries.
+ */
+ if (initial_move)
+ indx += adjust;
+ for (;;) {
+ if (indx >= NUM_ENT(cp->page)) {
+ pgno = cp->page->next_pgno; /* read before DISCARD gives the page back */
+ DISCARD(dbp, cp);
+
+ /*
+ * If we're in a btree leaf page, we've reached the end
+ * of the tree. If we've reached the end of a page of
+ * duplicates, continue from the btree leaf page where
+ * we found this page of duplicates.
+ */
+ if (pgno == PGNO_INVALID) {
+ /* If in a btree leaf page, it's EOF. */
+ if (cp->dpgno == PGNO_INVALID)
+ return (DB_NOTFOUND);
+
+ /* Continue from the last btree leaf page. */
+ cp->dpgno = PGNO_INVALID;
+
+ adjust = P_INDX;
+ pgno = cp->pgno;
+ indx = cp->indx + P_INDX;
+ } else
+ indx = 0;
+
+ if ((ret = __bam_lget(dbp,
+ 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
+ return (ret);
+ if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0)
+ return (ret);
+ continue;
+ }
+
+ /*
+ * Ignore deleted records.  Only done for DB_BTREE; the flag is
+ * read at indx + O_INDX on a leaf page and at indx on a
+ * duplicate page.
+ */
+ if (dbp->type == DB_BTREE &&
+ ((cp->dpgno == PGNO_INVALID &&
+ GET_BKEYDATA(cp->page, indx + O_INDX)->deleted) ||
+ (cp->dpgno != PGNO_INVALID &&
+ GET_BKEYDATA(cp->page, indx)->deleted))) {
+ indx += adjust;
+ continue;
+ }
+
+ /*
+ * If we're not in a duplicates page, check to see if we've
+ * found a page of duplicates, in which case we move to the
+ * first entry.
+ */
+ if (cp->dpgno == PGNO_INVALID) {
+ cp->pgno = cp->page->pgno;
+ cp->indx = indx;
+
+ if ((ret =
+ __bam_ovfl_chk(dbp, cp, indx + O_INDX, 0)) != 0)
+ return (ret);
+ if (cp->dpgno != PGNO_INVALID) {
+ indx = cp->dindx;
+ adjust = O_INDX;
+ continue;
+ }
+ } else {
+ cp->dpgno = cp->page->pgno;
+ cp->dindx = indx;
+ }
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __bam_c_prev --
+ * Move to the previous record.
+ *
+ * dbp: database handle; dbp->type selects the index stride on leaf pages.
+ * cp: cursor to move; pgno/indx (and dpgno/dindx when the cursor is on
+ * a page of duplicates) are updated to the record that was found.
+ *
+ * Returns 0 on success, DB_NOTFOUND when a btree leaf page has no
+ * previous page, or the error from a lock, page-get or overflow-check
+ * call.
+ */
+static int
+__bam_c_prev(dbp, cp)
+ DB *dbp;
+ CURSOR *cp;
+{
+ db_indx_t indx, adjust;
+ db_pgno_t pgno;
+ int ret, set_indx;
+
+ /*
+ * We're either moving through a page of duplicates or a btree leaf
+ * page.
+ *
+ * adjust is the distance between consecutive records: P_INDX on a
+ * DB_BTREE leaf page, O_INDX on any other leaf page and on pages of
+ * duplicates.
+ */
+ if (cp->dpgno == PGNO_INVALID) {
+ adjust = dbp->type == DB_BTREE ? P_INDX : O_INDX;
+ pgno = cp->pgno;
+ indx = cp->indx;
+ } else {
+ adjust = O_INDX;
+ pgno = cp->dpgno;
+ indx = cp->dindx;
+ }
+ /* If the cursor holds no page, read-lock and fetch the current one. */
+ if (cp->page == NULL) {
+ if ((ret =
+ __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
+ return (ret);
+ if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0)
+ return (ret);
+ }
+
+ /*
+ * If at the beginning of the page, move to any previous one.
+ *
+ * !!!
+ * This code handles empty pages and pages with only deleted entries.
+ */
+ for (;;) {
+ if (indx == 0) {
+ /* Save the link before the page is discarded. */
+ pgno = cp->page->prev_pgno;
+ /* NOTE(review): DISCARD presumably releases cp->page and its lock -- confirm. */
+ DISCARD(dbp, cp);
+
+ /*
+ * If we're in a btree leaf page, we've reached the
+ * beginning of the tree. If we've reached the first
+ * of a page of duplicates, continue from the btree
+ * leaf page where we found this page of duplicates.
+ */
+ if (pgno == PGNO_INVALID) {
+ /* If in a btree leaf page, it's SOF. */
+ if (cp->dpgno == PGNO_INVALID)
+ return (DB_NOTFOUND);
+
+ /* Continue from the last btree leaf page. */
+ cp->dpgno = PGNO_INVALID;
+
+ /*
+ * Resume at the leaf entry that led to the duplicates;
+ * the decrement below then steps to the entry before it.
+ */
+ adjust = P_INDX;
+ pgno = cp->pgno;
+ indx = cp->indx;
+ set_indx = 0;
+ } else
+ set_indx = 1;
+
+ if ((ret = __bam_lget(dbp,
+ 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
+ return (ret);
+ if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0)
+ return (ret);
+
+ /*
+ * On a genuinely previous page start one past its last entry;
+ * if there is nothing before the starting point, keep walking
+ * backward.
+ */
+ if (set_indx)
+ indx = NUM_ENT(cp->page);
+ if (indx == 0)
+ continue;
+ }
+
+ /*
+ * Ignore deleted records. The index is stepped back first, then
+ * the candidate is tested (DB_BTREE only): on a leaf page the flag
+ * is read from the entry at indx + O_INDX, on a page of duplicates
+ * from the entry at indx.
+ */
+ indx -= adjust;
+ if (dbp->type == DB_BTREE &&
+ ((cp->dpgno == PGNO_INVALID &&
+ GET_BKEYDATA(cp->page, indx + O_INDX)->deleted) ||
+ (cp->dpgno != PGNO_INVALID &&
+ GET_BKEYDATA(cp->page, indx)->deleted)))
+ continue;
+
+ /*
+ * If we're not in a duplicates page, check to see if we've
+ * found a page of duplicates, in which case we move to the
+ * last entry.
+ */
+ if (cp->dpgno == PGNO_INVALID) {
+ cp->pgno = cp->page->pgno;
+ cp->indx = indx;
+
+ if ((ret =
+ __bam_ovfl_chk(dbp, cp, indx + O_INDX, 1)) != 0)
+ return (ret);
+ /*
+ * Now on a page of duplicates. Start one past cp->dindx so
+ * that the decrement at the top of the next iteration lands
+ * on cp->dindx itself.
+ */
+ if (cp->dpgno != PGNO_INVALID) {
+ indx = cp->dindx + O_INDX;
+ adjust = O_INDX;
+ continue;
+ }
+ } else {
+ cp->dpgno = cp->page->pgno;
+ cp->dindx = indx;
+ }
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __bam_c_search --
+ *	Position the cursor on the record identified by key.
+ *
+ *	On entry *exactp says whether the caller requires an exact match;
+ *	on return it holds whatever the underlying search reported.  If
+ *	isrecno is non-zero the key is converted to a record number and the
+ *	record-number search is used, otherwise the ordinary btree search.
+ *
+ *	Returns 0 on success, DB_NOTFOUND if an exact match was required
+ *	but not found, or the error from any helper that failed.
+ */
+static int
+__bam_c_search(dbp, cp, key, flags, isrecno, exactp)
+	DB *dbp;
+	CURSOR *cp;
+	const DBT *key;
+	u_int flags;
+	int isrecno, *exactp;
+{
+	BTREE *t;
+	db_recno_t recno;
+	int ret, want_exact;
+
+	want_exact = *exactp;
+	t = dbp->internal;
+
+	/*
+	 * Run the search; it leaves the page it found on the btree stack.
+	 * Note the index returned may be one past the last entry on the
+	 * page, which is dealt with below.
+	 */
+	if (!isrecno)
+		ret = __bam_search(dbp, key, flags, 1, NULL, exactp);
+	else {
+		ret = __ram_getno(dbp, key, &recno, 0);
+		if (ret != 0)
+			return (ret);
+		ret = __bam_rsearch(dbp, &recno, flags, 1, exactp);
+	}
+	if (ret != 0)
+		return (ret);
+
+	/* Copy the search result into the cursor. */
+	cp->page = t->bt_csp->page;
+	cp->pgno = cp->page->pgno;
+	cp->indx = t->bt_csp->indx;
+	cp->lock = t->bt_csp->lock;
+	cp->dpgno = PGNO_INVALID;
+
+	if (!*exactp) {
+		/* No exact match: that is fatal only if one was required. */
+		if (want_exact)
+			return (DB_NOTFOUND);
+	} else {
+		/*
+		 * Exact match: if the data item is a chain of duplicates,
+		 * move onto an entry in that chain.
+		 */
+		ret = __bam_ovfl_chk(dbp,
+		    cp, cp->indx + O_INDX, LF_ISSET(S_DUPLAST));
+		if (ret != 0)
+			return (ret);
+	}
+
+	/* If past the end of a page, find the next entry. */
+	if (cp->indx == NUM_ENT(cp->page)) {
+		ret = __bam_c_next(dbp, cp, 0);
+		if (ret != 0)
+			return (ret);
+	}
+
+	/* Done unless we sit on a deleted entry of a duplicates page. */
+	if (cp->dpgno == PGNO_INVALID ||
+	    !GET_BKEYDATA(cp->page, cp->dindx)->deleted)
+		return (0);
+
+	/* Step off the deleted record: backward for S_KEYLAST, else forward. */
+	if (flags == S_KEYLAST)
+		return (__bam_c_prev(dbp, cp));
+	return (__bam_c_next(dbp, cp, 0));
+}
+
+/*
+ * __bam_ovfl_chk --
+ *	If the entry at indx on the cursor's page refers to a set of
+ *	duplicates, move the cursor onto the duplicates: to the first entry
+ *	of the first page or, if to_end is non-zero, to the last entry of
+ *	the page returned by __db_dend().  Otherwise leave the cursor alone.
+ *
+ *	Returns 0 on success (including "nothing to do") or the error from
+ *	the page release/fetch that failed.
+ *
+ * PUBLIC: int __bam_ovfl_chk __P((DB *, CURSOR *, u_int32_t, int));
+ */
+int
+__bam_ovfl_chk(dbp, cp, indx, to_end)
+	DB *dbp;
+	CURSOR *cp;
+	u_int32_t indx;
+	int to_end;
+{
+	BOVERFLOW *ovfl;
+	db_pgno_t dup_pgno;
+	u_int32_t dup_indx;
+	int ret;
+
+	/* Only entries typed B_DUPLICATE need any work. */
+	ovfl = GET_BOVERFLOW(cp->page, indx);
+	if (ovfl->type != B_DUPLICATE)
+		return (0);
+
+	/*
+	 * Remember where the duplicates live, then give the main page back.
+	 *
+	 * XXX
+	 * We don't lock duplicates pages, we've already got the correct
+	 * lock on the main page.
+	 */
+	dup_pgno = ovfl->pgno;
+	ret = memp_fput(dbp->mpf, cp->page, 0);
+	if (ret != 0)
+		return (ret);
+	cp->page = NULL;
+
+	/* Fetch the duplicates page and pick the entry to sit on. */
+	if (!to_end) {
+		ret = __bam_pget(dbp, &cp->page, &dup_pgno, 0);
+		if (ret != 0)
+			return (ret);
+		dup_indx = 0;
+	} else {
+		ret = __db_dend(dbp, dup_pgno, &cp->page);
+		if (ret != 0)
+			return (ret);
+		dup_indx = NUM_ENT(cp->page) - O_INDX;
+	}
+
+	/* Record the duplicate position in the cursor. */
+	cp->dindx = dup_indx;
+	cp->dpgno = cp->page->pgno;
+
+	return (0);
+}
+
+#ifdef DEBUG
+/*
+ * __bam_cprint --
+ *	Display the current btree cursor list.
+ *
+ *	Writes one line per cursor on dbp's cursor queue to stderr: the
+ *	cursor's address, its leaf page/index, its duplicate page/index and
+ *	a "(deleted)" marker if C_DELETED is set.  Always returns 0.
+ */
+int
+__bam_cprint(dbp)
+	DB *dbp;
+{
+	CURSOR *cp;
+	DBC *dbc;
+
+	DB_THREAD_LOCK(dbp);
+	for (dbc = TAILQ_FIRST(&dbp->curs_queue);
+	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+		cp = (CURSOR *)dbc->internal;
+		/*
+		 * Print the address with %p: casting the pointer to u_int
+		 * loses the upper bits wherever pointers are wider than int.
+		 */
+		fprintf(stderr,
+		    "%p: page: %lu index: %lu dpage %lu dindex: %lu",
+		    (void *)cp, (u_long)cp->pgno, (u_long)cp->indx,
+		    (u_long)cp->dpgno, (u_long)cp->dindx);
+		if (F_ISSET(cp, C_DELETED))
+			fprintf(stderr, "(deleted)");
+		fprintf(stderr, "\n");
+	}
+	DB_THREAD_UNLOCK(dbp);
+	return (0);
+}
+#endif /* DEBUG */
+
+/*
+ * __bam_ca_delete --
+ *	Check if any of the cursors refer to the item we are about to delete.
+ *	We'll return the number of cursors that refer to the item in question.
+ *	If a cursor does refer to the item, then we set its deleted bit.
+ *
+ *	pgno/indx identify the item; a cursor refers to it if either its
+ *	leaf position (pgno, indx) or its duplicate position (dpgno, dindx)
+ *	matches.  curs is the cursor initiating the delete, or NULL if there
+ *	is none; it is never counted and never marked, wherever it sits.
+ *
+ * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, CURSOR *));
+ */
+int
+__bam_ca_delete(dbp, pgno, indx, curs)
+	DB *dbp;
+	db_pgno_t pgno;
+	u_int32_t indx;
+	CURSOR *curs;
+{
+	DBC *dbc;
+	CURSOR *cp;
+	int count;
+
+	/*
+	 * Adjust the cursors.  We don't have to review the cursors for any
+	 * process other than the current one, because we have the page write
+	 * locked at this point, and any other process had better be using a
+	 * different locker ID, meaning that only cursors in our process can
+	 * be on the page.
+	 *
+	 * It's possible for multiple cursors within the thread to have write
+	 * locks on the same page, but, cursors within a thread must be single
+	 * threaded, so all we're locking here is the cursor linked list.
+	 *
+	 * indx refers to the first of what might be a duplicate set.  The
+	 * cursor passed in is the one initiating the delete, so we don't
+	 * want to count it.
+	 */
+	DB_THREAD_LOCK(dbp);
+	for (count = 0, dbc = TAILQ_FIRST(&dbp->curs_queue);
+	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+		cp = (CURSOR *)dbc->internal;
+
+		/*
+		 * The initiating cursor must be excluded from BOTH position
+		 * tests.  If it were only excluded from the leaf-page test,
+		 * a cursor deleting the duplicate it is positioned on would
+		 * count itself and the caller would never physically delete
+		 * the item.
+		 */
+		if (cp == curs)
+			continue;
+
+		if ((cp->pgno == pgno && cp->indx == indx) ||
+		    (cp->dpgno == pgno && cp->dindx == indx)) {
+			++count;
+			F_SET(cp, C_DELETED);
+		}
+	}
+	DB_THREAD_UNLOCK(dbp);
+	return (count);
+}
+
+/*
+ * __bam_ca_di --
+ *	Shift cursor indices after an index was inserted into or deleted
+ *	from a page.  Every cursor position on page pgno at or beyond indx
+ *	(leaf position and duplicate position alike) has value added to its
+ *	index.  Does nothing for DB_RECNO databases.
+ *
+ * PUBLIC: void __bam_ca_di __P((DB *, db_pgno_t, u_int32_t, int));
+ */
+void
+__bam_ca_di(dbp, pgno, indx, value)
+	DB *dbp;
+	db_pgno_t pgno;
+	u_int32_t indx;
+	int value;
+{
+	CURSOR *cur;
+	DBC *link;
+
+	/* Recno is responsible for its own adjustments. */
+	if (dbp->type != DB_RECNO) {
+		/* Walk the cursor list; see the comment in __bam_ca_delete(). */
+		DB_THREAD_LOCK(dbp);
+		link = TAILQ_FIRST(&dbp->curs_queue);
+		while (link != NULL) {
+			cur = (CURSOR *)link->internal;
+
+			/* Leaf-page position. */
+			if (cur->pgno == pgno && cur->indx >= indx)
+				cur->indx += value;
+
+			/* Duplicate-page position. */
+			if (cur->dpgno == pgno && cur->dindx >= indx)
+				cur->dindx += value;
+
+			link = TAILQ_NEXT(link, links);
+		}
+		DB_THREAD_UNLOCK(dbp);
+	}
+}
+
+/*
+ * __bam_ca_dup --
+ *	Re-point cursors when a data item moves from a leaf page onto a
+ *	duplicates page.  A cursor at (fpgno, fi) that is not already on a
+ *	duplicates page is moved to leaf index first and duplicate position
+ *	(tpgno, ti).
+ *
+ * PUBLIC: void __bam_ca_dup __P((DB *,
+ * PUBLIC:    db_pgno_t, u_int32_t, u_int32_t, db_pgno_t, u_int32_t));
+ */
+void
+__bam_ca_dup(dbp, fpgno, first, fi, tpgno, ti)
+	DB *dbp;
+	db_pgno_t fpgno, tpgno;
+	u_int32_t first, fi, ti;
+{
+	CURSOR *cur;
+	DBC *link;
+
+	/*
+	 * Adjust the cursors.  See the comment in __bam_ca_delete().
+	 *
+	 * No need to test duplicates, this only gets called when moving
+	 * leaf page data items onto a duplicates page.
+	 */
+	DB_THREAD_LOCK(dbp);
+	link = TAILQ_FIRST(&dbp->curs_queue);
+	for (; link != NULL; link = TAILQ_NEXT(link, links)) {
+		cur = (CURSOR *)link->internal;
+
+		/*
+		 * Skip cursors that were already moved: items are moved
+		 * from the same leaf location more than once, and a cursor
+		 * that has been moved has a valid dpgno.
+		 */
+		if (cur->dpgno != PGNO_INVALID)
+			continue;
+
+		/* Skip cursors that are somewhere else entirely. */
+		if (cur->pgno != fpgno || cur->indx != fi)
+			continue;
+
+		cur->indx = first;
+		cur->dpgno = tpgno;
+		cur->dindx = ti;
+	}
+	DB_THREAD_UNLOCK(dbp);
+}
+
+/*
+ * __bam_ca_move --
+ *	Re-point cursors when the contents of page fpgno move to page
+ *	tpgno: every cursor whose leaf page is fpgno is switched to tpgno.
+ *	Does nothing for DB_RECNO databases.  The t argument is not used.
+ *
+ * PUBLIC: void __bam_ca_move __P((DB *, BTREE *, db_pgno_t, db_pgno_t));
+ */
+void
+__bam_ca_move(dbp, t, fpgno, tpgno)
+	DB *dbp;
+	BTREE *t;
+	db_pgno_t fpgno, tpgno;
+{
+	CURSOR *cur;
+	DBC *link;
+
+	/* Recno is responsible for its own adjustments. */
+	if (dbp->type != DB_RECNO) {
+		/*
+		 * Adjust the cursors.  See the comment in __bam_ca_delete().
+		 *
+		 * No need to test duplicates, this only gets called when
+		 * copying over the root page with a leaf or internal page.
+		 */
+		DB_THREAD_LOCK(dbp);
+		link = TAILQ_FIRST(&dbp->curs_queue);
+		while (link != NULL) {
+			cur = (CURSOR *)link->internal;
+			if (cur->pgno == fpgno)
+				cur->pgno = tpgno;
+			link = TAILQ_NEXT(link, links);
+		}
+		DB_THREAD_UNLOCK(dbp);
+	}
+}
+
+/*
+ * __bam_ca_replace --
+ *	Track cursors across the overwrite of an item: flag the cursors
+ *	that refer to the item before the overwrite, then resolve the flag
+ *	once the overwrite has succeeded or failed.
+ *
+ * PUBLIC: void __bam_ca_replace
+ * PUBLIC:    __P((DB *, db_pgno_t, u_int32_t, ca_replace_arg));
+ */
+void
+__bam_ca_replace(dbp, pgno, indx, pass)
+	DB *dbp;
+	db_pgno_t pgno;
+	u_int32_t indx;
+	ca_replace_arg pass;
+{
+	CURSOR *cur;
+	DBC *link;
+
+	/*
+	 * Adjust the cursors.  See the comment in __bam_ca_delete().
+	 *
+	 * REPLACE_SETUP:
+	 *	Set C_REPLACE_SETUP on every cursor positioned on the item
+	 *	(by leaf or by duplicate position) so it can be found again.
+	 *
+	 * REPLACE_SUCCESS:
+	 *	For flagged cursors, set C_REPLACE and clear C_DELETED and
+	 *	C_REPLACE_SETUP.
+	 *
+	 * REPLACE_FAILED:
+	 *	For flagged cursors, clear C_REPLACE_SETUP.
+	 *
+	 * In the last two passes the cursor's index on the page is reset to
+	 * indx, as it may have been changed by other cursor update routines
+	 * while the item was deleted/inserted.
+	 */
+	DB_THREAD_LOCK(dbp);
+	for (link = TAILQ_FIRST(&dbp->curs_queue);
+	    link != NULL; link = TAILQ_NEXT(link, links)) {
+		cur = (CURSOR *)link->internal;
+
+		switch (pass) {
+		case REPLACE_SETUP:		/* Setup. */
+			if ((cur->pgno == pgno && cur->indx == indx) ||
+			    (cur->dpgno == pgno && cur->dindx == indx))
+				F_SET(cur, C_REPLACE_SETUP);
+			break;
+		case REPLACE_SUCCESS:		/* Overwrite succeeded. */
+		case REPLACE_FAILED:		/* Overwrite failed. */
+			if (!F_ISSET(cur, C_REPLACE_SETUP))
+				break;
+
+			/* Put the cursor back on the item. */
+			if (cur->dpgno == pgno)
+				cur->dindx = indx;
+			if (cur->pgno == pgno)
+				cur->indx = indx;
+
+			if (pass == REPLACE_SUCCESS) {
+				F_SET(cur, C_REPLACE);
+				F_CLR(cur, C_DELETED | C_REPLACE_SETUP);
+			} else
+				F_CLR(cur, C_REPLACE_SETUP);
+			break;
+		}
+	}
+	DB_THREAD_UNLOCK(dbp);
+}
+
+/*
+ * __bam_ca_split --
+ *	Re-point cursors after page ppgno was split at split_indx into
+ *	lpgno (entries below split_indx) and rpgno (the rest).  Does
+ *	nothing for DB_RECNO databases.
+ *
+ * PUBLIC: void __bam_ca_split __P((DB *,
+ * PUBLIC:    db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int));
+ */
+void
+__bam_ca_split(dbp, ppgno, lpgno, rpgno, split_indx, cleft)
+	DB *dbp;
+	db_pgno_t ppgno, lpgno, rpgno;
+	u_int32_t split_indx;
+	int cleft;
+{
+	CURSOR *cur;
+	DBC *link;
+
+	/* Recno is responsible for its own adjustments. */
+	if (dbp->type == DB_RECNO)
+		return;
+
+	/*
+	 * Adjust the cursors.  See the comment in __bam_ca_delete().
+	 *
+	 * A cursor on the split page has to keep pointing at the same
+	 * record.  Cursors in the left half usually stay put, because the
+	 * left page is going to be copied back over the original page; they
+	 * are moved to lpgno only when cleft is set.  Cursors in the right
+	 * half move to rpgno and have their index reduced by the number of
+	 * entries that went left.
+	 */
+	DB_THREAD_LOCK(dbp);
+	link = TAILQ_FIRST(&dbp->curs_queue);
+	while (link != NULL) {
+		cur = (CURSOR *)link->internal;
+
+		/* Leaf-page position. */
+		if (cur->pgno == ppgno) {
+			if (cur->indx >= split_indx) {
+				cur->pgno = rpgno;
+				cur->indx -= split_indx;
+			} else if (cleft) {
+				cur->pgno = lpgno;
+			}
+		}
+
+		/* Duplicate-page position. */
+		if (cur->dpgno == ppgno) {
+			if (cur->dindx >= split_indx) {
+				cur->dpgno = rpgno;
+				cur->dindx -= split_indx;
+			} else if (cleft) {
+				cur->dpgno = lpgno;
+			}
+		}
+
+		link = TAILQ_NEXT(link, links);
+	}
+	DB_THREAD_UNLOCK(dbp);
+}
+
+/*
+ * __bam_c_physdel --
+ *	Actually do the cursor deletion.
+ *
+ *	dbp: database handle.
+ *	cp:  cursor whose current item (the duplicate at dpgno/dindx if the
+ *	     cursor is on a duplicates page, else the leaf item at
+ *	     pgno/indx) is to be removed.
+ *	h:   a page the caller already holds, or NULL.  If it is not the
+ *	     page holding the item, that page is write-locked and fetched
+ *	     here and released again before returning.
+ *
+ *	Returns 0 on success, including the case where another cursor still
+ *	refers to the item and the delete is left to that cursor; otherwise
+ *	the error from the helper that failed.
+ */
+static int
+__bam_c_physdel(dbp, cp, h)
+	DB *dbp;
+	CURSOR *cp;
+	PAGE *h;
+{
+	BOVERFLOW bo;
+	BTREE *t;
+	DBT dbt;
+	DB_LOCK lock;
+	db_indx_t indx;
+	db_pgno_t pgno, next_pgno, prev_pgno;
+	int local, ret;
+
+	t = dbp->internal;
+	ret = 0;
+
+	/* Figure out what we're deleting. */
+	if (cp->dpgno == PGNO_INVALID) {
+		pgno = cp->pgno;
+		indx = cp->indx;
+	} else {
+		pgno = cp->dpgno;
+		indx = cp->dindx;
+	}
+
+	/*
+	 * If the item is referenced by another cursor, leave it up to that
+	 * cursor to do the delete.
+	 */
+	if (__bam_ca_delete(dbp, pgno, indx, cp) != 0)
+		return (0);
+
+	/*
+	 * If we don't already have the page locked, get it and delete the
+	 * items.
+	 */
+	if ((h == NULL || h->pgno != pgno)) {
+		if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &lock)) != 0)
+			return (ret);
+		if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) {
+			/* Don't leak the lock we just acquired. */
+			(void)__BT_TLPUT(dbp, lock);
+			return (ret);
+		}
+		local = 1;
+	} else
+		local = 0;
+
+	/*
+	 * If we're deleting a duplicate entry, call the common code to do
+	 * the work.
+	 */
+	if (TYPE(h) == P_DUPLICATE) {
+		/*
+		 * Save the page's identity and links first: __db_drem() may
+		 * replace h or set it to NULL.
+		 */
+		pgno = PGNO(h);
+		prev_pgno = PREV_PGNO(h);
+		next_pgno = NEXT_PGNO(h);
+		if ((ret = __db_drem(dbp, &h, indx, __bam_free)) != 0)
+			goto err;
+
+		/*
+		 * There are 4 cases:
+		 *
+		 * 1. We removed an item on a page, but there are more items
+		 *    on the page.
+		 * 2. We removed the last item on a page, removing the last
+		 *    duplicate.
+		 * 3. We removed the last item on a page, but there is a
+		 *    following page of duplicates.
+		 * 4. We removed the last item on a page, but there is a
+		 *    previous page of duplicates.
+		 *
+		 * In case 1, h != NULL, h->pgno == pgno
+		 * In case 2, h == NULL,
+		 *    prev_pgno == PGNO_INVALID, next_pgno == PGNO_INVALID
+		 * In case 3, h != NULL, next_pgno != PGNO_INVALID
+		 * In case 4, h == NULL, prev_pgno != PGNO_INVALID
+		 *
+		 * In case 1, there's nothing else to do.
+		 * In case 2, remove the entry from the parent page.
+		 * In case 3 or 4, if the deleted page was the first in a chain
+		 * of duplicate pages, update the parent page's entry.
+		 *
+		 * Test:
+		 * If there were previous pages of duplicates or we didn't
+		 * empty the current page of duplicates, we don't need to
+		 * touch the parent page.
+		 *
+		 * Use the saved prev_pgno rather than PREV_PGNO(h): h is NULL
+		 * in cases 2 and 4 and must not be dereferenced.
+		 */
+		if (prev_pgno != PGNO_INVALID ||
+		    (h != NULL && pgno == h->pgno))
+			goto done;
+
+		/*
+		 * Release any page we're holding and the lock on the deleted
+		 * page.
+		 */
+		if (local) {
+			if (h != NULL)
+				(void)memp_fput(dbp->mpf, h, 0);
+			(void)__BT_TLPUT(dbp, lock);
+			local = 0;
+		}
+
+		/* Acquire the parent page. */
+		if ((ret =
+		    __bam_lget(dbp, 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0)
+			goto err;
+		if ((ret = __bam_pget(dbp, &h, &cp->pgno, 0)) != 0) {
+			(void)__BT_TLPUT(dbp, lock);
+			goto err;
+		}
+		local = 1;
+
+		/*
+		 * If we deleted the last duplicate, we can fall out and do a
+		 * normal btree delete in the context of the parent page.  If
+		 * not, we have to update the parent's page.
+		 */
+		indx = cp->indx;
+		if (next_pgno != PGNO_INVALID) {
+			/*
+			 * Copy, delete, update and re-insert the parent page's
+			 * entry.
+			 */
+			bo = *GET_BOVERFLOW(h, indx);
+			(void)__db_ditem(dbp, h, indx, BOVERFLOW_SIZE);
+			bo.pgno = next_pgno;
+			memset(&dbt, 0, sizeof(dbt));
+			dbt.data = &bo;
+			dbt.size = BOVERFLOW_SIZE;
+			(void)__db_pitem(dbp,
+			    h, indx, BOVERFLOW_SIZE, &dbt, NULL);
+
+			/* Discard the parent page. */
+			(void)memp_fput(dbp->mpf, h, 0);
+			(void)__BT_TLPUT(dbp, lock);
+			local = 0;
+
+			goto done;
+		}
+	}
+
+	/*
+	 * Otherwise, do a normal btree delete.  The call is made twice with
+	 * the same index: once for the key and once for the data item that
+	 * slides into its slot.
+	 */
+	if ((ret = __bam_ditem(dbp, h, indx)) != 0)
+		goto err;
+	if ((ret = __bam_ditem(dbp, h, indx)) != 0)
+		goto err;
+
+	/*
+	 * If the page is empty, delete it.  To delete a leaf page we need a
+	 * copy of a key from the page.  We use the first one that was there,
+	 * since it's the last key that the page held.  We malloc the page
+	 * information instead of using the return key/data memory because
+	 * we've already set them -- the reason that we've already set them
+	 * is because we're (potentially) about to do a reverse split, which
+	 * would make our saved page information useless.
+	 *
+	 * XXX
+	 * The following operations to delete a page might deadlock.  I think
+	 * that's OK.  The problem is if we're deleting an item because we're
+	 * closing cursors because we've already deadlocked and want to call
+	 * txn_abort().  If we fail due to deadlock, we'll leave a locked
+	 * empty page in the tree, which won't be empty long because we're
+	 * going to undo the delete.
+	 */
+	if (NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT) {
+		memset(&dbt, 0, sizeof(DBT));
+		dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL;
+		if ((ret = __db_ret(dbp, h, 0, &dbt, NULL, NULL)) != 0)
+			goto err;
+
+		if (local) {
+			(void)memp_fput(dbp->mpf, h, 0);
+			(void)__BT_TLPUT(dbp, lock);
+			local = 0;
+		}
+
+		ret = __bam_dpage(dbp, &dbt);
+		free(dbt.data);
+	}
+
+err:
+done:	/*
+	 * Release whatever we acquired ourselves.  h may be NULL here if
+	 * __db_drem() removed the page we had fetched.
+	 */
+	if (local) {
+		if (h != NULL)
+			(void)memp_fput(dbp->mpf, h, 0);
+		(void)__BT_TLPUT(dbp, lock);
+	}
+
+	if (ret == 0)
+		++t->lstat.bt_deleted;
+	return (ret);
+}
diff --git a/db2/btree/bt_delete.c b/db2/btree/bt_delete.c
new file mode 100644
index 0000000000..e7ec4dfe3e
--- /dev/null
+++ b/db2/btree/bt_delete.c
@@ -0,0 +1,607 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)bt_delete.c 10.18 (Sleepycat) 8/24/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <stdio.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+
+static int __bam_dpages __P((DB *, BTREE *));
+
+/*
+ * __bam_delete --
+ *	Delete the key/data pairs stored under a key, on-page duplicates
+ *	included.  Pairs that another cursor still refers to are only
+ *	marked deleted; the rest are removed from the page.  If the leaf
+ *	page ends up empty (and is not the root) it is deleted too.
+ *
+ *	Returns 0 on success or the error from the helper that failed.
+ *
+ * PUBLIC: int __bam_delete __P((DB *, DB_TXN *, DBT *, int));
+ */
+int
+__bam_delete(argdbp, txn, key, flags)
+	DB *argdbp;
+	DB_TXN *txn;
+	DBT *key;
+	int flags;
+{
+	BTREE *t;
+	DB *dbp;
+	PAGE *h;
+	db_indx_t indx, npairs, probe;
+	int exact, page_empty, ret, stack;
+
+	DEBUG_LWRITE(argdbp, txn, "bam_delete", key, NULL, flags);
+
+	stack = 0;
+
+	/* Check for invalid flags. */
+	ret = __db_delchk(argdbp, flags, F_ISSET(argdbp, DB_AM_RDONLY));
+	if (ret != 0)
+		return (ret);
+
+	GETHANDLE(argdbp, txn, &dbp, ret);
+	t = dbp->internal;
+
+	/* Search the tree for the key; delete only deletes exact matches. */
+	ret = __bam_search(dbp, key, S_DELETE, 1, NULL, &exact);
+	if (ret != 0)
+		goto err;
+	stack = 1;
+	h = t->bt_csp->page;
+	indx = t->bt_csp->indx;
+
+	/*
+	 * Count the pairs stored under this key: the first one, plus every
+	 * following pair whose key slot holds the same page offset.
+	 */
+	npairs = 1;
+	probe = indx;
+	for (;;) {
+		probe += P_INDX;
+		if (probe >= NUM_ENT(h) || h->inp[probe] != h->inp[indx])
+			break;
+		++npairs;
+	}
+
+	/* Delete the key/data pairs, including any duplicates. */
+	while (npairs > 0) {
+		if (__bam_ca_delete(dbp, h->pgno, indx, NULL) != 0) {
+			/* A cursor is on it: mark it and step over it. */
+			GET_BKEYDATA(h, indx + O_INDX)->deleted = 1;
+			indx += P_INDX;
+		} else {
+			/* Remove the key, then the data that replaced it. */
+			if ((ret = __bam_ditem(dbp, h, indx)) != 0)
+				goto err;
+			if ((ret = __bam_ditem(dbp, h, indx)) != 0)
+				goto err;
+		}
+		--npairs;
+		++t->lstat.bt_deleted;
+	}
+
+	/* If we're using record numbers, update internal page record counts. */
+	if (F_ISSET(dbp, DB_BT_RECNUM)) {
+		ret = __bam_adjust(dbp, t, -1);
+		if (ret != 0)
+			goto err;
+	}
+
+	/* Decide whether the page needs deleting before letting go of it. */
+	page_empty = NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT;
+
+	__bam_stkrel(dbp);
+	stack = 0;
+
+	if (page_empty)
+		ret = __bam_dpage(dbp, key);
+	else
+		ret = 0;
+
+err:	if (stack)
+		__bam_stkrel(dbp);
+	PUTHANDLE(dbp);
+	return (ret);
+}
+
+/*
+ * __ram_delete --
+ *	Delete the record identified by a record-number key.
+ *
+ *	Without DB_RE_RENUMBER the record is replaced by an empty entry
+ *	flagged as deleted.  With it, the record is removed, the record
+ *	counts and the cursors are adjusted, and a leaf page left empty
+ *	(other than the root) is deleted.
+ *
+ *	Returns 0 on success, DB_NOTFOUND if the search was not exact,
+ *	DB_KEYEMPTY if the record is already flagged deleted, or the error
+ *	from the helper that failed.
+ *
+ * PUBLIC: int __ram_delete __P((DB *, DB_TXN *, DBT *, int));
+ */
+int
+__ram_delete(argdbp, txn, key, flags)
+	DB *argdbp;
+	DB_TXN *txn;
+	DBT *key;
+	int flags;
+{
+	BKEYDATA bk;
+	BTREE *t;
+	DB *dbp;
+	DBT hdr, data;
+	PAGE *h;
+	db_indx_t indx;
+	db_recno_t recno;
+	int exact, ret, stack;
+
+	stack = 0;
+
+	/* Check for invalid flags. */
+	ret = __db_delchk(argdbp, flags, F_ISSET(argdbp, DB_AM_RDONLY));
+	if (ret != 0)
+		return (ret);
+
+	GETHANDLE(argdbp, txn, &dbp, ret);
+	t = dbp->internal;
+
+	/* Check the user's record number and fill in as necessary. */
+	ret = __ram_getno(argdbp, key, &recno, 0);
+	if (ret != 0)
+		goto out;
+
+	/* Search the tree for the key; delete only deletes exact matches. */
+	ret = __bam_rsearch(dbp, &recno, S_DELETE, 1, &exact);
+	if (ret != 0)
+		goto out;
+	if (!exact) {
+		ret = DB_NOTFOUND;
+		goto out;
+	}
+
+	/* From here on the search stack has to be released on the way out. */
+	stack = 1;
+	h = t->bt_csp->page;
+	indx = t->bt_csp->indx;
+
+	/* If the record has already been deleted, we couldn't have found it. */
+	if (GET_BKEYDATA(h, indx)->deleted) {
+		ret = DB_KEYEMPTY;
+		goto out;
+	}
+
+	/* Either way the existing item comes off the page first. */
+	ret = __bam_ditem(dbp, h, indx);
+	if (ret != 0)
+		goto out;
+
+	if (F_ISSET(dbp, DB_RE_RENUMBER)) {
+		++t->lstat.bt_deleted;
+		if (t->bt_recno != NULL)
+			F_SET(t->bt_recno, RECNO_MODIFIED);
+
+		/* Adjust the counts. */
+		__bam_adjust(dbp, t, -1);
+
+		/* Adjust the cursors. */
+		__ram_ca(dbp, recno, CA_DELETE);
+
+		/*
+		 * If the page is now empty, delete it -- we have the whole
+		 * tree locked, so there are no preparations to make.  Else,
+		 * release the pages.
+		 */
+		if (NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT) {
+			stack = 0;
+			ret = __bam_dpages(dbp, t);
+		}
+	} else {
+		/*
+		 * We're not renumbering records: put an empty, deleted
+		 * marker in the slot the record occupied.
+		 */
+		bk.len = 0;
+		bk.type = B_KEYDATA;
+		bk.deleted = 1;
+
+		memset(&hdr, 0, sizeof(hdr));
+		hdr.size = SSZA(BKEYDATA, data);
+		hdr.data = &bk;
+
+		memset(&data, 0, sizeof(data));
+		data.size = 0;
+		data.data = (char *) "";
+
+		ret = __db_pitem(dbp,
+		    h, indx, BKEYDATA_SIZE(0), &hdr, &data);
+		if (ret != 0)
+			goto out;
+
+		++t->lstat.bt_deleted;
+	}
+
+out:	if (stack)
+		__bam_stkrel(dbp);
+
+	PUTHANDLE(dbp);
+	return (ret);
+}
+
+/*
+ * __bam_ditem --
+ * Delete one or more entries from a page.
+ *
+ * dbp: database handle.
+ * h: page to delete from; its type selects how the item is sized.
+ * indx: index of the entry to delete.
+ *
+ * For off-page items (B_DUPLICATE/B_OVERFLOW) the referenced chain is
+ * freed before the on-page entry is removed. Returns 0 on success, the
+ * result of __db_pgfmt() for an unrecognized page or item type, or the
+ * error from the helper that failed.
+ *
+ * PUBLIC: int __bam_ditem __P((DB *, PAGE *, u_int32_t));
+ */
+int
+__bam_ditem(dbp, h, indx)
+ DB *dbp;
+ PAGE *h;
+ u_int32_t indx;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ u_int32_t nbytes;
+ int ret;
+
+ /* Work out nbytes, the on-page size of the item being removed. */
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ bi = GET_BINTERNAL(h, indx);
+ switch (bi->type) {
+ case B_DUPLICATE:
+ case B_OVERFLOW:
+ nbytes = BINTERNAL_SIZE(bi->len);
+ /*
+ * Jumps into the leaf-page switch below to free the chain.
+ * NOTE(review): that code re-reads the entry through
+ * GET_BOVERFLOW even though this is an internal-page entry
+ * -- confirm the two layouts agree on type and pgno.
+ */
+ goto offpage;
+ case B_KEYDATA:
+ /*
+ * NOTE(review): sized with BKEYDATA_SIZE although the
+ * off-page case above uses BINTERNAL_SIZE for entries on
+ * the same page type -- confirm this is intended.
+ */
+ nbytes = BKEYDATA_SIZE(bi->len);
+ break;
+ default:
+ return (__db_pgfmt(dbp, h->pgno));
+ }
+ break;
+ case P_IRECNO:
+ nbytes = RINTERNAL_SIZE;
+ break;
+ case P_LBTREE:
+ /*
+ * If it's a duplicate key, discard the index and don't touch
+ * the actual page item. This works because no data item can
+ * have an index that matches any other index so even if the
+ * data item is in an index "slot", it won't match any other
+ * index.
+ */
+ /* Only even indices are tested against their neighbours. */
+ if (!(indx % 2)) {
+ if (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX])
+ return (__bam_adjindx(dbp,
+ h, indx, indx - P_INDX, 0));
+ if (indx < (u_int32_t)(NUM_ENT(h) - P_INDX) &&
+ h->inp[indx] == h->inp[indx + P_INDX])
+ return (__bam_adjindx(dbp,
+ h, indx, indx + O_INDX, 0));
+ }
+ /* FALLTHROUGH */
+ case P_LRECNO:
+ bk = GET_BKEYDATA(h, indx);
+ switch (bk->type) {
+ case B_DUPLICATE:
+ case B_OVERFLOW:
+ nbytes = BOVERFLOW_SIZE;
+
+offpage: /* Delete duplicate/offpage chains. */
+ /* Also reached by goto from the P_IBTREE case, with nbytes already set. */
+ bo = GET_BOVERFLOW(h, indx);
+ if (bo->type == B_DUPLICATE) {
+ if ((ret =
+ __db_ddup(dbp, bo->pgno, __bam_free)) != 0)
+ return (ret);
+ } else
+ if ((ret =
+ __db_doff(dbp, bo->pgno, __bam_free)) != 0)
+ return (ret);
+ break;
+ case B_KEYDATA:
+ nbytes = BKEYDATA_SIZE(bk->len);
+ break;
+ default:
+ return (__db_pgfmt(dbp, h->pgno));
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp, h->pgno));
+ }
+
+ /* Delete the item. */
+ if ((ret = __db_ditem(dbp, h, indx, nbytes)) != 0)
+ return (ret);
+
+ /* Mark the page dirty. */
+ return (memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY));
+}
+
+/*
+ * __bam_adjindx --
+ *	Adjust an index on the page.
+ *
+ *	dbp:       database handle.
+ *	h:         page whose index array (h->inp) is changed.
+ *	indx:      slot to insert at or to remove.
+ *	indx_copy: on insert, the slot whose value is copied into the new
+ *	           slot; on delete it is only passed to the log record.
+ *	is_insert: non-zero to insert a slot, zero to remove one.
+ *
+ *	Only the index array is touched, not the items on the page.
+ *	Returns 0 on success, or the error from logging the change or from
+ *	marking the page dirty.
+ *
+ * PUBLIC: int __bam_adjindx __P((DB *, PAGE *, u_int32_t, u_int32_t, int));
+ */
+int
+__bam_adjindx(dbp, h, indx, indx_copy, is_insert)
+	DB *dbp;
+	PAGE *h;
+	u_int32_t indx, indx_copy;
+	int is_insert;
+{
+	db_indx_t copy;
+	int ret;
+
+	/* Log the change. */
+	if (DB_LOGGING(dbp) &&
+	    (ret = __bam_adj_log(dbp->dbenv->lg_info, dbp->txn, &LSN(h),
+	    0, dbp->log_fileid, PGNO(h), &LSN(h), indx, indx_copy,
+	    (u_int32_t)is_insert)) != 0)
+		return (ret);
+
+	if (is_insert) {
+		/* Read the value first: the shift below may move its slot. */
+		copy = h->inp[indx_copy];
+		if (indx != NUM_ENT(h))
+			memmove(&h->inp[indx + O_INDX], &h->inp[indx],
+			    sizeof(db_indx_t) * (NUM_ENT(h) - indx));
+		h->inp[indx] = copy;
+		++NUM_ENT(h);
+	} else {
+		--NUM_ENT(h);
+		if (indx != NUM_ENT(h))
+			memmove(&h->inp[indx], &h->inp[indx + O_INDX],
+			    sizeof(db_indx_t) * (NUM_ENT(h) - indx));
+	}
+
+	/* Mark the page dirty. */
+	ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY);
+
+	/*
+	 * Adjust the cursors.  The in-memory index array has already been
+	 * changed, so this is done whether or not marking the page dirty
+	 * worked.
+	 */
+	__bam_ca_di(dbp, h->pgno, indx, is_insert ? 1 : -1);
+
+	/* Report a failure to mark the page dirty instead of dropping it. */
+	return (ret);
+}
+
+/*
+ * __bam_dpage --
+ * Delete a page from the tree.
+ *
+ * dbp: database handle.
+ * key: a key that leads the search to the (empty) leaf page to delete.
+ *
+ * Returns 0 if the delete was abandoned because a page on the path was
+ * found not to be deletable, the result of __bam_dpages() if the delete
+ * went ahead, or the error from a search, lock or page-get call.
+ *
+ * PUBLIC: int __bam_dpage __P((DB *, const DBT *));
+ */
+int
+__bam_dpage(dbp, key)
+ DB *dbp;
+ const DBT *key;
+{
+ BTREE *t;
+ DB_LOCK lock;
+ PAGE *h;
+ db_pgno_t pgno;
+ int exact, level, ret;
+
+ ret = 0;
+ t = dbp->internal;
+
+ /*
+ * The locking protocol is that we acquire locks by walking down the
+ * tree, to avoid the obvious deadlocks.
+ *
+ * Call __bam_search to reacquire the empty leaf page, but this time
+ * get both the leaf page and its parent, locked. Walk back up the
+ * tree, until we have the top pair of pages that we want to delete.
+ * Once we have the top page that we want to delete locked, lock the
+ * underlying pages and check to make sure they're still empty. If
+ * they are, delete them.
+ */
+ for (level = LEAFLEVEL;; ++level) {
+ /* Acquire a page and its parent, locked. */
+ if ((ret =
+ __bam_search(dbp, key, S_WRPAIR, level, NULL, &exact)) != 0)
+ return (ret);
+
+ /*
+ * If we reach the root or the page isn't going to be empty
+ * when we delete one record, quit.
+ */
+ /* bt_csp[-1] is the parent of the page found at this level. */
+ h = t->bt_csp[-1].page;
+ if (h->pgno == PGNO_ROOT || NUM_ENT(h) != 1)
+ break;
+
+ /* Release the two locked pages. */
+ (void)memp_fput(dbp->mpf, t->bt_csp[-1].page, 0);
+ (void)__BT_TLPUT(dbp, t->bt_csp[-1].lock);
+ (void)memp_fput(dbp->mpf, t->bt_csp[0].page, 0);
+ (void)__BT_TLPUT(dbp, t->bt_csp[0].lock);
+ }
+
+ /*
+ * Leave the stack pointer one after the last entry, we may be about
+ * to push more items on the stack.
+ */
+ ++t->bt_csp;
+
+ /*
+ * t->bt_csp[-2].page is the top page, which we're not going to delete,
+ * and t->bt_csp[-1].page is the first page we are going to delete.
+ *
+ * Walk down the chain, acquiring the rest of the pages until we've
+ * retrieved the leaf page. If we find any pages that aren't going
+ * to be emptied by the delete, someone else added something while we
+ * were walking the tree, and we discontinue the delete.
+ */
+ for (h = t->bt_csp[-1].page;;) {
+ /* ret is still 0 here, so the "goto release" exits below return 0. */
+ if (ISLEAF(h)) {
+ if (NUM_ENT(h) != 0)
+ goto release;
+ break;
+ } else
+ if (NUM_ENT(h) != 1)
+ goto release;
+
+ /*
+ * Get the next page, write lock it and push it onto the stack.
+ * We know it's index 0, because it can only have one element.
+ */
+ pgno = TYPE(h) == P_IBTREE ?
+ GET_BINTERNAL(h, 0)->pgno : GET_RINTERNAL(h, 0)->pgno;
+
+ if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &lock)) != 0)
+ goto release;
+ /*
+ * NOTE(review): if this fetch fails, the lock just acquired has
+ * not been pushed on the stack, so the release path presumably
+ * does not drop it -- confirm.
+ */
+ if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0)
+ goto release;
+ BT_STK_PUSH(t, h, 0, lock, ret);
+ if (ret != 0)
+ goto release;
+ }
+
+ /* Undo the earlier increment so bt_csp is back on the last entry. */
+ BT_STK_POP(t);
+ return (__bam_dpages(dbp, t));
+
+release:
+ /* Discard any locked pages and return. */
+ BT_STK_POP(t);
+ __bam_stkrel(dbp);
+ return (ret);
+}
+
+/*
+ * __bam_dpages --
+ *	Delete a set of locked pages.
+ *
+ *	The btree stack (t->bt_sp through t->bt_csp) holds the pages: the
+ *	first entry is the parent that survives and loses one item, the
+ *	remaining entries are the pages to be freed, ending with the leaf.
+ *
+ *	Returns 0 once the subtree has been unlinked; before that point,
+ *	any failure releases every page and lock on the stack and returns
+ *	the error.
+ */
+static int
+__bam_dpages(dbp, t)
+	DB *dbp;
+	BTREE *t;
+{
+	DBT a, b;
+	DB_LOCK lock;
+	EPG *epg;
+	PAGE *h;
+	db_pgno_t pgno;
+	db_recno_t rcnt;
+	int ret;
+
+	rcnt = 0;				/* XXX: Shut the compiler up. */
+	epg = t->bt_sp;
+
+	/*
+	 * !!!
+	 * There is an interesting deadlock situation here.  We have to relink
+	 * the leaf page chain around the leaf page being deleted.  Consider
+	 * a cursor walking through the leaf pages, that has the previous page
+	 * read-locked and is waiting on a lock for the page we're deleting.
+	 * It will deadlock here.  This is a problem, because if our process is
+	 * selected to resolve the deadlock, we'll leave an empty leaf page
+	 * that we can never again access by walking down the tree.  So, before
+	 * we unlink the subtree, we relink the leaf page chain.
+	 */
+	if ((ret = __db_relink(dbp, t->bt_csp->page, NULL, 1)) != 0)
+		goto release;
+
+	/*
+	 * We have the entire stack of deletable pages locked.  Start from the
+	 * top of the tree and move to the bottom, as it's better to release
+	 * the inner pages as soon as possible.
+	 */
+	if ((ret = __bam_ditem(dbp, epg->page, epg->indx)) != 0)
+		goto release;
+
+	/*
+	 * If we deleted the next-to-last item from the root page, the tree
+	 * has collapsed a level.  Write lock the remaining root + 1 page and
+	 * copy it onto the root page.
+	 */
+	h = epg->page;
+	if (h->pgno == PGNO_ROOT && NUM_ENT(h) == 1) {
+		pgno = TYPE(epg->page) == P_IBTREE ?
+		    GET_BINTERNAL(epg->page, 0)->pgno :
+		    GET_RINTERNAL(epg->page, 0)->pgno;
+		if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &lock)) != 0)
+			goto release;
+		if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) {
+			/*
+			 * This lock is not on the stack, so the release loop
+			 * won't drop it for us.
+			 */
+			(void)__BT_TLPUT(dbp, lock);
+			goto release;
+		}
+
+		/* Log the change. */
+		if (DB_LOGGING(dbp)) {
+			memset(&a, 0, sizeof(a));
+			a.data = h;
+			a.size = dbp->pgsize;
+			memset(&b, 0, sizeof(b));
+			b.data = P_ENTRY(epg->page, 0);
+			b.size = BINTERNAL_SIZE(((BINTERNAL *)b.data)->len);
+			__bam_rsplit_log(dbp->dbenv->lg_info, dbp->txn,
+			    &h->lsn, 0, dbp->log_fileid, h->pgno, &a, &b,
+			    &epg->page->lsn);
+		}
+
+		/*
+		 * Make the switch.
+		 *
+		 * One fixup -- if the tree has record numbers and we're not
+		 * converting to a leaf page, we have to preserve the total
+		 * record count.
+		 */
+		if (TYPE(h) == P_IRECNO ||
+		    (TYPE(h) == P_IBTREE && F_ISSET(dbp, DB_BT_RECNUM)))
+			rcnt = RE_NREC(epg->page);
+		memcpy(epg->page, h, dbp->pgsize);
+		epg->page->pgno = PGNO_ROOT;
+		if (TYPE(h) == P_IRECNO ||
+		    (TYPE(h) == P_IBTREE && F_ISSET(dbp, DB_BT_RECNUM)))
+			RE_NREC_SET(epg->page, rcnt);
+
+		/*
+		 * Adjust the cursors.  This has to happen before the page is
+		 * freed: it reads h->pgno, and h should not be looked at
+		 * again once it has been handed to __bam_free().
+		 */
+		__bam_ca_move(dbp, t, h->pgno, PGNO_ROOT);
+
+		/* Free the last page in that level of the btree. */
+		++t->lstat.bt_freed;
+		(void)__bam_free(dbp, h);
+
+		(void)__BT_TLPUT(dbp, lock);
+	}
+
+	/* Release the top page in the subtree. */
+	(void)memp_fput(dbp->mpf, epg->page, 0);
+	(void)__BT_TLPUT(dbp, epg->lock);
+
+	/*
+	 * Free the rest of the pages.
+	 *
+	 * XXX
+	 * Don't bother checking for errors.  We've unlinked the subtree from
+	 * the tree, and there's no possibility of recovery.
+	 */
+	for (; ++epg <= t->bt_csp; ++t->lstat.bt_freed) {
+		if (NUM_ENT(epg->page) != 0)
+			(void)__bam_ditem(dbp, epg->page, epg->indx);
+
+		(void)__bam_free(dbp, epg->page);
+		(void)__BT_TLPUT(dbp, epg->lock);
+	}
+	return (0);
+
+release:
+	/* Discard any remaining pages and return. */
+	for (; epg <= t->bt_csp; ++epg) {
+		(void)memp_fput(dbp->mpf, epg->page, 0);
+		(void)__BT_TLPUT(dbp, epg->lock);
+	}
+	return (ret);
+}
diff --git a/db2/btree/bt_open.c b/db2/btree/bt_open.c
new file mode 100644
index 0000000000..354888c6c2
--- /dev/null
+++ b/db2/btree/bt_open.c
@@ -0,0 +1,355 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)bt_open.c 10.20 (Sleepycat) 8/19/97";
+#endif /* not lint */
+
+/*
+ * Implementation of btree access method for 4.4BSD.
+ *
+ * The design here was originally based on that of the btree access method
+ * used in the Postgres database system at UC Berkeley. This implementation
+ * is wholly independent of the Postgres code.
+ */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+#include "common_ext.h"
+
+static int __bam_keyalloc __P((BTREE *));
+static int __bam_setmeta __P((DB *, BTREE *));
+
+/*
+ * __bam_open --
+ *	Open a btree.
+ *
+ *	Allocates the BTREE structure for the handle, validates and records
+ *	the user's DB_INFO selections (or the defaults when dbinfo is NULL),
+ *	fills in the DB method table and then calls __bam_setmeta() to check
+ *	(and, for a new tree, create) the metadata and root pages.
+ *
+ *	Returns 0 on success, ENOMEM if the BTREE cannot be allocated, EINVAL
+ *	for an out-of-range bt_minkey/bt_maxkey, or the error returned by
+ *	__bam_keyalloc() or __bam_setmeta().  On failure the BTREE is freed
+ *	and dbp->internal is not left pointing at the freed memory.
+ *
+ * PUBLIC: int __bam_open __P((DB *, DBTYPE, DB_INFO *));
+ */
+int
+__bam_open(dbp, type, dbinfo)
+	DB *dbp;
+	DBTYPE type;
+	DB_INFO *dbinfo;
+{
+	BTREE *t;
+	int ret;
+
+	/* Allocate the btree internal structure. */
+	if ((t = (BTREE *)calloc(1, sizeof(BTREE))) == NULL)
+		return (ENOMEM);
+
+	t->bt_sp = t->bt_csp = t->bt_stack;
+	t->bt_esp = t->bt_stack + sizeof(t->bt_stack) / sizeof(t->bt_stack[0]);
+
+	/* Record-number trees need a preallocated key return buffer. */
+	if ((type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) &&
+	    (ret = __bam_keyalloc(t)) != 0)
+		goto err;
+
+	/*
+	 * Intention is to make sure all of the user's selections are okay
+	 * here and then use them without checking.
+	 */
+	if (dbinfo != NULL) {
+		/* Minimum number of keys per page. */
+		if (dbinfo->bt_minkey == 0)
+			t->bt_minkey = DEFMINKEYPAGE;
+		else {
+			if (dbinfo->bt_minkey < 2)
+				goto einval;
+			t->bt_minkey = dbinfo->bt_minkey;
+		}
+
+		/* Maximum number of keys per page. */
+		if (dbinfo->bt_maxkey == 0)
+			t->bt_maxkey = 0;
+		else {
+			if (dbinfo->bt_maxkey < 1)
+				goto einval;
+			t->bt_maxkey = dbinfo->bt_maxkey;
+		}
+
+		/*
+		 * If no comparison, use default comparison.  If no comparison
+		 * and no prefix, use default prefix.  (We can't default the
+		 * prefix if the user supplies a comparison routine; shortening
+		 * the keys may break their comparison algorithm.)
+		 */
+		t->bt_compare = dbinfo->bt_compare == NULL ?
+		    __bam_defcmp : dbinfo->bt_compare;
+		t->bt_prefix = dbinfo->bt_prefix == NULL ?
+		    (dbinfo->bt_compare == NULL ?
+		    __bam_defpfx : NULL) : dbinfo->bt_prefix;
+	} else {
+		/* bt_maxkey stays 0 from the calloc() above. */
+		t->bt_minkey = DEFMINKEYPAGE;
+		t->bt_compare = __bam_defcmp;
+		t->bt_prefix = __bam_defpfx;
+	}
+
+	/* Initialize the remaining fields of the DB. */
+	dbp->type = type;
+	dbp->internal = t;
+	dbp->cursor = __bam_cursor;
+	dbp->del = __bam_delete;
+	dbp->get = __bam_get;
+	dbp->put = __bam_put;
+	dbp->stat = __bam_stat;
+	dbp->sync = __bam_sync;
+
+	/*
+	 * The btree data structure requires that at least two key/data pairs
+	 * can fit on a page, but other than that there's no fixed requirement.
+	 * Translate the minimum number of items into the bytes a key/data pair
+	 * can use before being placed on an overflow page.  We calculate for
+	 * the worst possible alignment by assuming every item requires the
+	 * maximum alignment for padding.
+	 *
+	 * Recno uses the btree bt_ovflsize value -- it's close enough.
+	 */
+	t->bt_ovflsize = (dbp->pgsize - P_OVERHEAD) / (t->bt_minkey * P_INDX)
+	    - (BKEYDATA_PSIZE(0) + ALIGN(1, 4));
+
+	/* Create a root page if new tree. */
+	if ((ret = __bam_setmeta(dbp, t)) != 0)
+		goto err;
+
+	return (0);
+
+einval:	ret = EINVAL;
+
+err:	if (t != NULL) {
+		/*
+		 * We may already have attached the structure to the DB
+		 * handle; don't leave a pointer to freed memory behind.
+		 */
+		if (dbp->internal == t)
+			dbp->internal = NULL;
+
+		/* If we allocated room for key/data return, discard it. */
+		if (t->bt_rkey.data != NULL)
+			free(t->bt_rkey.data);
+
+		FREE(t, sizeof(BTREE));
+	}
+	return (ret);
+}
+
+/*
+ * __bam_bdup --
+ *	Create a BTREE handle for a threaded DB handle.
+ *
+ *	Allocates a zeroed BTREE for "new", points its stack pointers at its
+ *	own stack array, and copies the key limits, comparison/prefix
+ *	functions, overflow size and recno pointer from the BTREE attached
+ *	to "orig".  Returns 0, ENOMEM, or the error from __bam_keyalloc().
+ *
+ * PUBLIC: int __bam_bdup __P((DB *, DB *));
+ */
+int
+__bam_bdup(orig, new)
+	DB *orig, *new;
+{
+	BTREE *nt, *src;
+	int ret;
+
+	src = orig->internal;
+
+	nt = (BTREE *)calloc(1, sizeof(*nt));
+	if (nt == NULL)
+		return (ENOMEM);
+
+	/*
+	 * !!!
+	 * Ignore the cursor queue, only the first DB has attached cursors.
+	 */
+
+	/* The stack pointers start at the base of this handle's own array. */
+	nt->bt_esp =
+	    nt->bt_stack + sizeof(nt->bt_stack) / sizeof(nt->bt_stack[0]);
+	nt->bt_csp = nt->bt_stack;
+	nt->bt_sp = nt->bt_stack;
+
+	/* Trees that return record numbers need a key return buffer. */
+	if (orig->type == DB_RECNO || F_ISSET(orig, DB_BT_RECNUM)) {
+		ret = __bam_keyalloc(nt);
+		if (ret != 0) {
+			FREE(nt, sizeof(*nt));
+			return (ret);
+		}
+	}
+
+	/* Inherit the tunables that were fixed when the original opened. */
+	nt->bt_ovflsize = src->bt_ovflsize;
+	nt->bt_prefix = src->bt_prefix;
+	nt->bt_compare = src->bt_compare;
+	nt->bt_minkey = src->bt_minkey;
+	nt->bt_maxkey = src->bt_maxkey;
+
+	/*
+	 * !!!
+	 * The entire RECNO structure is shared.  If it breaks, the application
+	 * was misusing it to start with.
+	 */
+	nt->bt_recno = src->bt_recno;
+
+	new->internal = nt;
+
+	return (0);
+}
+
+/*
+ * __bam_keyalloc --
+ *	Allocate return memory for recno keys.
+ *
+ *	Stores a malloc'd buffer of sizeof(db_recno_t) bytes in
+ *	t->bt_rkey.data and records its length in t->bt_rkey.ulen.
+ *	Returns 0 on success or ENOMEM if the allocation fails.
+ */
+static int
+__bam_keyalloc(t)
+	BTREE *t;
+{
+	void *buf;
+
+	/*
+	 * Every recno key has the same size, so a single up-front allocation
+	 * saves checking for space each time a key is returned.
+	 */
+	buf = (void *)malloc(sizeof(db_recno_t));
+	t->bt_rkey.data = buf;
+	if (buf == NULL)
+		return (ENOMEM);
+
+	t->bt_rkey.ulen = sizeof(db_recno_t);
+	return (0);
+}
+
+/*
+ * __bam_setmeta --
+ *	Check (and optionally create) a tree.
+ *
+ *	Write-locks and fetches the metadata page.  If its magic number is
+ *	already set, the tree exists: bt_maxkey/bt_minkey in "t" are
+ *	overwritten from the page and 0 is returned.  Otherwise the metadata
+ *	and root pages are initialized, marked dirty and flushed with
+ *	memp_fsync().
+ *
+ *	Returns 0 on success or the error from the lock, page or sync call;
+ *	DB_INCOMPLETE from memp_fsync() is converted to EINVAL.  Every page
+ *	and lock acquired here is released on all return paths.
+ */
+static int
+__bam_setmeta(dbp, t)
+	DB *dbp;
+	BTREE *t;
+{
+	BTMETA *meta;
+	PAGE *root;
+	DB_LOCK mlock, rlock;
+	db_pgno_t pgno;
+	int ret, t_ret;
+
+	/* Get, and optionally create the metadata page. */
+	pgno = PGNO_METADATA;
+	if ((ret =
+	    __bam_lget(dbp, 0, PGNO_METADATA, DB_LOCK_WRITE, &mlock)) != 0)
+		return (ret);
+	if ((ret =
+	    __bam_pget(dbp, (PAGE **)&meta, &pgno, DB_MPOOL_CREATE)) != 0) {
+		(void)__BT_LPUT(dbp, mlock);
+		return (ret);
+	}
+
+	/*
+	 * If the magic number is correct, we're not creating the tree.
+	 * Correct any fields that may not be right.  Note, all of the
+	 * local flags were set by db_open(3).
+	 */
+	if (meta->magic != 0) {
+		t->bt_maxkey = meta->maxkey;
+		t->bt_minkey = meta->minkey;
+
+		(void)memp_fput(dbp->mpf, (PAGE *)meta, 0);
+		(void)__BT_LPUT(dbp, mlock);
+		return (0);
+	}
+
+	/*
+	 * Get the root page before touching the metadata page, so that a
+	 * failure here leaves the metadata page unmodified.  On failure,
+	 * release everything we hold.
+	 */
+	pgno = PGNO_ROOT;
+	if ((ret = __bam_lget(dbp, 0, PGNO_ROOT, DB_LOCK_WRITE, &rlock)) != 0) {
+		(void)memp_fput(dbp->mpf, (PAGE *)meta, 0);
+		(void)__BT_LPUT(dbp, mlock);
+		return (ret);
+	}
+	if ((ret = __bam_pget(dbp, &root, &pgno, DB_MPOOL_CREATE)) != 0) {
+		(void)__BT_LPUT(dbp, rlock);
+		(void)memp_fput(dbp->mpf, (PAGE *)meta, 0);
+		(void)__BT_LPUT(dbp, mlock);
+		return (ret);
+	}
+
+	/* Initialize the tree structure metadata information. */
+	ZERO_LSN(meta->lsn);
+	meta->pgno = PGNO_METADATA;
+	meta->magic = DB_BTREEMAGIC;
+	meta->version = DB_BTREEVERSION;
+	meta->pagesize = dbp->pgsize;
+	meta->maxkey = t->bt_maxkey;
+	meta->minkey = t->bt_minkey;
+	meta->free = PGNO_INVALID;
+	meta->flags = 0;
+	if (dbp->type == DB_RECNO)
+		F_SET(meta, BTM_RECNO);
+	if (F_ISSET(dbp, DB_AM_DUP))
+		F_SET(meta, BTM_DUP);
+	if (F_ISSET(dbp, DB_RE_FIXEDLEN))
+		F_SET(meta, BTM_FIXEDLEN);
+	if (F_ISSET(dbp, DB_BT_RECNUM))
+		F_SET(meta, BTM_RECNUM);
+	if (F_ISSET(dbp, DB_RE_RENUMBER))
+		F_SET(meta, BTM_RENUMBER);
+	meta->re_len = 0;
+	meta->re_pad = 0;
+	memcpy(meta->uid, dbp->lock.fileid, DB_FILE_ID_LEN);
+
+	/* Initialize the root page. */
+	P_INIT(root, dbp->pgsize, PGNO_ROOT, PGNO_INVALID,
+	    PGNO_INVALID, 1, dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE);
+	ZERO_LSN(root->lsn);
+
+	/*
+	 * Release the metadata and root pages.  Both are released even if
+	 * the first release fails; the first error is the one reported.
+	 */
+	ret = memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY);
+	if ((t_ret =
+	    memp_fput(dbp->mpf, root, DB_MPOOL_DIRTY)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/*
+	 * Flush the metadata and root pages to disk -- since the user can't
+	 * transaction protect open, the pages have to exist during recovery.
+	 *
+	 * XXX
+	 * It's not useful to return not-yet-flushed here -- convert it to
+	 * an error.
+	 */
+	if (ret == 0 && (ret = memp_fsync(dbp->mpf)) == DB_INCOMPLETE)
+		ret = EINVAL;
+
+	/* Release the locks. */
+	(void)__BT_LPUT(dbp, mlock);
+	(void)__BT_LPUT(dbp, rlock);
+
+	return (ret);
+}
diff --git a/db2/btree/bt_page.c b/db2/btree/bt_page.c
new file mode 100644
index 0000000000..7ee74ffcf8
--- /dev/null
+++ b/db2/btree/bt_page.c
@@ -0,0 +1,312 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)bt_page.c 10.5 (Sleepycat) 8/18/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+
+/*
+ * __bam_new --
+ *	Get a new page, preferably from the freelist.
+ *
+ *	Write-locks and fetches the metadata page.  If the free list is empty
+ *	(meta->free == PGNO_INVALID) a page is obtained with DB_MPOOL_NEW,
+ *	otherwise the page at the head of the free list is fetched.  The
+ *	allocation is logged when logging is enabled, the page is initialized
+ *	with P_INIT() to the requested type, and it is returned in *pagepp.
+ *
+ *	Returns 0 on success.  On error the page (if any), the metadata page
+ *	and the metadata lock are released, the free list head is left
+ *	unchanged, and the error is returned.
+ *
+ * PUBLIC: int __bam_new __P((DB *, u_int32_t, PAGE **));
+ */
+int
+__bam_new(dbp, type, pagepp)
+	DB *dbp;
+	u_int32_t type;
+	PAGE **pagepp;
+{
+	BTMETA *meta;
+	DB_LOCK mlock;
+	PAGE *h;
+	db_pgno_t newfree, pgno;
+	int ret;
+
+	meta = NULL;
+	h = NULL;
+	mlock = LOCK_INVALID;
+
+	pgno = PGNO_METADATA;
+	if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &mlock)) != 0)
+		goto err;
+	if ((ret = __bam_pget(dbp, (PAGE **)&meta, &pgno, 0)) != 0)
+		goto err;
+
+	if (meta->free == PGNO_INVALID) {
+		/* Nothing on the free list, extend the file. */
+		if ((ret = __bam_pget(dbp, &h, &pgno, DB_MPOOL_NEW)) != 0)
+			goto err;
+		ZERO_LSN(h->lsn);
+		h->pgno = pgno;
+		newfree = PGNO_INVALID;
+	} else {
+		/* Take the page at the head of the free list. */
+		pgno = meta->free;
+		if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0)
+			goto err;
+		newfree = h->next_pgno;
+	}
+
+	/* Log the change. */
+	if (DB_LOGGING(dbp)) {
+		if ((ret = __bam_pg_alloc_log(dbp->dbenv->lg_info, dbp->txn,
+		    &meta->lsn, 0, dbp->log_fileid, &meta->lsn, &h->lsn,
+		    h->pgno, (u_int32_t)type, newfree)) != 0)
+			goto err;
+		LSN(h) = LSN(meta);
+	}
+
+	/*
+	 * Only unlink the page from the free list once the log record has
+	 * been written, so that a logging failure doesn't leave a modified
+	 * metadata page (and a lost page) behind.
+	 */
+	meta->free = newfree;
+
+	(void)memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY);
+	(void)__BT_TLPUT(dbp, mlock);
+
+	P_INIT(h, dbp->pgsize, h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type);
+	*pagepp = h;
+	return (0);
+
+err:	if (h != NULL)
+		(void)memp_fput(dbp->mpf, h, 0);
+	if (meta != NULL)
+		(void)memp_fput(dbp->mpf, (PAGE *)meta, 0);
+	if (mlock != LOCK_INVALID)
+		(void)__BT_TLPUT(dbp, mlock);
+	return (ret);
+}
+
+/*
+ * __bam_free --
+ *	Add a page to the head of the freelist.
+ *
+ *	Write-locks and fetches the metadata page, logs the free when logging
+ *	is enabled, re-initializes the page as P_INVALID linked to the old
+ *	free list head, and makes it the new head.
+ *
+ *	The caller's reference to the page "h" is always released by this
+ *	routine, on success and on every error path.  The caller's lock on
+ *	the page is NOT released here.
+ *
+ *	Returns 0 on success, otherwise the first error encountered.
+ *
+ * PUBLIC: int __bam_free __P((DB *, PAGE *));
+ */
+int
+__bam_free(dbp, h)
+	DB *dbp;
+	PAGE *h;
+{
+	BTMETA *meta;
+	DBT ldbt;
+	DB_LOCK mlock;
+	db_pgno_t pgno;
+	int is_dirty, ret, t_ret;
+
+	/*
+	 * Retrieve the metadata page and insert the page at the head of
+	 * the free list.  If any step fails before the page is modified,
+	 * we still need to put the page with which we were called back,
+	 * because our caller assumes we take care of it.
+	 */
+	is_dirty = 0;
+	pgno = PGNO_METADATA;
+	if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &mlock)) != 0)
+		goto err;
+	if ((ret = __bam_pget(dbp, (PAGE **)&meta, &pgno, 0)) != 0) {
+		(void)__BT_TLPUT(dbp, mlock);
+		goto err;
+	}
+
+	/* Log the change. */
+	if (DB_LOGGING(dbp)) {
+		memset(&ldbt, 0, sizeof(ldbt));
+		ldbt.data = h;
+		ldbt.size = P_OVERHEAD;
+		if ((ret = __bam_pg_free_log(dbp->dbenv->lg_info,
+		    dbp->txn, &meta->lsn, 0, dbp->log_fileid, h->pgno,
+		    &meta->lsn, &ldbt, meta->free)) != 0) {
+			(void)memp_fput(dbp->mpf, (PAGE *)meta, 0);
+			(void)__BT_TLPUT(dbp, mlock);
+			/* Release the caller's page too, unmodified. */
+			goto err;
+		}
+		LSN(h) = LSN(meta);
+	}
+
+	/*
+	 * The page should have nothing interesting on it, re-initialize it,
+	 * leaving only the page number and the LSN.
+	 */
+#ifdef DEBUG
+	{ db_pgno_t __pgno; DB_LSN __lsn;
+		__pgno = h->pgno;
+		__lsn = h->lsn;
+		memset(h, 0xff, dbp->pgsize);
+		h->pgno = __pgno;
+		h->lsn = __lsn;
+	}
+#endif
+	P_INIT(h, dbp->pgsize, h->pgno, PGNO_INVALID, meta->free, 0, P_INVALID);
+
+	/* Link the page on the metadata free list. */
+	meta->free = h->pgno;
+
+	/* Discard the metadata page; keep the first error we see. */
+	ret = memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY);
+	if ((t_ret = __BT_TLPUT(dbp, mlock)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Discard the caller's page reference. */
+	is_dirty = DB_MPOOL_DIRTY;
+err:	if ((t_ret = memp_fput(dbp->mpf, h, is_dirty)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/*
+	 * XXX
+	 * We have to unlock the caller's page in the caller!
+	 */
+	return (ret);
+}
+
+#ifdef DEBUG
+/*
+ * __bam_lt --
+ *	Print out the list of currently held locks.
+ *
+ *	Debugging aid, only compiled under DEBUG.  When the handle has
+ *	DB_AM_LOCKING set, a single DB_LOCK_DUMP request is issued through
+ *	lock_vec(); otherwise nothing is done.  Always returns 0.
+ */
+int
+__bam_lt(dbp)
+	DB *dbp;
+{
+	DB_LOCKREQ dump;
+
+	/* Nothing to show if this handle isn't doing locking. */
+	if (!F_ISSET(dbp, DB_AM_LOCKING))
+		return (0);
+
+	dump.op = DB_LOCK_DUMP;
+	lock_vec(dbp->dbenv->lk_info, dbp->locker, 0, &dump, 1, NULL);
+	return (0);
+}
+#endif
+
+/*
+ * __bam_lget --
+ *	The standard lock get call.
+ *
+ *	Does nothing and returns 0 when the handle is not doing locking.
+ *	Otherwise locks page "pgno" in the requested mode on behalf of the
+ *	current transaction (or the handle's locker id if there is none).
+ *	With do_couple set, the new lock is acquired and the lock in *lockp
+ *	is released in a single lock_vec() call, and *lockp is replaced by
+ *	the new lock; without it, the new lock is simply stored in *lockp.
+ *
+ *	Returns 0 on success.  Negative lock-subsystem returns are converted
+ *	to EAGAIN; positive errors are passed through.
+ *
+ * PUBLIC: int __bam_lget __P((DB *, int, db_pgno_t, db_lockmode_t, DB_LOCK *));
+ */
+int
+__bam_lget(dbp, do_couple, pgno, mode, lockp)
+	DB *dbp;
+	int do_couple;
+	db_pgno_t pgno;
+	db_lockmode_t mode;
+	DB_LOCK *lockp;
+{
+	DB_LOCKREQ reqs[2];
+	u_int32_t locker;
+	int ret;
+
+	if (!F_ISSET(dbp, DB_AM_LOCKING))
+		return (0);
+
+	if (dbp->txn == NULL)
+		locker = dbp->locker;
+	else
+		locker = dbp->txn->txnid;
+	dbp->lock.pgno = pgno;
+
+	/*
+	 * If the caller isn't holding a lock to couple with, just acquire
+	 * the new lock and return.  If we fail and it's not a system error,
+	 * convert to EAGAIN.
+	 */
+	if (!do_couple) {
+		ret = lock_get(dbp->dbenv->lk_info,
+		    locker, 0, &dbp->lock_dbt, mode, lockp);
+		return (ret < 0 ? EAGAIN : ret);
+	}
+
+	/* Lock couple: get the new lock, then put the one we hold. */
+	reqs[0].op = DB_LOCK_GET;
+	reqs[0].obj = &dbp->lock_dbt;
+	reqs[0].mode = mode;
+	reqs[1].op = DB_LOCK_PUT;
+	reqs[1].lock = *lockp;
+
+	ret = lock_vec(dbp->dbenv->lk_info, locker, 0, reqs, 2, NULL);
+	if (ret == 0) {
+		*lockp = reqs[0].lock;
+		return (0);
+	}
+
+	/* If we fail, discard the lock we held. */
+	__bam_lput(dbp, *lockp);
+
+	return (ret < 0 ? EAGAIN : ret);
+}
+
+/*
+ * __bam_lput --
+ *	The standard lock put call.
+ *
+ *	Function form of the __BT_LPUT() macro; returns whatever the macro
+ *	evaluates to.
+ *
+ * PUBLIC: int __bam_lput __P((DB *, DB_LOCK));
+ */
+int
+__bam_lput(dbp, lock)
+	DB *dbp;
+	DB_LOCK lock;
+{
+	int ret;
+
+	ret = __BT_LPUT(dbp, lock);
+	return (ret);
+}
+
+/*
+ * __bam_pget --
+ *	The standard page get call.
+ *
+ *	Fetches the page numbered *pgnop from the handle's memory pool file
+ *	into *hp using the given memp_fget() flags.  Returns 0 on success;
+ *	on any memp_fget() failure, returns the result of __db_pgerr() for
+ *	that page number.
+ *
+ * PUBLIC: int __bam_pget __P((DB *, PAGE **, db_pgno_t *, int));
+ */
+int
+__bam_pget(dbp, hp, pgnop, mflags)
+	DB *dbp;
+	PAGE **hp;
+	db_pgno_t *pgnop;
+	int mflags;
+{
+	if (memp_fget((dbp)->mpf, pgnop, mflags, hp) == 0)
+		return (0);
+
+	return (__db_pgerr(dbp, *pgnop));
+}
diff --git a/db2/btree/bt_put.c b/db2/btree/bt_put.c
new file mode 100644
index 0000000000..632c3d185b
--- /dev/null
+++ b/db2/btree/bt_put.c
@@ -0,0 +1,919 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)bt_put.c 10.23 (Sleepycat) 8/22/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+
+static int __bam_fixed __P((BTREE *, DBT *));
+static int __bam_lookup __P((DB *, DBT *, int *));
+static int __bam_ndup __P((DB *, PAGE *, u_int32_t));
+static int __bam_partial __P((DB *, DBT *, PAGE *, u_int32_t));
+
+/*
+ * __bam_put --
+ * Add a new key/data pair or replace an existing pair (btree).
+ *
+ * Validates the arguments with __db_putchk(), finds the insert
+ * position with __bam_lookup(), and performs the insert through
+ * __bam_iitem(). If __bam_iitem() returns DB_NEEDSPLIT, the page
+ * stack is released, __bam_split() is called for the key and the
+ * lookup/insert sequence is retried from the top.
+ *
+ * Returns 0 on success, DB_KEYEXIST when flags is DB_NOOVERWRITE and
+ * a matching item that is not marked deleted already exists, or the
+ * error returned by one of the routines called. The handle obtained
+ * by GETHANDLE is released with PUTHANDLE on every path after it has
+ * been acquired.
+ *
+ * PUBLIC: int __bam_put __P((DB *, DB_TXN *, DBT *, DBT *, int));
+ */
+int
+__bam_put(argdbp, txn, key, data, flags)
+ DB *argdbp;
+ DB_TXN *txn;
+ DBT *key, *data;
+ int flags;
+{
+ BTREE *t;
+ CURSOR c;
+ DB *dbp;
+ PAGE *h;
+ db_indx_t indx;
+ int exact, iflags, newkey, replace, ret, stack;
+
+ DEBUG_LWRITE(argdbp, txn, "bam_put", key, data, flags);
+
+ /* Check flags. */
+ if ((ret = __db_putchk(argdbp, key, data, flags,
+ F_ISSET(argdbp, DB_AM_RDONLY), F_ISSET(argdbp, DB_AM_DUP))) != 0)
+ return (ret);
+
+ GETHANDLE(argdbp, txn, &dbp, ret);
+ t = dbp->internal;
+
+retry: /*
+ * Find the location at which to insert. The call to bt_lookup()
+ * leaves the returned page pinned.
+ *
+ * "stack" records that the page stack is held and must be
+ * released with __bam_stkrel() before returning.
+ */
+ if ((ret = __bam_lookup(dbp, key, &exact)) != 0) {
+ PUTHANDLE(dbp);
+ return (ret);
+ }
+ h = t->bt_csp->page;
+ indx = t->bt_csp->indx;
+ stack = 1;
+
+ /*
+ * If an identical key is already in the tree, and DB_NOOVERWRITE is
+ * set, an error is returned. If an identical key is already in the
+ * tree and DB_NOOVERWRITE is not set, the key is either added (when
+ * duplicates are permitted) or an error is returned. The exception
+ * is when the item located is referenced by a cursor and marked for
+ * deletion, in which case we permit the overwrite and flag the cursor.
+ */
+ replace = 0;
+ if (exact && flags == DB_NOOVERWRITE) {
+ if (!GET_BKEYDATA(h, indx + O_INDX)->deleted) {
+ ret = DB_KEYEXIST;
+ goto err;
+ }
+ replace = 1;
+ __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SETUP);
+ }
+
+ /*
+ * If we're inserting into the first or last page of the tree,
+ * remember where we did it so we can do fast lookup next time.
+ *
+ * XXX
+ * Does reverse order still work (did it ever!?!?)
+ */
+ t->bt_lpgno =
+ h->next_pgno == PGNO_INVALID || h->prev_pgno == PGNO_INVALID ?
+ h->pgno : PGNO_INVALID;
+
+ /*
+ * Select the arguments for __bam_iitem() and do the insert. If the
+ * key is an exact match, we're either adding a new duplicate at the
+ * end of the duplicate set, or we're replacing the data item with a
+ * new data item. If the key isn't an exact match, we're inserting
+ * a new key/data pair, before the search location.
+ */
+ newkey = dbp->type == DB_BTREE && !exact;
+ if (exact) {
+ if (F_ISSET(dbp, DB_AM_DUP)) {
+ /*
+ * Make sure that we're not looking at a page of
+ * duplicates -- if so, move to the last entry on
+ * that page.
+ */
+ c.page = h;
+ c.pgno = h->pgno;
+ c.indx = indx;
+ c.dpgno = PGNO_INVALID;
+ c.dindx = 0;
+ if ((ret =
+ __bam_ovfl_chk(dbp, &c, indx + O_INDX, 1)) != 0)
+ goto err;
+ if (c.dpgno != PGNO_INVALID) {
+ /*
+ * XXX
+ * The __bam_ovfl_chk() routine memp_fput() the
+ * current page and acquired a new one, but did
+ * not do anything about the lock we're holding.
+ */
+ t->bt_csp->page = h = c.page;
+ indx = c.dindx;
+ }
+ iflags = DB_AFTER;
+ } else
+ iflags = DB_CURRENT;
+ } else
+ iflags = DB_BEFORE;
+
+ /*
+ * The pages we're using may be modified by __bam_iitem(), so make
+ * sure we reset the stack. The page and index are passed by address
+ * and copied back into the current stack entry whatever the return
+ * value is.
+ */
+ ret = __bam_iitem(dbp,
+ &h, &indx, key, data, iflags, newkey ? BI_NEWKEY : 0);
+ t->bt_csp->page = h;
+ t->bt_csp->indx = indx;
+
+ switch (ret) {
+ case 0:
+ /*
+ * Done. Clean up the cursor, and, if we're doing record
+ * numbers, adjust the internal page counts.
+ */
+ if (replace)
+ __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SUCCESS);
+
+ if (!replace && F_ISSET(dbp, DB_BT_RECNUM))
+ ret = __bam_adjust(dbp, t, 1);
+ break;
+ case DB_NEEDSPLIT:
+ /*
+ * We have to split the page. Back out the cursor setup,
+ * discard the stack of pages, and do the split. If the
+ * split succeeds, start over at the lookup.
+ */
+ if (replace) {
+ replace = 0;
+ __bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED);
+ }
+
+ (void)__bam_stkrel(dbp);
+ stack = 0;
+
+ if ((ret = __bam_split(dbp, key)) != 0)
+ break;
+
+ goto retry;
+ /* NOTREACHED */
+ default:
+ if (replace)
+ __bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED);
+ break;
+ }
+
+err: if (stack)
+ (void)__bam_stkrel(dbp);
+
+ PUTHANDLE(dbp);
+ return (ret);
+}
+
+/*
+ * __bam_lookup --
+ * Find the right location in the tree for the key.
+ *
+ * Tries a fast path first: if the tree does not keep record numbers
+ * and t->bt_lpgno names a page, that page is write-locked and fetched,
+ * and if it is a non-empty P_LBTREE page at the start or end of the
+ * leaf chain and the key sorts at or beyond its first/last key, the
+ * page is entered as the only element of the page stack. Otherwise
+ * falls back to a full __bam_search() with S_INSERT.
+ *
+ * On return *exactp is non-zero if a matching key was found. Returns
+ * 0 on a fast-path hit, a BT_STK_ENTER failure code, or the result of
+ * __bam_search().
+ */
+static int
+__bam_lookup(dbp, key, exactp)
+ DB *dbp;
+ DBT *key;
+ int *exactp;
+{
+ BTREE *t;
+ DB_LOCK lock;
+ EPG e;
+ PAGE *h;
+ db_indx_t indx;
+ int cmp, ret;
+
+ t = dbp->internal;
+ h = NULL;
+
+ /*
+ * Record numbers can't be fast-tracked, we have to lock the entire
+ * tree.
+ */
+ if (F_ISSET(dbp, DB_BT_RECNUM))
+ goto slow;
+
+ /* Check to see if we've been seeing sorted input. */
+ if (t->bt_lpgno == PGNO_INVALID)
+ goto slow;
+
+ /*
+ * Retrieve the page on which we did the last insert. It's okay if
+ * it doesn't exist, or if it's not the page type we expect, it just
+ * means that the world changed.
+ */
+ if (__bam_lget(dbp, 0, t->bt_lpgno, DB_LOCK_WRITE, &lock))
+ goto miss;
+ if (__bam_pget(dbp, &h, &t->bt_lpgno, 0)) {
+ (void)__BT_LPUT(dbp, lock);
+ goto miss;
+ }
+ if (TYPE(h) != P_LBTREE)
+ goto miss;
+ if (NUM_ENT(h) == 0)
+ goto miss;
+
+ /*
+ * We have to be at the end or beginning of the tree to know that
+ * we're inserting in a sort order. If that's the case and we're
+ * in the right order in comparison to the first/last key/data pair,
+ * we have the right position.
+ */
+ if (h->next_pgno == PGNO_INVALID) {
+ e.page = h;
+ e.indx = NUM_ENT(h) - P_INDX;
+ if ((cmp = __bam_cmp(dbp, key, &e)) >= 0) {
+ if (cmp > 0)
+ e.indx += P_INDX;
+ goto fast;
+ }
+ }
+ if (h->prev_pgno == PGNO_INVALID) {
+ e.page = h;
+ e.indx = 0;
+ if ((cmp = __bam_cmp(dbp, key, &e)) <= 0) {
+ /*
+ * We're doing a put, so we want to insert as the last
+ * of any set of duplicates. Step forward while the
+ * next key entry has the same inp[] value as this one.
+ */
+ if (cmp == 0) {
+ for (indx = 0;
+ indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
+ h->inp[indx] == h->inp[indx + P_INDX];
+ indx += P_INDX);
+ e.indx = indx;
+ }
+ goto fast;
+ }
+ }
+ goto miss;
+
+ /* Set the exact match flag in case we've already inserted this key. */
+fast: *exactp = cmp == 0;
+
+ /*
+ * Enter the entry in the stack.
+ *
+ * NOTE(review): if BT_STK_ENTER sets ret non-zero we return with the
+ * page and lock apparently still held -- confirm against the macro.
+ */
+ BT_STK_CLR(t);
+ BT_STK_ENTER(t, e.page, e.indx, lock, ret);
+ if (ret != 0)
+ return (ret);
+
+ ++t->lstat.bt_cache_hit;
+ return (0);
+
+miss: ++t->lstat.bt_cache_miss;
+ if (h != NULL) {
+ (void)memp_fput(dbp->mpf, h, 0);
+ (void)__BT_LPUT(dbp, lock);
+ }
+
+slow: return (__bam_search(dbp, key, S_INSERT, 1, NULL, exactp));
+}
+
+/*
+ * OVPUT --
+ * Copy an overflow item onto a page.
+ *
+ * Wraps the BOVERFLOW structure "bo" in a zeroed DBT header of
+ * BOVERFLOW_SIZE bytes and passes it to __db_pitem() to place the
+ * item at index "indx" on page "h".
+ *
+ * The macro uses "dbp" and "ret" from the scope in which it is
+ * expanded, and takes the address of "bo", so "bo" must be an lvalue.
+ * If __db_pitem() fails, the macro executes "return (ret)" in the
+ * enclosing function rather than jumping to any cleanup label there.
+ */
+#undef OVPUT
+#define OVPUT(h, indx, bo) do { \
+ DBT __hdr; \
+ memset(&__hdr, 0, sizeof(__hdr)); \
+ __hdr.data = &bo; \
+ __hdr.size = BOVERFLOW_SIZE; \
+ if ((ret = __db_pitem(dbp, \
+ h, indx, BOVERFLOW_SIZE, &__hdr, NULL)) != 0) \
+ return (ret); \
+} while (0)
+
+/*
+ * __bam_iitem --
+ * Insert an item into the tree.
+ *
+ * PUBLIC: int __bam_iitem __P((DB *,
+ * PUBLIC: PAGE **, db_indx_t *, DBT *, DBT *, int, int));
+ */
+int
+__bam_iitem(dbp, hp, indxp, key, data, op, flags)
+ DB *dbp;
+ PAGE **hp;
+ db_indx_t *indxp;
+ DBT *key, *data;
+ int op, flags;
+{
+ BTREE *t;
+ BKEYDATA *bk;
+ BOVERFLOW kbo, dbo;
+ DBT tdbt;
+ PAGE *h;
+ db_indx_t indx;
+ u_int32_t have_bytes, need_bytes, needed;
+ int bigkey, bigdata, dcopy, dupadjust, ret;
+
+ t = dbp->internal;
+ h = *hp;
+ indx = *indxp;
+
+ dupadjust = 0;
+ bk = NULL; /* XXX: Shut the compiler up. */
+
+ /*
+ * If it's a page of duplicates, call the common code to do the work.
+ *
+ * !!!
+ * Here's where the hp and indxp are important. The duplicate code
+ * may decide to rework/rearrange the pages and indices we're using,
+ * so the caller must understand that the stack has to change.
+ */
+ if (TYPE(h) == P_DUPLICATE) {
+ /* Adjust the index for the new item if it's a DB_AFTER op. */
+ if (op == DB_AFTER)
+ ++*indxp;
+
+ /* Remove the current item if it's a DB_CURRENT op. */
+ if (op == DB_CURRENT && (ret = __db_ditem(dbp, *hp, *indxp,
+ BKEYDATA_SIZE(GET_BKEYDATA(*hp, *indxp)->len))) != 0)
+ return (ret);
+
+ /* Put the new/replacement item onto the page. */
+ return (__db_dput(dbp, data, hp, indxp, __bam_new));
+ }
+
+ /*
+ * XXX
+ * Handle partial puts.
+ *
+ * This is truly awful from a performance standput. We don't optimize
+ * for partial puts at all, we delete the record and add it back in,
+ * regardless of size or if we're simply overwriting current data.
+ * The hash access method does this a lot better than we do, and we're
+ * eventually going to have to fix it.
+ */
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ tdbt = *data;
+ if ((ret = __bam_partial(dbp, &tdbt, h, indx)) != 0)
+ return (ret);
+ data = &tdbt;
+ }
+
+ /* If it's a short fixed-length record, fix it up. */
+ if (F_ISSET(dbp, DB_RE_FIXEDLEN) && data->size != t->bt_recno->re_len) {
+ tdbt = *data;
+ if ((ret = __bam_fixed(t, &tdbt)) != 0)
+ return (ret);
+ data = &tdbt;
+ }
+
+ /*
+ * If the key or data item won't fit on a page, store it in the
+ * overflow pages.
+ *
+ * !!!
+ * From this point on, we have to recover the allocated overflow
+ * pages on error.
+ */
+ bigkey = bigdata = 0;
+ if (LF_ISSET(BI_NEWKEY) && key->size > t->bt_ovflsize) {
+ kbo.deleted = 0;
+ kbo.type = B_OVERFLOW;
+ kbo.tlen = key->size;
+ if ((ret = __db_poff(dbp, key, &kbo.pgno, __bam_new)) != 0)
+ goto err;
+ bigkey = 1;
+ }
+ if (data->size > t->bt_ovflsize) {
+ dbo.deleted = 0;
+ dbo.type = B_OVERFLOW;
+ dbo.tlen = data->size;
+ if ((ret = __db_poff(dbp, data, &dbo.pgno, __bam_new)) != 0)
+ goto err;
+ bigdata = 1;
+ }
+
+ dcopy = 0;
+ needed = 0;
+ if (LF_ISSET(BI_NEWKEY)) {
+ /* If BI_NEWKEY is set we're adding a new key and data pair. */
+ if (bigkey)
+ needed += BOVERFLOW_PSIZE;
+ else
+ needed += BKEYDATA_PSIZE(key->size);
+ if (bigdata)
+ needed += BOVERFLOW_PSIZE;
+ else
+ needed += BKEYDATA_PSIZE(data->size);
+ } else {
+ /*
+ * We're either overwriting the data item of a key/data pair
+ * or we're adding the data item only, i.e. a new duplicate.
+ */
+ if (op == DB_CURRENT) {
+ bk = GET_BKEYDATA(h,
+ indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+ if (bk->type == B_OVERFLOW)
+ have_bytes = BOVERFLOW_PSIZE;
+ else
+ have_bytes = BKEYDATA_PSIZE(bk->len);
+ need_bytes = 0;
+ } else {
+ have_bytes = 0;
+ need_bytes = sizeof(db_indx_t);
+ }
+ if (bigdata)
+ need_bytes += BOVERFLOW_PSIZE;
+ else
+ need_bytes += BKEYDATA_PSIZE(data->size);
+
+ /*
+ * If we're overwriting a data item, we copy it if it's not a
+ * special record type and it's the same size (including any
+ * alignment) and do a delete/insert otherwise.
+ */
+ if (op == DB_CURRENT && !bigdata &&
+ bk->type == B_KEYDATA && have_bytes == need_bytes)
+ dcopy = 1;
+ if (have_bytes < need_bytes)
+ needed += need_bytes - have_bytes;
+ }
+
+ /*
+ * If there's not enough room, or the user has put a ceiling on the
+ * number of keys permitted in the page, split the page.
+ *
+ * XXX
+ * The t->bt_maxkey test here may be insufficient -- do we have to
+ * check in the btree split code, so we don't undo it there!?!?
+ */
+ if (P_FREESPACE(h) < needed ||
+ (t->bt_maxkey != 0 && NUM_ENT(h) > t->bt_maxkey)) {
+ ret = DB_NEEDSPLIT;
+ goto err;
+ }
+
+ /*
+ * The code breaks it up into six cases:
+ *
+ * 1. Append a new key/data pair.
+ * 2. Insert a new key/data pair.
+ * 3. Copy the data item.
+ * 4. Delete/insert the data item.
+ * 5. Append a new data item.
+ * 6. Insert a new data item.
+ */
+ if (LF_ISSET(BI_NEWKEY)) {
+ switch (op) {
+ case DB_AFTER: /* 1. Append a new key/data pair. */
+ indx += 2;
+ *indxp += 2;
+ break;
+ case DB_BEFORE: /* 2. Insert a new key/data pair. */
+ break;
+ default:
+ abort();
+ }
+
+ /* Add the key. */
+ if (bigkey)
+ OVPUT(h, indx, kbo);
+ else {
+ DBT __data;
+ memset(&__data, 0, sizeof(__data));
+ __data.data = key->data;
+ __data.size = key->size;
+ if ((ret = __db_pitem(dbp, h, indx,
+ BKEYDATA_SIZE(key->size), NULL, &__data)) != 0)
+ goto err;
+ }
+ ++indx;
+ } else {
+ switch (op) {
+ case DB_CURRENT: /* 3. Copy the data item. */
+ /*
+ * If we're not logging and it's possible, overwrite
+ * the current item.
+ *
+ * XXX
+ * We should add a separate logging message so that
+ * we can do this anytime it's possible, including
+ * for partial record puts.
+ */
+ if (dcopy && !DB_LOGGING(dbp)) {
+ bk->len = data->size;
+ memcpy(bk->data, data->data, data->size);
+ goto done;
+ }
+ /* 4. Delete/insert the data item. */
+ if (TYPE(h) == P_LBTREE)
+ ++indx;
+ if ((ret = __bam_ditem(dbp, h, indx)) != 0)
+ goto err;
+ break;
+ case DB_AFTER: /* 5. Append a new data item. */
+ if (TYPE(h) == P_LBTREE) {
+ /*
+ * Adjust the cursor and copy in the key for
+ * the duplicate.
+ */
+ if ((ret = __bam_adjindx(dbp,
+ h, indx + P_INDX, indx, 1)) != 0)
+ goto err;
+
+ indx += 3;
+ dupadjust = 1;
+
+ *indxp += 2;
+ } else {
+ ++indx;
+ __bam_ca_di(dbp, h->pgno, indx, 1);
+
+ *indxp += 1;
+ }
+ break;
+ case DB_BEFORE: /* 6. Insert a new data item. */
+ if (TYPE(h) == P_LBTREE) {
+ /*
+ * Adjust the cursor and copy in the key for
+ * the duplicate.
+ */
+ if ((ret =
+ __bam_adjindx(dbp, h, indx, indx, 1)) != 0)
+ goto err;
+
+ ++indx;
+ dupadjust = 1;
+ } else
+ __bam_ca_di(dbp, h->pgno, indx, 1);
+ break;
+ default:
+ abort();
+ }
+ }
+
+ /* Add the data. */
+ if (bigdata)
+ OVPUT(h, indx, dbo);
+ else {
+ BKEYDATA __bk;
+ DBT __hdr, __data;
+ memset(&__data, 0, sizeof(__data));
+ __data.data = data->data;
+ __data.size = data->size;
+
+ if (LF_ISSET(BI_DELETED)) {
+ __bk.len = __data.size;
+ __bk.deleted = 1;
+ __bk.type = B_KEYDATA;
+ __hdr.data = &__bk;
+ __hdr.size = SSZA(BKEYDATA, data);
+ ret = __db_pitem(dbp, h, indx,
+ BKEYDATA_SIZE(__data.size), &__hdr, &__data);
+ } else
+ ret = __db_pitem(dbp, h, indx,
+ BKEYDATA_SIZE(data->size), NULL, &__data);
+ if (ret != 0)
+ goto err;
+ }
+
+done: ++t->lstat.bt_added;
+
+ ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY);
+
+ /*
+ * If the page is at least 50% full, and we added a duplicate, see if
+ * that set of duplicates takes up at least 25% of the space. If it
+ * does, move it off onto its own page.
+ */
+ if (dupadjust && P_FREESPACE(h) <= dbp->pgsize / 2) {
+ --indx;
+ if ((ret = __bam_ndup(dbp, h, indx)) != 0)
+ goto err;
+ }
+
+ if (t->bt_recno != NULL)
+ F_SET(t->bt_recno, RECNO_MODIFIED);
+
+ if (0) {
+err: if (bigkey)
+ (void)__db_doff(dbp, kbo.pgno, __bam_free);
+ if (bigdata)
+ (void)__db_doff(dbp, dbo.pgno, __bam_free);
+ }
+ return (ret);
+}
+
+/*
+ * __bam_ndup --
+ * Check to see if the duplicate set at indx should have its own page.
+ * If it should, create it.
+ *
+ * dbp is the database handle, h the page holding the duplicate set and
+ * indx the index of a key slot that belongs to the set. Returns 0 if the
+ * set is too small to move, the result of memp_fput() on the new page if
+ * the set was moved, or the error from the first failing page/item call.
+ */
+static int
+__bam_ndup(dbp, h, indx)
+ DB *dbp;
+ PAGE *h;
+ u_int32_t indx;
+{
+ BKEYDATA *bk;
+ BOVERFLOW bo;
+ DBT hdr;
+ PAGE *cp;
+ db_indx_t cnt, cpindx, first, sz;
+ int ret;
+
+ /*
+ * Slots of the same duplicate set carry identical inp[] values. Walk
+ * back to the first such slot, then walk forward one pair at a time,
+ * counting the pairs (cnt) and adding up key and data sizes (sz).
+ *
+ * NOTE(review): sz is a db_indx_t and the key size is added once per
+ * pair -- confirm the sum cannot wrap for large pages.
+ */
+ while (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX])
+ indx -= P_INDX;
+ for (cnt = 0, sz = 0, first = indx;; ++cnt, indx += P_INDX) {
+ if (indx >= NUM_ENT(h) || h->inp[first] != h->inp[indx])
+ break;
+ bk = GET_BKEYDATA(h, indx);
+ sz += bk->type == B_KEYDATA ?
+ BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
+ bk = GET_BKEYDATA(h, indx + O_INDX);
+ sz += bk->type == B_KEYDATA ?
+ BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
+ }
+
+ /*
+ * If this set of duplicates is using more than 25% of the page, move
+ * them off. The choice of 25% is a WAG, but it has to be small enough
+ * that we can always split regardless of the presence of duplicates.
+ */
+ if (sz < dbp->pgsize / 4)
+ return (0);
+
+ /* Get a new page. */
+ if ((ret = __bam_new(dbp, P_DUPLICATE, &cp)) != 0)
+ return (ret);
+
+ /*
+ * Move this set of duplicates off the page. First points to the first
+ * key of the first duplicate key/data pair, cnt is the number of pairs
+ * we're dealing with.
+ *
+ * indx stays at first + O_INDX for the whole loop; presumably each
+ * delete below slides the next item of the set down into that slot --
+ * confirm against __db_ditem() and __bam_adjindx().
+ */
+ memset(&hdr, 0, sizeof(hdr));
+ for (indx = first + O_INDX, cpindx = 0;; ++cpindx) {
+ /* Copy the entry to the new page. */
+ bk = GET_BKEYDATA(h, indx);
+ hdr.data = bk;
+ hdr.size = bk->type == B_KEYDATA ?
+ BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE;
+ if ((ret =
+ __db_pitem(dbp, cp, cpindx, hdr.size, &hdr, NULL)) != 0)
+ goto err;
+
+ /*
+ * Move cursors referencing the old entry to the new entry.
+ * Done after the page put because __db_pitem() adjusts
+ * cursors on the new page, and before the delete because
+ * __db_ditem adjusts cursors on the old page.
+ */
+ __bam_ca_dup(dbp,
+ PGNO(h), first, indx - O_INDX, PGNO(cp), cpindx);
+
+ /* Delete the data item. */
+ if ((ret = __db_ditem(dbp, h, indx, hdr.size)) != 0)
+ goto err;
+
+ /* Delete all but the first reference to the key. */
+ if (--cnt == 0)
+ break;
+ if ((ret = __bam_adjindx(dbp, h, indx, first, 0)) != 0)
+ goto err;
+ }
+
+ /*
+ * Put in a new data item that points to the duplicates page. It goes
+ * into slot indx (first + O_INDX), next to the one remaining key.
+ * OVPUT is a macro defined outside this view; presumably it jumps to
+ * err on failure -- confirm.
+ */
+ bo.deleted = 0;
+ bo.type = B_DUPLICATE;
+ bo.pgno = cp->pgno;
+ bo.tlen = 0;
+
+ OVPUT(h, indx, bo);
+
+ return (memp_fput(dbp->mpf, cp, DB_MPOOL_DIRTY));
+
+ /*
+ * Error: give the new page back. NOTE(review): items already deleted
+ * from h are not restored on this path.
+ */
+err: (void)__bam_free(dbp, cp);
+ return (ret);
+}
+
+/*
+ * __bam_fixed --
+ *	Build the real record for a fixed length put.
+ *
+ * t is the btree whose recno description (t->bt_recno) supplies the
+ * fixed record length (re_len) and pad byte (re_pad). dbt is the user's
+ * record; on success it is overwritten to reference a padded copy held
+ * in t->bt_rdata.
+ *
+ * Returns 0 on success, EINVAL if the record is longer than re_len, or
+ * ENOMEM if the scratch buffer cannot be grown. On ENOMEM the existing
+ * scratch buffer and its recorded length are left untouched.
+ */
+static int
+__bam_fixed(t, dbt)
+	BTREE *t;
+	DBT *dbt;
+{
+	RECNO *rp;
+	void *newbuf;
+
+	rp = t->bt_recno;
+
+	/*
+	 * If using fixed-length records, and the record is long, return
+	 * EINVAL.  If it's short, pad it out.  Use the record data return
+	 * memory, it's only short-term.
+	 */
+	if (dbt->size > rp->re_len)
+		return (EINVAL);
+	if (t->bt_rdata.ulen < rp->re_len) {
+		/*
+		 * Grow through a temporary: assigning realloc()'s result
+		 * straight back would lose (and leak) the old buffer when
+		 * the call fails.
+		 */
+		newbuf = t->bt_rdata.data == NULL ?
+		    (void *)malloc(rp->re_len) :
+		    (void *)realloc(t->bt_rdata.data, rp->re_len);
+		if (newbuf == NULL)
+			return (ENOMEM);
+		t->bt_rdata.data = newbuf;
+		t->bt_rdata.ulen = rp->re_len;
+	}
+
+	/* Copy the user's bytes, then pad out to the fixed length. */
+	memcpy(t->bt_rdata.data, dbt->data, dbt->size);
+	memset((u_int8_t *)t->bt_rdata.data + dbt->size,
+	    rp->re_pad, rp->re_len - dbt->size);
+
+	/* Set the DBT to reference our new record. */
+	t->bt_rdata.size = rp->re_len;
+	t->bt_rdata.dlen = 0;
+	t->bt_rdata.doff = 0;
+	t->bt_rdata.flags = 0;
+	*dbt = t->bt_rdata;
+	return (0);
+}
+
+/*
+ * __bam_partial --
+ *	Build the real record for a partial put.
+ *
+ * dbp is the database handle, dbt the user's partial-put DBT (doff is the
+ * offset in the existing record, dlen the number of existing bytes being
+ * replaced, size/data the replacement bytes), h the page and indx the
+ * index of the item being updated.  If indx is past the last entry on
+ * the page the existing record is treated as empty.
+ *
+ * The new record is assembled in t->bt_rdata and, on success, *dbt is
+ * overwritten to reference it.  Returns 0 on success, ENOMEM if the
+ * scratch buffer cannot be grown (the old buffer is kept), or the error
+ * returned by __db_goff().
+ */
+static int
+__bam_partial(dbp, dbt, h, indx)
+	DB *dbp;
+	DBT *dbt;
+	PAGE *h;
+	u_int32_t indx;
+{
+	BTREE *t;
+	BKEYDATA *bk, tbk;
+	BOVERFLOW *bo;
+	DBT copy;
+	u_int32_t len, nbytes, tlen;
+	int ret;
+	u_int8_t *p;
+	void *newbuf;
+
+	bo = NULL;	/* XXX: Shut the compiler up. */
+	t = dbp->internal;
+
+	/*
+	 * Figure out how much total space we'll need.  Worst case is where
+	 * the record is 0 bytes long, in which case doff causes the record
+	 * to extend, and the put data is appended to it.
+	 */
+	if (indx < NUM_ENT(h)) {
+		bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+		if (bk->type == B_OVERFLOW) {
+			bo = (BOVERFLOW *)bk;
+			nbytes = bo->tlen;
+		} else
+			nbytes = bk->len;
+	} else {
+		/* No existing item: pretend there is an empty one. */
+		bk = &tbk;
+		bk->type = B_KEYDATA;
+		nbytes = bk->len = 0;
+	}
+	nbytes += dbt->doff + dbt->size + dbt->dlen;
+
+	/*
+	 * Allocate the space.  Grow through a temporary: assigning
+	 * realloc()'s result straight back would lose (and leak) the old
+	 * buffer when the call fails.
+	 */
+	if (t->bt_rdata.ulen < nbytes) {
+		newbuf = t->bt_rdata.data == NULL ?
+		    (void *)malloc(nbytes) :
+		    (void *)realloc(t->bt_rdata.data, nbytes);
+		if (newbuf == NULL)
+			return (ENOMEM);
+		t->bt_rdata.data = newbuf;
+		t->bt_rdata.ulen = nbytes;
+	}
+
+	/* We use nul bytes for extending the record, get it over with. */
+	memset(t->bt_rdata.data, 0, nbytes);
+
+	tlen = 0;
+	if (bk->type == B_OVERFLOW) {
+		/*
+		 * Read the whole overflow record into the scratch buffer
+		 * and rearrange it in place.
+		 */
+		memset(&copy, 0, sizeof(copy));
+		if ((ret = __db_goff(dbp, &copy, bo->tlen,
+		    bo->pgno, &t->bt_rdata.data, &t->bt_rdata.ulen)) != 0)
+			return (ret);
+
+		/*
+		 * The first doff bytes stay where they are; if the record
+		 * was shorter than doff, the gap is the nul fill from above.
+		 */
+		tlen = dbt->doff;
+		p = (u_int8_t *)t->bt_rdata.data + dbt->doff;
+
+		/*
+		 * Anything in the original record beyond doff + dlen is
+		 * trailing data that must follow the user's bytes.  If the
+		 * number of bytes replaced differs from the number of bytes
+		 * supplied, slide the tail from its old position (after the
+		 * dlen replaced bytes) to its new one (after the size new
+		 * bytes).  Use memmove(), the regions may overlap.
+		 */
+		if (bo->tlen > dbt->doff + dbt->dlen) {
+			len = bo->tlen - (dbt->doff + dbt->dlen);
+			if (dbt->dlen != dbt->size)
+				memmove(p + dbt->size, p + dbt->dlen, len);
+			tlen += len;
+		}
+
+		/* Copy in the user's data. */
+		memcpy(p, dbt->data, dbt->size);
+		tlen += dbt->size;
+	} else {
+		/* Take up to doff bytes from the record. */
+		memcpy(t->bt_rdata.data,
+		    bk->data, dbt->doff > bk->len ? bk->len : dbt->doff);
+		tlen += dbt->doff;
+
+		/* Copy in the user's data. */
+		memcpy((u_int8_t *)t->bt_rdata.data +
+		    dbt->doff, dbt->data, dbt->size);
+		tlen += dbt->size;
+
+		/* Copy in any remaining data. */
+		len = dbt->doff + dbt->dlen;
+		if (bk->len > len) {
+			memcpy((u_int8_t *)t->bt_rdata.data + dbt->doff +
+			    dbt->size, bk->data + len, bk->len - len);
+			tlen += bk->len - len;
+		}
+	}
+
+	/* Set the DBT to reference our new record. */
+	t->bt_rdata.size = tlen;
+	t->bt_rdata.dlen = 0;
+	t->bt_rdata.doff = 0;
+	t->bt_rdata.flags = 0;
+	*dbt = t->bt_rdata;
+	return (0);
+}
diff --git a/db2/btree/bt_rec.c b/db2/btree/bt_rec.c
new file mode 100644
index 0000000000..d4bc7f6824
--- /dev/null
+++ b/db2/btree/bt_rec.c
@@ -0,0 +1,767 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)bt_rec.c 10.11 (Sleepycat) 8/22/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <ctype.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "shqueue.h"
+#include "hash.h"
+#include "btree.h"
+#include "log.h"
+#include "db_dispatch.h"
+#include "common_ext.h"
+
+/*
+ * __bam_pg_alloc_recover --
+ * Recovery function for pg_alloc.
+ *
+ * logp, dbtp and info are consumed by the REC_PRINT/REC_INTRO macros
+ * (defined outside this view; they are expected to set argp, file_dbp,
+ * mdbp and mpf -- confirm). lsnp is the LSN of the log record on entry
+ * and is set to the record's prev_lsn on success. redo is non-zero to
+ * reapply the allocation and zero to undo it.
+ *
+ * PUBLIC: int __bam_pg_alloc_recover
+ * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ */
+int
+__bam_pg_alloc_recover(logp, dbtp, lsnp, redo, info)
+ DB_LOG *logp;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ int redo;
+ void *info;
+{
+ __bam_pg_alloc_args *argp;
+ BTMETA *meta;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ DB *file_dbp, *mdbp;
+ db_pgno_t pgno;
+ int cmp_n, cmp_p, created, modified, ret;
+
+ REC_PRINT(__bam_pg_alloc_print);
+ REC_INTRO(__bam_pg_alloc_read);
+
+ /*
+ * Fix up the allocated page. If we're redoing the operation, we have
+ * to get the page (creating it if it doesn't exist), and update its
+ * LSN. If we're undoing the operation, we have to reset the page's
+ * LSN and put it on the free list.
+ *
+ * Fix up the metadata page. If we're redoing the operation, we have
+ * to get the metadata page and update its LSN and its free pointer.
+ * If we're undoing the operation and the page was ever created, we put
+ * it on the freelist.
+ */
+ /* Pin both pages up front; the metadata page is released on error. */
+ pgno = PGNO_METADATA;
+ if ((ret = memp_fget(mpf, &pgno, 0, &meta)) != 0) {
+ (void)__db_pgerr(file_dbp, pgno);
+ goto out;
+ }
+ if ((ret = memp_fget(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) {
+ (void)__db_pgerr(file_dbp, argp->pgno);
+ (void)memp_fput(mpf, meta, 0);
+ goto out;
+ }
+
+ /*
+ * Fix up the allocated page.
+ *
+ * created: the page's LSN is zero (presumably because memp_fget just
+ * created it). cmp_p == 0: the page still carries the pre-operation
+ * LSN saved in the log record. cmp_n == 0: the page carries this log
+ * record's LSN, i.e. the operation was applied to it.
+ */
+ created = IS_ZERO_LSN(LSN(pagep));
+ modified = 0;
+ cmp_n = log_compare(lsnp, &LSN(pagep));
+ cmp_p = log_compare(&LSN(pagep), &argp->page_lsn);
+ if ((created || cmp_p == 0) && redo) {
+ /* Need to redo update described. */
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, PGNO_INVALID, 0, argp->ptype);
+
+ pagep->lsn = *lsnp;
+ modified = 1;
+ } else if ((created || cmp_n == 0) && !redo) {
+ /*
+ * Need to undo update described: make it an invalid page whose
+ * next pointer is the current head of the free list.
+ */
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, meta->free, 0, P_INVALID);
+
+ pagep->lsn = argp->page_lsn;
+ modified = 1;
+ }
+ if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
+ (void)__db_panic(file_dbp);
+ (void)memp_fput(mpf, meta, 0);
+ goto out;
+ }
+
+ /* Fix up the metadata page. */
+ modified = 0;
+ cmp_n = log_compare(lsnp, &LSN(meta));
+ cmp_p = log_compare(&LSN(meta), &argp->meta_lsn);
+ if (cmp_p == 0 && redo) {
+ /* Need to redo update described. */
+ meta->lsn = *lsnp;
+ meta->free = argp->next;
+ modified = 1;
+ } else if (cmp_n == 0 && !redo) {
+ /* Need to undo update described. */
+ meta->lsn = argp->meta_lsn;
+ meta->free = argp->pgno;
+ modified = 1;
+ }
+ if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
+ (void)__db_panic(file_dbp);
+ goto out;
+ }
+
+ /* Success: hand back the LSN of the previous log record. */
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ /* REC_CLOSE is defined outside this view; presumably returns ret. */
+out: REC_CLOSE;
+}
+
+/*
+ * __bam_pg_free_recover --
+ * Recovery function for pg_free.
+ *
+ * logp, dbtp and info are consumed by the REC_PRINT/REC_INTRO macros
+ * (defined outside this view; they are expected to set argp, file_dbp,
+ * mdbp and mpf -- confirm). lsnp is the LSN of the log record on entry
+ * and is set to the record's prev_lsn on success. redo is non-zero to
+ * reapply the free and zero to undo it.
+ *
+ * PUBLIC: int __bam_pg_free_recover
+ * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ */
+int
+__bam_pg_free_recover(logp, dbtp, lsnp, redo, info)
+ DB_LOG *logp;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ int redo;
+ void *info;
+{
+ __bam_pg_free_args *argp;
+ BTMETA *meta;
+ DB *file_dbp, *mdbp;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pgno_t pgno;
+ int cmp_n, cmp_p, modified, ret;
+
+ REC_PRINT(__bam_pg_free_print);
+ REC_INTRO(__bam_pg_free_read);
+
+ /*
+ * Fix up the freed page. If we're redoing the operation we get the
+ * page and explicitly discard its contents, then update its LSN. If
+ * we're undoing the operation, we get the page and restore its header.
+ */
+ if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) {
+ (void)__db_pgerr(file_dbp, argp->pgno);
+ goto out;
+ }
+ /*
+ * cmp_p == 0: the page still has the LSN stored in the logged page
+ * header, so the free was never applied. cmp_n == 0: the page has
+ * this log record's LSN, so the free was applied.
+ */
+ modified = 0;
+ cmp_n = log_compare(lsnp, &LSN(pagep));
+ cmp_p = log_compare(&LSN(pagep), &LSN(argp->header.data));
+ if (cmp_p == 0 && redo) {
+ /* Need to redo update described. */
+ P_INIT(pagep, file_dbp->pgsize,
+ pagep->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+ pagep->lsn = *lsnp;
+
+ modified = 1;
+ } else if (cmp_n == 0 && !redo) {
+ /* Need to undo update described: copy back the logged header. */
+ memcpy(pagep, argp->header.data, argp->header.size);
+
+ modified = 1;
+ }
+ if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
+ (void)__db_panic(file_dbp);
+ goto out;
+ }
+
+ /*
+ * Fix up the metadata page. If we're redoing or undoing the operation
+ * we get the page and update its LSN and free pointer.
+ */
+ pgno = PGNO_METADATA;
+ if ((ret = memp_fget(mpf, &pgno, 0, &meta)) != 0) {
+ (void)__db_pgerr(file_dbp, pgno);
+ goto out;
+ }
+
+ modified = 0;
+ cmp_n = log_compare(lsnp, &LSN(meta));
+ cmp_p = log_compare(&LSN(meta), &argp->meta_lsn);
+ if (cmp_p == 0 && redo) {
+ /* Need to redo update described: freed page heads the free list. */
+ meta->free = argp->pgno;
+
+ meta->lsn = *lsnp;
+ modified = 1;
+ } else if (cmp_n == 0 && !redo) {
+ /* Need to undo update described: restore the old list head. */
+ meta->free = argp->next;
+
+ meta->lsn = argp->meta_lsn;
+ modified = 1;
+ }
+ if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
+ (void)__db_panic(file_dbp);
+ goto out;
+ }
+
+ /* Success: hand back the LSN of the previous log record. */
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ /* REC_CLOSE is defined outside this view; presumably returns ret. */
+out: REC_CLOSE;
+}
+
+/*
+ * __bam_split_recover --
+ *	Recovery function for split.
+ *
+ * logp, dbtp and info are consumed by the REC_PRINT/REC_INTRO macros
+ * (defined outside this view; they are expected to set argp, file_dbp,
+ * mdbp and mpf -- confirm).  lsnp is the LSN of the log record on entry
+ * and is set to the record's prev_lsn on success.  redo is non-zero to
+ * reapply the split and zero to undo it.
+ *
+ * Pages still pinned when the function leaves (pp, lp, np, rp non-NULL)
+ * are released clean at the out label; a pointer is set to NULL as soon
+ * as its page has been put back dirty.
+ *
+ * PUBLIC: int __bam_split_recover
+ * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ */
+int
+__bam_split_recover(logp, dbtp, lsnp, redo, info)
+	DB_LOG *logp;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	int redo;
+	void *info;
+{
+	__bam_split_args *argp;
+	DB *file_dbp, *mdbp;
+	DB_MPOOLFILE *mpf;
+	PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp;
+	db_pgno_t pgno;
+	int l_update, p_update, r_update, ret, rootsplit, t_ret;
+
+	REC_PRINT(__bam_split_print);
+
+	/* Initialized before REC_INTRO so the out label is always safe. */
+	mpf = NULL;
+	_lp = lp = np = pp = _rp = rp = NULL;
+
+	REC_INTRO(__bam_split_read);
+
+	/*
+	 * There are two kinds of splits that we have to recover from.  The
+	 * first is a root-page split, where the root page is split from a
+	 * leaf page into an internal page and two new leaf pages are created.
+	 * The second is where a page is split into two pages, and a new key
+	 * is inserted into the parent page.
+	 */
+	sp = argp->pg.data;
+	pgno = PGNO(sp);
+	rootsplit = pgno == PGNO_ROOT;
+
+	/* A missing child page is not an error yet; remember it as NULL. */
+	if (memp_fget(mpf, &argp->left, 0, &lp) != 0)
+		lp = NULL;
+	if (memp_fget(mpf, &argp->right, 0, &rp) != 0)
+		rp = NULL;
+
+	if (redo) {
+		l_update = r_update = p_update = 0;
+		/*
+		 * Decide if we need to resplit the page.
+		 *
+		 * If this is a root split, then the root has to exist, it's
+		 * the page we're splitting and it gets modified.  If this is
+		 * not a root split, then the left page has to exist, for the
+		 * same reason.
+		 */
+		if (rootsplit) {
+			if ((ret = memp_fget(mpf, &pgno, 0, &pp)) != 0) {
+				(void)__db_pgerr(file_dbp, pgno);
+				pp = NULL;
+				goto out;
+			}
+			p_update =
+			    log_compare(&LSN(pp), &LSN(argp->pg.data)) == 0;
+		} else
+			if (lp == NULL) {
+				/*
+				 * NOTE(review): ret is not assigned on this
+				 * path, so the value left by REC_INTRO is
+				 * what gets returned -- confirm intended.
+				 */
+				(void)__db_pgerr(file_dbp, argp->left);
+				goto out;
+			}
+		if (lp == NULL || log_compare(&LSN(lp), &argp->llsn) == 0)
+			l_update = 1;
+		if (rp == NULL || log_compare(&LSN(rp), &argp->rlsn) == 0)
+			r_update = 1;
+		if (!p_update && !l_update && !r_update)
+			goto done;
+
+		/*
+		 * Allocate and initialize new left/right child pages.  On
+		 * failure report ENOMEM through ret as well as errno, so
+		 * the caller does not see a stale success value; whichever
+		 * buffer was obtained is freed at the out label.
+		 */
+		if ((_lp = (PAGE *)malloc(file_dbp->pgsize)) == NULL ||
+		    (_rp = (PAGE *)malloc(file_dbp->pgsize)) == NULL) {
+			ret = errno = ENOMEM;
+			__db_err(file_dbp->dbenv, "%s", strerror(errno));
+			goto out;
+		}
+		if (rootsplit) {
+			P_INIT(_lp, file_dbp->pgsize, argp->left,
+			    PGNO_INVALID,
+			    ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+			    LEVEL(sp), TYPE(sp));
+			P_INIT(_rp, file_dbp->pgsize, argp->right,
+			    ISINTERNAL(sp) ? PGNO_INVALID : argp->left,
+			    PGNO_INVALID, LEVEL(sp), TYPE(sp));
+		} else {
+			P_INIT(_lp, file_dbp->pgsize, PGNO(sp),
+			    ISINTERNAL(sp) ? PGNO_INVALID : PREV_PGNO(sp),
+			    ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+			    LEVEL(sp), TYPE(sp));
+			P_INIT(_rp, file_dbp->pgsize, argp->right,
+			    ISINTERNAL(sp) ? PGNO_INVALID : sp->pgno,
+			    ISINTERNAL(sp) ? PGNO_INVALID : NEXT_PGNO(sp),
+			    LEVEL(sp), TYPE(sp));
+		}
+
+		/* Split the page: [0, indx) goes left, [indx, end) right. */
+		if ((ret = __bam_copy(file_dbp, sp, _lp, 0, argp->indx)) != 0 ||
+		    (ret = __bam_copy(file_dbp, sp, _rp, argp->indx,
+		    NUM_ENT(sp))) != 0)
+			goto out;
+
+		/* If the left child is wrong, update it. */
+		if (lp == NULL && (ret =
+		    memp_fget(mpf, &argp->left, DB_MPOOL_CREATE, &lp)) != 0) {
+			(void)__db_pgerr(file_dbp, argp->left);
+			lp = NULL;
+			goto out;
+		}
+		if (l_update) {
+			memcpy(lp, _lp, file_dbp->pgsize);
+			lp->lsn = *lsnp;
+			if ((ret = memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0)
+				goto fatal;
+			lp = NULL;
+		}
+
+		/* If the right child is wrong, update it. */
+		if (rp == NULL && (ret = memp_fget(mpf,
+		    &argp->right, DB_MPOOL_CREATE, &rp)) != 0) {
+			(void)__db_pgerr(file_dbp, argp->right);
+			rp = NULL;
+			goto out;
+		}
+		if (r_update) {
+			memcpy(rp, _rp, file_dbp->pgsize);
+			rp->lsn = *lsnp;
+			if ((ret = memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0)
+				goto fatal;
+			rp = NULL;
+		}
+
+		/*
+		 * If the parent page is wrong, update it.  This is of interest
+		 * only if it was a root split, since root splits create parent
+		 * pages.  All other splits modify a parent page, but those are
+		 * separately logged and recovered.
+		 */
+		if (rootsplit && p_update) {
+			if (file_dbp->type == DB_BTREE)
+				P_INIT(pp, file_dbp->pgsize,
+				    PGNO_ROOT, PGNO_INVALID, PGNO_INVALID,
+				    _lp->level + 1, P_IBTREE);
+			else
+				P_INIT(pp, file_dbp->pgsize,
+				    PGNO_ROOT, PGNO_INVALID, PGNO_INVALID,
+				    _lp->level + 1, P_IRECNO);
+			RE_NREC_SET(pp,
+			    file_dbp->type == DB_RECNO ||
+			    F_ISSET(file_dbp, DB_BT_RECNUM) ?
+			    __bam_total(_lp) + __bam_total(_rp) : 0);
+			pp->lsn = *lsnp;
+			if ((ret = memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0)
+				goto fatal;
+			pp = NULL;
+		}
+
+		/*
+		 * Finally, redo the next-page link if necessary.  This is of
+		 * interest only if it wasn't a root split -- inserting a new
+		 * page in the tree requires that any following page have its
+		 * previous-page pointer updated to our new page.  The next
+		 * page had better exist.
+		 */
+		if (!rootsplit && !IS_ZERO_LSN(argp->nlsn)) {
+			if ((ret = memp_fget(mpf, &argp->npgno, 0, &np)) != 0) {
+				(void)__db_pgerr(file_dbp, argp->npgno);
+				np = NULL;
+				goto out;
+			}
+			if (log_compare(&LSN(np), &argp->nlsn) == 0) {
+				PREV_PGNO(np) = argp->right;
+				np->lsn = *lsnp;
+				if ((ret = memp_fput(mpf,
+				    np, DB_MPOOL_DIRTY)) != 0)
+					goto fatal;
+				np = NULL;
+			}
+		}
+	} else {
+		/*
+		 * If the split page is wrong, replace its contents with the
+		 * logged page contents.  The split page had better exist.
+		 */
+		if ((ret = memp_fget(mpf, &pgno, 0, &pp)) != 0) {
+			(void)__db_pgerr(file_dbp, pgno);
+			pp = NULL;
+			goto out;
+		}
+		if (log_compare(lsnp, &LSN(pp)) == 0) {
+			memcpy(pp, argp->pg.data, argp->pg.size);
+			if ((ret = memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0)
+				goto fatal;
+			pp = NULL;
+		}
+
+		/*
+		 * If it's a root split and the left child ever existed, reset
+		 * its LSN to the pre-split value.  (If it's not a root split,
+		 * we just updated the left page -- it's the same as the split
+		 * page.)  If the right child ever existed, root split or not,
+		 * reset its LSN as well.  Only the LSNs are rewound here.
+		 */
+		if ((rootsplit && lp != NULL) || rp != NULL) {
+			if (rootsplit && lp != NULL &&
+			    log_compare(lsnp, &LSN(lp)) == 0) {
+				lp->lsn = argp->llsn;
+				if ((ret =
+				    memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0)
+					goto fatal;
+				lp = NULL;
+			}
+			if (rp != NULL &&
+			    log_compare(lsnp, &LSN(rp)) == 0) {
+				rp->lsn = argp->rlsn;
+				if ((ret =
+				    memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0)
+					goto fatal;
+				rp = NULL;
+			}
+		}
+
+		/*
+		 * Finally, undo the next-page link if necessary.  This is of
+		 * interest only if it wasn't a root split -- inserting a new
+		 * page in the tree requires that any following page have its
+		 * previous-page pointer updated to our new page.  The next
+		 * page had better exist.
+		 */
+		if (!rootsplit && !IS_ZERO_LSN(argp->nlsn)) {
+			if ((ret = memp_fget(mpf, &argp->npgno, 0, &np)) != 0) {
+				(void)__db_pgerr(file_dbp, argp->npgno);
+				np = NULL;
+				goto out;
+			}
+			if (log_compare(lsnp, &LSN(np)) == 0) {
+				PREV_PGNO(np) = argp->left;
+				np->lsn = argp->nlsn;
+				/* Keep the error code for the caller. */
+				if ((ret = memp_fput(mpf,
+				    np, DB_MPOOL_DIRTY)) != 0)
+					goto fatal;
+				np = NULL;
+			}
+		}
+	}
+
+done:	ret = 0;
+	*lsnp = argp->prev_lsn;
+
+	if (0) {
+fatal:		(void)__db_panic(file_dbp);
+	}
+out:	/* Free any pages that weren't dirtied. */
+	if (pp != NULL && (t_ret = memp_fput(mpf, pp, 0)) != 0 && ret == 0)
+		ret = t_ret;
+	if (lp != NULL && (t_ret = memp_fput(mpf, lp, 0)) != 0 && ret == 0)
+		ret = t_ret;
+	if (np != NULL && (t_ret = memp_fput(mpf, np, 0)) != 0 && ret == 0)
+		ret = t_ret;
+	if (rp != NULL && (t_ret = memp_fput(mpf, rp, 0)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Free any allocated space. */
+	if (_lp != NULL)
+		free(_lp);
+	if (_rp != NULL)
+		free(_rp);
+
+	/* REC_CLOSE is defined outside this view; presumably returns ret. */
+	REC_CLOSE;
+}
+
+/*
+ * __bam_rsplit_recover --
+ *	Recovery function for a reverse split.
+ *
+ * logp, dbtp and info are consumed by the REC_PRINT/REC_INTRO macros
+ * (defined outside this view; they are expected to set argp, file_dbp,
+ * mdbp and mpf -- confirm).  lsnp is the LSN of the log record on entry
+ * and is set to the record's prev_lsn on success.  redo is non-zero to
+ * reapply the reverse split and zero to undo it.
+ *
+ * PUBLIC: int __bam_rsplit_recover
+ * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ */
+int
+__bam_rsplit_recover(logp, dbtp, lsnp, redo, info)
+	DB_LOG *logp;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	int redo;
+	void *info;
+{
+	__bam_rsplit_args *argp;
+	DB *file_dbp, *mdbp;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	db_pgno_t pgno;
+	int cmp_n, cmp_p, modified, ret;
+
+	REC_PRINT(__bam_rsplit_print);
+	REC_INTRO(__bam_rsplit_read);
+
+	/* Fix the root page. */
+	pgno = PGNO_ROOT;
+	if ((ret = memp_fget(mpf, &pgno, 0, &pagep)) != 0) {
+		(void)__db_pgerr(file_dbp, pgno);
+		pagep = NULL;
+		goto out;
+	}
+	modified = 0;
+	cmp_n = log_compare(lsnp, &LSN(pagep));
+	cmp_p = log_compare(&LSN(pagep), &argp->rootlsn);
+	if (cmp_p == 0 && redo) {
+		/*
+		 * Need to redo update described: the logged copy of the
+		 * child page becomes the root.
+		 */
+		memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size);
+		pagep->pgno = PGNO_ROOT;
+		pagep->lsn = *lsnp;
+		modified = 1;
+	} else if (cmp_n == 0 && !redo) {
+		/*
+		 * Need to undo update described: rebuild the root one level
+		 * up and put the logged root entry back in slot 0.
+		 */
+		P_INIT(pagep, file_dbp->pgsize, PGNO_ROOT,
+		    PGNO_INVALID, PGNO_INVALID, pagep->level + 1, TYPE(pagep));
+		if ((ret = __db_pitem(file_dbp, pagep, 0,
+		    argp->rootent.size, &argp->rootent, NULL)) != 0) {
+			/* Don't leave the root page pinned on failure. */
+			(void)memp_fput(mpf, pagep, 0);
+			goto out;
+		}
+		pagep->lsn = argp->rootlsn;
+		modified = 1;
+	}
+	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
+		(void)__db_panic(file_dbp);
+		goto out;
+	}
+
+	/* Fix the page copied over the root page. */
+	if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) {
+		(void)__db_pgerr(file_dbp, argp->pgno);
+		pagep = NULL;
+		goto out;
+	}
+	modified = 0;
+	cmp_n = log_compare(lsnp, &LSN(pagep));
+	cmp_p = log_compare(&LSN(pagep), &LSN(argp->pgdbt.data));
+	if (cmp_p == 0 && redo) {
+		/* Need to redo update described: only the LSN moves on. */
+		pagep->lsn = *lsnp;
+		modified = 1;
+	} else if (cmp_n == 0 && !redo) {
+		/* Need to undo update described: restore the logged page. */
+		memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size);
+		modified = 1;
+	}
+	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
+		(void)__db_panic(file_dbp);
+		goto out;
+	}
+
+	ret = 0;
+	*lsnp = argp->prev_lsn;
+
+	/* REC_CLOSE is defined outside this view; presumably returns ret. */
+out:	REC_CLOSE;
+}
+
+/*
+ * __bam_adj_recover --
+ *	Recovery function for adj: replays or reverses one index
+ *	adjustment (__bam_adjindx) on a single page.
+ *
+ * lsnp holds the log record's LSN on entry and receives the record's
+ * prev_lsn once the page has been put back successfully.  redo selects
+ * between reapplying (non-zero) and undoing (zero) the adjustment.
+ *
+ * PUBLIC: int __bam_adj_recover
+ * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ */
+int
+__bam_adj_recover(logp, dbtp, lsnp, redo, info)
+	DB_LOG *logp;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	int redo;
+	void *info;
+{
+	__bam_adj_args *argp;
+	DB *file_dbp, *mdbp;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp_n, cmp_p, modified, ret;
+
+	REC_PRINT(__bam_adj_print);
+	REC_INTRO(__bam_adj_read);
+
+	ret = memp_fget(mpf, &argp->pgno, 0, &pagep);
+	if (ret != 0) {
+		(void)__db_pgerr(file_dbp, argp->pgno);
+		pagep = NULL;
+		goto out;
+	}
+
+	/* Where does the page stand relative to this log record? */
+	cmp_n = log_compare(lsnp, &LSN(pagep));
+	cmp_p = log_compare(&LSN(pagep), &argp->lsn);
+
+	modified = 0;
+	if (redo && cmp_p == 0) {
+		/* Page predates the record: apply the adjustment as logged. */
+		ret = __bam_adjindx(file_dbp,
+		    pagep, argp->indx, argp->indx_copy, argp->is_insert);
+		if (ret != 0)
+			goto err;
+
+		LSN(pagep) = *lsnp;
+		modified = 1;
+	} else if (!redo && cmp_n == 0) {
+		/* Page reflects the record: apply the inverse adjustment. */
+		ret = __bam_adjindx(file_dbp,
+		    pagep, argp->indx, argp->indx_copy, !argp->is_insert);
+		if (ret != 0)
+			goto err;
+
+		LSN(pagep) = argp->lsn;
+		modified = 1;
+	}
+
+	ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0);
+	if (ret == 0)
+		*lsnp = argp->prev_lsn;
+	goto out;
+
+	/* The adjustment failed: release the page without dirtying it. */
+err:	(void)memp_fput(mpf, pagep, 0);
+
+out:	REC_CLOSE;
+}
+
+/*
+ * __bam_cadjust_recover --
+ *	Recovery function for the adjust of a count change in an internal
+ *	page.
+ *
+ * logp, dbtp and info are consumed by the REC_PRINT/REC_INTRO macros
+ * (defined outside this view; they are expected to set argp, file_dbp,
+ * mdbp and mpf -- confirm).  lsnp is the LSN of the log record on entry
+ * and is set to the record's prev_lsn once the page has been put back.
+ * redo is non-zero to reapply the adjustment and zero to undo it.
+ *
+ * Redo adds argp->adjust to the entry's record count, and to the root
+ * page's total when argp->total is set; undo must subtract the same
+ * amount from both.
+ *
+ * PUBLIC: int __bam_cadjust_recover
+ * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ */
+int
+__bam_cadjust_recover(logp, dbtp, lsnp, redo, info)
+	DB_LOG *logp;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	int redo;
+	void *info;
+{
+	__bam_cadjust_args *argp;
+	DB *file_dbp, *mdbp;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp_n, cmp_p, modified, ret;
+
+	REC_PRINT(__bam_cadjust_print);
+	REC_INTRO(__bam_cadjust_read);
+
+	if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) {
+		errno = __db_pgerr(file_dbp, argp->pgno);
+		pagep = NULL;
+		goto out;
+	}
+
+	modified = 0;
+	cmp_n = log_compare(lsnp, &LSN(pagep));
+	cmp_p = log_compare(&LSN(pagep), &argp->lsn);
+	if (cmp_p == 0 && redo) {
+		/* Need to redo update described. */
+		if (file_dbp->type == DB_BTREE &&
+		    F_ISSET(file_dbp, DB_BT_RECNUM)) {
+			GET_BINTERNAL(pagep, argp->indx)->nrecs += argp->adjust;
+			if (argp->total && PGNO(pagep) == PGNO_ROOT)
+				RE_NREC_ADJ(pagep, argp->adjust);
+		}
+		if (file_dbp->type == DB_RECNO) {
+			GET_RINTERNAL(pagep, argp->indx)->nrecs += argp->adjust;
+			if (argp->total && PGNO(pagep) == PGNO_ROOT)
+				RE_NREC_ADJ(pagep, argp->adjust);
+		}
+
+		LSN(pagep) = *lsnp;
+		modified = 1;
+	} else if (cmp_n == 0 && !redo) {
+		/* Need to undo update described. */
+		if (file_dbp->type == DB_BTREE &&
+		    F_ISSET(file_dbp, DB_BT_RECNUM)) {
+			GET_BINTERNAL(pagep, argp->indx)->nrecs -= argp->adjust;
+			/*
+			 * The root total has to be backed out with the
+			 * negated adjustment, exactly as in the recno
+			 * case below.
+			 */
+			if (argp->total && PGNO(pagep) == PGNO_ROOT)
+				RE_NREC_ADJ(pagep, -(argp->adjust));
+		}
+		if (file_dbp->type == DB_RECNO) {
+			GET_RINTERNAL(pagep, argp->indx)->nrecs -= argp->adjust;
+			if (argp->total && PGNO(pagep) == PGNO_ROOT)
+				RE_NREC_ADJ(pagep, -(argp->adjust));
+		}
+		LSN(pagep) = argp->lsn;
+		modified = 1;
+	}
+	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) == 0)
+		*lsnp = argp->prev_lsn;
+
+	/* REC_CLOSE is defined outside this view; presumably returns ret. */
+out:	REC_CLOSE;
+}
+
+/*
+ * __bam_cdel_recover --
+ *	Recovery function for the intent-to-delete of a cursor record:
+ *	sets or clears the deleted flag on the data item of the logged
+ *	key/data pair.
+ *
+ * lsnp holds the log record's LSN on entry and receives the record's
+ * prev_lsn once the page has been put back successfully.  redo selects
+ * between reapplying (non-zero) and undoing (zero) the mark.
+ *
+ * PUBLIC: int __bam_cdel_recover
+ * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ */
+int
+__bam_cdel_recover(logp, dbtp, lsnp, redo, info)
+	DB_LOG *logp;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	int redo;
+	void *info;
+{
+	__bam_cdel_args *argp;
+	DB *file_dbp, *mdbp;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp_n, cmp_p, modified, ret;
+
+	REC_PRINT(__bam_cdel_print);
+	REC_INTRO(__bam_cdel_read);
+
+	ret = memp_fget(mpf, &argp->pgno, 0, &pagep);
+	if (ret != 0) {
+		(void)__db_pgerr(file_dbp, argp->pgno);
+		pagep = NULL;
+		goto out;
+	}
+
+	/* Where does the page stand relative to this log record? */
+	cmp_n = log_compare(lsnp, &LSN(pagep));
+	cmp_p = log_compare(&LSN(pagep), &argp->lsn);
+
+	modified = 0;
+	if (redo) {
+		if (cmp_p == 0) {
+			/* Page predates the record: mark the item deleted. */
+			GET_BKEYDATA(pagep, argp->indx + O_INDX)->deleted = 1;
+			LSN(pagep) = *lsnp;
+			modified = 1;
+		}
+	} else {
+		if (cmp_n == 0) {
+			/* Page reflects the record: clear the mark again. */
+			GET_BKEYDATA(pagep, argp->indx + O_INDX)->deleted = 0;
+			LSN(pagep) = argp->lsn;
+			modified = 1;
+		}
+	}
+
+	ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0);
+	if (ret == 0)
+		*lsnp = argp->prev_lsn;
+
+out:	REC_CLOSE;
+}
diff --git a/db2/btree/bt_recno.c b/db2/btree/bt_recno.c
new file mode 100644
index 0000000000..cd8872a064
--- /dev/null
+++ b/db2/btree/bt_recno.c
@@ -0,0 +1,1195 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)bt_recno.c 10.12 (Sleepycat) 8/25/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+
+static int __ram_add __P((DB *, db_recno_t *, DBT *, int, int));
+static int __ram_c_close __P((DBC *));
+static int __ram_c_del __P((DBC *, int));
+static int __ram_c_get __P((DBC *, DBT *, DBT *, int));
+static int __ram_c_put __P((DBC *, DBT *, DBT *, int));
+static int __ram_fmap __P((DB *, db_recno_t));
+static int __ram_get __P((DB *, DB_TXN *, DBT *, DBT *, int));
+static int __ram_put __P((DB *, DB_TXN *, DBT *, DBT *, int));
+static int __ram_source __P((DB *, RECNO *, const char *));
+static int __ram_sync __P((DB *, int));
+static int __ram_update __P((DB *, db_recno_t, int));
+static int __ram_vmap __P((DB *, db_recno_t));
+static int __ram_writeback __P((DB *));
+
+/*
+ * If we're renumbering records, then we have to detect in the cursor that a
+ * record was deleted, and adjust the cursor as necessary. If not renumbering
+ * records, then we can detect this by looking at the actual record, so we
+ * ignore the cursor delete flag.
+ *
+ * CD_SET and CD_CLR are wrapped in do { } while (0) so that each expands to
+ * exactly one statement and stays safe as the body of an unbraced if/else.
+ * Invoke them with a trailing semicolon.  CD_ISSET is an expression.
+ */
+#define	CD_SET(dbp, cp) do {						\
+	if (F_ISSET(dbp, DB_RE_RENUMBER))				\
+		F_SET(cp, CR_DELETED);					\
+} while (0)
+#define	CD_CLR(dbp, cp) do {						\
+	if (F_ISSET(dbp, DB_RE_RENUMBER))				\
+		F_CLR(cp, CR_DELETED);					\
+} while (0)
+#define	CD_ISSET(dbp, cp)						\
+	(F_ISSET(dbp, DB_RE_RENUMBER) && F_ISSET(cp, CR_DELETED))
+
+/*
+ * __ram_open --
+ *	Recno open function.
+ *
+ *	Allocates the private RECNO structure, opens and maps the backing
+ *	source file when dbinfo names one, copies the delimiter, pad and
+ *	record-length settings, opens the underlying btree, installs the
+ *	recno method table on dbp, and optionally snapshots the source file.
+ *
+ *	Returns 0 on success.  On failure returns a non-zero error value
+ *	after releasing everything this function acquired.
+ *
+ * PUBLIC: int __ram_open __P((DB *, DBTYPE, DB_INFO *));
+ */
+int
+__ram_open(dbp, type, dbinfo)
+	DB *dbp;
+	DBTYPE type;
+	DB_INFO *dbinfo;
+{
+	BTREE *t;
+	RECNO *rp;
+	int ret;
+
+	ret = 0;
+
+	/* Allocate and initialize the private RECNO structure. */
+	if ((rp = (RECNO *)calloc(1, sizeof(*rp))) == NULL)
+		return (ENOMEM);
+
+	/*
+	 * calloc() leaves re_fd as 0, which is a valid descriptor.  Mark it
+	 * as unopened right away so the error path below can never close
+	 * descriptor 0 when we fail before a source file has been opened.
+	 */
+	rp->re_fd = -1;
+
+	if (dbinfo != NULL) {
+		/*
+		 * If the user specified a source tree, open it and map it in.
+		 *
+		 * !!!
+		 * We don't complain if the user specified transactions or
+		 * threads.  It's possible to make it work, but you'd better
+		 * know what you're doing!
+		 */
+		if (dbinfo->re_source == NULL) {
+			rp->re_fd = -1;
+			F_SET(rp, RECNO_EOF);
+		} else {
+			if ((ret =
+			    __ram_source(dbp, rp, dbinfo->re_source)) != 0)
+				goto err;
+		}
+
+		/* Copy delimiter, length and padding values. */
+		rp->re_delim =
+		    F_ISSET(dbp, DB_RE_DELIMITER) ? dbinfo->re_delim : '\n';
+		rp->re_pad = F_ISSET(dbp, DB_RE_PAD) ? dbinfo->re_pad : ' ';
+
+		if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
+			if ((rp->re_len = dbinfo->re_len) == 0) {
+				__db_err(dbp->dbenv,
+				    "record length must be greater than 0");
+				ret = EINVAL;
+				goto err;
+			}
+		} else
+			rp->re_len = 0;
+	} else {
+		rp->re_delim = '\n';
+		rp->re_pad = ' ';
+		rp->re_fd = -1;
+		F_SET(rp, RECNO_EOF);
+	}
+
+	/* Open the underlying btree. */
+	if ((ret = __bam_open(dbp, DB_RECNO, dbinfo)) != 0)
+		goto err;
+
+	/* Set the routines necessary to make it look like a recno tree. */
+	dbp->cursor = __ram_cursor;
+	dbp->del = __ram_delete;
+	dbp->get = __ram_get;
+	dbp->put = __ram_put;
+	dbp->sync = __ram_sync;
+
+	/* Link in the private recno structure. */
+	((BTREE *)dbp->internal)->bt_recno = rp;
+
+	/* If we're snapshotting an underlying source file, do it now. */
+	if (dbinfo != NULL && F_ISSET(dbinfo, DB_SNAPSHOT))
+		if ((ret = __ram_snapshot(dbp)) != 0 && ret != DB_NOTFOUND)
+			goto err;
+
+	return (0);
+
+err:	/* If we mmap'd a source file, discard it. */
+	if (rp->re_smap != NULL)
+		(void)__db_munmap(rp->re_smap, rp->re_msize);
+
+	/* If we opened a source file, discard it. */
+	if (rp->re_fd != -1)
+		(void)__db_close(rp->re_fd);
+	if (rp->re_source != NULL)
+		FREES(rp->re_source);
+
+	/*
+	 * If we allocated room for key/data return, discard it.
+	 *
+	 * We can get here before __bam_open() has been called, and this
+	 * function only touches dbp->internal after that call succeeds, so
+	 * the pointer is presumably still NULL on the early failures; don't
+	 * dereference it blindly.  Clear what we free so that nothing is left
+	 * pointing at released memory.
+	 */
+	if ((t = dbp->internal) != NULL) {
+		if (t->bt_rkey.data != NULL) {
+			free(t->bt_rkey.data);
+			t->bt_rkey.data = NULL;
+			t->bt_rkey.ulen = 0;
+		}
+		if (t->bt_recno == rp)
+			t->bt_recno = NULL;
+	}
+
+	FREE(rp, sizeof(*rp));
+
+	return (ret);
+}
+
+/*
+ * __ram_cursor --
+ *	Recno db->cursor function.
+ *
+ *	Builds a DBC together with its private RCURSOR, wires in the recno
+ *	cursor methods, adds the cursor to dbp's cursor queue and hands it
+ *	back through dbcp.  Returns 0, or ENOMEM if either allocation fails.
+ *
+ * PUBLIC: int __ram_cursor __P((DB *, DB_TXN *, DBC **));
+ */
+int
+__ram_cursor(dbp, txn, dbcp)
+	DB *dbp;
+	DB_TXN *txn;
+	DBC **dbcp;
+{
+	DBC *cursor;
+	RCURSOR *rcp;
+
+	DEBUG_LWRITE(dbp, txn, "ram_cursor", NULL, NULL, 0);
+
+	/* Allocate the generic cursor first, then its recno-private half. */
+	cursor = (DBC *)calloc(1, sizeof(*cursor));
+	if (cursor == NULL)
+		return (ENOMEM);
+	rcp = (RCURSOR *)calloc(1, sizeof(*rcp));
+	if (rcp == NULL) {
+		free(cursor);
+		return (ENOMEM);
+	}
+
+	/* The private half starts out not referencing any record. */
+	rcp->recno = RECNO_OOB;
+	rcp->dbc = cursor;
+
+	/* Fill in the public half, including the method table. */
+	cursor->internal = rcp;
+	cursor->dbp = dbp;
+	cursor->txn = txn;
+	cursor->c_get = __ram_c_get;
+	cursor->c_put = __ram_c_put;
+	cursor->c_del = __ram_c_del;
+	cursor->c_close = __ram_c_close;
+
+	/* Every cursor is queued on the DB structure it belongs to. */
+	DB_THREAD_LOCK(dbp);
+	TAILQ_INSERT_HEAD(&dbp->curs_queue, cursor, links);
+	DB_THREAD_UNLOCK(dbp);
+
+	*dbcp = cursor;
+	return (0);
+}
+
+/*
+ * __ram_get --
+ *	Recno db->get function.
+ *
+ *	Validates the flags, resolves the record number held in key, looks
+ *	the record up in the tree and copies its data out through data.
+ *
+ *	Returns 0 on success, DB_NOTFOUND if the search did not land on the
+ *	record, DB_KEYEMPTY if the record is marked deleted, or whatever
+ *	error the helpers report.  Once the handle has been acquired, every
+ *	exit goes through "done" so that it is always released.
+ */
+static int
+__ram_get(argdbp, txn, key, data, flags)
+	DB *argdbp;
+	DB_TXN *txn;
+	DBT *key, *data;
+	int flags;
+{
+	BTREE *t;
+	DB *dbp;
+	PAGE *h;
+	db_indx_t indx;
+	db_recno_t recno;
+	int exact, ret, stack;
+
+	stack = 0;
+
+	DEBUG_LWRITE(argdbp, txn, "ram_get", key, NULL, flags);
+
+	/* Check for invalid flags. */
+	if ((ret = __db_getchk(argdbp, key, data, flags)) != 0)
+		return (ret);
+
+	GETHANDLE(argdbp, txn, &dbp, ret);
+	t = dbp->internal;
+
+	/* Check the user's record number and fill in as necessary. */
+	if ((ret = __ram_getno(dbp, key, &recno, 0)) != 0)
+		goto done;
+
+	/* Search the tree for the record. */
+	if ((ret = __bam_rsearch(dbp, &recno, S_FIND, 1, &exact)) != 0)
+		goto done;
+	if (!exact) {
+		/*
+		 * Leave through the common exit rather than returning
+		 * directly, otherwise the handle acquired above is never
+		 * given back.
+		 */
+		ret = DB_NOTFOUND;
+		goto done;
+	}
+	stack = 1;
+
+	h = t->bt_csp->page;
+	indx = t->bt_csp->indx;
+
+	/* If the record has already been deleted, we couldn't have found it. */
+	if (GET_BKEYDATA(h, indx)->deleted) {
+		ret = DB_KEYEMPTY;
+		goto done;
+	}
+
+	/* Return the data item. */
+	ret = __db_ret(dbp,
+	    h, indx, data, &t->bt_rdata.data, &t->bt_rdata.ulen);
+	++t->lstat.bt_get;
+
+done:	/* Discard the stack. */
+	if (stack)
+		__bam_stkrel(dbp);
+
+	PUTHANDLE(dbp);
+	return (ret);
+}
+
+/*
+ * __ram_put --
+ *	Recno db->put function.
+ *
+ *	Stores data under the record number held in key.  With DB_APPEND the
+ *	rest of the backing source file is read in first, and the record
+ *	number that was used is copied back out through key.
+ */
+static int
+__ram_put(argdbp, txn, key, data, flags)
+	DB *argdbp;
+	DB_TXN *txn;
+	DBT *key, *data;
+	int flags;
+{
+	BTREE *bt;
+	DB *dbp;
+	db_recno_t recno;
+	int ret;
+
+	DEBUG_LWRITE(argdbp, txn, "ram_put", key, data, flags);
+
+	/* Reject bad flag combinations and writes to read-only trees. */
+	ret = __db_putchk(argdbp,
+	    key, data, flags, F_ISSET(argdbp, DB_AM_RDONLY), 0);
+	if (ret != 0)
+		return (ret);
+
+	GETHANDLE(argdbp, txn, &dbp, ret);
+
+	/*
+	 * An append needs the whole backing source file in the tree first.
+	 * Anything else supplies its own record number, which we check and
+	 * fill in up to as necessary.
+	 */
+	if (LF_ISSET(DB_APPEND))
+		ret = __ram_snapshot(dbp);
+	else
+		ret = __ram_getno(dbp, key, &recno, 1);
+	if (ret != 0)
+		goto done;
+
+	/* Add the record. */
+	if ((ret = __ram_add(dbp, &recno, data, flags, 0)) != 0)
+		goto done;
+
+	/* An append has to tell the caller which record it created. */
+	if (LF_ISSET(DB_APPEND)) {
+		bt = dbp->internal;
+		ret = __db_retcopy(key, &recno, sizeof(recno),
+		    &bt->bt_rkey.data, &bt->bt_rkey.ulen, dbp->db_malloc);
+	}
+
+done:	PUTHANDLE(dbp);
+	return (ret);
+}
+
+/*
+ * __ram_sync --
+ *	Recno db->sync function.
+ *
+ *	Syncs the underlying btree and, if that succeeds, rewrites the
+ *	backing source file through __ram_writeback().
+ */
+static int
+__ram_sync(argdbp, flags)
+	DB *argdbp;
+	int flags;
+{
+	DB *dbp;
+	int ret;
+
+	DEBUG_LWRITE(argdbp, NULL, "ram_sync", NULL, NULL, flags);
+
+	/* The btree goes first; don't touch the source file if it fails. */
+	ret = __bam_sync(argdbp, flags);
+	if (ret != 0)
+		return (ret);
+
+	/* Now copy the records back out to the backing source file. */
+	GETHANDLE(argdbp, NULL, &dbp, ret);
+	ret = __ram_writeback(dbp);
+	PUTHANDLE(dbp);
+
+	return (ret);
+}
+
+/*
+ * __ram_close --
+ *	Recno db->close function.
+ *
+ *	Releases the mapping, descriptor and file name of the backing source
+ *	file, frees the private RECNO structure, and then closes the
+ *	underlying btree, whose return value is passed back to the caller.
+ *
+ * PUBLIC: int __ram_close __P((DB *));
+ */
+int
+__ram_close(argdbp)
+	DB *argdbp;
+{
+	BTREE *t;
+	RECNO *rp;
+
+	DEBUG_LWRITE(argdbp, NULL, "ram_close", NULL, NULL, 0);
+
+	t = (BTREE *)argdbp->internal;
+	rp = t->bt_recno;
+
+	/* Drop the mapped view of the source file, if there is one. */
+	if (rp->re_smap != NULL)
+		(void)__db_munmap(rp->re_smap, rp->re_msize);
+
+	/* Drop the source file descriptor, if it is open. */
+	if (rp->re_fd != -1)
+		(void)__db_close(rp->re_fd);
+
+	/* Drop the source file name, if one was recorded. */
+	if (rp->re_source != NULL)
+		FREES(rp->re_source);
+
+	/* Release the structure itself and unhook it from the btree. */
+	FREE(rp, sizeof(RECNO));
+	t->bt_recno = NULL;
+
+	/* Close the underlying btree. */
+	return (__bam_close(argdbp));
+}
+
+/*
+ * __ram_c_close --
+ *	Recno cursor->close function.
+ *
+ *	Takes the cursor off its DB's cursor queue and frees both the
+ *	private RCURSOR and the DBC.  Always returns 0.
+ */
+static int
+__ram_c_close(dbc)
+	DBC *dbc;
+{
+	DB *owner;
+
+	owner = dbc->dbp;
+
+	DEBUG_LWRITE(owner, dbc->txn, "ram_c_close", NULL, NULL, 0);
+
+	/* Unlink from the queue while holding the DB's thread lock. */
+	DB_THREAD_LOCK(owner);
+	TAILQ_REMOVE(&owner->curs_queue, dbc, links);
+	DB_THREAD_UNLOCK(owner);
+
+	/* Private part first, then the cursor that points at it. */
+	FREE(dbc->internal, sizeof(RCURSOR));
+	FREE(dbc, sizeof(DBC));
+
+	return (0);
+}
+
+/*
+ * __ram_c_del --
+ *	Recno cursor->c_del function.
+ *
+ *	Deletes the record the cursor references by issuing an ordinary
+ *	__ram_delete() keyed on the cursor's record number.  Returns
+ *	DB_KEYEMPTY if the cursor is already flagged as deleted.
+ */
+static int
+__ram_c_del(dbc, flags)
+	DBC *dbc;
+	int flags;
+{
+	DB *dbp;
+	DBT key;
+	RCURSOR *cp;
+	int ret;
+
+	dbp = dbc->dbp;
+
+	DEBUG_LWRITE(dbp, dbc->txn, "ram_c_del", NULL, NULL, flags);
+
+	cp = dbc->internal;
+
+	/* Check for invalid flags. */
+	ret = __db_cdelchk(dbp, flags,
+	    F_ISSET(dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB);
+	if (ret != 0)
+		return (ret);
+
+	/* A record can only be deleted through the cursor once. */
+	if (CD_ISSET(dbp, cp))
+		return (DB_KEYEMPTY);
+
+	/* Describe the cursor's record as a key and delete it normally. */
+	memset(&key, 0, sizeof(key));
+	key.size = sizeof(db_recno_t);
+	key.data = &cp->recno;
+
+	ret = __ram_delete(dbp, dbc->txn, &key, 0);
+	if (ret == 0) {
+		CD_SET(dbp, cp);
+	}
+
+	return (ret);
+}
+
+/*
+ * __ram_c_get --
+ *	Recno cursor->c_get function.
+ *
+ *	Moves the cursor as requested by flags (DB_CURRENT, DB_NEXT,
+ *	DB_FIRST, DB_PREV, DB_LAST, DB_SET, DB_SET_RANGE), copies the record
+ *	number out through key unless the caller supplied it (DB_SET and
+ *	DB_SET_RANGE), and fetches the data with __ram_get().  For DB_NEXT
+ *	and DB_PREV, records that report DB_KEYEMPTY are skipped.
+ *
+ *	Every failure after the handle has been acquired leaves through the
+ *	"err" label, which puts the cursor back the way it was on entry and
+ *	releases the handle.
+ */
+static int
+__ram_c_get(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key, *data;
+	int flags;
+{
+	BTREE *t;
+	DB *dbp;
+	RCURSOR *cp, copy;
+	int ret;
+
+	DEBUG_LREAD(dbc->dbp, dbc->txn, "ram_c_get",
+	    flags == DB_SET || flags == DB_SET_RANGE ? key : NULL,
+	    NULL, flags);
+
+	cp = dbc->internal;
+	dbp = dbc->dbp;
+
+	/* Check for invalid flags. */
+	if ((ret = __db_cgetchk(dbc->dbp,
+	    key, data, flags, cp->recno != RECNO_OOB)) != 0)
+		return (ret);
+
+	GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret);
+	t = dbp->internal;
+
+	/* Remember the cursor so it can be restored if we fail. */
+	copy = *cp;
+
+retry:	/* Update the record number. */
+	switch (flags) {
+	case DB_CURRENT:
+		if (CD_ISSET(dbp, cp)) {
+			ret = DB_KEYEMPTY;
+			goto err;
+		}
+		break;
+	case DB_NEXT:
+		/*
+		 * If the current record was deleted, the cursor already
+		 * references its successor.
+		 */
+		if (CD_ISSET(dbp, cp))
+			break;
+		if (cp->recno != RECNO_OOB) {
+			++cp->recno;
+			break;
+		}
+		/* FALLTHROUGH */
+	case DB_FIRST:
+		flags = DB_NEXT;
+		cp->recno = 1;
+		break;
+	case DB_PREV:
+		if (cp->recno != RECNO_OOB) {
+			/*
+			 * Nothing precedes record 1.  Don't return directly:
+			 * the handle must be released, and the cursor may
+			 * have been walked here by earlier retries.
+			 */
+			if (cp->recno == 1) {
+				ret = DB_NOTFOUND;
+				goto err;
+			}
+			--cp->recno;
+			break;
+		}
+		/* FALLTHROUGH */
+	case DB_LAST:
+		flags = DB_PREV;
+		if (((ret = __ram_snapshot(dbp)) != 0) && ret != DB_NOTFOUND)
+			goto err;
+		if ((ret = __bam_nrecs(dbp, &cp->recno)) != 0)
+			goto err;
+		/* An empty tree has no last record. */
+		if (cp->recno == 0) {
+			ret = DB_NOTFOUND;
+			goto err;
+		}
+		break;
+	case DB_SET:
+	case DB_SET_RANGE:
+		if ((ret = __ram_getno(dbp, key, &cp->recno, 0)) != 0)
+			goto err;
+		break;
+	}
+
+	/*
+	 * Return the key if the user didn't give us one, and then pass it
+	 * into __ram_get().
+	 */
+	if (flags != DB_SET && flags != DB_SET_RANGE &&
+	    (ret = __db_retcopy(key, &cp->recno, sizeof(cp->recno),
+	    &t->bt_rkey.data, &t->bt_rkey.ulen, dbp->db_malloc)) != 0)
+		goto err;
+
+	/*
+	 * The cursor was reset, so the delete adjustment is no
+	 * longer necessary.
+	 */
+	CD_CLR(dbp, cp);
+
+	/*
+	 * Retrieve the record.
+	 *
+	 * Skip any keys that don't really exist.
+	 */
+	if ((ret = __ram_get(dbp, dbc->txn, key, data, 0)) != 0)
+		if (ret == DB_KEYEMPTY &&
+		    (flags == DB_NEXT || flags == DB_PREV))
+			goto retry;
+
+err:	if (ret != 0)
+		*cp = copy;
+
+	PUTHANDLE(dbp);
+	return (ret);
+}
+
+/*
+ * __ram_c_put --
+ *	Recno cursor->c_put function.
+ *
+ *	Stores data at the cursor's position (DB_CURRENT) or as a new record
+ *	immediately after/before it (DB_AFTER/DB_BEFORE), splitting pages
+ *	and retrying when the insert reports DB_NEEDSPLIT.  For the insert
+ *	flavours the record counts and the other cursors are adjusted and
+ *	this cursor is left referencing the new record.
+ *
+ *	On any failure the cursor is restored to its state on entry.
+ */
+static int
+__ram_c_put(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key, *data;
+	int flags;
+{
+	BTREE *t;
+	RCURSOR *cp, copy;
+	DB *dbp;
+	int exact, ret, t_ret;
+	void *arg;
+
+	DEBUG_LWRITE(dbc->dbp, dbc->txn, "ram_c_put", NULL, data, flags);
+
+	cp = dbc->internal;
+
+	if ((ret = __db_cputchk(dbc->dbp, key, data, flags,
+	    F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0)
+		return (ret);
+
+	GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret);
+	t = dbp->internal;
+
+	/* Remember the cursor so it can be restored if we fail. */
+	copy = *cp;
+
+	/*
+	 * To split, we need a valid key for the page.  Since it's a cursor,
+	 * we have to build one.
+	 *
+	 * The split code discards all short-term locks and stack pages.
+	 *
+	 * The "if (0)" body is only ever entered through the split label.
+	 */
+	if (0) {
+split:		arg = &cp->recno;
+		if ((ret = __bam_split(dbp, arg)) != 0)
+			goto err;
+	}
+
+	if ((ret = __bam_rsearch(dbp, &cp->recno, S_INSERT, 1, &exact)) != 0)
+		goto err;
+	if (!exact) {
+		/*
+		 * The search succeeded, so it left a stack behind (compare
+		 * __ram_add(), which releases it after every successful
+		 * search); don't leave it held on this error path.
+		 */
+		(void)__bam_stkrel(dbp);
+		ret = DB_NOTFOUND;
+		goto err;
+	}
+
+	/*
+	 * Do the insert and then always release the stack, keeping the two
+	 * results apart.  Assigning the release's return value straight to
+	 * ret would silently discard an insert failure and we'd go on to
+	 * adjust counts and cursors for a record that was never stored.
+	 */
+	ret = __bam_iitem(dbp, &t->bt_csp->page,
+	    &t->bt_csp->indx, key, data, flags, 0);
+	t_ret = __bam_stkrel(dbp);
+	if (ret == DB_NEEDSPLIT) {
+		if ((ret = t_ret) != 0)
+			goto err;
+		goto split;
+	}
+	if (ret == 0)
+		ret = t_ret;
+	if (ret != 0)
+		goto err;
+
+	if (flags != DB_CURRENT) {
+		/* Adjust the counts. */
+		if ((ret = __bam_adjust(dbp, t, 1)) != 0)
+			goto err;
+
+		switch (flags) {
+		case DB_AFTER:
+			/* Adjust the cursors. */
+			__ram_ca(dbp, cp->recno, CA_IAFTER);
+
+			/* Set this cursor to reference the new record. */
+			cp->recno = copy.recno + 1;
+			break;
+		case DB_BEFORE:
+			/* Adjust the cursors. */
+			__ram_ca(dbp, cp->recno, CA_IBEFORE);
+
+			/* Set this cursor to reference the new record. */
+			cp->recno = copy.recno;
+			break;
+		}
+
+	}
+
+	/*
+	 * The cursor was reset, so the delete adjustment is no
+	 * longer necessary.
+	 */
+	CD_CLR(dbp, cp);
+
+err:	if (ret != 0)
+		*cp = copy;
+
+	PUTHANDLE(dbp);
+	return (ret);
+}
+
+/*
+ * __ram_ca --
+ *	Adjust cursors.
+ *
+ *	Walks every cursor queued on dbp and renumbers those affected by an
+ *	operation at record "recno":
+ *
+ *	CA_DELETE	record recno was removed; cursors on later records
+ *			move down by one.
+ *	CA_IAFTER	a record was inserted after recno; cursors on later
+ *			records move up by one.
+ *	CA_IBEFORE	a record was inserted before recno; cursors on recno
+ *			or later move up by one.
+ *
+ *	Only cursors positioned past the point of change are touched.  The
+ *	comparisons used to be the other way round, which renumbered the
+ *	cursors in front of the change instead, and decremented cursors
+ *	that were not positioned at all (RECNO_OOB, presumably 0) so that
+ *	their unsigned record number wrapped around.
+ *
+ * PUBLIC: void __ram_ca __P((DB *, db_recno_t, ca_recno_arg));
+ */
+void
+__ram_ca(dbp, recno, op)
+	DB *dbp;
+	db_recno_t recno;
+	ca_recno_arg op;
+{
+	DBC *dbc;
+	RCURSOR *cp;
+
+	/*
+	 * Adjust the cursors.  See the comment in __bam_ca_delete().
+	 */
+	DB_THREAD_LOCK(dbp);
+	for (dbc = TAILQ_FIRST(&dbp->curs_queue);
+	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+		cp = (RCURSOR *)dbc->internal;
+		switch (op) {
+		case CA_DELETE:
+			if (recno < cp->recno)
+				--cp->recno;
+			break;
+		case CA_IAFTER:
+			if (recno < cp->recno)
+				++cp->recno;
+			break;
+		case CA_IBEFORE:
+			if (recno <= cp->recno)
+				++cp->recno;
+			break;
+		}
+	}
+	DB_THREAD_UNLOCK(dbp);
+}
+
+#ifdef DEBUG
+/*
+ * __ram_cprint --
+ *	Display the current recno cursor list.
+ *
+ *	Writes one line per cursor queued on dbp to stderr, giving the
+ *	address of the cursor's private structure and its record number.
+ *	Always returns 0.
+ */
+int
+__ram_cprint(dbp)
+	DB *dbp;
+{
+	DBC *dbc;
+	RCURSOR *cp;
+
+	DB_THREAD_LOCK(dbp);
+	for (dbc = TAILQ_FIRST(&dbp->curs_queue);
+	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+		cp = (RCURSOR *)dbc->internal;
+		/*
+		 * Print the address with %p: squeezing the pointer through
+		 * u_int loses the upper bits wherever pointers are wider
+		 * than int.
+		 */
+		fprintf(stderr,
+		    "%p: recno: %lu\n", (void *)cp, (u_long)cp->recno);
+	}
+	DB_THREAD_UNLOCK(dbp);
+	return (0);
+}
+#endif /* DEBUG */
+
+/*
+ * __ram_getno --
+ *	Check the user's record number, and make sure we've seen it.
+ *
+ *	Reads a db_recno_t out of key->data, rejects 0 with EINVAL, stores
+ *	the number through rep when rep is not NULL, and for recno trees
+ *	passes it on to __ram_update() so the record can be read in or
+ *	created.
+ *
+ * PUBLIC: int __ram_getno __P((DB *, const DBT *, db_recno_t *, int));
+ */
+int
+__ram_getno(dbp, key, rep, can_create)
+	DB *dbp;
+	const DBT *key;
+	db_recno_t *rep;
+	int can_create;
+{
+	db_recno_t recno;
+
+	/* Record numbers start at 1. */
+	recno = *(db_recno_t *)key->data;
+	if (recno == 0) {
+		__db_err(dbp->dbenv, "illegal record number of 0");
+		return (EINVAL);
+	}
+	if (rep != NULL)
+		*rep = recno;
+
+	/*
+	 * Btree can neither create records nor read them in, so there is
+	 * nothing more to do for it.  Recno can do both; see if we can find
+	 * the record.
+	 */
+	if (dbp->type != DB_RECNO)
+		return (0);
+	return (__ram_update(dbp, recno, can_create));
+}
+
+/*
+ * __ram_snapshot --
+ *	Read in any remaining records from the backing input file.
+ *
+ *	Asks __ram_update() for record DB_MAX_RECORDS without permission to
+ *	create records, and returns its result unchanged.
+ *
+ * PUBLIC: int __ram_snapshot __P((DB *));
+ */
+int
+__ram_snapshot(dbp)
+	DB *dbp;
+{
+	int ret;
+
+	ret = __ram_update(dbp, DB_MAX_RECORDS, 0);
+	return (ret);
+}
+
+/*
+ * __ram_update --
+ *	Ensure the tree has records up to and including the specified one.
+ *
+ *	If the backing input file has not been read to its end and recno is
+ *	beyond the records currently in the tree, more records are read in
+ *	through rp->re_irec.  If can_create is set and the tree is still
+ *	short, filler records flagged BI_DELETED are added for every number
+ *	below recno (pad-filled for fixed-length trees, empty otherwise).
+ *
+ *	Returns 0 on success, ENOMEM if the filler buffer cannot be grown,
+ *	or the error reported by a helper.
+ */
+static int
+__ram_update(dbp, recno, can_create)
+	DB *dbp;
+	db_recno_t recno;
+	int can_create;
+{
+	BTREE *t;
+	RECNO *rp;
+	db_recno_t nrecs;
+	int ret;
+	void *p;
+
+	t = dbp->internal;
+	rp = t->bt_recno;
+
+	/*
+	 * If we can't create records and we've read the entire backing input
+	 * file, we're done.
+	 */
+	if (!can_create && F_ISSET(rp, RECNO_EOF))
+		return (0);
+
+	/*
+	 * If we haven't seen this record yet, try to get it from the original
+	 * file.
+	 */
+	if ((ret = __bam_nrecs(dbp, &nrecs)) != 0)
+		return (ret);
+	if (!F_ISSET(rp, RECNO_EOF) && recno > nrecs) {
+		if ((ret = rp->re_irec(dbp, recno)) != 0)
+			return (ret);
+		if ((ret = __bam_nrecs(dbp, &nrecs)) != 0)
+			return (ret);
+	}
+
+	/*
+	 * If we can create records, create empty ones up to the requested
+	 * record.
+	 */
+	if (!can_create || recno <= nrecs + 1)
+		return (0);
+
+	t->bt_rdata.dlen = 0;
+	t->bt_rdata.doff = 0;
+	t->bt_rdata.flags = 0;
+	if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
+		if (t->bt_rdata.ulen < rp->re_len) {
+			/*
+			 * Grow through a temporary: when realloc() fails the
+			 * old buffer is still allocated, and overwriting the
+			 * only pointer to it would leak it.  On failure the
+			 * existing buffer and its recorded size are left
+			 * as they were.
+			 */
+			p = t->bt_rdata.data == NULL ?
+			    (void *)malloc(rp->re_len) :
+			    (void *)realloc(t->bt_rdata.data, rp->re_len);
+			if (p == NULL)
+				return (ENOMEM);
+			t->bt_rdata.data = p;
+			t->bt_rdata.ulen = rp->re_len;
+		}
+		t->bt_rdata.size = rp->re_len;
+		memset(t->bt_rdata.data, rp->re_pad, rp->re_len);
+	} else
+		t->bt_rdata.size = 0;
+
+	/* Add filler records for every number between nrecs and recno. */
+	while (recno > ++nrecs)
+		if ((ret = __ram_add(dbp,
+		    &nrecs, &t->bt_rdata, 0, BI_DELETED)) != 0)
+			return (ret);
+	return (0);
+}
+
+/*
+ * __ram_source --
+ *	Load information about the backing file.
+ *
+ *	Resolves fname into rp->re_source, opens it into rp->re_fd
+ *	(read-only if the DB is read-only), and maps the whole file,
+ *	recording the mapping in re_smap/re_cmap/re_emap/re_msize and
+ *	selecting the fixed- or variable-length reader in re_irec.  An
+ *	empty file is not mapped; RECNO_EOF is set instead.
+ *
+ *	On failure the file name is freed and rp->re_source is reset to
+ *	NULL, so a caller that cleans up rp afterwards does not free it a
+ *	second time.  Any descriptor already stored in rp->re_fd is left
+ *	there for the caller to close.
+ */
+static int
+__ram_source(dbp, rp, fname)
+	DB *dbp;
+	RECNO *rp;
+	const char *fname;
+{
+	off_t size;
+	int oflags, ret;
+
+	if ((ret = __db_appname(dbp->dbenv,
+	    DB_APP_DATA, NULL, fname, NULL, &rp->re_source)) != 0)
+		return (ret);
+
+	oflags = F_ISSET(dbp, DB_AM_RDONLY) ? DB_RDONLY : 0;
+	if ((ret =
+	    __db_fdopen(rp->re_source, oflags, oflags, 0, &rp->re_fd)) != 0) {
+		__db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret));
+		goto err;
+	}
+
+	/*
+	 * XXX
+	 * We'd like to test to see if the file is too big to mmap.  Since we
+	 * don't know what size or type off_t's or size_t's are, or the largest
+	 * unsigned integral type is, or what random insanity the local C
+	 * compiler will perpetrate, doing the comparison in a portable way is
+	 * flatly impossible.  Hope that mmap fails if the file is too large.
+	 */
+	if ((ret =
+	    __db_stat(dbp->dbenv, rp->re_source, rp->re_fd, &size, NULL)) != 0)
+		goto err;
+	if (size == 0) {
+		F_SET(rp, RECNO_EOF);
+		return (0);
+	}
+
+	if ((ret = __db_mmap(rp->re_fd, (size_t)size, 1, 1, &rp->re_smap)) != 0)
+		goto err;
+
+	/* Reading starts at the front of the map and stops at its end. */
+	rp->re_cmap = rp->re_smap;
+	rp->re_msize = size;
+	rp->re_emap = (u_int8_t *)rp->re_smap + rp->re_msize;
+	rp->re_irec = F_ISSET(dbp, DB_RE_FIXEDLEN) ? __ram_fmap : __ram_vmap;
+	return (0);
+
+err:	FREES(rp->re_source);
+	/* Don't leave a pointer to the freed name behind. */
+	rp->re_source = NULL;
+	return (ret);
+}
+
+/*
+ * __ram_writeback --
+ *	Rewrite the backing file.
+ *
+ *	Does nothing unless the tree is flagged RECNO_MODIFIED and has a
+ *	backing source file.  Otherwise reads in whatever is left of the
+ *	source file, drops the mapping and descriptor, reopens the file
+ *	truncated, and writes every record out in record-number order.
+ *	Variable-length records are each followed by the delimiter; missing
+ *	fixed-length records are written as a record of pad bytes.
+ *
+ *	Returns 0 on success (and clears RECNO_MODIFIED), otherwise the
+ *	first error encountered.
+ */
+static int
+__ram_writeback(dbp)
+	DB *dbp;
+{
+	RECNO *rp;
+	DBT key, data;
+	db_recno_t keyno;
+	ssize_t nw;
+	int fd, ret, t_ret;
+	u_int8_t delim, *pad;
+
+	rp = ((BTREE *)dbp->internal)->bt_recno;
+
+	/* If the file wasn't modified, we're done. */
+	if (!F_ISSET(rp, RECNO_MODIFIED))
+		return (0);
+
+	/* If there's no backing source file, we're done. */
+	if (rp->re_source == NULL) {
+		F_CLR(rp, RECNO_MODIFIED);
+		return (0);
+	}
+
+	/*
+	 * Read any remaining records into the tree.
+	 *
+	 * XXX
+	 * This is why we can't support transactions when applications specify
+	 * backing (re_source) files.  At this point we have to read in the
+	 * rest of the records from the file so that we can write all of the
+	 * records back out again, which could modify a page for which we'd
+	 * have to log changes and which we don't have locked.  This could be
+	 * partially fixed by taking a snapshot of the entire file during the
+	 * db_open(), or, since db_open() isn't transaction protected, as part
+	 * of the first DB operation.  But, if a checkpoint occurs then, the
+	 * part of the log holding the copy of the file could be discarded, and
+	 * that would make it impossible to recover in the face of disaster.
+	 * This could all probably be fixed, but it would require transaction
+	 * protecting the backing source file, i.e. mpool would have to know
+	 * about it, and we don't want to go there.
+	 */
+	if ((ret = __ram_snapshot(dbp)) != 0 && ret != DB_NOTFOUND)
+		return (ret);
+
+	/*
+	 * !!!
+	 * Close any underlying mmap region.  This is required for Windows NT
+	 * (4.0, Service Pack 2) -- if the file is still mapped, the following
+	 * open will fail.
+	 */
+	if (rp->re_smap != NULL) {
+		(void)__db_munmap(rp->re_smap, rp->re_msize);
+		rp->re_smap = NULL;
+	}
+
+	/* Get rid of any backing file descriptor, just on GP's. */
+	if (rp->re_fd != -1) {
+		(void)__db_close(rp->re_fd);
+		rp->re_fd = -1;
+	}
+
+	/* Open the file, truncating it. */
+	if ((ret = __db_fdopen(rp->re_source,
+	    DB_SEQUENTIAL | DB_TRUNCATE,
+	    DB_SEQUENTIAL | DB_TRUNCATE, 0, &fd)) != 0) {
+		__db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret));
+		return (ret);
+	}
+
+	/*
+	 * We step through the records, writing each one out.  Use the record
+	 * number and the dbp->get() function, instead of a cursor, so we find
+	 * and write out "deleted" or non-existent records.
+	 */
+	memset(&key, 0, sizeof(key));
+	memset(&data, 0, sizeof(data));
+	key.size = sizeof(db_recno_t);
+	key.data = &keyno;
+
+	/*
+	 * We'll need the delimiter if we're doing variable-length records,
+	 * and the pad character if we're doing fixed-length records.
+	 */
+	delim = rp->re_delim;
+	pad = NULL;
+	if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
+		if ((pad = malloc(rp->re_len)) == NULL) {
+			ret = ENOMEM;
+			goto err;
+		}
+		memset(pad, rp->re_pad, rp->re_len);
+	}
+	for (keyno = 1;; ++keyno) {
+		switch (ret = dbp->get(dbp, NULL, &key, &data, 0)) {
+		case 0:
+			if ((ret =
+			    __db_write(fd, data.data, data.size, &nw)) != 0)
+				goto err;
+			if (nw != (ssize_t)data.size) {
+				ret = EIO;
+				goto err;
+			}
+			break;
+		case DB_KEYEMPTY:
+			if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
+				if ((ret =
+				    __db_write(fd, pad, rp->re_len, &nw)) != 0)
+					goto err;
+				if (nw != (ssize_t) rp->re_len) {
+					ret = EIO;
+					goto err;
+				}
+			}
+			break;
+		case DB_NOTFOUND:
+			/* Ran off the end of the tree: everything is out. */
+			ret = 0;
+			goto done;
+		default:
+			/*
+			 * Any other failure is fatal.  Without this case the
+			 * loop would ignore the error and carry on with the
+			 * next record number.
+			 */
+			goto err;
+		}
+		if (!F_ISSET(dbp, DB_RE_FIXEDLEN)) {
+			if ((ret = __db_write(fd, &delim, 1, &nw)) != 0)
+				goto err;
+			if (nw != 1) {
+				ret = EIO;
+				goto err;
+			}
+		}
+	}
+
+err:
+done:	/* Release the pad record, if we built one. */
+	if (pad != NULL)
+		free(pad);
+
+	/*
+	 * Close the file descriptor.  A close failure is only reported when
+	 * nothing went wrong earlier, so the first error is the one returned.
+	 */
+	if ((t_ret = __db_close(fd)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if (ret == 0)
+		F_CLR(rp, RECNO_MODIFIED);
+	return (ret);
+}
+
+/*
+ * __ram_fmap --
+ *	Get fixed length records from a file.
+ *
+ *	Copies re_len-byte records out of the mapped source file, starting
+ *	at rp->re_cmap, and adds them to the tree until the tree's record
+ *	count exceeds top.  A short final record is filled out with the pad
+ *	byte.  Returns DB_NOTFOUND (and sets RECNO_EOF) when the map is
+ *	exhausted first, ENOMEM if the record buffer cannot be grown, or the
+ *	error reported by a helper.
+ */
+static int
+__ram_fmap(dbp, top)
+	DB *dbp;
+	db_recno_t top;
+{
+	BTREE *t;
+	DBT data;
+	RECNO *rp;
+	db_recno_t recno;
+	u_int32_t len;
+	u_int8_t *sp, *ep, *p;
+	int ret;
+	void *newbuf;
+
+	if ((ret = __bam_nrecs(dbp, &recno)) != 0)
+		return (ret);
+
+	t = dbp->internal;
+	rp = t->bt_recno;
+	if (t->bt_rdata.ulen < rp->re_len) {
+		/*
+		 * Grow through a temporary: when realloc() fails the old
+		 * buffer is still allocated, and overwriting the only pointer
+		 * to it would leak it.  On failure the existing buffer and
+		 * its recorded size are left as they were.
+		 */
+		newbuf = t->bt_rdata.data == NULL ?
+		    (void *)malloc(rp->re_len) :
+		    (void *)realloc(t->bt_rdata.data, rp->re_len);
+		if (newbuf == NULL)
+			return (ENOMEM);
+		t->bt_rdata.data = newbuf;
+		t->bt_rdata.ulen = rp->re_len;
+	}
+
+	memset(&data, 0, sizeof(data));
+	data.data = t->bt_rdata.data;
+	data.size = rp->re_len;
+
+	sp = (u_int8_t *)rp->re_cmap;
+	ep = (u_int8_t *)rp->re_emap;
+	while (recno <= top) {
+		if (sp >= ep) {
+			F_SET(rp, RECNO_EOF);
+			return (DB_NOTFOUND);
+		}
+
+		/* Copy up to one record's worth of bytes into the buffer. */
+		len = rp->re_len;
+		for (p = t->bt_rdata.data;
+		    sp < ep && len > 0; *p++ = *sp++, --len);
+
+		/*
+		 * Another process may have read some portion of the input
+		 * file already, in which case we just want to discard the
+		 * new record.
+		 *
+		 * XXX
+		 * We should just do a seek, since the records are fixed
+		 * length.
+		 */
+		if (rp->re_last >= recno) {
+			/* Fill out a short trailing record with pad bytes. */
+			if (len != 0)
+				memset(p, rp->re_pad, len);
+
+			++recno;
+			if ((ret = __ram_add(dbp, &recno, &data, 0, 0)) != 0)
+				return (ret);
+		}
+		++rp->re_last;
+	}
+	rp->re_cmap = sp;
+	return (0);
+}
+
+/*
+ * __ram_vmap --
+ *	Get variable length records from a file.
+ *
+ *	Carves delimiter-terminated records out of the mapped source file,
+ *	starting at rp->re_cmap, and adds them to the tree until the tree's
+ *	record count exceeds top.  Returns DB_NOTFOUND (and sets RECNO_EOF)
+ *	when the map is exhausted first, or the error reported by a helper.
+ */
+static int
+__ram_vmap(dbp, top)
+	DB *dbp;
+	db_recno_t top;
+{
+	BTREE *t;
+	DBT data;
+	RECNO *rp;
+	db_recno_t recno;
+	u_int8_t *cur, *end;
+	int delim, ret;
+
+	t = dbp->internal;
+	rp = t->bt_recno;
+
+	ret = __bam_nrecs(dbp, &recno);
+	if (ret != 0)
+		return (ret);
+
+	memset(&data, 0, sizeof(data));
+
+	delim = rp->re_delim;
+	cur = (u_int8_t *)rp->re_cmap;
+	end = (u_int8_t *)rp->re_emap;
+
+	/*
+	 * Each pass consumes one record from the map: the record counter for
+	 * the source file advances and the scan pointer steps over the
+	 * delimiter, whether or not the record was added to the tree.
+	 */
+	for (; recno <= top; ++rp->re_last, ++cur) {
+		if (cur >= end) {
+			F_SET(rp, RECNO_EOF);
+			return (DB_NOTFOUND);
+		}
+
+		/* The record runs up to the delimiter or the end of the map. */
+		data.data = cur;
+		while (cur < end && *cur != delim)
+			++cur;
+
+		/*
+		 * Another process may have read some portion of the input
+		 * file already, in which case we just want to discard the
+		 * new record.
+		 */
+		if (rp->re_last < recno)
+			continue;
+
+		data.size = cur - (u_int8_t *)data.data;
+		++recno;
+		ret = __ram_add(dbp, &recno, &data, 0, 0);
+		if (ret != 0)
+			return (ret);
+	}
+	rp->re_cmap = cur;
+	return (0);
+}
+
+/*
+ * __ram_add --
+ *	Add records into the tree.
+ *
+ *	Searches for the slot for *recnop, then either replaces the record
+ *	found there or inserts a new one before the search position,
+ *	splitting and retrying when the insert reports DB_NEEDSPLIT.  After
+ *	a successful insert of a new record the record counts are adjusted.
+ *
+ *	Returns 0 on success, DB_KEYEXIST if the record exists and
+ *	DB_NOOVERWRITE was given, or the error reported by a helper,
+ *	including a failure to adjust the counts.
+ */
+static int
+__ram_add(dbp, recnop, data, flags, bi_flags)
+	DB *dbp;
+	db_recno_t *recnop;
+	DBT *data;
+	int flags, bi_flags;
+{
+	BTREE *t;
+	PAGE *h;
+	db_indx_t indx;
+	int exact, ret, stack;
+
+	t = dbp->internal;
+
+retry:	/* Find the slot for insertion. */
+	if ((ret = __bam_rsearch(dbp, recnop,
+	    S_INSERT | (LF_ISSET(DB_APPEND) ? S_APPEND : 0), 1, &exact)) != 0)
+		return (ret);
+	h = t->bt_csp->page;
+	indx = t->bt_csp->indx;
+	stack = 1;
+
+	/*
+	 * The recno access method doesn't currently support duplicates, so
+	 * if an identical key is already in the tree we're either overwriting
+	 * it or an error is returned.
+	 */
+	if (exact && LF_ISSET(DB_NOOVERWRITE)) {
+		ret = DB_KEYEXIST;
+		goto err;
+	}
+
+	/*
+	 * Select the arguments for __bam_iitem() and do the insert.  If the
+	 * key is an exact match, we're replacing the data item with a new
+	 * data item.  If the key isn't an exact match, we're inserting a new
+	 * key/data pair, before the search location.
+	 */
+	if ((ret = __bam_iitem(dbp, &h, &indx, NULL,
+	    data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) == DB_NEEDSPLIT) {
+		(void)__bam_stkrel(dbp);
+		stack = 0;
+		if ((ret = __bam_split(dbp, recnop)) != 0)
+			goto err;
+		goto retry;
+	}
+
+	/*
+	 * A new record changes the record counts.  Report a failure to
+	 * adjust them instead of dropping it, as __ram_c_put() does.
+	 */
+	if (!exact && ret == 0)
+		ret = __bam_adjust(dbp, t, 1);
+
+err:	if (stack)
+		__bam_stkrel(dbp);
+	return (ret);
+}
diff --git a/db2/btree/bt_rsearch.c b/db2/btree/bt_rsearch.c
new file mode 100644
index 0000000000..ee26221e25
--- /dev/null
+++ b/db2/btree/bt_rsearch.c
@@ -0,0 +1,347 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)bt_rsearch.c 10.8 (Sleepycat) 8/24/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+
+/*
+ * __bam_rsearch --
+ *	Search a btree for a record number.
+ *
+ * On success the page holding the record (or, with S_PARENT, the page at
+ * level "stop") is entered at the top of the BTREE's page stack, and
+ * *exactp says whether the record already exists.  If S_APPEND is OR'd
+ * into flags, *recnop is set to one past the last record in the tree.
+ *
+ * Returns 0 on success, DB_NOTFOUND if the record cannot be located, or
+ * the error returned by the lock, page or stack routines.
+ *
+ * PUBLIC: int __bam_rsearch __P((DB *, db_recno_t *, u_int, int, int *));
+ */
+int
+__bam_rsearch(dbp, recnop, flags, stop, exactp)
+	DB *dbp;
+	db_recno_t *recnop;
+	u_int flags;
+	int stop, *exactp;
+{
+	BINTERNAL *bi;
+	BTREE *t;
+	DB_LOCK lock;
+	PAGE *h;
+	RINTERNAL *ri;
+	db_indx_t indx, top;
+	db_pgno_t pg;
+	db_recno_t recno, total;
+	int isappend, ret, stack;
+
+	t = dbp->internal;
+
+	/*
+	 * We test for groups of flags, S_APPEND is the only one that can be
+	 * OR'd into the set.  Clear it now so that the tests for equality
+	 * will work.
+	 */
+	if ((isappend = LF_ISSET(S_APPEND)) != 0)
+		LF_CLR(S_APPEND);
+
+	/*
+	 * There are several ways we search a btree tree.  The flags argument
+	 * specifies if we're acquiring read or write locks and if we are
+	 * locking pairs of pages.  See btree.h for more details.
+	 *
+	 * If write-locking pages, we need to know whether or not to acquire a
+	 * write lock on a page before getting it.  This depends on how deep it
+	 * is in tree, which we don't know until we acquire the root page.  So,
+	 * if we need to lock the root page we may have to upgrade it later,
+	 * because we won't get the correct lock initially.
+	 *
+	 * Retrieve the root page.
+	 */
+	pg = PGNO_ROOT;
+	if ((ret = __bam_lget(dbp, 0, PGNO_ROOT,
+	    flags == S_INSERT || flags == S_DELETE ?
+	    DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
+		return (ret);
+	if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) {
+		(void)__BT_LPUT(dbp, lock);
+		return (ret);
+	}
+	total = RE_NREC(h);
+
+	/*
+	 * If appending to the tree, set the record number now -- we have the
+	 * root page locked.
+	 *
+	 * Delete only deletes exact matches, read only returns exact matches.
+	 * Note, this is different from __bam_search(), which returns non-exact
+	 * matches for read.
+	 *
+	 * The record may not exist.  We can only return the correct location
+	 * for the record immediately after the last record in the tree, so do
+	 * a fast check now.
+	 */
+	if (isappend) {
+		*exactp = 0;
+		*recnop = recno = total + 1;
+	} else {
+		recno = *recnop;
+		if (recno <= total)
+			*exactp = 1;
+		else {
+			*exactp = 0;
+			if (flags == S_DELETE ||
+			    flags == S_FIND || recno > total + 1) {
+				(void)memp_fput(dbp->mpf, h, 0);
+				(void)__BT_LPUT(dbp, lock);
+				return (DB_NOTFOUND);
+			}
+		}
+	}
+
+	/* Decide if we're building a stack based on the operation. */
+	BT_STK_CLR(t);
+	stack = flags == S_DELETE || flags == S_INSERT;
+
+	/*
+	 * Decide if we need to save this page; if we do, write lock it, and
+	 * start to build a stack.
+	 */
+	if (LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) {
+		(void)memp_fput(dbp->mpf, h, 0);
+		if ((ret = __bam_lget(dbp, 1, pg, DB_LOCK_WRITE, &lock)) != 0)
+			return (ret);
+		if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) {
+			(void)__BT_LPUT(dbp, lock);
+			return (ret);
+		}
+		stack = 1;
+	}
+
+	/* Records in the tree are 0-based, and record numbers are 1-based. */
+	--recno;
+
+	/*
+	 * Walk down the tree.  "total" counts the records that precede the
+	 * subtree currently being searched.
+	 */
+	for (total = 0;;) {
+		switch (TYPE(h)) {
+		case P_LBTREE:
+			BT_STK_ENTER(t, h, (recno - total) * P_INDX, lock, ret);
+			return (ret);
+		case P_IBTREE:
+			for (indx = 0, top = NUM_ENT(h);;) {
+				bi = GET_BINTERNAL(h, indx);
+				if (++indx == top || total + bi->nrecs > recno)
+					break;
+				total += bi->nrecs;
+			}
+			pg = bi->pgno;
+			break;
+		case P_LRECNO:
+			BT_STK_ENTER(t, h, recno - total, lock, ret);
+			return (ret);
+		case P_IRECNO:
+			for (indx = 0, top = NUM_ENT(h);;) {
+				ri = GET_RINTERNAL(h, indx);
+				if (++indx == top || total + ri->nrecs > recno)
+					break;
+				total += ri->nrecs;
+			}
+			pg = ri->pgno;
+			break;
+		default:
+			/*
+			 * Unknown page type.  The current page isn't on the
+			 * stack yet, so release it and its lock here, then
+			 * unwind whatever has already been stacked.
+			 */
+			ret = __db_pgfmt(dbp, h->pgno);
+			(void)memp_fput(dbp->mpf, h, 0);
+			(void)__BT_LPUT(dbp, lock);
+			goto err;
+		}
+		/* The loops above leave indx one past the chosen entry. */
+		--indx;
+
+		if (stack) {
+			/* Return if this is the lowest page wanted. */
+			if (LF_ISSET(S_PARENT) && stop == h->level) {
+				BT_STK_ENTER(t, h, indx, lock, ret);
+				return (ret);
+			}
+			BT_STK_PUSH(t, h, indx, lock, ret);
+			if (ret)
+				goto err;
+
+			if ((ret = __bam_lget(dbp, 0, pg,
+			    LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ,
+			    &lock)) != 0)
+				goto err;
+		} else {
+			(void)memp_fput(dbp->mpf, h, 0);
+
+			/*
+			 * Decide if we want to return a pointer to the next
+			 * page in the stack.  If we do, write lock it and
+			 * never unlock it.
+			 */
+			if (LF_ISSET(S_PARENT) &&
+			    (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1))
+				stack = 1;
+
+			if ((ret = __bam_lget(dbp, 1, pg,
+			    LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ,
+			    &lock)) != 0)
+				goto err;
+		}
+
+		/*
+		 * We hold the lock for the child page at this point; if the
+		 * page can't be read, give the lock back before unwinding.
+		 */
+		if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) {
+			(void)__BT_LPUT(dbp, lock);
+			goto err;
+		}
+	}
+	/* NOTREACHED */
+
+	/*
+	 * Only unwind the stack if something was pushed onto it, as
+	 * __bam_search() does; __bam_stkrel() releases every entry from
+	 * bt_sp through bt_csp inclusive.
+	 */
+err:	if (t->bt_csp > t->bt_sp) {
+		BT_STK_POP(t);
+		__bam_stkrel(dbp);
+	}
+	return (ret);
+}
+
+/*
+ * __bam_adjust --
+ *	Adjust the tree after adding or deleting a record.
+ *
+ * Walks the page stack from bottom (bt_sp) to top (bt_csp) and adds
+ * "adjust" to the record count of the stacked entry on every internal
+ * page, logging each change first when logging is enabled.  Returns 0 on
+ * success or the error from the log or memory pool routines.
+ *
+ * PUBLIC: int __bam_adjust __P((DB *, BTREE *, int));
+ */
+int
+__bam_adjust(dbp, t, adjust)
+	DB *dbp;
+	BTREE *t;
+	int adjust;
+{
+	EPG *ent;
+	PAGE *pagep;
+	int ret;
+
+	for (ent = t->bt_sp; ent <= t->bt_csp; ++ent) {
+		pagep = ent->page;
+
+		/* Only internal pages carry record counts. */
+		if (TYPE(pagep) != P_IBTREE && TYPE(pagep) != P_IRECNO)
+			continue;
+
+		/* Log the adjustment before applying it. */
+		if (DB_LOGGING(dbp)) {
+			ret = __bam_cadjust_log(dbp->dbenv->lg_info,
+			    dbp->txn, &LSN(pagep), 0, dbp->log_fileid,
+			    PGNO(pagep), &LSN(pagep), (u_int32_t)ent->indx,
+			    (int32_t)adjust, 1);
+			if (ret != 0)
+				return (ret);
+		}
+
+		/* Apply it to the entry we descended through. */
+		if (TYPE(pagep) == P_IRECNO)
+			GET_RINTERNAL(pagep, ent->indx)->nrecs += adjust;
+		else
+			GET_BINTERNAL(pagep, ent->indx)->nrecs += adjust;
+
+		/* The root page additionally holds the tree-wide count. */
+		if (PGNO(pagep) == PGNO_ROOT)
+			RE_NREC_ADJ(pagep, adjust);
+
+		ret = memp_fset(dbp->mpf, pagep, DB_MPOOL_DIRTY);
+		if (ret != 0)
+			return (ret);
+	}
+	return (0);
+}
+
+/*
+ * __bam_nrecs --
+ *	Return the number of records in the tree.
+ *
+ * Read-locks and fetches the root page, stores its record count in *rep,
+ * then releases the page and the lock.  Returns 0 on success or the error
+ * from the lock or page routines.
+ *
+ * PUBLIC: int __bam_nrecs __P((DB *, db_recno_t *));
+ */
+int
+__bam_nrecs(dbp, rep)
+	DB *dbp;
+	db_recno_t *rep;
+{
+	DB_LOCK lock;
+	PAGE *h;
+	db_pgno_t pgno;
+	int ret;
+
+	pgno = PGNO_ROOT;
+	if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) != 0)
+		return (ret);
+	if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) {
+		/* Don't leave the root locked if the page can't be read. */
+		(void)__BT_LPUT(dbp, lock);
+		return (ret);
+	}
+
+	*rep = RE_NREC(h);
+
+	(void)memp_fput(dbp->mpf, h, 0);
+	(void)__BT_TLPUT(dbp, lock);
+
+	return (0);
+}
+
+/*
+ * __bam_total --
+ *	Return the number of records below a page.
+ *
+ * For a leaf page this is derived from the number of entries on the page
+ * (btree leaves use two entries per record); for an internal page it is
+ * the sum of the record counts of its entries.  Aborts on any other page
+ * type.
+ *
+ * PUBLIC: db_recno_t __bam_total __P((PAGE *));
+ */
+db_recno_t
+__bam_total(h)
+	PAGE *h;
+{
+	db_recno_t sum;
+	db_indx_t i, nent;
+
+	sum = 0;
+	switch (TYPE(h)) {
+	case P_LRECNO:
+		sum = NUM_ENT(h);
+		break;
+	case P_LBTREE:
+		sum = NUM_ENT(h) / 2;
+		break;
+	case P_IRECNO:
+		nent = NUM_ENT(h);
+		for (i = 0; i < nent; ++i)
+			sum += GET_RINTERNAL(h, i)->nrecs;
+		break;
+	case P_IBTREE:
+		nent = NUM_ENT(h);
+		for (i = 0; i < nent; ++i)
+			sum += GET_BINTERNAL(h, i)->nrecs;
+		break;
+	default:
+		abort();
+	}
+	return (sum);
+}
diff --git a/db2/btree/bt_search.c b/db2/btree/bt_search.c
new file mode 100644
index 0000000000..d5f20d4c61
--- /dev/null
+++ b/db2/btree/bt_search.c
@@ -0,0 +1,335 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)bt_search.c 10.6 (Sleepycat) 8/22/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+
+/*
+ * __bam_search --
+ *	Search a btree for a key.
+ *
+ * On success the page found (a leaf, or with S_PARENT the page at level
+ * "stop") is entered at the top of the BTREE's page stack and *exactp says
+ * whether the key was matched exactly.  If recnop is not NULL and an exact
+ * match is found, *recnop is set to the 1-based record number of the key.
+ *
+ * Returns 0 on success, DB_NOTFOUND if no acceptable item exists, or the
+ * error returned by the lock, page or stack routines.
+ *
+ * PUBLIC: int __bam_search __P((DB *,
+ * PUBLIC:     const DBT *, u_int, int, db_recno_t *, int *));
+ */
+int
+__bam_search(dbp, key, flags, stop, recnop, exactp)
+	DB *dbp;
+	const DBT *key;
+	u_int flags;
+	int stop, *exactp;
+	db_recno_t *recnop;
+{
+	BTREE *t;
+	DB_LOCK lock;
+	EPG cur;
+	PAGE *h;
+	db_indx_t base, i, indx, lim;
+	db_pgno_t pg;
+	db_recno_t recno;
+	int cmp, jump, ret, stack;
+
+	t = dbp->internal;
+	recno = 0;
+
+	BT_STK_CLR(t);
+
+	/*
+	 * There are several ways we search a btree tree.  The flags argument
+	 * specifies if we're acquiring read or write locks, if we position
+	 * to the first or last item in a set of duplicates, if we return
+	 * deleted items, and if we are locking pairs of pages.  See btree.h
+	 * for more details.  In addition, if we're doing record numbers, we
+	 * have to lock the entire tree regardless.
+	 *
+	 * If write-locking pages, we need to know whether or not to acquire a
+	 * write lock on a page before getting it.  This depends on how deep it
+	 * is in tree, which we don't know until we acquire the root page.  So,
+	 * if we need to lock the root page we may have to upgrade it later,
+	 * because we won't get the correct lock initially.
+	 *
+	 * Retrieve the root page.
+	 */
+	pg = PGNO_ROOT;
+	stack = F_ISSET(dbp, DB_BT_RECNUM) &&
+	    (flags == S_INSERT || flags == S_DELETE);
+	if ((ret = __bam_lget(dbp,
+	    0, pg, stack ? DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
+		return (ret);
+	if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) {
+		(void)__BT_LPUT(dbp, lock);
+		return (ret);
+	}
+
+	/* Decide if we need to save this page; if we do, write lock it. */
+	if (!stack &&
+	    ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) ||
+	    (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
+		(void)memp_fput(dbp->mpf, h, 0);
+		if ((ret = __bam_lget(dbp, 1, pg, DB_LOCK_WRITE, &lock)) != 0)
+			return (ret);
+		if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) {
+			(void)__BT_LPUT(dbp, lock);
+			return (ret);
+		}
+
+		stack = 1;
+	}
+
+	for (;;) {
+		/*
+		 * Do a binary search on the current page.  If we're searching
+		 * a leaf page, we have to manipulate the indices in groups of
+		 * two.  If we're searching an internal page, they're an index
+		 * per page item.  If we find an exact match on a leaf page,
+		 * we're done.
+		 */
+		cur.page = h;
+		jump = TYPE(h) == P_LBTREE ? P_INDX : O_INDX;
+		for (base = 0,
+		    lim = NUM_ENT(h) / (db_indx_t)jump; lim != 0; lim >>= 1) {
+			cur.indx = indx = base + ((lim >> 1) * jump);
+			if ((cmp = __bam_cmp(dbp, key, &cur)) == 0) {
+				if (TYPE(h) == P_LBTREE)
+					goto match;
+				goto next;
+			}
+			if (cmp > 0) {
+				base = indx + jump;
+				--lim;
+			}
+		}
+
+		/*
+		 * No match found.  Base is the smallest index greater than
+		 * key and may be zero or a last + O_INDX index.
+		 *
+		 * If it's a leaf page, return base as the "found" value.
+		 * Delete only deletes exact matches.
+		 */
+		if (TYPE(h) == P_LBTREE) {
+			*exactp = 0;
+
+			if (LF_ISSET(S_EXACT))
+				goto notfound;
+
+			BT_STK_ENTER(t, h, base, lock, ret);
+			return (ret);
+		}
+
+		/*
+		 * If it's not a leaf page, record the internal page (which is
+		 * a parent page for the key).  Decrement the base by 1 if it's
+		 * non-zero so that if a split later occurs, the inserted page
+		 * will be to the right of the saved page.
+		 */
+		indx = base > 0 ? base - O_INDX : base;
+
+		/*
+		 * If we're trying to calculate the record number, sum up
+		 * all the record numbers on this page up to the indx point.
+		 * An exact match on an internal page jumps here too, so the
+		 * records to the left of the matched entry are counted as
+		 * well.
+		 */
+next:		if (recnop != NULL)
+			for (i = 0; i < indx; ++i)
+				recno += GET_BINTERNAL(h, i)->nrecs;
+
+		pg = GET_BINTERNAL(h, indx)->pgno;
+		if (stack) {
+			/* Return if this is the lowest page wanted. */
+			if (LF_ISSET(S_PARENT) && stop == h->level) {
+				BT_STK_ENTER(t, h, indx, lock, ret);
+				return (ret);
+			}
+			BT_STK_PUSH(t, h, indx, lock, ret);
+			if (ret != 0)
+				goto err;
+
+			if ((ret =
+			    __bam_lget(dbp, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
+				goto err;
+		} else {
+			(void)memp_fput(dbp->mpf, h, 0);
+
+			/*
+			 * Decide if we want to return a pointer to the next
+			 * page in the stack.  If we do, write lock it and
+			 * never unlock it.
+			 */
+			if ((LF_ISSET(S_PARENT) &&
+			    (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) ||
+			    (h->level - 1) == LEAFLEVEL)
+				stack = 1;
+
+			if ((ret =
+			    __bam_lget(dbp, 1, pg, stack && LF_ISSET(S_WRITE) ?
+			    DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
+				goto err;
+		}
+
+		/*
+		 * We hold the lock for the child page at this point; if the
+		 * page can't be read, give the lock back before unwinding.
+		 */
+		if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) {
+			(void)__BT_LPUT(dbp, lock);
+			goto err;
+		}
+	}
+
+	/* NOTREACHED */
+match:	*exactp = 1;
+
+	/*
+	 * If we're trying to calculate the record number, add in the
+	 * offset on this page and correct for the fact that records
+	 * in the tree are 0-based.
+	 */
+	if (recnop != NULL)
+		*recnop = recno + (indx / P_INDX) + 1;
+
+	/*
+	 * If we got here, we know that we have a btree leaf page.
+	 *
+	 * If there are duplicates, go to the first/last one.
+	 */
+	if (LF_ISSET(S_DUPLAST))
+		while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
+		    h->inp[indx] == h->inp[indx + P_INDX])
+			indx += P_INDX;
+	else
+		while (indx > 0 &&
+		    h->inp[indx] == h->inp[indx - P_INDX])
+			indx -= P_INDX;
+
+	/*
+	 * Now check if we are allowed to return deleted items; if not,
+	 * find the first/last non-deleted item.
+	 */
+	if (LF_ISSET(S_DELNO)) {
+		if (LF_ISSET(S_DUPLAST))
+			while (GET_BKEYDATA(h, indx + O_INDX)->deleted &&
+			    indx > 0 &&
+			    h->inp[indx] == h->inp[indx - P_INDX])
+				indx -= P_INDX;
+		else
+			while (GET_BKEYDATA(h, indx + O_INDX)->deleted &&
+			    indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
+			    h->inp[indx] == h->inp[indx + P_INDX])
+				indx += P_INDX;
+
+		if (GET_BKEYDATA(h, indx + O_INDX)->deleted)
+			goto notfound;
+	}
+
+	BT_STK_ENTER(t, h, indx, lock, ret);
+	return (ret);
+
+notfound:
+	(void)memp_fput(dbp->mpf, h, 0);
+	(void)__BT_LPUT(dbp, lock);
+	ret = DB_NOTFOUND;
+
+	/* Release any pages that were pushed onto the stack on the way. */
+err:	if (t->bt_csp > t->bt_sp) {
+		BT_STK_POP(t);
+		__bam_stkrel(dbp);
+	}
+	return (ret);
+}
+
+/*
+ * __bam_stkrel --
+ *	Release all pages currently held in the stack.
+ *
+ * Every entry from the bottom of the stack (bt_sp) through the current
+ * entry (bt_csp), inclusive, has its page returned to the memory pool and
+ * its lock released.  Always returns 0.
+ *
+ * PUBLIC: int __bam_stkrel __P((DB *));
+ */
+int
+__bam_stkrel(dbp)
+	DB *dbp;
+{
+	BTREE *t;
+	EPG *ent;
+
+	t = dbp->internal;
+	ent = t->bt_sp;
+	while (ent <= t->bt_csp) {
+		(void)memp_fput(dbp->mpf, ent->page, 0);
+		(void)__BT_TLPUT(dbp, ent->lock);
+		++ent;
+	}
+	return (0);
+}
+
+/*
+ * __bam_stkgrow --
+ *	Grow the stack.
+ *
+ * Doubles the capacity of the page stack, copying the existing entries
+ * into the new array and leaving bt_csp at the first new slot.  The old
+ * array is freed unless it is the stack built into the BTREE structure.
+ * Returns 0 on success or ENOMEM if the new array can't be allocated.
+ *
+ * PUBLIC: int __bam_stkgrow __P((BTREE *));
+ */
+int
+__bam_stkgrow(t)
+	BTREE *t;
+{
+	EPG *grown;
+	size_t old_cnt, old_bytes;
+
+	old_cnt = t->bt_esp - t->bt_sp;
+	old_bytes = old_cnt * sizeof(EPG);
+
+	grown = (EPG *)calloc(old_cnt * 2, sizeof(EPG));
+	if (grown == NULL)
+		return (ENOMEM);
+	memcpy(grown, t->bt_sp, old_bytes);
+
+	/* Only a previously grown stack was heap allocated. */
+	if (t->bt_sp != t->bt_stack)
+		FREE(t->bt_sp, old_bytes);
+
+	t->bt_sp = grown;
+	t->bt_csp = grown + old_cnt;
+	t->bt_esp = grown + old_cnt * 2;
+	return (0);
+}
diff --git a/db2/btree/bt_split.c b/db2/btree/bt_split.c
new file mode 100644
index 0000000000..89cfcb5a2e
--- /dev/null
+++ b/db2/btree/bt_split.c
@@ -0,0 +1,952 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)bt_split.c 10.12 (Sleepycat) 8/24/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+
+static int __bam_page __P((DB *, EPG *, EPG *));
+static int __bam_pinsert __P((DB *, EPG *, PAGE *, PAGE *));
+static int __bam_psplit __P((DB *, EPG *, PAGE *, PAGE *, int));
+static int __bam_root __P((DB *, EPG *));
+
+/*
+ * __bam_split --
+ *	Split a page.
+ *
+ * "arg" is the key (btree) or a pointer to the record number (recno) whose
+ * insertion needed the split.  Returns 0 once the leaf page has been
+ * split, or the error from the search or split routines.
+ *
+ * PUBLIC: int __bam_split __P((DB *, void *));
+ */
+int
+__bam_split(dbp, arg)
+	DB *dbp;
+	void *arg;
+{
+	BTREE *t;
+	enum { UP, DOWN } dir;
+	int exact, level, ret;
+
+	t = dbp->internal;
+
+	/*
+	 * The locking protocol we use to avoid deadlock is to acquire locks
+	 * by walking down the tree, but we do it as lazily as possible,
+	 * locking the root only as a last resort.  We expect all stack pages
+	 * to have been discarded before we're called; we discard all
+	 * short-term locks.
+	 *
+	 * When __bam_split is first called, we know that a leaf page was too
+	 * full for an insert.  We don't know what leaf page it was, but we
+	 * have the key/recno that caused the problem.  We call XX_search to
+	 * reacquire the leaf page, but this time get both the leaf page and
+	 * its parent, locked.  We then split the leaf page and see if the new
+	 * internal key will fit into the parent page.  If it will, we're done.
+	 *
+	 * If it won't, we discard our current locks and repeat the process,
+	 * only this time acquiring the parent page and its parent, locked.
+	 * This process repeats until we succeed in the split, splitting the
+	 * root page as the final resort.  The entire process then repeats,
+	 * as necessary, until we split a leaf page.
+	 *
+	 * XXX
+	 * A traditional method of speeding this up is to maintain a stack of
+	 * the pages traversed in the original search.  You can detect if the
+	 * stack is correct by storing the page's LSN when it was searched and
+	 * comparing that LSN with the current one when it's locked during the
+	 * split.  This would be an easy change for this code, but I have no
+	 * numbers that indicate it's worthwhile.
+	 */
+	dir = UP;
+	level = LEAFLEVEL;
+	for (;;) {
+		/* Acquire the page at this level and its parent, locked. */
+		if (dbp->type == DB_BTREE)
+			ret = __bam_search(dbp,
+			    arg, S_WRPAIR, level, NULL, &exact);
+		else
+			ret = __bam_rsearch(dbp,
+			    (db_recno_t *)arg, S_WRPAIR, level, &exact);
+		if (ret != 0)
+			return (ret);
+
+		/* Split the page; the root needs special handling. */
+		if (t->bt_csp[0].page->pgno == PGNO_ROOT)
+			ret = __bam_root(dbp, &t->bt_csp[0]);
+		else
+			ret = __bam_page(dbp, &t->bt_csp[-1], &t->bt_csp[0]);
+
+		if (ret == 0) {
+			/* Once we've split the leaf page, we're done. */
+			if (level == LEAFLEVEL)
+				return (0);
+
+			/* Otherwise work back down toward the leaf. */
+			dir = DOWN;
+		} else if (ret == DB_NEEDSPLIT) {
+			/*
+			 * It's possible to fail to split repeatedly, as other
+			 * threads may be modifying the tree, or the page usage
+			 * is sufficiently bad that we don't get enough space
+			 * the first time.  Go (back) up a level.
+			 */
+			dir = UP;
+		} else
+			return (ret);
+
+		/* Move to the next level in the current direction. */
+		if (dir == UP)
+			++level;
+		else
+			--level;
+	}
+	/* NOTREACHED */
+}
+
+/*
+ * __bam_root --
+ * Split the root page of a btree.
+ *
+ * The items on the root page are divided between two newly allocated
+ * pages, after which the root page itself is rebuilt (by __ram_root or
+ * __bam_broot, depending on the access method) on top of them. The root
+ * page and the lock held in cp are released on both the success and the
+ * error paths.
+ *
+ * Returns 0 on success, ENOSPC if the root page is already at
+ * MAXBTREELEVEL, or the error returned by the failing routine.
+ */
+static int
+__bam_root(dbp, cp)
+ DB *dbp;
+ EPG *cp;
+{
+ BTREE *t;
+ PAGE *lp, *rp;
+ int ret;
+
+ t = dbp->internal;
+
+ /* Refuse to grow the tree beyond the maximum level. */
+ if (cp->page->level >= MAXBTREELEVEL)
+ return (ENOSPC);
+
+ /*
+ * Create new left and right pages for the split. Both pointers start
+ * out NULL so that the error path only frees what was allocated. The
+ * new pages are linked to each other only if the root is not an
+ * internal page.
+ */
+ lp = rp = NULL;
+ if ((ret = __bam_new(dbp, TYPE(cp->page), &lp)) != 0 ||
+ (ret = __bam_new(dbp, TYPE(cp->page), &rp)) != 0)
+ goto err;
+ P_INIT(lp, dbp->pgsize, lp->pgno,
+ PGNO_INVALID, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno,
+ cp->page->level, TYPE(cp->page));
+ P_INIT(rp, dbp->pgsize, rp->pgno,
+ ISINTERNAL(cp->page) ? PGNO_INVALID : lp->pgno, PGNO_INVALID,
+ cp->page->level, TYPE(cp->page));
+
+ /* Split the page. */
+ if ((ret = __bam_psplit(dbp, cp, lp, rp, 1)) != 0)
+ goto err;
+
+ /*
+ * Log the change. The log record carries the root page as it is
+ * before being rebuilt below; there is no following leaf page, so a
+ * zero page number and a zeroed LSN are passed for it.
+ */
+ if (DB_LOGGING(dbp)) {
+ DBT __a;
+ DB_LSN __lsn;
+ memset(&__a, 0, sizeof(__a));
+ __a.data = cp->page;
+ __a.size = dbp->pgsize;
+ ZERO_LSN(__lsn);
+ if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbp->txn,
+ &LSN(cp->page), 0, dbp->log_fileid, PGNO(lp), &LSN(lp),
+ PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), 0, &__lsn,
+ &__a)) != 0)
+ goto err;
+ /* Give both new pages the LSN now stored in the root page. */
+ LSN(lp) = LSN(rp) = LSN(cp->page);
+ }
+
+ /* Clean up the new root page. */
+ if ((ret = (dbp->type == DB_RECNO ?
+ __ram_root(dbp, cp->page, lp, rp) :
+ __bam_broot(dbp, cp->page, lp, rp))) != 0)
+ goto err;
+
+ /* Success -- write the real pages back to the store. */
+ (void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY);
+ (void)__BT_TLPUT(dbp, cp->lock);
+ (void)memp_fput(dbp->mpf, lp, DB_MPOOL_DIRTY);
+ (void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY);
+
+ /* A root split counts as a split in the statistics as well. */
+ ++t->lstat.bt_split;
+ ++t->lstat.bt_rootsplit;
+ return (0);
+
+ /* Failure: free any new pages and release the root without dirtying it. */
+err: if (lp != NULL)
+ (void)__bam_free(dbp, lp);
+ if (rp != NULL)
+ (void)__bam_free(dbp, rp);
+ (void)memp_fput(dbp->mpf, cp->page, 0);
+ (void)__BT_TLPUT(dbp, cp->lock);
+ return (ret);
+}
+
+/*
+ * __bam_page --
+ *	Split the non-root page of a btree.
+ *
+ * cp is the page to split and pp is its parent.  The contents of cp are
+ * divided between a scratch copy of the left page and a newly allocated
+ * right page; the parent is updated, the change is logged, and the scratch
+ * copy is then copied back over the original page.  Both pp and cp (pages
+ * and locks) are released on the success and the error paths.
+ *
+ * Returns 0 on success, or the error returned by the failing routine
+ * (__bam_split() expects DB_NEEDSPLIT when the parent has no room).
+ */
+static int
+__bam_page(dbp, pp, cp)
+	DB *dbp;
+	EPG *pp, *cp;
+{
+	BTREE *t;
+	DB_LOCK tplock;
+	PAGE *lp, *rp, *tp;
+	int ret;
+
+	t = dbp->internal;
+	lp = rp = tp = NULL;
+
+	/* Create new right page for the split. */
+	if ((ret = __bam_new(dbp, TYPE(cp->page), &rp)) != 0)
+		return (ret);
+	P_INIT(rp, dbp->pgsize, rp->pgno,
+	    ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->pgno,
+	    ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->next_pgno,
+	    cp->page->level, TYPE(cp->page));
+
+	/* Create new left page for the split. */
+	if ((lp = (PAGE *)malloc(dbp->pgsize)) == NULL) {
+		ret = ENOMEM;
+		goto err;
+	}
+#ifdef DEBUG
+	memset(lp, 0xff, dbp->pgsize);
+#endif
+	P_INIT(lp, dbp->pgsize, cp->page->pgno,
+	    ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->prev_pgno,
+	    ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno,
+	    cp->page->level, TYPE(cp->page));
+	ZERO_LSN(lp->lsn);
+
+	/*
+	 * Split right.
+	 *
+	 * Only the indices are sorted on the page, i.e., the key/data pairs
+	 * aren't, so it's simpler to copy the data from the split page onto
+	 * two new pages instead of copying half the data to the right page
+	 * and compacting the left page in place.  Since the left page can't
+	 * change, we swap the original and the allocated left page after the
+	 * split.
+	 */
+	if ((ret = __bam_psplit(dbp, cp, lp, rp, 0)) != 0)
+		goto err;
+
+	/*
+	 * Fix up the previous pointer of any leaf page following the split
+	 * page.
+	 *
+	 * !!!
+	 * There are interesting deadlock situations here as we write-lock a
+	 * page that's not in our direct ancestry.  Consider a cursor walking
+	 * through the leaf pages, that has the previous page read-locked and
+	 * is waiting on a lock for the page we just split.  It will deadlock
+	 * here.  If this is a problem, we can fail in the split; it's not a
+	 * problem as the split will succeed after the cursor passes through
+	 * the page we're splitting.
+	 */
+	if (TYPE(cp->page) == P_LBTREE && rp->next_pgno != PGNO_INVALID) {
+		if ((ret = __bam_lget(dbp,
+		    0, rp->next_pgno, DB_LOCK_WRITE, &tplock)) != 0)
+			goto err;
+		if ((ret = __bam_pget(dbp, &tp, &rp->next_pgno, 0)) != 0) {
+			/*
+			 * The error path only releases tplock when tp is
+			 * set, so drop the lock here or it would be leaked.
+			 */
+			(void)__BT_TLPUT(dbp, tplock);
+			tp = NULL;
+			goto err;
+		}
+	}
+
+	/* Insert the new pages into the parent page. */
+	if ((ret = __bam_pinsert(dbp, pp, lp, rp)) != 0)
+		goto err;
+
+	/* Log the change. */
+	if (DB_LOGGING(dbp)) {
+		DBT __a;
+		DB_LSN __lsn;
+		memset(&__a, 0, sizeof(__a));
+		__a.data = cp->page;
+		__a.size = dbp->pgsize;
+		/* __lsn is only passed on when there is no following page. */
+		if (tp == NULL)
+			ZERO_LSN(__lsn);
+		if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbp->txn,
+		    &cp->page->lsn, 0, dbp->log_fileid, PGNO(cp->page),
+		    &LSN(cp->page), PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp),
+		    tp == NULL ? 0 : PGNO(tp),
+		    tp == NULL ? &__lsn : &LSN(tp), &__a)) != 0)
+			goto err;
+
+		LSN(lp) = LSN(rp) = LSN(cp->page);
+		if (tp != NULL)
+			LSN(tp) = LSN(cp->page);
+	}
+
+	/*
+	 * Copy the allocated page into place: first the region below
+	 * LOFFSET, then the region from HOFFSET to the end of the page.
+	 */
+	memcpy(cp->page, lp, LOFFSET(lp));
+	memcpy((u_int8_t *)cp->page + HOFFSET(lp),
+	    (u_int8_t *)lp + HOFFSET(lp), dbp->pgsize - HOFFSET(lp));
+	FREE(lp, dbp->pgsize);
+	lp = NULL;
+
+	/* Finish the next-page link. */
+	if (tp != NULL)
+		tp->prev_pgno = rp->pgno;
+
+	/* Success -- write the real pages back to the store. */
+	(void)memp_fput(dbp->mpf, pp->page, DB_MPOOL_DIRTY);
+	(void)__BT_TLPUT(dbp, pp->lock);
+	(void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY);
+	(void)__BT_TLPUT(dbp, cp->lock);
+	(void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY);
+	if (tp != NULL) {
+		(void)memp_fput(dbp->mpf, tp, DB_MPOOL_DIRTY);
+		(void)__BT_TLPUT(dbp, tplock);
+	}
+	return (0);
+
+	/* Failure: discard the new pages and release everything unmodified. */
+err:	if (lp != NULL)
+		FREE(lp, dbp->pgsize);
+	if (rp != NULL)
+		(void)__bam_free(dbp, rp);
+	if (tp != NULL) {
+		(void)memp_fput(dbp->mpf, tp, 0);
+		(void)__BT_TLPUT(dbp, tplock);
+	}
+	(void)memp_fput(dbp->mpf, pp->page, 0);
+	(void)__BT_TLPUT(dbp, pp->lock);
+	(void)memp_fput(dbp->mpf, cp->page, 0);
+	(void)__BT_TLPUT(dbp, cp->lock);
+	return (ret);
+}
+
+/*
+ * __bam_broot --
+ *	Fix up the btree root page after it has been split.
+ *
+ * Re-initializes rootp as an internal page one level above lp and gives it
+ * two entries: an empty-key entry referencing lp and an entry referencing
+ * rp that carries a copy of the first key on rp.  When DB_BT_RECNUM is set
+ * the record counts of both children are stored as well.
+ *
+ * Returns 0 on success, or the error from __db_pitem, __db_ioff or
+ * __db_pgfmt.
+ *
+ * PUBLIC: int __bam_broot __P((DB *, PAGE *, PAGE *, PAGE *));
+ */
+int
+__bam_broot(dbp, rootp, lp, rp)
+	DB *dbp;
+	PAGE *rootp, *lp, *rp;
+{
+	BINTERNAL bi, *child_bi;
+	BKEYDATA *child_bk;
+	DBT hdr, data;
+	int ret;
+
+	/*
+	 * If the root page was a leaf page, change it into an internal page.
+	 * We copy the key we split on (but not the key's data, in the case of
+	 * a leaf page) to the new root page.
+	 */
+	P_INIT(rootp, dbp->pgsize,
+	    PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IBTREE);
+
+	/*
+	 * The header is copied onto the page up to the offset of its data
+	 * field, but nrecs is only assigned when DB_BT_RECNUM is set.  Zero
+	 * the structure so no uninitialized stack bytes reach the page.
+	 */
+	memset(&bi, 0, sizeof(bi));
+
+	/*
+	 * The btree comparison code guarantees that the left-most key on any
+	 * level of the tree is never used, so it doesn't need to be filled in.
+	 */
+	bi.len = 0;
+	bi.deleted = 0;
+	bi.type = B_KEYDATA;
+	bi.pgno = lp->pgno;
+	if (F_ISSET(dbp, DB_BT_RECNUM)) {
+		bi.nrecs = __bam_total(lp);
+		RE_NREC_SET(rootp, bi.nrecs);
+	}
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.data = &bi;
+	hdr.size = SSZA(BINTERNAL, data);
+	memset(&data, 0, sizeof(data));
+	data.data = (char *) "";
+	data.size = 0;
+	if ((ret =
+	    __db_pitem(dbp, rootp, 0, BINTERNAL_SIZE(0), &hdr, &data)) != 0)
+		return (ret);
+
+	switch (TYPE(rp)) {
+	case P_IBTREE:
+		/* Copy the first key of the child page onto the root page. */
+		child_bi = GET_BINTERNAL(rp, 0);
+
+		bi.len = child_bi->len;
+		bi.deleted = 0;
+		bi.type = child_bi->type;
+		bi.pgno = rp->pgno;
+		if (F_ISSET(dbp, DB_BT_RECNUM)) {
+			bi.nrecs = __bam_total(rp);
+			RE_NREC_ADJ(rootp, bi.nrecs);
+		}
+		hdr.data = &bi;
+		hdr.size = SSZA(BINTERNAL, data);
+		data.data = child_bi->data;
+		data.size = child_bi->len;
+		if ((ret = __db_pitem(dbp, rootp, 1,
+		    BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0)
+			return (ret);
+
+		/* Increment the overflow ref count. */
+		if (child_bi->type == B_OVERFLOW && (ret =
+		    __db_ioff(dbp, ((BOVERFLOW *)(child_bi->data))->pgno)) != 0)
+			return (ret);
+		break;
+	case P_LBTREE:
+		/* Copy the first key of the child page onto the root page. */
+		child_bk = GET_BKEYDATA(rp, 0);
+		switch (child_bk->type) {
+		case B_KEYDATA:
+			bi.len = child_bk->len;
+			bi.deleted = 0;
+			bi.type = child_bk->type;
+			bi.pgno = rp->pgno;
+			if (F_ISSET(dbp, DB_BT_RECNUM)) {
+				bi.nrecs = __bam_total(rp);
+				RE_NREC_ADJ(rootp, bi.nrecs);
+			}
+			hdr.data = &bi;
+			hdr.size = SSZA(BINTERNAL, data);
+			data.data = child_bk->data;
+			data.size = child_bk->len;
+			if ((ret = __db_pitem(dbp, rootp, 1,
+			    BINTERNAL_SIZE(child_bk->len), &hdr, &data)) != 0)
+				return (ret);
+			break;
+		case B_DUPLICATE:
+		case B_OVERFLOW:
+			/* The item itself is the BOVERFLOW structure. */
+			bi.len = BOVERFLOW_SIZE;
+			bi.deleted = 0;
+			bi.type = child_bk->type;
+			bi.pgno = rp->pgno;
+			if (F_ISSET(dbp, DB_BT_RECNUM)) {
+				bi.nrecs = __bam_total(rp);
+				RE_NREC_ADJ(rootp, bi.nrecs);
+			}
+			hdr.data = &bi;
+			hdr.size = SSZA(BINTERNAL, data);
+			data.data = child_bk;
+			data.size = BOVERFLOW_SIZE;
+			if ((ret = __db_pitem(dbp, rootp, 1,
+			    BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0)
+				return (ret);
+
+			/* Increment the overflow ref count. */
+			if (child_bk->type == B_OVERFLOW && (ret =
+			    __db_ioff(dbp, ((BOVERFLOW *)child_bk)->pgno)) != 0)
+				return (ret);
+			break;
+		default:
+			return (__db_pgfmt(dbp, rp->pgno));
+		}
+		break;
+	default:
+		return (__db_pgfmt(dbp, rp->pgno));
+	}
+	return (0);
+}
+
+/*
+ * __ram_root --
+ *	Rebuild the recno root page after a split: the root is re-initialized
+ *	as a P_IRECNO page one level above lp and given two RINTERNAL
+ *	entries, slot 0 referencing lp and slot 1 referencing rp.
+ *
+ *	Returns 0, or the non-zero result of the failing __db_pitem() call.
+ *
+ * PUBLIC: int __ram_root __P((DB *, PAGE *, PAGE *, PAGE *));
+ */
+int
+__ram_root(dbp, rootp, lp, rp)
+	DB *dbp;
+	PAGE *rootp, *lp, *rp;
+{
+	DBT item;
+	RINTERNAL entry;
+	int ret;
+
+	/* Start from an empty internal recno page. */
+	P_INIT(rootp, dbp->pgsize,
+	    PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IRECNO);
+
+	/* One DBT, pointing at the local entry, is reused for both inserts. */
+	memset(&item, 0, sizeof(item));
+	item.size = RINTERNAL_SIZE;
+	item.data = &entry;
+
+	/* Slot 0: the left page, whose total is stored with RE_NREC_SET. */
+	entry.nrecs = __bam_total(lp);
+	entry.pgno = lp->pgno;
+	ret = __db_pitem(dbp, rootp, 0, RINTERNAL_SIZE, &item, NULL);
+	if (ret != 0)
+		return (ret);
+	RE_NREC_SET(rootp, entry.nrecs);
+
+	/* Slot 1: the right page, whose total is applied with RE_NREC_ADJ. */
+	entry.nrecs = __bam_total(rp);
+	entry.pgno = rp->pgno;
+	ret = __db_pitem(dbp, rootp, 1, RINTERNAL_SIZE, &item, NULL);
+	if (ret != 0)
+		return (ret);
+	RE_NREC_ADJ(rootp, entry.nrecs);
+
+	return (0);
+}
+
+/*
+ * __bam_pinsert --
+ * Insert a new key into a parent page, completing the split.
+ *
+ * dbp: the database handle; dbp->internal is read as the BTREE.
+ * parent: parent->page is the page inserted into, and the new entry is
+ * placed at index parent->indx + O_INDX.
+ * lchild: the left page of the split; only consulted for prefix
+ * compression of a leaf key.
+ * rchild: the right page of the split; the new entry carries its page
+ * number and a copy (possibly shortened) of its first key.
+ *
+ * Returns 0 on success, DB_NEEDSPLIT when the parent page's free space is
+ * smaller than the new entry, the __db_pgfmt() result for an unexpected
+ * page or item type, or the non-zero return of __db_pitem(), __db_ioff()
+ * or __bam_cadjust_log().
+ */
+static int
+__bam_pinsert(dbp, parent, lchild, rchild)
+ DB *dbp;
+ EPG *parent;
+ PAGE *lchild, *rchild;
+{
+ BINTERNAL bi, *child_bi;
+ BKEYDATA *child_bk, *tmp_bk;
+ BTREE *t;
+ DBT a, b, hdr, data;
+ PAGE *ppage;
+ RINTERNAL ri;
+ db_indx_t off;
+ db_recno_t nrecs;
+ u_int32_t n, nbytes, nksize;
+ int ret;
+
+ t = dbp->internal;
+ ppage = parent->page;
+
+ /* If handling record numbers, count records split to the right page. */
+ nrecs = dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM) ?
+ __bam_total(rchild) : 0;
+
+ /*
+ * Now we insert the new page's first key into the parent page, which
+ * completes the split. The parent points to a PAGE and a page index
+ * offset, where the new key goes ONE AFTER the index, because we split
+ * to the right.
+ *
+ * XXX
+ * Some btree algorithms replace the key for the old page as well as
+ * the new page. We don't, as there's no reason to believe that the
+ * first key on the old page is any better than the key we have, and,
+ * in the case of a key being placed at index 0 causing the split, the
+ * key is unavailable.
+ */
+ off = parent->indx + O_INDX;
+
+ /*
+ * Calculate the space needed on the parent page.
+ *
+ * Prefix trees: space hack used when inserting into BINTERNAL pages.
+ * Retain only what's needed to distinguish between the new entry and
+ * the LAST entry on the page to its left. If the keys compare equal,
+ * retain the entire key. We ignore overflow keys, and the entire key
+ * must be retained for the next-to-leftmost key on the leftmost page
+ * of each level, or the search will fail. Applicable ONLY to internal
+ * pages that have leaf pages as children. Further reduction of the
+ * key between pairs of internal pages loses too much information.
+ */
+ switch (TYPE(rchild)) {
+ case P_IBTREE:
+ /* Internal right child: its first key is copied unshortened. */
+ child_bi = GET_BINTERNAL(rchild, 0);
+ nbytes = BINTERNAL_PSIZE(child_bi->len);
+
+ if (P_FREESPACE(ppage) < nbytes)
+ return (DB_NEEDSPLIT);
+
+ /* Add a new record for the right page. */
+ bi.len = child_bi->len;
+ bi.deleted = 0;
+ bi.type = child_bi->type;
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ /* hdr is the fixed BINTERNAL prefix, data the key bytes. */
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = &bi;
+ hdr.size = SSZA(BINTERNAL, data);
+ memset(&data, 0, sizeof(data));
+ data.data = child_bi->data;
+ data.size = child_bi->len;
+ if ((ret = __db_pitem(dbp, ppage, off,
+ BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0)
+ return (ret);
+
+ /* Increment the overflow ref count. */
+ if (child_bi->type == B_OVERFLOW && (ret =
+ __db_ioff(dbp, ((BOVERFLOW *)(child_bi->data))->pgno)) != 0)
+ return (ret);
+ break;
+ case P_LBTREE:
+ child_bk = GET_BKEYDATA(rchild, 0);
+ switch (child_bk->type) {
+ case B_KEYDATA:
+ /*
+ * Start from the full key; the prefix path below may shrink
+ * both nbytes and nksize. Every "goto noprefix" lands on the
+ * statement that resets nksize to the full key length.
+ */
+ nbytes = BINTERNAL_PSIZE(child_bk->len);
+ nksize = child_bk->len;
+ /* No prefix function configured. */
+ if (t->bt_prefix == NULL)
+ goto noprefix;
+ /* Parent has no previous page and the slot is 0 or 1. */
+ if (ppage->prev_pgno == PGNO_INVALID && off <= 1)
+ goto noprefix;
+ /* Item at NUM_ENT - P_INDX on the left child; skip non-B_KEYDATA. */
+ tmp_bk = GET_BKEYDATA(lchild, NUM_ENT(lchild) - P_INDX);
+ if (tmp_bk->type != B_KEYDATA)
+ goto noprefix;
+ memset(&a, 0, sizeof(a));
+ a.size = tmp_bk->len;
+ a.data = tmp_bk->data;
+ memset(&b, 0, sizeof(b));
+ b.size = child_bk->len;
+ b.data = child_bk->data;
+ /*
+ * The prefix function's result is taken as the key size to
+ * keep; it is used only if it makes the entry smaller, and
+ * the bytes saved are added to the handle's bt_pfxsaved.
+ */
+ nksize = t->bt_prefix(&a, &b);
+ if ((n = BINTERNAL_PSIZE(nksize)) < nbytes) {
+ t->lstat.bt_pfxsaved += nbytes - n;
+ nbytes = n;
+ } else
+noprefix: nksize = child_bk->len;
+
+ if (P_FREESPACE(ppage) < nbytes)
+ return (DB_NEEDSPLIT);
+
+ bi.len = nksize;
+ bi.deleted = 0;
+ bi.type = child_bk->type;
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = &bi;
+ hdr.size = SSZA(BINTERNAL, data);
+ memset(&data, 0, sizeof(data));
+ data.data = child_bk->data;
+ data.size = nksize;
+ if ((ret = __db_pitem(dbp, ppage, off,
+ BINTERNAL_SIZE(nksize), &hdr, &data)) != 0)
+ return (ret);
+ break;
+ case B_DUPLICATE:
+ case B_OVERFLOW:
+ /*
+ * The item itself (BOVERFLOW_SIZE bytes starting at child_bk)
+ * becomes the data portion of the new internal entry.
+ */
+ nbytes = BINTERNAL_PSIZE(BOVERFLOW_SIZE);
+
+ if (P_FREESPACE(ppage) < nbytes)
+ return (DB_NEEDSPLIT);
+
+ bi.len = BOVERFLOW_SIZE;
+ bi.deleted = 0;
+ bi.type = child_bk->type;
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = &bi;
+ hdr.size = SSZA(BINTERNAL, data);
+ memset(&data, 0, sizeof(data));
+ data.data = child_bk;
+ data.size = BOVERFLOW_SIZE;
+ if ((ret = __db_pitem(dbp, ppage, off,
+ BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0)
+ return (ret);
+
+ /* Increment the overflow ref count. */
+ if (child_bk->type == B_OVERFLOW && (ret =
+ __db_ioff(dbp, ((BOVERFLOW *)child_bk)->pgno)) != 0)
+ return (ret);
+ break;
+ default:
+ return (__db_pgfmt(dbp, rchild->pgno));
+ }
+ break;
+ case P_IRECNO:
+ case P_LRECNO:
+ /* Recno entries carry no key: only a page number and a count. */
+ nbytes = RINTERNAL_PSIZE;
+
+ if (P_FREESPACE(ppage) < nbytes)
+ return (DB_NEEDSPLIT);
+
+ /* Add a new record for the right page. */
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = &ri;
+ hdr.size = RINTERNAL_SIZE;
+ ri.pgno = rchild->pgno;
+ ri.nrecs = nrecs;
+ if ((ret = __db_pitem(dbp,
+ ppage, off, RINTERNAL_SIZE, &hdr, NULL)) != 0)
+ return (ret);
+ break;
+ default:
+ return (__db_pgfmt(dbp, rchild->pgno));
+ }
+
+ /*
+ * Adjust the parent page's left page record count: the nrecs records
+ * now counted by the new entry are subtracted from the entry at
+ * parent->indx.
+ */
+ if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) {
+ /* Log the change. */
+ if (DB_LOGGING(dbp) &&
+ (ret = __bam_cadjust_log(dbp->dbenv->lg_info,
+ dbp->txn, &LSN(ppage), 0, dbp->log_fileid,
+ PGNO(ppage), &LSN(ppage), (u_int32_t)parent->indx,
+ -(int32_t)nrecs, (int32_t)0)) != 0)
+ return (ret);
+
+ /* Update the left page count. */
+ if (dbp->type == DB_RECNO)
+ GET_RINTERNAL(ppage, parent->indx)->nrecs -= nrecs;
+ else
+ GET_BINTERNAL(ppage, parent->indx)->nrecs -= nrecs;
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_psplit --
+ * Do the real work of splitting the page.
+ *
+ * dbp: the database handle; dbp->internal is read as the BTREE.
+ * cp: cp->page is the page being split and cp->indx the index that is
+ * compared against the page's entry count to detect a first/last-slot
+ * insert.
+ * lp, rp: the pages that receive entries [0, splitp) and
+ * [splitp, NUM_ENT) of the split page, respectively.
+ * cleft: passed through unchanged to __bam_ca_split().
+ *
+ * Returns 0 on success, or the result of __db_pgfmt() / __bam_copy() on
+ * failure.
+ */
+static int
+__bam_psplit(dbp, cp, lp, rp, cleft)
+ DB *dbp;
+ EPG *cp;
+ PAGE *lp, *rp;
+ int cleft;
+{
+ BTREE *t;
+ PAGE *pp;
+ db_indx_t half, nbytes, off, splitp, top;
+ int adjust, cnt, isbigkey, ret;
+
+ t = dbp->internal;
+ pp = cp->page;
+ /* Index step per entry: P_INDX on btree leaf pages, else O_INDX. */
+ adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX;
+
+ /*
+ * If we're splitting the first (last) page on a level because we're
+ * inserting (appending) a key to it, it's likely that the data is
+ * sorted. Moving a single item to the new page is less work and can
+ * push the fill factor higher than normal. If we're wrong it's not
+ * a big deal, we'll just do the split the right way next time.
+ */
+ off = 0;
+ if (NEXT_PGNO(pp) == PGNO_INVALID &&
+ ((ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page) - 1) ||
+ (!ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page))))
+ off = NUM_ENT(cp->page) - adjust;
+ else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0)
+ off = adjust;
+
+ /* Every split is counted; a non-zero off marks a fast split. */
+ ++t->lstat.bt_split;
+ if (off != 0) {
+ ++t->lstat.bt_fastsplit;
+ goto sort;
+ }
+
+ /*
+ * Split the data to the left and right pages. Try not to split on
+ * an overflow key. (Overflow keys on internal pages will slow down
+ * searches.) Refuse to split in the middle of a set of duplicates.
+ *
+ * First, find the optimum place to split.
+ *
+ * It's possible to try and split past the last record on the page if
+ * there's a very large record at the end of the page. Make sure this
+ * doesn't happen by bounding the check at the next-to-last entry on
+ * the page.
+ *
+ * Note, we try and split half the data present on the page. This is
+ * because another process may have already split the page and left
+ * it half empty. We don't try and skip the split -- we don't know
+ * how much space we're going to need on the page, and we may need up
+ * to half the page for a big item, so there's no easy test to decide
+ * if we need to split or not. Besides, if two threads are inserting
+ * data into the same place in the database, we're probably going to
+ * need more space soon anyway.
+ */
+ top = NUM_ENT(pp) - adjust;
+ half = (dbp->pgsize - HOFFSET(pp)) / 2;
+ /* Accumulate item sizes from slot 0 until half is reached or top hit. */
+ for (nbytes = 0, off = 0; off < top && nbytes < half; ++off)
+ switch (TYPE(pp)) {
+ case P_IBTREE:
+ if (GET_BINTERNAL(pp, off)->type == B_KEYDATA)
+ nbytes +=
+ BINTERNAL_SIZE(GET_BINTERNAL(pp, off)->len);
+ else
+ nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE);
+ break;
+ case P_LBTREE:
+ /*
+ * Two slots are consumed per iteration on a btree leaf:
+ * this one, then (after the ++off below) the next one.
+ */
+ if (GET_BKEYDATA(pp, off)->type == B_KEYDATA)
+ nbytes +=
+ BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len);
+ else
+ nbytes += BOVERFLOW_SIZE;
+
+ ++off;
+ if (GET_BKEYDATA(pp, off)->type == B_KEYDATA)
+ nbytes +=
+ BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len);
+ else
+ nbytes += BOVERFLOW_SIZE;
+ break;
+ case P_IRECNO:
+ nbytes += RINTERNAL_SIZE;
+ break;
+ case P_LRECNO:
+ nbytes += BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len);
+ break;
+ default:
+ return (__db_pgfmt(dbp, pp->pgno));
+ }
+sort: splitp = off;
+
+ /*
+ * Splitp is either at or just past the optimum split point. If
+ * it's a big key, try and find something close by that's not.
+ *
+ * The search looks up to three entries after, then before, splitp.
+ *
+ * NOTE(review): in the forward test below, a P_IBTREE entry that is
+ * not B_KEYDATA falls through the || to GET_BKEYDATA() on an internal
+ * page entry, whereas the backward test selects the accessor with ?:.
+ * Confirm the asymmetry is intended.
+ */
+ if (TYPE(pp) == P_IBTREE)
+ isbigkey = GET_BINTERNAL(pp, off)->type != B_KEYDATA;
+ else if (TYPE(pp) == P_LBTREE)
+ isbigkey = GET_BKEYDATA(pp, off)->type != B_KEYDATA;
+ else
+ isbigkey = 0;
+ if (isbigkey)
+ for (cnt = 1; cnt <= 3; ++cnt) {
+ off = splitp + cnt * adjust;
+ if (off < (db_indx_t)NUM_ENT(pp) &&
+ ((TYPE(pp) == P_IBTREE &&
+ GET_BINTERNAL(pp, off)->type == B_KEYDATA) ||
+ GET_BKEYDATA(pp, off)->type == B_KEYDATA)) {
+ splitp = off;
+ break;
+ }
+ /* Not enough entries before splitp to step back cnt. */
+ if (splitp <= (db_indx_t)(cnt * adjust))
+ continue;
+ off = splitp - cnt * adjust;
+ if (TYPE(pp) == P_IBTREE ?
+ GET_BINTERNAL(pp, off)->type == B_KEYDATA :
+ GET_BKEYDATA(pp, off)->type == B_KEYDATA) {
+ splitp = off;
+ break;
+ }
+ }
+
+ /*
+ * We can't split in the middle a set of duplicates. We know that
+ * no duplicate set can take up more than about 25% of the page,
+ * because that's the point where we push it off onto a duplicate
+ * page set. So, this loop can't be unbounded.
+ *
+ * Membership in a set is detected by two index slots holding the
+ * same inp[] value. The loop has no exit other than finding a slot
+ * whose inp[] value differs from inp[splitp].
+ */
+ if (F_ISSET(dbp, DB_AM_DUP) && TYPE(pp) == P_LBTREE &&
+ pp->inp[splitp] == pp->inp[splitp - adjust])
+ for (cnt = 1;; ++cnt) {
+ off = splitp + cnt * adjust;
+ if (off < NUM_ENT(pp) &&
+ pp->inp[splitp] != pp->inp[off]) {
+ splitp = off;
+ break;
+ }
+ if (splitp <= (db_indx_t)(cnt * adjust))
+ continue;
+ off = splitp - cnt * adjust;
+ if (pp->inp[splitp] != pp->inp[off]) {
+ /* Split just after the differing earlier entry. */
+ splitp = off + adjust;
+ break;
+ }
+ }
+
+
+ /* We're going to split at splitp. */
+ if ((ret = __bam_copy(dbp, pp, lp, 0, splitp)) != 0)
+ return (ret);
+ if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0)
+ return (ret);
+
+ /* Adjust the cursors. */
+ __bam_ca_split(dbp, pp->pgno, lp->pgno, rp->pgno, splitp, cleft);
+ return (0);
+}
+
+/*
+ * __bam_copy --
+ *	Copy a set of records from one page to another.
+ *
+ *	Entries nxt through stop - 1 of the source page pp are copied to the
+ *	target page cp.  Target slots are filled starting at index 0, and
+ *	NUM_ENT(cp) is incremented once per entry copied.  Each item is
+ *	stored at HOFFSET(cp) after HOFFSET(cp) has been lowered by the
+ *	item's size.
+ *	NOTE(review): this presumably expects cp to be a freshly initialized
+ *	page -- confirm against the callers.
+ *
+ *	Returns 0, or the __db_pgfmt() result if pp is not one of the four
+ *	btree/recno page types.
+ *
+ * PUBLIC: int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t));
+ */
+int
+__bam_copy(dbp, pp, cp, nxt, stop)
+	DB *dbp;
+	PAGE *pp, *cp;
+	u_int32_t nxt, stop;
+{
+	db_indx_t nbytes, off;
+
+	/*
+	 * Nxt is the index of the next entry to take from the source page;
+	 * off is the index it is given on the target page.
+	 */
+	for (off = 0; nxt < stop; ++nxt, ++NUM_ENT(cp), ++off) {
+		switch (TYPE(pp)) {
+		case P_IBTREE:
+			if (GET_BINTERNAL(pp, nxt)->type == B_KEYDATA)
+				nbytes =
+				    BINTERNAL_SIZE(GET_BINTERNAL(pp, nxt)->len);
+			else
+				nbytes = BINTERNAL_SIZE(BOVERFLOW_SIZE);
+			break;
+		case P_LBTREE:
+			/*
+			 * If we're on a key and it's a duplicate, just copy
+			 * the offset.  The continue still runs the loop's
+			 * increment expressions, so the slot is counted.
+			 */
+			if (off != 0 && (nxt % P_INDX) == 0 &&
+			    pp->inp[nxt] == pp->inp[nxt - P_INDX]) {
+				cp->inp[off] = cp->inp[off - P_INDX];
+				continue;
+			}
+			/* FALLTHROUGH */
+		case P_LRECNO:
+			if (GET_BKEYDATA(pp, nxt)->type == B_KEYDATA)
+				nbytes =
+				    BKEYDATA_SIZE(GET_BKEYDATA(pp, nxt)->len);
+			else
+				nbytes = BOVERFLOW_SIZE;
+			break;
+		case P_IRECNO:
+			nbytes = RINTERNAL_SIZE;
+			break;
+		default:
+			return (__db_pgfmt(dbp, pp->pgno));
+		}
+		/* Reserve nbytes on the target page and copy the item in. */
+		cp->inp[off] = HOFFSET(cp) -= nbytes;
+		memcpy(P_ENTRY(cp, off), P_ENTRY(pp, nxt), nbytes);
+	}
+	return (0);
+}
diff --git a/db2/btree/bt_stat.c b/db2/btree/bt_stat.c
new file mode 100644
index 0000000000..ba71ea616d
--- /dev/null
+++ b/db2/btree/bt_stat.c
@@ -0,0 +1,257 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)bt_stat.c 10.11 (Sleepycat) 8/19/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+
+static void __bam_add_rstat __P((DB_BTREE_LSTAT *, DB_BTREE_STAT *));
+
+/*
+ * __bam_stat --
+ *	Gather/print the btree statistics
+ *
+ *	argdbp: the database handle.
+ *	spp: where the address of the statistics structure is stored; if
+ *	    NULL, nothing is gathered and 0 is returned.
+ *	db_malloc: allocator for the structure, or NULL to use malloc(3).
+ *	flags: checked by __db_statchk(); with DB_RECORDCOUNT set only
+ *	    bt_nrecs is filled in, from the root page.
+ *
+ *	Returns 0 on success, the __db_statchk() result for invalid flags,
+ *	ENOMEM if the allocation fails, or the error from a lock or page
+ *	request or from __db_pgfmt().  A lock or page failure during the
+ *	final page-by-page walk ends the walk early but is not reported.
+ *
+ * PUBLIC: int __bam_stat __P((DB *, void *, void *(*)(size_t), int));
+ */
+int
+__bam_stat(argdbp, spp, db_malloc, flags)
+	DB *argdbp;
+	void *spp;
+	void *(*db_malloc) __P((size_t));
+	int flags;
+{
+	BTMETA *meta;
+	BTREE *t;
+	DB *dbp;
+	DB_BTREE_STAT *sp;
+	DB_LOCK lock;
+	PAGE *h, *lastp;
+	db_pgno_t lastpgno, pgno;
+	int ret;
+
+	DEBUG_LWRITE(argdbp, NULL, "bam_stat", NULL, NULL, flags);
+
+	/* Check for invalid flags. */
+	if ((ret = __db_statchk(argdbp, flags)) != 0)
+		return (ret);
+
+	if (spp == NULL)
+		return (0);
+
+	GETHANDLE(argdbp, NULL, &dbp, ret);
+	t = dbp->internal;
+
+	/* Allocate and clear the structure. */
+	if ((sp = db_malloc == NULL ?
+	    (DB_BTREE_STAT *)malloc(sizeof(*sp)) :
+	    (DB_BTREE_STAT *)db_malloc(sizeof(*sp))) == NULL) {
+		ret = ENOMEM;
+		goto err;
+	}
+	memset(sp, 0, sizeof(*sp));
+
+	/* If the app just wants the record count, make it fast. */
+	if (LF_ISSET(DB_RECORDCOUNT)) {
+		pgno = PGNO_ROOT;
+		if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) != 0)
+			goto err;
+		if ((ret = __bam_pget(dbp, (PAGE **)&h, &pgno, 0)) != 0) {
+			/* Don't leave the root lock held on failure. */
+			(void)__BT_LPUT(dbp, lock);
+			goto err;
+		}
+
+		sp->bt_nrecs = RE_NREC(h);
+
+		(void)memp_fput(dbp->mpf, h, 0);
+		(void)__BT_LPUT(dbp, lock);
+		goto done;
+	}
+
+	/* Get the meta-data page. */
+	pgno = PGNO_METADATA;
+	if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) != 0)
+		goto err;
+	if ((ret = __bam_pget(dbp, (PAGE **)&meta, &pgno, 0)) != 0) {
+		/* Don't leave the meta-data lock held on failure. */
+		(void)__BT_TLPUT(dbp, lock);
+		goto err;
+	}
+
+	/* Translate the metadata flags. */
+	if (F_ISSET(meta, BTM_DUP))
+		sp->bt_flags |= DB_DUP;
+	if (F_ISSET(meta, BTM_FIXEDLEN))
+		sp->bt_flags |= DB_FIXEDLEN;
+	if (F_ISSET(meta, BTM_RECNUM))
+		sp->bt_flags |= DB_RECNUM;
+	if (F_ISSET(meta, BTM_RENUMBER))
+		sp->bt_flags |= DB_RENUMBER;
+
+	/*
+	 * Get the maxkey, minkey, re_len and re_pad fields from the
+	 * metadata.
+	 */
+	sp->bt_minkey = meta->minkey;
+	sp->bt_maxkey = meta->maxkey;
+	sp->bt_re_len = meta->re_len;
+	sp->bt_re_pad = meta->re_pad;
+
+	/* Get the page size from the DB. */
+	sp->bt_pagesize = dbp->pgsize;
+
+	/* Initialize counters with the meta-data page information. */
+	__bam_add_rstat(&meta->stat, sp);
+
+	/*
+	 * Add in the local information from this handle.
+	 *
+	 * !!!
+	 * This is a bit odd, but it gets us closer to the truth.
+	 */
+	__bam_add_rstat(&t->lstat, sp);
+
+	/* Walk the free list, counting pages. */
+	for (sp->bt_free = 0, pgno = meta->free; pgno != PGNO_INVALID;) {
+		++sp->bt_free;
+
+		if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) {
+			(void)memp_fput(dbp->mpf, meta, 0);
+			(void)__BT_TLPUT(dbp, lock);
+			goto err;
+		}
+		pgno = h->next_pgno;
+		(void)memp_fput(dbp->mpf, h, 0);
+	}
+
+	/* Discard the meta-data page. */
+	(void)memp_fput(dbp->mpf, meta, 0);
+	(void)__BT_TLPUT(dbp, lock);
+
+	/* Get the root page. */
+	pgno = PGNO_ROOT;
+	if ((ret = __bam_lget(dbp, 0, PGNO_ROOT, DB_LOCK_READ, &lock)) != 0)
+		goto err;
+	if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) {
+		(void)__BT_LPUT(dbp, lock);
+		goto err;
+	}
+
+	/* Get the levels from the root page. */
+	sp->bt_levels = h->level;
+
+	/*
+	 * Determine the last page of the database.  The last page is fetched
+	 * through its own pointer and released at once: h must keep
+	 * referring to the pinned root page, which is the first page the
+	 * walk below examines and releases.
+	 */
+	if ((ret =
+	    memp_fget(dbp->mpf, &lastpgno, DB_MPOOL_LAST, &lastp)) != 0) {
+		(void)memp_fput(dbp->mpf, h, 0);
+		(void)__BT_LPUT(dbp, lock);
+		goto err;
+	}
+	(void)memp_fput(dbp->mpf, lastp, 0);
+
+	/* Walk the pages from the root to the last page, counting things. */
+	for (;;) {
+		switch (TYPE(h)) {
+		case P_INVALID:
+			break;
+		case P_IBTREE:
+		case P_IRECNO:
+			++sp->bt_int_pg;
+			sp->bt_int_pgfree += HOFFSET(h) - LOFFSET(h);
+			break;
+		case P_LBTREE:
+			++sp->bt_leaf_pg;
+			sp->bt_leaf_pgfree += HOFFSET(h) - LOFFSET(h);
+			sp->bt_nrecs += NUM_ENT(h) / P_INDX;
+			break;
+		case P_LRECNO:
+			++sp->bt_leaf_pg;
+			sp->bt_leaf_pgfree += HOFFSET(h) - LOFFSET(h);
+			sp->bt_nrecs += NUM_ENT(h);
+			break;
+		case P_DUPLICATE:
+			++sp->bt_dup_pg;
+			/* XXX MARGO: sp->bt_dup_pgfree; */
+			break;
+		case P_OVERFLOW:
+			++sp->bt_over_pg;
+			/* XXX MARGO: sp->bt_over_pgfree; */
+			break;
+		default:
+			/* Leave through err so the handle is put back. */
+			(void)memp_fput(dbp->mpf, h, 0);
+			(void)__BT_LPUT(dbp, lock);
+			ret = __db_pgfmt(dbp, pgno);
+			goto err;
+		}
+
+		(void)memp_fput(dbp->mpf, h, 0);
+		(void)__BT_LPUT(dbp, lock);
+
+		if (++pgno > lastpgno)
+			break;
+		if (__bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock))
+			break;
+		if (memp_fget(dbp->mpf, &pgno, 0, &h) != 0) {
+			(void)__BT_LPUT(dbp, lock);
+			break;
+		}
+	}
+
+	/* Ownership of the structure passes to the caller. */
+done:	*(DB_BTREE_STAT **)spp = sp;
+	sp = NULL;
+	ret = 0;
+
+	/*
+	 * A structure still owned here was never handed to the caller.  It
+	 * can only be released when it came from malloc(3); there is no
+	 * matching release routine for an application-supplied db_malloc.
+	 */
+err:	if (sp != NULL && db_malloc == NULL)
+		free(sp);
+	PUTHANDLE(dbp);
+	return (ret);
+}
+
+/*
+ * __bam_add_mstat --
+ *	Accumulate one set of local btree statistics into another: every
+ *	counter in *from is added to the counter of the same name in *to.
+ *
+ * PUBLIC: void __bam_add_mstat __P((DB_BTREE_LSTAT *, DB_BTREE_LSTAT *));
+ */
+void
+__bam_add_mstat(from, to)
+	DB_BTREE_LSTAT *from;
+	DB_BTREE_LSTAT *to;
+{
+	/* Add a single named counter of *from into *to. */
+#define	BAM_MSTAT_SUM(field)	(to->field += from->field)
+	BAM_MSTAT_SUM(bt_freed);
+	BAM_MSTAT_SUM(bt_pfxsaved);
+	BAM_MSTAT_SUM(bt_split);
+	BAM_MSTAT_SUM(bt_rootsplit);
+	BAM_MSTAT_SUM(bt_fastsplit);
+	BAM_MSTAT_SUM(bt_added);
+	BAM_MSTAT_SUM(bt_deleted);
+	BAM_MSTAT_SUM(bt_get);
+	BAM_MSTAT_SUM(bt_cache_hit);
+	BAM_MSTAT_SUM(bt_cache_miss);
+#undef	BAM_MSTAT_SUM
+}
+
+/*
+ * __bam_add_rstat --
+ *	Accumulate local btree statistics into the structure returned to the
+ *	application: every counter in *from is added to the counter of the
+ *	same name in *to.
+ */
+static void
+__bam_add_rstat(from, to)
+	DB_BTREE_LSTAT *from;
+	DB_BTREE_STAT *to;
+{
+	/* Add a single named counter of *from into *to. */
+#define	BAM_RSTAT_SUM(field)	(to->field += from->field)
+	BAM_RSTAT_SUM(bt_freed);
+	BAM_RSTAT_SUM(bt_pfxsaved);
+	BAM_RSTAT_SUM(bt_split);
+	BAM_RSTAT_SUM(bt_rootsplit);
+	BAM_RSTAT_SUM(bt_fastsplit);
+	BAM_RSTAT_SUM(bt_added);
+	BAM_RSTAT_SUM(bt_deleted);
+	BAM_RSTAT_SUM(bt_get);
+	BAM_RSTAT_SUM(bt_cache_hit);
+	BAM_RSTAT_SUM(bt_cache_miss);
+#undef	BAM_RSTAT_SUM
+}
diff --git a/db2/btree/btree.src b/db2/btree/btree.src
new file mode 100644
index 0000000000..50cc0dd0ff
--- /dev/null
+++ b/db2/btree/btree.src
@@ -0,0 +1,137 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)btree.src 10.3 (Sleepycat) 8/17/97";
+#endif /* not lint */
+
+PREFIX bam
+
+/*
+ * BTREE-pg_alloc: used to record allocating a new page.
+ *
+ * meta_lsn: the meta-data page's original lsn.
+ * page_lsn: the allocated page's original lsn.
+ * pgno: the page allocated.
+ * ptype: the type of the page allocated.
+ * next: the next page on the free list.
+ */
+BEGIN pg_alloc
+ARG fileid u_int32_t lu
+POINTER meta_lsn DB_LSN * lu
+POINTER page_lsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+ARG ptype u_int32_t lu
+ARG next db_pgno_t lu
+END
+
+/*
+ * BTREE-pg_free: used to record freeing a page.
+ *
+ * pgno: the page being freed.
+ * meta_lsn: the meta-data page's original lsn.
+ * header: the header from the freed page.
+ * next: the previous next pointer on the metadata page.
+ */
+BEGIN pg_free
+ARG fileid u_int32_t lu
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+DBT header DBT s
+ARG next db_pgno_t lu
+END
+
+/*
+ * BTREE-split: used to log a page split.
+ *
+ * left: the page number for the low-order contents.
+ * llsn: the left page's original LSN.
+ * right: the page number for the high-order contents.
+ * rlsn: the right page's original LSN.
+ * indx: the number of entries that went to the left page.
+ * npgno: the next page number
+ * nlsn: the next page's original LSN (or 0 if no next page).
+ * pg: the split page's contents before the split.
+ */
+BEGIN split
+ARG fileid u_int32_t lu
+ARG left db_pgno_t lu
+POINTER llsn DB_LSN * lu
+ARG right db_pgno_t lu
+POINTER rlsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG npgno db_pgno_t lu
+POINTER nlsn DB_LSN * lu
+DBT pg DBT s
+END
+
+/*
+ * BTREE-rsplit: used to log a reverse-split
+ *
+ * pgno: the page number of the page copied over the root.
+ * pgdbt: the page being copied on the root page.
+ * rootent: last entry on the root page.
+ * rootlsn: the root page's original lsn.
+ */
+BEGIN rsplit
+ARG fileid u_int32_t lu
+ARG pgno db_pgno_t lu
+DBT pgdbt DBT s
+DBT rootent DBT s
+POINTER rootlsn DB_LSN * lu
+END
+
+/*
+ * BTREE-adj: used to log the adjustment of an index.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index adjusted.
+ * indx_copy: the index to copy if inserting.
+ * is_insert: 0 if a delete, 1 if an insert.
+ */
+BEGIN adj
+ARG fileid u_int32_t lu
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG indx_copy u_int32_t lu
+ARG is_insert u_int32_t lu
+END
+
+/*
+ * BTREE-cadjust: used to adjust the count change in an internal page.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index to be adjusted.
+ * adjust: the signed adjustment.
+ * total: if the total tree entries count should be adjusted
+ */
+BEGIN cadjust
+ARG fileid u_int32_t lu
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG adjust int32_t ld
+ARG total int32_t ld
+END
+
+/*
+ * BTREE-cdel: used to log the intent-to-delete of a cursor record.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index to be deleted.
+ */
+BEGIN cdel
+ARG fileid u_int32_t lu
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+END
diff --git a/db2/btree/btree_auto.c b/db2/btree/btree_auto.c
new file mode 100644
index 0000000000..e6b72252e5
--- /dev/null
+++ b/db2/btree/btree_auto.c
@@ -0,0 +1,1279 @@
+/* Do not edit: automatically built by dist/db_gen.sh. */
+#include "config.h"
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <ctype.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_page.h"
+#include "db_dispatch.h"
+#include "btree.h"
+#include "db_am.h"
+#include "common_ext.h"
+
+/*
+ * __bam_pg_alloc_log --
+ *	Marshal a bam_pg_alloc log record into a temporary buffer and hand
+ *	it to log_put().  The record is the record type, the transaction id
+ *	and the transaction's previous LSN (all zero without a transaction),
+ *	followed by the arguments in declaration order.  A NULL meta_lsn or
+ *	page_lsn is written as zero bytes.
+ *
+ *	Returns ENOMEM if the buffer cannot be allocated, otherwise the
+ *	log_put() result.  With a transaction, its last_lsn is set from
+ *	*ret_lsnp after the log_put() call.
+ *
+ * PUBLIC: int __bam_pg_alloc_log
+ * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+ * PUBLIC: u_int32_t, DB_LSN *, DB_LSN *, db_pgno_t,
+ * PUBLIC: u_int32_t, db_pgno_t));
+ */
+int __bam_pg_alloc_log(logp, txnid, ret_lsnp, flags,
+	fileid, meta_lsn, page_lsn, pgno, ptype, next)
+	DB_LOG *logp;
+	DB_TXN *txnid;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	u_int32_t fileid;
+	DB_LSN * meta_lsn;
+	DB_LSN * page_lsn;
+	db_pgno_t pgno;
+	u_int32_t ptype;
+	db_pgno_t next;
+{
+	DBT logrec;
+	DB_LSN null_lsn, *prev_lsnp;
+	u_int32_t rectype, txn_num;
+	u_int8_t *out;
+	int ret;
+
+	/* Record prefix: type, transaction id and previous LSN. */
+	rectype = DB_bam_pg_alloc;
+	if (txnid != NULL) {
+		txn_num = txnid->txnid;
+		prev_lsnp = &txnid->last_lsn;
+	} else {
+		txn_num = 0;
+		null_lsn.file = 0;
+		null_lsn.offset = 0;
+		prev_lsnp = &null_lsn;
+	}
+
+	/* The record size is the prefix plus every argument. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN);
+	logrec.size += sizeof(fileid);
+	logrec.size += sizeof(*meta_lsn);
+	logrec.size += sizeof(*page_lsn);
+	logrec.size += sizeof(pgno);
+	logrec.size += sizeof(ptype);
+	logrec.size += sizeof(next);
+
+	logrec.data = (void *)malloc(logrec.size);
+	if (logrec.data == NULL)
+		return (ENOMEM);
+	out = logrec.data;
+
+	/* Append len bytes from src, or an LSN that may be NULL. */
+#define	PG_ALLOC_PUT(src, len) do {					\
+	memcpy(out, (src), (len));					\
+	out += (len);							\
+} while (0)
+#define	PG_ALLOC_PUT_LSN(lsn) do {					\
+	if ((lsn) == NULL)						\
+		memset(out, 0, sizeof(*(lsn)));				\
+	else								\
+		memcpy(out, (lsn), sizeof(*(lsn)));			\
+	out += sizeof(*(lsn));						\
+} while (0)
+	PG_ALLOC_PUT(&rectype, sizeof(rectype));
+	PG_ALLOC_PUT(&txn_num, sizeof(txn_num));
+	PG_ALLOC_PUT(prev_lsnp, sizeof(DB_LSN));
+	PG_ALLOC_PUT(&fileid, sizeof(fileid));
+	PG_ALLOC_PUT_LSN(meta_lsn);
+	PG_ALLOC_PUT_LSN(page_lsn);
+	PG_ALLOC_PUT(&pgno, sizeof(pgno));
+	PG_ALLOC_PUT(&ptype, sizeof(ptype));
+	PG_ALLOC_PUT(&next, sizeof(next));
+#undef	PG_ALLOC_PUT_LSN
+#undef	PG_ALLOC_PUT
+
+#ifdef DEBUG
+	if ((u_int32_t)(out - (u_int8_t *)logrec.data) != logrec.size)
+		fprintf(stderr, "Error in log record length");
+#endif
+	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
+	if (txnid != NULL)
+		txnid->last_lsn = *ret_lsnp;
+	free(logrec.data);
+	return (ret);
+}
+
+/*
+ * __bam_pg_alloc_print --
+ *	Decode the bam_pg_alloc log record in dbtp->data and print its
+ *	fields on stdout; lsnp supplies the record's own LSN for the first
+ *	line.  Returns 0, or the __bam_pg_alloc_read() error.
+ *
+ * PUBLIC: int __bam_pg_alloc_print
+ * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ */
+
+int
+__bam_pg_alloc_print(notused1, dbtp, lsnp, notused3, notused4)
+	DB_LOG *notused1;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	int notused3;
+	void *notused4;
+{
+	__bam_pg_alloc_args *rec;
+	int ret;
+
+	/* Presumably assigned only to quiet unused-parameter warnings. */
+	notused1 = NULL;
+	notused3 = 0;
+	notused4 = NULL;
+
+	ret = __bam_pg_alloc_read(dbtp->data, &rec);
+	if (ret != 0)
+		return (ret);
+
+	/* Common prefix, then one line per argument. */
+	printf("[%lu][%lu]bam_pg_alloc: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+	    (u_long)lsnp->file, (u_long)lsnp->offset,
+	    (u_long)rec->type, (u_long)rec->txnid->txnid,
+	    (u_long)rec->prev_lsn.file, (u_long)rec->prev_lsn.offset);
+	printf("\tfileid: %lu\n", (u_long)rec->fileid);
+	printf("\tmeta_lsn: [%lu][%lu]\n",
+	    (u_long)rec->meta_lsn.file, (u_long)rec->meta_lsn.offset);
+	printf("\tpage_lsn: [%lu][%lu]\n",
+	    (u_long)rec->page_lsn.file, (u_long)rec->page_lsn.offset);
+	printf("\tpgno: %lu\n", (u_long)rec->pgno);
+	printf("\tptype: %lu\n", (u_long)rec->ptype);
+	printf("\tnext: %lu\n", (u_long)rec->next);
+	printf("\n");
+
+	free(rec);
+	return (0);
+}
+
+/*
+ * __bam_pg_alloc_read --
+ *	Unmarshal the bam_pg_alloc log record at recbuf into a newly
+ *	allocated __bam_pg_alloc_args, stored in *argpp.  The structure and
+ *	the DB_TXN its txnid field points to share a single allocation, so
+ *	one free() releases both.  Returns 0, or ENOMEM.
+ *
+ * PUBLIC: int __bam_pg_alloc_read __P((void *, __bam_pg_alloc_args **));
+ */
+int
+__bam_pg_alloc_read(recbuf, argpp)
+	void *recbuf;
+	__bam_pg_alloc_args **argpp;
+{
+	__bam_pg_alloc_args *rec;
+	u_int8_t *in;
+
+	rec = (__bam_pg_alloc_args *)malloc(
+	    sizeof(__bam_pg_alloc_args) + sizeof(DB_TXN));
+	if (rec == NULL)
+		return (ENOMEM);
+	/* The DB_TXN lives directly behind the argument structure. */
+	rec->txnid = (DB_TXN *)(rec + 1);
+
+	in = recbuf;
+	/* Copy the next len bytes of the record into dst. */
+#define	PG_ALLOC_GET(dst, len) do {					\
+	memcpy((dst), in, (len));					\
+	in += (len);							\
+} while (0)
+	PG_ALLOC_GET(&rec->type, sizeof(rec->type));
+	PG_ALLOC_GET(&rec->txnid->txnid, sizeof(rec->txnid->txnid));
+	PG_ALLOC_GET(&rec->prev_lsn, sizeof(DB_LSN));
+	PG_ALLOC_GET(&rec->fileid, sizeof(rec->fileid));
+	PG_ALLOC_GET(&rec->meta_lsn, sizeof(rec->meta_lsn));
+	PG_ALLOC_GET(&rec->page_lsn, sizeof(rec->page_lsn));
+	PG_ALLOC_GET(&rec->pgno, sizeof(rec->pgno));
+	PG_ALLOC_GET(&rec->ptype, sizeof(rec->ptype));
+	PG_ALLOC_GET(&rec->next, sizeof(rec->next));
+#undef	PG_ALLOC_GET
+
+	*argpp = rec;
+	return (0);
+}
+
+/*
+ * __bam_pg_free_log --
+ *	Marshal a bam_pg_free log record into a temporary buffer and hand it
+ *	to log_put().  The record is the record type, the transaction id and
+ *	the transaction's previous LSN (all zero without a transaction),
+ *	followed by the arguments in declaration order.  A NULL meta_lsn is
+ *	written as zero bytes; the header DBT is written as a 32-bit length
+ *	followed by its bytes, or as a zero length when header is NULL.
+ *
+ *	Returns ENOMEM if the buffer cannot be allocated, otherwise the
+ *	log_put() result.  With a transaction, its last_lsn is set from
+ *	*ret_lsnp after the log_put() call.
+ *
+ * PUBLIC: int __bam_pg_free_log
+ * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, DBT *,
+ * PUBLIC: db_pgno_t));
+ */
+int __bam_pg_free_log(logp, txnid, ret_lsnp, flags,
+	fileid, pgno, meta_lsn, header, next)
+	DB_LOG *logp;
+	DB_TXN *txnid;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	u_int32_t fileid;
+	db_pgno_t pgno;
+	DB_LSN * meta_lsn;
+	DBT *header;
+	db_pgno_t next;
+{
+	DBT logrec;
+	DB_LSN null_lsn, *prev_lsnp;
+	u_int32_t rectype, txn_num, zero;
+	u_int8_t *out;
+	int ret;
+
+	/* Record prefix: type, transaction id and previous LSN. */
+	rectype = DB_bam_pg_free;
+	if (txnid != NULL) {
+		txn_num = txnid->txnid;
+		prev_lsnp = &txnid->last_lsn;
+	} else {
+		txn_num = 0;
+		null_lsn.file = 0;
+		null_lsn.offset = 0;
+		prev_lsnp = &null_lsn;
+	}
+
+	/* The record size is the prefix plus every argument. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN);
+	logrec.size += sizeof(fileid);
+	logrec.size += sizeof(pgno);
+	logrec.size += sizeof(*meta_lsn);
+	logrec.size += sizeof(u_int32_t) + (header == NULL ? 0 : header->size);
+	logrec.size += sizeof(next);
+
+	logrec.data = (void *)malloc(logrec.size);
+	if (logrec.data == NULL)
+		return (ENOMEM);
+	out = logrec.data;
+
+	/* Append len bytes from src to the record. */
+#define	PG_FREE_PUT(src, len) do {					\
+	memcpy(out, (src), (len));					\
+	out += (len);							\
+} while (0)
+	PG_FREE_PUT(&rectype, sizeof(rectype));
+	PG_FREE_PUT(&txn_num, sizeof(txn_num));
+	PG_FREE_PUT(prev_lsnp, sizeof(DB_LSN));
+	PG_FREE_PUT(&fileid, sizeof(fileid));
+	PG_FREE_PUT(&pgno, sizeof(pgno));
+
+	if (meta_lsn == NULL)
+		memset(out, 0, sizeof(*meta_lsn));
+	else
+		memcpy(out, meta_lsn, sizeof(*meta_lsn));
+	out += sizeof(*meta_lsn);
+
+	if (header != NULL) {
+		PG_FREE_PUT(&header->size, sizeof(header->size));
+		PG_FREE_PUT(header->data, header->size);
+	} else {
+		zero = 0;
+		PG_FREE_PUT(&zero, sizeof(u_int32_t));
+	}
+
+	PG_FREE_PUT(&next, sizeof(next));
+#undef	PG_FREE_PUT
+
+#ifdef DEBUG
+	if ((u_int32_t)(out - (u_int8_t *)logrec.data) != logrec.size)
+		fprintf(stderr, "Error in log record length");
+#endif
+	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
+	if (txnid != NULL)
+		txnid->last_lsn = *ret_lsnp;
+	free(logrec.data);
+	return (ret);
+}
+
+/*
+ * __bam_pg_free_print --
+ *	Decode the bam_pg_free log record in dbtp->data and print its fields
+ *	on stdout; lsnp supplies the record's own LSN for the first line.
+ *	Header bytes that are printable, or a newline, are written as
+ *	characters; every other byte is written in hexadecimal.
+ *
+ *	Returns 0, or the __bam_pg_free_read() error.
+ *
+ * PUBLIC: int __bam_pg_free_print
+ * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ */
+
+int
+__bam_pg_free_print(notused1, dbtp, lsnp, notused3, notused4)
+	DB_LOG *notused1;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	int notused3;
+	void *notused4;
+{
+	__bam_pg_free_args *argp;
+	u_int32_t i;
+	int c, ret;
+
+	i = 0;
+	c = 0;
+	/* Presumably assigned only to quiet unused-parameter warnings. */
+	notused1 = NULL;
+	notused3 = 0;
+	notused4 = NULL;
+
+	if ((ret = __bam_pg_free_read(dbtp->data, &argp)) != 0)
+		return (ret);
+	printf("[%lu][%lu]bam_pg_free: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+	    (u_long)lsnp->file,
+	    (u_long)lsnp->offset,
+	    (u_long)argp->type,
+	    (u_long)argp->txnid->txnid,
+	    (u_long)argp->prev_lsn.file,
+	    (u_long)argp->prev_lsn.offset);
+	printf("\tfileid: %lu\n", (u_long)argp->fileid);
+	printf("\tpgno: %lu\n", (u_long)argp->pgno);
+	printf("\tmeta_lsn: [%lu][%lu]\n",
+	    (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+	printf("\theader: ");
+	for (i = 0; i < argp->header.size; i++) {
+		/*
+		 * Read the byte as unsigned: isprint() is only defined for
+		 * values representable as unsigned char (or EOF), and a
+		 * sign-extended byte would print as 0xffffffXX.
+		 */
+		c = ((u_int8_t *)argp->header.data)[i];
+		if (isprint(c) || c == 0xa)
+			putchar(c);
+		else
+			printf("%#x ", c);
+	}
+	printf("\n");
+	printf("\tnext: %lu\n", (u_long)argp->next);
+	printf("\n");
+	free(argp);
+	return (0);
+}
+
+/*
+ * __bam_pg_free_read --
+ *	Unmarshal the bam_pg_free log record at recbuf into a newly
+ *	allocated __bam_pg_free_args, stored in *argpp.  The structure and
+ *	the DB_TXN its txnid field points to share a single allocation, so
+ *	one free() releases both.  The header bytes are not copied:
+ *	header.data points into recbuf.  Returns 0, or ENOMEM.
+ *
+ * PUBLIC: int __bam_pg_free_read __P((void *, __bam_pg_free_args **));
+ */
+int
+__bam_pg_free_read(recbuf, argpp)
+	void *recbuf;
+	__bam_pg_free_args **argpp;
+{
+	__bam_pg_free_args *rec;
+	u_int8_t *in;
+
+	rec = (__bam_pg_free_args *)malloc(
+	    sizeof(__bam_pg_free_args) + sizeof(DB_TXN));
+	if (rec == NULL)
+		return (ENOMEM);
+	/* The DB_TXN lives directly behind the argument structure. */
+	rec->txnid = (DB_TXN *)(rec + 1);
+
+	in = recbuf;
+	/* Copy the next len bytes of the record into dst. */
+#define	PG_FREE_GET(dst, len) do {					\
+	memcpy((dst), in, (len));					\
+	in += (len);							\
+} while (0)
+	PG_FREE_GET(&rec->type, sizeof(rec->type));
+	PG_FREE_GET(&rec->txnid->txnid, sizeof(rec->txnid->txnid));
+	PG_FREE_GET(&rec->prev_lsn, sizeof(DB_LSN));
+	PG_FREE_GET(&rec->fileid, sizeof(rec->fileid));
+	PG_FREE_GET(&rec->pgno, sizeof(rec->pgno));
+	PG_FREE_GET(&rec->meta_lsn, sizeof(rec->meta_lsn));
+
+	/* Length-prefixed header: reference the bytes in place, then skip. */
+	PG_FREE_GET(&rec->header.size, sizeof(u_int32_t));
+	rec->header.data = in;
+	in += rec->header.size;
+
+	PG_FREE_GET(&rec->next, sizeof(rec->next));
+#undef	PG_FREE_GET
+
+	*argpp = rec;
+	return (0);
+}
+
+/*
+ * PUBLIC: int __bam_split_log
+ * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t,
+ * PUBLIC: DB_LSN *, u_int32_t, db_pgno_t, DB_LSN *,
+ * PUBLIC: DBT *));
+ *
+ * __bam_split_log --
+ *	Marshal the arguments into a DB_bam_split log record and pass it to
+ *	log_put().  The record is: type, transaction id, the transaction's
+ *	previous LSN, fileid, left, llsn, right, rlsn, indx, npgno, nlsn,
+ *	then pg as a 32-bit length followed by its bytes.
+ *
+ *	txnid may be NULL, in which case the transaction id and previous
+ *	LSN are written as zero.  NULL llsn/rlsn/nlsn are written as
+ *	all-zero bytes and a NULL pg as a zero length.
+ *
+ *	Returns ENOMEM if the record buffer cannot be allocated, otherwise
+ *	the return value of log_put().  On success with a non-NULL txnid,
+ *	txnid->last_lsn is set to *ret_lsnp.
+ */
+int __bam_split_log(logp, txnid, ret_lsnp, flags,
+	fileid, left, llsn, right, rlsn, indx,
+	npgno, nlsn, pg)
+	DB_LOG *logp;
+	DB_TXN *txnid;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	u_int32_t fileid;
+	db_pgno_t left;
+	DB_LSN * llsn;
+	db_pgno_t right;
+	DB_LSN * rlsn;
+	u_int32_t indx;
+	db_pgno_t npgno;
+	DB_LSN * nlsn;
+	DBT *pg;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn;
+	u_int32_t zero;
+	u_int32_t rectype, txn_num;
+	int ret;
+	u_int8_t *bp;
+
+	rectype = DB_bam_split;
+	txn_num = txnid == NULL ? 0 : txnid->txnid;
+	if (txnid == NULL) {
+		null_lsn.file = 0;
+		null_lsn.offset = 0;
+		lsnp = &null_lsn;
+	} else
+		lsnp = &txnid->last_lsn;
+
+	/* The size must match, field for field, what is copied below. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(fileid)
+	    + sizeof(left)
+	    + sizeof(*llsn)
+	    + sizeof(right)
+	    + sizeof(*rlsn)
+	    + sizeof(indx)
+	    + sizeof(npgno)
+	    + sizeof(*nlsn)
+	    + sizeof(u_int32_t) + (pg == NULL ? 0 : pg->size);
+	if ((logrec.data = (void *)malloc(logrec.size)) == NULL)
+		return (ENOMEM);
+
+	bp = logrec.data;
+	memcpy(bp, &rectype, sizeof(rectype));
+	bp += sizeof(rectype);
+	memcpy(bp, &txn_num, sizeof(txn_num));
+	bp += sizeof(txn_num);
+	memcpy(bp, lsnp, sizeof(DB_LSN));
+	bp += sizeof(DB_LSN);
+	memcpy(bp, &fileid, sizeof(fileid));
+	bp += sizeof(fileid);
+	memcpy(bp, &left, sizeof(left));
+	bp += sizeof(left);
+	if (llsn != NULL)
+		memcpy(bp, llsn, sizeof(*llsn));
+	else
+		memset(bp, 0, sizeof(*llsn));
+	bp += sizeof(*llsn);
+	memcpy(bp, &right, sizeof(right));
+	bp += sizeof(right);
+	if (rlsn != NULL)
+		memcpy(bp, rlsn, sizeof(*rlsn));
+	else
+		memset(bp, 0, sizeof(*rlsn));
+	bp += sizeof(*rlsn);
+	memcpy(bp, &indx, sizeof(indx));
+	bp += sizeof(indx);
+	memcpy(bp, &npgno, sizeof(npgno));
+	bp += sizeof(npgno);
+	if (nlsn != NULL)
+		memcpy(bp, nlsn, sizeof(*nlsn));
+	else
+		memset(bp, 0, sizeof(*nlsn));
+	bp += sizeof(*nlsn);
+	if (pg == NULL) {
+		zero = 0;
+		memcpy(bp, &zero, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else {
+		memcpy(bp, &pg->size, sizeof(pg->size));
+		bp += sizeof(pg->size);
+		memcpy(bp, pg->data, pg->size);
+		bp += pg->size;
+	}
+#ifdef DEBUG
+	if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size)
+		fprintf(stderr, "Error in log record length");
+#endif
+	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
+	/*
+	 * Advance the transaction's LSN chain only if the record was
+	 * written; after a failed log_put() the previous last_lsn is
+	 * still the transaction's most recent record.
+	 */
+	if (ret == 0 && txnid != NULL)
+		txnid->last_lsn = *ret_lsnp;
+	free(logrec.data);
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_split_print
+ * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ *
+ * __bam_split_print --
+ *	Decode the bam_split log record held in dbtp->data and write a
+ *	human-readable dump of every field to stdout.  lsnp is the LSN
+ *	printed as the record's position.  The three "notused" parameters
+ *	are ignored.
+ *
+ *	Returns 0 on success, or the error returned by __bam_split_read().
+ */
+
+int
+__bam_split_print(notused1, dbtp, lsnp, notused3, notused4)
+	DB_LOG *notused1;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	int notused3;
+	void *notused4;
+{
+	__bam_split_args *argp;
+	u_int32_t i;
+	int ch, ret;
+
+	/* Assign to the unused parameters so they count as referenced. */
+	notused1 = NULL;
+	notused3 = 0;
+	notused4 = NULL;
+
+	if ((ret = __bam_split_read(dbtp->data, &argp)) != 0)
+		return (ret);
+	printf("[%lu][%lu]bam_split: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+	    (u_long)lsnp->file,
+	    (u_long)lsnp->offset,
+	    (u_long)argp->type,
+	    (u_long)argp->txnid->txnid,
+	    (u_long)argp->prev_lsn.file,
+	    (u_long)argp->prev_lsn.offset);
+	printf("\tfileid: %lu\n", (u_long)argp->fileid);
+	printf("\tleft: %lu\n", (u_long)argp->left);
+	printf("\tllsn: [%lu][%lu]\n",
+	    (u_long)argp->llsn.file, (u_long)argp->llsn.offset);
+	printf("\tright: %lu\n", (u_long)argp->right);
+	printf("\trlsn: [%lu][%lu]\n",
+	    (u_long)argp->rlsn.file, (u_long)argp->rlsn.offset);
+	printf("\tindx: %lu\n", (u_long)argp->indx);
+	printf("\tnpgno: %lu\n", (u_long)argp->npgno);
+	printf("\tnlsn: [%lu][%lu]\n",
+	    (u_long)argp->nlsn.file, (u_long)argp->nlsn.offset);
+	printf("\tpg: ");
+	for (i = 0; i < argp->pg.size; i++) {
+		/*
+		 * Fetch each byte as unsigned: isprint() is only defined for
+		 * values representable as unsigned char (or EOF), and a
+		 * sign-extended byte would be dumped as 0xffffffNN.
+		 */
+		ch = ((u_int8_t *)argp->pg.data)[i];
+		if (isprint(ch) || ch == 0xa)
+			putchar(ch);
+		else
+			printf("%#x ", (unsigned int)ch);
+	}
+	printf("\n");
+	printf("\n");
+	/* argp is the single allocation made by __bam_split_read(). */
+	free(argp);
+	return (0);
+}
+
+/*
+ * PUBLIC: int __bam_split_read __P((void *, __bam_split_args **));
+ *
+ * __bam_split_read --
+ *	Unmarshal a bam_split log record from recbuf into a newly
+ *	allocated __bam_split_args, returned through argpp.  The structure
+ *	and its DB_TXN live in one malloc'd chunk; only the txnid field of
+ *	that DB_TXN is filled in.  pg.data points into recbuf rather than
+ *	at a copy.
+ *
+ *	Returns 0 on success, ENOMEM if the allocation fails.
+ */
+int
+__bam_split_read(recbuf, argpp)
+	void *recbuf;
+	__bam_split_args **argpp;
+{
+	__bam_split_args *rec;
+	u_int8_t *base;
+	size_t off;
+
+	rec = (__bam_split_args *)malloc(
+	    sizeof(__bam_split_args) + sizeof(DB_TXN));
+	if (rec == NULL)
+		return (ENOMEM);
+	/* The DB_TXN sits directly behind the argument structure. */
+	rec->txnid = (DB_TXN *)(rec + 1);
+
+	base = recbuf;
+	off = 0;
+
+	/* Common record prefix: type, transaction id, previous LSN. */
+	memcpy(&rec->type, base + off, sizeof(rec->type));
+	off += sizeof(rec->type);
+	memcpy(&rec->txnid->txnid, base + off, sizeof(rec->txnid->txnid));
+	off += sizeof(rec->txnid->txnid);
+	memcpy(&rec->prev_lsn, base + off, sizeof(DB_LSN));
+	off += sizeof(DB_LSN);
+
+	/* Record-specific fields, in the order they were logged. */
+	memcpy(&rec->fileid, base + off, sizeof(rec->fileid));
+	off += sizeof(rec->fileid);
+	memcpy(&rec->left, base + off, sizeof(rec->left));
+	off += sizeof(rec->left);
+	memcpy(&rec->llsn, base + off, sizeof(rec->llsn));
+	off += sizeof(rec->llsn);
+	memcpy(&rec->right, base + off, sizeof(rec->right));
+	off += sizeof(rec->right);
+	memcpy(&rec->rlsn, base + off, sizeof(rec->rlsn));
+	off += sizeof(rec->rlsn);
+	memcpy(&rec->indx, base + off, sizeof(rec->indx));
+	off += sizeof(rec->indx);
+	memcpy(&rec->npgno, base + off, sizeof(rec->npgno));
+	off += sizeof(rec->npgno);
+	memcpy(&rec->nlsn, base + off, sizeof(rec->nlsn));
+	off += sizeof(rec->nlsn);
+
+	/* Length-prefixed byte string: reference it in place. */
+	memcpy(&rec->pg.size, base + off, sizeof(u_int32_t));
+	off += sizeof(u_int32_t);
+	rec->pg.data = base + off;
+
+	*argpp = rec;
+	return (0);
+}
+
+/*
+ * PUBLIC: int __bam_rsplit_log
+ * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+ * PUBLIC: u_int32_t, db_pgno_t, DBT *, DBT *,
+ * PUBLIC: DB_LSN *));
+ *
+ * __bam_rsplit_log --
+ *	Marshal the arguments into a DB_bam_rsplit log record and pass it
+ *	to log_put().  The record is: type, transaction id, the
+ *	transaction's previous LSN, fileid, pgno, pgdbt and rootent (each
+ *	as a 32-bit length followed by its bytes), then rootlsn.
+ *
+ *	txnid may be NULL, in which case the transaction id and previous
+ *	LSN are written as zero.  A NULL pgdbt or rootent is written as a
+ *	zero length and a NULL rootlsn as all-zero bytes.
+ *
+ *	Returns ENOMEM if the record buffer cannot be allocated, otherwise
+ *	the return value of log_put().  On success with a non-NULL txnid,
+ *	txnid->last_lsn is set to *ret_lsnp.
+ */
+int __bam_rsplit_log(logp, txnid, ret_lsnp, flags,
+	fileid, pgno, pgdbt, rootent, rootlsn)
+	DB_LOG *logp;
+	DB_TXN *txnid;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	u_int32_t fileid;
+	db_pgno_t pgno;
+	DBT *pgdbt;
+	DBT *rootent;
+	DB_LSN * rootlsn;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn;
+	u_int32_t zero;
+	u_int32_t rectype, txn_num;
+	int ret;
+	u_int8_t *bp;
+
+	rectype = DB_bam_rsplit;
+	txn_num = txnid == NULL ? 0 : txnid->txnid;
+	if (txnid == NULL) {
+		null_lsn.file = 0;
+		null_lsn.offset = 0;
+		lsnp = &null_lsn;
+	} else
+		lsnp = &txnid->last_lsn;
+
+	/* The size must match, field for field, what is copied below. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(fileid)
+	    + sizeof(pgno)
+	    + sizeof(u_int32_t) + (pgdbt == NULL ? 0 : pgdbt->size)
+	    + sizeof(u_int32_t) + (rootent == NULL ? 0 : rootent->size)
+	    + sizeof(*rootlsn);
+	if ((logrec.data = (void *)malloc(logrec.size)) == NULL)
+		return (ENOMEM);
+
+	bp = logrec.data;
+	memcpy(bp, &rectype, sizeof(rectype));
+	bp += sizeof(rectype);
+	memcpy(bp, &txn_num, sizeof(txn_num));
+	bp += sizeof(txn_num);
+	memcpy(bp, lsnp, sizeof(DB_LSN));
+	bp += sizeof(DB_LSN);
+	memcpy(bp, &fileid, sizeof(fileid));
+	bp += sizeof(fileid);
+	memcpy(bp, &pgno, sizeof(pgno));
+	bp += sizeof(pgno);
+	if (pgdbt == NULL) {
+		zero = 0;
+		memcpy(bp, &zero, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else {
+		memcpy(bp, &pgdbt->size, sizeof(pgdbt->size));
+		bp += sizeof(pgdbt->size);
+		memcpy(bp, pgdbt->data, pgdbt->size);
+		bp += pgdbt->size;
+	}
+	if (rootent == NULL) {
+		zero = 0;
+		memcpy(bp, &zero, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else {
+		memcpy(bp, &rootent->size, sizeof(rootent->size));
+		bp += sizeof(rootent->size);
+		memcpy(bp, rootent->data, rootent->size);
+		bp += rootent->size;
+	}
+	if (rootlsn != NULL)
+		memcpy(bp, rootlsn, sizeof(*rootlsn));
+	else
+		memset(bp, 0, sizeof(*rootlsn));
+	bp += sizeof(*rootlsn);
+#ifdef DEBUG
+	if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size)
+		fprintf(stderr, "Error in log record length");
+#endif
+	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
+	/*
+	 * Advance the transaction's LSN chain only if the record was
+	 * written; after a failed log_put() the previous last_lsn is
+	 * still the transaction's most recent record.
+	 */
+	if (ret == 0 && txnid != NULL)
+		txnid->last_lsn = *ret_lsnp;
+	free(logrec.data);
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_rsplit_print
+ * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ *
+ * __bam_rsplit_print --
+ *	Decode the bam_rsplit log record held in dbtp->data and write a
+ *	human-readable dump of every field to stdout.  lsnp is the LSN
+ *	printed as the record's position.  The three "notused" parameters
+ *	are ignored.
+ *
+ *	Returns 0 on success, or the error returned by __bam_rsplit_read().
+ */
+
+int
+__bam_rsplit_print(notused1, dbtp, lsnp, notused3, notused4)
+	DB_LOG *notused1;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	int notused3;
+	void *notused4;
+{
+	__bam_rsplit_args *argp;
+	u_int32_t i;
+	int ch, ret;
+
+	/* Assign to the unused parameters so they count as referenced. */
+	notused1 = NULL;
+	notused3 = 0;
+	notused4 = NULL;
+
+	if ((ret = __bam_rsplit_read(dbtp->data, &argp)) != 0)
+		return (ret);
+	printf("[%lu][%lu]bam_rsplit: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+	    (u_long)lsnp->file,
+	    (u_long)lsnp->offset,
+	    (u_long)argp->type,
+	    (u_long)argp->txnid->txnid,
+	    (u_long)argp->prev_lsn.file,
+	    (u_long)argp->prev_lsn.offset);
+	printf("\tfileid: %lu\n", (u_long)argp->fileid);
+	printf("\tpgno: %lu\n", (u_long)argp->pgno);
+	/*
+	 * In both byte dumps below each byte is fetched as unsigned:
+	 * isprint() is only defined for values representable as unsigned
+	 * char (or EOF), and a sign-extended byte would be dumped as
+	 * 0xffffffNN.
+	 */
+	printf("\tpgdbt: ");
+	for (i = 0; i < argp->pgdbt.size; i++) {
+		ch = ((u_int8_t *)argp->pgdbt.data)[i];
+		if (isprint(ch) || ch == 0xa)
+			putchar(ch);
+		else
+			printf("%#x ", (unsigned int)ch);
+	}
+	printf("\n");
+	printf("\trootent: ");
+	for (i = 0; i < argp->rootent.size; i++) {
+		ch = ((u_int8_t *)argp->rootent.data)[i];
+		if (isprint(ch) || ch == 0xa)
+			putchar(ch);
+		else
+			printf("%#x ", (unsigned int)ch);
+	}
+	printf("\n");
+	printf("\trootlsn: [%lu][%lu]\n",
+	    (u_long)argp->rootlsn.file, (u_long)argp->rootlsn.offset);
+	printf("\n");
+	/* argp is the single allocation made by __bam_rsplit_read(). */
+	free(argp);
+	return (0);
+}
+
+/*
+ * PUBLIC: int __bam_rsplit_read __P((void *, __bam_rsplit_args **));
+ *
+ * __bam_rsplit_read --
+ *	Unmarshal a bam_rsplit log record from recbuf into a newly
+ *	allocated __bam_rsplit_args, returned through argpp.  The
+ *	structure and its DB_TXN live in one malloc'd chunk; only the
+ *	txnid field of that DB_TXN is filled in.  pgdbt.data and
+ *	rootent.data point into recbuf rather than at copies.
+ *
+ *	Returns 0 on success, ENOMEM if the allocation fails.
+ */
+int
+__bam_rsplit_read(recbuf, argpp)
+	void *recbuf;
+	__bam_rsplit_args **argpp;
+{
+	__bam_rsplit_args *rec;
+	u_int8_t *base;
+	size_t off;
+
+	rec = (__bam_rsplit_args *)malloc(
+	    sizeof(__bam_rsplit_args) + sizeof(DB_TXN));
+	if (rec == NULL)
+		return (ENOMEM);
+	/* The DB_TXN sits directly behind the argument structure. */
+	rec->txnid = (DB_TXN *)(rec + 1);
+
+	base = recbuf;
+	off = 0;
+
+	/* Common record prefix: type, transaction id, previous LSN. */
+	memcpy(&rec->type, base + off, sizeof(rec->type));
+	off += sizeof(rec->type);
+	memcpy(&rec->txnid->txnid, base + off, sizeof(rec->txnid->txnid));
+	off += sizeof(rec->txnid->txnid);
+	memcpy(&rec->prev_lsn, base + off, sizeof(DB_LSN));
+	off += sizeof(DB_LSN);
+
+	/* Record-specific fields, in the order they were logged. */
+	memcpy(&rec->fileid, base + off, sizeof(rec->fileid));
+	off += sizeof(rec->fileid);
+	memcpy(&rec->pgno, base + off, sizeof(rec->pgno));
+	off += sizeof(rec->pgno);
+
+	/* Two length-prefixed byte strings, each referenced in place. */
+	memcpy(&rec->pgdbt.size, base + off, sizeof(u_int32_t));
+	off += sizeof(u_int32_t);
+	rec->pgdbt.data = base + off;
+	off += rec->pgdbt.size;
+	memcpy(&rec->rootent.size, base + off, sizeof(u_int32_t));
+	off += sizeof(u_int32_t);
+	rec->rootent.data = base + off;
+	off += rec->rootent.size;
+
+	memcpy(&rec->rootlsn, base + off, sizeof(rec->rootlsn));
+
+	*argpp = rec;
+	return (0);
+}
+
+/*
+ * PUBLIC: int __bam_adj_log
+ * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t,
+ * PUBLIC: u_int32_t, u_int32_t));
+ *
+ * __bam_adj_log --
+ *	Marshal the arguments into a DB_bam_adj log record and pass it to
+ *	log_put().  The record is: type, transaction id, the transaction's
+ *	previous LSN, fileid, pgno, lsn, indx, indx_copy, is_insert.
+ *
+ *	txnid may be NULL, in which case the transaction id and previous
+ *	LSN are written as zero.  A NULL lsn is written as all-zero bytes.
+ *
+ *	Returns ENOMEM if the record buffer cannot be allocated, otherwise
+ *	the return value of log_put().  On success with a non-NULL txnid,
+ *	txnid->last_lsn is set to *ret_lsnp.
+ */
+int __bam_adj_log(logp, txnid, ret_lsnp, flags,
+	fileid, pgno, lsn, indx, indx_copy, is_insert)
+	DB_LOG *logp;
+	DB_TXN *txnid;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	u_int32_t fileid;
+	db_pgno_t pgno;
+	DB_LSN * lsn;
+	u_int32_t indx;
+	u_int32_t indx_copy;
+	u_int32_t is_insert;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn;
+	u_int32_t rectype, txn_num;
+	int ret;
+	u_int8_t *bp;
+
+	rectype = DB_bam_adj;
+	txn_num = txnid == NULL ? 0 : txnid->txnid;
+	if (txnid == NULL) {
+		null_lsn.file = 0;
+		null_lsn.offset = 0;
+		lsnp = &null_lsn;
+	} else
+		lsnp = &txnid->last_lsn;
+
+	/* The size must match, field for field, what is copied below. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(fileid)
+	    + sizeof(pgno)
+	    + sizeof(*lsn)
+	    + sizeof(indx)
+	    + sizeof(indx_copy)
+	    + sizeof(is_insert);
+	if ((logrec.data = (void *)malloc(logrec.size)) == NULL)
+		return (ENOMEM);
+
+	bp = logrec.data;
+	memcpy(bp, &rectype, sizeof(rectype));
+	bp += sizeof(rectype);
+	memcpy(bp, &txn_num, sizeof(txn_num));
+	bp += sizeof(txn_num);
+	memcpy(bp, lsnp, sizeof(DB_LSN));
+	bp += sizeof(DB_LSN);
+	memcpy(bp, &fileid, sizeof(fileid));
+	bp += sizeof(fileid);
+	memcpy(bp, &pgno, sizeof(pgno));
+	bp += sizeof(pgno);
+	if (lsn != NULL)
+		memcpy(bp, lsn, sizeof(*lsn));
+	else
+		memset(bp, 0, sizeof(*lsn));
+	bp += sizeof(*lsn);
+	memcpy(bp, &indx, sizeof(indx));
+	bp += sizeof(indx);
+	memcpy(bp, &indx_copy, sizeof(indx_copy));
+	bp += sizeof(indx_copy);
+	memcpy(bp, &is_insert, sizeof(is_insert));
+	bp += sizeof(is_insert);
+#ifdef DEBUG
+	if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size)
+		fprintf(stderr, "Error in log record length");
+#endif
+	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
+	/*
+	 * Advance the transaction's LSN chain only if the record was
+	 * written; after a failed log_put() the previous last_lsn is
+	 * still the transaction's most recent record.
+	 */
+	if (ret == 0 && txnid != NULL)
+		txnid->last_lsn = *ret_lsnp;
+	free(logrec.data);
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_adj_print
+ * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ *
+ * __bam_adj_print --
+ *	Decode the bam_adj log record held in dbtp->data and write a
+ *	human-readable dump of every field to stdout.  lsnp is the LSN
+ *	printed as the record's position.  The three "notused" parameters
+ *	are ignored.
+ *
+ *	Returns 0 on success, or the error returned by __bam_adj_read().
+ */
+
+int
+__bam_adj_print(notused1, dbtp, lsnp, notused3, notused4)
+	DB_LOG *notused1;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	int notused3;
+	void *notused4;
+{
+	__bam_adj_args *rec;
+	int status;
+
+	/* Assign to the unused parameters so they count as referenced. */
+	notused1 = NULL;
+	notused3 = 0;
+	notused4 = NULL;
+
+	status = __bam_adj_read(dbtp->data, &rec);
+	if (status != 0)
+		return (status);
+
+	/* Header line: record position, type, transaction and back link. */
+	printf("[%lu][%lu]bam_adj: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+	    (u_long)lsnp->file, (u_long)lsnp->offset,
+	    (u_long)rec->type, (u_long)rec->txnid->txnid,
+	    (u_long)rec->prev_lsn.file, (u_long)rec->prev_lsn.offset);
+
+	/* One indented line per record-specific field. */
+	printf("\tfileid: %lu\n", (u_long)rec->fileid);
+	printf("\tpgno: %lu\n", (u_long)rec->pgno);
+	printf("\tlsn: [%lu][%lu]\n",
+	    (u_long)rec->lsn.file, (u_long)rec->lsn.offset);
+	printf("\tindx: %lu\n", (u_long)rec->indx);
+	printf("\tindx_copy: %lu\n", (u_long)rec->indx_copy);
+	printf("\tis_insert: %lu\n", (u_long)rec->is_insert);
+	printf("\n");
+
+	/* rec is the single allocation made by __bam_adj_read(). */
+	free(rec);
+	return (0);
+}
+
+/*
+ * PUBLIC: int __bam_adj_read __P((void *, __bam_adj_args **));
+ *
+ * __bam_adj_read --
+ *	Unmarshal a bam_adj log record from recbuf into a newly allocated
+ *	__bam_adj_args, returned through argpp.  The structure and its
+ *	DB_TXN live in one malloc'd chunk; only the txnid field of that
+ *	DB_TXN is filled in.
+ *
+ *	Returns 0 on success, ENOMEM if the allocation fails.
+ */
+int
+__bam_adj_read(recbuf, argpp)
+	void *recbuf;
+	__bam_adj_args **argpp;
+{
+	__bam_adj_args *rec;
+	u_int8_t *base;
+	size_t off;
+
+	rec = (__bam_adj_args *)malloc(
+	    sizeof(__bam_adj_args) + sizeof(DB_TXN));
+	if (rec == NULL)
+		return (ENOMEM);
+	/* The DB_TXN sits directly behind the argument structure. */
+	rec->txnid = (DB_TXN *)(rec + 1);
+
+	base = recbuf;
+	off = 0;
+
+	/* Common record prefix: type, transaction id, previous LSN. */
+	memcpy(&rec->type, base + off, sizeof(rec->type));
+	off += sizeof(rec->type);
+	memcpy(&rec->txnid->txnid, base + off, sizeof(rec->txnid->txnid));
+	off += sizeof(rec->txnid->txnid);
+	memcpy(&rec->prev_lsn, base + off, sizeof(DB_LSN));
+	off += sizeof(DB_LSN);
+
+	/* Record-specific fields, in the order they were logged. */
+	memcpy(&rec->fileid, base + off, sizeof(rec->fileid));
+	off += sizeof(rec->fileid);
+	memcpy(&rec->pgno, base + off, sizeof(rec->pgno));
+	off += sizeof(rec->pgno);
+	memcpy(&rec->lsn, base + off, sizeof(rec->lsn));
+	off += sizeof(rec->lsn);
+	memcpy(&rec->indx, base + off, sizeof(rec->indx));
+	off += sizeof(rec->indx);
+	memcpy(&rec->indx_copy, base + off, sizeof(rec->indx_copy));
+	off += sizeof(rec->indx_copy);
+	memcpy(&rec->is_insert, base + off, sizeof(rec->is_insert));
+
+	*argpp = rec;
+	return (0);
+}
+
+/*
+ * PUBLIC: int __bam_cadjust_log
+ * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t,
+ * PUBLIC: int32_t, int32_t));
+ *
+ * __bam_cadjust_log --
+ *	Marshal the arguments into a DB_bam_cadjust log record and pass it
+ *	to log_put().  The record is: type, transaction id, the
+ *	transaction's previous LSN, fileid, pgno, lsn, indx, adjust, total.
+ *
+ *	txnid may be NULL, in which case the transaction id and previous
+ *	LSN are written as zero.  A NULL lsn is written as all-zero bytes.
+ *
+ *	Returns ENOMEM if the record buffer cannot be allocated, otherwise
+ *	the return value of log_put().  On success with a non-NULL txnid,
+ *	txnid->last_lsn is set to *ret_lsnp.
+ */
+int __bam_cadjust_log(logp, txnid, ret_lsnp, flags,
+	fileid, pgno, lsn, indx, adjust, total)
+	DB_LOG *logp;
+	DB_TXN *txnid;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	u_int32_t fileid;
+	db_pgno_t pgno;
+	DB_LSN * lsn;
+	u_int32_t indx;
+	int32_t adjust;
+	int32_t total;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn;
+	u_int32_t rectype, txn_num;
+	int ret;
+	u_int8_t *bp;
+
+	rectype = DB_bam_cadjust;
+	txn_num = txnid == NULL ? 0 : txnid->txnid;
+	if (txnid == NULL) {
+		null_lsn.file = 0;
+		null_lsn.offset = 0;
+		lsnp = &null_lsn;
+	} else
+		lsnp = &txnid->last_lsn;
+
+	/* The size must match, field for field, what is copied below. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(fileid)
+	    + sizeof(pgno)
+	    + sizeof(*lsn)
+	    + sizeof(indx)
+	    + sizeof(adjust)
+	    + sizeof(total);
+	if ((logrec.data = (void *)malloc(logrec.size)) == NULL)
+		return (ENOMEM);
+
+	bp = logrec.data;
+	memcpy(bp, &rectype, sizeof(rectype));
+	bp += sizeof(rectype);
+	memcpy(bp, &txn_num, sizeof(txn_num));
+	bp += sizeof(txn_num);
+	memcpy(bp, lsnp, sizeof(DB_LSN));
+	bp += sizeof(DB_LSN);
+	memcpy(bp, &fileid, sizeof(fileid));
+	bp += sizeof(fileid);
+	memcpy(bp, &pgno, sizeof(pgno));
+	bp += sizeof(pgno);
+	if (lsn != NULL)
+		memcpy(bp, lsn, sizeof(*lsn));
+	else
+		memset(bp, 0, sizeof(*lsn));
+	bp += sizeof(*lsn);
+	memcpy(bp, &indx, sizeof(indx));
+	bp += sizeof(indx);
+	memcpy(bp, &adjust, sizeof(adjust));
+	bp += sizeof(adjust);
+	memcpy(bp, &total, sizeof(total));
+	bp += sizeof(total);
+#ifdef DEBUG
+	if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size)
+		fprintf(stderr, "Error in log record length");
+#endif
+	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
+	/*
+	 * Advance the transaction's LSN chain only if the record was
+	 * written; after a failed log_put() the previous last_lsn is
+	 * still the transaction's most recent record.
+	 */
+	if (ret == 0 && txnid != NULL)
+		txnid->last_lsn = *ret_lsnp;
+	free(logrec.data);
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_cadjust_print
+ * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ *
+ * __bam_cadjust_print --
+ *	Decode the bam_cadjust log record held in dbtp->data and write a
+ *	human-readable dump of every field to stdout.  lsnp is the LSN
+ *	printed as the record's position.  The three "notused" parameters
+ *	are ignored.
+ *
+ *	Returns 0 on success, or the error returned by
+ *	__bam_cadjust_read().
+ */
+
+int
+__bam_cadjust_print(notused1, dbtp, lsnp, notused3, notused4)
+	DB_LOG *notused1;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	int notused3;
+	void *notused4;
+{
+	__bam_cadjust_args *rec;
+	int status;
+
+	/* Assign to the unused parameters so they count as referenced. */
+	notused1 = NULL;
+	notused3 = 0;
+	notused4 = NULL;
+
+	status = __bam_cadjust_read(dbtp->data, &rec);
+	if (status != 0)
+		return (status);
+
+	/* Header line: record position, type, transaction and back link. */
+	printf("[%lu][%lu]bam_cadjust: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+	    (u_long)lsnp->file, (u_long)lsnp->offset,
+	    (u_long)rec->type, (u_long)rec->txnid->txnid,
+	    (u_long)rec->prev_lsn.file, (u_long)rec->prev_lsn.offset);
+
+	/* One indented line per record-specific field. */
+	printf("\tfileid: %lu\n", (u_long)rec->fileid);
+	printf("\tpgno: %lu\n", (u_long)rec->pgno);
+	printf("\tlsn: [%lu][%lu]\n",
+	    (u_long)rec->lsn.file, (u_long)rec->lsn.offset);
+	printf("\tindx: %lu\n", (u_long)rec->indx);
+	/* adjust and total are printed as signed values. */
+	printf("\tadjust: %ld\n", (long)rec->adjust);
+	printf("\ttotal: %ld\n", (long)rec->total);
+	printf("\n");
+
+	/* rec is the single allocation made by __bam_cadjust_read(). */
+	free(rec);
+	return (0);
+}
+
+/*
+ * PUBLIC: int __bam_cadjust_read __P((void *, __bam_cadjust_args **));
+ *
+ * __bam_cadjust_read --
+ *	Unmarshal a bam_cadjust log record from recbuf into a newly
+ *	allocated __bam_cadjust_args, returned through argpp.  The
+ *	structure and its DB_TXN live in one malloc'd chunk; only the
+ *	txnid field of that DB_TXN is filled in.
+ *
+ *	Returns 0 on success, ENOMEM if the allocation fails.
+ */
+int
+__bam_cadjust_read(recbuf, argpp)
+	void *recbuf;
+	__bam_cadjust_args **argpp;
+{
+	__bam_cadjust_args *rec;
+	u_int8_t *base;
+	size_t off;
+
+	rec = (__bam_cadjust_args *)malloc(
+	    sizeof(__bam_cadjust_args) + sizeof(DB_TXN));
+	if (rec == NULL)
+		return (ENOMEM);
+	/* The DB_TXN sits directly behind the argument structure. */
+	rec->txnid = (DB_TXN *)(rec + 1);
+
+	base = recbuf;
+	off = 0;
+
+	/* Common record prefix: type, transaction id, previous LSN. */
+	memcpy(&rec->type, base + off, sizeof(rec->type));
+	off += sizeof(rec->type);
+	memcpy(&rec->txnid->txnid, base + off, sizeof(rec->txnid->txnid));
+	off += sizeof(rec->txnid->txnid);
+	memcpy(&rec->prev_lsn, base + off, sizeof(DB_LSN));
+	off += sizeof(DB_LSN);
+
+	/* Record-specific fields, in the order they were logged. */
+	memcpy(&rec->fileid, base + off, sizeof(rec->fileid));
+	off += sizeof(rec->fileid);
+	memcpy(&rec->pgno, base + off, sizeof(rec->pgno));
+	off += sizeof(rec->pgno);
+	memcpy(&rec->lsn, base + off, sizeof(rec->lsn));
+	off += sizeof(rec->lsn);
+	memcpy(&rec->indx, base + off, sizeof(rec->indx));
+	off += sizeof(rec->indx);
+	memcpy(&rec->adjust, base + off, sizeof(rec->adjust));
+	off += sizeof(rec->adjust);
+	memcpy(&rec->total, base + off, sizeof(rec->total));
+
+	*argpp = rec;
+	return (0);
+}
+
+/*
+ * PUBLIC: int __bam_cdel_log
+ * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t));
+ *
+ * __bam_cdel_log --
+ *	Marshal the arguments into a DB_bam_cdel log record and pass it to
+ *	log_put().  The record is: type, transaction id, the transaction's
+ *	previous LSN, fileid, pgno, lsn, indx.
+ *
+ *	txnid may be NULL, in which case the transaction id and previous
+ *	LSN are written as zero.  A NULL lsn is written as all-zero bytes.
+ *
+ *	Returns ENOMEM if the record buffer cannot be allocated, otherwise
+ *	the return value of log_put().  On success with a non-NULL txnid,
+ *	txnid->last_lsn is set to *ret_lsnp.
+ */
+int __bam_cdel_log(logp, txnid, ret_lsnp, flags,
+	fileid, pgno, lsn, indx)
+	DB_LOG *logp;
+	DB_TXN *txnid;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	u_int32_t fileid;
+	db_pgno_t pgno;
+	DB_LSN * lsn;
+	u_int32_t indx;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn;
+	u_int32_t rectype, txn_num;
+	int ret;
+	u_int8_t *bp;
+
+	rectype = DB_bam_cdel;
+	txn_num = txnid == NULL ? 0 : txnid->txnid;
+	if (txnid == NULL) {
+		null_lsn.file = 0;
+		null_lsn.offset = 0;
+		lsnp = &null_lsn;
+	} else
+		lsnp = &txnid->last_lsn;
+
+	/* The size must match, field for field, what is copied below. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(fileid)
+	    + sizeof(pgno)
+	    + sizeof(*lsn)
+	    + sizeof(indx);
+	if ((logrec.data = (void *)malloc(logrec.size)) == NULL)
+		return (ENOMEM);
+
+	bp = logrec.data;
+	memcpy(bp, &rectype, sizeof(rectype));
+	bp += sizeof(rectype);
+	memcpy(bp, &txn_num, sizeof(txn_num));
+	bp += sizeof(txn_num);
+	memcpy(bp, lsnp, sizeof(DB_LSN));
+	bp += sizeof(DB_LSN);
+	memcpy(bp, &fileid, sizeof(fileid));
+	bp += sizeof(fileid);
+	memcpy(bp, &pgno, sizeof(pgno));
+	bp += sizeof(pgno);
+	if (lsn != NULL)
+		memcpy(bp, lsn, sizeof(*lsn));
+	else
+		memset(bp, 0, sizeof(*lsn));
+	bp += sizeof(*lsn);
+	memcpy(bp, &indx, sizeof(indx));
+	bp += sizeof(indx);
+#ifdef DEBUG
+	if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size)
+		fprintf(stderr, "Error in log record length");
+#endif
+	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
+	/*
+	 * Advance the transaction's LSN chain only if the record was
+	 * written; after a failed log_put() the previous last_lsn is
+	 * still the transaction's most recent record.
+	 */
+	if (ret == 0 && txnid != NULL)
+		txnid->last_lsn = *ret_lsnp;
+	free(logrec.data);
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_cdel_print
+ * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ *
+ * __bam_cdel_print --
+ *	Decode the bam_cdel log record held in dbtp->data and write a
+ *	human-readable dump of every field to stdout.  lsnp is the LSN
+ *	printed as the record's position.  The three "notused" parameters
+ *	are ignored.
+ *
+ *	Returns 0 on success, or the error returned by __bam_cdel_read().
+ */
+
+int
+__bam_cdel_print(notused1, dbtp, lsnp, notused3, notused4)
+	DB_LOG *notused1;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	int notused3;
+	void *notused4;
+{
+	__bam_cdel_args *rec;
+	int status;
+
+	/* Assign to the unused parameters so they count as referenced. */
+	notused1 = NULL;
+	notused3 = 0;
+	notused4 = NULL;
+
+	status = __bam_cdel_read(dbtp->data, &rec);
+	if (status != 0)
+		return (status);
+
+	/* Header line: record position, type, transaction and back link. */
+	printf("[%lu][%lu]bam_cdel: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+	    (u_long)lsnp->file, (u_long)lsnp->offset,
+	    (u_long)rec->type, (u_long)rec->txnid->txnid,
+	    (u_long)rec->prev_lsn.file, (u_long)rec->prev_lsn.offset);
+
+	/* One indented line per record-specific field. */
+	printf("\tfileid: %lu\n", (u_long)rec->fileid);
+	printf("\tpgno: %lu\n", (u_long)rec->pgno);
+	printf("\tlsn: [%lu][%lu]\n",
+	    (u_long)rec->lsn.file, (u_long)rec->lsn.offset);
+	printf("\tindx: %lu\n", (u_long)rec->indx);
+	printf("\n");
+
+	/* rec is the single allocation made by __bam_cdel_read(). */
+	free(rec);
+	return (0);
+}
+
+/*
+ * PUBLIC: int __bam_cdel_read __P((void *, __bam_cdel_args **));
+ *
+ * __bam_cdel_read --
+ *	Unmarshal a bam_cdel log record from recbuf into a newly allocated
+ *	__bam_cdel_args, returned through argpp.  The structure and its
+ *	DB_TXN live in one malloc'd chunk; only the txnid field of that
+ *	DB_TXN is filled in.
+ *
+ *	Returns 0 on success, ENOMEM if the allocation fails.
+ */
+int
+__bam_cdel_read(recbuf, argpp)
+	void *recbuf;
+	__bam_cdel_args **argpp;
+{
+	__bam_cdel_args *rec;
+	u_int8_t *base;
+	size_t off;
+
+	rec = (__bam_cdel_args *)malloc(
+	    sizeof(__bam_cdel_args) + sizeof(DB_TXN));
+	if (rec == NULL)
+		return (ENOMEM);
+	/* The DB_TXN sits directly behind the argument structure. */
+	rec->txnid = (DB_TXN *)(rec + 1);
+
+	base = recbuf;
+	off = 0;
+
+	/* Common record prefix: type, transaction id, previous LSN. */
+	memcpy(&rec->type, base + off, sizeof(rec->type));
+	off += sizeof(rec->type);
+	memcpy(&rec->txnid->txnid, base + off, sizeof(rec->txnid->txnid));
+	off += sizeof(rec->txnid->txnid);
+	memcpy(&rec->prev_lsn, base + off, sizeof(DB_LSN));
+	off += sizeof(DB_LSN);
+
+	/* Record-specific fields, in the order they were logged. */
+	memcpy(&rec->fileid, base + off, sizeof(rec->fileid));
+	off += sizeof(rec->fileid);
+	memcpy(&rec->pgno, base + off, sizeof(rec->pgno));
+	off += sizeof(rec->pgno);
+	memcpy(&rec->lsn, base + off, sizeof(rec->lsn));
+	off += sizeof(rec->lsn);
+	memcpy(&rec->indx, base + off, sizeof(rec->indx));
+
+	*argpp = rec;
+	return (0);
+}
+
+/*
+ * PUBLIC: int __bam_init_print __P((DB_ENV *));
+ *
+ * __bam_init_print --
+ *	Register the print routine for each of the seven btree log record
+ *	types with __db_add_recovery().  Registration stops at the first
+ *	failure.
+ *
+ *	Returns 0 if every registration succeeds, otherwise the first
+ *	non-zero value returned by __db_add_recovery().
+ */
+int
+__bam_init_print(dbenv)
+	DB_ENV *dbenv;
+{
+	int ret;
+
+	/* Each call runs only if all of the previous ones returned 0. */
+	ret = __db_add_recovery(dbenv,
+	    __bam_pg_alloc_print, DB_bam_pg_alloc);
+	if (ret == 0)
+		ret = __db_add_recovery(dbenv,
+		    __bam_pg_free_print, DB_bam_pg_free);
+	if (ret == 0)
+		ret = __db_add_recovery(dbenv,
+		    __bam_split_print, DB_bam_split);
+	if (ret == 0)
+		ret = __db_add_recovery(dbenv,
+		    __bam_rsplit_print, DB_bam_rsplit);
+	if (ret == 0)
+		ret = __db_add_recovery(dbenv,
+		    __bam_adj_print, DB_bam_adj);
+	if (ret == 0)
+		ret = __db_add_recovery(dbenv,
+		    __bam_cadjust_print, DB_bam_cadjust);
+	if (ret == 0)
+		ret = __db_add_recovery(dbenv,
+		    __bam_cdel_print, DB_bam_cdel);
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_init_recover __P((DB_ENV *));
+ *
+ * __bam_init_recover --
+ *	Register the recover routine for each of the seven btree log
+ *	record types with __db_add_recovery().  Registration stops at the
+ *	first failure.
+ *
+ *	Returns 0 if every registration succeeds, otherwise the first
+ *	non-zero value returned by __db_add_recovery().
+ */
+int
+__bam_init_recover(dbenv)
+	DB_ENV *dbenv;
+{
+	int ret;
+
+	/* Each call runs only if all of the previous ones returned 0. */
+	ret = __db_add_recovery(dbenv,
+	    __bam_pg_alloc_recover, DB_bam_pg_alloc);
+	if (ret == 0)
+		ret = __db_add_recovery(dbenv,
+		    __bam_pg_free_recover, DB_bam_pg_free);
+	if (ret == 0)
+		ret = __db_add_recovery(dbenv,
+		    __bam_split_recover, DB_bam_split);
+	if (ret == 0)
+		ret = __db_add_recovery(dbenv,
+		    __bam_rsplit_recover, DB_bam_rsplit);
+	if (ret == 0)
+		ret = __db_add_recovery(dbenv,
+		    __bam_adj_recover, DB_bam_adj);
+	if (ret == 0)
+		ret = __db_add_recovery(dbenv,
+		    __bam_cadjust_recover, DB_bam_cadjust);
+	if (ret == 0)
+		ret = __db_add_recovery(dbenv,
+		    __bam_cdel_recover, DB_bam_cdel);
+	return (ret);
+}
+