summary | refs | log | tree | commit | diff
path: root/db2/mp
diff options
context:
space:
mode:
Diffstat (limited to 'db2/mp')
-rw-r--r-- db2/mp/mp_bh.c 54
-rw-r--r-- db2/mp/mp_fget.c 41
-rw-r--r-- db2/mp/mp_fopen.c 34
-rw-r--r-- db2/mp/mp_fput.c 26
-rw-r--r-- db2/mp/mp_fset.c 18
-rw-r--r-- db2/mp/mp_open.c 9
-rw-r--r-- db2/mp/mp_pr.c 48
-rw-r--r-- db2/mp/mp_region.c 33
-rw-r--r-- db2/mp/mp_sync.c 393
9 files changed, 477 insertions, 179 deletions
diff --git a/db2/mp/mp_bh.c b/db2/mp/mp_bh.c
index fb6bc96ae7..a707603eec 100644
--- a/db2/mp/mp_bh.c
+++ b/db2/mp/mp_bh.c
@@ -7,7 +7,7 @@
#include "config.h"
#ifndef lint
-static const char sccsid[] = "@(#)mp_bh.c 10.16 (Sleepycat) 9/23/97";
+static const char sccsid[] = "@(#)mp_bh.c 10.21 (Sleepycat) 10/25/97";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -94,10 +94,10 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
* files that we have previously tried (and failed) to open.
*/
dbt.size = mfp->pgcookie_len;
- dbt.data = ADDR(dbmp, mfp->pgcookie_off);
- if (__memp_fopen(dbmp, ADDR(dbmp, mfp->path_off),
+ dbt.data = R_ADDR(dbmp, mfp->pgcookie_off);
+ if (__memp_fopen(dbmp, R_ADDR(dbmp, mfp->path_off),
mfp->ftype, 0, 0, mfp->stat.st_pagesize,
- mfp->lsn_off, &dbt, ADDR(dbmp, mfp->fileid_off), 0, &dbmfp) != 0)
+ mfp->lsn_off, &dbt, R_ADDR(dbmp, mfp->fileid_off), 0, &dbmfp) != 0)
return (0);
found: return (__memp_pgwrite(dbmfp, bhp, restartp, wrotep));
@@ -137,7 +137,7 @@ __memp_pgread(dbmfp, bhp, can_create)
ret = 0;
LOCKHANDLE(dbmp, dbmfp->mutexp);
if (dbmfp->fd == -1 || (ret =
- __db_lseek(dbmfp->fd, pagesize, bhp->pgno, 0, SEEK_SET)) != 0) {
+ __db_seek(dbmfp->fd, pagesize, bhp->pgno, 0, SEEK_SET)) != 0) {
if (!can_create) {
if (dbmfp->fd == -1)
ret = EINVAL;
@@ -230,6 +230,7 @@ __memp_pgwrite(dbmfp, bhp, restartp, wrotep)
dbmp = dbmfp->dbmp;
dbenv = dbmp->dbenv;
+ mp = dbmp->mp;
mfp = dbmfp->mfp;
if (restartp != NULL)
@@ -277,8 +278,7 @@ __memp_pgwrite(dbmfp, bhp, restartp, wrotep)
}
/* Write the page out. */
- if ((ret =
- __db_lseek(dbmfp->fd, pagesize, bhp->pgno, 0, SEEK_SET)) != 0)
+ if ((ret = __db_seek(dbmfp->fd, pagesize, bhp->pgno, 0, SEEK_SET)) != 0)
fail = "seek";
else if ((ret = __db_write(dbmfp->fd, bhp->buf, pagesize, &nw)) != 0)
fail = "write";
@@ -309,15 +309,23 @@ __memp_pgwrite(dbmfp, bhp, restartp, wrotep)
/* Clean up the flags based on a successful write. */
F_SET(bhp, BH_CALLPGIN);
F_CLR(bhp, BH_DIRTY | BH_LOCKED);
+
+ ++mp->stat.st_page_clean;
+ --mp->stat.st_page_dirty;
+
UNLOCKBUFFER(dbmp, bhp);
/*
- * If we wrote a buffer which a checkpoint is waiting for, update
+ * If we write a buffer for which a checkpoint is waiting, update
* the count of pending buffers (both in the mpool as a whole and
* for this file). If the count for this file goes to zero, flush
* the writes.
*
* XXX:
+ * Don't lock the region around the sync, fsync(2) has no atomicity
+ * issues.
+ *
+ * XXX:
* We ignore errors from the sync -- it makes no sense to return an
* error to the calling process, so set a flag causing the sync to
* be retried later.
@@ -325,21 +333,15 @@ __memp_pgwrite(dbmfp, bhp, restartp, wrotep)
* If the buffer we wrote has a LSN larger than the current largest
* we've written for this checkpoint, update the saved value.
*/
- mp = dbmp->mp;
if (F_ISSET(bhp, BH_WRITE)) {
if (log_compare(&lsn, &mp->lsn) > 0)
mp->lsn = lsn;
F_CLR(bhp, BH_WRITE);
--mp->lsn_cnt;
- if (--mfp->lsn_cnt == 0) {
- /*
- * Don't lock -- there are no atomicity issues for
- * fsync(2).
- */
- if (__db_fsync(dbmfp->fd) != 0)
- F_SET(mp, MP_LSN_RETRY);
- }
+
+ if (--mfp->lsn_cnt == 0 && __db_fsync(dbmfp->fd) != 0)
+ F_SET(mp, MP_LSN_RETRY);
}
/* Update I/O statistics. */
@@ -391,7 +393,7 @@ __memp_pg(dbmfp, bhp, is_pgin)
dbtp = NULL;
else {
dbt.size = mfp->pgcookie_len;
- dbt.data = ADDR(dbmp, mfp->pgcookie_off);
+ dbt.data = R_ADDR(dbmp, mfp->pgcookie_off);
dbtp = &dbt;
}
UNLOCKHANDLE(dbmp, dbmp->mutexp);
@@ -433,19 +435,21 @@ __memp_bhfree(dbmp, mfp, bhp, free_mem)
{
size_t off;
- /* Delete the buffer header from the MPOOL hash list. */
- off = BUCKET(dbmp->mp, OFFSET(dbmp, mfp), bhp->pgno);
- SH_TAILQ_REMOVE(&dbmp->htab[off], bhp, mq, __bh);
+ /* Delete the buffer header from the hash bucket queue. */
+ off = BUCKET(dbmp->mp, R_OFFSET(dbmp, mfp), bhp->pgno);
+ SH_TAILQ_REMOVE(&dbmp->htab[off], bhp, hq, __bh);
- /* Delete the buffer header from the LRU chain. */
+ /* Delete the buffer header from the LRU queue. */
SH_TAILQ_REMOVE(&dbmp->mp->bhq, bhp, q, __bh);
/*
* If we're not reusing it immediately, free the buffer header
* and data for real.
*/
- if (free_mem)
+ if (free_mem) {
__db_shalloc_free(dbmp->addr, bhp);
+ --dbmp->mp->stat.st_page_clean;
+ }
}
/*
@@ -474,13 +478,13 @@ __memp_upgrade(dbmp, dbmfp, mfp)
return (1);
/* Try the open. */
- if (__db_fdopen(ADDR(dbmp, mfp->path_off), 0, 0, 0, &fd) != 0) {
+ if (__db_open(R_ADDR(dbmp, mfp->path_off), 0, 0, 0, &fd) != 0) {
F_SET(dbmfp, MP_UPGRADE_FAIL);
return (1);
}
/* Swap the descriptors and set the upgrade flag. */
- (void)close(dbmfp->fd);
+ (void)__db_close(dbmfp->fd);
dbmfp->fd = fd;
F_SET(dbmfp, MP_UPGRADE);
diff --git a/db2/mp/mp_fget.c b/db2/mp/mp_fget.c
index a0364e92c3..3f99e60505 100644
--- a/db2/mp/mp_fget.c
+++ b/db2/mp/mp_fget.c
@@ -7,7 +7,7 @@
#include "config.h"
#ifndef lint
-static const char sccsid[] = "@(#)mp_fget.c 10.25 (Sleepycat) 9/23/97";
+static const char sccsid[] = "@(#)mp_fget.c 10.30 (Sleepycat) 10/25/97";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -87,14 +87,14 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
* We want to switch threads as often as possible. Sleep every time
* we get a new page to make it more likely.
*/
- if (__sleep_on_every_page_get && (dbmp->dbenv == NULL ||
- dbmp->dbenv->db_yield == NULL || dbmp->dbenv->db_yield() != 0))
+ if (__sleep_on_every_page_get &&
+ (__db_yield == NULL || __db_yield() != 0))
__db_sleep(0, 1);
#endif
mp = dbmp->mp;
mfp = dbmfp->mfp;
- mf_offset = OFFSET(dbmp, mfp);
+ mf_offset = R_OFFSET(dbmp, mfp);
addr = NULL;
bhp = NULL;
b_incr = b_inserted = readonly_alloc = ret = 0;
@@ -137,7 +137,7 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
}
}
if (!readonly_alloc) {
- addr = ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
+ addr = R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
++mp->stat.st_map;
++mfp->stat.st_map;
@@ -159,9 +159,12 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
*/
if (dbmfp->fd == -1)
size = 0;
- else if ((ret = __db_stat(dbmp->dbenv,
- dbmfp->path, dbmfp->fd, &size, NULL)) != 0)
+ else if ((ret =
+ __db_ioinfo(dbmfp->path, dbmfp->fd, &size, NULL)) != 0) {
+ __db_err(dbmp->dbenv,
+ "%s: %s", dbmfp->path, strerror(ret));
goto err;
+ }
*pgnoaddr = size == 0 ? 0 : (size - 1) / mfp->stat.st_pagesize;
@@ -190,26 +193,29 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
goto found;
}
- /* If we haven't checked the BH list yet, do the search. */
+ /* If we haven't checked the BH hash bucket queue, do the search. */
if (!LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW)) {
- ++mp->stat.st_hash_searches;
bucket = BUCKET(mp, mf_offset, *pgnoaddr);
for (cnt = 0,
bhp = SH_TAILQ_FIRST(&dbmp->htab[bucket], __bh);
- bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, mq, __bh)) {
+ bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
++cnt;
if (bhp->pgno == *pgnoaddr &&
bhp->mf_offset == mf_offset) {
addr = bhp->buf;
+ ++mp->stat.st_hash_searches;
if (cnt > mp->stat.st_hash_longest)
mp->stat.st_hash_longest = cnt;
mp->stat.st_hash_examined += cnt;
goto found;
}
}
- if (cnt > mp->stat.st_hash_longest)
- mp->stat.st_hash_longest = cnt;
- mp->stat.st_hash_examined += cnt;
+ if (cnt != 0) {
+ ++mp->stat.st_hash_searches;
+ if (cnt > mp->stat.st_hash_longest)
+ mp->stat.st_hash_longest = cnt;
+ mp->stat.st_hash_examined += cnt;
+ }
}
/*
@@ -239,8 +245,9 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
* our region lock without screwing up the world.
*/
bucket = BUCKET(mp, mf_offset, *pgnoaddr);
- SH_TAILQ_INSERT_HEAD(&dbmp->htab[bucket], bhp, mq, __bh);
+ SH_TAILQ_INSERT_HEAD(&dbmp->htab[bucket], bhp, hq, __bh);
SH_TAILQ_INSERT_TAIL(&mp->bhq, bhp, q);
+ ++mp->stat.st_page_clean;
b_inserted = 1;
/* Set the page number, and associated MPOOLFILE. */
@@ -281,7 +288,8 @@ reread: if ((ret = __memp_pgread(dbmfp,
* !!!
* The __memp_pgread call discarded and reacquired the region
* lock. Because the buffer reference count was incremented
- * before the region lock was discarded the buffer didn't move.
+ * before the region lock was discarded the buffer can't move
+ * and its contents can't change.
*/
++mp->stat.st_cache_miss;
++mfp->stat.st_cache_miss;
@@ -305,7 +313,8 @@ found: /* Increment the reference count. */
* BH_LOCKED --
* I/O in progress, wait for it to finish. Because the buffer
* reference count was incremented before the region lock was
- * discarded we know the buffer didn't move.
+ * discarded we know the buffer can't move and its contents
+ * can't change.
*/
if (F_ISSET(bhp, BH_LOCKED)) {
UNLOCKREGION(dbmp);
diff --git a/db2/mp/mp_fopen.c b/db2/mp/mp_fopen.c
index 5ab807701c..de59c9ea9b 100644
--- a/db2/mp/mp_fopen.c
+++ b/db2/mp/mp_fopen.c
@@ -7,7 +7,7 @@
#include "config.h"
#ifndef lint
-static const char sccsid[] = "@(#)mp_fopen.c 10.27 (Sleepycat) 9/23/97";
+static const char sccsid[] = "@(#)mp_fopen.c 10.30 (Sleepycat) 10/25/97";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -92,7 +92,7 @@ __memp_fopen(dbmp, path,
/* Allocate and initialize the per-process structure. */
if ((dbmfp =
- (DB_MPOOLFILE *)calloc(1, sizeof(DB_MPOOLFILE))) == NULL) {
+ (DB_MPOOLFILE *)__db_calloc(1, sizeof(DB_MPOOLFILE))) == NULL) {
__db_err(dbenv, "%s: %s",
path == NULL ? TEMPORARY : path, strerror(ENOMEM));
return (ENOMEM);
@@ -120,7 +120,7 @@ __memp_fopen(dbmp, path,
/* Open the file. */
- if ((ret = __db_fdopen(dbmfp->path,
+ if ((ret = __db_open(dbmfp->path,
LF_ISSET(DB_CREATE | DB_RDONLY), DB_CREATE | DB_RDONLY,
mode, &dbmfp->fd)) != 0) {
__db_err(dbenv, "%s: %s", dbmfp->path, strerror(ret));
@@ -128,9 +128,11 @@ __memp_fopen(dbmp, path,
}
/* Don't permit files that aren't a multiple of the pagesize. */
- if ((ret = __db_stat(dbenv,
- dbmfp->path, dbmfp->fd, &size, NULL)) != 0)
+ if ((ret =
+ __db_ioinfo(dbmfp->path, dbmfp->fd, &size, NULL)) != 0) {
+ __db_err(dbenv, "%s: %s", dbmfp->path, strerror(ret));
goto err;
+ }
if (size % pagesize) {
__db_err(dbenv,
"%s: file size not a multiple of the pagesize",
@@ -198,7 +200,7 @@ __memp_fopen(dbmp, path,
dbmfp->addr = NULL;
if (mfp->can_mmap) {
dbmfp->len = size;
- if (__db_mmap(dbmfp->fd, dbmfp->len, 1, 1, &dbmfp->addr) != 0) {
+ if (__db_map(dbmfp->fd, dbmfp->len, 1, 1, &dbmfp->addr) != 0) {
mfp->can_mmap = 0;
dbmfp->addr = NULL;
}
@@ -264,7 +266,7 @@ __memp_mf_open(dbmp, dbmfp,
for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
if (!memcmp(fileid,
- ADDR(dbmp, mfp->fileid_off), DB_FILE_ID_LEN)) {
+ R_ADDR(dbmp, mfp->fileid_off), DB_FILE_ID_LEN)) {
if (ftype != mfp->ftype ||
pagesize != mfp->stat.st_pagesize) {
__db_err(dbmp->dbenv,
@@ -325,10 +327,10 @@ alloc: if ((ret = __memp_ralloc(dbmp, sizeof(MPOOLFILE), NULL, &mfp)) != 0)
if (0) {
err: if (mfp->path_off != 0)
__db_shalloc_free(dbmp->addr,
- ADDR(dbmp, mfp->path_off));
+ R_ADDR(dbmp, mfp->path_off));
if (!istemp)
__db_shalloc_free(dbmp->addr,
- ADDR(dbmp, mfp->fileid_off));
+ R_ADDR(dbmp, mfp->fileid_off));
if (mfp != NULL)
__db_shalloc_free(dbmp->addr, mfp);
mfp = NULL;
@@ -367,7 +369,7 @@ memp_fclose(dbmfp)
/* Discard any mmap information. */
if (dbmfp->addr != NULL &&
- (ret = __db_munmap(dbmfp->addr, dbmfp->len)) != 0)
+ (ret = __db_unmap(dbmfp->addr, dbmfp->len)) != 0)
__db_err(dbmp->dbenv, "%s: %s", dbmfp->path, strerror(ret));
/* Close the file; temporary files may not yet have been created. */
@@ -423,7 +425,7 @@ __memp_mf_close(dbmp, dbmfp)
* fairly expensive to reintegrate the buffers back into the region for
* no purpose.
*/
- mf_offset = OFFSET(dbmp, mfp);
+ mf_offset = R_OFFSET(dbmp, mfp);
for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh); bhp != NULL; bhp = nbhp) {
nbhp = SH_TAILQ_NEXT(bhp, q, __bh);
@@ -436,6 +438,10 @@ __memp_mf_close(dbmp, dbmfp)
#endif
if (bhp->mf_offset == mf_offset) {
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ ++mp->stat.st_page_clean;
+ --mp->stat.st_page_dirty;
+ }
__memp_bhfree(dbmp, mfp, bhp, 0);
SH_TAILQ_INSERT_HEAD(&mp->bhfq, bhp, q, __bh);
}
@@ -446,11 +452,11 @@ __memp_mf_close(dbmp, dbmfp)
/* Free the space. */
__db_shalloc_free(dbmp->addr, mfp);
- __db_shalloc_free(dbmp->addr, ADDR(dbmp, mfp->path_off));
+ __db_shalloc_free(dbmp->addr, R_ADDR(dbmp, mfp->path_off));
if (mfp->fileid_off != 0)
- __db_shalloc_free(dbmp->addr, ADDR(dbmp, mfp->fileid_off));
+ __db_shalloc_free(dbmp->addr, R_ADDR(dbmp, mfp->fileid_off));
if (mfp->pgcookie_off != 0)
- __db_shalloc_free(dbmp->addr, ADDR(dbmp, mfp->pgcookie_off));
+ __db_shalloc_free(dbmp->addr, R_ADDR(dbmp, mfp->pgcookie_off));
ret1: UNLOCKREGION(dbmp);
return (0);
diff --git a/db2/mp/mp_fput.c b/db2/mp/mp_fput.c
index 9ea7cd9d0d..892f179d3a 100644
--- a/db2/mp/mp_fput.c
+++ b/db2/mp/mp_fput.c
@@ -7,7 +7,7 @@
#include "config.h"
#ifndef lint
-static const char sccsid[] = "@(#)mp_fput.c 10.12 (Sleepycat) 9/23/97";
+static const char sccsid[] = "@(#)mp_fput.c 10.14 (Sleepycat) 10/5/97";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -35,10 +35,12 @@ memp_fput(dbmfp, pgaddr, flags)
{
BH *bhp;
DB_MPOOL *dbmp;
+ MPOOL *mp;
MPOOLFILE *mfp;
int wrote, ret;
dbmp = dbmfp->dbmp;
+ mp = dbmp->mp;
/* Validate arguments. */
if (flags) {
@@ -82,10 +84,16 @@ memp_fput(dbmfp, pgaddr, flags)
LOCKREGION(dbmp);
/* Set/clear the page bits. */
- if (LF_ISSET(DB_MPOOL_CLEAN))
+ if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) {
+ ++mp->stat.st_page_clean;
+ --mp->stat.st_page_dirty;
F_CLR(bhp, BH_DIRTY);
- if (LF_ISSET(DB_MPOOL_DIRTY))
+ }
+ if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) {
+ --mp->stat.st_page_clean;
+ ++mp->stat.st_page_dirty;
F_SET(bhp, BH_DIRTY);
+ }
if (LF_ISSET(DB_MPOOL_DISCARD))
F_SET(bhp, BH_DISCARD);
@@ -108,11 +116,11 @@ memp_fput(dbmfp, pgaddr, flags)
}
/* Move the buffer to the head/tail of the LRU chain. */
- SH_TAILQ_REMOVE(&dbmp->mp->bhq, bhp, q, __bh);
+ SH_TAILQ_REMOVE(&mp->bhq, bhp, q, __bh);
if (F_ISSET(bhp, BH_DISCARD))
- SH_TAILQ_INSERT_HEAD(&dbmp->mp->bhq, bhp, q, __bh);
+ SH_TAILQ_INSERT_HEAD(&mp->bhq, bhp, q, __bh);
else
- SH_TAILQ_INSERT_TAIL(&dbmp->mp->bhq, bhp, q);
+ SH_TAILQ_INSERT_TAIL(&mp->bhq, bhp, q);
/*
* If this buffer is scheduled for writing because of a checkpoint,
@@ -125,14 +133,14 @@ memp_fput(dbmfp, pgaddr, flags)
if (F_ISSET(bhp, BH_DIRTY)) {
if (__memp_bhwrite(dbmp,
dbmfp->mfp, bhp, NULL, &wrote) != 0 || !wrote)
- F_SET(dbmp->mp, MP_LSN_RETRY);
+ F_SET(mp, MP_LSN_RETRY);
} else {
F_CLR(bhp, BH_WRITE);
- mfp = ADDR(dbmp, bhp->mf_offset);
+ mfp = R_ADDR(dbmp, bhp->mf_offset);
--mfp->lsn_cnt;
- --dbmp->mp->lsn_cnt;
+ --mp->lsn_cnt;
}
UNLOCKREGION(dbmp);
diff --git a/db2/mp/mp_fset.c b/db2/mp/mp_fset.c
index a3a3dcef9c..a7d2706008 100644
--- a/db2/mp/mp_fset.c
+++ b/db2/mp/mp_fset.c
@@ -7,7 +7,7 @@
#include "config.h"
#ifndef lint
-static const char sccsid[] = "@(#)mp_fset.c 10.9 (Sleepycat) 9/20/97";
+static const char sccsid[] = "@(#)mp_fset.c 10.10 (Sleepycat) 10/5/97";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -34,9 +34,13 @@ memp_fset(dbmfp, pgaddr, flags)
{
BH *bhp;
DB_MPOOL *dbmp;
+ MPOOL *mp;
+ MPOOLFILE *mfp;
int ret;
dbmp = dbmfp->dbmp;
+ mfp = dbmfp->mfp;
+ mp = dbmp->mp;
/* Validate arguments. */
if (flags != 0) {
@@ -60,10 +64,16 @@ memp_fset(dbmfp, pgaddr, flags)
LOCKREGION(dbmp);
- if (LF_ISSET(DB_MPOOL_DIRTY))
- F_SET(bhp, BH_DIRTY);
- if (LF_ISSET(DB_MPOOL_CLEAN))
+ if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) {
+ ++mp->stat.st_page_clean;
+ --mp->stat.st_page_dirty;
F_CLR(bhp, BH_DIRTY);
+ }
+ if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) {
+ --mp->stat.st_page_clean;
+ ++mp->stat.st_page_dirty;
+ F_SET(bhp, BH_DIRTY);
+ }
if (LF_ISSET(DB_MPOOL_DISCARD))
F_SET(bhp, BH_DISCARD);
diff --git a/db2/mp/mp_open.c b/db2/mp/mp_open.c
index f622b1ed26..4c19739ebd 100644
--- a/db2/mp/mp_open.c
+++ b/db2/mp/mp_open.c
@@ -7,7 +7,7 @@
#include "config.h"
#ifndef lint
-static const char sccsid[] = "@(#)mp_open.c 10.13 (Sleepycat) 9/23/97";
+static const char sccsid[] = "@(#)mp_open.c 10.15 (Sleepycat) 10/25/97";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -54,7 +54,7 @@ memp_open(path, flags, mode, dbenv, retp)
cachesize = dbenv == NULL ? 0 : dbenv->mp_size;
/* Create and initialize the DB_MPOOL structure. */
- if ((dbmp = (DB_MPOOL *)calloc(1, sizeof(DB_MPOOL))) == NULL)
+ if ((dbmp = (DB_MPOOL *)__db_calloc(1, sizeof(DB_MPOOL))) == NULL)
return (ENOMEM);
LIST_INIT(&dbmp->dbregq);
TAILQ_INIT(&dbmp->dbmfq);
@@ -62,8 +62,7 @@ memp_open(path, flags, mode, dbenv, retp)
dbmp->dbenv = dbenv;
/* Decide if it's possible for anyone else to access the pool. */
- if ((dbenv == NULL && path == NULL) ||
- (dbenv != NULL && F_ISSET(dbenv, DB_MPOOL_PRIVATE)))
+ if ((dbenv == NULL && path == NULL) || LF_ISSET(DB_MPOOL_PRIVATE))
F_SET(dbmp, MP_ISPRIVATE);
/*
@@ -183,7 +182,7 @@ memp_register(dbmp, ftype, pgin, pgout)
{
DB_MPREG *mpr;
- if ((mpr = (DB_MPREG *)malloc(sizeof(DB_MPREG))) == NULL)
+ if ((mpr = (DB_MPREG *)__db_malloc(sizeof(DB_MPREG))) == NULL)
return (ENOMEM);
mpr->ftype = ftype;
diff --git a/db2/mp/mp_pr.c b/db2/mp/mp_pr.c
index 7794cfa7f3..01f0920df4 100644
--- a/db2/mp/mp_pr.c
+++ b/db2/mp/mp_pr.c
@@ -7,7 +7,7 @@
#include "config.h"
#ifndef lint
-static const char sccsid[] = "@(#)mp_pr.c 10.13 (Sleepycat) 8/27/97";
+static const char sccsid[] = "@(#)mp_pr.c 10.18 (Sleepycat) 11/1/97";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -53,7 +53,7 @@ memp_stat(dbmp, gspp, fspp, db_malloc)
*gspp = NULL;
if ((*gspp = db_malloc == NULL ?
- (DB_MPOOL_STAT *)malloc(sizeof(**gspp)) :
+ (DB_MPOOL_STAT *)__db_malloc(sizeof(**gspp)) :
(DB_MPOOL_STAT *)db_malloc(sizeof(**gspp))) == NULL)
return (ENOMEM);
@@ -62,6 +62,10 @@ memp_stat(dbmp, gspp, fspp, db_malloc)
/* Copy out the global statistics. */
**gspp = dbmp->mp->stat;
(*gspp)->st_hash_buckets = dbmp->mp->htab_buckets;
+ (*gspp)->st_region_wait =
+ dbmp->mp->rlayout.lock.mutex_set_wait;
+ (*gspp)->st_region_nowait =
+ dbmp->mp->rlayout.lock.mutex_set_nowait;
UNLOCKREGION(dbmp);
}
@@ -85,7 +89,7 @@ memp_stat(dbmp, gspp, fspp, db_malloc)
/* Allocate space for the pointers. */
len = (len + 1) * sizeof(DB_MPOOL_FSTAT *);
if ((*fspp = db_malloc == NULL ?
- (DB_MPOOL_FSTAT **)malloc(len) :
+ (DB_MPOOL_FSTAT **)__db_malloc(len) :
(DB_MPOOL_FSTAT **)db_malloc(len)) == NULL)
return (ENOMEM);
@@ -96,11 +100,11 @@ memp_stat(dbmp, gspp, fspp, db_malloc)
mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
mfp != NULL;
++tfsp, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
- name = ADDR(dbmp, mfp->path_off);
+ name = R_ADDR(dbmp, mfp->path_off);
nlen = strlen(name);
len = sizeof(DB_MPOOL_FSTAT) + nlen + 1;
if ((*tfsp = db_malloc == NULL ?
- (DB_MPOOL_FSTAT *)malloc(len) :
+ (DB_MPOOL_FSTAT *)__db_malloc(len) :
(DB_MPOOL_FSTAT *)db_malloc(len)) == NULL)
return (ENOMEM);
**tfsp = mfp->stat;
@@ -200,18 +204,19 @@ __memp_pmp(fp, dbmp, mp, data)
(void)fprintf(fp, "references: %lu; cachesize: %lu\n",
(u_long)mp->rlayout.refcnt, (u_long)mp->stat.st_cachesize);
(void)fprintf(fp,
- " %lu pages created\n", mp->stat.st_page_create);
+ " %lu pages created\n", (u_long)mp->stat.st_page_create);
(void)fprintf(fp,
- " %lu mmap pages returned\n", mp->stat.st_map);
+ " %lu mmap pages returned\n", (u_long)mp->stat.st_map);
(void)fprintf(fp, " %lu I/O's (%lu read, %lu written)\n",
- mp->stat.st_page_in + mp->stat.st_page_out,
- mp->stat.st_page_in, mp->stat.st_page_out);
+ (u_long)mp->stat.st_page_in + mp->stat.st_page_out,
+ (u_long)mp->stat.st_page_in, (u_long)mp->stat.st_page_out);
if (mp->stat.st_cache_hit + mp->stat.st_cache_miss != 0)
(void)fprintf(fp,
" %.0f%% cache hit rate (%lu hit, %lu miss)\n",
((double)mp->stat.st_cache_hit /
(mp->stat.st_cache_hit + mp->stat.st_cache_miss)) * 100,
- mp->stat.st_cache_hit, mp->stat.st_cache_miss);
+ (u_long)mp->stat.st_cache_hit,
+ (u_long)mp->stat.st_cache_miss);
/* Display the MPOOLFILE structures. */
for (cnt = 0, mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
@@ -230,17 +235,18 @@ __memp_pmp(fp, dbmp, mp, data)
(void)fprintf(fp, "%s\nHASH table of BH's (%lu buckets):\n",
DB_LINE, (u_long)mp->htab_buckets);
(void)fprintf(fp,
- "longest chain searched %lu\n", mp->stat.st_hash_longest);
+ "longest chain searched %lu\n", (u_long)mp->stat.st_hash_longest);
(void)fprintf(fp, "average chain searched %lu (total/calls: %lu/%lu)\n",
- mp->stat.st_hash_examined /
+ (u_long)mp->stat.st_hash_examined /
(mp->stat.st_hash_searches ? mp->stat.st_hash_searches : 1),
- mp->stat.st_hash_examined, mp->stat.st_hash_searches);
+ (u_long)mp->stat.st_hash_examined,
+ (u_long)mp->stat.st_hash_searches);
for (htabp = dbmp->htab,
bucket = 0; bucket < mp->htab_buckets; ++htabp, ++bucket) {
if (SH_TAILQ_FIRST(&dbmp->htab[bucket], __bh) != NULL)
(void)fprintf(fp, "%lu:\n", (u_long)bucket);
for (bhp = SH_TAILQ_FIRST(&dbmp->htab[bucket], __bh);
- bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, mq, __bh))
+ bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
__memp_pbh(fp, dbmp, bhp, data);
}
@@ -249,7 +255,7 @@ __memp_pmp(fp, dbmp, mp, data)
for (sep = "\n ", bhp = SH_TAILQ_FIRST(&dbmp->mp->bhq, __bh);
bhp != NULL; sep = ", ", bhp = SH_TAILQ_NEXT(bhp, q, __bh))
(void)fprintf(fp, "%s%lu/%lu", sep,
- (u_long)bhp->pgno, (u_long)OFFSET(dbmp, bhp));
+ (u_long)bhp->pgno, (u_long)R_OFFSET(dbmp, bhp));
(void)fprintf(fp, "\n");
}
@@ -263,16 +269,18 @@ __memp_pmf(fp, mfp, data)
MPOOLFILE *mfp;
int data;
{
- (void)fprintf(fp, " %lu pages created\n", mfp->stat.st_page_create);
+ (void)fprintf(fp, " %lu pages created\n",
+ (u_long)mfp->stat.st_page_create);
(void)fprintf(fp, " %lu I/O's (%lu read, %lu written)\n",
- mfp->stat.st_page_in + mfp->stat.st_page_out,
- mfp->stat.st_page_in, mfp->stat.st_page_out);
+ (u_long)mfp->stat.st_page_in + mfp->stat.st_page_out,
+ (u_long)mfp->stat.st_page_in, (u_long)mfp->stat.st_page_out);
if (mfp->stat.st_cache_hit + mfp->stat.st_cache_miss != 0)
(void)fprintf(fp,
" %.0f%% cache hit rate (%lu hit, %lu miss)\n",
((double)mfp->stat.st_cache_hit /
(mfp->stat.st_cache_hit + mfp->stat.st_cache_miss)) * 100,
- mfp->stat.st_cache_hit, mfp->stat.st_cache_miss);
+ (u_long)mfp->stat.st_cache_hit,
+ (u_long)mfp->stat.st_cache_miss);
if (!data)
return;
@@ -298,7 +306,7 @@ __memp_pbh(fp, dbmp, bhp, data)
return;
(void)fprintf(fp, " BH @ %lu (mf: %lu): page %lu; ref %lu",
- (u_long)OFFSET(dbmp, bhp),
+ (u_long)R_OFFSET(dbmp, bhp),
(u_long)bhp->mf_offset, (u_long)bhp->pgno, (u_long)bhp->ref);
sep = "; ";
if (F_ISSET(bhp, BH_DIRTY)) {
diff --git a/db2/mp/mp_region.c b/db2/mp/mp_region.c
index a5c52123b9..6b2f93125c 100644
--- a/db2/mp/mp_region.c
+++ b/db2/mp/mp_region.c
@@ -7,7 +7,7 @@
#include "config.h"
#ifndef lint
-static const char sccsid[] = "@(#)mp_region.c 10.11 (Sleepycat) 8/2/97";
+static const char sccsid[] = "@(#)mp_region.c 10.16 (Sleepycat) 10/25/97";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -51,7 +51,7 @@ __memp_ralloc(dbmp, len, offsetp, retp)
nomore = 0;
alloc: if ((ret = __db_shalloc(dbmp->addr, len, MUTEX_ALIGNMENT, &p)) == 0) {
if (offsetp != NULL)
- *offsetp = OFFSET(dbmp, p);
+ *offsetp = R_OFFSET(dbmp, p);
*(void **)retp = p;
return (0);
}
@@ -68,7 +68,7 @@ alloc: if ((ret = __db_shalloc(dbmp->addr, len, MUTEX_ALIGNMENT, &p)) == 0) {
if (__db_shsizeof(bhp) == len) {
SH_TAILQ_REMOVE(&mp->bhfq, bhp, q, __bh);
if (offsetp != NULL)
- *offsetp = OFFSET(dbmp, bhp);
+ *offsetp = R_OFFSET(dbmp, bhp);
*(void **)retp = bhp;
return (0);
}
@@ -82,6 +82,7 @@ alloc: if ((ret = __db_shalloc(dbmp->addr, len, MUTEX_ALIGNMENT, &p)) == 0) {
SH_TAILQ_REMOVE(&mp->bhfq, bhp, q, __bh);
__db_shalloc_free(dbmp->addr, bhp);
+ --mp->stat.st_page_clean;
/*
* Retry as soon as we've freed up sufficient space. If we
@@ -104,7 +105,7 @@ retry: /* Find a buffer we can flush; pure LRU. */
continue;
/* Find the associated MPOOLFILE. */
- mfp = ADDR(dbmp, bhp->mf_offset);
+ mfp = R_ADDR(dbmp, bhp->mf_offset);
/*
* Write the page if it's dirty.
@@ -135,8 +136,7 @@ retry: /* Find a buffer we can flush; pure LRU. */
else {
if (restart)
goto retry;
- else
- continue;
+ continue;
}
} else
++mp->stat.st_ro_evict;
@@ -150,7 +150,7 @@ retry: /* Find a buffer we can flush; pure LRU. */
__memp_bhfree(dbmp, mfp, bhp, 0);
if (offsetp != NULL)
- *offsetp = OFFSET(dbmp, bhp);
+ *offsetp = R_OFFSET(dbmp, bhp);
*(void **)retp = bhp;
return (0);
}
@@ -225,9 +225,13 @@ retry: if (LF_ISSET(DB_CREATE)) {
* be possible for DB_THREAD to be set if HAVE_SPINLOCKS aren't
* defined.
*/
- if (F_ISSET(dbmp, MP_ISPRIVATE))
- ret = (dbmp->maddr = malloc(rlen)) == NULL ? ENOMEM : 0;
- else
+ if (F_ISSET(dbmp, MP_ISPRIVATE)) {
+ if ((dbmp->maddr = __db_malloc(rlen)) == NULL)
+ ret = ENOMEM;
+ else
+ ret = __db_rinit(dbmp->dbenv,
+ dbmp->maddr, 0, rlen, 0);
+ } else
ret = __db_rcreate(dbmp->dbenv, DB_APP_NONE, path,
DB_DEFAULT_MPOOL_FILE, mode, rlen, &fd,
&dbmp->maddr);
@@ -259,7 +263,10 @@ retry: if (LF_ISSET(DB_CREATE)) {
0, &dbmp->htab)) != 0)
goto err;
__db_hashinit(dbmp->htab, mp->htab_buckets);
- mp->htab = OFFSET(dbmp, dbmp->htab);
+ mp->htab = R_OFFSET(dbmp, dbmp->htab);
+
+ ZERO_LSN(mp->lsn);
+ mp->lsn_cnt = 0;
memset(&mp->stat, 0, sizeof(mp->stat));
mp->stat.st_cachesize = cachesize;
@@ -303,7 +310,7 @@ retry: if (LF_ISSET(DB_CREATE)) {
* Get the hash table address; it's on the shared page, so we have
* to lock first.
*/
- dbmp->htab = ADDR(dbmp, dbmp->mp->htab);
+ dbmp->htab = R_ADDR(dbmp, dbmp->mp->htab);
dbmp->fd = fd;
@@ -333,7 +340,7 @@ __memp_rclose(dbmp)
DB_MPOOL *dbmp;
{
if (F_ISSET(dbmp, MP_ISPRIVATE)) {
- free(dbmp->maddr);
+ __db_free(dbmp->maddr);
return (0);
}
return (__db_rclose(dbmp->dbenv, dbmp->fd, dbmp->maddr));
diff --git a/db2/mp/mp_sync.c b/db2/mp/mp_sync.c
index 65b2a18267..2f042df9e1 100644
--- a/db2/mp/mp_sync.c
+++ b/db2/mp/mp_sync.c
@@ -7,13 +7,14 @@
#include "config.h"
#ifndef lint
-static const char sccsid[] = "@(#)mp_sync.c 10.9 (Sleepycat) 8/29/97";
+static const char sccsid[] = "@(#)mp_sync.c 10.15 (Sleepycat) 11/1/97";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
#include <errno.h>
+#include <stdlib.h>
#include <string.h>
#endif
@@ -23,6 +24,8 @@ static const char sccsid[] = "@(#)mp_sync.c 10.9 (Sleepycat) 8/29/97";
#include "mp.h"
#include "common_ext.h"
+static int __bhcmp __P((const void *, const void *));
+
/*
* memp_sync --
* Mpool sync function.
@@ -32,27 +35,40 @@ memp_sync(dbmp, lsnp)
DB_MPOOL *dbmp;
DB_LSN *lsnp;
{
- BH *bhp;
+ BH *bhp, **bharray;
DB_ENV *dbenv;
MPOOL *mp;
MPOOLFILE *mfp;
- int can_write, wrote, lsn_cnt, restart, ret;
+ int ar_cnt, cnt, nalloc, next, notused, ret, wrote;
dbenv = dbmp->dbenv;
- if (dbmp->dbenv->lg_info == NULL) {
- __db_err(dbenv, "memp_sync requires logging");
+ if (dbenv->lg_info == NULL) {
+ __db_err(dbenv, "memp_sync: requires logging");
return (EINVAL);
}
+ /*
+ * We try and write the buffers in page order so that the underlying
+ * filesystem doesn't have to seek and can write contiguous blocks,
+ * plus, we don't want to hold the region lock while we write the
+ * buffers. Get memory to hold the buffer pointers. Get a good-size
+ * block, too, because we realloc while holding the region lock if we
+ * run out.
+ */
+ if ((bharray =
+ (BH **)__db_malloc((nalloc = 1024) * sizeof(BH *))) == NULL)
+ return (ENOMEM);
+
LOCKREGION(dbmp);
/*
- * If the application is asking about a previous call, and we haven't
- * found any buffers that the application holding the pin couldn't
- * write, return yes or no based on the current count. Note, if the
- * application is asking about a LSN *smaller* than one we've already
- * handled, then we return based on the count for that LSN.
+ * If the application is asking about a previous call to memp_sync(),
+ * and we haven't found any buffers that the application holding the
+ * pin couldn't write, return yes or no based on the current count.
+ * Note, if the application is asking about a LSN *smaller* than one
+ * we've already handled or are currently handling, then we return a
+ * result based on the count for the larger LSN.
*/
mp = dbmp->mp;
if (!F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) {
@@ -61,9 +77,7 @@ memp_sync(dbmp, lsnp)
ret = 0;
} else
ret = DB_INCOMPLETE;
-
- UNLOCKREGION(dbmp);
- return (ret);
+ goto done;
}
/* Else, it's a new checkpoint. */
@@ -74,7 +88,7 @@ memp_sync(dbmp, lsnp)
* for which we were already doing a checkpoint. (BTW, I don't expect
* to see multiple LSN's from the same or multiple processes, but You
* Just Never Know. Responding as if they all called with the largest
- * of the LSNs specified makes everything work.
+ * of the LSNs specified makes everything work.)
*
* We don't currently use the LSN we save. We could potentially save
* the last-written LSN in each buffer header and use it to determine
@@ -93,64 +107,127 @@ memp_sync(dbmp, lsnp)
/*
* Walk the list of buffers and mark all dirty buffers to be written
- * and all pinned buffers to be potentially written. We do this in
- * single fell swoop while holding the region locked so that processes
- * can't make new buffers dirty, causing us to never finish. Since
- * the application may have restarted the sync, clear any BH_WRITE
- * flags that appear to be left over.
+ * and all pinned buffers to be potentially written (we can't know if
+ * we'll need to write them until the holding process returns them to
+ * the cache). We do this in one pass while holding the region locked
+ * so that processes can't make new buffers dirty, causing us to never
+ * finish. Since the application may have restarted the sync, clear
+ * any BH_WRITE flags that appear to be left over from previous calls.
+ *
+ * Keep a count of the total number of buffers we need to write in
+ * MPOOL->lsn_cnt, and for each file, in MPOOLFILE->lsn_cnt.
*/
- can_write = lsn_cnt = 0;
- for (lsn_cnt = 0, bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
+ ar_cnt = 0;
+ for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) {
F_SET(bhp, BH_WRITE);
- if (bhp->ref == 0)
- can_write = 1;
+ ++mp->lsn_cnt;
- mfp = ADDR(dbmp, bhp->mf_offset);
+ mfp = R_ADDR(dbmp, bhp->mf_offset);
++mfp->lsn_cnt;
- ++lsn_cnt;
+ /*
+ * If the buffer isn't in use, we should be able to
+ * write it immediately, so save a reference to it.
+ */
+ if (bhp->ref == 0) {
+ if (ar_cnt == nalloc) {
+ nalloc *= 2;
+ if ((bharray =
+ (BH **)__db_realloc(bharray,
+ nalloc * sizeof(BH *))) == NULL) {
+ ret = ENOMEM;
+ goto err;
+ }
+ }
+ bharray[ar_cnt++] = bhp;
+ }
} else
F_CLR(bhp, BH_WRITE);
- mp->lsn_cnt = lsn_cnt;
-
- /* If there no buffers we can write, we're done. */
- if (!can_write) {
- UNLOCKREGION(dbmp);
- return (mp->lsn_cnt ? DB_INCOMPLETE : 0);
+ /* If there are no buffers we can write immediately, we're done. */
+ if (ar_cnt == 0) {
+ ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
+ goto done;
}
- /*
- * Write any buffers that we can. Restart the walk after each write,
- * __memp_pgwrite() discards and reacquires the region lock during I/O.
- */
-retry: for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
- bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
- /* Ignore pinned or locked buffers. */
- if (!F_ISSET(bhp, BH_WRITE) ||
- bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED))
- continue;
+ /* Lock down the buffers and their contents. */
+ for (cnt = 0; cnt < ar_cnt; ++cnt)
+ ++bharray[cnt]->ref;
- mfp = ADDR(dbmp, bhp->mf_offset);
- if ((ret =
- __memp_bhwrite(dbmp, mfp, bhp, &restart, &wrote)) != 0)
- goto err;
- if (wrote) {
- if (restart)
- goto retry;
+ UNLOCKREGION(dbmp);
+
+ /* Sort the buffers we're going to write. */
+ qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);
+
+ LOCKREGION(dbmp);
+
+ /* Walk the array, writing buffers. */
+ for (next = 0; next < ar_cnt; ++next) {
+ /*
+ * It's possible for a thread to have gotten the buffer since
+ * we listed it for writing. If the reference count is still
+ * 1, we're the only ones using the buffer, go ahead and write.
+ * If it's >1, then skip the buffer and assume that it will be
+ * written when it's returned to the cache.
+ */
+ if (bharray[next]->ref > 1) {
+ --bharray[next]->ref;
continue;
}
- __db_err(dbenv, "%s: unable to flush page: %lu",
- ADDR(dbmp, mfp->path_off), (u_long)bhp->pgno);
- ret = EPERM;
- goto err;
+
+ /* Write the buffer. */
+ mfp = R_ADDR(dbmp, bharray[next]->mf_offset);
+ ret =
+ __memp_bhwrite(dbmp, mfp, bharray[next], &notused, &wrote);
+
+ /* Release the buffer. */
+ --bharray[next]->ref;
+
+ /* If there's an error, release the rest of the buffers. */
+ if (ret != 0 || !wrote) {
+ while (++next < ar_cnt)
+ --bharray[next]->ref;
+
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Any process syncing the shared memory buffer pool
+ * had better be able to write to any underlying file.
+ * Be understanding, but firm, on this point.
+ */
+ if (!wrote) {
+ __db_err(dbenv, "%s: unable to flush page: %lu",
+ R_ADDR(dbmp, mfp->path_off),
+ (u_long)bharray[next]->pgno);
+ ret = EPERM;
+ goto err;
+ }
+ }
}
ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
-err: UNLOCKREGION(dbmp);
+done:
+ if (0) {
+err: /*
+ * On error, clear:
+ * MPOOL->lsn_cnt (the total sync count)
+ * MPOOLFILE->lsn_cnt (the per-file sync count)
+ * BH_WRITE flag (the scheduled for writing flag)
+ */
+ mp->lsn_cnt = 0;
+ for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
+ mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
+ mfp->lsn_cnt = 0;
+ for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
+ bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
+ F_CLR(bhp, BH_WRITE);
+ }
+ UNLOCKREGION(dbmp);
+ __db_free(bharray);
return (ret);
}
@@ -162,10 +239,10 @@ int
memp_fsync(dbmfp)
DB_MPOOLFILE *dbmfp;
{
- BH *bhp;
+ BH *bhp, **bharray, **newarray;
DB_MPOOL *dbmp;
size_t mf_offset;
- int pincnt, restart, ret, wrote;
+ int ar_cnt, cnt, nalloc, next, pincnt, notused, ret, wrote;
/*
* If this handle doesn't have a file descriptor that's open for
@@ -175,35 +252,205 @@ memp_fsync(dbmfp)
if (F_ISSET(dbmfp, MP_READONLY | MP_PATH_TEMP))
return (0);
- dbmp = dbmfp->dbmp;
ret = 0;
+ dbmp = dbmfp->dbmp;
+ mf_offset = R_OFFSET(dbmp, dbmfp->mfp);
- mf_offset = OFFSET(dbmp, dbmfp->mfp);
+ /*
+ * We try and write the buffers in page order so that the underlying
+ * filesystem doesn't have to seek and can write contiguous blocks,
+ * plus, we don't want to hold the region lock while we write the
+ * buffers. Get memory to hold the buffer pointers. Get a good-size
+ * block, too, because we realloc while holding the region lock if we
+ * run out.
+ */
+ nalloc = 1024;
+ if ((bharray =
+ (BH **)__db_malloc((size_t)nalloc * sizeof(BH *))) == NULL)
+ return (ENOMEM);
LOCKREGION(dbmp);
/*
- * Walk the list of buffer headers for the MPOOLFILE, and write out any
- * dirty buffers that we can.
+ * Walk the LRU list of buffer headers, and get a list of buffers to
+ * write for this MPOOLFILE.
*/
-retry: pincnt = 0;
+ ar_cnt = pincnt = 0;
for (bhp = SH_TAILQ_FIRST(&dbmp->mp->bhq, __bh);
- bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
- if (F_ISSET(bhp, BH_DIRTY) && bhp->mf_offset == mf_offset) {
- if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) {
- ++pincnt;
- continue;
- }
- if ((ret =
- __memp_pgwrite(dbmfp, bhp, &restart, &wrote)) != 0)
+ bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
+ if (!F_ISSET(bhp, BH_DIRTY) || bhp->mf_offset != mf_offset)
+ continue;
+ if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) {
+ ++pincnt;
+ continue;
+ }
+
+ if (ar_cnt == nalloc) {
+ /*
+ * Grow the array through a temporary pointer. If the
+ * reallocation fails, bharray still references the
+ * original block, which is released at the err label
+ * instead of being leaked (this assumes __db_realloc
+ * follows realloc(3) and leaves the old block intact
+ * on failure).
+ */
+ nalloc *= 2;
+ if ((newarray = (BH **)__db_realloc(bharray,
+ (size_t)nalloc * sizeof(BH *))) == NULL) {
+ ret = ENOMEM;
goto err;
- if (!wrote)
- ++pincnt;
- if (restart)
- goto retry;
+ }
+ bharray = newarray;
+ }
+
+ bharray[ar_cnt++] = bhp;
+ }
+
+ /* Lock down the buffers and their contents. */
+ for (cnt = 0; cnt < ar_cnt; ++cnt)
+ ++bharray[cnt]->ref;
+
+ UNLOCKREGION(dbmp);
+
+ /* Sort the buffers we're going to write. */
+ qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);
+
+ LOCKREGION(dbmp);
+
+ /* Walk the array, writing buffers. */
+ for (next = 0; next < ar_cnt; ++next) {
+ /*
+ * It's possible for a thread to have gotten the buffer since
+ * we listed it for writing. If the reference count is still
+ * 1, we're the only ones using the buffer, go ahead and write.
+ * If it's >1, then skip the buffer and assume that it will be
+ * written when it's returned to the cache.
+ */
+ if (bharray[next]->ref > 1) {
+ ++pincnt;
+
+ --bharray[next]->ref;
+ continue;
}
+ /* Write the buffer. */
+ ret = __memp_pgwrite(dbmfp, bharray[next], &notused, &wrote);
+
+ /* Release the buffer. */
+ --bharray[next]->ref;
+
+ /* If there's an error, release the rest of the buffers. */
+ if (ret != 0) {
+ while (++next < ar_cnt)
+ --bharray[next]->ref;
+ goto err;
+ }
+ if (!wrote)
+ ++pincnt;
+ }
+
+err: UNLOCKREGION(dbmp);
+
+ __db_free(bharray);
+
+ /*
+ * Sync the underlying file as the last thing we do, so that the OS
+ * has maximal opportunity to flush buffers before we request it.
+ *
+ * If any dirty buffer for the file was pinned, locked or otherwise
+ * not written (pincnt != 0), skip the sync and report DB_INCOMPLETE.
+ *
+ * XXX:
+ * Don't lock the region around the sync, fsync(2) has no atomicity
+ * issues.
+ */
+ if (ret == 0)
+ return (pincnt == 0 ? __db_fsync(dbmfp->fd) : DB_INCOMPLETE);
+ return (ret);
+
+}
+
+/*
+ * memp_trickle --
+ * Keep a specified percentage of the buffers clean.
+ *
+ * Writes dirty buffers that are neither referenced nor locked, one at
+ * a time, until at least pct percent of the pool's buffers are clean
+ * or no such buffer is left. The region lock is acquired after the
+ * arguments are validated and is released on every later return path.
+ *
+ * dbmp: the memory pool handle.
+ * pct: the percentage of clean buffers to reach; must be 1-100.
+ * nwrotep: if not NULL, set to the number of buffers written.
+ *
+ * Returns 0 on success, EINVAL if pct is out of range, EPERM if a
+ * buffer could not be written, or the error from __memp_bhwrite.
+ */
+int
+memp_trickle(dbmp, pct, nwrotep)
+ DB_MPOOL *dbmp;
+ int pct, *nwrotep;
+{
+ BH *bhp;
+ MPOOL *mp;
+ MPOOLFILE *mfp;
+ u_long total;
+ int notused, ret, wrote;
+
+ mp = dbmp->mp;
+ if (nwrotep != NULL)
+ *nwrotep = 0;
+
+ if (pct < 1 || pct > 100)
+ return (EINVAL);
+
+ LOCKREGION(dbmp);
+
+ /*
+ * If there are sufficient clean buffers, or no buffers or no dirty
+ * buffers, we're done.
+ *
+ * XXX
+ * Using st_page_clean and st_page_dirty is our only choice at the
+ * moment, but it's not as correct as we might like in the presence
+ * of pools with more than one buffer size, as a free 512-byte buffer
+ * isn't the same as a free 8K buffer.
+ */
+loop: total = mp->stat.st_page_clean + mp->stat.st_page_dirty;
+ if (total == 0 || mp->stat.st_page_dirty == 0 ||
+ (mp->stat.st_page_clean * 100) / total >= (u_long)pct) {
+ UNLOCKREGION(dbmp);
+ return (0);
+ }
+
+ /* Loop until we write a buffer. */
+ for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
+ bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
+ if (bhp->ref != 0 ||
+ !F_ISSET(bhp, BH_DIRTY) || F_ISSET(bhp, BH_LOCKED))
+ continue;
+
+ mfp = R_ADDR(dbmp, bhp->mf_offset);
+ if ((ret =
+ __memp_bhwrite(dbmp, mfp, bhp, &notused, &wrote)) != 0)
+ goto err;
+
+ /*
+ * Any process syncing the shared memory buffer pool
+ * had better be able to write to any underlying file.
+ * Be understanding, but firm, on this point.
+ */
+ if (!wrote) {
+ __db_err(dbmp->dbenv, "%s: unable to flush page: %lu",
+ R_ADDR(dbmp, mfp->path_off), (u_long)bhp->pgno);
+ ret = EPERM;
+ goto err;
+ }
+
+ ++mp->stat.st_page_trickle;
+ if (nwrotep != NULL)
+ ++*nwrotep;
+
+ /*
+ * Recompute the clean percentage and rescan the buffer list
+ * from its head.
+ */
+ goto loop;
+ }
+
+ /*
+ * No more buffers to write. Fall through to the common exit so the
+ * region lock is released; returning directly from here would leave
+ * the region locked.
+ */
+ ret = 0;
+
err: UNLOCKREGION(dbmp);
+ return (ret);
+}
+
+/*
+ * __bhcmp --
+ * qsort(3) comparison function for an array of BH pointers. Orders
+ * buffers by owning file (mf_offset), then by page number (pgno).
+ *
+ * Returns a negative value, zero or a positive value if the buffer
+ * referenced by p1 sorts before, equal to or after the one referenced
+ * by p2.
+ */
+static int
+__bhcmp(p1, p2)
+ const void *p1, *p2;
+{
+ BH *bhp1, *bhp2;
+
+ bhp1 = *(BH **)p1;
+ bhp2 = *(BH **)p2;
+
+ /* Sort by file (shared memory pool offset). */
+ if (bhp1->mf_offset < bhp2->mf_offset)
+ return (-1);
+ if (bhp1->mf_offset > bhp2->mf_offset)
+ return (1);
- return (ret == 0 ? (pincnt ? DB_INCOMPLETE : 0) : ret);
+ /*
+ * Sort by page in file. Equal keys must compare as 0: qsort requires
+ * a consistent ordering and may compare an element with itself, so
+ * returning 1 for both argument orders is not a valid comparison.
+ */
+ if (bhp1->pgno < bhp2->pgno)
+ return (-1);
+ if (bhp1->pgno > bhp2->pgno)
+ return (1);
+ return (0);
}