/*- * See the file LICENSE for redistribution information. * * Copyright (c) 1996-2003 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint static const char revid[] = "$Id: mp_fopen.c,v 1.2 2004/03/30 01:23:44 jtownsen Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include #ifdef HAVE_RPC #include #endif #include #endif #include "db_int.h" #include "dbinc/db_shash.h" #include "dbinc/log.h" #include "dbinc/mp.h" #ifdef HAVE_RPC #include "dbinc_auto/db_server.h" #include "dbinc_auto/rpc_client_ext.h" #endif static int __memp_fclose_pp __P((DB_MPOOLFILE *, u_int32_t)); static int __memp_fopen_pp __P((DB_MPOOLFILE *, const char *, u_int32_t, int, size_t)); static int __memp_get_clear_len __P((DB_MPOOLFILE *, u_int32_t *)); static int __memp_get_flags __P((DB_MPOOLFILE *, u_int32_t *)); static int __memp_get_lsn_offset __P((DB_MPOOLFILE *, int32_t *)); static int __memp_get_maxsize __P((DB_MPOOLFILE *, u_int32_t *, u_int32_t *)); static int __memp_set_maxsize __P((DB_MPOOLFILE *, u_int32_t, u_int32_t)); static int __memp_get_pgcookie __P((DB_MPOOLFILE *, DBT *)); static int __memp_get_priority __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY *)); static int __memp_set_priority __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY)); /* * __memp_fcreate_pp -- * DB_ENV->memp_fcreate pre/post processing. * * PUBLIC: int __memp_fcreate_pp __P((DB_ENV *, DB_MPOOLFILE **, u_int32_t)); */ int __memp_fcreate_pp(dbenv, retp, flags) DB_ENV *dbenv; DB_MPOOLFILE **retp; u_int32_t flags; { int rep_check, ret; PANIC_CHECK(dbenv); /* Validate arguments. */ if ((ret = __db_fchk(dbenv, "DB_ENV->memp_fcreate", flags, 0)) != 0) return (ret); rep_check = IS_ENV_REPLICATED(dbenv) ? 1 : 0; if (rep_check) __env_rep_enter(dbenv); ret = __memp_fcreate(dbenv, retp); if (rep_check) __env_rep_exit(dbenv); return (ret); } /* * __memp_fcreate -- * DB_ENV->memp_fcreate. * * PUBLIC: int __memp_fcreate __P((DB_ENV *, DB_MPOOLFILE **)); */ int __memp_fcreate(dbenv, retp) DB_ENV *dbenv; DB_MPOOLFILE **retp; { DB_MPOOLFILE *dbmfp; int ret; /* Allocate and initialize the per-process structure. */ if ((ret = __os_calloc(dbenv, 1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0) return (ret); dbmfp->ref = 1; dbmfp->lsn_offset = -1; dbmfp->dbenv = dbenv; dbmfp->mfp = INVALID_ROFF; #ifdef HAVE_RPC if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) { dbmfp->get_clear_len = __dbcl_memp_get_clear_len; dbmfp->set_clear_len = __dbcl_memp_set_clear_len; dbmfp->get_fileid = __dbcl_memp_get_fileid; dbmfp->set_fileid = __dbcl_memp_set_fileid; dbmfp->get_flags = __dbcl_memp_get_flags; dbmfp->set_flags = __dbcl_memp_set_flags; dbmfp->get_ftype = __dbcl_memp_get_ftype; dbmfp->set_ftype = __dbcl_memp_set_ftype; dbmfp->get_lsn_offset = __dbcl_memp_get_lsn_offset; dbmfp->set_lsn_offset = __dbcl_memp_set_lsn_offset; dbmfp->get_maxsize = __dbcl_memp_get_maxsize; dbmfp->set_maxsize = __dbcl_memp_set_maxsize; dbmfp->get_pgcookie = __dbcl_memp_get_pgcookie; dbmfp->set_pgcookie = __dbcl_memp_set_pgcookie; dbmfp->get_priority = __dbcl_memp_get_priority; dbmfp->set_priority = __dbcl_memp_set_priority; dbmfp->get = __dbcl_memp_fget; dbmfp->open = __dbcl_memp_fopen; dbmfp->put = __dbcl_memp_fput; dbmfp->set = __dbcl_memp_fset; dbmfp->sync = __dbcl_memp_fsync; } else #endif { dbmfp->get_clear_len = __memp_get_clear_len; dbmfp->set_clear_len = __memp_set_clear_len; dbmfp->get_fileid = __memp_get_fileid; dbmfp->set_fileid = __memp_set_fileid; dbmfp->get_flags = __memp_get_flags; dbmfp->set_flags = __memp_set_flags; dbmfp->get_ftype = __memp_get_ftype; dbmfp->set_ftype = __memp_set_ftype; dbmfp->get_lsn_offset = __memp_get_lsn_offset; dbmfp->set_lsn_offset = __memp_set_lsn_offset; dbmfp->get_maxsize = __memp_get_maxsize; dbmfp->set_maxsize = __memp_set_maxsize; dbmfp->get_pgcookie = __memp_get_pgcookie; dbmfp->set_pgcookie = __memp_set_pgcookie; dbmfp->get_priority = __memp_get_priority; dbmfp->set_priority = __memp_set_priority; dbmfp->get = __memp_fget_pp; dbmfp->open = __memp_fopen_pp; dbmfp->put = __memp_fput_pp; dbmfp->set = __memp_fset_pp; dbmfp->sync = __memp_fsync_pp; } dbmfp->close = __memp_fclose_pp; *retp = dbmfp; return (0); } /* * __memp_get_clear_len -- * Get the clear length. */ static int __memp_get_clear_len(dbmfp, clear_lenp) DB_MPOOLFILE *dbmfp; u_int32_t *clear_lenp; { *clear_lenp = dbmfp->clear_len; return (0); } /* * __memp_set_clear_len -- * DB_MPOOLFILE->set_clear_len. * * PUBLIC: int __memp_set_clear_len __P((DB_MPOOLFILE *, u_int32_t)); */ int __memp_set_clear_len(dbmfp, clear_len) DB_MPOOLFILE *dbmfp; u_int32_t clear_len; { MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_clear_len"); dbmfp->clear_len = clear_len; return (0); } /* * __memp_get_fileid -- * DB_MPOOLFILE->get_fileid. * * PUBLIC: int __memp_get_fileid __P((DB_MPOOLFILE *, u_int8_t *)); */ int __memp_get_fileid(dbmfp, fileid) DB_MPOOLFILE *dbmfp; u_int8_t *fileid; { if (!F_ISSET(dbmfp, MP_FILEID_SET)) { __db_err(dbmfp->dbenv, "get_fileid: file ID not set"); return (EINVAL); } memcpy(fileid, dbmfp->fileid, DB_FILE_ID_LEN); return (0); } /* * __memp_set_fileid -- * DB_MPOOLFILE->set_fileid. * * PUBLIC: int __memp_set_fileid __P((DB_MPOOLFILE *, u_int8_t *)); */ int __memp_set_fileid(dbmfp, fileid) DB_MPOOLFILE *dbmfp; u_int8_t *fileid; { MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_fileid"); memcpy(dbmfp->fileid, fileid, DB_FILE_ID_LEN); F_SET(dbmfp, MP_FILEID_SET); return (0); } /* * __memp_get_flags -- * Get the DB_MPOOLFILE flags; */ static int __memp_get_flags(dbmfp, flagsp) DB_MPOOLFILE *dbmfp; u_int32_t *flagsp; { MPOOLFILE *mfp; mfp = dbmfp->mfp; *flagsp = 0; if (mfp == NULL) *flagsp = FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE); else if (mfp->no_backing_file) FLD_SET(*flagsp, DB_MPOOL_NOFILE); return (0); } /* * __memp_set_flags -- * Set the DB_MPOOLFILE flags; * * PUBLIC: int __memp_set_flags __P((DB_MPOOLFILE *, u_int32_t, int)); */ int __memp_set_flags(dbmfp, flags, onoff) DB_MPOOLFILE *dbmfp; u_int32_t flags; int onoff; { DB_ENV *dbenv; MPOOLFILE *mfp; int ret; dbenv = dbmfp->dbenv; mfp = dbmfp->mfp; #define OKFLAGS (DB_MPOOL_NOFILE | DB_MPOOL_UNLINK) if ((ret = __db_fchk(dbenv, "DB_MPOOLFILE->set_flags", flags, OKFLAGS)) != 0) return (ret); switch (flags) { case 0: break; case DB_MPOOL_NOFILE: if (mfp == NULL) if (onoff) FLD_SET(dbmfp->config_flags, DB_MPOOL_NOFILE); else FLD_CLR(dbmfp->config_flags, DB_MPOOL_NOFILE); else mfp->no_backing_file = onoff; break; case DB_MPOOL_UNLINK: if (mfp == NULL) if (onoff) FLD_SET(dbmfp->config_flags, DB_MPOOL_UNLINK); else FLD_CLR(dbmfp->config_flags, DB_MPOOL_UNLINK); else mfp->unlink_on_close = onoff; break; } return (0); } /* * __memp_get_ftype -- * Get the file type (as registered). * * PUBLIC: int __memp_get_ftype __P((DB_MPOOLFILE *, int *)); */ int __memp_get_ftype(dbmfp, ftypep) DB_MPOOLFILE *dbmfp; int *ftypep; { *ftypep = dbmfp->ftype; return (0); } /* * __memp_set_ftype -- * DB_MPOOLFILE->set_ftype. * * PUBLIC: int __memp_set_ftype __P((DB_MPOOLFILE *, int)); */ int __memp_set_ftype(dbmfp, ftype) DB_MPOOLFILE *dbmfp; int ftype; { MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_ftype"); dbmfp->ftype = ftype; return (0); } /* * __memp_get_lsn_offset -- * Get the page's LSN offset. */ static int __memp_get_lsn_offset(dbmfp, lsn_offsetp) DB_MPOOLFILE *dbmfp; int32_t *lsn_offsetp; { *lsn_offsetp = dbmfp->lsn_offset; return (0); } /* * __memp_set_lsn_offset -- * Set the page's LSN offset. * * PUBLIC: int __memp_set_lsn_offset __P((DB_MPOOLFILE *, int32_t)); */ int __memp_set_lsn_offset(dbmfp, lsn_offset) DB_MPOOLFILE *dbmfp; int32_t lsn_offset; { MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_lsn_offset"); dbmfp->lsn_offset = lsn_offset; return (0); } /* * __memp_get_maxsize -- * Get the file's maximum size. */ static int __memp_get_maxsize(dbmfp, gbytesp, bytesp) DB_MPOOLFILE *dbmfp; u_int32_t *gbytesp, *bytesp; { DB_ENV *dbenv; DB_MPOOL *dbmp; MPOOLFILE *mfp; if ((mfp = dbmfp->mfp) == NULL) { *gbytesp = dbmfp->gbytes; *bytesp = dbmfp->bytes; } else { dbenv = dbmfp->dbenv; dbmp = dbenv->mp_handle; R_LOCK(dbenv, dbmp->reginfo); *gbytesp = mfp->maxpgno / (GIGABYTE / mfp->stat.st_pagesize); *bytesp = (mfp->maxpgno % (GIGABYTE / mfp->stat.st_pagesize)) * mfp->stat.st_pagesize; R_UNLOCK(dbenv, dbmp->reginfo); } return (0); } /* * __memp_set_maxsize -- * Set the files's maximum size. */ static int __memp_set_maxsize(dbmfp, gbytes, bytes) DB_MPOOLFILE *dbmfp; u_int32_t gbytes, bytes; { DB_ENV *dbenv; DB_MPOOL *dbmp; MPOOLFILE *mfp; if ((mfp = dbmfp->mfp) == NULL) { dbmfp->gbytes = gbytes; dbmfp->bytes = bytes; } else { dbenv = dbmfp->dbenv; dbmp = dbenv->mp_handle; R_LOCK(dbenv, dbmp->reginfo); mfp->maxpgno = gbytes * (GIGABYTE / mfp->stat.st_pagesize); mfp->maxpgno += (bytes + mfp->stat.st_pagesize - 1) / mfp->stat.st_pagesize; R_UNLOCK(dbenv, dbmp->reginfo); } return (0); } /* * __memp_get_pgcookie -- * Get the pgin/pgout cookie. */ static int __memp_get_pgcookie(dbmfp, pgcookie) DB_MPOOLFILE *dbmfp; DBT *pgcookie; { if (dbmfp->pgcookie == NULL) { pgcookie->size = 0; pgcookie->data = ""; } else memcpy(pgcookie, dbmfp->pgcookie, sizeof(DBT)); return (0); } /* * __memp_set_pgcookie -- * Set the pgin/pgout cookie. * * PUBLIC: int __memp_set_pgcookie __P((DB_MPOOLFILE *, DBT *)); */ int __memp_set_pgcookie(dbmfp, pgcookie) DB_MPOOLFILE *dbmfp; DBT *pgcookie; { DB_ENV *dbenv; DBT *cookie; int ret; MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_pgcookie"); dbenv = dbmfp->dbenv; if ((ret = __os_calloc(dbenv, 1, sizeof(*cookie), &cookie)) != 0) return (ret); if ((ret = __os_malloc(dbenv, pgcookie->size, &cookie->data)) != 0) { (void)__os_free(dbenv, cookie); return (ret); } memcpy(cookie->data, pgcookie->data, pgcookie->size); cookie->size = pgcookie->size; dbmfp->pgcookie = cookie; return (0); } /* * __memp_get_priority -- * Set the cache priority for pages from this file. */ static int __memp_get_priority(dbmfp, priorityp) DB_MPOOLFILE *dbmfp; DB_CACHE_PRIORITY *priorityp; { switch (dbmfp->priority) { case MPOOL_PRI_VERY_LOW: *priorityp = DB_PRIORITY_VERY_LOW; break; case MPOOL_PRI_LOW: *priorityp = DB_PRIORITY_LOW; break; case MPOOL_PRI_DEFAULT: *priorityp = DB_PRIORITY_DEFAULT; break; case MPOOL_PRI_HIGH: *priorityp = DB_PRIORITY_HIGH; break; case MPOOL_PRI_VERY_HIGH: *priorityp = DB_PRIORITY_VERY_HIGH; break; default: __db_err(dbmfp->dbenv, "DB_MPOOLFILE->get_priority: unknown priority value: %d", dbmfp->priority); return (EINVAL); } return (0); } /* * __memp_set_priority -- * Set the cache priority for pages from this file. */ static int __memp_set_priority(dbmfp, priority) DB_MPOOLFILE *dbmfp; DB_CACHE_PRIORITY priority; { switch (priority) { case DB_PRIORITY_VERY_LOW: dbmfp->priority = MPOOL_PRI_VERY_LOW; break; case DB_PRIORITY_LOW: dbmfp->priority = MPOOL_PRI_LOW; break; case DB_PRIORITY_DEFAULT: dbmfp->priority = MPOOL_PRI_DEFAULT; break; case DB_PRIORITY_HIGH: dbmfp->priority = MPOOL_PRI_HIGH; break; case DB_PRIORITY_VERY_HIGH: dbmfp->priority = MPOOL_PRI_VERY_HIGH; break; default: __db_err(dbmfp->dbenv, "DB_MPOOLFILE->set_priority: unknown priority value: %d", priority); return (EINVAL); } /* Update the underlying file if we've already opened it. */ if (dbmfp->mfp != NULL) dbmfp->mfp->priority = priority; return (0); } /* * __memp_fopen_pp -- * DB_MPOOLFILE->open pre/post processing. */ static int __memp_fopen_pp(dbmfp, path, flags, mode, pagesize) DB_MPOOLFILE *dbmfp; const char *path; u_int32_t flags; int mode; size_t pagesize; { DB_ENV *dbenv; int rep_check, ret; dbenv = dbmfp->dbenv; PANIC_CHECK(dbenv); /* Validate arguments. */ if ((ret = __db_fchk(dbenv, "DB_MPOOLFILE->open", flags, DB_CREATE | DB_DIRECT | DB_EXTENT | DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE)) != 0) return (ret); /* * Require a non-zero, power-of-two pagesize, smaller than the * clear length. */ if (pagesize == 0 || !POWER_OF_TWO(pagesize)) { __db_err(dbenv, "DB_MPOOLFILE->open: page sizes must be a power-of-2"); return (EINVAL); } if (dbmfp->clear_len > pagesize) { __db_err(dbenv, "DB_MPOOLFILE->open: clear length larger than page size"); return (EINVAL); } /* Read-only checks, and local flag. */ if (LF_ISSET(DB_RDONLY) && path == NULL) { __db_err(dbenv, "DB_MPOOLFILE->open: temporary files can't be readonly"); return (EINVAL); } rep_check = IS_ENV_REPLICATED(dbenv) ? 1 : 0; if (rep_check) __env_rep_enter(dbenv); ret = __memp_fopen(dbmfp, NULL, path, flags, mode, pagesize); if (rep_check) __env_rep_exit(dbenv); return (ret); } /* * __memp_fopen -- * DB_MPOOLFILE->open. * * PUBLIC: int __memp_fopen __P((DB_MPOOLFILE *, * PUBLIC: MPOOLFILE *, const char *, u_int32_t, int, size_t)); */ int __memp_fopen(dbmfp, mfp, path, flags, mode, pagesize) DB_MPOOLFILE *dbmfp; MPOOLFILE *mfp; const char *path; u_int32_t flags; int mode; size_t pagesize; { DB_ENV *dbenv; DB_MPOOL *dbmp; DB_MPOOLFILE *tmp_dbmfp; MPOOL *mp; db_pgno_t last_pgno; size_t maxmap; u_int32_t mbytes, bytes, oflags; int refinc, ret; char *rpath; void *p; dbenv = dbmfp->dbenv; dbmp = dbenv->mp_handle; mp = dbmp->reginfo[0].primary; refinc = ret = 0; rpath = NULL; /* * If it's a temporary file, delay the open until we actually need * to write the file, and we know we can't join any existing files. */ if (path == NULL) goto alloc; /* * If our caller knows what mfp we're using, increment the ref count, * no need to search. * * We don't need to acquire a lock other than the mfp itself, because * we know there's another reference and it's not going away. */ if (mfp != NULL) { MUTEX_LOCK(dbenv, &mfp->mutex); ++mfp->mpf_cnt; refinc = 1; MUTEX_UNLOCK(dbenv, &mfp->mutex); } /* * Get the real name for this file and open it. If it's a Queue extent * file, it may not exist, and that's OK. */ oflags = 0; if (LF_ISSET(DB_CREATE)) oflags |= DB_OSO_CREATE; if (LF_ISSET(DB_DIRECT)) oflags |= DB_OSO_DIRECT; if (LF_ISSET(DB_RDONLY)) { F_SET(dbmfp, MP_READONLY); oflags |= DB_OSO_RDONLY; } if ((ret = __db_appname(dbenv, DB_APP_DATA, path, 0, NULL, &rpath)) != 0) goto err; /* * Supply a page size so os_open can decide whether to turn buffering * off if the DB_DIRECT_DB flag is set. */ if ((ret = __os_open_extend(dbenv, rpath, 0, (u_int32_t)pagesize, oflags, mode, &dbmfp->fhp)) != 0) { if (!LF_ISSET(DB_EXTENT)) __db_err(dbenv, "%s: %s", rpath, db_strerror(ret)); goto err; } /* * Cache file handles are shared, and have mutexes to protect the * underlying file handle across seek and read/write calls. */ dbmfp->fhp->ref = 1; if (F_ISSET(dbenv, DB_ENV_THREAD) && (ret = __db_mutex_setup(dbenv, dbmp->reginfo, &dbmfp->fhp->mutexp, MUTEX_ALLOC | MUTEX_THREAD)) != 0) goto err; /* * Figure out the file's size. * * !!! * We can't use off_t's here, or in any code in the mainline library * for that matter. (We have to use them in the os stubs, of course, * as there are system calls that take them as arguments.) The reason * is some customers build in environments where an off_t is 32-bits, * but still run where offsets are 64-bits, and they pay us a lot of * money. */ if ((ret = __os_ioinfo( dbenv, rpath, dbmfp->fhp, &mbytes, &bytes, NULL)) != 0) { __db_err(dbenv, "%s: %s", rpath, db_strerror(ret)); goto err; } /* * Get the file id if we weren't given one. Generated file id's * don't use timestamps, otherwise there'd be no chance of any * other process joining the party. */ if (!F_ISSET(dbmfp, MP_FILEID_SET) && (ret = __os_fileid(dbenv, rpath, 0, dbmfp->fileid)) != 0) goto err; if (mfp != NULL) goto check_map; /* * If not creating a temporary file, walk the list of MPOOLFILE's, * looking for a matching file. Files backed by temporary files * or previously removed files can't match. * * DB_TRUNCATE support. * * The fileID is a filesystem unique number (e.g., a UNIX dev/inode * pair) plus a timestamp. If files are removed and created in less * than a second, the fileID can be repeated. The problem with * repetition happens when the file that previously had the fileID * value still has pages in the pool, since we don't want to use them * to satisfy requests for the new file. * * Because the DB_TRUNCATE flag reuses the dev/inode pair, repeated * opens with that flag set guarantees matching fileIDs when the * machine can open a file and then re-open with truncate within a * second. For this reason, we pass that flag down, and, if we find * a matching entry, we ensure that it's never found again, and we * create a new entry for the current request. */ R_LOCK(dbenv, dbmp->reginfo); for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { /* Skip dead files and temporary files. */ if (mfp->deadfile || F_ISSET(mfp, MP_TEMP)) continue; /* Skip non-matching files. */ if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN) != 0) continue; /* * If the file is being truncated, remove it from the system * and create a new entry. * * !!! * We should be able to set mfp to NULL and break out of the * loop, but I like the idea of checking all the entries. */ if (LF_ISSET(DB_TRUNCATE)) { MUTEX_LOCK(dbenv, &mfp->mutex); mfp->deadfile = 1; MUTEX_UNLOCK(dbenv, &mfp->mutex); continue; } /* * Some things about a file cannot be changed: the clear length, * page size, or lSN location. * * The file type can change if the application's pre- and post- * processing needs change. For example, an application that * created a hash subdatabase in a database that was previously * all btree. * * !!! * We do not check to see if the pgcookie information changed, * or update it if it is. */ if (dbmfp->clear_len != mfp->clear_len || pagesize != mfp->stat.st_pagesize || dbmfp->lsn_offset != mfp->lsn_off) { __db_err(dbenv, "%s: clear length, page size or LSN location changed", path); R_UNLOCK(dbenv, dbmp->reginfo); ret = EINVAL; goto err; } /* * Check to see if this file has died while we waited. * * We normally don't lock the deadfile field when we read it as * we only care if the field is zero or non-zero. We do lock * on read when searching for a matching MPOOLFILE so that two * threads of control don't race between setting the deadfile * bit and incrementing the reference count, that is, a thread * of control decrementing the reference count and then setting * deadfile because the reference count is 0 blocks us finding * the file without knowing it's about to be marked dead. */ MUTEX_LOCK(dbenv, &mfp->mutex); if (mfp->deadfile) { MUTEX_UNLOCK(dbenv, &mfp->mutex); continue; } ++mfp->mpf_cnt; refinc = 1; MUTEX_UNLOCK(dbenv, &mfp->mutex); if (dbmfp->ftype != 0) mfp->ftype = dbmfp->ftype; break; } R_UNLOCK(dbenv, dbmp->reginfo); if (mfp != NULL) goto check_map; alloc: /* Allocate and initialize a new MPOOLFILE. */ if ((ret = __memp_alloc( dbmp, dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0) goto err; memset(mfp, 0, sizeof(MPOOLFILE)); mfp->mpf_cnt = 1; mfp->ftype = dbmfp->ftype; mfp->stat.st_pagesize = pagesize; mfp->lsn_off = dbmfp->lsn_offset; mfp->clear_len = dbmfp->clear_len; mfp->priority = dbmfp->priority; if (dbmfp->gbytes != 0 || dbmfp->bytes != 0) { mfp->maxpgno = dbmfp->gbytes * (GIGABYTE / mfp->stat.st_pagesize); mfp->maxpgno += (dbmfp->bytes + mfp->stat.st_pagesize - 1) / mfp->stat.st_pagesize; } if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) mfp->no_backing_file = 1; if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_UNLINK)) mfp->unlink_on_close = 1; if (LF_ISSET(DB_TXN_NOT_DURABLE)) F_SET(mfp, MP_NOT_DURABLE); if (LF_ISSET(DB_DIRECT)) F_SET(mfp, MP_DIRECT); if (LF_ISSET(DB_EXTENT)) F_SET(mfp, MP_EXTENT); F_SET(mfp, MP_CAN_MMAP); if (path == NULL) F_SET(mfp, MP_TEMP); else { /* * Don't permit files that aren't a multiple of the pagesize, * and find the number of the last page in the file, all the * time being careful not to overflow 32 bits. * * During verify or recovery, we might have to cope with a * truncated file; if the file size is not a multiple of the * page size, round down to a page, we'll take care of the * partial page outside the mpool system. */ if (bytes % pagesize != 0) { if (LF_ISSET(DB_ODDFILESIZE)) bytes -= (u_int32_t)(bytes % pagesize); else { __db_err(dbenv, "%s: file size not a multiple of the pagesize", rpath); ret = EINVAL; goto err; } } /* * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a * page get, we have to increment the last page in the file. * Figure it out and save it away. * * Note correction: page numbers are zero-based, not 1-based. */ last_pgno = (db_pgno_t)(mbytes * (MEGABYTE / pagesize)); last_pgno += (db_pgno_t)(bytes / pagesize); if (last_pgno != 0) --last_pgno; mfp->orig_last_pgno = mfp->last_pgno = last_pgno; /* Copy the file path into shared memory. */ if ((ret = __memp_alloc(dbmp, dbmp->reginfo, NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0) goto err; memcpy(p, path, strlen(path) + 1); /* Copy the file identification string into shared memory. */ if ((ret = __memp_alloc(dbmp, dbmp->reginfo, NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0) goto err; memcpy(p, dbmfp->fileid, DB_FILE_ID_LEN); } /* Copy the page cookie into shared memory. */ if (dbmfp->pgcookie == NULL || dbmfp->pgcookie->size == 0) { mfp->pgcookie_len = 0; mfp->pgcookie_off = 0; } else { if ((ret = __memp_alloc(dbmp, dbmp->reginfo, NULL, dbmfp->pgcookie->size, &mfp->pgcookie_off, &p)) != 0) goto err; memcpy(p, dbmfp->pgcookie->data, dbmfp->pgcookie->size); mfp->pgcookie_len = dbmfp->pgcookie->size; } /* * Prepend the MPOOLFILE to the list of MPOOLFILE's. */ R_LOCK(dbenv, dbmp->reginfo); ret = __db_mutex_setup(dbenv, dbmp->reginfo, &mfp->mutex, MUTEX_NO_RLOCK); if (ret == 0) SH_TAILQ_INSERT_HEAD(&mp->mpfq, mfp, q, __mpoolfile); R_UNLOCK(dbenv, dbmp->reginfo); if (ret != 0) goto err; check_map: /* * We need to verify that all handles open a file either durable or not * durable. This needs to be cross process and cross sub-databases, so * mpool is the place to do it. */ if (!LF_ISSET(DB_RDONLY) && !LF_ISSET(DB_TXN_NOT_DURABLE) != !F_ISSET(mfp, MP_NOT_DURABLE)) { __db_err(dbenv, "Cannot open DURABLE and NOT DURABLE handles in the same file"); ret = EINVAL; goto err; } /* * All paths to here have initialized the mfp variable to reference * the selected (or allocated) MPOOLFILE. */ dbmfp->mfp = mfp; /* * Check to see if we can mmap the file. If a file: * + isn't temporary * + is read-only * + doesn't require any pgin/pgout support * + the DB_NOMMAP flag wasn't set (in either the file open or * the environment in which it was opened) * + and is less than mp_mmapsize bytes in size * * we can mmap it instead of reading/writing buffers. Don't do error * checking based on the mmap call failure. We want to do normal I/O * on the file if the reason we failed was because the file was on an * NFS mounted partition, and we can fail in buffer I/O just as easily * as here. * * We'd like to test to see if the file is too big to mmap. Since we * don't know what size or type off_t's or size_t's are, or the largest * unsigned integral type is, or what random insanity the local C * compiler will perpetrate, doing the comparison in a portable way is * flatly impossible. Hope that mmap fails if the file is too large. */ #define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 MB. */ if (F_ISSET(mfp, MP_CAN_MMAP)) { maxmap = dbenv->mp_mmapsize == 0 ? DB_MAXMMAPSIZE : dbenv->mp_mmapsize; if (path == NULL) F_CLR(mfp, MP_CAN_MMAP); else if (!F_ISSET(dbmfp, MP_READONLY)) F_CLR(mfp, MP_CAN_MMAP); else if (dbmfp->ftype != 0) F_CLR(mfp, MP_CAN_MMAP); else if (LF_ISSET(DB_NOMMAP) || F_ISSET(dbenv, DB_ENV_NOMMAP)) F_CLR(mfp, MP_CAN_MMAP); else if (mbytes > maxmap / MEGABYTE || (mbytes == maxmap / MEGABYTE && bytes >= maxmap % MEGABYTE)) F_CLR(mfp, MP_CAN_MMAP); dbmfp->addr = NULL; if (F_ISSET(mfp, MP_CAN_MMAP)) { dbmfp->len = (size_t)mbytes * MEGABYTE + bytes; if (__os_mapfile(dbenv, rpath, dbmfp->fhp, dbmfp->len, 1, &dbmfp->addr) != 0) { dbmfp->addr = NULL; F_CLR(mfp, MP_CAN_MMAP); } } } F_SET(dbmfp, MP_OPEN_CALLED); /* * Share the underlying file descriptor if that's possible. * * Add the file to the process' list of DB_MPOOLFILEs. */ MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); for (tmp_dbmfp = TAILQ_FIRST(&dbmp->dbmfq); tmp_dbmfp != NULL; tmp_dbmfp = TAILQ_NEXT(tmp_dbmfp, q)) if (dbmfp->mfp == tmp_dbmfp->mfp && (F_ISSET(dbmfp, MP_READONLY) || !F_ISSET(tmp_dbmfp, MP_READONLY))) { if (dbmfp->fhp->mutexp != NULL) __db_mutex_free( dbenv, dbmp->reginfo, dbmfp->fhp->mutexp); (void)__os_closehandle(dbenv, dbmfp->fhp); ++tmp_dbmfp->fhp->ref; dbmfp->fhp = tmp_dbmfp->fhp; break; } TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q); MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); if (0) { err: if (refinc) { MUTEX_LOCK(dbenv, &mfp->mutex); --mfp->mpf_cnt; MUTEX_UNLOCK(dbenv, &mfp->mutex); } } if (rpath != NULL) __os_free(dbenv, rpath); return (ret); } /* * __memp_last_pgno -- * Return the page number of the last page in the file. * * !!! * Undocumented interface: DB private. * * PUBLIC: void __memp_last_pgno __P((DB_MPOOLFILE *, db_pgno_t *)); */ void __memp_last_pgno(dbmfp, pgnoaddr) DB_MPOOLFILE *dbmfp; db_pgno_t *pgnoaddr; { DB_ENV *dbenv; DB_MPOOL *dbmp; dbenv = dbmfp->dbenv; dbmp = dbenv->mp_handle; R_LOCK(dbenv, dbmp->reginfo); *pgnoaddr = dbmfp->mfp->last_pgno; R_UNLOCK(dbenv, dbmp->reginfo); } /* * memp_fclose_pp -- * DB_MPOOLFILE->close pre/post processing. */ static int __memp_fclose_pp(dbmfp, flags) DB_MPOOLFILE *dbmfp; u_int32_t flags; { DB_ENV *dbenv; int rep_check, ret, t_ret; dbenv = dbmfp->dbenv; /* * Validate arguments, but as a handle destructor, we can't fail. * * !!! * DB_MPOOL_DISCARD: Undocumented flag: DB private. */ ret = __db_fchk(dbenv, "DB_MPOOLFILE->close", flags, DB_MPOOL_DISCARD); rep_check = IS_ENV_REPLICATED(dbenv) ? 1 : 0; if (rep_check) __env_rep_enter(dbenv); if ((t_ret = __memp_fclose(dbmfp, flags)) != 0 && ret == 0) ret = t_ret; if (rep_check) __env_rep_exit(dbenv); return (ret); } /* * __memp_fclose -- * DB_MPOOLFILE->close. * * PUBLIC: int __memp_fclose __P((DB_MPOOLFILE *, u_int32_t)); */ int __memp_fclose(dbmfp, flags) DB_MPOOLFILE *dbmfp; u_int32_t flags; { DB_ENV *dbenv; DB_MPOOL *dbmp; MPOOLFILE *mfp; char *rpath; u_int32_t ref; int deleted, ret, t_ret; dbenv = dbmfp->dbenv; dbmp = dbenv->mp_handle; ret = 0; /* * Remove the DB_MPOOLFILE from the process' list. * * It's possible the underlying mpool cache may never have been created. * In that case, all we have is a structure, discard it. * * It's possible the DB_MPOOLFILE was never added to the DB_MPOOLFILE * file list, check the MP_OPEN_CALLED flag to be sure. */ if (dbmp == NULL) goto done; MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); DB_ASSERT(dbmfp->ref >= 1); if ((ref = --dbmfp->ref) == 0 && F_ISSET(dbmfp, MP_OPEN_CALLED)) TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q); /* * Decrement the file descriptor's ref count -- if we're the last ref, * we'll discard the file descriptor. */ if (ref == 0 && dbmfp->fhp != NULL && --dbmfp->fhp->ref > 0) dbmfp->fhp = NULL; MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); if (ref != 0) return (0); /* Complain if pinned blocks never returned. */ if (dbmfp->pinref != 0) { __db_err(dbenv, "%s: close: %lu blocks left pinned", __memp_fn(dbmfp), (u_long)dbmfp->pinref); ret = __db_panic(dbenv, DB_RUNRECOVERY); } /* Discard any mmap information. */ if (dbmfp->addr != NULL && (ret = __os_unmapfile(dbenv, dbmfp->addr, dbmfp->len)) != 0) __db_err(dbenv, "%s: %s", __memp_fn(dbmfp), db_strerror(ret)); /* * Close the file and discard the descriptor structure; temporary * files may not yet have been created. */ if (dbmfp->fhp != NULL) { if (dbmfp->fhp->mutexp != NULL) { __db_mutex_free( dbenv, dbmp->reginfo, dbmfp->fhp->mutexp); dbmfp->fhp->mutexp = NULL; } if ((t_ret = __os_closehandle(dbenv, dbmfp->fhp)) != 0) { __db_err(dbenv, "%s: %s", __memp_fn(dbmfp), db_strerror(t_ret)); if (ret == 0) ret = t_ret; } dbmfp->fhp = NULL; } /* * Discard our reference on the underlying MPOOLFILE, and close it * if it's no longer useful to anyone. It possible the open of the * file never happened or wasn't successful, in which case, mpf will * be NULL and MP_OPEN_CALLED will not be set. */ mfp = dbmfp->mfp; DB_ASSERT((F_ISSET(dbmfp, MP_OPEN_CALLED) && mfp != NULL) || (!F_ISSET(dbmfp, MP_OPEN_CALLED) && mfp == NULL)); if (!F_ISSET(dbmfp, MP_OPEN_CALLED)) goto done; /* * If it's a temp file, all outstanding references belong to unflushed * buffers. (A temp file can only be referenced by one DB_MPOOLFILE). * We don't care about preserving any of those buffers, so mark the * MPOOLFILE as dead so that even the dirty ones just get discarded * when we try to flush them. */ deleted = 0; MUTEX_LOCK(dbenv, &mfp->mutex); if (--mfp->mpf_cnt == 0 || LF_ISSET(DB_MPOOL_DISCARD)) { if (LF_ISSET(DB_MPOOL_DISCARD) || F_ISSET(mfp, MP_TEMP) || mfp->unlink_on_close) mfp->deadfile = 1; if (mfp->unlink_on_close) { if ((t_ret = __db_appname(dbmp->dbenv, DB_APP_DATA, R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) != 0 && ret == 0) ret = t_ret; if (t_ret == 0) { if ((t_ret = __os_unlink( dbmp->dbenv, rpath) != 0) && ret == 0) ret = t_ret; __os_free(dbenv, rpath); } } if (mfp->block_cnt == 0) { if ((t_ret = __memp_mf_discard(dbmp, mfp)) != 0 && ret == 0) ret = t_ret; deleted = 1; } } if (deleted == 0) MUTEX_UNLOCK(dbenv, &mfp->mutex); done: /* Discard the DB_MPOOLFILE structure. */ if (dbmfp->pgcookie != NULL) { __os_free(dbenv, dbmfp->pgcookie->data); __os_free(dbenv, dbmfp->pgcookie); } __os_free(dbenv, dbmfp); return (ret); } /* * __memp_mf_sync -- * sync an MPOOLFILE. Should only be used when * the file is not already open in this process. * * PUBLIC: int __memp_mf_sync __P((DB_MPOOL *, MPOOLFILE *)); */ int __memp_mf_sync(dbmp, mfp) DB_MPOOL *dbmp; MPOOLFILE *mfp; { DB_ENV *dbenv; DB_FH *fhp; int ret, t_ret; char *rpath; dbenv = dbmp->dbenv; if ((ret = __db_appname(dbenv, DB_APP_DATA, R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) == 0) { if ((ret = __os_open(dbenv, rpath, 0, 0, &fhp)) == 0) { ret = __os_fsync(dbenv, fhp); if ((t_ret = __os_closehandle(dbenv, fhp)) != 0 && ret == 0) ret = t_ret; } __os_free(dbenv, rpath); } return (ret); } /* * __memp_mf_discard -- * Discard an MPOOLFILE. * * PUBLIC: int __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *)); */ int __memp_mf_discard(dbmp, mfp) DB_MPOOL *dbmp; MPOOLFILE *mfp; { DB_ENV *dbenv; DB_MPOOL_STAT *sp; MPOOL *mp; int ret; dbenv = dbmp->dbenv; mp = dbmp->reginfo[0].primary; ret = 0; /* * Expects caller to be holding the MPOOLFILE mutex. * * When discarding a file, we have to flush writes from it to disk. * The scenario is that dirty buffers from this file need to be * flushed to satisfy a future checkpoint, but when the checkpoint * calls mpool sync, the sync code won't know anything about them. */ if (mfp->file_written && !mfp->deadfile) ret = __memp_mf_sync(dbmp, mfp); /* * We have to release the MPOOLFILE lock before acquiring the region * lock so that we don't deadlock. Make sure nobody ever looks at * this structure again. */ mfp->deadfile = 1; /* Discard the mutex we're holding. */ MUTEX_UNLOCK(dbenv, &mfp->mutex); /* Delete from the list of MPOOLFILEs. */ R_LOCK(dbenv, dbmp->reginfo); SH_TAILQ_REMOVE(&mp->mpfq, mfp, q, __mpoolfile); /* Copy the statistics into the region. */ sp = &mp->stat; sp->st_cache_hit += mfp->stat.st_cache_hit; sp->st_cache_miss += mfp->stat.st_cache_miss; sp->st_map += mfp->stat.st_map; sp->st_page_create += mfp->stat.st_page_create; sp->st_page_in += mfp->stat.st_page_in; sp->st_page_out += mfp->stat.st_page_out; /* Clear the mutex this MPOOLFILE recorded. */ __db_shlocks_clear(&mfp->mutex, dbmp->reginfo, (REGMAINT *)R_ADDR(dbmp->reginfo, mp->maint_off)); /* Free the space. */ if (mfp->path_off != 0) __db_shalloc_free(dbmp->reginfo[0].addr, R_ADDR(dbmp->reginfo, mfp->path_off)); if (mfp->fileid_off != 0) __db_shalloc_free(dbmp->reginfo[0].addr, R_ADDR(dbmp->reginfo, mfp->fileid_off)); if (mfp->pgcookie_off != 0) __db_shalloc_free(dbmp->reginfo[0].addr, R_ADDR(dbmp->reginfo, mfp->pgcookie_off)); __db_shalloc_free(dbmp->reginfo[0].addr, mfp); R_UNLOCK(dbenv, dbmp->reginfo); return (ret); } /* * __memp_fn -- * On errors we print whatever is available as the file name. * * PUBLIC: char * __memp_fn __P((DB_MPOOLFILE *)); */ char * __memp_fn(dbmfp) DB_MPOOLFILE *dbmfp; { return (__memp_fns(dbmfp->dbenv->mp_handle, dbmfp->mfp)); } /* * __memp_fns -- * On errors we print whatever is available as the file name. * * PUBLIC: char * __memp_fns __P((DB_MPOOL *, MPOOLFILE *)); * */ char * __memp_fns(dbmp, mfp) DB_MPOOL *dbmp; MPOOLFILE *mfp; { if (mfp->path_off == 0) return ((char *)"temporary"); return ((char *)R_ADDR(dbmp->reginfo, mfp->path_off)); }