/*- * See the file LICENSE for redistribution information. * * Copyright (c) 1996,2008 Oracle. All rights reserved. * * $Id: mp_region.c,v 12.39 2008/05/08 03:15:38 mjc Exp $ */ #include "db_config.h" #include "db_int.h" #include "dbinc/mp.h" static int __memp_init_config __P((ENV *, MPOOL *)); static void __memp_region_size __P((ENV *, roff_t *, u_int32_t *)); /* * __memp_open -- * Internal version of memp_open: only called from ENV->open. * * PUBLIC: int __memp_open __P((ENV *, int)); */ int __memp_open(env, create_ok) ENV *env; int create_ok; { DB_ENV *dbenv; DB_MPOOL *dbmp; MPOOL *mp; REGINFO reginfo; roff_t reg_size; u_int i, max_nreg; u_int32_t htab_buckets, *regids; int ret; dbenv = env->dbenv; /* Calculate the region size and hash bucket count. */ __memp_region_size(env, ®_size, &htab_buckets); /* Create and initialize the DB_MPOOL structure. */ if ((ret = __os_calloc(env, 1, sizeof(*dbmp), &dbmp)) != 0) return (ret); LIST_INIT(&dbmp->dbregq); TAILQ_INIT(&dbmp->dbmfq); dbmp->env = env; /* Join/create the first mpool region. */ memset(®info, 0, sizeof(REGINFO)); reginfo.env = env; reginfo.type = REGION_TYPE_MPOOL; reginfo.id = INVALID_REGION_ID; reginfo.flags = REGION_JOIN_OK; if (create_ok) F_SET(®info, REGION_CREATE_OK); if ((ret = __env_region_attach(env, ®info, reg_size)) != 0) goto err; /* * If we created the region, initialize it. Create or join any * additional regions. */ if (F_ISSET(®info, REGION_CREATE)) { /* * We define how many regions there are going to be, allocate * the REGINFO structures and create them. Make sure we don't * clear the wrong entries on error. */ max_nreg = __memp_max_regions(env); if ((ret = __os_calloc(env, max_nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0) goto err; /* Make sure we don't clear the wrong entries on error. */ dbmp->reginfo[0] = reginfo; for (i = 1; i < max_nreg; ++i) dbmp->reginfo[i].id = INVALID_REGION_ID; /* Initialize the first region. */ if ((ret = __memp_init(env, dbmp, 0, htab_buckets, max_nreg)) != 0) goto err; /* * Create/initialize remaining regions and copy their IDs into * the first region. */ mp = R_ADDR(dbmp->reginfo, dbmp->reginfo[0].rp->primary); regids = R_ADDR(dbmp->reginfo, mp->regids); regids[0] = dbmp->reginfo[0].id; for (i = 1; i < dbenv->mp_ncache; ++i) { dbmp->reginfo[i].env = env; dbmp->reginfo[i].type = REGION_TYPE_MPOOL; dbmp->reginfo[i].id = INVALID_REGION_ID; dbmp->reginfo[i].flags = REGION_CREATE_OK; if ((ret = __env_region_attach( env, &dbmp->reginfo[i], reg_size)) != 0) goto err; if ((ret = __memp_init(env, dbmp, i, htab_buckets, max_nreg)) != 0) goto err; regids[i] = dbmp->reginfo[i].id; } } else { /* * Determine how many regions there are going to be, allocate * the REGINFO structures and fill in local copies of that * information. */ mp = R_ADDR(®info, reginfo.rp->primary); dbenv->mp_ncache = mp->nreg; if ((ret = __os_calloc(env, mp->max_nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0) goto err; /* Make sure we don't clear the wrong entries on error. */ for (i = 0; i < dbenv->mp_ncache; ++i) dbmp->reginfo[i].id = INVALID_REGION_ID; dbmp->reginfo[0] = reginfo; /* Join remaining regions. */ regids = R_ADDR(dbmp->reginfo, mp->regids); for (i = 1; i < dbenv->mp_ncache; ++i) { dbmp->reginfo[i].env = env; dbmp->reginfo[i].type = REGION_TYPE_MPOOL; dbmp->reginfo[i].id = regids[i]; dbmp->reginfo[i].flags = REGION_JOIN_OK; if ((ret = __env_region_attach( env, &dbmp->reginfo[i], 0)) != 0) goto err; } } /* Set the local addresses for the regions. */ for (i = 0; i < dbenv->mp_ncache; ++i) dbmp->reginfo[i].primary = R_ADDR(&dbmp->reginfo[i], dbmp->reginfo[i].rp->primary); /* If the region is threaded, allocate a mutex to lock the handles. */ if ((ret = __mutex_alloc(env, MTX_MPOOL_HANDLE, DB_MUTEX_PROCESS_ONLY, &dbmp->mutex)) != 0) goto err; env->mp_handle = dbmp; /* A process joining the region may reset the mpool configuration. */ if ((ret = __memp_init_config(env, mp)) != 0) return (ret); return (0); err: env->mp_handle = NULL; if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) { for (i = 0; i < dbenv->mp_ncache; ++i) if (dbmp->reginfo[i].id != INVALID_REGION_ID) (void)__env_region_detach( env, &dbmp->reginfo[i], 0); __os_free(env, dbmp->reginfo); } (void)__mutex_free(env, &dbmp->mutex); __os_free(env, dbmp); return (ret); } /* * __memp_init -- * Initialize a MPOOL structure in shared memory. * * PUBLIC: int __memp_init * PUBLIC: __P((ENV *, DB_MPOOL *, u_int, u_int32_t, u_int)); */ int __memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg) ENV *env; DB_MPOOL *dbmp; u_int reginfo_off, max_nreg; u_int32_t htab_buckets; { BH *frozen_bhp; BH_FROZEN_ALLOC *frozen; DB_ENV *dbenv; DB_MPOOL_HASH *htab, *hp; MPOOL *mp, *main_mp; REGINFO *infop; db_mutex_t mtx_base, mtx_discard, mtx_prev; u_int32_t i; int ret; void *p; dbenv = env->dbenv; infop = &dbmp->reginfo[reginfo_off]; if ((ret = __env_alloc(infop, sizeof(MPOOL), &infop->primary)) != 0) goto mem_err; infop->rp->primary = R_OFFSET(infop, infop->primary); mp = infop->primary; memset(mp, 0, sizeof(*mp)); if ((ret = __mutex_alloc(env, MTX_MPOOL_REGION, 0, &mp->mtx_region)) != 0) return (ret); if (reginfo_off == 0) { ZERO_LSN(mp->lsn); mp->nreg = dbenv->mp_ncache; mp->max_nreg = max_nreg; if ((ret = __env_alloc(&dbmp->reginfo[0], max_nreg * sizeof(u_int32_t), &p)) != 0) goto mem_err; mp->regids = R_OFFSET(dbmp->reginfo, p); mp->nbuckets = dbenv->mp_ncache * htab_buckets; /* Allocate file table space and initialize it. */ if ((ret = __env_alloc(infop, MPOOL_FILE_BUCKETS * sizeof(DB_MPOOL_HASH), &htab)) != 0) goto mem_err; mp->ftab = R_OFFSET(infop, htab); for (i = 0; i < MPOOL_FILE_BUCKETS; i++) { if ((ret = __mutex_alloc(env, MTX_MPOOL_FILE_BUCKET, 0, &htab[i].mtx_hash)) != 0) return (ret); SH_TAILQ_INIT(&htab[i].hash_bucket); htab[i].hash_page_dirty = 0; } /* * Allocate all of the hash bucket mutexes up front. We do * this so that we don't need to free and reallocate mutexes as * the cache is resized. */ mtx_base = mtx_prev = MUTEX_INVALID; for (i = 0; i < mp->max_nreg * htab_buckets; i++) { if ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET, 0, &mtx_discard)) != 0) return (ret); if (i == 0) { mtx_base = mtx_discard; mtx_prev = mtx_discard - 1; } DB_ASSERT(env, mtx_discard == mtx_prev + 1 || mtx_base == MUTEX_INVALID); mtx_prev = mtx_discard; if ((ret = __mutex_alloc(env, MTX_MPOOL_IO, DB_MUTEX_SELF_BLOCK, &mtx_discard)) != 0) return (ret); DB_ASSERT(env, mtx_discard == mtx_prev + 1 || mtx_base == MUTEX_INVALID); mtx_prev = mtx_discard; } } else { main_mp = dbmp->reginfo[0].primary; htab = R_ADDR(&dbmp->reginfo[0], main_mp->htab); mtx_base = htab[0].mtx_hash; } /* * We preallocated all of the mutexes in a block, so for regions after * the first, we skip mutexes in use in earlier regions. Each region * has the same number of buckets and there are two mutexes per hash * bucket (the bucket mutex and the I/O mutex). */ if (mtx_base != MUTEX_INVALID) mtx_base += reginfo_off * htab_buckets * 2; /* Allocate hash table space and initialize it. */ if ((ret = __env_alloc(infop, htab_buckets * sizeof(DB_MPOOL_HASH), &htab)) != 0) goto mem_err; mp->htab = R_OFFSET(infop, htab); for (i = 0; i < htab_buckets; i++) { hp = &htab[i]; hp->mtx_hash = (mtx_base == MUTEX_INVALID) ? MUTEX_INVALID : mtx_base + i * 2; hp->mtx_io = (mtx_base == MUTEX_INVALID) ? MUTEX_INVALID : mtx_base + i * 2 + 1; SH_TAILQ_INIT(&hp->hash_bucket); hp->hash_page_dirty = 0; #ifdef HAVE_STATISTICS hp->hash_io_wait = 0; hp->hash_frozen = hp->hash_thawed = hp->hash_frozen_freed = 0; #endif hp->flags = 0; ZERO_LSN(hp->old_reader); } mp->htab_buckets = htab_buckets; #ifdef HAVE_STATISTICS mp->stat.st_hash_buckets = htab_buckets; #endif SH_TAILQ_INIT(&mp->free_frozen); SH_TAILQ_INIT(&mp->alloc_frozen); /* * Pre-allocate one frozen buffer header. This avoids situations where * the cache becomes full of pages and we don't even have the 28 bytes * (or so) available to allocate a frozen buffer header. */ if ((ret = __env_alloc(infop, sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE), &frozen)) != 0) goto mem_err; frozen_bhp = (BH *)(frozen + 1); SH_TAILQ_INSERT_TAIL(&mp->alloc_frozen, frozen, links); SH_TAILQ_INSERT_TAIL(&mp->free_frozen, frozen_bhp, hq); /* * Only the environment creator knows the total cache size, fill in * those statistics now. */ mp->stat.st_gbytes = dbenv->mp_gbytes; mp->stat.st_bytes = dbenv->mp_bytes; return (0); mem_err:__db_errx(env, "Unable to allocate memory for mpool region"); return (ret); } /* * PUBLIC: u_int32_t __memp_max_regions __P((ENV *)); */ u_int32_t __memp_max_regions(env) ENV *env; { DB_ENV *dbenv; roff_t reg_size, max_size; size_t max_nreg; dbenv = env->dbenv; __memp_region_size(env, ®_size, NULL); max_size = (roff_t)dbenv->mp_max_gbytes * GIGABYTE + dbenv->mp_max_bytes; max_nreg = (max_size + reg_size / 2) / reg_size; /* Sanity check that the number of regions fits in 32 bits. */ DB_ASSERT(env, max_nreg == (u_int32_t)max_nreg); if (max_nreg <= dbenv->mp_ncache) max_nreg = dbenv->mp_ncache; return ((u_int32_t)max_nreg); } /* * __memp_region_size -- * Size the region and figure out how many hash buckets we'll have. */ static void __memp_region_size(env, reg_sizep, htab_bucketsp) ENV *env; roff_t *reg_sizep; u_int32_t *htab_bucketsp; { DB_ENV *dbenv; roff_t reg_size, cache_size; dbenv = env->dbenv; /* * Figure out how big each cache region is. Cast an operand to roff_t * so we do 64-bit arithmetic as appropriate. */ cache_size = (roff_t)dbenv->mp_gbytes * GIGABYTE + dbenv->mp_bytes; reg_size = cache_size / dbenv->mp_ncache; if (reg_sizep != NULL) *reg_sizep = reg_size; /* * Figure out how many hash buckets each region will have. Assume we * want to keep the hash chains with under 10 pages on each chain. We * don't know the pagesize in advance, and it may differ for different * files. Use a pagesize of 1K for the calculation -- we walk these * chains a lot, they must be kept short. * * XXX * Cache sizes larger than 10TB would cause 32-bit wrapping in the * calculation of the number of hash buckets. This probably isn't * something we need to worry about right now, but is checked when the * cache size is set. */ if (htab_bucketsp != NULL) *htab_bucketsp = __db_tablesize((u_int32_t)(reg_size / (10 * 1024))); } /* * __memp_region_mutex_count -- * Return the number of mutexes the mpool region will need. * * PUBLIC: u_int32_t __memp_region_mutex_count __P((ENV *)); */ u_int32_t __memp_region_mutex_count(env) ENV *env; { DB_ENV *dbenv; u_int32_t htab_buckets; dbenv = env->dbenv; __memp_region_size(env, NULL, &htab_buckets); /* * We need a couple of mutexes for the region itself, one for each * file handle (MPOOLFILE) the application allocates, one for each * of the MPOOL_FILE_BUCKETS, and each cache has two mutexes per * hash bucket. */ return (dbenv->mp_ncache * htab_buckets * 2 + 50 + MPOOL_FILE_BUCKETS); } /* * __memp_init_config -- * Initialize shared configuration information. */ static int __memp_init_config(env, mp) ENV *env; MPOOL *mp; { DB_ENV *dbenv; dbenv = env->dbenv; MPOOL_SYSTEM_LOCK(env); if (dbenv->mp_mmapsize != 0) mp->mp_mmapsize = dbenv->mp_mmapsize; if (dbenv->mp_maxopenfd != 0) mp->mp_maxopenfd = dbenv->mp_maxopenfd; if (dbenv->mp_maxwrite != 0) mp->mp_maxwrite = dbenv->mp_maxwrite; if (dbenv->mp_maxwrite_sleep != 0) mp->mp_maxwrite_sleep = dbenv->mp_maxwrite_sleep; MPOOL_SYSTEM_UNLOCK(env); return (0); } /* * __memp_env_refresh -- * Clean up after the mpool system on a close or failed open. * * PUBLIC: int __memp_env_refresh __P((ENV *)); */ int __memp_env_refresh(env) ENV *env; { BH *bhp; BH_FROZEN_ALLOC *frozen_alloc; DB_MPOOL *dbmp; DB_MPOOLFILE *dbmfp; DB_MPOOL_HASH *hp; DB_MPREG *mpreg; MPOOL *mp, *c_mp; REGINFO *infop; db_mutex_t mtx_base, mtx; u_int32_t bucket, htab_buckets, i, max_nreg, nreg; int ret, t_ret; ret = 0; dbmp = env->mp_handle; mp = dbmp->reginfo[0].primary; htab_buckets = mp->htab_buckets; nreg = mp->nreg; max_nreg = mp->max_nreg; hp = R_ADDR(&dbmp->reginfo[0], mp->htab); mtx_base = hp->mtx_hash; /* * If a private region, return the memory to the heap. Not needed for * filesystem-backed or system shared memory regions, that memory isn't * owned by any particular process. */ if (!F_ISSET(env, ENV_PRIVATE)) goto not_priv; /* Discard buffers. */ for (i = 0; i < nreg; ++i) { infop = &dbmp->reginfo[i]; c_mp = infop->primary; for (hp = R_ADDR(infop, c_mp->htab), bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) { while ((bhp = SH_TAILQ_FIRST( &hp->hash_bucket, __bh)) != NULL) if (F_ISSET(bhp, BH_FROZEN)) SH_TAILQ_REMOVE( &hp->hash_bucket, bhp, hq, __bh); else { if (F_ISSET(bhp, BH_DIRTY)) { --hp->hash_page_dirty; F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE); } if ((t_ret = __memp_bhfree( dbmp, infop, hp, bhp, BH_FREE_FREEMEM | BH_FREE_UNLOCKED)) != 0 && ret == 0) ret = t_ret; } } while ((frozen_alloc = SH_TAILQ_FIRST( &c_mp->alloc_frozen, __bh_frozen_a)) != NULL) { SH_TAILQ_REMOVE(&c_mp->alloc_frozen, frozen_alloc, links, __bh_frozen_a); __env_alloc_free(infop, frozen_alloc); } } /* Discard hash bucket mutexes. */ if (mtx_base != MUTEX_INVALID) for (i = 0; i < 2 * max_nreg * htab_buckets; ++i) { mtx = mtx_base + i; if ((t_ret = __mutex_free(env, &mtx)) != 0 && ret == 0) ret = t_ret; } not_priv: /* Discard DB_MPOOLFILEs. */ while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL) if ((t_ret = __memp_fclose(dbmfp, 0)) != 0 && ret == 0) ret = t_ret; /* Discard DB_MPREGs. */ if (dbmp->pg_inout != NULL) __os_free(env, dbmp->pg_inout); while ((mpreg = LIST_FIRST(&dbmp->dbregq)) != NULL) { LIST_REMOVE(mpreg, q); __os_free(env, mpreg); } /* Discard the DB_MPOOL thread mutex. */ if ((t_ret = __mutex_free(env, &dbmp->mutex)) != 0 && ret == 0) ret = t_ret; if (F_ISSET(env, ENV_PRIVATE)) { /* Discard REGION IDs. */ infop = &dbmp->reginfo[0]; __memp_free(infop, NULL, R_ADDR(infop, mp->regids)); /* Discard the File table. */ __memp_free(infop, NULL, R_ADDR(infop, mp->ftab)); /* Discard Hash tables. */ for (i = 0; i < nreg; ++i) { infop = &dbmp->reginfo[i]; c_mp = infop->primary; __memp_free(infop, NULL, R_ADDR(infop, c_mp->htab)); } } /* Detach from the region. */ for (i = 0; i < nreg; ++i) { infop = &dbmp->reginfo[i]; if ((t_ret = __env_region_detach(env, infop, 0)) != 0 && ret == 0) ret = t_ret; } /* Discard DB_MPOOL. */ __os_free(env, dbmp->reginfo); __os_free(env, dbmp); env->mp_handle = NULL; return (ret); }