/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996-2003
 *	Sleepycat Software.  All rights reserved.
 */

#include "db_config.h"

#ifndef lint
static const char revid[] = "$Id: env_region.c,v 1.2 2004/03/30 01:23:16 jtownsen Exp $";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <string.h>
#endif

#include "db_int.h"
#include "dbinc/db_shash.h"
#include "dbinc/lock.h"
#include "dbinc/log.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"

static int  __db_des_destroy __P((DB_ENV *, REGION *, int));
static int  __db_des_get __P((DB_ENV *, REGINFO *, REGINFO *, REGION **));
static int  __db_e_remfile __P((DB_ENV *));
static int  __db_faultmem __P((DB_ENV *, void *, size_t, int));
static void __db_region_destroy __P((DB_ENV *, REGINFO *));

/*
 * __db_e_attach --
 *	Join/create the environment.
 *
 *	dbenv:		the environment handle.  On success its reginfo field
 *			references a newly allocated REGINFO for the REGENV
 *			region.
 *	init_flagsp:	may be NULL.  When creating, the value it references
 *			is stored in the region as init_flags (0 if NULL).
 *			When joining, the region's init_flags are copied out
 *			through it.
 *
 *	Returns 0 on success.  Conditions treated as transient (a region file
 *	that is still too short, a magic number that isn't set yet, a region
 *	size that changed underneath us) are retried up to three times, with
 *	a growing sleep between attempts, before EAGAIN is returned.  Any
 *	other failure returns the underlying error.  On every failure path
 *	the REGINFO, its name, any mapped region and any open lock file
 *	handle are released.
 *
 * PUBLIC: int __db_e_attach __P((DB_ENV *, u_int32_t *));
 */
int
__db_e_attach(dbenv, init_flagsp)
	DB_ENV *dbenv;
	u_int32_t *init_flagsp;
{
	REGENV *renv;
	REGENV_REF ref;
	REGINFO *infop;
	REGION *rp, tregion;
	size_t size;
	size_t nrw;
	u_int32_t mbytes, bytes;
	int retry_cnt, ret, segid;
	char buf[sizeof(DB_REGION_FMT) + 20];

#if !defined(HAVE_MUTEX_THREADS)
	/*
	 * !!!
	 * If we don't have spinlocks, we need a file descriptor for fcntl(2)
	 * locking.  We use the file handle from the REGENV file for this
	 * purpose.
	 *
	 * Since we may be using shared memory regions, e.g., shmget(2), and
	 * not a mapped-in regular file, the backing file may be only a few
	 * bytes in length.  So, this depends on the ability to call fcntl to
	 * lock file offsets much larger than the actual physical file.  I
	 * think that's safe -- besides, very few systems actually need this
	 * kind of support, SunOS is the only one still in wide use of which
	 * I'm aware.
	 *
	 * The error case is if an application lacks spinlocks and wants to be
	 * threaded.  That doesn't work because fcntl may lock the underlying
	 * process, including all its threads.
	 */
	if (F_ISSET(dbenv, DB_ENV_THREAD)) {
		__db_err(dbenv,
	    "architecture lacks fast mutexes: applications cannot be threaded");
		return (EINVAL);
	}
#endif

	/* Initialization */
	retry_cnt = 0;

	/* Repeated initialization. */
loop:	renv = NULL;

	/* Set up the DB_ENV's REG_INFO structure. */
	if ((ret = __os_calloc(dbenv, 1, sizeof(REGINFO), &infop)) != 0)
		return (ret);
	infop->type = REGION_TYPE_ENV;
	infop->id = REGION_ID_ENV;
	infop->mode = dbenv->db_mode;
	infop->flags = REGION_JOIN_OK;
	if (F_ISSET(dbenv, DB_ENV_CREATE))
		F_SET(infop, REGION_CREATE_OK);

	/*
	 * We have to single-thread the creation of the REGENV region.  Once
	 * it exists, we can do locking using locks in the region, but until
	 * then we have to be the only player in the game.
	 *
	 * If this is a private environment, we are only called once and there
	 * are no possible race conditions.
	 *
	 * If this is a public environment, we use the filesystem to ensure
	 * the creation of the environment file is single-threaded.
	 */
	if (F_ISSET(dbenv, DB_ENV_PRIVATE)) {
		if ((ret = __os_strdup(dbenv,
		    "process-private", &infop->name)) != 0)
			goto err;
		goto creation;
	}

	/* Build the region name. */
	(void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV);
	if ((ret = __db_appname(dbenv,
	    DB_APP_NONE, buf, 0, NULL, &infop->name)) != 0)
		goto err;

	/*
	 * Try to create the file, if we have the authority.  We have to ensure
	 * that multiple threads/processes attempting to simultaneously create
	 * the file are properly ordered.  Open using the O_CREAT and O_EXCL
	 * flags so that multiple attempts to create the region will return
	 * failure in all but one.  POSIX 1003.1 requires that EEXIST be the
	 * errno return value -- I sure hope they're right.
	 */
	if (F_ISSET(dbenv, DB_ENV_CREATE)) {
		if ((ret = __os_open(dbenv, infop->name,
		    DB_OSO_CREATE | DB_OSO_DIRECT | DB_OSO_EXCL | DB_OSO_REGION,
		    dbenv->db_mode, &dbenv->lockfhp)) == 0)
			goto creation;
		if (ret != EEXIST) {
			__db_err(dbenv,
			    "%s: %s", infop->name, db_strerror(ret));
			goto err;
		}
	}

	/*
	 * If we couldn't create the file, try and open it.  (If that fails,
	 * we're done.)
	 */
	if ((ret = __os_open(dbenv, infop->name, DB_OSO_REGION | DB_OSO_DIRECT,
	    dbenv->db_mode, &dbenv->lockfhp)) != 0)
		goto err;

	/* The region exists, it's not okay to recreate it. */
	F_CLR(infop, REGION_CREATE_OK);

	/*
	 * !!!
	 * The region may be in system memory not backed by the filesystem
	 * (more specifically, not backed by this file), and we're joining
	 * it.  In that case, the process that created it will have written
	 * out a REGENV_REF structure as its only contents.  We read that
	 * structure before we do anything further, e.g., we can't just map
	 * that file in and then figure out what's going on.
	 *
	 * All of this noise is because some systems don't have a coherent VM
	 * and buffer cache, and what's worse, when you mix operations on the
	 * VM and buffer cache, half the time you hang the system.
	 *
	 * If the file is the size of an REGENV_REF structure, then we know
	 * the real region is in some other memory.  (The only way you get a
	 * file that size is to deliberately write it, as it's smaller than
	 * any possible disk sector created by writing a file or mapping the
	 * file into memory.)  In which case, retrieve the structure from the
	 * file and use it to acquire the referenced memory.
	 *
	 * If the structure is larger than a REGENV_REF structure, then this
	 * file is backing the shared memory region, and we just map it into
	 * memory.
	 *
	 * This scheme has taken a great deal of effort to get working across
	 * platforms; change it with care.
	 */
	if ((ret = __os_ioinfo(dbenv, infop->name,
	    dbenv->lockfhp, &mbytes, &bytes, NULL)) != 0) {
		__db_err(dbenv, "%s: %s", infop->name, db_strerror(ret));
		goto err;
	}

	/*
	 * !!!
	 * A size_t is OK -- regions get mapped into memory, and so can't
	 * be larger than a size_t.
	 */
	size = mbytes * MEGABYTE + bytes;

	/*
	 * If the size is less than the size of a REGENV_REF structure, the
	 * region (or, possibly, the REGENV_REF structure) has not yet been
	 * completely written.  Wait awhile and try again.
	 *
	 * Otherwise, if the size is the size of a REGENV_REF structure,
	 * read it into memory and use it as a reference to the real region.
	 */
	if (size <= sizeof(ref)) {
		/* ret is 0 here, so the retry path treats this as transient. */
		if (size != sizeof(ref))
			goto retry;

		if ((ret = __os_read(dbenv, dbenv->lockfhp, &ref,
		    sizeof(ref), &nrw)) != 0 || nrw < (size_t)sizeof(ref)) {
			/* A short read without an error is reported as EIO. */
			if (ret == 0)
				ret = EIO;
			__db_err(dbenv,
		    "%s: unable to read system-memory information from: %s",
			    infop->name, db_strerror(ret));
			goto err;
		}
		size = ref.size;
		segid = ref.segid;

		F_SET(dbenv, DB_ENV_SYSTEM_MEM);
	} else if (F_ISSET(dbenv, DB_ENV_SYSTEM_MEM)) {
		ret = EINVAL;
		__db_err(dbenv,
		    "%s: existing environment not created in system memory: %s",
		    infop->name, db_strerror(ret));
		goto err;
	} else
		segid = INVALID_REGION_SEGID;

	/*
	 * If not doing thread locking, we need to save the file handle for
	 * fcntl(2) locking.  Otherwise, discard the handle, we no longer
	 * need it, and the less contact between the buffer cache and the VM,
	 * the better.
	 */
#ifdef HAVE_MUTEX_THREADS
	(void)__os_closehandle(dbenv, dbenv->lockfhp);
	dbenv->lockfhp = NULL;
#endif

	/* Call the region join routine to acquire the region. */
	memset(&tregion, 0, sizeof(tregion));
	tregion.size = (roff_t)size;
	tregion.segid = segid;
	if ((ret = __os_r_attach(dbenv, infop, &tregion)) != 0)
		goto err;

	/*
	 * The environment's REGENV structure has to live at offset 0 instead
	 * of the usual shalloc information.  Set the primary reference and
	 * correct the "addr" value to reference the shalloc region.  Note,
	 * this means that all of our offsets (R_ADDR/R_OFFSET) get shifted
	 * as well, but that should be fine.
	 */
	infop->primary = R_ADDR(infop, 0);
	infop->addr = (u_int8_t *)infop->addr + sizeof(REGENV);
	renv = infop->primary;

	/* Make sure the region matches our build. */
	if (renv->majver != DB_VERSION_MAJOR ||
	    renv->minver != DB_VERSION_MINOR) {
		__db_err(dbenv,
	"Program version %d.%d doesn't match environment version",
		    DB_VERSION_MAJOR, DB_VERSION_MINOR);
#ifndef DIAGNOSTIC
		ret = EINVAL;
		goto err;
#endif
	}

	/*
	 * Check if the environment has had a catastrophic failure.
	 *
	 * Check the magic number to ensure the region is initialized.  If the
	 * magic number isn't set, the lock may not have been initialized, and
	 * an attempt to use it could lead to random behavior.
	 *
	 * The panic and magic values aren't protected by any lock, so we never
	 * use them in any check that's more complex than set/not-set.
	 *
	 * !!!
	 * I'd rather play permissions games using the underlying file, but I
	 * can't because Windows/NT filesystems won't open files mode 0.
	 */
	if (renv->envpanic && !F_ISSET(dbenv, DB_ENV_NOPANIC)) {
		ret = __db_panic_msg(dbenv);
		goto err;
	}
	if (renv->magic != DB_REGION_MAGIC)
		goto retry;

	/* Lock the environment. */
	MUTEX_LOCK(dbenv, &renv->mutex);

	/*
	 * Finally!  We own the environment now.  Repeat the panic check, it's
	 * possible that it was set while we waited for the lock.
	 */
	if (renv->envpanic && !F_ISSET(dbenv, DB_ENV_NOPANIC)) {
		ret = __db_panic_msg(dbenv);
		goto err_unlock;
	}

	/*
	 * Get a reference to the underlying REGION information for this
	 * environment.
	 */
	if ((ret = __db_des_get(dbenv, infop, infop, &rp)) != 0 || rp == NULL) {
		MUTEX_UNLOCK(dbenv, &renv->mutex);
		goto find_err;
	}
	infop->rp = rp;

	/*
	 * There's still a possibility for inconsistent data.  When we acquired
	 * the size of the region and attached to it, it might have still been
	 * growing as part of its creation.  We can detect this by checking the
	 * size we originally found against the region's current size.  (The
	 * region's current size has to be final, the creator finished growing
	 * it before releasing the environment for us to lock.)
	 */
	if (rp->size != size) {
err_unlock:	MUTEX_UNLOCK(dbenv, &renv->mutex);
		goto retry;
	}

	/* Increment the reference count. */
	++renv->refcnt;

	/*
	 * If our caller wants them, return the flags this environment was
	 * initialized with.
	 */
	if (init_flagsp != NULL)
		*init_flagsp = renv->init_flags;

	/* Discard our lock. */
	MUTEX_UNLOCK(dbenv, &renv->mutex);

	/*
	 * Fault the pages into memory.  Note, do this AFTER releasing the
	 * lock, because we're only reading the pages, not writing them.
	 */
	(void)__db_faultmem(dbenv, infop->primary, rp->size, 0);

	/* Everything looks good, we're done. */
	dbenv->reginfo = infop;
	return (0);

creation:
	/* Create the environment region. */
	F_SET(infop, REGION_CREATE);

	/*
	 * Allocate room for 50 REGION structures plus overhead (we're going
	 * to use this space for last-ditch allocation requests), although we
	 * should never need anything close to that.
	 *
	 * Encryption passwds are stored in the env region.  Add that in too.
	 */
	memset(&tregion, 0, sizeof(tregion));
	tregion.size = (roff_t)(50 * sizeof(REGION) +
	    dbenv->passwd_len + 2048);
	tregion.segid = INVALID_REGION_SEGID;
	if ((ret = __os_r_attach(dbenv, infop, &tregion)) != 0)
		goto err;

	/*
	 * Fault the pages into memory.  Note, do this BEFORE we initialize
	 * anything, because we're writing the pages, not just reading them.
	 */
	(void)__db_faultmem(dbenv, infop->addr, tregion.size, 1);

	/*
	 * The first object in the region is the REGENV structure.  This is
	 * different from the other regions, and, from everything else in
	 * this region, where all objects are allocated from the pool, i.e.,
	 * there aren't any fixed locations.  The remaining space is made
	 * available for later allocation.
	 *
	 * The allocation space must be size_t aligned, because that's what
	 * the initialization routine is going to store there.  To make sure
	 * that happens, the REGENV structure was padded with a final size_t.
	 * No other region needs to worry about it because all of them treat
	 * the entire region as allocation space.
	 *
	 * Set the primary reference and correct the "addr" value to reference
	 * the shalloc region.  Note, this requires that we "uncorrect" it at
	 * region detach, and that all of our offsets (R_ADDR/R_OFFSET) will be
	 * shifted as well, but that should be fine.
	 */
	infop->primary = R_ADDR(infop, 0);
	infop->addr = (u_int8_t *)infop->addr + sizeof(REGENV);
	__db_shalloc_init(infop->addr, tregion.size - sizeof(REGENV));

	/*
	 * Initialize the rest of the REGENV structure, except for the magic
	 * number which validates the file/environment.
	 */
	renv = infop->primary;
	renv->envpanic = 0;
	db_version(&renv->majver, &renv->minver, &renv->patch);
	SH_LIST_INIT(&renv->regionq);
	renv->refcnt = 1;
	renv->cipher_off = INVALID_ROFF;
	renv->rep_off = INVALID_ROFF;

	/*
	 * Initialize init_flags to store the flags that any other environment
	 * handle that uses DB_JOINENV to join this environment will need.
	 */
	renv->init_flags = (init_flagsp == NULL) ? 0 : *init_flagsp;

	/*
	 * Lock the environment.
	 *
	 * Check the lock call return.  This is the first lock we initialize
	 * and acquire, and we have to know if it fails.  (It CAN fail, e.g.,
	 * SunOS, when using fcntl(2) for locking and using an in-memory
	 * filesystem as the database home.  But you knew that, I'm sure -- it
	 * probably wasn't even worth mentioning.)
	 */
	if ((ret = __db_mutex_setup(dbenv, infop, &renv->mutex,
	    MUTEX_NO_RECORD | MUTEX_NO_RLOCK)) != 0) {
		__db_err(dbenv, "%s: unable to initialize environment lock: %s",
		    infop->name, db_strerror(ret));
		goto err;
	}

	if (!F_ISSET(&renv->mutex, MUTEX_IGNORE) &&
	    (ret = __db_mutex_lock(dbenv, &renv->mutex)) != 0) {
		__db_err(dbenv, "%s: unable to acquire environment lock: %s",
		    infop->name, db_strerror(ret));
		goto err;
	}

	/*
	 * Get the underlying REGION structure for this environment.  Note,
	 * we created the underlying OS region before we acquired the REGION
	 * structure, which is backwards from the normal procedure.  Update
	 * the REGION structure.
	 *
	 * The find_err label is also the target of the join path above when
	 * it cannot locate the environment's REGION; it may arrive with ret
	 * equal to 0, which is why ret is forced to EINVAL below.
	 */
	if ((ret = __db_des_get(dbenv, infop, infop, &rp)) != 0) {
find_err:	__db_err(dbenv,
		    "%s: unable to find environment", infop->name);
		if (ret == 0)
			ret = EINVAL;
		goto err;
	}
	infop->rp = rp;
	rp->size = tregion.size;
	rp->segid = tregion.segid;

	/*
	 * !!!
	 * If we create an environment where regions are public and in system
	 * memory, we have to inform processes joining the environment how to
	 * attach to the shared memory segment.  So, we write the shared memory
	 * identifier into the file, to be read by those other processes.
	 *
	 * XXX
	 * This is really OS-layer information, but I can't see any easy way
	 * to move it down there without passing down information that it has
	 * no right to know, e.g., that this is the one-and-only REGENV region
	 * and not some other random region.
	 */
	if (tregion.segid != INVALID_REGION_SEGID) {
		ref.size = tregion.size;
		ref.segid = tregion.segid;
		if ((ret = __os_write(
		    dbenv, dbenv->lockfhp, &ref, sizeof(ref), &nrw)) != 0) {
			__db_err(dbenv,
			    "%s: unable to write out public environment ID: %s",
			    infop->name, db_strerror(ret));
			goto err;
		}
	}

	/*
	 * If not doing thread locking, we need to save the file handle for
	 * fcntl(2) locking.  Otherwise, discard the handle, we no longer
	 * need it, and the less contact between the buffer cache and the VM,
	 * the better.
	 */
#if defined(HAVE_MUTEX_THREADS)
	if (dbenv->lockfhp != NULL) {
		(void)__os_closehandle(dbenv, dbenv->lockfhp);
		dbenv->lockfhp = NULL;
	}
#endif

	/* Validate the file. */
	renv->magic = DB_REGION_MAGIC;

	/* Discard our lock. */
	MUTEX_UNLOCK(dbenv, &renv->mutex);

	/* Everything looks good, we're done. */
	dbenv->reginfo = infop;
	return (0);

	/*
	 * Shared cleanup: "retry" is reached with ret == 0 (transient
	 * condition), "err" with ret != 0 (hard failure).
	 */
err:
retry:
	/* Close any open file handle. */
	if (dbenv->lockfhp != NULL) {
		(void)__os_closehandle(dbenv, dbenv->lockfhp);
		dbenv->lockfhp = NULL;
	}

	/*
	 * If we joined or created the region, detach from it.  If we created
	 * it, destroy it.  Note, there's a path in the above code where we're
	 * using a temporary REGION structure because we haven't yet allocated
	 * the real one.  In that case the region address (addr) will be filled
	 * in, but the REGION pointer (rp) won't.  Fix it.
	 */
	if (infop->addr != NULL) {
		if (infop->rp == NULL)
			infop->rp = &tregion;

		/* Reset the addr value that we "corrected" above. */
		infop->addr = infop->primary;
		(void)__os_r_detach(dbenv,
		    infop, F_ISSET(infop, REGION_CREATE));
	}

	/* Free the allocated name and/or REGINFO structure. */
	if (infop->name != NULL)
		__os_free(dbenv, infop->name);
	__os_free(dbenv, infop);

	/* If we had a temporary error, wait awhile and try again. */
	if (ret == 0) {
		if (++retry_cnt > 3) {
			__db_err(dbenv, "unable to join the environment");
			ret = EAGAIN;
		} else {
			(void)__os_sleep(dbenv, retry_cnt * 3, 0);
			goto loop;
		}
	}

	return (ret);
}

/*
 * __db_e_detach --
 *	Detach from the environment.
 *
 *	dbenv:		environment handle whose reginfo references the
 *			attached REGENV region; reginfo is freed and set to
 *			NULL before returning.
 *	destroy:	non-zero to destroy the region as well as detach.
 *			Forced to 1 for DB_ENV_PRIVATE environments.
 *
 *	Always returns 0; errors from the helpers it calls are ignored.
 *
 * PUBLIC: int __db_e_detach __P((DB_ENV *, int));
 */
int
__db_e_detach(dbenv, destroy)
	DB_ENV *dbenv;
	int destroy;
{
	REGENV *renv;
	REGINFO *infop;

	infop = dbenv->reginfo;
	renv = infop->primary;

	if (F_ISSET(dbenv, DB_ENV_PRIVATE))
		destroy = 1;

	/* Lock the environment. */
	MUTEX_LOCK(dbenv, &renv->mutex);

	/* Decrement the reference count. */
	if (renv->refcnt == 0) {
		__db_err(dbenv,
		    "region %lu (environment): reference count went negative",
		    (u_long)infop->rp->id);
	} else
		--renv->refcnt;

	/* Release the lock. */
	MUTEX_UNLOCK(dbenv, &renv->mutex);

	/* Close the locking file handle. */
	if (dbenv->lockfhp != NULL) {
		(void)__os_closehandle(dbenv, dbenv->lockfhp);
		dbenv->lockfhp = NULL;
	}

	/* Reset the addr value that we "corrected" above. */
	infop->addr = infop->primary;

	/*
	 * If we are destroying the environment, we need to
	 * destroy any system resources backing the mutex, as well
	 * as any system resources that the replication system may have
	 * acquired and put in the main region.
	 *
	 * Do these now before we free the memory in __os_r_detach.
	 */
	if (destroy) {
		(void)__rep_region_destroy(dbenv);
		__db_mutex_destroy(&renv->mutex);
		__db_mutex_destroy(&infop->rp->mutex);
	}

	/*
	 * Release the region, and kill our reference.
	 *
	 * We set the DB_ENV->reginfo field to NULL here and discard its memory.
	 * DB_ENV->remove calls __dbenv_remove to do the region remove, and
	 * __dbenv_remove attached and then detaches from the region.  We don't
	 * want to return to DB_ENV->remove with a non-NULL DB_ENV->reginfo
	 * field because it will attempt to detach again as part of its cleanup.
	 */
	(void)__os_r_detach(dbenv, infop, destroy);

	if (infop->name != NULL)
		__os_free(dbenv, infop->name);
	__os_free(dbenv, dbenv->reginfo);
	dbenv->reginfo = NULL;

	return (0);
}

/*
 * __db_e_remove --
 *	Discard an environment if it's not in use.
 *
 *	dbenv:	environment handle.  Its DB_ENV_NOLOCKING and DB_ENV_NOPANIC
 *		flags are changed for the duration of the call and restored
 *		to their entry values before returning.
 *	flags:	DB_FORCE removes the environment even if it appears to be in
 *		use, and does so without acquiring locks.
 *
 *	Returns 0 if the environment was removed or could not be joined at
 *	all, EBUSY if it is in use and neither forced nor panic'd.
 *
 * PUBLIC: int __db_e_remove __P((DB_ENV *, u_int32_t));
 */
int
__db_e_remove(dbenv, flags)
	DB_ENV *dbenv;
	u_int32_t flags;
{
	REGENV *renv;
	REGINFO *infop, reginfo;
	REGION *rp;
	u_int32_t db_env_reset;
	int force, ret;

	force = LF_ISSET(DB_FORCE) ? 1 : 0;
	/*
	 * This routine has to walk a nasty line between not looking into
	 * the environment (which may be corrupted after an app or system
	 * crash), and removing everything that needs removing.  What we
	 * do is:
	 *	1. Connect to the environment (so it better be OK).
	 *	2. If the environment is in use (reference count is non-zero),
	 *	   return EBUSY.
	 *	3. Overwrite the magic number so that any threads of control
	 *	   attempting to connect will backoff and retry.
	 *	4. Walk the list of regions.  Connect to each region and then
	 *	   disconnect with the destroy flag set.  This shouldn't cause
	 *	   any problems, even if the region is corrupted, because we
	 *	   should never be looking inside the region.
	 *	5. Walk the list of files in the directory, unlinking any
	 *	   files that match a region name.  Unlink the environment
	 *	   file last.
	 *
	 * If the force flag is set, we do not acquire any locks during this
	 * process.
	 */
	db_env_reset = F_ISSET(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC);
	if (force)
		F_SET(dbenv, DB_ENV_NOLOCKING);
	F_SET(dbenv, DB_ENV_NOPANIC);

	/* Join the environment. */
	if ((ret = __db_e_attach(dbenv, NULL)) != 0) {
		/*
		 * If we can't join it, we assume that's because it doesn't
		 * exist.  It would be better to know why we failed, but it
		 * probably isn't important.
		 */
		ret = 0;
		if (force)
			goto remfiles;
		goto done;
	}

	infop = dbenv->reginfo;
	renv = infop->primary;

	/* Lock the environment. */
	MUTEX_LOCK(dbenv, &renv->mutex);

	/*
	 * If it's in use, we're done unless we're forcing the issue or the
	 * environment has panic'd.  (Presumably, if the environment panic'd,
	 * the thread holding the reference count may not have cleaned up.)
	 * A reference count of 1 is our own reference, taken by the attach
	 * above.
	 */
	if (renv->refcnt == 1 || renv->envpanic == 1 || force) {
		/*
		 * Set the panic flag and overwrite the magic number.
		 *
		 * !!!
		 * From this point on, there's no going back, we pretty
		 * much ignore errors, and just whack on whatever we can.
		 */
		renv->envpanic = 1;
		renv->magic = 0;

		/*
		 * Unlock the environment.  We should no longer need the lock
		 * because we've poisoned the pool, but we can't continue to
		 * hold it either, because other routines may want it.
		 */
		MUTEX_UNLOCK(dbenv, &renv->mutex);

		/*
		 * Attach to each sub-region and destroy it.
		 *
		 * !!!
		 * The REGION_CREATE_OK flag is set for Windows/95 -- regions
		 * are zero'd out when the last reference to the region goes
		 * away, in which case the underlying OS region code requires
		 * callers be prepared to create the region in order to join it.
		 */
		memset(&reginfo, 0, sizeof(reginfo));
		for (rp = SH_LIST_FIRST(&renv->regionq, __db_region);
		    rp != NULL; rp = SH_LIST_NEXT(rp, q, __db_region)) {
			if (rp->type == REGION_TYPE_ENV)
				continue;

			/*
			 * If we get here and can't attach and/or detach to the
			 * region, it's a mess.  Ignore errors, there's nothing
			 * we can do about them.
			 */
			reginfo.id = rp->id;
			reginfo.flags = REGION_CREATE_OK;
			if (__db_r_attach(dbenv, &reginfo, 0) == 0) {
				R_UNLOCK(dbenv, &reginfo);
				(void)__db_r_detach(dbenv, &reginfo, 1);
			}
		}

		/* Destroy the environment's region. */
		(void)__db_e_detach(dbenv, 1);

		/*
		 * Discard any remaining physical files.  This label is also
		 * the target of the forced path above when the environment
		 * could not be joined.
		 */
remfiles:	(void)__db_e_remfile(dbenv);
	} else {
		/* Unlock the environment. */
		MUTEX_UNLOCK(dbenv, &renv->mutex);

		/* Discard the environment. */
		(void)__db_e_detach(dbenv, 0);

		ret = EBUSY;
	}

	/* Restore the caller's locking/panic flag settings. */
done:	F_CLR(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC);
	F_SET(dbenv, db_env_reset);

	return (ret);
}

/*
 * __db_e_remfile --
 *	Discard any region files in the filesystem.
 *
 *	Lists the directory holding the environment file and unlinks every
 *	entry whose name starts with DB_REGION_PREFIX, skipping queue extent
 *	files ("__dbq." prefix).  The environment file itself is removed
 *	last.  With DB_ENV_OVERWRITE set, files are overwritten before being
 *	unlinked.
 *
 *	Returns 0 on success, or the error from building the path or from
 *	listing the directory.  Failures removing individual files are
 *	ignored.
 */
static int
__db_e_remfile(dbenv)
	DB_ENV *dbenv;
{
	int cnt, fcnt, lastrm, ret;
	u_int8_t saved_byte;
	const char *dir;
	char *p, **names, *path, buf[sizeof(DB_REGION_FMT) + 20];

	/* Get the full path of a file in the environment. */
	(void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV);
	if ((ret = __db_appname(dbenv, DB_APP_NONE, buf, 0, NULL, &path)) != 0)
		return (ret);

	/*
	 * Get the parent directory for the environment.  When the path has
	 * a directory component it is temporarily truncated in place; the
	 * overwritten byte is saved so it can be restored below.
	 */
	if ((p = __db_rpath(path)) == NULL) {
		p = path;
		saved_byte = *p;

		dir = PATH_DOT;
	} else {
		saved_byte = *p;
		*p = '\0';

		dir = path;
	}

	/* Get the list of file names. */
	if ((ret = __os_dirlist(dbenv, dir, &names, &fcnt)) != 0)
		__db_err(dbenv, "%s: %s", dir, db_strerror(ret));

	/* Restore the path, and free it. */
	*p = saved_byte;
	__os_free(dbenv, path);

	if (ret != 0)
		return (ret);

	/*
	 * Remove files from the region directory.
	 */
	for (lastrm = -1, cnt = fcnt; --cnt >= 0;) {
		/* Skip anything outside our name space. */
		if (strncmp(names[cnt],
		    DB_REGION_PREFIX, sizeof(DB_REGION_PREFIX) - 1))
			continue;

		/* Skip queue extent files. */
		if (strncmp(names[cnt], "__dbq.", 6) == 0)
			continue;

		/*
		 * Remove the primary environment region last, because it's
		 * the key to this whole mess.
		 */
		if (strcmp(names[cnt], DB_REGION_ENV) == 0) {
			lastrm = cnt;
			continue;
		}

		/* Remove the file. */
		if (__db_appname(dbenv,
		    DB_APP_NONE, names[cnt], 0, NULL, &path) == 0) {
			/*
			 * Overwrite region files.  Temporary files would have
			 * been maintained in encrypted format, so there's no
			 * reason to overwrite them.  This is not an exact
			 * check on the file being a region file, but it's
			 * not likely to be wrong, and the worst thing that can
			 * happen is we overwrite a file that didn't need to be
			 * overwritten.
			 */
			if (F_ISSET(dbenv, DB_ENV_OVERWRITE) &&
			    strlen(names[cnt]) == DB_REGION_NAME_LENGTH)
				(void)__db_overwrite(dbenv, path);
			(void)__os_unlink(dbenv, path);
			__os_free(dbenv, path);
		}
	}

	/* Finally, remove the environment file itself if we saw it. */
	if (lastrm != -1)
		if (__db_appname(dbenv,
		    DB_APP_NONE, names[lastrm], 0, NULL, &path) == 0) {
			if (F_ISSET(dbenv, DB_ENV_OVERWRITE))
				(void)__db_overwrite(dbenv, path);
			(void)__os_unlink(dbenv, path);
			__os_free(dbenv, path);
		}
	__os_dirfree(dbenv, names, fcnt);

	return (0);
}

/*
 * __db_e_stat --
 *	Statistics for the environment.
 *
 *	dbenv:			environment handle with an attached REGENV.
 *	arg_renv:		out: receives a copy of the REGENV structure.
 *	arg_regions:		out: array receiving copies of the REGION
 *				structures on the environment's region list.
 *	arg_regions_cnt:	in: capacity of arg_regions; out: see below.
 *	flags:			0 or DB_STAT_CLEAR.  DB_STAT_CLEAR zeroes the
 *				mutex wait/nowait counters after they have
 *				been copied out.
 *
 *	Returns 0 on success, or the error from the flag check.
 *
 *	NOTE(review): the count written back through arg_regions_cnt is one
 *	less than the number of entries copied whenever at least one entry
 *	was copied.  Confirm against callers before changing it.
 *
 * PUBLIC: int __db_e_stat __P((DB_ENV *,
 * PUBLIC:       REGENV *, REGION *, int *, u_int32_t));
 */
int
__db_e_stat(dbenv, arg_renv, arg_regions, arg_regions_cnt, flags)
	DB_ENV *dbenv;
	REGENV *arg_renv;
	REGION *arg_regions;
	int *arg_regions_cnt;
	u_int32_t flags;
{
	REGENV *renv;
	REGINFO *infop;
	REGION *rp;
	int n, ret;

	infop = dbenv->reginfo;
	renv = infop->primary;
	rp = infop->rp;
	if ((ret = __db_fchk(dbenv,
	    "DB_ENV->stat", flags, DB_STAT_CLEAR)) != 0)
		return (ret);

	/* Lock the environment. */
	MUTEX_LOCK(dbenv, &rp->mutex);

	*arg_renv = *renv;
	if (LF_ISSET(DB_STAT_CLEAR)) {
		renv->mutex.mutex_set_nowait = 0;
		renv->mutex.mutex_set_wait = 0;
	}

	for (n = 0, rp = SH_LIST_FIRST(&renv->regionq, __db_region);
	    n < *arg_regions_cnt && rp != NULL;
	    ++n, rp = SH_LIST_NEXT(rp, q, __db_region)) {
		arg_regions[n] = *rp;
		if (LF_ISSET(DB_STAT_CLEAR)) {
			rp->mutex.mutex_set_nowait = 0;
			rp->mutex.mutex_set_wait = 0;
		}
	}

	/*
	 * Release the lock.  The loop above reused rp as its cursor, so
	 * reload the environment's REGION before unlocking.
	 */
	rp = infop->rp;
	MUTEX_UNLOCK(dbenv, &rp->mutex);

	*arg_regions_cnt = n == 0 ? n : n - 1;

	return (0);
}

/*
 * __db_r_attach --
 *	Join/create a region.
* * PUBLIC: int __db_r_attach __P((DB_ENV *, REGINFO *, size_t)); */ int __db_r_attach(dbenv, infop, size) DB_ENV *dbenv; REGINFO *infop; size_t size; { REGENV *renv; REGION *rp; int ret; char buf[sizeof(DB_REGION_FMT) + 20]; renv = ((REGINFO *)dbenv->reginfo)->primary; /* Lock the environment. */ MUTEX_LOCK(dbenv, &renv->mutex); /* * Find or create a REGION structure for this region. If we create * it, the REGION_CREATE flag will be set in the infop structure. */ F_CLR(infop, REGION_CREATE); if ((ret = __db_des_get(dbenv, dbenv->reginfo, infop, &rp)) != 0) { MUTEX_UNLOCK(dbenv, &renv->mutex); return (ret); } infop->rp = rp; infop->type = rp->type; infop->id = rp->id; /* If we're creating the region, set the desired size. */ if (F_ISSET(infop, REGION_CREATE)) rp->size = (roff_t)size; /* Join/create the underlying region. */ (void)snprintf(buf, sizeof(buf), DB_REGION_FMT, infop->id); if ((ret = __db_appname(dbenv, DB_APP_NONE, buf, 0, NULL, &infop->name)) != 0) goto err; if ((ret = __os_r_attach(dbenv, infop, rp)) != 0) goto err; /* * Fault the pages into memory. Note, do this BEFORE we initialize * anything because we're writing pages in created regions, not just * reading them. */ (void)__db_faultmem(dbenv, infop->addr, rp->size, F_ISSET(infop, REGION_CREATE)); /* * !!! * The underlying layer may have just decided that we are going * to create the region. There are various system issues that * can result in a useless region that requires re-initialization. * * If we created the region, initialize it for allocation. */ if (F_ISSET(infop, REGION_CREATE)) (void)__db_shalloc_init(infop->addr, rp->size); /* * If the underlying REGION isn't the environment, acquire a lock * for it and release our lock on the environment. */ if (infop->type != REGION_TYPE_ENV) { MUTEX_LOCK(dbenv, &rp->mutex); MUTEX_UNLOCK(dbenv, &renv->mutex); } return (0); /* Discard the underlying region. 
*/ err: if (infop->addr != NULL) (void)__os_r_detach(dbenv, infop, F_ISSET(infop, REGION_CREATE)); infop->rp = NULL; infop->id = INVALID_REGION_ID; /* Discard the REGION structure if we created it. */ if (F_ISSET(infop, REGION_CREATE)) { (void)__db_des_destroy(dbenv, rp, 1); F_CLR(infop, REGION_CREATE); } /* Release the environment lock. */ MUTEX_UNLOCK(dbenv, &renv->mutex); return (ret); } /* * __db_r_detach -- * Detach from a region. * * PUBLIC: int __db_r_detach __P((DB_ENV *, REGINFO *, int)); */ int __db_r_detach(dbenv, infop, destroy) DB_ENV *dbenv; REGINFO *infop; int destroy; { REGENV *renv; REGION *rp; int ret, t_ret; renv = ((REGINFO *)dbenv->reginfo)->primary; rp = infop->rp; if (F_ISSET(dbenv, DB_ENV_PRIVATE)) destroy = 1; /* Lock the environment. */ MUTEX_LOCK(dbenv, &renv->mutex); /* Acquire the lock for the REGION. */ MUTEX_LOCK(dbenv, &rp->mutex); /* * We need to call destroy on per-subsystem info before * we free the memory associated with the region. */ if (destroy) __db_region_destroy(dbenv, infop); /* Detach from the underlying OS region. */ ret = __os_r_detach(dbenv, infop, destroy); /* Release the REGION lock. */ MUTEX_UNLOCK(dbenv, &rp->mutex); /* * If we destroyed the region, discard the REGION structure. The only * time this routine is called with the destroy flag set is when the * environment is being removed, and it's likely that the only reason * the environment is being removed is because we crashed. Don't do * any unnecessary shared memory manipulation. */ if (destroy && ((t_ret = __db_des_destroy(dbenv, rp, 0)) != 0) && ret == 0) ret = t_ret; /* Release the environment lock. */ MUTEX_UNLOCK(dbenv, &renv->mutex); /* Destroy the structure. */ if (infop->name != NULL) __os_free(dbenv, infop->name); return (ret); } /* * __db_des_get -- * Return a reference to the shared information for a REGION, * optionally creating a new entry. 
*/ static int __db_des_get(dbenv, env_infop, infop, rpp) DB_ENV *dbenv; REGINFO *env_infop, *infop; REGION **rpp; { REGENV *renv; REGION *rp, *first_type; u_int32_t maxid; int ret; /* * !!! * Called with the environment already locked. */ *rpp = NULL; renv = env_infop->primary; /* * If the caller wants to join a region, walk through the existing * regions looking for a matching ID (if ID specified) or matching * type (if type specified). If we return based on a matching type * return the "primary" region, that is, the first region that was * created of this type. * * Track the maximum region ID so we can allocate a new region, * note that we have to start at 1 because the primary environment * uses ID == 1. */ maxid = REGION_ID_ENV; for (first_type = NULL, rp = SH_LIST_FIRST(&renv->regionq, __db_region); rp != NULL; rp = SH_LIST_NEXT(rp, q, __db_region)) { if (infop->id != INVALID_REGION_ID) { if (infop->id == rp->id) break; continue; } if (infop->type == rp->type && F_ISSET(infop, REGION_JOIN_OK) && (first_type == NULL || first_type->id > rp->id)) first_type = rp; if (rp->id > maxid) maxid = rp->id; } if (rp == NULL) rp = first_type; /* * If we didn't find a region and we can't create the region, fail. * The caller generates any error message. */ if (rp == NULL && !F_ISSET(infop, REGION_CREATE_OK)) return (ENOENT); /* * If we didn't find a region, create and initialize a REGION structure * for the caller. If id was set, use that value, otherwise we use the * next available ID. */ if (rp == NULL) { if ((ret = __db_shalloc(env_infop->addr, sizeof(REGION), MUTEX_ALIGN, &rp)) != 0) return (ret); /* Initialize the region. */ memset(rp, 0, sizeof(*rp)); if ((ret = __db_mutex_setup(dbenv, env_infop, &rp->mutex, MUTEX_NO_RECORD | MUTEX_NO_RLOCK)) != 0) { __db_shalloc_free(env_infop->addr, rp); return (ret); } rp->segid = INVALID_REGION_SEGID; /* * Set the type and ID; if no region ID was specified, * allocate one. 
*/ rp->type = infop->type; rp->id = infop->id == INVALID_REGION_ID ? maxid + 1 : infop->id; SH_LIST_INSERT_HEAD(&renv->regionq, rp, q, __db_region); F_SET(infop, REGION_CREATE); } *rpp = rp; return (0); } /* * __db_des_destroy -- * Destroy a reference to a REGION. */ static int __db_des_destroy(dbenv, rp, shmem_safe) DB_ENV *dbenv; REGION *rp; int shmem_safe; { REGINFO *infop; /* * !!! * Called with the environment already locked. */ infop = dbenv->reginfo; /* * If we're calling during recovery, it may not be safe to access the * shared memory, as the shared memory may have been corrupted during * the crash. If the shared memory is safe, remove the REGION entry * from its linked list, destroy the mutex, and free the allocated * memory. On systems that require system mutex support, we don't * have a choice -- safe or not, we have to destroy the mutex or we'll * leak memory. */ if (shmem_safe) SH_LIST_REMOVE(rp, q, __db_region); __db_mutex_destroy(&rp->mutex); if (shmem_safe) __db_shalloc_free(infop->addr, rp); return (0); } /* * __db_faultmem -- * Fault the region into memory. */ static int __db_faultmem(dbenv, addr, size, created) DB_ENV *dbenv; void *addr; size_t size; int created; { int ret; u_int8_t *p, *t; /* * It's sometimes significantly faster to page-fault in all of the * region's pages before we run the application, as we see nasty * side-effects when we page-fault while holding various locks, i.e., * the lock takes a long time to acquire because of the underlying * page fault, and the other threads convoy behind the lock holder. * * If we created the region, we write a non-zero value so that the * system can't cheat. If we're just joining the region, we can * only read the value and try to confuse the compiler sufficiently * that it doesn't figure out that we're never really using it. 
*/ ret = 0; if (F_ISSET(dbenv, DB_ENV_REGION_INIT)) { if (created) for (p = addr, t = (u_int8_t *)addr + size; p < t; p += OS_VMPAGESIZE) p[0] = 0xdb; else for (p = addr, t = (u_int8_t *)addr + size; p < t; p += OS_VMPAGESIZE) ret |= p[0]; } return (ret); } /* * __db_region_destroy -- * Destroy per-subsystem region information. * Called with the region already locked. */ static void __db_region_destroy(dbenv, infop) DB_ENV *dbenv; REGINFO *infop; { switch (infop->type) { case REGION_TYPE_LOCK: __lock_region_destroy(dbenv, infop); break; case REGION_TYPE_LOG: __log_region_destroy(dbenv, infop); break; case REGION_TYPE_MPOOL: __mpool_region_destroy(dbenv, infop); break; case REGION_TYPE_TXN: __txn_region_destroy(dbenv, infop); break; case REGION_TYPE_ENV: case REGION_TYPE_MUTEX: break; default: DB_ASSERT(0); break; } }