/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
 *
 * $Id: mp_sync.c,v 12.59 2008/01/17 13:59:12 bostic Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/log.h"
#include "dbinc/mp.h"
#include "dbinc/db_page.h"
#include "dbinc/hash.h"

typedef struct {
	DB_MPOOL_HASH *track_hp;	/* Hash bucket. */
	roff_t	  track_off;		/* Page file offset. */
	db_pgno_t track_pgno;		/* Page number. */
} BH_TRACK;

static int __bhcmp __P((const void *, const void *));
static int __memp_close_flush_files __P((ENV *, int));
static int __memp_sync_files __P((ENV *));
static int __memp_sync_file
    __P((ENV *, MPOOLFILE *, void *, u_int32_t *, u_int32_t));

/*
 * __memp_walk_files --
 * PUBLIC: int __memp_walk_files __P((ENV *, MPOOL *,
 * PUBLIC:	int (*) __P((ENV *, MPOOLFILE *, void *,
 * PUBLIC:	u_int32_t *, u_int32_t)), void *, u_int32_t *, u_int32_t));
 */
int
__memp_walk_files(env, mp, func, arg, countp, flags)
	ENV *env;
	MPOOL *mp;
	int (*func)__P((ENV *, MPOOLFILE *, void *, u_int32_t *, u_int32_t));
	void *arg;
	u_int32_t *countp;
	u_int32_t flags;
{
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOLFILE *mfp;
	int i, ret, t_ret;

	dbmp = env->mp_handle;
	ret = 0;

	hp = R_ADDR(dbmp->reginfo, mp->ftab);
	for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
		MUTEX_LOCK(env, hp->mtx_hash);
		SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
			if ((t_ret = func(env,
			    mfp, arg, countp, flags)) != 0 && ret == 0)
				ret = t_ret;
			if (ret != 0 && !LF_ISSET(DB_STAT_MEMP_NOERROR))
				break;
		}
		MUTEX_UNLOCK(env, hp->mtx_hash);
		if (ret != 0 && !LF_ISSET(DB_STAT_MEMP_NOERROR))
			break;
	}
	return (ret);
}

/*
 * __memp_sync_pp --
 *	ENV->memp_sync pre/post processing.
 *
 * PUBLIC: int __memp_sync_pp __P((DB_ENV *, DB_LSN *));
 */
int
__memp_sync_pp(dbenv, lsnp)
	DB_ENV *dbenv;
	DB_LSN *lsnp;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int ret;

	env = dbenv->env;

	ENV_REQUIRES_CONFIG(env,
	    env->mp_handle, "memp_sync", DB_INIT_MPOOL);

	/*
	 * If no LSN is provided, flush the entire cache (reasonable usage
	 * even if there's no log subsystem configured).
	 */
	if (lsnp != NULL)
		ENV_REQUIRES_CONFIG(env,
		    env->lg_handle, "memp_sync", DB_INIT_LOG);

	ENV_ENTER(env, ip);
	REPLICATION_WRAP(env, (__memp_sync(env, DB_SYNC_CACHE, lsnp)), 0, ret);
	ENV_LEAVE(env, ip);
	return (ret);
}

/*
 * __memp_sync --
 *	ENV->memp_sync.
 *
 * PUBLIC: int __memp_sync __P((ENV *, u_int32_t, DB_LSN *));
 */
int
__memp_sync(env, flags, lsnp)
	ENV *env;
	u_int32_t flags;
	DB_LSN *lsnp;
{
	DB_MPOOL *dbmp;
	MPOOL *mp;
	int interrupted, ret;

	dbmp = env->mp_handle;
	mp = dbmp->reginfo[0].primary;

	/* If we've flushed to the requested LSN, return that information. */
	if (lsnp != NULL) {
		MPOOL_SYSTEM_LOCK(env);
		if (LOG_COMPARE(lsnp, &mp->lsn) <= 0) {
			*lsnp = mp->lsn;

			MPOOL_SYSTEM_UNLOCK(env);
			return (0);
		}
		MPOOL_SYSTEM_UNLOCK(env);
	}

	if ((ret =
	    __memp_sync_int(env, NULL, 0, flags, NULL, &interrupted)) != 0)
		return (ret);

	if (!interrupted && lsnp != NULL) {
		MPOOL_SYSTEM_LOCK(env);
		if (LOG_COMPARE(lsnp, &mp->lsn) > 0)
			mp->lsn = *lsnp;
		MPOOL_SYSTEM_UNLOCK(env);
	}

	return (0);
}
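/*
 * Illustrative sketch, not part of the original file: how an application
 * reaches __memp_sync_pp/__memp_sync above through the public API.  A
 * NULL LSN asks for the entire cache to be flushed.  The function name
 * and the hypothetical MEMP_SYNC_EXAMPLE guard (which keeps this out of
 * real builds) are assumptions, as is the error handling.
 */
#ifdef MEMP_SYNC_EXAMPLE
static int
flush_whole_cache(dbenv)
	DB_ENV *dbenv;		/* Open environment with DB_INIT_MPOOL. */
{
	int ret;

	/* NULL LSN: write every dirty page, not just up to an LSN. */
	if ((ret = dbenv->memp_sync(dbenv, NULL)) != 0)
		dbenv->err(dbenv, ret, "DB_ENV->memp_sync");
	return (ret);
}
#endif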
/*
 * __memp_fsync_pp --
 *	DB_MPOOLFILE->sync pre/post processing.
 *
 * PUBLIC: int __memp_fsync_pp __P((DB_MPOOLFILE *));
 */
int
__memp_fsync_pp(dbmfp)
	DB_MPOOLFILE *dbmfp;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int ret;

	env = dbmfp->env;

	MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->sync");

	ENV_ENTER(env, ip);
	REPLICATION_WRAP(env, (__memp_fsync(dbmfp)), 0, ret);
	ENV_LEAVE(env, ip);
	return (ret);
}

/*
 * __memp_fsync --
 *	DB_MPOOLFILE->sync.
 *
 * PUBLIC: int __memp_fsync __P((DB_MPOOLFILE *));
 */
int
__memp_fsync(dbmfp)
	DB_MPOOLFILE *dbmfp;
{
	MPOOLFILE *mfp;

	mfp = dbmfp->mfp;

	/*
	 * If this handle doesn't have a file descriptor that's open for
	 * writing, or if the file is a temporary, or if the file hasn't
	 * been written since it was flushed, there's no reason to proceed
	 * further.
	 */
	if (F_ISSET(dbmfp, MP_READONLY))
		return (0);

	if (F_ISSET(mfp, MP_TEMP) || mfp->no_backing_file)
		return (0);

	if (mfp->file_written == 0)
		return (0);

	return (__memp_sync_int(
	    dbmfp->env, dbmfp, 0, DB_SYNC_FILE, NULL, NULL));
}

/*
 * __mp_xxx_fh --
 *	Return a file descriptor for DB 1.85 compatibility locking.
 *
 * PUBLIC: int __mp_xxx_fh __P((DB_MPOOLFILE *, DB_FH **));
 */
int
__mp_xxx_fh(dbmfp, fhp)
	DB_MPOOLFILE *dbmfp;
	DB_FH **fhp;
{
	int ret;

	/*
	 * This is a truly spectacular layering violation, intended ONLY to
	 * support compatibility for the DB 1.85 DB->fd call.
	 *
	 * Sync the database file to disk, creating the file as necessary.
	 *
	 * We skip the MP_READONLY and MP_TEMP tests done by memp_fsync(3).
	 * The MP_READONLY test isn't interesting because we will either
	 * already have a file descriptor (we opened the database file for
	 * reading) or we aren't readonly (we created the database which
	 * requires write privileges).  The MP_TEMP test isn't interesting
	 * because we want to write to the backing file regardless so that
	 * we get a file descriptor to return.
	 */
	if ((*fhp = dbmfp->fhp) != NULL)
		return (0);

	if ((ret = __memp_sync_int(
	    dbmfp->env, dbmfp, 0, DB_SYNC_FILE, NULL, NULL)) == 0)
		*fhp = dbmfp->fhp;
	return (ret);
}
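/*
 * Illustrative sketch, not part of the original file: flushing a single
 * file through the public DB_MPOOLFILE->sync method, which resolves to
 * __memp_fsync_pp above.  The function name and the MEMP_SYNC_EXAMPLE
 * guard are assumptions.
 */
#ifdef MEMP_SYNC_EXAMPLE
static int
flush_one_file(mpf)
	DB_MPOOLFILE *mpf;	/* An open DB_MPOOLFILE handle. */
{
	/*
	 * A no-op for read-only handles, temporary files and files
	 * never written since the last flush, per __memp_fsync.
	 */
	return (mpf->sync(mpf));
}
#endif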
/*
 * __memp_sync_int --
 *	Mpool sync internal function.
 *
 * PUBLIC: int __memp_sync_int __P((ENV *,
 * PUBLIC:     DB_MPOOLFILE *, u_int32_t, u_int32_t, u_int32_t *, int *));
 */
int
__memp_sync_int(env, dbmfp, trickle_max, flags, wrote_totalp, interruptedp)
	ENV *env;
	DB_MPOOLFILE *dbmfp;
	u_int32_t trickle_max, flags, *wrote_totalp;
	int *interruptedp;
{
	BH *bhp;
	BH_TRACK *bharray;
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOL *c_mp, *mp;
	MPOOLFILE *mfp;
	db_mutex_t mutex;
	roff_t last_mf_offset;
	u_int32_t ar_cnt, ar_max, dirty, i, n_cache, remaining, wrote_total;
	int filecnt, maxopenfd, pass, required_write, ret, t_ret;
	int wait_cnt, wrote_cnt;

	dbmp = env->mp_handle;
	mp = dbmp->reginfo[0].primary;
	last_mf_offset = INVALID_ROFF;
	filecnt = pass = wrote_total = 0;

	if (wrote_totalp != NULL)
		*wrote_totalp = 0;
	if (interruptedp != NULL)
		*interruptedp = 0;

	/*
	 * If we're flushing the cache, it's a checkpoint or we're flushing a
	 * specific file, we really have to write the blocks and we have to
	 * confirm they made it to disk.  Otherwise, we can skip a block if
	 * it's hard to get.
	 */
	required_write = LF_ISSET(DB_SYNC_CACHE |
	    DB_SYNC_CHECKPOINT | DB_SYNC_FILE | DB_SYNC_QUEUE_EXTENT);

	/* Get shared configuration information. */
	MPOOL_SYSTEM_LOCK(env);
	maxopenfd = mp->mp_maxopenfd;
	MPOOL_SYSTEM_UNLOCK(env);

	/* Assume one dirty page per bucket. */
	ar_max = mp->nreg * mp->htab_buckets;
	if ((ret =
	    __os_malloc(env, ar_max * sizeof(BH_TRACK), &bharray)) != 0)
		return (ret);

	/*
	 * Walk each cache's list of buffers and mark all dirty buffers to be
	 * written, or potentially written, depending on our flags.
	 */
	for (ar_cnt = 0, n_cache = 0; n_cache < mp->nreg; ++n_cache) {
		c_mp = dbmp->reginfo[n_cache].primary;

		hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
		for (i = 0; i < c_mp->htab_buckets; i++, hp++) {
			/*
			 * We can check for empty buckets before locking as
			 * we only care if the pointer is zero or non-zero.
			 * We can ignore empty or clean buckets because we
			 * only need to write buffers that were dirty before
			 * we started.
			 */
#ifdef DIAGNOSTIC
			if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
#else
			if (hp->hash_page_dirty == 0)
#endif
				continue;

			dirty = 0;
			MUTEX_LOCK(env, hp->mtx_hash);
			SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
				/* Always ignore clean pages. */
				if (!F_ISSET(bhp, BH_DIRTY))
					continue;

				dirty++;
				mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);

				/*
				 * Ignore in-memory files, unless the file is
				 * specifically being flushed.
				 */
				if (mfp->no_backing_file)
					continue;
				if (!LF_ISSET(DB_SYNC_FILE) &&
				    F_ISSET(mfp, MP_TEMP))
					continue;

				/*
				 * Ignore files that aren't involved in DB's
				 * transactional operations during checkpoints.
				 */
				if (LF_ISSET(DB_SYNC_CHECKPOINT) &&
				    mfp->lsn_off == DB_LSN_OFF_NOTSET)
					continue;

				/*
				 * Ignore files that aren't Queue extent files
				 * if we're flushing a Queue file with extents.
				 */
				if (LF_ISSET(DB_SYNC_QUEUE_EXTENT) &&
				    !F_ISSET(mfp, MP_EXTENT))
					continue;

				/*
				 * If we're flushing a specific file, see if
				 * this page is from that file.
				 */
				if (dbmfp != NULL && mfp != dbmfp->mfp)
					continue;

				/* Track the buffer, we want it. */
				bharray[ar_cnt].track_hp = hp;
				bharray[ar_cnt].track_pgno = bhp->pgno;
				bharray[ar_cnt].track_off = bhp->mf_offset;
				ar_cnt++;

				/*
				 * If we run out of space, double and continue.
				 * Don't stop at trickle_max, we want to sort
				 * as large a sample set as possible in order
				 * to minimize disk seeks.
				 */
				if (ar_cnt >= ar_max) {
					if ((ret = __os_realloc(env,
					    (ar_max * 2) * sizeof(BH_TRACK),
					    &bharray)) != 0)
						break;
					ar_max *= 2;
				}
			}

			DB_ASSERT(env, dirty == hp->hash_page_dirty);
			if (dirty != hp->hash_page_dirty) {
				__db_errx(env,
				    "memp_sync: correcting dirty count %lu %lu",
				    (u_long)hp->hash_page_dirty, (u_long)dirty);
				hp->hash_page_dirty = dirty;
			}
			MUTEX_UNLOCK(env, hp->mtx_hash);

			if (ret != 0)
				goto err;

			/* Check if the call has been interrupted. */
			if (LF_ISSET(DB_SYNC_INTERRUPT_OK) && FLD_ISSET(
			    mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) {
				if (interruptedp != NULL)
					*interruptedp = 1;
				goto err;
			}
		}
	}

	/* If there are no buffers to write, we're done. */
	if (ar_cnt == 0)
		goto done;

	/*
	 * Write the buffers in file/page order, trying to reduce seeks by the
	 * filesystem and, when pages are smaller than filesystem block sizes,
	 * reduce the actual number of writes.
	 */
	if (ar_cnt > 1)
		qsort(bharray, ar_cnt, sizeof(BH_TRACK), __bhcmp);

	/*
	 * If we're trickling buffers, only write enough to reach the correct
	 * percentage.
	 */
	if (LF_ISSET(DB_SYNC_TRICKLE) && ar_cnt > trickle_max)
		ar_cnt = trickle_max;

	/*
	 * Flush the log.  We have to ensure the log records reflecting the
	 * changes on the database pages we're writing have already made it
	 * to disk.  We still have to check the log each time we write a page
	 * (because pages we are about to write may be modified after we have
	 * flushed the log), but in general this will at least avoid any I/O
	 * on the log's part.
	 */
	if (LOGGING_ON(env) && (ret = __log_flush(env, NULL)) != 0)
		goto err;

	/*
	 * Walk the array, writing buffers.  When we write a buffer, we NULL
	 * out its hash bucket pointer so we don't process a slot more than
	 * once.
	 */
	for (i = pass = wrote_cnt = 0, remaining = ar_cnt; remaining > 0; ++i) {
		if (i >= ar_cnt) {
			i = 0;
			++pass;
			__os_yield(env, 1, 0);
		}
		if ((hp = bharray[i].track_hp) == NULL)
			continue;

		/* Lock the hash bucket and find the buffer. */
		mutex = hp->mtx_hash;
		MUTEX_LOCK(env, mutex);
		SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh)
			if (bhp->pgno == bharray[i].track_pgno &&
			    bhp->mf_offset == bharray[i].track_off)
				break;

		/*
		 * If we can't find the buffer we're done: somebody else must
		 * have written it.
		 *
		 * If the buffer isn't dirty, we're done, there's no work
		 * needed.
		 */
		if (bhp == NULL || !F_ISSET(bhp, BH_DIRTY)) {
			MUTEX_UNLOCK(env, mutex);
			--remaining;
			bharray[i].track_hp = NULL;
			continue;
		}

		/*
		 * If the buffer is locked by another thread, ignore it, we'll
		 * come back to it.
		 *
		 * If the buffer is pinned and it's only the first or second
		 * time we have looked at it, ignore it, we'll come back to
		 * it.
		 *
		 * In either case, skip the buffer if we're not required to
		 * write it.
		 */
		if (F_ISSET(bhp, BH_LOCKED) || (bhp->ref != 0 && pass < 2)) {
			MUTEX_UNLOCK(env, mutex);
			if (!required_write) {
				--remaining;
				bharray[i].track_hp = NULL;
			}
			continue;
		}

		/* Pin the buffer into memory and lock it. */
		++bhp->ref;
		F_SET(bhp, BH_LOCKED);

		/*
		 * If the buffer is referenced by another thread, set the sync
		 * wait-for count (used to count down outstanding references to
		 * this buffer as they are returned to the cache), then unlock
		 * the hash bucket and wait for the count to go to 0.  No other
		 * thread can acquire the buffer because we have it locked.
		 *
		 * If a thread attempts to re-pin a page, the wait-for count
		 * will never go to 0 (that thread spins on our buffer lock,
		 * while we spin on the thread's ref count).  Give up if we
		 * don't get the buffer in 3 seconds, we'll try again later.
		 *
		 * If, when the wait-for count goes to 0, the buffer is found
		 * to be dirty, write it.
		 */
		bhp->ref_sync = bhp->ref - 1;
		if (bhp->ref_sync != 0) {
			MUTEX_UNLOCK(env, mutex);
			for (wait_cnt = 1;
			    bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt)
				__os_yield(env, 1, 0);
			MUTEX_LOCK(env, mutex);
		}

		/*
		 * If we've switched files, check to see if we're configured
		 * to close file descriptors.
		 */
		if (maxopenfd != 0 && bhp->mf_offset != last_mf_offset) {
			if (++filecnt >= maxopenfd) {
				filecnt = 0;
				if ((t_ret = __memp_close_flush_files(
				    env, 1)) != 0 && ret == 0)
					ret = t_ret;
			}
			last_mf_offset = bhp->mf_offset;
		}

		/*
		 * If the ref_sync count has gone to 0, we're going to be done
		 * with this buffer no matter what happens.
		 */
		if (bhp->ref_sync == 0) {
			--remaining;
			bharray[i].track_hp = NULL;
		}

		/*
		 * If the ref_sync count has gone to 0 and the buffer is still
		 * dirty, we write it.  We only try to write the buffer once.
		 */
		if (bhp->ref_sync == 0 && F_ISSET(bhp, BH_DIRTY)) {
			mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
			if ((t_ret =
			    __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0) {
				++wrote_cnt;
				++wrote_total;
			} else {
				if (ret == 0)
					ret = t_ret;
				__db_errx(env, "%s: unable to flush page: %lu",
				    __memp_fns(dbmp, mfp), (u_long)bhp->pgno);
			}
		}

		/*
		 * If ref_sync count never went to 0, the buffer was written
		 * by another thread, or the write failed, we still have the
		 * buffer locked.
		 */
		if (F_ISSET(bhp, BH_LOCKED))
			F_CLR(bhp, BH_LOCKED);

		/*
		 * Reset the ref_sync count regardless of our success, we're
		 * done with this buffer for now.
		 */
		bhp->ref_sync = 0;

		/* Discard our buffer reference. */
		--bhp->ref;

		/*
		 * If a thread of control is waiting in this hash bucket, wake
		 * it up.
		 */
		if (F_ISSET(hp, IO_WAITER)) {
			F_CLR(hp, IO_WAITER);
			MUTEX_UNLOCK(env, hp->mtx_io);
		}

		/* Release the hash bucket mutex. */
		MUTEX_UNLOCK(env, mutex);

		/* Check if the call has been interrupted. */
		if (LF_ISSET(DB_SYNC_INTERRUPT_OK) &&
		    FLD_ISSET(mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) {
			if (interruptedp != NULL)
				*interruptedp = 1;
			goto err;
		}

		/*
		 * Sleep after some number of writes to avoid disk saturation.
		 * Don't cache the max writes value, an application shutting
		 * down might reset the value in order to do a fast flush or
		 * checkpoint.
		 */
		if (!LF_ISSET(DB_SYNC_SUPPRESS_WRITE) &&
		    !FLD_ISSET(mp->config_flags, DB_MEMP_SUPPRESS_WRITE) &&
		    mp->mp_maxwrite != 0 && wrote_cnt >= mp->mp_maxwrite) {
			wrote_cnt = 0;
			__os_yield(env, 0, (u_long)mp->mp_maxwrite_sleep);
		}
	}

done:	/*
	 * If a write is required, we have to force the pages to disk.  We
	 * don't do this as we go along because we want to give the OS as
	 * much time as possible to lazily flush, and because we have to
	 * flush files that might not even have had dirty buffers in the
	 * cache, so we have to walk the files list.
	 */
	if (ret == 0 && required_write) {
		if (dbmfp == NULL)
			ret = __memp_sync_files(env);
		else
			ret = __os_fsync(env, dbmfp->fhp);
	}

	/* If we've opened files to flush pages, close them. */
	if ((t_ret = __memp_close_flush_files(env, 0)) != 0 && ret == 0)
		ret = t_ret;

err:	__os_free(env, bharray);
	if (wrote_totalp != NULL)
		*wrote_totalp = wrote_total;

	return (ret);
}
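/*
 * Illustrative sketch, not part of the original file: the DB_SYNC_TRICKLE
 * path through __memp_sync_int is normally reached via the public
 * DB_ENV->memp_trickle method, which derives trickle_max from a requested
 * percentage of clean pages.  The function name, the 20% figure and the
 * MEMP_SYNC_EXAMPLE guard are assumptions.
 */
#ifdef MEMP_SYNC_EXAMPLE
static int
trickle_some_pages(dbenv)
	DB_ENV *dbenv;
{
	int nwrote, ret;

	/* Ask that at least 20% of the cache be clean pages. */
	if ((ret = dbenv->memp_trickle(dbenv, 20, &nwrote)) != 0)
		return (ret);
	/* nwrote now holds the number of buffers written to get there. */
	return (0);
}
#endif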
static int
__memp_sync_file(env, mfp, argp, countp, flags)
	ENV *env;
	MPOOLFILE *mfp;
	void *argp;
	u_int32_t *countp;
	u_int32_t flags;
{
	DB_MPOOL *dbmp;
	DB_MPOOLFILE *dbmfp;
	int ret, t_ret;

	COMPQUIET(countp, NULL);
	COMPQUIET(flags, 0);

	if (!mfp->file_written || mfp->no_backing_file ||
	    mfp->deadfile || F_ISSET(mfp, MP_TEMP))
		return (0);
	/*
	 * Pin the MPOOLFILE structure into memory, and release the
	 * region mutex allowing us to walk the linked list.  We'll
	 * re-acquire that mutex to move to the next entry in the list.
	 *
	 * This works because we only need to flush current entries,
	 * we don't care about new entries being added, and the linked
	 * list is never re-ordered, a single pass is sufficient.  It
	 * requires MPOOLFILE structures removed before we get to them
	 * be flushed to disk, but that's nothing new, they could have
	 * been removed while checkpoint was running, too.
	 *
	 * Once we have the MPOOLFILE lock, re-check the MPOOLFILE is
	 * not being discarded.  (A thread removing the MPOOLFILE
	 * will: hold the MPOOLFILE mutex, set deadfile, drop the
	 * MPOOLFILE mutex and then acquire the region MUTEX to walk
	 * the linked list and remove the MPOOLFILE structure.)  Make
	 * sure the MPOOLFILE wasn't marked dead while we waited for
	 * the mutex.
	 */
	MUTEX_LOCK(env, mfp->mutex);
	if (!mfp->file_written || mfp->deadfile) {
		MUTEX_UNLOCK(env, mfp->mutex);
		return (0);
	}
	++mfp->mpf_cnt;
	MUTEX_UNLOCK(env, mfp->mutex);

	/*
	 * Look for an already open, writeable handle (fsync doesn't
	 * work on read-only Windows handles).
	 */
	dbmp = env->mp_handle;
	MUTEX_LOCK(env, dbmp->mutex);
	TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q) {
		if (dbmfp->mfp != mfp || F_ISSET(dbmfp, MP_READONLY))
			continue;
		/*
		 * We don't want to hold the mutex while calling sync.
		 * Increment the DB_MPOOLFILE handle ref count to pin
		 * it into memory.
		 */
		++dbmfp->ref;
		break;
	}
	MUTEX_UNLOCK(env, dbmp->mutex);

	/* If we don't find a handle we can use, open one. */
	if (dbmfp == NULL) {
		if ((ret = __memp_mf_sync(dbmp, mfp, 1)) != 0) {
			__db_err(env, ret, "%s: unable to flush",
			    (char *)R_ADDR(dbmp->reginfo, mfp->path_off));
		}
	} else
		ret = __os_fsync(env, dbmfp->fhp);

	/*
	 * Re-acquire the MPOOLFILE mutex, we need it to modify the
	 * reference count.
	 */
	MUTEX_LOCK(env, mfp->mutex);

	/*
	 * If we wrote the file and there are no other references (or there
	 * is a single reference, and it's the one we opened to write
	 * buffers during checkpoint), clear the file_written flag.  We
	 * do this so that applications opening thousands of files don't
	 * loop here opening and flushing those files during checkpoint.
	 *
	 * The danger here is if a buffer were to be written as part of
	 * a checkpoint, and then not be flushed to disk.  This cannot
	 * happen because we only clear file_written when there are no
	 * other users of the MPOOLFILE in the system, and, as we hold
	 * the region lock, no possibility of another thread of control
	 * racing with us to open a MPOOLFILE.
	 */
	if (mfp->mpf_cnt == 1 || (mfp->mpf_cnt == 2 &&
	    dbmfp != NULL && F_ISSET(dbmfp, MP_FLUSH))) {
		mfp->file_written = 0;

		/*
		 * We may be the last reference for a MPOOLFILE, as we
		 * weren't holding the MPOOLFILE mutex when flushing
		 * its buffers to disk.  If we can discard it, set
		 * a flag to schedule a clean-out pass.  (Not likely,
		 * I mean, what are the chances that there aren't any
		 * buffers in the pool?  Regardless, it might happen.)
		 */
		if (mfp->mpf_cnt == 1 && mfp->block_cnt == 0)
			*(int *)argp = 1;
	}

	/*
	 * If we found the file we must close it in case we are the last
	 * reference to the dbmfp.  NOTE: since we have incremented
	 * mfp->mpf_cnt this cannot be the last reference to the mfp.
	 * This is important since we are called with the hash bucket
	 * locked.  The mfp will get freed via the cleanup pass.
	 */
	if (dbmfp != NULL &&
	    (t_ret = __memp_fclose(dbmfp, DB_MPOOL_NOLOCK)) != 0 && ret == 0)
		ret = t_ret;

	--mfp->mpf_cnt;

	/* Unlock the MPOOLFILE. */
	MUTEX_UNLOCK(env, mfp->mutex);
	return (ret);
}
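/*
 * Illustrative sketch, not part of the original file: __memp_sync_file
 * above is a callback matching the __memp_walk_files contract.  Here is
 * a minimal callback of the same shape that counts files with backing
 * store; the name and the MEMP_SYNC_EXAMPLE guard are assumptions.
 */
#ifdef MEMP_SYNC_EXAMPLE
static int
count_backing_files(env, mfp, argp, countp, flags)
	ENV *env;
	MPOOLFILE *mfp;
	void *argp;
	u_int32_t *countp;
	u_int32_t flags;
{
	COMPQUIET(env, NULL);
	COMPQUIET(argp, NULL);
	COMPQUIET(flags, 0);

	/* Called with the file bucket's mutex held, so keep it cheap. */
	if (!mfp->no_backing_file)
		++*countp;
	return (0);
}
#endif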
/*
 * __memp_sync_files --
 *	Sync all the files in the environment, open or not.
 */
static int
__memp_sync_files(env)
	ENV *env;
{
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOL *mp;
	MPOOLFILE *mfp, *next_mfp;
	int i, need_discard_pass, ret;

	dbmp = env->mp_handle;
	mp = dbmp->reginfo[0].primary;
	need_discard_pass = ret = 0;

	ret = __memp_walk_files(env,
	    mp, __memp_sync_file, &need_discard_pass, 0, DB_STAT_MEMP_NOERROR);

	/*
	 * We may need to do a last pass through the MPOOLFILE list -- if we
	 * were the last reference to an MPOOLFILE, we need to clean it out.
	 */
	if (!need_discard_pass)
		return (ret);

	hp = R_ADDR(dbmp->reginfo, mp->ftab);
	for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
retry:		MUTEX_LOCK(env, hp->mtx_hash);
		for (mfp = SH_TAILQ_FIRST(&hp->hash_bucket,
		    __mpoolfile); mfp != NULL; mfp = next_mfp) {
			next_mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile);
			/*
			 * Do a fast check -- we can check for zero/non-zero
			 * without a mutex on the MPOOLFILE.  If likely to
			 * succeed, lock the MPOOLFILE down and look for real.
			 */
			if (mfp->deadfile ||
			    mfp->block_cnt != 0 || mfp->mpf_cnt != 0)
				continue;

			MUTEX_LOCK(env, mfp->mutex);
			if (!mfp->deadfile &&
			    mfp->block_cnt == 0 && mfp->mpf_cnt == 0) {
				MUTEX_UNLOCK(env, hp->mtx_hash);
				(void)__memp_mf_discard(dbmp, mfp);
				goto retry;
			} else
				MUTEX_UNLOCK(env, mfp->mutex);
		}
		MUTEX_UNLOCK(env, hp->mtx_hash);
	}
	return (ret);
}
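/*
 * Illustrative sketch, not part of the original file: checkpoints are the
 * usual way applications drive __memp_sync_files, through the public
 * DB_ENV->txn_checkpoint method (which syncs the cache as a checkpoint).
 * The 512KB/10-minute thresholds, the function name and the
 * MEMP_SYNC_EXAMPLE guard are assumptions.
 */
#ifdef MEMP_SYNC_EXAMPLE
static int
checkpoint_if_needed(dbenv)
	DB_ENV *dbenv;
{
	/*
	 * Checkpoint only if at least 512KB of log has been written or
	 * 10 minutes have gone by since the last checkpoint.
	 */
	return (dbenv->txn_checkpoint(dbenv, 512, 10, 0));
}
#endif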
/*
 * __memp_mf_sync --
 *	Flush an MPOOLFILE, when no currently open handle is available.
 *
 * PUBLIC: int __memp_mf_sync __P((DB_MPOOL *, MPOOLFILE *, int));
 */
int
__memp_mf_sync(dbmp, mfp, locked)
	DB_MPOOL *dbmp;
	MPOOLFILE *mfp;
	int locked;
{
	DB_FH *fhp;
	DB_MPOOL_HASH *hp;
	ENV *env;
	MPOOL *mp;
	int ret, t_ret;
	char *rpath;

	COMPQUIET(hp, NULL);
	env = dbmp->env;

	/*
	 * We need to be holding the hash lock: we're using the path name
	 * and __memp_nameop might try and rename the file.
	 */
	if (!locked) {
		mp = dbmp->reginfo[0].primary;
		hp = R_ADDR(dbmp->reginfo, mp->ftab);
		hp += FNBUCKET(
		    R_ADDR(dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN);
		MUTEX_LOCK(env, hp->mtx_hash);
	}

	if ((ret = __db_appname(env, DB_APP_DATA,
	    R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) == 0) {
		if ((ret = __os_open(env, rpath, 0, 0, 0, &fhp)) == 0) {
			ret = __os_fsync(env, fhp);
			if ((t_ret =
			    __os_closehandle(env, fhp)) != 0 && ret == 0)
				ret = t_ret;
		}
		__os_free(env, rpath);
	}

	if (!locked)
		MUTEX_UNLOCK(env, hp->mtx_hash);

	return (ret);
}

/*
 * __memp_close_flush_files --
 *	Close files opened only to flush buffers.
 */
static int
__memp_close_flush_files(env, dosync)
	ENV *env;
	int dosync;
{
	DB_MPOOL *dbmp;
	DB_MPOOLFILE *dbmfp;
	MPOOLFILE *mfp;
	int ret;

	dbmp = env->mp_handle;

	/*
	 * The routine exists because we must close files opened by sync to
	 * flush buffers.  There are two cases: first, extent files have to
	 * be closed so they may be removed when empty.  Second, regular
	 * files have to be closed so we don't run out of descriptors (for
	 * example, an application partitioning its data into databases
	 * based on timestamps, so there's a continually increasing set of
	 * files).
	 *
	 * We mark files opened in the __memp_bhwrite() function with the
	 * MP_FLUSH flag.  Here we walk through our file descriptor list,
	 * and, if a file was opened by __memp_bhwrite(), we close it.
	 */
retry:	MUTEX_LOCK(env, dbmp->mutex);
	TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q)
		if (F_ISSET(dbmfp, MP_FLUSH)) {
			F_CLR(dbmfp, MP_FLUSH);
			MUTEX_UNLOCK(env, dbmp->mutex);
			if (dosync) {
				/*
				 * If we have the only open handle on the
				 * file, clear the dirty flag so we don't
				 * re-open and sync it again when discarding
				 * the MPOOLFILE structure.  Clear the flag
				 * before the sync so we can't race with a
				 * thread writing the file.
				 */
				mfp = dbmfp->mfp;
				if (mfp->mpf_cnt == 1) {
					MUTEX_LOCK(env, mfp->mutex);
					if (mfp->mpf_cnt == 1)
						mfp->file_written = 0;
					MUTEX_UNLOCK(env, mfp->mutex);
				}
				if ((ret = __os_fsync(env, dbmfp->fhp)) != 0)
					return (ret);
			}
			if ((ret = __memp_fclose(dbmfp, 0)) != 0)
				return (ret);
			goto retry;
		}
	MUTEX_UNLOCK(env, dbmp->mutex);

	return (0);
}
static int
__bhcmp(p1, p2)
	const void *p1, *p2;
{
	BH_TRACK *bhp1, *bhp2;

	bhp1 = (BH_TRACK *)p1;
	bhp2 = (BH_TRACK *)p2;

	/* Sort by file (shared memory pool offset). */
	if (bhp1->track_off < bhp2->track_off)
		return (-1);
	if (bhp1->track_off > bhp2->track_off)
		return (1);

	/*
	 * !!!
	 * Defend against badly written quicksort code calling the comparison
	 * function with two identical pointers (e.g., WATCOM C++ (Power++)).
	 */
	if (bhp1->track_pgno < bhp2->track_pgno)
		return (-1);
	if (bhp1->track_pgno > bhp2->track_pgno)
		return (1);
	return (0);
}
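/*
 * Illustrative sketch, not part of the original file: how the qsort call
 * in __memp_sync_int orders BH_TRACK entries with __bhcmp -- by file
 * offset first, then by page number.  The sample values and the
 * MEMP_SYNC_EXAMPLE guard are assumptions.
 */
#ifdef MEMP_SYNC_EXAMPLE
static void
bhcmp_example()
{
	BH_TRACK a[3];

	a[0].track_hp = NULL; a[0].track_off = 200; a[0].track_pgno = 1;
	a[1].track_hp = NULL; a[1].track_off = 100; a[1].track_pgno = 7;
	a[2].track_hp = NULL; a[2].track_off = 100; a[2].track_pgno = 2;

	/* Sorted order: (100, 2), (100, 7), (200, 1). */
	qsort(a, 3, sizeof(BH_TRACK), __bhcmp);
}
#endif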