/*- * See the file LICENSE for redistribution information. * * Copyright (c) 1996,2008 Oracle. All rights reserved. * * $Id: mp_fget.c,v 12.53 2008/04/28 02:59:57 alexg Exp $ */ #include "db_config.h" #include "db_int.h" #include "dbinc/log.h" #include "dbinc/mp.h" #include "dbinc/txn.h" /* * __memp_fget_pp -- * DB_MPOOLFILE->get pre/post processing. * * PUBLIC: int __memp_fget_pp * PUBLIC: __P((DB_MPOOLFILE *, db_pgno_t *, DB_TXN *, u_int32_t, void *)); */ int __memp_fget_pp(dbmfp, pgnoaddr, txnp, flags, addrp) DB_MPOOLFILE *dbmfp; db_pgno_t *pgnoaddr; DB_TXN *txnp; u_int32_t flags; void *addrp; { DB_THREAD_INFO *ip; ENV *env; int rep_blocked, ret; env = dbmfp->env; MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->get"); /* * Validate arguments. * * !!! * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly * files here, and create non-existent pages in readonly files if the * flags are set, later. The reason is that the hash access method * wants to get empty pages that don't really exist in readonly files. * The only alternative is for hash to write the last "bucket" all the * time, which we don't want to do because one of our big goals in life * is to keep database files small. It's sleazy as hell, but we catch * any attempt to actually write the file in memp_fput(). */ #define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_DIRTY | \ DB_MPOOL_EDIT | DB_MPOOL_LAST | DB_MPOOL_NEW) if (flags != 0) { if ((ret = __db_fchk(env, "memp_fget", flags, OKFLAGS)) != 0) return (ret); switch (flags) { case DB_MPOOL_DIRTY: case DB_MPOOL_CREATE: case DB_MPOOL_EDIT: case DB_MPOOL_LAST: case DB_MPOOL_NEW: break; default: return (__db_ferr(env, "memp_fget", 1)); } } ENV_ENTER(env, ip); rep_blocked = 0; if (txnp == NULL && IS_ENV_REPLICATED(env)) { if ((ret = __op_rep_enter(env)) != 0) goto err; rep_blocked = 1; } ret = __memp_fget(dbmfp, pgnoaddr, ip, txnp, flags, addrp); /* * We only decrement the count in op_rep_exit if the operation fails. * Otherwise the count will be decremented when the page is no longer * pinned in memp_fput. */ if (ret != 0 && rep_blocked) (void)__op_rep_exit(env); /* Similarly if an app has a page pinned it is ACTIVE. */ err: if (ret != 0) ENV_LEAVE(env, ip); return (ret); } /* * __memp_fget -- * Get a page from the file. * * PUBLIC: int __memp_fget __P((DB_MPOOLFILE *, * PUBLIC: db_pgno_t *, DB_THREAD_INFO *, DB_TXN *, u_int32_t, void *)); */ int __memp_fget(dbmfp, pgnoaddr, ip, txn, flags, addrp) DB_MPOOLFILE *dbmfp; db_pgno_t *pgnoaddr; DB_THREAD_INFO *ip; DB_TXN *txn; u_int32_t flags; void *addrp; { enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state; BH *alloc_bhp, *bhp, *frozen_bhp, *oldest_bhp; ENV *env; DB_LSN *read_lsnp, vlsn; DB_MPOOL *dbmp; DB_MPOOL_HASH *hp; MPOOL *c_mp; MPOOLFILE *mfp; PIN_LIST *list, *lp; REGINFO *infop, *t_infop, *reginfo; TXN_DETAIL *td; roff_t list_off, mf_offset; u_int32_t pinmax, st_hsearch; int b_incr, b_locked, dirty, edit, extending, first; int makecopy, mvcc, need_free, ret; *(void **)addrp = NULL; COMPQUIET(c_mp, NULL); COMPQUIET(infop, NULL); COMPQUIET(oldest_bhp, NULL); env = dbmfp->env; dbmp = env->mp_handle; mfp = dbmfp->mfp; mvcc = mfp->multiversion; mf_offset = R_OFFSET(dbmp->reginfo, mfp); alloc_bhp = bhp = frozen_bhp = NULL; read_lsnp = NULL; td = NULL; hp = NULL; b_incr = b_locked = extending = makecopy = ret = 0; if (LF_ISSET(DB_MPOOL_DIRTY)) { if (F_ISSET(dbmfp, MP_READONLY)) { __db_errx(env, "%s: dirty flag set for readonly file page", __memp_fn(dbmfp)); return (EINVAL); } if ((ret = __db_fcchk(env, "DB_MPOOLFILE->get", flags, DB_MPOOL_DIRTY, DB_MPOOL_EDIT)) != 0) return (ret); } dirty = LF_ISSET(DB_MPOOL_DIRTY); edit = LF_ISSET(DB_MPOOL_EDIT); LF_CLR(DB_MPOOL_DIRTY | DB_MPOOL_EDIT); /* * If the transaction is being used to update a multiversion database * for the first time, set the read LSN. In addition, if this is an * update, allocate a mutex. If no transaction has been supplied, that * will be caught later, when we know whether one is required. */ if (mvcc && txn != NULL && txn->td != NULL) { /* We're only interested in the ultimate parent transaction. */ while (txn->parent != NULL) txn = txn->parent; td = (TXN_DETAIL *)txn->td; if (F_ISSET(txn, TXN_SNAPSHOT)) { read_lsnp = &td->read_lsn; if (IS_MAX_LSN(*read_lsnp) && (ret = __log_current_lsn(env, read_lsnp, NULL, NULL)) != 0) return (ret); } if ((dirty || LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW)) && td->mvcc_mtx == MUTEX_INVALID && (ret = __mutex_alloc(env, MTX_TXN_MVCC, 0, &td->mvcc_mtx)) != 0) return (ret); } switch (flags) { case DB_MPOOL_LAST: /* Get the last page number in the file. */ MUTEX_LOCK(env, mfp->mutex); *pgnoaddr = mfp->last_pgno; MUTEX_UNLOCK(env, mfp->mutex); break; case DB_MPOOL_NEW: /* * If always creating a page, skip the first search * of the hash bucket. */ state = FIRST_MISS; goto alloc; case DB_MPOOL_CREATE: default: break; } /* * If mmap'ing the file and the page is not past the end of the file, * just return a pointer. We can't use R_ADDR here: this is an offset * into an mmap'd file, not a shared region, and doesn't change for * private environments. * * The page may be past the end of the file, so check the page number * argument against the original length of the file. If we previously * returned pages past the original end of the file, last_pgno will * have been updated to match the "new" end of the file, and checking * against it would return pointers past the end of the mmap'd region. * * If another process has opened the file for writing since we mmap'd * it, we will start playing the game by their rules, i.e. everything * goes through the cache. All pages previously returned will be safe, * as long as the correct locking protocol was observed. * * We don't discard the map because we don't know when all of the * pages will have been discarded from the process' address space. * It would be possible to do so by reference counting the open * pages from the mmap, but it's unclear to me that it's worth it. */ if (dbmfp->addr != NULL && F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) { *(void **)addrp = (u_int8_t *)dbmfp->addr + (*pgnoaddr * mfp->stat.st_pagesize); STAT(++mfp->stat.st_map); return (0); } retry: /* * Determine the cache and hash bucket where this page lives and get * local pointers to them. Reset on each pass through this code, the * page number can change. */ MP_GET_BUCKET(env, mfp, *pgnoaddr, &infop, hp, ret); if (ret != 0) return (ret); c_mp = infop->primary; /* Search the hash chain for the page. */ st_hsearch = 0; b_locked = 1; SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) { ++st_hsearch; if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset) continue; /* Snapshot reads -- get the version visible at read_lsn. */ if (mvcc && !edit && read_lsnp != NULL) { while (bhp != NULL && !BH_OWNED_BY(env, bhp, txn) && !BH_VISIBLE(env, bhp, read_lsnp, vlsn)) bhp = SH_CHAIN_PREV(bhp, vc, __bh); DB_ASSERT(env, bhp != NULL); } makecopy = mvcc && dirty && !BH_OWNED_BY(env, bhp, txn); if (F_ISSET(bhp, BH_FROZEN) && !F_ISSET(bhp, BH_FREED)) { DB_ASSERT(env, frozen_bhp == NULL); frozen_bhp = bhp; } /* * Increment the reference count. We may discard the hash * bucket lock as we evaluate and/or read the buffer, so we * need to ensure it doesn't move and its contents remain * unchanged. */ if (bhp->ref == UINT16_MAX) { __db_errx(env, "%s: page %lu: reference count overflow", __memp_fn(dbmfp), (u_long)bhp->pgno); ret = __env_panic(env, EINVAL); goto err; } ++bhp->ref; b_incr = 1; /* * BH_LOCKED -- * I/O is in progress or sync is waiting on the buffer to write * it. Because we've incremented the buffer reference count, * we know the buffer can't move. Unlock the bucket lock, wait * for the buffer to become available, re-acquire the bucket. */ for (first = 1; F_ISSET(bhp, BH_LOCKED) && !F_ISSET(env->dbenv, DB_ENV_NOLOCKING); first = 0) { /* * If someone is trying to sync this buffer and the * buffer is hot, they may never get in. Give up and * try again. */ if (!first && bhp->ref_sync != 0) { --bhp->ref; MUTEX_UNLOCK(env, hp->mtx_hash); bhp = frozen_bhp = NULL; b_incr = b_locked = 0; __os_yield(env, 0, 1); goto retry; } /* * If we're the first thread waiting on I/O, set the * flag so the thread doing I/O knows to wake us up, * and lock the mutex. */ if (!F_ISSET(hp, IO_WAITER)) { F_SET(hp, IO_WAITER); MUTEX_LOCK(env, hp->mtx_io); } STAT(++hp->hash_io_wait); /* Release the hash bucket lock. */ MUTEX_UNLOCK(env, hp->mtx_hash); /* Wait for I/O to finish. */ MUTEX_LOCK(env, hp->mtx_io); MUTEX_UNLOCK(env, hp->mtx_io); /* Re-acquire the hash bucket lock. */ MUTEX_LOCK(env, hp->mtx_hash); } /* * If the buffer was frozen before we waited for any I/O to * complete and is still frozen, we will need to thaw it. * Otherwise, it was thawed while we waited, and we need to * search again. */ if (frozen_bhp != NULL && F_ISSET(frozen_bhp, BH_THAWED)) { thawed: need_free = (--frozen_bhp->ref == 0); b_incr = 0; MUTEX_UNLOCK(env, hp->mtx_hash); MPOOL_REGION_LOCK(env, infop); if (alloc_bhp != NULL) { __memp_free(infop, mfp, alloc_bhp); alloc_bhp = NULL; } if (need_free) SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen, frozen_bhp, hq); MPOOL_REGION_UNLOCK(env, infop); bhp = frozen_bhp = NULL; goto retry; } /* * If the buffer we wanted was frozen or thawed while we * waited, we need to start again. */ if (SH_CHAIN_HASNEXT(bhp, vc) && SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off) { --bhp->ref; b_incr = 0; MUTEX_UNLOCK(env, hp->mtx_hash); bhp = frozen_bhp = NULL; goto retry; } else if (dirty && SH_CHAIN_HASNEXT(bhp, vc)) { ret = DB_LOCK_DEADLOCK; goto err; } #ifdef HAVE_STATISTICS ++mfp->stat.st_cache_hit; #endif break; } #ifdef HAVE_STATISTICS /* * Update the hash bucket search statistics -- do now because our next * search may be for a different bucket. */ ++c_mp->stat.st_hash_searches; if (st_hsearch > c_mp->stat.st_hash_longest) c_mp->stat.st_hash_longest = st_hsearch; c_mp->stat.st_hash_examined += st_hsearch; #endif /* * There are 4 possible paths to this location: * * FIRST_MISS: * Didn't find the page in the hash bucket on our first pass: * bhp == NULL, alloc_bhp == NULL * * FIRST_FOUND: * Found the page in the hash bucket on our first pass: * bhp != NULL, alloc_bhp == NULL * * SECOND_FOUND: * Didn't find the page in the hash bucket on the first pass, * allocated space, and found the page in the hash bucket on * our second pass: * bhp != NULL, alloc_bhp != NULL * * SECOND_MISS: * Didn't find the page in the hash bucket on the first pass, * allocated space, and didn't find the page in the hash bucket * on our second pass: * bhp == NULL, alloc_bhp != NULL */ state = bhp == NULL ? (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) : (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND); switch (state) { case FIRST_FOUND: /* * If we are to free the buffer, then this had better be the * only reference. If so, just free the buffer. If not, * complain and get out. */ if (flags == DB_MPOOL_FREE) { if (--bhp->ref == 0) { if (F_ISSET(bhp, BH_DIRTY)) { --hp->hash_page_dirty; F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE); } /* * In a multiversion database, this page could * be requested again so we have to leave it in * cache for now. It should *not* ever be * requested again for modification without an * intervening DB_MPOOL_CREATE or DB_MPOOL_NEW. * * Mark it with BH_FREED so we don't reuse the * data when the page is resurrected. */ if (mvcc && (F_ISSET(bhp, BH_FROZEN) || !SH_CHAIN_SINGLETON(bhp, vc) || bhp->td_off == INVALID_ROFF || !IS_MAX_LSN(*VISIBLE_LSN(env, bhp)))) { F_SET(bhp, BH_FREED); MUTEX_UNLOCK(env, hp->mtx_hash); return (0); } return (__memp_bhfree( dbmp, infop, hp, bhp, BH_FREE_FREEMEM)); } __db_errx(env, "File %s: freeing pinned buffer for page %lu", __memp_fns(dbmp, mfp), (u_long)*pgnoaddr); ret = __env_panic(env, EINVAL); goto err; } if (mvcc) { if (flags == DB_MPOOL_CREATE && F_ISSET(bhp, BH_FREED)) { extending = makecopy = 1; MUTEX_LOCK(env, mfp->mutex); if (*pgnoaddr > mfp->last_pgno) mfp->last_pgno = *pgnoaddr; MUTEX_UNLOCK(env, mfp->mutex); } /* * With multiversion databases, we might need to * allocate a new buffer into which we can copy the one * that we found. In that case, check the last buffer * in the chain to see whether we can reuse an obsolete * buffer. * * To provide snapshot isolation, we need to make sure * that we've seen a buffer older than the oldest * snapshot read LSN. */ reuse: if ((makecopy || frozen_bhp != NULL) && (oldest_bhp = SH_CHAIN_PREV(bhp, vc, __bh)) != NULL) { while (SH_CHAIN_HASPREV(oldest_bhp, vc)) oldest_bhp = SH_CHAIN_PREVP(oldest_bhp, vc, __bh); if (oldest_bhp->ref == 0 && !BH_OBSOLETE( oldest_bhp, hp->old_reader, vlsn) && (ret = __txn_oldest_reader(env, &hp->old_reader)) != 0) goto err; if (BH_OBSOLETE( oldest_bhp, hp->old_reader, vlsn) && oldest_bhp->ref == 0) { if (F_ISSET(oldest_bhp, BH_FROZEN)) { ++oldest_bhp->ref; if ((ret = __memp_bh_thaw(dbmp, infop, hp, oldest_bhp, NULL)) != 0) goto err; goto reuse; } else if ((ret = __memp_bhfree(dbmp, infop, hp, oldest_bhp, BH_FREE_REUSE)) != 0) goto err; alloc_bhp = oldest_bhp; } DB_ASSERT(env, alloc_bhp == NULL || !F_ISSET(alloc_bhp, BH_FROZEN)); } } /* We found the buffer or we're ready to copy -- we're done. */ if ((!makecopy && frozen_bhp == NULL) || alloc_bhp != NULL) break; /* FALLTHROUGH */ case FIRST_MISS: /* * We didn't find the buffer in our first check. Figure out * if the page exists, and allocate structures so we can add * the page to the buffer pool. */ MUTEX_UNLOCK(env, hp->mtx_hash); b_locked = 0; /* * The buffer is not in the pool, so we don't need to free it. */ if (flags == DB_MPOOL_FREE) return (0); alloc: /* * If DB_MPOOL_NEW is set, we have to allocate a page number. * If neither DB_MPOOL_CREATE or DB_MPOOL_NEW is set, then * it's an error to try and get a page past the end of file. */ DB_ASSERT(env, !b_locked); MUTEX_LOCK(env, mfp->mutex); switch (flags) { case DB_MPOOL_NEW: extending = 1; if (mfp->maxpgno != 0 && mfp->last_pgno >= mfp->maxpgno) { __db_errx( env, "%s: file limited to %lu pages", __memp_fn(dbmfp), (u_long)mfp->maxpgno); ret = ENOSPC; } else *pgnoaddr = mfp->last_pgno + 1; break; case DB_MPOOL_CREATE: if (mfp->maxpgno != 0 && *pgnoaddr > mfp->maxpgno) { __db_errx( env, "%s: file limited to %lu pages", __memp_fn(dbmfp), (u_long)mfp->maxpgno); ret = ENOSPC; } else if (!extending) extending = *pgnoaddr > mfp->last_pgno; break; default: ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0; break; } MUTEX_UNLOCK(env, mfp->mutex); if (ret != 0) goto err; /* * !!! * In the DB_MPOOL_NEW code path, infop and c_mp have * not yet been initialized. */ MP_GET_REGION(dbmfp, *pgnoaddr, &infop, ret); if (ret != 0) goto err; c_mp = infop->primary; /* Allocate a new buffer header and data space. */ if ((ret = __memp_alloc(dbmp, infop, mfp, 0, NULL, &alloc_bhp)) != 0) goto err; #ifdef DIAGNOSTIC if ((uintptr_t)alloc_bhp->buf & (sizeof(size_t) - 1)) { __db_errx(env, "DB_MPOOLFILE->get: buffer data is NOT size_t aligned"); ret = __env_panic(env, EINVAL); goto err; } #endif /* * If we are extending the file, we'll need the mfp lock * again. */ if (extending) MUTEX_LOCK(env, mfp->mutex); /* * DB_MPOOL_NEW does not guarantee you a page unreferenced by * any other thread of control. (That guarantee is interesting * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller * did not specify the page number, and so, may reasonably not * have any way to lock the page outside of mpool.) Regardless, * if we allocate the page, and some other thread of control * requests the page by number, we will not detect that and the * thread of control that allocated using DB_MPOOL_NEW may not * have a chance to initialize the page. (Note: we *could* * detect this case if we set a flag in the buffer header which * guaranteed that no gets of the page would succeed until the * reference count went to 0, that is, until the creating page * put the page.) What we do guarantee is that if two threads * of control are both doing DB_MPOOL_NEW calls, they won't * collide, that is, they won't both get the same page. * * There's a possibility that another thread allocated the page * we were planning to allocate while we were off doing buffer * allocation. We can do that by making sure the page number * we were going to use is still available. If it's not, then * we check to see if the next available page number hashes to * the same mpool region as the old one -- if it does, we can * continue, otherwise, we have to start over. */ if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) { *pgnoaddr = mfp->last_pgno + 1; MP_GET_REGION(dbmfp, *pgnoaddr, &t_infop,ret); if (ret != 0) goto err; if (t_infop != infop) { /* * flags == DB_MPOOL_NEW, so extending is set * and we're holding the mfp locked. */ MUTEX_UNLOCK(env, mfp->mutex); MPOOL_REGION_LOCK(env, infop); __memp_free(infop, mfp, alloc_bhp); c_mp->stat.st_pages--; MPOOL_REGION_UNLOCK(env, infop); alloc_bhp = NULL; goto alloc; } } /* * We released the mfp lock, so another thread might have * extended the file. Update the last_pgno and initialize * the file, as necessary, if we extended the file. */ if (extending) { if (*pgnoaddr > mfp->last_pgno) mfp->last_pgno = *pgnoaddr; MUTEX_UNLOCK(env, mfp->mutex); if (ret != 0) goto err; } /* * If we're doing copy-on-write, we will already have the * buffer header. In that case, we don't need to search again. */ if (bhp != NULL) { MUTEX_LOCK(env, hp->mtx_hash); b_locked = 1; break; } DB_ASSERT(env, frozen_bhp == NULL); goto retry; case SECOND_FOUND: /* * We allocated buffer space for the requested page, but then * found the page in the buffer cache on our second check. * That's OK -- we can use the page we found in the pool, * unless DB_MPOOL_NEW is set. If we're about to copy-on-write, * this is exactly the situation we want. * * For multiversion files, we may have left some pages in cache * beyond the end of a file after truncating. In that case, we * would get to here with extending set. If so, we need to * insert the new page in the version chain similar to when * we copy on write. */ if (extending && F_ISSET(bhp, BH_FREED)) makecopy = 1; if (makecopy || frozen_bhp != NULL) break; /* Free the allocated memory, we no longer need it. Since we * can't acquire the region lock while holding the hash bucket * lock, we have to release the hash bucket and re-acquire it. * That's OK, because we have the buffer pinned down. */ MUTEX_UNLOCK(env, hp->mtx_hash); MPOOL_REGION_LOCK(env, infop); __memp_free(infop, mfp, alloc_bhp); c_mp->stat.st_pages--; MPOOL_REGION_UNLOCK(env, infop); alloc_bhp = NULL; /* * We can't use the page we found in the pool if DB_MPOOL_NEW * was set. (For details, see the above comment beginning * "DB_MPOOL_NEW does not guarantee you a page unreferenced by * any other thread of control".) If DB_MPOOL_NEW is set, we * release our pin on this particular buffer, and try to get * another one. */ if (flags == DB_MPOOL_NEW) { --bhp->ref; b_incr = b_locked = 0; bhp = NULL; goto alloc; } /* We can use the page -- get the bucket lock. */ MUTEX_LOCK(env, hp->mtx_hash); break; case SECOND_MISS: /* * We allocated buffer space for the requested page, and found * the page still missing on our second pass through the buffer * cache. Instantiate the page. */ bhp = alloc_bhp; alloc_bhp = NULL; /* * Initialize all the BH and hash bucket fields so we can call * __memp_bhfree if an error occurs. * * Append the buffer to the tail of the bucket list and update * the hash bucket's priority. */ /*lint --e{668} (flexelint: bhp cannot be NULL). */ #ifdef DIAG_MVCC memset(bhp, 0, SSZ(BH, align_off)); #else memset(bhp, 0, sizeof(BH)); #endif bhp->ref = 1; b_incr = 1; bhp->priority = UINT32_MAX; bhp->pgno = *pgnoaddr; bhp->mf_offset = mf_offset; SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq); SH_CHAIN_INIT(bhp, vc); /* We created a new page, it starts dirty. */ if (extending) { ++hp->hash_page_dirty; F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE); } /* * If we created the page, zero it out. If we didn't create * the page, read from the backing file. * * !!! * DB_MPOOL_NEW doesn't call the pgin function. * * If DB_MPOOL_CREATE is used, then the application's pgin * function has to be able to handle pages of 0's -- if it * uses DB_MPOOL_NEW, it can detect all of its page creates, * and not bother. * * If we're running in diagnostic mode, smash any bytes on the * page that are unknown quantities for the caller. * * Otherwise, read the page into memory, optionally creating it * if DB_MPOOL_CREATE is set. */ if (extending) { MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, PROT_READ | PROT_WRITE); if (mfp->clear_len == DB_CLEARLEN_NOTSET) memset(bhp->buf, 0, mfp->stat.st_pagesize); else { memset(bhp->buf, 0, mfp->clear_len); #if defined(DIAGNOSTIC) || defined(UMRW) memset(bhp->buf + mfp->clear_len, CLEAR_BYTE, mfp->stat.st_pagesize - mfp->clear_len); #endif } if (flags == DB_MPOOL_CREATE && mfp->ftype != 0) F_SET(bhp, BH_CALLPGIN); STAT(++mfp->stat.st_page_create); } else { F_SET(bhp, BH_TRASH); STAT(++mfp->stat.st_cache_miss); } /* Increment buffer count referenced by MPOOLFILE. */ MUTEX_LOCK(env, mfp->mutex); ++mfp->block_cnt; MUTEX_UNLOCK(env, mfp->mutex); } DB_ASSERT(env, bhp != NULL); DB_ASSERT(env, bhp->ref != 0); /* We've got a buffer header we're re-instantiating. */ if (frozen_bhp != NULL) { DB_ASSERT(env, alloc_bhp != NULL); /* * If the empty buffer has been filled in the meantime, don't * overwrite it. */ if (F_ISSET(frozen_bhp, BH_THAWED)) goto thawed; else { if ((ret = __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp)) != 0) goto err; bhp = alloc_bhp; } frozen_bhp = alloc_bhp = NULL; /* * If we're updating a buffer that was frozen, we have to go * through all of that again to allocate another buffer to hold * the new copy. */ if (makecopy) { MUTEX_UNLOCK(env, hp->mtx_hash); b_locked = 0; goto alloc; } } /* * BH_TRASH -- * The buffer we found may need to be filled from the disk. * * It's possible for the read function to fail, which means we fail as * well. Note, the __memp_pgread() function discards and reacquires * the hash lock, so the buffer must be pinned down so that it cannot * move and its contents are unchanged. Discard the buffer on failure * unless another thread is waiting on our I/O to complete. It's OK to * leave the buffer around, as the waiting thread will see the BH_TRASH * flag set, and will also attempt to discard it. If there's a waiter, * we need to decrement our reference count. */ if (F_ISSET(bhp, BH_TRASH) && (ret = __memp_pgread(dbmfp, hp, bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0) goto err; /* * BH_CALLPGIN -- * The buffer was processed for being written to disk, and now has * to be re-converted for use. */ if (F_ISSET(bhp, BH_CALLPGIN)) { MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, PROT_READ | PROT_WRITE); if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0) goto err; F_CLR(bhp, BH_CALLPGIN); } /* Copy-on-write. */ if (makecopy && state != SECOND_MISS) { DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc)); DB_ASSERT(env, bhp != NULL); DB_ASSERT(env, alloc_bhp != NULL); DB_ASSERT(env, alloc_bhp != bhp); if (bhp->ref == 1) MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, PROT_READ); alloc_bhp->ref = 1; alloc_bhp->ref_sync = 0; alloc_bhp->flags = F_ISSET(bhp, BH_DIRTY | BH_DIRTY_CREATE); F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE); alloc_bhp->priority = bhp->priority; alloc_bhp->pgno = bhp->pgno; alloc_bhp->mf_offset = bhp->mf_offset; alloc_bhp->td_off = INVALID_ROFF; if (txn != NULL && (ret = __memp_bh_settxn(dbmp, mfp, alloc_bhp, td)) != 0) goto err; if (extending) { memset(alloc_bhp->buf, 0, mfp->stat.st_pagesize); F_SET(alloc_bhp, BH_DIRTY_CREATE); } else memcpy(alloc_bhp->buf, bhp->buf, mfp->stat.st_pagesize); SH_CHAIN_INSERT_AFTER(bhp, alloc_bhp, vc, __bh); SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket, bhp, alloc_bhp, hq, __bh); SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); if (--bhp->ref == 0) { bhp->priority = c_mp->lru_count; MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, 0); } bhp = alloc_bhp; if (alloc_bhp != oldest_bhp) { MUTEX_LOCK(env, mfp->mutex); ++mfp->block_cnt; MUTEX_UNLOCK(env, mfp->mutex); } alloc_bhp = NULL; } else if (mvcc && extending && txn != NULL && (ret = __memp_bh_settxn(dbmp, mfp, bhp, td)) != 0) goto err; if ((dirty || edit || extending) && !F_ISSET(bhp, BH_DIRTY)) { DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc)); ++hp->hash_page_dirty; F_SET(bhp, BH_DIRTY); } /* * If we're the only reference, update buffer priority. We may be * about to release the hash bucket lock, and everything should be * correct, first. (We've already done this work if we created the * buffer, so there is no need to do it again.) */ if (state != SECOND_MISS && bhp->ref == 1) { bhp->priority = UINT32_MAX; if (SH_CHAIN_SINGLETON(bhp, vc)) { if (bhp != SH_TAILQ_LAST(&hp->hash_bucket, hq, __bh)) { SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq); } } } MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, PROT_READ | (dirty || edit || extending || F_ISSET(bhp, BH_DIRTY) ? PROT_WRITE : 0)); #ifdef DIAGNOSTIC { BH *next_bhp = SH_CHAIN_NEXT(bhp, vc, __bh); DB_ASSERT(env, !mfp->multiversion || !F_ISSET(bhp, BH_DIRTY) || next_bhp == NULL); DB_ASSERT(env, !mvcc || edit || read_lsnp == NULL || bhp->td_off == INVALID_ROFF || BH_OWNED_BY(env, bhp, txn) || (BH_VISIBLE(env, bhp, read_lsnp, vlsn) && (next_bhp == NULL || F_ISSET(next_bhp, BH_FROZEN) || (next_bhp->td_off != INVALID_ROFF && (BH_OWNER(env, next_bhp)->status != TXN_COMMITTED || !BH_VISIBLE(env, next_bhp, read_lsnp, vlsn)))))); } #endif MUTEX_UNLOCK(env, hp->mtx_hash); /* * Record this pin for this thread. Holding the page pinned * without recording the pin is ok since we do not recover from * a death from within the library itself. */ if (ip != NULL) { reginfo = env->reginfo; if (ip->dbth_pincount == ip->dbth_pinmax) { pinmax = ip->dbth_pinmax; if ((ret = __env_alloc(reginfo, 2 * pinmax * sizeof(PIN_LIST), &list)) != 0) goto err; memcpy(list, R_ADDR(reginfo, ip->dbth_pinlist), pinmax * sizeof(PIN_LIST)); memset(&list[pinmax], 0, pinmax * sizeof(PIN_LIST)); list_off = R_OFFSET(reginfo, list); list = R_ADDR(reginfo, ip->dbth_pinlist); ip->dbth_pinmax = 2 * pinmax; ip->dbth_pinlist = list_off; if (list != ip->dbth_pinarray) __env_alloc_free(reginfo, list); } list = R_ADDR(reginfo, ip->dbth_pinlist); for (lp = list; lp < &list[ip->dbth_pinmax]; lp++) if (lp->b_ref == INVALID_ROFF) break; ip->dbth_pincount++; lp->b_ref = R_OFFSET(infop, bhp); lp->region = (int)(infop - dbmp->reginfo); } #ifdef DIAGNOSTIC /* Update the file's pinned reference count. */ MPOOL_SYSTEM_LOCK(env); ++dbmfp->pinref; MPOOL_SYSTEM_UNLOCK(env); /* * We want to switch threads as often as possible, and at awkward * times. Yield every time we get a new page to ensure contention. */ if (F_ISSET(env->dbenv, DB_ENV_YIELDCPU)) __os_yield(env, 0, 0); #endif DB_ASSERT(env, alloc_bhp == NULL); *(void **)addrp = bhp->buf; return (0); err: /* * Discard our reference. If we're the only reference, discard the * the buffer entirely. If we held a reference to a buffer, we are * also still holding the hash bucket mutex. */ if (b_incr || frozen_bhp != NULL) { if (!b_locked) { MUTEX_LOCK(env, hp->mtx_hash); b_locked = 1; } if (frozen_bhp != NULL) --frozen_bhp->ref; if (b_incr && bhp != frozen_bhp) --bhp->ref; } if (b_locked) MUTEX_UNLOCK(env, hp->mtx_hash); /* If alloc_bhp is set, free the memory. */ if (alloc_bhp != NULL) { MPOOL_REGION_LOCK(env, infop); __memp_free(infop, mfp, alloc_bhp); c_mp->stat.st_pages--; MPOOL_REGION_UNLOCK(env, infop); } return (ret); }