/*- * See the file LICENSE for redistribution information. * * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994 * Margo Seltzer. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Margo Seltzer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include "db_config.h" #ifndef lint static const char revid[] = "$Id: hash.c,v 1.1.1.1 2003/02/15 04:56:06 zarzycki Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include #include #include #endif #include "db_int.h" #include "dbinc/db_page.h" #include "dbinc/db_shash.h" #include "dbinc/btree.h" #include "dbinc/hash.h" #include "dbinc/lock.h" static int __ham_bulk __P((DBC *, DBT *, u_int32_t)); static int __ham_c_close __P((DBC *, db_pgno_t, int *)); static int __ham_c_del __P((DBC *)); static int __ham_c_destroy __P((DBC *)); static int __ham_c_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *)); static int __ham_c_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *)); static int __ham_c_writelock __P((DBC *)); static int __ham_dup_return __P((DBC *, DBT *, u_int32_t)); static int __ham_expand_table __P((DBC *)); static int __ham_lookup __P((DBC *, const DBT *, u_int32_t, db_lockmode_t, db_pgno_t *)); static int __ham_overwrite __P((DBC *, DBT *, u_int32_t)); /* * __ham_quick_delete -- * When performing a DB->del operation that does not involve secondary * indices and is not removing an off-page duplicate tree, we can * speed things up substantially by removing the entire duplicate * set, if any is present, in one operation, rather than by conjuring * up and deleting each of the items individually. (All are stored * in one big HKEYDATA structure.) We don't bother to distinguish * on-page duplicate sets from single, non-dup items; they're deleted * in exactly the same way. * * This function is called by __db_delete when the appropriate * conditions are met, and it performs the delete in the optimized way. * * The cursor should be set to the first item in the duplicate * set, or to the sole key/data pair when the key does not have a * duplicate set, before the function is called. 
 *
 * PUBLIC: int __ham_quick_delete __P((DBC *));
 */
int
__ham_quick_delete(dbc)
	DBC *dbc;
{
	int ret, t_ret;

	/* Pin the hash metadata page for the duration of the delete. */
	if ((ret = __ham_get_meta(dbc)) != 0)
		return (ret);

	/* Assert that we're not using secondary indices. */
	DB_ASSERT(!F_ISSET(dbc->dbp, DB_AM_SECONDARY));
	/*
	 * We should assert that we're not a primary either, but that
	 * would require grabbing the dbp's mutex, so we don't bother.
	 */

	/* Assert that we're set, but not to an off-page duplicate. */
	DB_ASSERT(IS_INITIALIZED(dbc));
	DB_ASSERT(((HASH_CURSOR *)dbc->internal)->opd == NULL);

	/* Remove the entire key/data pair (including any on-page dup set). */
	ret = __ham_del_pair(dbc, 1);

	/* Always release the metadata page; preserve the first error. */
	if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}

/* ****************** CURSORS ********************************** */
/*
 * __ham_c_init --
 *	Initialize the hash-specific portion of a cursor: allocate the
 *	HASH_CURSOR and its page-sized split buffer, and wire up both the
 *	generic DBC entry points and the hash access-method callbacks.
 *
 * PUBLIC: int __ham_c_init __P((DBC *));
 */
int
__ham_c_init(dbc)
	DBC *dbc;
{
	DB_ENV *dbenv;
	HASH_CURSOR *new_curs;
	int ret;

	dbenv = dbc->dbp->dbenv;
	/* HASH_CURSOR is a typedef of struct cursor_t. */
	if ((ret = __os_calloc(dbenv,
	    1, sizeof(struct cursor_t), &new_curs)) != 0)
		return (ret);
	/* split_buf holds a full page image during bucket splits. */
	if ((ret = __os_malloc(dbenv,
	    dbc->dbp->pgsize, &new_curs->split_buf)) != 0) {
		/* Don't leak the cursor if the buffer allocation fails. */
		__os_free(dbenv, new_curs);
		return (ret);
	}

	dbc->internal = (DBC_INTERNAL *) new_curs;
	/* Generic cursor interface. */
	dbc->c_close = __db_c_close;
	dbc->c_count = __db_c_count;
	dbc->c_del = __db_c_del;
	dbc->c_dup = __db_c_dup;
	dbc->c_get = dbc->c_real_get = __db_c_get;
	dbc->c_pget = __db_c_pget;
	dbc->c_put = __db_c_put;
	/* Hash access-method-specific callbacks. */
	dbc->c_am_bulk = __ham_bulk;
	dbc->c_am_close = __ham_c_close;
	dbc->c_am_del = __ham_c_del;
	dbc->c_am_destroy = __ham_c_destroy;
	dbc->c_am_get = __ham_c_get;
	dbc->c_am_put = __ham_c_put;
	dbc->c_am_writelock = __ham_c_writelock;

	__ham_item_init(dbc);
	return (0);
}

/*
 * __ham_c_close --
 *	Close down the cursor from a single use.
 */
static int
__ham_c_close(dbc, root_pgno, rmroot)
	DBC *dbc;
	db_pgno_t root_pgno;
	int *rmroot;
{
	DB_MPOOLFILE *mpf;
	HASH_CURSOR *hcp;
	HKEYDATA *dp;
	int doroot, gotmeta, ret, t_ret;
	u_int32_t dirty;

	COMPQUIET(rmroot, 0);
	mpf = dbc->dbp->mpf;
	dirty = 0;
	doroot = gotmeta = ret = 0;
	hcp = (HASH_CURSOR *) dbc->internal;

	/* Check for off page dups. */
	if (dbc->internal->opd != NULL) {
		if ((ret = __ham_get_meta(dbc)) != 0)
			goto done;
		gotmeta = 1;
		if ((ret = __ham_get_cpage(dbc, DB_LOCK_READ)) != 0)
			goto out;
		dp = (HKEYDATA *)H_PAIRDATA(dbc->dbp, hcp->page, hcp->indx);

		/* If its not a dup we aborted before we changed it. */
		if (HPAGE_PTYPE(dp) == H_OFFDUP)
			/* Off-page tree root pgno may be unaligned; copy it. */
			memcpy(&root_pgno,
			    HOFFPAGE_PGNO(dp), sizeof(db_pgno_t));
		else
			root_pgno = PGNO_INVALID;

		/* Close the off-page duplicate cursor first. */
		if ((ret =
		    hcp->opd->c_am_close(hcp->opd, root_pgno, &doroot)) != 0)
			goto out;
		if (doroot != 0) {
			/* The dup tree emptied; delete the parent pair too. */
			if ((ret = __ham_del_pair(dbc, 1)) != 0)
				goto out;
			dirty = DB_MPOOL_DIRTY;
		}
	}

out:	if (hcp->page != NULL &&
	    (t_ret = mpf->put(mpf, hcp->page, dirty)) != 0 && ret == 0)
		ret = t_ret;
	if (gotmeta != 0 && (t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
		ret = t_ret;

done:	/* Reset the cursor to its uninitialized state. */
	__ham_item_init(dbc);
	return (ret);
}

/*
 * __ham_c_destroy --
 *	Cleanup the access method private part of a cursor.
 */
static int
__ham_c_destroy(dbc)
	DBC *dbc;
{
	HASH_CURSOR *hcp;

	hcp = (HASH_CURSOR *)dbc->internal;
	/* Free the split buffer allocated in __ham_c_init, then the cursor. */
	if (hcp->split_buf != NULL)
		__os_free(dbc->dbp->dbenv, hcp->split_buf);
	__os_free(dbc->dbp->dbenv, hcp);

	return (0);
}

/*
 * __ham_c_count --
 *	Return a count of on-page duplicates.
 *
 * PUBLIC: int __ham_c_count __P((DBC *, db_recno_t *));
 */
int
__ham_c_count(dbc, recnop)
	DBC *dbc;
	db_recno_t *recnop;
{
	DB *dbp;
	DB_MPOOLFILE *mpf;
	HASH_CURSOR *hcp;
	db_indx_t len;
	db_recno_t recno;
	int ret, t_ret;
	u_int8_t *p, *pend;

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	hcp = (HASH_CURSOR *)dbc->internal;

	recno = 0;

	if ((ret = __ham_get_cpage(dbc, DB_LOCK_READ)) != 0)
		return (ret);

	switch (HPAGE_PTYPE(H_PAIRDATA(dbp, hcp->page, hcp->indx))) {
	case H_KEYDATA:
	case H_OFFPAGE:
		/* A plain item (possibly off-page): exactly one record. */
		recno = 1;
		break;
	case H_DUPLICATE:
		/*
		 * Walk the on-page duplicate set.  Each element is stored as
		 * len/data/len, so each step advances 2 * sizeof(db_indx_t)
		 * plus the element length.
		 */
		p = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page, hcp->indx));
		pend = p +
		    LEN_HDATA(dbp, hcp->page, dbp->pgsize, hcp->indx);
		for (; p < pend; recno++) {
			/* p may be odd, so copy rather than just dereffing */
			memcpy(&len, p, sizeof(db_indx_t));
			p += 2 * sizeof(db_indx_t) + len;
		}
		break;
	default:
		ret = __db_pgfmt(dbp->dbenv, hcp->pgno);
		goto err;
	}

	*recnop = recno;

err:	if ((t_ret = mpf->put(mpf, hcp->page, 0)) != 0 && ret == 0)
		ret = t_ret;
	hcp->page = NULL;
	return (ret);
}

/*
 * __ham_c_del --
 *	Delete the item to which the cursor currently refers.  Off-page
 *	duplicates are handled by the caller (we return with *no* work
 *	done); an on-page duplicate element is excised by a partial
 *	overwrite unless it is the last one, in which case the whole
 *	pair is removed.
 */
static int
__ham_c_del(dbc)
	DBC *dbc;
{
	DB *dbp;
	DBT repldbt;
	DB_MPOOLFILE *mpf;
	HASH_CURSOR *hcp;
	int ret, t_ret;

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	hcp = (HASH_CURSOR *)dbc->internal;

	/* The item under the cursor is already gone. */
	if (F_ISSET(hcp, H_DELETED))
		return (DB_NOTFOUND);

	if ((ret = __ham_get_meta(dbc)) != 0)
		goto out;

	if ((ret = __ham_get_cpage(dbc, DB_LOCK_WRITE)) != 0)
		goto out;

	/* Off-page duplicates. */
	if (HPAGE_TYPE(dbp, hcp->page, H_DATAINDEX(hcp->indx)) == H_OFFDUP)
		goto out;

	if (F_ISSET(hcp, H_ISDUP)) { /* On-page duplicate. */
		if (hcp->dup_off == 0 &&
		    DUP_SIZE(hcp->dup_len) == LEN_HDATA(dbp, hcp->page,
		    hcp->hdr->dbmeta.pagesize, hcp->indx))
			/* Deleting the sole remaining duplicate element. */
			ret = __ham_del_pair(dbc, 1);
		else {
			/*
			 * Excise this element via a zero-length partial
			 * replace over its len/data/len encoding.
			 */
			repldbt.flags = 0;
			F_SET(&repldbt, DB_DBT_PARTIAL);
			repldbt.doff = hcp->dup_off;
			repldbt.dlen = DUP_SIZE(hcp->dup_len);
			repldbt.size = 0;
			repldbt.data = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page,
			    hcp->indx));
			if ((ret = __ham_replpair(dbc, &repldbt, 0)) == 0) {
				hcp->dup_tlen -= DUP_SIZE(hcp->dup_len);
				F_SET(hcp, H_DELETED);
				/* Adjust other cursors on this dup set. */
				ret = __ham_c_update(dbc,
				    DUP_SIZE(hcp->dup_len), 0, 1);
			}
		}
	} else /* Not a duplicate */
		ret = __ham_del_pair(dbc, 1);

out:	if (hcp->page != NULL) {
		/* Only mark the page dirty if the delete succeeded. */
		if ((t_ret = mpf->put(mpf,
		    hcp->page, ret == 0 ? DB_MPOOL_DIRTY : 0)) && ret == 0)
			ret = t_ret;
		hcp->page = NULL;
	}
	if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}

/*
 * __ham_c_dup --
 *	Duplicate a hash cursor, such that the new one holds appropriate
 *	locks for the position of the original.
 *
 * PUBLIC: int __ham_c_dup __P((DBC *, DBC *));
 */
int
__ham_c_dup(orig_dbc, new_dbc)
	DBC *orig_dbc, *new_dbc;
{
	HASH_CURSOR *orig, *new;

	orig = (HASH_CURSOR *)orig_dbc->internal;
	new = (HASH_CURSOR *)new_dbc->internal;

	/* Copy the position: bucket and duplicate-set offsets. */
	new->bucket = orig->bucket;
	new->lbucket = orig->lbucket;
	new->dup_off = orig->dup_off;
	new->dup_len = orig->dup_len;
	new->dup_tlen = orig->dup_tlen;

	if (F_ISSET(orig, H_DELETED))
		F_SET(new, H_DELETED);
	if (F_ISSET(orig, H_ISDUP))
		F_SET(new, H_ISDUP);

	/*
	 * If the old cursor held a lock and we're not in transactions, get one
	 * for the new one.  The reason that we don't need a new lock if we're
	 * in a transaction is because we already hold a lock and will continue
	 * to do so until commit, so there is no point in reaquiring it.  We
	 * don't know if the old lock was a read or write lock, but it doesn't
	 * matter.  We'll get a read lock.  We know that this locker already
	 * holds a lock of the correct type, so if we need a write lock and
	 * request it, we know that we'll get it.
	 */
	if (!LOCK_ISSET(orig->lock) || orig_dbc->txn != NULL)
		return (0);

	return (__ham_lock_bucket(new_dbc, DB_LOCK_READ));
}

/*
 * __ham_c_get --
 *	Hash access-method cursor get.  Positions the cursor per "flags",
 *	then loops, moving backward/forward across buckets as needed until
 *	an item is found or the table is exhausted.  On an off-page
 *	duplicate, *pgnop is set and the caller descends into the dup tree.
 */
static int
__ham_c_get(dbc, key, data, flags, pgnop)
	DBC *dbc;
	DBT *key;
	DBT *data;
	u_int32_t flags;
	db_pgno_t *pgnop;
{
	DB *dbp;
	DB_MPOOLFILE *mpf;
	HASH_CURSOR *hcp;
	db_lockmode_t lock_type;
	int get_key, ret, t_ret;

	hcp = (HASH_CURSOR *)dbc->internal;
	dbp = dbc->dbp;
	mpf = dbp->mpf;

	/* Clear OR'd in additional bits so we can check for flag equality. */
	if (F_ISSET(dbc, DBC_RMW))
		lock_type = DB_LOCK_WRITE;
	else
		lock_type = DB_LOCK_READ;

	if ((ret = __ham_get_meta(dbc)) != 0)
		return (ret);
	hcp->seek_size = 0;

	ret = 0;
	get_key = 1;
	switch (flags) {
	case DB_PREV_NODUP:
		F_SET(hcp, H_NEXT_NODUP);
		/* FALLTHROUGH */
	case DB_PREV:
		if (IS_INITIALIZED(dbc)) {
			ret = __ham_item_prev(dbc, lock_type, pgnop);
			break;
		}
		/* FALLTHROUGH */
	case DB_LAST:
		ret = __ham_item_last(dbc, lock_type, pgnop);
		break;
	case DB_NEXT_NODUP:
		F_SET(hcp, H_NEXT_NODUP);
		/* FALLTHROUGH */
	case DB_NEXT:
		if (IS_INITIALIZED(dbc)) {
			ret = __ham_item_next(dbc, lock_type, pgnop);
			break;
		}
		/* FALLTHROUGH */
	case DB_FIRST:
		ret = __ham_item_first(dbc, lock_type, pgnop);
		break;
	case DB_NEXT_DUP:
		/* cgetchk has already determined that the cursor is set. */
		F_SET(hcp, H_DUPONLY);
		ret = __ham_item_next(dbc, lock_type, pgnop);
		break;
	case DB_SET:
	case DB_SET_RANGE:
	case DB_GET_BOTH:
	case DB_GET_BOTH_RANGE:
		/* Key-driven lookup: caller supplied the key. */
		ret = __ham_lookup(dbc, key, 0, lock_type, pgnop);
		get_key = 0;
		break;
	case DB_GET_BOTHC:
		F_SET(hcp, H_DUPONLY);
		ret = __ham_item_next(dbc, lock_type, pgnop);
		get_key = 0;
		break;
	case DB_CURRENT:
		/* cgetchk has already determined that the cursor is set. */
		if (F_ISSET(hcp, H_DELETED)) {
			ret = DB_KEYEMPTY;
			goto err;
		}
		ret = __ham_item(dbc, lock_type, pgnop);
		break;
	}

	/*
	 * Must always enter this loop to do error handling and
	 * check for big key/data pair.
	 */
	for (;;) {
		if (ret != 0 && ret != DB_NOTFOUND)
			goto err;
		else if (F_ISSET(hcp, H_OK)) {
			/*
			 * Found an item.  PGNO_INVALID means it is on-page,
			 * so deal with (possible) duplicates here; otherwise
			 * the caller descends into the off-page tree.
			 */
			if (*pgnop == PGNO_INVALID)
				ret = __ham_dup_return(dbc, data, flags);
			break;
		} else if (!F_ISSET(hcp, H_NOMORE)) {
			__db_err(dbp->dbenv,
			    "H_NOMORE returned to __ham_c_get");
			ret = EINVAL;
			break;
		}

		/*
		 * Ran out of entries in a bucket; change buckets.
		 */
		switch (flags) {
		case DB_LAST:
		case DB_PREV:
		case DB_PREV_NODUP:
			ret = mpf->put(mpf, hcp->page, 0);
			hcp->page = NULL;
			if (hcp->bucket == 0) {
				/* Already at bucket 0: nothing before it. */
				ret = DB_NOTFOUND;
				hcp->pgno = PGNO_INVALID;
				goto err;
			}
			F_CLR(hcp, H_ISDUP);
			hcp->bucket--;
			hcp->indx = NDX_INVALID;
			hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
			if (ret == 0)
				ret = __ham_item_prev(dbc, lock_type, pgnop);
			break;
		case DB_FIRST:
		case DB_NEXT:
		case DB_NEXT_NODUP:
			ret = mpf->put(mpf, hcp->page, 0);
			hcp->page = NULL;
			hcp->indx = NDX_INVALID;
			hcp->bucket++;
			F_CLR(hcp, H_ISDUP);
			hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
			if (hcp->bucket > hcp->hdr->max_bucket) {
				/* Walked past the last bucket: done. */
				ret = DB_NOTFOUND;
				hcp->pgno = PGNO_INVALID;
				goto err;
			}
			if (ret == 0)
				ret = __ham_item_next(dbc, lock_type, pgnop);
			break;
		case DB_GET_BOTH:
		case DB_GET_BOTHC:
		case DB_GET_BOTH_RANGE:
		case DB_NEXT_DUP:
		case DB_SET:
		case DB_SET_RANGE:
			/* Key not found. */
			ret = DB_NOTFOUND;
			goto err;
		case DB_CURRENT:
			/*
			 * This should only happen if you are doing
			 * deletes and reading with concurrent threads
			 * and not doing proper locking.  We return
			 * the same error code as we would if the
			 * cursor were deleted.
			 */
			ret = DB_KEYEMPTY;
			goto err;
		default:
			DB_ASSERT(0);
		}
	}

	if (get_key == 0)
		/* Caller supplied the key; don't copy it back out. */
		F_SET(key, DB_DBT_ISSET);

err:	if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
		ret = t_ret;

	/* These flags are per-call; never leave them set across calls. */
	F_CLR(hcp, H_DUPONLY);
	F_CLR(hcp, H_NEXT_NODUP);

	return (ret);
}

/*
 * __ham_bulk -- Return bulk data from a hash table.
 */
static int
__ham_bulk(dbc, data, flags)
	DBC *dbc;
	DBT *data;
	u_int32_t flags;
{
	DB *dbp;
	DB_MPOOLFILE *mpf;
	HASH_CURSOR *cp;
	PAGE *pg;
	db_indx_t dup_len, dup_off, dup_tlen, indx, *inp;
	db_lockmode_t lock_mode;
	db_pgno_t pgno;
	int32_t *endp, key_off, *offp, *saveoff;
	u_int32_t key_size, size, space;
	u_int8_t *dbuf, *dp, *hk, *np, *tmp;
	int is_dup, is_key;
	int need_pg, next_key, no_dup, pagesize, ret, t_ret;

	ret = 0;
	key_off = 0;
	dup_len = dup_off = dup_tlen = 0;
	size = 0;
	dbp = dbc->dbp;
	pagesize = dbp->pgsize;
	mpf = dbp->mpf;
	cp = (HASH_CURSOR *)dbc->internal;
	/* DB_MULTIPLE_KEY returns key offsets as well as data offsets. */
	is_key = LF_ISSET(DB_MULTIPLE_KEY) ? 1 : 0;
	next_key = is_key && LF_ISSET(DB_OPFLAGS_MASK) != DB_NEXT_DUP;
	no_dup = LF_ISSET(DB_OPFLAGS_MASK) == DB_NEXT_NODUP;
	dbuf = data->data;
	/* np: next free data byte; dp: start of the current page image. */
	np = dp = dbuf;

	/* Keep track of space that is left.  There is an termination entry */
	space = data->ulen;
	space -= sizeof(*offp);

	/* Build the offset/size table from the end up. */
	endp = (int32_t *) ((u_int8_t *)dbuf + data->ulen);
	endp--;
	offp = endp;
	key_size = 0;

	lock_mode = F_ISSET(dbc, DBC_RMW) ?
	    DB_LOCK_WRITE: DB_LOCK_READ;

next_pg:
	need_pg = 1;
	indx = cp->indx;
	pg = cp->page;
	inp = P_INP(dbp, pg);

	do {
		if (is_key) {
			hk = H_PAIRKEY(dbp, pg, indx);
			if (HPAGE_PTYPE(hk) == H_OFFPAGE) {
				/* Off-page key: copy it into the buffer. */
				memcpy(&key_size,
				    HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
				memcpy(&pgno,
				    HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
				size = key_size;
				if (key_size > space)
					goto get_key_space;
				if ((ret = __bam_bulk_overflow(
				    dbc, key_size, pgno, np)) != 0)
					return (ret);
				space -= key_size;
				key_off = (int32_t)(np - dbuf);
				np += key_size;
			} else {
				if (need_pg) {
					/*
					 * Copy the whole page image once;
					 * subsequent items on this page are
					 * referenced by offset into it.
					 */
					dp = np;
					size = pagesize - HOFFSET(pg);
					if (space < size) {
get_key_space:
						/* Nothing returned yet? */
						if (offp == endp) {
							data->size = ALIGN(
							    size + pagesize,
							    sizeof(u_int32_t));
							return (ENOMEM);
						}
						goto back_up;
					}
					memcpy(dp,
					    (u_int8_t *)pg + HOFFSET(pg),
					    size);
					need_pg = 0;
					space -= size;
					np += size;
				}
				key_size = LEN_HKEY(dbp, pg, pagesize, indx);
				key_off = (int32_t)(inp[indx] - HOFFSET(pg) +
				    dp - dbuf + SSZA(HKEYDATA, data));
			}
		}

		hk = H_PAIRDATA(dbp, pg, indx);
		switch (HPAGE_PTYPE(hk)) {
		case H_DUPLICATE:
		case H_KEYDATA:
			if (need_pg) {
				dp = np;
				size = pagesize - HOFFSET(pg);
				if (space < size) {
back_up:
					/*
					 * Out of buffer space: rewind the
					 * cursor one pair so the next call
					 * resumes here.
					 */
					if (indx != 0) {
						indx -= 2;
						/* XXX
						 * It's not clear that this is
						 * the right way to fix this,
						 * but here goes.
						 * If we are backing up onto a
						 * duplicate, then we need to
						 * position ourselves at the
						 * end of the duplicate set.
						 * We probably need to make
						 * this work for H_OFFDUP too.
						 * It might be worth making a
						 * dummy cursor and calling
						 * __ham_item_prev.
						 */
						tmp = H_PAIRDATA(
						    dbp, pg, indx);
						if (HPAGE_PTYPE(tmp) ==
						    H_DUPLICATE) {
							dup_off = dup_tlen =
							    LEN_HDATA(dbp, pg,
							    pagesize,
							    indx + 1);
							memcpy(&dup_len,
							    HKEYDATA_DATA(tmp),
							    sizeof(db_indx_t));
						}
						goto get_space;
					}
					/* indx == 0 */
					if ((ret = __ham_item_prev(dbc,
					    lock_mode, &pgno)) != 0) {
						if (ret != DB_NOTFOUND)
							return (ret);
						if ((ret = mpf->put(mpf,
						    cp->page, 0)) != 0)
							return (ret);
						cp->page = NULL;
						if (cp->bucket == 0) {
							cp->indx = indx =
							    NDX_INVALID;
							goto get_space;
						}
						if ((ret =
						    __ham_get_meta(dbc)) != 0)
							return (ret);
						cp->bucket--;
						cp->pgno = BUCKET_TO_PAGE(cp,
						    cp->bucket);
						cp->indx = NDX_INVALID;
						if ((ret =
						    __ham_release_meta(
						    dbc)) != 0)
							return (ret);
						if ((ret = __ham_item_prev(dbc,
						    lock_mode, &pgno)) != 0)
							return (ret);
					}
					indx = cp->indx;
get_space:
					/*
					 * See if we put any data in the buffer.
					 */
					if (offp >= endp ||
					    F_ISSET(dbc, DBC_TRANSIENT)) {
						data->size = ALIGN(size +
						    data->ulen - space,
						    sizeof(u_int32_t));
						return (ENOMEM);
					}
					/*
					 * Don't continue; we're all out
					 * of space, even though we're
					 * returning success.
					 */
					next_key = 0;
					break;
				}
				memcpy(dp, (u_int8_t *)pg + HOFFSET(pg), size);
				need_pg = 0;
				space -= size;
				np += size;
			}

			/*
			 * We're about to crack the offset(s) and length(s)
			 * out of an H_KEYDATA or H_DUPLICATE item.
			 * There are three cases:
			 *   1. We were moved into a duplicate set by
			 *	the standard hash cursor code.  Respect
			 *	the dup_off and dup_tlen we were given.
			 *   2. We stumbled upon a duplicate set while
			 *	walking the page on our own.  We need to
			 *	recognize it as a dup and set dup_off and
			 *	dup_tlen.
			 *   3. The current item is not a dup.
			 */
			if (F_ISSET(cp, H_ISDUP)) {
				/* Case 1 */
				is_dup = 1;
				dup_len = cp->dup_len;
				dup_off = cp->dup_off;
				dup_tlen = cp->dup_tlen;
			} else if (HPAGE_PTYPE(hk) == H_DUPLICATE) {
				/* Case 2 */
				is_dup = 1;
				/*
				 * If we run out of memory and bail,
				 * make sure the fact we're in a dup set
				 * isn't ignored later.
				 */
				F_SET(cp, H_ISDUP);
				dup_off = 0;
				memcpy(&dup_len,
				    HKEYDATA_DATA(hk), sizeof(db_indx_t));
				dup_tlen = LEN_HDATA(dbp, pg, pagesize, indx);
			} else
				/* Case 3 */
				is_dup = dup_len = dup_off = dup_tlen = 0;

			do {
				space -= (is_key ? 4 : 2) * sizeof(*offp);
				size += (is_key ? 4 : 2) * sizeof(*offp);
				/*
				 * Since space is an unsigned, if we happen
				 * to wrap, then this comparison will turn out
				 * to be true.  XXX Wouldn't it be better to
				 * simply check above that space is greater
				 * than the value we're about to subtract???
				 */
				if (space > data->ulen) {
					if (!is_dup || dup_off == 0)
						goto back_up;
					dup_off -=
					    (db_indx_t)DUP_SIZE(offp[1]);
					goto get_space;
				}
				if (is_key) {
					*offp-- = key_off;
					*offp-- = key_size;
				}
				if (is_dup) {
					/* Offset of the dup element's data. */
					*offp-- = (int32_t)(
					    inp[indx + 1] - HOFFSET(pg) +
					    dp - dbuf + SSZA(HKEYDATA, data) +
					    dup_off + sizeof(db_indx_t));
					memcpy(&dup_len,
					    HKEYDATA_DATA(hk) + dup_off,
					    sizeof(db_indx_t));
					dup_off += DUP_SIZE(dup_len);
					*offp-- = dup_len;
				} else {
					*offp-- = (int32_t)(
					    inp[indx + 1] - HOFFSET(pg) +
					    dp - dbuf + SSZA(HKEYDATA, data));
					*offp-- = LEN_HDATA(dbp, pg,
					    pagesize, indx);
				}
			} while (is_dup && dup_off < dup_tlen && no_dup == 0);
			F_CLR(cp, H_ISDUP);
			break;
		case H_OFFDUP:
			/* Off-page dup tree: delegate to the btree walker. */
			memcpy(&pgno, HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
			space -= 2 * sizeof(*offp);
			if (space > data->ulen)
				goto back_up;

			if (is_key) {
				space -= 2 * sizeof(*offp);
				if (space > data->ulen)
					goto back_up;
				*offp-- = key_off;
				*offp-- = key_size;
			}
			saveoff = offp;
			if ((ret = __bam_bulk_duplicates(dbc, pgno,
			    dbuf, is_key ? offp + 2 : NULL,
			    &offp, &np, &space, no_dup)) != 0) {
				if (ret == ENOMEM) {
					size = space;
					/* If nothing was added, back up. */
					if (is_key && saveoff == offp) {
						offp += 2;
						goto back_up;
					}
					goto get_space;
				}
				return (ret);
			}
			break;
		case H_OFFPAGE:
			/* Off-page (overflow) data item: copy it in. */
			space -= (is_key ? 4 : 2) * sizeof(*offp);
			if (space > data->ulen)
				goto back_up;

			memcpy(&size, HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
			memcpy(&pgno, HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
			if (size > space)
				goto back_up;

			if ((ret =
			    __bam_bulk_overflow(dbc, size, pgno, np)) != 0)
				return (ret);

			if (is_key) {
				*offp-- = key_off;
				*offp-- = key_size;
			}
			*offp-- = (int32_t)(np - dbuf);
			*offp-- = size;
			np += size;
			space -= size;
			break;
		}
	} while (next_key && (indx += 2) < NUM_ENT(pg));

	/* Save our position in the cursor. */
	cp->indx = indx;
	cp->dup_len = dup_len;
	cp->dup_off = dup_off;
	cp->dup_tlen = dup_tlen;

	/* If we are off the page then try to the next page. */
	if (ret == 0 && next_key && indx >= NUM_ENT(pg)) {
		if ((ret = __ham_item_next(dbc, lock_mode, &pgno)) == 0)
			goto next_pg;
		if (ret != DB_NOTFOUND)
			return (ret);
		if ((ret = mpf->put(dbc->dbp->mpf, cp->page, 0)) != 0)
			return (ret);
		cp->page = NULL;
		if ((ret = __ham_get_meta(dbc)) != 0)
			return (ret);
		cp->bucket++;
		if (cp->bucket > cp->hdr->max_bucket) {
			/*
			 * Restore cursor to its previous state.  We're past
			 * the last item in the last bucket, so the next
			 * DBC->c_get(DB_NEXT) will return DB_NOTFOUND.
			 */
			cp->bucket--;
			ret = DB_NOTFOUND;
		} else {
			/*
			 * Start on the next bucket.
			 *
			 * Note that if this new bucket happens to be empty,
			 * but there's another non-empty bucket after it,
			 * we'll return early.  This is a rare case, and we
			 * don't guarantee any particular number of keys
			 * returned on each call, so just let the next call
			 * to bulk get move forward by yet another bucket.
			 */
			cp->pgno = BUCKET_TO_PAGE(cp, cp->bucket);
			cp->indx = NDX_INVALID;
			F_CLR(cp, H_ISDUP);
			ret = __ham_item_next(dbc, lock_mode, &pgno);
		}

		if ((t_ret = __ham_release_meta(dbc)) != 0)
			return (t_ret);
		if (ret == 0)
			goto next_pg;
		if (ret != DB_NOTFOUND)
			return (ret);
	}
	/* Write the offset-table terminator. */
	*offp = (u_int32_t) -1;
	return (0);
}

/*
 * __ham_c_put --
 *	Hash access-method cursor put.  Handles the add/overwrite cases
 *	for on-page items; when the target is an off-page duplicate tree,
 *	*pgnop is set and the caller descends into it.
 */
static int
__ham_c_put(dbc, key, data, flags, pgnop)
	DBC *dbc;
	DBT *key;
	DBT *data;
	u_int32_t flags;
	db_pgno_t *pgnop;
{
	DB *dbp;
	DB_MPOOLFILE *mpf;
	DBT tmp_val, *myval;
	HASH_CURSOR *hcp;
	u_int32_t nbytes;
	int ret, t_ret;

	/*
	 * The compiler doesn't realize that we only use this when ret is
	 * equal to 0 and that if ret is equal to 0, that we must have set
	 * myval.  So, we initialize it here to shut the compiler up.
	 */
	COMPQUIET(myval, NULL);

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	hcp = (HASH_CURSOR *)dbc->internal;

	/* A deleted cursor can only be reused with key-based put flags. */
	if (F_ISSET(hcp, H_DELETED) &&
	    flags != DB_KEYFIRST && flags != DB_KEYLAST)
		return (DB_NOTFOUND);

	if ((ret = __ham_get_meta(dbc)) != 0)
		goto err1;

	switch (flags) {
	case DB_KEYLAST:
	case DB_KEYFIRST:
	case DB_NODUPDATA:
		/* Estimate the on-page footprint of the new pair. */
		nbytes = (ISBIG(hcp, key->size) ? HOFFPAGE_PSIZE :
		    HKEYDATA_PSIZE(key->size)) +
		    (ISBIG(hcp, data->size) ? HOFFPAGE_PSIZE :
		    HKEYDATA_PSIZE(data->size));
		if ((ret = __ham_lookup(dbc,
		    key, nbytes, DB_LOCK_WRITE, pgnop)) == DB_NOTFOUND) {
			ret = 0;
			if (hcp->seek_found_page != PGNO_INVALID &&
			    hcp->seek_found_page != hcp->pgno) {
				/* Move to the page with room for the item. */
				if ((ret = mpf->put(mpf, hcp->page, 0)) != 0)
					goto err2;
				hcp->page = NULL;
				hcp->pgno = hcp->seek_found_page;
				hcp->indx = NDX_INVALID;
			}

			if (F_ISSET(data, DB_DBT_PARTIAL) &&
			    data->doff != 0) {
				/*
				 * A partial put, but the key does not exist
				 * and we are not beginning the write at 0.
				 * We must create a data item padded up to doff
				 * and then write the new bytes represented by
				 * val.
				 */
				if ((ret = __ham_init_dbt(dbp->dbenv,
				    &tmp_val, data->size + data->doff,
				    &dbc->my_rdata.data,
				    &dbc->my_rdata.ulen)) == 0) {
					/* Zero-pad [0, doff), then copy. */
					memset(tmp_val.data, 0, data->doff);
					memcpy((u_int8_t *)tmp_val.data +
					    data->doff,
					    data->data, data->size);
					myval = &tmp_val;
				}
			} else
				myval = (DBT *)data;

			if (ret == 0)
				ret = __ham_add_el(dbc,
				    key, myval, H_KEYDATA);
			goto done;
		}
		break;
	case DB_BEFORE:
	case DB_AFTER:
	case DB_CURRENT:
		ret = __ham_item(dbc, DB_LOCK_WRITE, pgnop);
		break;
	}

	if (*pgnop == PGNO_INVALID && ret == 0) {
		/* Overwrite unless duplicates are permitted/requested. */
		if (flags == DB_CURRENT ||
		    ((flags == DB_KEYFIRST ||
		    flags == DB_KEYLAST || flags == DB_NODUPDATA) &&
		    !(F_ISSET(dbp, DB_AM_DUP) ||
		    F_ISSET(key, DB_DBT_DUPOK))))
			ret = __ham_overwrite(dbc, data, flags);
		else
			ret = __ham_add_dup(dbc, data, flags, pgnop);
	}

done:	if (ret == 0 && F_ISSET(hcp, H_EXPAND)) {
		/* The fill factor was exceeded; grow the table. */
		ret = __ham_expand_table(dbc);
		F_CLR(hcp, H_EXPAND);
	}

	if (hcp->page != NULL &&
	    (t_ret = mpf->set(mpf, hcp->page, DB_MPOOL_DIRTY)) != 0 &&
	    ret == 0)
		ret = t_ret;

err2:	if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
		ret = t_ret;

err1:	return (ret);
}

/********************************* UTILITIES ************************/

/*
 * __ham_expand_table --
 *	Add one bucket to the linear-hash table, allocating a new page
 *	group when a doubling begins, logging the meta-data split, and
 *	relocating records into the new bucket.
 */
static int
__ham_expand_table(dbc)
	DBC *dbc;
{
	DB *dbp;
	DB_LOCK metalock;
	DB_LSN lsn;
	DB_MPOOLFILE *mpf;
	DBMETA *mmeta;
	HASH_CURSOR *hcp;
	PAGE *h;
	db_pgno_t pgno, mpgno;
	u_int32_t newalloc, new_bucket, old_bucket;
	int dirty_meta, got_meta, logn, new_double, ret;

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	hcp = (HASH_CURSOR *)dbc->internal;
	if ((ret = __ham_dirty_meta(dbc)) != 0)
		return (ret);

	LOCK_INIT(metalock);
	mmeta = (DBMETA *) hcp->hdr;
	mpgno = mmeta->pgno;
	h = NULL;
	dirty_meta = 0;
	got_meta = 0;
	newalloc = 0;

	/*
	 * If the split point is about to increase, make sure that we
	 * have enough extra pages.  The calculation here is weird.
	 * We'd like to do this after we've upped max_bucket, but it's
	 * too late then because we've logged the meta-data split.  What
	 * we'll do between then and now is increment max bucket and then
	 * see what the log of one greater than that is; here we have to
	 * look at the log of max + 2.  VERY NASTY STUFF.
	 *
	 * We figure out what we need to do, then we log it, then request
	 * the pages from mpool.  We don't want to fail after extending
	 * the file.
	 *
	 * If the page we are about to split into has already been allocated,
	 * then we simply need to get it to get its LSN.  If it hasn't yet
	 * been allocated, then we know it's LSN (0,0).
	 */
	new_bucket = hcp->hdr->max_bucket + 1;
	old_bucket = new_bucket & hcp->hdr->low_mask;
	new_double = hcp->hdr->max_bucket == hcp->hdr->high_mask;
	logn = __db_log2(new_bucket);

	if (!new_double || hcp->hdr->spares[logn + 1] != PGNO_INVALID) {
		/* Page exists; get it so we can get its LSN */
		pgno = BUCKET_TO_PAGE(hcp, new_bucket);
		if ((ret =
		    mpf->get(mpf, &pgno, DB_MPOOL_CREATE, &h)) != 0)
			goto err;
		lsn = h->lsn;
	} else {
		/* Get the master meta-data page to do allocation. */
		if (F_ISSET(dbp, DB_AM_SUBDB)) {
			mpgno = PGNO_BASE_MD;
			if ((ret = __db_lget(dbc,
			    0, mpgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
				goto err;
			if ((ret = mpf->get(mpf,
			    &mpgno, 0, (PAGE **)&mmeta)) != 0)
				goto err;
			got_meta = 1;
		}
		pgno = mmeta->last_pgno + 1;
		ZERO_LSN(lsn);
		newalloc = 1;
	}

	/* Log the meta-data split first. */
	if (DBC_LOGGING(dbc)) {
		/*
		 * We always log the page number of the first page of
		 * the allocation group.  However, the LSN that we log
		 * is either the LSN on the first page (if we did not
		 * do the actual allocation here) or the LSN on the last
		 * page of the unit (if we did do the allocation here).
		 */
		if ((ret = __ham_metagroup_log(dbp, dbc->txn,
		    &lsn, 0, hcp->hdr->max_bucket, mpgno, &mmeta->lsn,
		    hcp->hdr->dbmeta.pgno, &hcp->hdr->dbmeta.lsn,
		    pgno, &lsn, newalloc)) != 0)
			goto err;
	} else
		LSN_NOT_LOGGED(lsn);

	hcp->hdr->dbmeta.lsn = lsn;

	if (new_double && hcp->hdr->spares[logn + 1] == PGNO_INVALID) {
		/*
		 * We need to begin a new doubling and we have not allocated
		 * any pages yet.  Read the last page in and initialize it to
		 * make the allocation contiguous.  The pgno we calculated
		 * above is the first page allocated.  The entry in spares is
		 * that page number minus any buckets already allocated (it
		 * simplifies bucket to page transaction).  After we've set
		 * that, we calculate the last pgno.
		 */
		hcp->hdr->spares[logn + 1] = pgno - new_bucket;
		pgno += hcp->hdr->max_bucket;
		mmeta->last_pgno = pgno;
		mmeta->lsn = lsn;
		dirty_meta = DB_MPOOL_DIRTY;

		if ((ret =
		    mpf->get(mpf, &pgno, DB_MPOOL_CREATE, &h)) != 0)
			goto err;

		P_INIT(h, dbp->pgsize,
		    pgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
	}

	/* Write out whatever page we ended up modifying. */
	h->lsn = lsn;
	if ((ret = mpf->put(mpf, h, DB_MPOOL_DIRTY)) != 0)
		goto err;
	h = NULL;

	/*
	 * Update the meta-data page of this hash database.
	 */
	hcp->hdr->max_bucket = new_bucket;
	if (new_double) {
		hcp->hdr->low_mask = hcp->hdr->high_mask;
		hcp->hdr->high_mask = new_bucket | hcp->hdr->low_mask;
	}

	/* Relocate records to the new bucket */
	ret = __ham_split_page(dbc, old_bucket, new_bucket);

err:	if (got_meta)
		(void)mpf->put(mpf, mmeta, dirty_meta);

	if (LOCK_ISSET(metalock))
		(void)__TLPUT(dbc, metalock);

	if (h != NULL)
		(void)mpf->put(mpf, h, 0);

	return (ret);
}

/*
 * __ham_call_hash --
 *	Hash the key and map the hash value to a bucket, applying the
 *	standard linear-hashing mask adjustment for buckets that have
 *	not yet been split.
 *
 * PUBLIC: u_int32_t __ham_call_hash __P((DBC *, u_int8_t *, int32_t));
 */
u_int32_t
__ham_call_hash(dbc, k, len)
	DBC *dbc;
	u_int8_t *k;
	int32_t len;
{
	DB *dbp;
	u_int32_t n, bucket;
	HASH_CURSOR *hcp;
	HASH *hashp;

	dbp = dbc->dbp;
	hcp = (HASH_CURSOR *)dbc->internal;
	hashp = dbp->h_internal;

	n = (u_int32_t)(hashp->h_hash(dbp, k, len));

	/* If the bucket doesn't exist yet, fall back to the smaller mask. */
	bucket = n & hcp->hdr->high_mask;
	if (bucket > hcp->hdr->max_bucket)
		bucket = bucket & hcp->hdr->low_mask;
	return (bucket);
}

/*
 * Check for duplicates, and call __db_ret appropriately.  Release
 * everything held by the cursor.
 */
static int
__ham_dup_return(dbc, val, flags)
	DBC *dbc;
	DBT *val;
	u_int32_t flags;
{
	DB *dbp;
	HASH_CURSOR *hcp;
	PAGE *pp;
	DBT *myval, tmp_val;
	db_indx_t ndx;
	db_pgno_t pgno;
	u_int32_t off, tlen;
	u_int8_t *hk, type;
	int cmp, ret;
	db_indx_t len;

	/* Check for duplicate and return the first one. */
	dbp = dbc->dbp;
	hcp = (HASH_CURSOR *)dbc->internal;
	ndx = H_DATAINDEX(hcp->indx);
	type = HPAGE_TYPE(dbp, hcp->page, ndx);
	pp = hcp->page;
	myval = val;

	/*
	 * There are 4 cases:
	 * 1. We are not in duplicate, simply return; the upper layer
	 *    will do the right thing.
	 * 2. We are looking at keys and stumbled onto a duplicate.
	 * 3. We are in the middle of a duplicate set. (ISDUP set)
	 * 4. We need to check for particular data match.
	 */

	/* We should never get here with off-page dups. */
	DB_ASSERT(type != H_OFFDUP);

	/* Case 1 */
	if (type != H_DUPLICATE && flags != DB_GET_BOTH &&
	    flags != DB_GET_BOTHC && flags != DB_GET_BOTH_RANGE)
		return (0);

	/*
	 * Here we check for the case where we just stumbled onto a
	 * duplicate.  In this case, we do initialization and then
	 * let the normal duplicate code handle it. (Case 2)
	 */
	if (!F_ISSET(hcp, H_ISDUP) && type == H_DUPLICATE) {
		F_SET(hcp, H_ISDUP);
		hcp->dup_tlen = LEN_HDATA(dbp, hcp->page,
		    hcp->hdr->dbmeta.pagesize, hcp->indx);
		hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
		if (flags == DB_LAST ||
		    flags == DB_PREV || flags == DB_PREV_NODUP) {
			/*
			 * Backward movement: walk the len/data/len-encoded
			 * set to find the offset of the last element.
			 */
			hcp->dup_off = 0;
			do {
				memcpy(&len,
				    HKEYDATA_DATA(hk) + hcp->dup_off,
				    sizeof(db_indx_t));
				hcp->dup_off += DUP_SIZE(len);
			} while (hcp->dup_off < hcp->dup_tlen);
			hcp->dup_off -= DUP_SIZE(len);
		} else {
			memcpy(&len,
			    HKEYDATA_DATA(hk), sizeof(db_indx_t));
			hcp->dup_off = 0;
		}
		hcp->dup_len = len;
	}

	/*
	 * If we are retrieving a specific key/data pair, then we
	 * may need to adjust the cursor before returning data.
	 * Case 4
	 */
	if (flags == DB_GET_BOTH ||
	    flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) {
		if (F_ISSET(hcp, H_ISDUP)) {
			/*
			 * If we're doing a join, search forward from the
			 * current position, not the beginning of the dup set.
			 */
			if (flags == DB_GET_BOTHC)
				F_SET(hcp, H_CONTINUE);

			__ham_dsearch(dbc, val, &off, &cmp, flags);

			/*
			 * This flag is set nowhere else and is safe to
			 * clear unconditionally.
			 */
			F_CLR(hcp, H_CONTINUE);
			hcp->dup_off = off;
		} else {
			hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
			if (((HKEYDATA *)hk)->type == H_OFFPAGE) {
				/* Compare against an off-page data item. */
				memcpy(&tlen,
				    HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
				memcpy(&pgno,
				    HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
				if ((ret = __db_moff(dbp, val, pgno, tlen,
				    dbp->dup_compare, &cmp)) != 0)
					return (ret);
			} else {
				/*
				 * We do not zero tmp_val since the comparison
				 * routines may only look at data and size.
				 */
				tmp_val.data = HKEYDATA_DATA(hk);
				tmp_val.size = LEN_HDATA(dbp, hcp->page,
				    dbp->pgsize, hcp->indx);
				cmp = dbp->dup_compare == NULL ?
				    __bam_defcmp(dbp, &tmp_val, val) :
				    dbp->dup_compare(dbp, &tmp_val, val);
			}
		}

		if (cmp != 0)
			return (DB_NOTFOUND);
	}

	/*
	 * If we're doing a bulk get, we don't want to actually return
	 * the data:  __ham_bulk will take care of cracking out the
	 * duplicates appropriately.
	 *
	 * The rest of this function calculates partial offsets and
	 * handles the actual __db_ret, so just return if
	 * DB_MULTIPLE(_KEY) is set.
	 */
	if (F_ISSET(dbc, DBC_MULTIPLE | DBC_MULTIPLE_KEY))
		return (0);

	/*
	 * Now, everything is initialized, grab a duplicate if
	 * necessary.
	 */
	if (F_ISSET(hcp, H_ISDUP)) {	/* Case 3 */
		/*
		 * Copy the DBT in case we are retrieving into user
		 * memory and we need the parameters for it.  If the
		 * user requested a partial, then we need to adjust
		 * the user's parameters to get the partial of the
		 * duplicate which is itself a partial.
		 */
		memcpy(&tmp_val, val, sizeof(*val));
		if (F_ISSET(&tmp_val, DB_DBT_PARTIAL)) {
			/*
			 * Take the user's length unless it would go
			 * beyond the end of the duplicate.
			 */
			if (tmp_val.doff + hcp->dup_off > hcp->dup_len)
				tmp_val.dlen = 0;
			else if (tmp_val.dlen + tmp_val.doff > hcp->dup_len)
				tmp_val.dlen = hcp->dup_len - tmp_val.doff;

			/*
			 * Calculate the new offset.
			 */
			tmp_val.doff += hcp->dup_off;
		} else {
			/*
			 * Express "one dup element" as a partial over the
			 * whole pair; skip the leading length word.
			 */
			F_SET(&tmp_val, DB_DBT_PARTIAL);
			tmp_val.dlen = hcp->dup_len;
			tmp_val.doff = hcp->dup_off + sizeof(db_indx_t);
		}
		myval = &tmp_val;
	}

	/*
	 * Finally, if we had a duplicate, pp, ndx, and myval should be
	 * set appropriately.
	 */
	if ((ret = __db_ret(dbp, pp, ndx, myval, &dbc->rdata->data,
	    &dbc->rdata->ulen)) != 0)
		return (ret);

	/*
	 * In case we sent a temporary off to db_ret, set the real
	 * return values.
	 */
	val->data = myval->data;
	val->size = myval->size;

	F_SET(val, DB_DBT_ISSET);

	return (0);
}

/*
 * __ham_overwrite --
 *	Replace the data item under the cursor.  A partial overwrite of an
 *	on-page duplicate element rebuilds the element in a temporary
 *	buffer; if the result exceeds the on-page duplicate threshold the
 *	set is converted to off-page duplicates first.
 */
static int
__ham_overwrite(dbc, nval, flags)
	DBC *dbc;
	DBT *nval;
	u_int32_t flags;
{
	DB *dbp;
	DB_ENV *dbenv;
	HASH_CURSOR *hcp;
	DBT *myval, tmp_val, tmp_val2;
	void *newrec;
	u_int8_t *hk, *p;
	u_int32_t len, nondup_size;
	db_indx_t newsize;
	int ret;

	dbp = dbc->dbp;
	dbenv = dbp->dbenv;
	hcp = (HASH_CURSOR *)dbc->internal;
	if (F_ISSET(hcp, H_ISDUP)) {
		/*
		 * This is an overwrite of a duplicate.  We should never
		 * be off-page at this point.
		 */
		DB_ASSERT(hcp->opd == NULL);
		/* On page dups */
		if (F_ISSET(nval, DB_DBT_PARTIAL)) {
			/*
			 * We're going to have to get the current item, then
			 * construct the record, do any padding and do a
			 * replace.
			 */
			memset(&tmp_val, 0, sizeof(tmp_val));
			if ((ret =
			    __ham_dup_return(dbc,
			    &tmp_val, DB_CURRENT)) != 0)
				return (ret);

			/* Figure out new size. */
			nondup_size = tmp_val.size;
			newsize = nondup_size;

			/*
			 * Three cases:
			 * 1. strictly append (may need to allocate space
			 *    for pad bytes; really gross).
			 * 2. overwrite some and append.
			 * 3. strictly overwrite.
			 */
			if (nval->doff > nondup_size)
				newsize +=
				    (nval->doff - nondup_size + nval->size);
			else if (nval->doff + nval->dlen > nondup_size)
				newsize +=
				    nval->size - (nondup_size - nval->doff);
			else
				newsize += nval->size - nval->dlen;

			/*
			 * Make sure that the new size doesn't put us over
			 * the onpage duplicate size in which case we need
			 * to convert to off-page duplicates.
			 */
			if (ISBIG(hcp,
			    hcp->dup_tlen - nondup_size + newsize)) {
				if ((ret = __ham_dup_convert(dbc)) != 0)
					return (ret);
				return (hcp->opd->c_am_put(hcp->opd,
				    NULL, nval, flags, NULL));
			}

			if ((ret = __os_malloc(dbp->dbenv,
			    DUP_SIZE(newsize), &newrec)) != 0)
				return (ret);
			memset(&tmp_val2, 0, sizeof(tmp_val2));
			F_SET(&tmp_val2, DB_DBT_PARTIAL);

			/* Construct the record. */
			p = newrec;
			/* Initial size. */
			memcpy(p, &newsize, sizeof(db_indx_t));
			p += sizeof(db_indx_t);

			/* First part of original record. */
			len = nval->doff > tmp_val.size
			    ? tmp_val.size : nval->doff;
			memcpy(p, tmp_val.data, len);
			p += len;

			if (nval->doff > tmp_val.size) {
				/* Padding */
				memset(p, 0, nval->doff - tmp_val.size);
				p += nval->doff - tmp_val.size;
			}

			/* New bytes */
			memcpy(p, nval->data, nval->size);
			p += nval->size;

			/* End of original record (if there is any) */
			if (nval->doff + nval->dlen < tmp_val.size) {
				len = tmp_val.size -
				    nval->doff - nval->dlen;
				memcpy(p, (u_int8_t *)tmp_val.data +
				    nval->doff + nval->dlen, len);
				p += len;
			}

			/* Final size. */
			memcpy(p, &newsize, sizeof(db_indx_t));

			/*
			 * Make sure that the caller isn't corrupting
			 * the sort order.
*/ if (dbp->dup_compare != NULL) { tmp_val2.data = (u_int8_t *)newrec + sizeof(db_indx_t); tmp_val2.size = newsize; if (dbp->dup_compare( dbp, &tmp_val, &tmp_val2) != 0) { (void)__os_free(dbenv, newrec); return (__db_duperr(dbp, flags)); } } tmp_val2.data = newrec; tmp_val2.size = DUP_SIZE(newsize); tmp_val2.doff = hcp->dup_off; tmp_val2.dlen = DUP_SIZE(hcp->dup_len); ret = __ham_replpair(dbc, &tmp_val2, 0); (void)__os_free(dbenv, newrec); /* Update cursor */ if (ret != 0) return (ret); if (newsize > nondup_size) hcp->dup_tlen += (newsize - nondup_size); else hcp->dup_tlen -= (nondup_size - newsize); hcp->dup_len = DUP_SIZE(newsize); return (0); } else { /* Check whether we need to convert to off page. */ if (ISBIG(hcp, hcp->dup_tlen - hcp->dup_len + nval->size)) { if ((ret = __ham_dup_convert(dbc)) != 0) return (ret); return (hcp->opd->c_am_put(hcp->opd, NULL, nval, flags, NULL)); } /* Make sure we maintain sort order. */ if (dbp->dup_compare != NULL) { tmp_val2.data = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page, hcp->indx)) + hcp->dup_off + sizeof(db_indx_t); tmp_val2.size = hcp->dup_len; if (dbp->dup_compare(dbp, nval, &tmp_val2) != 0) return (EINVAL); } /* Overwriting a complete duplicate. */ if ((ret = __ham_make_dup(dbp->dbenv, nval, &tmp_val, &dbc->my_rdata.data, &dbc->my_rdata.ulen)) != 0) return (ret); /* Now fix what we are replacing. 
*/ tmp_val.doff = hcp->dup_off; tmp_val.dlen = DUP_SIZE(hcp->dup_len); /* Update cursor */ if (nval->size > hcp->dup_len) hcp->dup_tlen += (nval->size - hcp->dup_len); else hcp->dup_tlen -= (hcp->dup_len - nval->size); hcp->dup_len = (db_indx_t)DUP_SIZE(nval->size); } myval = &tmp_val; } else if (!F_ISSET(nval, DB_DBT_PARTIAL)) { /* Put/overwrite */ memcpy(&tmp_val, nval, sizeof(*nval)); F_SET(&tmp_val, DB_DBT_PARTIAL); tmp_val.doff = 0; hk = H_PAIRDATA(dbp, hcp->page, hcp->indx); if (HPAGE_PTYPE(hk) == H_OFFPAGE) memcpy(&tmp_val.dlen, HOFFPAGE_TLEN(hk), sizeof(u_int32_t)); else tmp_val.dlen = LEN_HDATA(dbp, hcp->page, hcp->hdr->dbmeta.pagesize, hcp->indx); myval = &tmp_val; } else /* Regular partial put */ myval = nval; return (__ham_replpair(dbc, myval, 0)); } /* * Given a key and a cursor, sets the cursor to the page/ndx on which * the key resides. If the key is found, the cursor H_OK flag is set * and the pagep, bndx, pgno (dpagep, dndx, dpgno) fields are set. * If the key is not found, the H_OK flag is not set. If the sought * field is non-0, the pagep, bndx, pgno (dpagep, dndx, dpgno) fields * are set indicating where an add might take place. If it is 0, * non of the cursor pointer field are valid. */ static int __ham_lookup(dbc, key, sought, mode, pgnop) DBC *dbc; const DBT *key; u_int32_t sought; db_lockmode_t mode; db_pgno_t *pgnop; { DB *dbp; HASH_CURSOR *hcp; db_pgno_t pgno; u_int32_t tlen; int match, ret; u_int8_t *hk, *dk; dbp = dbc->dbp; hcp = (HASH_CURSOR *)dbc->internal; /* * Set up cursor so that we're looking for space to add an item * as we cycle through the pages looking for the key. 
*/ if ((ret = __ham_item_reset(dbc)) != 0) return (ret); hcp->seek_size = sought; hcp->bucket = __ham_call_hash(dbc, (u_int8_t *)key->data, key->size); hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket); for (;;) { *pgnop = PGNO_INVALID; if ((ret = __ham_item_next(dbc, mode, pgnop)) != 0) return (ret); if (F_ISSET(hcp, H_NOMORE)) break; hk = H_PAIRKEY(dbp, hcp->page, hcp->indx); switch (HPAGE_PTYPE(hk)) { case H_OFFPAGE: memcpy(&tlen, HOFFPAGE_TLEN(hk), sizeof(u_int32_t)); if (tlen == key->size) { memcpy(&pgno, HOFFPAGE_PGNO(hk), sizeof(db_pgno_t)); if ((ret = __db_moff(dbp, key, pgno, tlen, NULL, &match)) != 0) return (ret); if (match == 0) goto found_key; } break; case H_KEYDATA: if (key->size == LEN_HKEY(dbp, hcp->page, dbp->pgsize, hcp->indx) && memcmp(key->data, HKEYDATA_DATA(hk), key->size) == 0) { /* Found the key, check for data type. */ found_key: F_SET(hcp, H_OK); dk = H_PAIRDATA(dbp, hcp->page, hcp->indx); if (HPAGE_PTYPE(dk) == H_OFFDUP) memcpy(pgnop, HOFFDUP_PGNO(dk), sizeof(db_pgno_t)); return (0); } break; case H_DUPLICATE: case H_OFFDUP: /* * These are errors because keys are never * duplicated, only data items are. */ return (__db_pgfmt(dbp->dbenv, PGNO(hcp->page))); } } /* * Item was not found. */ if (sought != 0) return (ret); return (ret); } /* * __ham_init_dbt -- * Initialize a dbt using some possibly already allocated storage * for items. * * PUBLIC: int __ham_init_dbt __P((DB_ENV *, * PUBLIC: DBT *, u_int32_t, void **, u_int32_t *)); */ int __ham_init_dbt(dbenv, dbt, size, bufp, sizep) DB_ENV *dbenv; DBT *dbt; u_int32_t size; void **bufp; u_int32_t *sizep; { int ret; memset(dbt, 0, sizeof(*dbt)); if (*sizep < size) { if ((ret = __os_realloc(dbenv, size, bufp)) != 0) { *sizep = 0; return (ret); } *sizep = size; } dbt->data = *bufp; dbt->size = size; return (0); } /* * Adjust the cursor after an insert or delete. The cursor passed is * the one that was operated upon; we just need to check any of the * others. 
 *
 * len indicates the length of the item added/deleted
 * add indicates if the item indicated by the cursor has just been
 * added (add == 1) or deleted (add == 0).
 * dup indicates if the addition occurred into a duplicate set.
 *
 * PUBLIC: int __ham_c_update
 * PUBLIC:    __P((DBC *, u_int32_t, int, int));
 */
int
__ham_c_update(dbc, len, add, is_dup)
	DBC *dbc;
	u_int32_t len;
	int add, is_dup;
{
	DB *dbp, *ldbp;
	DBC *cp;
	DB_ENV *dbenv;
	DB_LSN lsn;
	DB_TXN *my_txn;
	HASH_CURSOR *hcp, *lcp;
	int found, ret;
	u_int32_t order;

	dbp = dbc->dbp;
	dbenv = dbp->dbenv;
	hcp = (HASH_CURSOR *)dbc->internal;

	/*
	 * Adjustment will only be logged if this is a subtransaction.
	 * Only subtransactions can abort and effect their parent
	 * transactions cursors.
	 */
	my_txn = IS_SUBTRANSACTION(dbc->txn) ? dbc->txn : NULL;
	found = 0;

	MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp);

	/*
	 * Calculate the order of this deleted record.
	 * This will be one greater than any cursor that is pointing
	 * at this record and already marked as deleted.
	 */
	order = 0;
	if (!add) {
		order = 1;
		/*
		 * Scan every open handle on this physical file (same
		 * adj_fileid) and every active cursor on each handle.
		 */
		for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
		    ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
		    ldbp = LIST_NEXT(ldbp, dblistlinks)) {
			MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
			for (cp = TAILQ_FIRST(&ldbp->active_queue); cp != NULL;
			    cp = TAILQ_NEXT(cp, links)) {
				if (cp == dbc || cp->dbtype != DB_HASH)
					continue;
				lcp = (HASH_CURSOR *)cp->internal;
				if (F_ISSET(lcp, H_DELETED) &&
				    hcp->pgno == lcp->pgno &&
				    hcp->indx == lcp->indx &&
				    order <= lcp->order &&
				    (!is_dup || hcp->dup_off == lcp->dup_off))
					order = lcp->order + 1;
			}
			MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
		}
		hcp->order = order;
	}

	/* Second pass: adjust every other hash cursor on this page. */
	for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
	    ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
	    ldbp = LIST_NEXT(ldbp, dblistlinks)) {
		MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
		for (cp = TAILQ_FIRST(&ldbp->active_queue); cp != NULL;
		    cp = TAILQ_NEXT(cp, links)) {
			if (cp == dbc || cp->dbtype != DB_HASH)
				continue;

			lcp = (HASH_CURSOR *)cp->internal;

			if (lcp->pgno != hcp->pgno || lcp->indx == NDX_INVALID)
				continue;

			/*
			 * A cursor belonging to a different transaction was
			 * adjusted; that is what we must log below.
			 */
			if (my_txn != NULL && cp->txn != my_txn)
				found = 1;

			if (!is_dup) {
				if (add) {
					/*
					 * This routine is not called to add
					 * non-dup records which are always put
					 * at the end.  It is only called from
					 * recovery in this case and the
					 * cursor will be marked deleted.
					 * We are "undeleting" so unmark all
					 * cursors with the same order.
					 */
					if (lcp->indx == hcp->indx &&
					    F_ISSET(lcp, H_DELETED)) {
						if (lcp->order == hcp->order)
							F_CLR(lcp, H_DELETED);
						else if (lcp->order >
						    hcp->order) {
							/*
							 * If we've moved this
							 * cursor's index, split
							 * its order number --
							 * i.e., decrement it by
							 * enough so that the
							 * lowest cursor moved
							 * has order 1.
							 * cp_arg->order is the
							 * split point, so
							 * decrement by one less
							 * than that.
							 */
							lcp->order -=
							    (hcp->order - 1);
							/*
							 * Each pair occupies
							 * two page-index slots
							 * (key and data).
							 */
							lcp->indx += 2;
						}
					} else if (lcp->indx >= hcp->indx)
						lcp->indx += 2;
				} else {
					if (lcp->indx > hcp->indx) {
						lcp->indx -= 2;
						if (lcp->indx == hcp->indx &&
						    F_ISSET(lcp, H_DELETED))
							lcp->order += order;
					} else if (lcp->indx == hcp->indx &&
					    !F_ISSET(lcp, H_DELETED)) {
						F_SET(lcp, H_DELETED);
						F_CLR(lcp, H_ISDUP);
						lcp->order = order;
					}
				}
			} else if (lcp->indx == hcp->indx) {
				/*
				 * Handle duplicates.  This routine is
				 * only called for on page dups.
				 * Off page dups are handled by btree/rtree
				 * code.
				 */
				if (add) {
					lcp->dup_tlen += len;
					if (lcp->dup_off == hcp->dup_off &&
					    F_ISSET(hcp, H_DELETED) &&
					    F_ISSET(lcp, H_DELETED)) {
						/* Abort of a delete. */
						if (lcp->order == hcp->order)
							F_CLR(lcp, H_DELETED);
						else if (lcp->order >
						    hcp->order) {
							lcp->order -=
							    (hcp->order -1);
							lcp->dup_off += len;
						}
					} else if (lcp->dup_off >=
					    hcp->dup_off)
						lcp->dup_off += len;
				} else {
					lcp->dup_tlen -= len;
					if (lcp->dup_off > hcp->dup_off) {
						lcp->dup_off -= len;
						if (lcp->dup_off ==
						    hcp->dup_off &&
						    F_ISSET(lcp, H_DELETED))
							lcp->order += order;
					} else if (lcp->dup_off ==
					    hcp->dup_off &&
					    !F_ISSET(lcp, H_DELETED)) {
						F_SET(lcp, H_DELETED);
						lcp->order = order;
					}
				}
			}
		}
		MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
	}
	MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp);

	/*
	 * Log the adjustment only when another transaction's cursor was
	 * touched, so an abort can restore it.
	 */
	if (found != 0 && DBC_LOGGING(dbc)) {
		if ((ret = __ham_curadj_log(dbp, my_txn, &lsn, 0, hcp->pgno,
		    hcp->indx, len, hcp->dup_off, add, is_dup, order)) != 0)
			return (ret);
	}

	return (0);
}

/*
 * __ham_get_clist --
 *
 * Get a list of cursors either on a particular bucket or on a particular
 * page and index combination.  The former is so that we can update
 * cursors on a split.  The latter is so we can update cursors when we
 * move items off page.
* * PUBLIC: int __ham_get_clist __P((DB *, db_pgno_t, u_int32_t, DBC ***)); */ int __ham_get_clist(dbp, pgno, indx, listp) DB *dbp; db_pgno_t pgno; u_int32_t indx; DBC ***listp; { DB *ldbp; DBC *cp; DB_ENV *dbenv; int nalloc, nused, ret; /* * Assume that finding anything is the exception, so optimize for * the case where there aren't any. */ nalloc = nused = 0; *listp = NULL; dbenv = dbp->dbenv; MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp); for (ldbp = __dblist_get(dbenv, dbp->adj_fileid); ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; ldbp = LIST_NEXT(ldbp, dblistlinks)) { MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); for (cp = TAILQ_FIRST(&ldbp->active_queue); cp != NULL; cp = TAILQ_NEXT(cp, links)) /* * We match if cp->pgno matches the specified * pgno, and if either the cp->indx matches * or we weren't given an index. */ if (cp->internal->pgno == pgno && (indx == NDX_INVALID || cp->internal->indx == indx)) { if (nused >= nalloc) { nalloc += 10; if ((ret = __os_realloc(dbp->dbenv, nalloc * sizeof(HASH_CURSOR *), listp)) != 0) goto err; } (*listp)[nused++] = cp; } MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp); } MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); if (listp != NULL) { if (nused >= nalloc) { nalloc++; if ((ret = __os_realloc(dbp->dbenv, nalloc * sizeof(HASH_CURSOR *), listp)) != 0) return (ret); } (*listp)[nused] = NULL; } return (0); err: MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp); MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); return (ret); } static int __ham_c_writelock(dbc) DBC *dbc; { DB_ENV *dbenv; DB_LOCK tmp_lock; HASH_CURSOR *hcp; int ret; /* * All we need do is acquire the lock and let the off-page * dup tree do its thing. 
*/ if (!STD_LOCKING(dbc)) return (0); hcp = (HASH_CURSOR *)dbc->internal; if ((!LOCK_ISSET(hcp->lock) || hcp->lock_mode == DB_LOCK_READ)) { tmp_lock = hcp->lock; if ((ret = __ham_lock_bucket(dbc, DB_LOCK_WRITE)) != 0) return (ret); dbenv = dbc->dbp->dbenv; if (LOCK_ISSET(tmp_lock) && (ret = dbenv->lock_put(dbenv, &tmp_lock)) != 0) return (ret); } return (0); }