/*- * See the file LICENSE for redistribution information. * * Copyright (c) 1996,2008 Oracle. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994 * Margo Seltzer. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Margo Seltzer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*
 * $Id: hash.c,v 12.46 2008/01/08 20:58:33 bostic Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/btree.h"
#include "dbinc/hash.h"
#include "dbinc/lock.h"
#include "dbinc/mp.h"

/* Prototypes for the routines declared static (file-local) in this file. */
static int  __ham_bulk __P((DBC *, DBT *, u_int32_t));
static int  __hamc_close __P((DBC *, db_pgno_t, int *));
static int  __hamc_del __P((DBC *));
static int  __hamc_destroy __P((DBC *));
static int  __hamc_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
static int  __hamc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
static int  __hamc_writelock __P((DBC *));
static int  __ham_dup_return __P((DBC *, DBT *, u_int32_t));
static int  __ham_expand_table __P((DBC *));
static int  __ham_lookup __P((DBC *,
		const DBT *, u_int32_t, db_lockmode_t, db_pgno_t *));
static int  __ham_overwrite __P((DBC *, DBT *, u_int32_t));

/*
 * __ham_quick_delete --
 *	This function is called by __db_del when the appropriate conditions
 *	are met, and it performs the delete in the optimized way.
 *
 *	Returns 0 on success, otherwise the first non-zero value returned by
 *	__ham_get_meta, __hamc_writelock, __ham_del_pair or
 *	__ham_release_meta.
 *
 * PUBLIC: int __ham_quick_delete __P((DBC *));
 */
int
__ham_quick_delete(dbc)
	DBC *dbc;
{
	int ret, t_ret;

	/*
	 * When performing a DB->del operation not involving secondary indices
	 * and not removing an off-page duplicate tree, we can speed things up
	 * substantially by removing the entire duplicate set, if any is
	 * present, in one operation, rather than by conjuring up and deleting
	 * each of the items individually.  (All are stored in one big HKEYDATA
	 * structure.)  We don't bother to distinguish on-page duplicate sets
	 * from single, non-dup items;  they're deleted in exactly the same way.
	 *
	 * The cursor should be set to the first item in the duplicate set, or
	 * to the sole key/data pair when the key does not have a duplicate set,
	 * before the function is called.
	 *
	 * We do not need to call CDB_LOCKING_INIT, __db_del calls here with
	 * a write cursor.
	 *
	 * Assert we're initialized, but not to an off-page duplicate.
	 * Assert we're not using secondary indices.
	 */
	DB_ASSERT(dbc->env, IS_INITIALIZED(dbc));
	DB_ASSERT(dbc->env, dbc->internal->opd == NULL);
	DB_ASSERT(dbc->env, !F_ISSET(dbc->dbp, DB_AM_SECONDARY));
	DB_ASSERT(dbc->env, LIST_FIRST(&dbc->dbp->s_secondaries) == NULL);

	if ((ret = __ham_get_meta(dbc)) != 0)
		return (ret);

	/* Only delete the pair if the write lock was obtained. */
	if ((ret = __hamc_writelock(dbc)) == 0)
		ret = __ham_del_pair(dbc, 0);

	/* Release the meta-data on every path; keep the first error seen. */
	if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}

/* ****************** CURSORS ********************************** */
/*
 * __hamc_init --
 *	Initialize the hash-specific portion of a cursor.
 *
 *	Allocates a zeroed struct cursor_t plus a page-sized split buffer,
 *	installs it as dbc->internal, fills in the cursor method pointers and
 *	returns the result of __ham_item_init.
 *
 * PUBLIC: int __hamc_init __P((DBC *));
 */
int
__hamc_init(dbc)
	DBC *dbc;
{
	ENV *env;
	HASH_CURSOR *new_curs;
	int ret;

	env = dbc->env;
	if ((ret = __os_calloc(env,
	    1, sizeof(struct cursor_t), &new_curs)) != 0)
		return (ret);
	/* If the split buffer can't be had, give back the cursor itself. */
	if ((ret = __os_malloc(env,
	    dbc->dbp->pgsize, &new_curs->split_buf)) != 0) {
		__os_free(env, new_curs);
		return (ret);
	}

	dbc->internal = (DBC_INTERNAL *) new_curs;

	/* Public cursor methods (both the current and the c_ spellings). */
	dbc->close = dbc->c_close = __dbc_close_pp;
	dbc->count = dbc->c_count = __dbc_count_pp;
	dbc->del = dbc->c_del = __dbc_del_pp;
	dbc->dup = dbc->c_dup = __dbc_dup_pp;
	dbc->get = dbc->c_get = __dbc_get_pp;
	dbc->pget = dbc->c_pget = __dbc_pget_pp;
	dbc->put = dbc->c_put = __dbc_put_pp;

	/* Hash access-method implementations. */
	dbc->am_bulk = __ham_bulk;
	dbc->am_close = __hamc_close;
	dbc->am_del = __hamc_del;
	dbc->am_destroy = __hamc_destroy;
	dbc->am_get = __hamc_get;
	dbc->am_put = __hamc_put;
	dbc->am_writelock = __hamc_writelock;

	return (__ham_item_init(dbc));
}

/*
 * __hamc_close --
 *	Close down the cursor from a single use.
 *
 *	root_pgno: the incoming value is never read; when an off-page
 *	    duplicate cursor exists the variable is reloaded from the current
 *	    page item (or set to PGNO_INVALID) before being handed to the
 *	    off-page cursor's am_close.
 *	rmroot: unused here (quieted with COMPQUIET).
 *
 *	Returns the first non-zero error encountered, 0 otherwise.
 */
static int
__hamc_close(dbc, root_pgno, rmroot)
	DBC *dbc;
	db_pgno_t root_pgno;
	int *rmroot;
{
	DB_MPOOLFILE *mpf;
	HASH_CURSOR *hcp;
	HKEYDATA *dp;
	db_lockmode_t lock_mode;
	int doroot, gotmeta, ret, t_ret;

	COMPQUIET(rmroot, 0);
	mpf = dbc->dbp->mpf;
	doroot = gotmeta = ret = 0;
	hcp = (HASH_CURSOR *) dbc->internal;

	/* Check for off page dups. */
	if (dbc->internal->opd != NULL) {
		if ((ret = __ham_get_meta(dbc)) != 0)
			goto done;
		gotmeta = 1;
		lock_mode = DB_LOCK_READ;

		/* To support dirty reads we must reget the write lock. */
		if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
		     F_ISSET((BTREE_CURSOR *)
		     dbc->internal->opd->internal, C_DELETED))
			lock_mode = DB_LOCK_WRITE;

		if ((ret = __ham_get_cpage(dbc, lock_mode)) != 0)
			goto out;
		dp = (HKEYDATA *)H_PAIRDATA(dbc->dbp, hcp->page, hcp->indx);

		/* If it's not a dup we aborted before we changed it. */
		if (HPAGE_PTYPE(dp) == H_OFFDUP)
			memcpy(&root_pgno,
			    HOFFPAGE_PGNO(dp), sizeof(db_pgno_t));
		else
			root_pgno = PGNO_INVALID;

		/* doroot is filled in by the off-page cursor's close. */
		if ((ret =
		    hcp->opd->am_close(hcp->opd, root_pgno, &doroot)) != 0)
			goto out;
		/* When doroot comes back non-zero, delete the hash pair too. */
		if (doroot != 0) {
			if ((ret = __memp_dirty(mpf, &hcp->page,
			    dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
				goto out;
			if ((ret = __ham_del_pair(dbc, 0)) != 0)
				goto out;
		}
	}

	/* out: put back any held page, and the meta-data if we fetched it. */
out:	if (hcp->page != NULL && (t_ret = __memp_fput(mpf,
	    dbc->thread_info, hcp->page, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;
	if (gotmeta != 0 && (t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
		ret = t_ret;

	/* done: always re-run __ham_item_init on the cursor. */
done:	if ((t_ret = __ham_item_init(dbc)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}

/*
 * __hamc_destroy --
 *	Cleanup the access method private part of a cursor.
 *
 *	Frees the split buffer (when present) and the HASH_CURSOR itself;
 *	always returns 0.
 */
static int
__hamc_destroy(dbc)
	DBC *dbc;
{
	HASH_CURSOR *hcp;

	hcp = (HASH_CURSOR *)dbc->internal;
	if (hcp->split_buf != NULL)
		__os_free(dbc->env, hcp->split_buf);
	__os_free(dbc->env, hcp);

	return (0);
}

/*
 * __hamc_count --
 *	Return a count of on-page duplicates.
* * PUBLIC: int __hamc_count __P((DBC *, db_recno_t *)); */ int __hamc_count(dbc, recnop) DBC *dbc; db_recno_t *recnop; { DB *dbp; DB_MPOOLFILE *mpf; HASH_CURSOR *hcp; db_indx_t len; db_recno_t recno; int ret, t_ret; u_int8_t *p, *pend; dbp = dbc->dbp; mpf = dbp->mpf; hcp = (HASH_CURSOR *)dbc->internal; recno = 0; if ((ret = __ham_get_cpage(dbc, DB_LOCK_READ)) != 0) return (ret); if (hcp->indx >= NUM_ENT(hcp->page)) { *recnop = 0; goto err; } switch (HPAGE_PTYPE(H_PAIRDATA(dbp, hcp->page, hcp->indx))) { case H_KEYDATA: case H_OFFPAGE: recno = 1; break; case H_DUPLICATE: p = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page, hcp->indx)); pend = p + LEN_HDATA(dbp, hcp->page, dbp->pgsize, hcp->indx); for (; p < pend; recno++) { /* p may be odd, so copy rather than just dereffing */ memcpy(&len, p, sizeof(db_indx_t)); p += 2 * sizeof(db_indx_t) + len; } break; default: ret = __db_pgfmt(dbp->env, hcp->pgno); goto err; } *recnop = recno; err: if ((t_ret = __memp_fput(mpf, dbc->thread_info, hcp->page, dbc->priority)) != 0 && ret == 0) ret = t_ret; hcp->page = NULL; return (ret); } static int __hamc_del(dbc) DBC *dbc; { DB *dbp; DBT repldbt; DB_MPOOLFILE *mpf; HASH_CURSOR *hcp; int ret, t_ret; dbp = dbc->dbp; mpf = dbp->mpf; hcp = (HASH_CURSOR *)dbc->internal; if (F_ISSET(hcp, H_DELETED)) return (DB_NOTFOUND); if ((ret = __ham_get_meta(dbc)) != 0) goto out; if ((ret = __ham_get_cpage(dbc, DB_LOCK_WRITE)) != 0) goto out; /* Off-page duplicates. */ if (HPAGE_TYPE(dbp, hcp->page, H_DATAINDEX(hcp->indx)) == H_OFFDUP) goto out; if ((ret = __memp_dirty(mpf, &hcp->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) goto out; if (F_ISSET(hcp, H_ISDUP)) { /* On-page duplicate. 
*/ if (hcp->dup_off == 0 && DUP_SIZE(hcp->dup_len) == LEN_HDATA(dbp, hcp->page, hcp->hdr->dbmeta.pagesize, hcp->indx)) ret = __ham_del_pair(dbc, 0); else { repldbt.flags = 0; F_SET(&repldbt, DB_DBT_PARTIAL); repldbt.doff = hcp->dup_off; repldbt.dlen = DUP_SIZE(hcp->dup_len); repldbt.size = 0; repldbt.data = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page, hcp->indx)); if ((ret = __ham_replpair(dbc, &repldbt, 0)) == 0) { hcp->dup_tlen -= DUP_SIZE(hcp->dup_len); F_SET(hcp, H_DELETED); ret = __hamc_update(dbc, DUP_SIZE(hcp->dup_len), DB_HAM_CURADJ_DEL, 1); } } } else /* Not a duplicate */ ret = __ham_del_pair(dbc, 0); out: if (hcp->page != NULL) { if ((t_ret = __memp_fput(mpf, dbc->thread_info, hcp->page, dbc->priority)) != 0 && ret == 0) ret = t_ret; hcp->page = NULL; } if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0) ret = t_ret; return (ret); } /* * __hamc_dup -- * Duplicate a hash cursor, such that the new one holds appropriate * locks for the position of the original. * * PUBLIC: int __hamc_dup __P((DBC *, DBC *)); */ int __hamc_dup(orig_dbc, new_dbc) DBC *orig_dbc, *new_dbc; { HASH_CURSOR *orig, *new; orig = (HASH_CURSOR *)orig_dbc->internal; new = (HASH_CURSOR *)new_dbc->internal; new->bucket = orig->bucket; new->lbucket = orig->lbucket; new->dup_off = orig->dup_off; new->dup_len = orig->dup_len; new->dup_tlen = orig->dup_tlen; if (F_ISSET(orig, H_DELETED)) F_SET(new, H_DELETED); if (F_ISSET(orig, H_ISDUP)) F_SET(new, H_ISDUP); return (0); } static int __hamc_get(dbc, key, data, flags, pgnop) DBC *dbc; DBT *key; DBT *data; u_int32_t flags; db_pgno_t *pgnop; { DB *dbp; DB_MPOOLFILE *mpf; ENV *env; HASH_CURSOR *hcp; db_lockmode_t lock_type; int ret, t_ret; hcp = (HASH_CURSOR *)dbc->internal; dbp = dbc->dbp; env = dbp->env; mpf = dbp->mpf; /* Clear OR'd in additional bits so we can check for flag equality. 
*/ if (F_ISSET(dbc, DBC_RMW)) lock_type = DB_LOCK_WRITE; else lock_type = DB_LOCK_READ; if ((ret = __ham_get_meta(dbc)) != 0) return (ret); hcp->seek_size = 0; ret = 0; switch (flags) { case DB_PREV_DUP: F_SET(hcp, H_DUPONLY); goto prev; case DB_PREV_NODUP: F_SET(hcp, H_NEXT_NODUP); /* FALLTHROUGH */ case DB_PREV: if (IS_INITIALIZED(dbc)) { prev: ret = __ham_item_prev(dbc, lock_type, pgnop); break; } /* FALLTHROUGH */ case DB_LAST: ret = __ham_item_last(dbc, lock_type, pgnop); break; case DB_NEXT_DUP: case DB_GET_BOTHC: /* cgetchk has already determined that the cursor is set. */ F_SET(hcp, H_DUPONLY); goto next; case DB_NEXT_NODUP: F_SET(hcp, H_NEXT_NODUP); /* FALLTHROUGH */ case DB_NEXT: if (IS_INITIALIZED(dbc)) { next: ret = __ham_item_next(dbc, lock_type, pgnop); break; } /* FALLTHROUGH */ case DB_FIRST: ret = __ham_item_first(dbc, lock_type, pgnop); break; case DB_SET: case DB_SET_RANGE: case DB_GET_BOTH: case DB_GET_BOTH_RANGE: ret = __ham_lookup(dbc, key, 0, lock_type, pgnop); break; case DB_CURRENT: /* cgetchk has already determined that the cursor is set. */ if (F_ISSET(hcp, H_DELETED)) { ret = DB_KEYEMPTY; goto err; } ret = __ham_item(dbc, lock_type, pgnop); break; default: ret = __db_unknown_flag(env, "__hamc_get", flags); break; } /* * Must always enter this loop to do error handling and * check for big key/data pair. */ for (;;) { if (ret != 0 && ret != DB_NOTFOUND) goto err; else if (F_ISSET(hcp, H_OK)) { if (*pgnop == PGNO_INVALID) ret = __ham_dup_return(dbc, data, flags); break; } else if (!F_ISSET(hcp, H_NOMORE)) { __db_errx(env, "H_NOMORE returned to __hamc_get"); ret = EINVAL; break; } /* * Ran out of entries in a bucket; change buckets. 
*/ switch (flags) { case DB_LAST: case DB_PREV: case DB_PREV_DUP: case DB_PREV_NODUP: ret = __memp_fput(mpf, dbc->thread_info, hcp->page, dbc->priority); hcp->page = NULL; if (hcp->bucket == 0) { ret = DB_NOTFOUND; hcp->pgno = PGNO_INVALID; goto err; } F_CLR(hcp, H_ISDUP); hcp->bucket--; hcp->indx = NDX_INVALID; hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket); if (ret == 0) ret = __ham_item_prev(dbc, lock_type, pgnop); break; case DB_FIRST: case DB_NEXT: case DB_NEXT_NODUP: ret = __memp_fput(mpf, dbc->thread_info, hcp->page, dbc->priority); hcp->page = NULL; hcp->indx = NDX_INVALID; hcp->bucket++; F_CLR(hcp, H_ISDUP); hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket); if (hcp->bucket > hcp->hdr->max_bucket) { ret = DB_NOTFOUND; hcp->pgno = PGNO_INVALID; goto err; } if (ret == 0) ret = __ham_item_next(dbc, lock_type, pgnop); break; case DB_GET_BOTH: case DB_GET_BOTHC: case DB_GET_BOTH_RANGE: case DB_NEXT_DUP: case DB_SET: case DB_SET_RANGE: /* Key not found. */ ret = DB_NOTFOUND; goto err; case DB_CURRENT: /* * This should only happen if you are doing deletes and * reading with concurrent threads and not doing proper * locking. We return the same error code as we would * if the cursor were deleted. */ ret = DB_KEYEMPTY; goto err; default: DB_ASSERT(env, 0); } } err: if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0) ret = t_ret; F_CLR(hcp, H_DUPONLY); F_CLR(hcp, H_NEXT_NODUP); return (ret); } /* * __ham_bulk -- Return bulk data from a hash table. 
*/
/*
 * Layout of the user buffer (data->data, data->ulen bytes): item bytes
 * (page images and overflow items) are appended from the front through np,
 * while a table of int32_t offset/size entries is built downward from the
 * end through offp.  When DB_MULTIPLE_KEY is set in flags (is_key), each
 * data entry is preceded by a key offset/size pair.  The table is
 * terminated with -1 on success.  DB_BUFFER_SMALL is returned, with
 * data->size set to a suggested size, when not even one entry fits or the
 * cursor is DBC_TRANSIENT.
 */
static int
__ham_bulk(dbc, data, flags)
	DBC *dbc;
	DBT *data;
	u_int32_t flags;
{
	DB *dbp;
	DB_MPOOLFILE *mpf;
	HASH_CURSOR *cp;
	PAGE *pg;
	db_indx_t dup_len, dup_off, dup_tlen, indx, *inp;
	db_lockmode_t lock_mode;
	db_pgno_t pgno;
	int32_t *endp, *offp, *saveoff;
	u_int32_t key_off, key_size, pagesize, size, space;
	u_int8_t *dbuf, *dp, *hk, *np, *tmp;
	int is_dup, is_key;
	int need_pg, next_key, no_dup, ret, t_ret;

	ret = 0;
	key_off = 0;
	dup_len = dup_off = dup_tlen = 0;
	size = 0;
	dbp = dbc->dbp;
	pagesize = dbp->pgsize;
	mpf = dbp->mpf;
	cp = (HASH_CURSOR *)dbc->internal;
	is_key = LF_ISSET(DB_MULTIPLE_KEY) ? 1 : 0;
	/* Only move on to further keys for key retrieval other than NEXT_DUP. */
	next_key = is_key && LF_ISSET(DB_OPFLAGS_MASK) != DB_NEXT_DUP;
	no_dup = LF_ISSET(DB_OPFLAGS_MASK) == DB_NEXT_NODUP;
	dbuf = data->data;
	np = dp = dbuf;

	/* Keep track of space that is left.  There is a termination entry */
	space = data->ulen;
	space -= sizeof(*offp);

	/* Build the offset/size table from the end up. */
	endp = (int32_t *) ((u_int8_t *)dbuf + data->ulen);
	endp--;
	offp = endp;

	key_size = 0;
	lock_mode = F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE: DB_LOCK_READ;

	/* next_pg: (re)start on the page the cursor currently references. */
next_pg:
	need_pg = 1;
	indx = cp->indx;
	pg = cp->page;
	inp = P_INP(dbp, pg);

	do {
		if (is_key) {
			hk = H_PAIRKEY(dbp, pg, indx);
			if (HPAGE_PTYPE(hk) == H_OFFPAGE) {
				/* Overflow key: copy it into the buffer. */
				memcpy(&key_size,
				    HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
				memcpy(&pgno,
				    HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
				size = key_size;
				if (key_size > space)
					goto get_key_space;
				if ((ret = __bam_bulk_overflow(
				    dbc, key_size, pgno, np)) != 0)
					return (ret);
				space -= key_size;
				key_off = (u_int32_t)(np - dbuf);
				np += key_size;
			} else {
				/*
				 * On-page key: copy the used part of the
				 * page once (need_pg) and point into it.
				 */
				if (need_pg) {
					dp = np;
					size = pagesize - HOFFSET(pg);
					if (space < size) {
get_key_space:
						/* Nothing returned yet. */
						if (offp == endp) {
							data->size = (u_int32_t)
							    DB_ALIGN(size +
							    pagesize, 1024);
							return
							    (DB_BUFFER_SMALL);
						}
						goto back_up;
					}
					memcpy(dp,
					   (u_int8_t *)pg + HOFFSET(pg), size);
					need_pg = 0;
					space -= size;
					np += size;
				}
				key_size = LEN_HKEY(dbp, pg, pagesize, indx);
				key_off = ((inp[indx] - HOFFSET(pg)) +
				    (u_int32_t)(dp - dbuf)) +
				    SSZA(HKEYDATA, data);
			}
		}

		hk = H_PAIRDATA(dbp, pg, indx);
		switch (HPAGE_PTYPE(hk)) {
		case H_DUPLICATE:
		case H_KEYDATA:
			if (need_pg) {
				dp = np;
				size = pagesize - HOFFSET(pg);
				if (space < size) {
					/*
					 * back_up: out of room; step the
					 * position back to the previous pair.
					 */
back_up:
					if (indx != 0) {
						indx -= 2;
						/* XXX
						 * It's not clear that this is
						 * the right way to fix this,
						 * but here goes.
						 * If we are backing up onto a
						 * duplicate, then we need to
						 * position ourselves at the
						 * end of the duplicate set.
						 * We probably need to make
						 * this work for H_OFFDUP too.
						 * It might be worth making a
						 * dummy cursor and calling
						 * __ham_item_prev.
						 */
						tmp = H_PAIRDATA(dbp, pg, indx);
						if (HPAGE_PTYPE(tmp) ==
						    H_DUPLICATE) {
							dup_off = dup_tlen =
							    LEN_HDATA(dbp, pg,
							    pagesize, indx + 1);
							memcpy(&dup_len,
							    HKEYDATA_DATA(tmp),
							    sizeof(db_indx_t));
						} else {
							is_dup = 0;
							dup_len = 0;
							dup_off = 0;
							dup_tlen = 0;
							F_CLR(cp, H_ISDUP);
						}
						goto get_space;
					}
					/* indx == 0 */
					cp->dup_len = dup_len;
					cp->dup_off = dup_off;
					cp->dup_tlen = dup_tlen;
					if ((ret = __ham_item_prev(dbc,
					    lock_mode, &pgno)) != 0) {
						if (ret != DB_NOTFOUND)
							return (ret);
						if ((ret = __memp_fput(mpf,
						    dbc->thread_info, cp->page,
						    dbc->priority)) != 0)
							return (ret);
						cp->page = NULL;
						if (cp->bucket == 0) {
							cp->indx = indx =
							    NDX_INVALID;
							goto get_space;
						}
						if ((ret =
						    __ham_get_meta(dbc)) != 0)
							return (ret);

						/* Retry in the prior bucket. */
						cp->bucket--;
						cp->pgno = BUCKET_TO_PAGE(cp,
						    cp->bucket);
						cp->indx = NDX_INVALID;
						if ((ret = __ham_release_meta(
						    dbc)) != 0)
							return (ret);
						if ((ret = __ham_item_prev(dbc,
						    lock_mode, &pgno)) != 0)
							return (ret);
					}
					indx = cp->indx;
get_space:
					/*
					 * See if we put any data in the buffer.
					 */
					if (offp >= endp ||
					    F_ISSET(dbc, DBC_TRANSIENT)) {
						data->size = (u_int32_t)
						    DB_ALIGN(size +
						    data->ulen - space, 1024);
						return (DB_BUFFER_SMALL);
					}
					/*
					 * Don't continue;  we're all out
					 * of space, even though we're
					 * returning success.
					 */
					next_key = 0;
					break;
				}
				memcpy(dp, (u_int8_t *)pg + HOFFSET(pg), size);
				need_pg = 0;
				space -= size;
				np += size;
			}

			/*
			 * We're about to crack the offset(s) and length(s)
			 * out of an H_KEYDATA or H_DUPLICATE item.
			 * There are three cases:
			 *   1. We were moved into a duplicate set by
			 *	the standard hash cursor code.  Respect
			 *	the dup_off and dup_tlen we were given.
			 *   2. We stumbled upon a duplicate set while
			 *	walking the page on our own.  We need to
			 *	recognize it as a dup and set dup_off and
			 *	dup_tlen.
			 *   3. The current item is not a dup.
			 */
			if (F_ISSET(cp, H_ISDUP)) {
				/* Case 1 */
				is_dup = 1;
				dup_len = cp->dup_len;
				dup_off = cp->dup_off;
				dup_tlen = cp->dup_tlen;
			} else if (HPAGE_PTYPE(hk) == H_DUPLICATE) {
				/* Case 2 */
				is_dup = 1;
				/*
				 * If we run out of memory and bail,
				 * make sure the fact we're in a dup set
				 * isn't ignored later.
				 */
				F_SET(cp, H_ISDUP);
				dup_off = 0;
				memcpy(&dup_len,
				    HKEYDATA_DATA(hk), sizeof(db_indx_t));
				dup_tlen = LEN_HDATA(dbp, pg, pagesize, indx);
			} else {
				/* Case 3 */
				is_dup = 0;
				dup_len = 0;
				dup_off = 0;
				dup_tlen = 0;
			}

			/*
			 * Emit one table entry per data item (one per
			 * duplicate unless no_dup is set).
			 */
			do {
				space -= (is_key ? 4 : 2) * sizeof(*offp);
				size += (is_key ? 4 : 2) * sizeof(*offp);
				/*
				 * Since space is an unsigned, if we happen
				 * to wrap, then this comparison will turn out
				 * to be true.  XXX Wouldn't it be better to
				 * simply check above that space is greater than
				 * the value we're about to subtract???
				 */
				if (space > data->ulen) {
					if (!is_dup || dup_off == 0)
						goto back_up;
					dup_off -= (db_indx_t)
					    DUP_SIZE((u_int32_t)offp[1]);
					goto get_space;
				}
				if (is_key) {
					*offp-- = (int32_t)key_off;
					*offp-- = (int32_t)key_size;
				}
				if (is_dup) {
					*offp-- = (int32_t)(
					    ((inp[indx + 1] - HOFFSET(pg)) +
					    dp - dbuf) + SSZA(HKEYDATA, data) +
					    dup_off + sizeof(db_indx_t));
					memcpy(&dup_len,
					    HKEYDATA_DATA(hk) + dup_off,
					    sizeof(db_indx_t));
					dup_off += DUP_SIZE(dup_len);
					*offp-- = dup_len;
				} else {
					*offp-- = (int32_t)(
					    ((inp[indx + 1] - HOFFSET(pg)) +
					    dp - dbuf) + SSZA(HKEYDATA, data));
					*offp-- = LEN_HDATA(dbp, pg,
					    pagesize, indx);
				}
			} while (is_dup && dup_off < dup_tlen && no_dup == 0);
			F_CLR(cp, H_ISDUP);
			break;
		case H_OFFDUP:
			/* Off-page duplicates are delegated to the btree code. */
			memcpy(&pgno, HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
			space -= 2 * sizeof(*offp);
			if (space > data->ulen)
				goto back_up;

			if (is_key) {
				space -= 2 * sizeof(*offp);
				if (space > data->ulen)
					goto back_up;
				*offp-- = (int32_t)key_off;
				*offp-- = (int32_t)key_size;
			}
			saveoff = offp;
			if ((ret = __bam_bulk_duplicates(dbc,
			    pgno, dbuf, is_key ? offp + 2 : NULL,
			    &offp, &np, &space, no_dup)) != 0) {
				if (ret == DB_BUFFER_SMALL) {
					size = space;
					space = 0;
					/* No dup was stored: undo the key entry. */
					if (is_key && saveoff == offp) {
						offp += 2;
						goto back_up;
					}
					goto get_space;
				}
				return (ret);
			}
			break;
		case H_OFFPAGE:
			/* Overflow data item: copy it into the buffer. */
			space -= (is_key ? 4 : 2) * sizeof(*offp);
			if (space > data->ulen)
				goto back_up;

			memcpy(&size, HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
			memcpy(&pgno, HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
			if (size > space)
				goto back_up;

			if ((ret =
			    __bam_bulk_overflow(dbc, size, pgno, np)) != 0)
				return (ret);

			if (is_key) {
				*offp-- = (int32_t)key_off;
				*offp-- = (int32_t)key_size;
			}

			*offp-- = (int32_t)(np - dbuf);
			*offp-- = (int32_t)size;

			np += size;
			space -= size;
			break;
		default:
			/* Do nothing. */
			break;
		}
	} while (next_key && (indx += 2) < NUM_ENT(pg));

	/* Save the position reached back into the cursor. */
	cp->indx = indx;
	cp->dup_len = dup_len;
	cp->dup_off = dup_off;
	cp->dup_tlen = dup_tlen;

	/* If we are off the page then try to the next page. */
	if (ret == 0 && next_key && indx >= NUM_ENT(pg)) {
		if ((ret = __ham_item_next(dbc, lock_mode, &pgno)) == 0)
			goto next_pg;
		if (ret != DB_NOTFOUND)
			return (ret);
		if ((ret = __memp_fput(dbc->dbp->mpf,
		    dbc->thread_info, cp->page, dbc->priority)) != 0)
			return (ret);
		cp->page = NULL;
		if ((ret = __ham_get_meta(dbc)) != 0)
			return (ret);

		cp->bucket++;
		if (cp->bucket > cp->hdr->max_bucket) {
			/*
			 * Restore cursor to its previous state.  We're past
			 * the last item in the last bucket, so the next
			 * DBC->get(DB_NEXT) will return DB_NOTFOUND.
			 */
			cp->bucket--;
			ret = DB_NOTFOUND;
		} else {
			/*
			 * Start on the next bucket.
			 *
			 * Note that if this new bucket happens to be empty,
			 * but there's another non-empty bucket after it,
			 * we'll return early.  This is a rare case, and we
			 * don't guarantee any particular number of keys
			 * returned on each call, so just let the next call
			 * to bulk get move forward by yet another bucket.
			 */
			cp->pgno = BUCKET_TO_PAGE(cp, cp->bucket);
			cp->indx = NDX_INVALID;
			F_CLR(cp, H_ISDUP);
			ret = __ham_item_next(dbc, lock_mode, &pgno);
		}

		if ((t_ret = __ham_release_meta(dbc)) != 0)
			return (t_ret);
		if (ret == 0)
			goto next_pg;
		if (ret != DB_NOTFOUND)
			return (ret);
	}
	/* Terminate the offset table. */
	*offp = -1;
	return (0);
}

/*
 * __hamc_put --
 *	Store a key/data pair through the cursor.
 *
 *	Returns DB_NOTFOUND when the cursor is flagged H_DELETED and flags is
 *	neither DB_KEYFIRST nor DB_KEYLAST.  For the key-based flags the key
 *	is looked up under a write lock; a missing key is added as a new
 *	H_KEYDATA element (zero-padded up to doff for a partial put that does
 *	not start at 0).  With DB_NOOVERWRITE an existing, undeleted key
 *	yields DB_KEYEXIST (on-page) or the result of __bam_opd_exists
 *	(off-page).  DB_BEFORE, DB_AFTER and DB_CURRENT operate on the item
 *	the cursor references.  After a successful put the table is expanded
 *	if H_EXPAND was set on the cursor.
 */
static int
__hamc_put(dbc, key, data, flags, pgnop)
	DBC *dbc;
	DBT *key;
	DBT *data;
	u_int32_t flags;
	db_pgno_t *pgnop;
{
	DB *dbp;
	DBT tmp_val, *myval;
	DB_MPOOLFILE *mpf;
	HASH_CURSOR *hcp;
	u_int32_t nbytes;
	int ret, t_ret;

	/*
	 * The compiler doesn't realize that we only use this when ret is
	 * equal to 0 and that if ret is equal to 0, that we must have set
	 * myval.  So, we initialize it here to shut the compiler up.
	 */
	COMPQUIET(myval, NULL);

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	hcp = (HASH_CURSOR *)dbc->internal;

	if (F_ISSET(hcp, H_DELETED) &&
	    flags != DB_KEYFIRST && flags != DB_KEYLAST)
		return (DB_NOTFOUND);

	if ((ret = __ham_get_meta(dbc)) != 0)
		goto err1;

	switch (flags) {
	case DB_KEYLAST:
	case DB_KEYFIRST:
	case DB_NODUPDATA:
	case DB_NOOVERWRITE:
		/* On-page size of the pair, passed to the lookup. */
		nbytes = (ISBIG(hcp, key->size) ? HOFFPAGE_PSIZE :
		    HKEYDATA_PSIZE(key->size)) +
		    (ISBIG(hcp, data->size) ? HOFFPAGE_PSIZE :
		    HKEYDATA_PSIZE(data->size));
		if ((ret = __ham_lookup(dbc,
		    key, nbytes, DB_LOCK_WRITE, pgnop)) == DB_NOTFOUND) {
			/*
			 * The lookup recorded a different page
			 * (seek_found_page): switch the cursor to it.
			 */
			if (hcp->seek_found_page != PGNO_INVALID &&
			    hcp->seek_found_page != hcp->pgno) {
				if ((ret = __memp_fput(mpf, dbc->thread_info,
				    hcp->page, dbc->priority)) != 0)
					goto err2;
				hcp->page = NULL;
				hcp->pgno = hcp->seek_found_page;
				hcp->indx = NDX_INVALID;
			}

			if (F_ISSET(data, DB_DBT_PARTIAL) && data->doff != 0) {
				/*
				 * A partial put, but the key does not exist
				 * and we are not beginning the write at 0.
				 * We must create a data item padded up to doff
				 * and then write the new bytes represented by
				 * val.
				 */
				if ((ret = __ham_init_dbt(dbp->env, &tmp_val,
				    data->size + data->doff,
				    &dbc->my_rdata.data,
				    &dbc->my_rdata.ulen)) != 0)
					goto err2;

				memset(tmp_val.data, 0, data->doff);
				memcpy((u_int8_t *)tmp_val.data +
				    data->doff, data->data, data->size);
				myval = &tmp_val;
			} else
				myval = (DBT *)data;

			ret = __ham_add_el(dbc, key, myval, H_KEYDATA);
			goto done;
		} else if (flags == DB_NOOVERWRITE &&
		    !F_ISSET(hcp, H_DELETED)) {
			if (*pgnop == PGNO_INVALID)
				ret = DB_KEYEXIST;
			else
				ret = __bam_opd_exists(dbc, *pgnop);
			if (ret != 0)
				goto done;
		}
		break;
	case DB_BEFORE:
	case DB_AFTER:
	case DB_CURRENT:
		ret = __ham_item(dbc, DB_LOCK_WRITE, pgnop);
		break;
	default:
		ret = __db_unknown_flag(dbp->env, "__hamc_put", flags);
		break;
	}

	/*
	 * Invalidate any insert index found.  So they are not reused
	 * in future inserts.
	 */
	hcp->seek_found_page = PGNO_INVALID;
	hcp->seek_found_indx = NDX_INVALID;

	/* Only on-page items are modified here (no off-page dup page). */
	if (*pgnop == PGNO_INVALID && ret == 0) {
		if ((ret = __memp_dirty(mpf, &hcp->page,
		    dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
			goto done;
		/*
		 * Overwrite for DB_CURRENT, or for a key-based put when
		 * duplicates are not permitted; otherwise add a duplicate.
		 */
		if (flags == DB_CURRENT ||
		    ((flags == DB_KEYFIRST ||
		    flags == DB_KEYLAST || flags == DB_NODUPDATA) &&
		    !(F_ISSET(dbp, DB_AM_DUP) || F_ISSET(key, DB_DBT_DUPOK))))
			ret = __ham_overwrite(dbc, data, flags);
		else
			ret = __ham_add_dup(dbc, data, flags, pgnop);
	}

done:	if (hcp->page != NULL) {
		if ((t_ret = __memp_fput(mpf, dbc->thread_info,
		    hcp->page, dbc->priority)) != 0 && ret == 0)
			ret = t_ret;
		/* Forget the page only if it was actually put back. */
		if (t_ret == 0)
			hcp->page = NULL;
	}

	if (ret == 0 && F_ISSET(hcp, H_EXPAND)) {
		ret = __ham_expand_table(dbc);
		F_CLR(hcp, H_EXPAND);
		/* If we are out of space, ignore the error. */
		if (ret == ENOSPC && dbc->txn == NULL)
			ret = 0;
	}

err2:	if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
		ret = t_ret;

err1:	return (ret);
}

/********************************* UTILITIES ************************/

/*
 * __ham_expand_table --
 *	Add one bucket (max_bucket + 1) to the table and split the records
 *	of the bucket new_bucket & low_mask into it.  The meta-data change
 *	is logged (when logging is on) before any new page is requested from
 *	the memory pool.
 */
static int
__ham_expand_table(dbc)
	DBC *dbc;
{
	DB *dbp;
	DBMETA *mmeta;
	DB_LOCK metalock;
	DB_LSN lsn;
	DB_MPOOLFILE *mpf;
	HASH_CURSOR *hcp;
	PAGE *h;
	db_pgno_t pgno, mpgno;
	u_int32_t logn, newalloc, new_bucket, old_bucket;
	int got_meta, new_double, ret, t_ret;

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	hcp = (HASH_CURSOR *)dbc->internal;
	if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
		return (ret);

	LOCK_INIT(metalock);
	/* Until shown otherwise, the hash meta page doubles as the master. */
	mmeta = (DBMETA *) hcp->hdr;
	mpgno = mmeta->pgno;
	h = NULL;
	newalloc = 0;
	got_meta = 0;

	/*
	 * If the split point is about to increase, make sure that we
	 * have enough extra pages.  The calculation here is weird.
	 * We'd like to do this after we've upped max_bucket, but it's
	 * too late then because we've logged the meta-data split.  What
	 * we'll do between then and now is increment max bucket and then
	 * see what the log of one greater than that is; here we have to
	 * look at the log of max + 2.  VERY NASTY STUFF.
	 *
	 * We figure out what we need to do, then we log it, then request
	 * the pages from mpool.  We don't want to fail after extending
	 * the file.
	 *
	 * If the page we are about to split into has already been allocated,
	 * then we simply need to get it to get its LSN.  If it hasn't yet
	 * been allocated, then we know it's LSN (0,0).
	 */

	new_bucket = hcp->hdr->max_bucket + 1;
	old_bucket = new_bucket & hcp->hdr->low_mask;

	/* A new doubling starts when max_bucket has reached high_mask. */
	new_double = hcp->hdr->max_bucket == hcp->hdr->high_mask;
	logn = __db_log2(new_bucket);

	if (!new_double || hcp->hdr->spares[logn + 1] != PGNO_INVALID) {
		/* Page exists; get it so we can get its LSN */
		pgno = BUCKET_TO_PAGE(hcp, new_bucket);
		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
		    DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &h)) != 0)
			goto err;
		lsn = h->lsn;
	} else {
		/* Get the master meta-data page to do allocation. */
		if (F_ISSET(dbp, DB_AM_SUBDB)) {
			mpgno = PGNO_BASE_MD;
			if ((ret = __db_lget(dbc,
			    0, mpgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
				goto err;
			if ((ret = __memp_fget(mpf, &mpgno, dbc->thread_info,
			    dbc->txn, DB_MPOOL_DIRTY, &mmeta)) != 0)
				goto err;
			got_meta = 1;
		}
		pgno = mmeta->last_pgno + 1;
		ZERO_LSN(lsn);
		newalloc = 1;
	}

	/* Log the meta-data split first. */
	if (DBC_LOGGING(dbc)) {
		/*
		 * We always log the page number of the first page of
		 * the allocation group.  However, the LSN that we log
		 * is either the LSN on the first page (if we did not
		 * do the actual allocation here) or the LSN on the last
		 * page of the unit (if we did do the allocation here).
		 */
		if ((ret = __ham_metagroup_log(dbp, dbc->txn,
		    &lsn, 0, hcp->hdr->max_bucket, mpgno, &mmeta->lsn,
		    hcp->hdr->dbmeta.pgno, &hcp->hdr->dbmeta.lsn,
		    pgno, &lsn, newalloc, mmeta->last_pgno)) != 0)
			goto err;
	} else
		LSN_NOT_LOGGED(lsn);

	hcp->hdr->dbmeta.lsn = lsn;

	if (new_double && hcp->hdr->spares[logn + 1] == PGNO_INVALID) {
		/*
		 * We need to begin a new doubling and we have not allocated
		 * any pages yet.  Read the last page in and initialize it to
		 * make the allocation contiguous.  The pgno we calculated
		 * above is the first page allocated.  The entry in spares is
		 * that page number minus any buckets already allocated (it
		 * simplifies bucket to page transaction).  After we've set
		 * that, we calculate the last pgno.
		 */

		pgno += hcp->hdr->max_bucket;

		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
		    DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &h)) != 0)
			goto err;

		hcp->hdr->spares[logn + 1] =
		    (pgno - new_bucket) - hcp->hdr->max_bucket;
		mmeta->last_pgno = pgno;
		mmeta->lsn = lsn;

		P_INIT(h, dbp->pgsize,
		    pgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
	}

	/* Write out whatever page we ended up modifying. */
	h->lsn = lsn;
	if ((ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority)) != 0)
		goto err;
	h = NULL;

	/*
	 * Update the meta-data page of this hash database.
	 */
	hcp->hdr->max_bucket = new_bucket;
	if (new_double) {
		hcp->hdr->low_mask = hcp->hdr->high_mask;
		hcp->hdr->high_mask = new_bucket | hcp->hdr->low_mask;
	}

	/* Relocate records to the new bucket */
	ret = __ham_split_page(dbc, old_bucket, new_bucket);

	/*
	 * err: put back the master meta page if it was fetched separately,
	 * release the lock, and put back h if it is still held.
	 */
err:	if (got_meta)
		if ((t_ret = __memp_fput(mpf,
		    dbc->thread_info, mmeta, dbc->priority)) != 0 && ret == 0)
			ret = t_ret;
	if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
			ret = t_ret;
	if (h != NULL)
		if ((t_ret = __memp_fput(mpf,
		    dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
			ret = t_ret;

	return (ret);
}

/*
 * __ham_call_hash --
 *	Map a key to a bucket number: hash the len bytes at k with the
 *	database's h_hash function, mask with high_mask and, if the result
 *	exceeds max_bucket, mask with low_mask instead.
 *
 * PUBLIC: u_int32_t __ham_call_hash __P((DBC *, u_int8_t *, u_int32_t));
 */
u_int32_t
__ham_call_hash(dbc, k, len)
	DBC *dbc;
	u_int8_t *k;
	u_int32_t len;
{
	DB *dbp;
	HASH *hashp;
	HASH_CURSOR *hcp;
	u_int32_t n, bucket;

	dbp = dbc->dbp;
	hcp = (HASH_CURSOR *)dbc->internal;
	hashp = dbp->h_internal;

	n = (u_int32_t)(hashp->h_hash(dbp, k, len));

	bucket = n & hcp->hdr->high_mask;
	if (bucket > hcp->hdr->max_bucket)
		bucket = bucket & hcp->hdr->low_mask;
	return (bucket);
}

/*
 * Check for duplicates, and call __db_ret appropriately.  Release
 * everything held by the cursor.
 *
 * Returns 0 without touching val when the item is not a duplicate and no
 * data match was requested, when a bulk get is in progress, or when val
 * already has DB_DBT_ISSET set; returns DB_NOTFOUND when a requested data
 * match (DB_GET_BOTH, DB_GET_BOTHC, DB_GET_BOTH_RANGE) fails.
 */
static int
__ham_dup_return(dbc, val, flags)
	DBC *dbc;
	DBT *val;
	u_int32_t flags;
{
	DB *dbp;
	DBT *myval, tmp_val;
	HASH_CURSOR *hcp;
	PAGE *pp;
	db_indx_t ndx;
	db_pgno_t pgno;
	u_int32_t off, tlen;
	u_int8_t *hk, type;
	int cmp, ret;
	db_indx_t len;

	/* Check for duplicate and return the first one. */
	dbp = dbc->dbp;
	hcp = (HASH_CURSOR *)dbc->internal;
	ndx = H_DATAINDEX(hcp->indx);
	type = HPAGE_TYPE(dbp, hcp->page, ndx);
	pp = hcp->page;
	myval = val;

	/*
	 * There are 4 cases:
	 * 1. We are not in duplicate, simply return; the upper layer
	 *    will do the right thing.
	 * 2. We are looking at keys and stumbled onto a duplicate.
	 * 3. We are in the middle of a duplicate set. (ISDUP set)
	 * 4. We need to check for particular data match.
	 */

	/* We should never get here with off-page dups. */
	DB_ASSERT(dbp->env, type != H_OFFDUP);

	/* Case 1 */
	if (type != H_DUPLICATE && flags != DB_GET_BOTH &&
	    flags != DB_GET_BOTHC && flags != DB_GET_BOTH_RANGE)
		return (0);

	/*
	 * Here we check for the case where we just stumbled onto a
	 * duplicate.  In this case, we do initialization and then
	 * let the normal duplicate code handle it. (Case 2)
	 */
	if (!F_ISSET(hcp, H_ISDUP) && type == H_DUPLICATE) {
		F_SET(hcp, H_ISDUP);
		hcp->dup_tlen = LEN_HDATA(dbp, hcp->page,
		    hcp->hdr->dbmeta.pagesize, hcp->indx);
		hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
		if (flags == DB_LAST ||
		    flags == DB_PREV || flags == DB_PREV_NODUP) {
			/* Moving backward: walk to the last duplicate. */
			hcp->dup_off = 0;
			do {
				memcpy(&len,
				    HKEYDATA_DATA(hk) + hcp->dup_off,
				    sizeof(db_indx_t));
				hcp->dup_off += DUP_SIZE(len);
			} while (hcp->dup_off < hcp->dup_tlen);
			hcp->dup_off -= DUP_SIZE(len);
		} else {
			/* Otherwise start at the first duplicate. */
			memcpy(&len,
			    HKEYDATA_DATA(hk), sizeof(db_indx_t));
			hcp->dup_off = 0;
		}
		hcp->dup_len = len;
	}

	/*
	 * If we are retrieving a specific key/data pair, then we
	 * may need to adjust the cursor before returning data.
	 * Case 4
	 */
	if (flags == DB_GET_BOTH ||
	    flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) {
		if (F_ISSET(hcp, H_ISDUP)) {
			/*
			 * If we're doing a join, search forward from the
			 * current position, not the beginning of the dup set.
			 */
			if (flags == DB_GET_BOTHC)
				F_SET(hcp, H_CONTINUE);

			__ham_dsearch(dbc, val, &off, &cmp, flags);

			/*
			 * This flag is set nowhere else and is safe to
			 * clear unconditionally.
			 */
			F_CLR(hcp, H_CONTINUE);
			hcp->dup_off = off;
		} else {
			hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
			if (((HKEYDATA *)hk)->type == H_OFFPAGE) {
				memcpy(&tlen,
				    HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
				memcpy(&pgno,
				    HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
				if ((ret = __db_moff(dbp, dbc->thread_info,
				    dbc->txn, val, pgno, tlen,
				    dbp->dup_compare, &cmp)) != 0)
					return (ret);
			} else {
				/*
				 * We do not zero tmp_val since the comparison
				 * routines may only look at data and size.
				 */
				tmp_val.data = HKEYDATA_DATA(hk);
				tmp_val.size = LEN_HDATA(dbp, hcp->page,
				    dbp->pgsize, hcp->indx);
				cmp = dbp->dup_compare == NULL ?
				    __bam_defcmp(dbp, &tmp_val, val) :
				    dbp->dup_compare(dbp, &tmp_val, val);
			}
		}

		if (cmp != 0)
			return (DB_NOTFOUND);
	}

	/*
	 * If we've already got the data for this value, or we're doing a bulk
	 * get, we don't want to return the data.
	 */
	if (F_ISSET(dbc, DBC_MULTIPLE | DBC_MULTIPLE_KEY) ||
	    F_ISSET(val, DB_DBT_ISSET))
		return (0);

	/*
	 * Now, everything is initialized, grab a duplicate if
	 * necessary.
	 */
	if (F_ISSET(hcp, H_ISDUP)) {	/* Case 3 */
		/*
		 * Copy the DBT in case we are retrieving into user
		 * memory and we need the parameters for it.  If the
		 * user requested a partial, then we need to adjust
		 * the user's parameters to get the partial of the
		 * duplicate which is itself a partial.
		 */
		memcpy(&tmp_val, val, sizeof(*val));

		if (F_ISSET(&tmp_val, DB_DBT_PARTIAL)) {
			/*
			 * Take the user's length unless it would go
			 * beyond the end of the duplicate.
			 */
			if (tmp_val.doff > hcp->dup_len)
				tmp_val.dlen = 0;
			else if (tmp_val.dlen + tmp_val.doff > hcp->dup_len)
				tmp_val.dlen = hcp->dup_len - tmp_val.doff;

		} else {
			F_SET(&tmp_val, DB_DBT_PARTIAL);
			tmp_val.dlen = hcp->dup_len;
			tmp_val.doff = 0;
		}

		/*
		 * Set offset to the appropriate place within the
		 * current duplicate -- need to take into account
		 * both the dup_off and the current duplicate's
		 * length.
		 */
		tmp_val.doff += hcp->dup_off + sizeof(db_indx_t);

		myval = &tmp_val;
	}

	/*
	 * Finally, if we had a duplicate, pp, ndx, and myval should be
	 * set appropriately.
	 */
	if ((ret = __db_ret(dbp, dbc->thread_info, dbc->txn, pp, ndx,
	    myval, &dbc->rdata->data, &dbc->rdata->ulen)) != 0)
		return (ret);

	/*
	 * In case we sent a temporary off to db_ret, set the real
	 * return values.
	 */
	val->data = myval->data;
	val->size = myval->size;

	F_SET(val, DB_DBT_ISSET);

	return (0);
}

static int
__ham_overwrite(dbc, nval, flags)
	DBC *dbc;
	DBT *nval;
	u_int32_t flags;
{
	DB *dbp;
	DBT *myval, tmp_val, tmp_val2;
	ENV *env;
	HASH_CURSOR *hcp;
	void *newrec;
	u_int8_t *hk, *p;
	u_int32_t len, nondup_size;
	db_indx_t newsize;
	int ret;

	dbp = dbc->dbp;
	env = dbp->env;
	hcp = (HASH_CURSOR *)dbc->internal;
	if (F_ISSET(hcp, H_ISDUP)) {
		/*
		 * This is an overwrite of a duplicate.  We should never
		 * be off-page at this point.
		 */
		DB_ASSERT(env, hcp->opd == NULL);
		/* On page dups */
		if (F_ISSET(nval, DB_DBT_PARTIAL)) {
			/*
			 * We're going to have to get the current item, then
			 * construct the record, do any padding and do a
			 * replace.
			 */
			memset(&tmp_val, 0, sizeof(tmp_val));
			if ((ret =
			    __ham_dup_return(dbc, &tmp_val, DB_CURRENT)) != 0)
				return (ret);

			/* Figure out new size. */
			nondup_size = tmp_val.size;
			newsize = nondup_size;

			/*
			 * Three cases:
			 * 1. strictly append (may need to allocate space
			 *	for pad bytes; really gross).
			 * 2. overwrite some and append.
			 * 3. strictly overwrite.
			 */
			if (nval->doff > nondup_size)
				newsize +=
				    ((nval->doff - nondup_size) + nval->size);
			else if (nval->doff + nval->dlen > nondup_size)
				newsize += nval->size -
				    (nondup_size - nval->doff);
			else
				newsize += nval->size - nval->dlen;

			/*
			 * Make sure that the new size doesn't put us over
			 * the onpage duplicate size in which case we need
			 * to convert to off-page duplicates.
*/ if (ISBIG(hcp, (hcp->dup_tlen - nondup_size) + newsize)) { if ((ret = __ham_dup_convert(dbc)) != 0) return (ret); return (hcp->opd->am_put(hcp->opd, NULL, nval, flags, NULL)); } if ((ret = __os_malloc(dbp->env, DUP_SIZE(newsize), &newrec)) != 0) return (ret); memset(&tmp_val2, 0, sizeof(tmp_val2)); F_SET(&tmp_val2, DB_DBT_PARTIAL); /* Construct the record. */ p = newrec; /* Initial size. */ memcpy(p, &newsize, sizeof(db_indx_t)); p += sizeof(db_indx_t); /* First part of original record. */ len = nval->doff > tmp_val.size ? tmp_val.size : nval->doff; memcpy(p, tmp_val.data, len); p += len; if (nval->doff > tmp_val.size) { /* Padding */ memset(p, 0, nval->doff - tmp_val.size); p += nval->doff - tmp_val.size; } /* New bytes */ memcpy(p, nval->data, nval->size); p += nval->size; /* End of original record (if there is any) */ if (nval->doff + nval->dlen < tmp_val.size) { len = (tmp_val.size - nval->doff) - nval->dlen; memcpy(p, (u_int8_t *)tmp_val.data + nval->doff + nval->dlen, len); p += len; } /* Final size. */ memcpy(p, &newsize, sizeof(db_indx_t)); /* * Make sure that the caller isn't corrupting * the sort order. 
*/ if (dbp->dup_compare != NULL) { tmp_val2.data = (u_int8_t *)newrec + sizeof(db_indx_t); tmp_val2.size = newsize; if (dbp->dup_compare( dbp, &tmp_val, &tmp_val2) != 0) { __os_free(env, newrec); return (__db_duperr(dbp, flags)); } } tmp_val2.data = newrec; tmp_val2.size = DUP_SIZE(newsize); tmp_val2.doff = hcp->dup_off; tmp_val2.dlen = DUP_SIZE(hcp->dup_len); ret = __ham_replpair(dbc, &tmp_val2, 0); __os_free(env, newrec); /* Update cursor */ if (ret != 0) return (ret); if (newsize > nondup_size) { if ((ret = __hamc_update(dbc, (newsize - nondup_size), DB_HAM_CURADJ_ADDMOD, 1)) != 0) return (ret); hcp->dup_tlen += (newsize - nondup_size); } else { if ((ret = __hamc_update(dbc, (nondup_size - newsize), DB_HAM_CURADJ_DELMOD, 1)) != 0) return (ret); hcp->dup_tlen -= (nondup_size - newsize); } hcp->dup_len = newsize; return (0); } else { /* Check whether we need to convert to off page. */ if (ISBIG(hcp, (hcp->dup_tlen - hcp->dup_len) + nval->size)) { if ((ret = __ham_dup_convert(dbc)) != 0) return (ret); return (hcp->opd->am_put(hcp->opd, NULL, nval, flags, NULL)); } /* Make sure we maintain sort order. */ if (dbp->dup_compare != NULL) { tmp_val2.data = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page, hcp->indx)) + hcp->dup_off + sizeof(db_indx_t); tmp_val2.size = hcp->dup_len; if (dbp->dup_compare( dbp, nval, &tmp_val2) != 0) { __db_errx(env, "Existing data sorts differently from put data"); return (EINVAL); } } /* Overwriting a complete duplicate. */ if ((ret = __ham_make_dup(dbp->env, nval, &tmp_val, &dbc->my_rdata.data, &dbc->my_rdata.ulen)) != 0) return (ret); /* Now fix what we are replacing. 
*/ tmp_val.doff = hcp->dup_off; tmp_val.dlen = DUP_SIZE(hcp->dup_len); /* Update cursor */ if (nval->size > hcp->dup_len) { if ((ret = __hamc_update(dbc, (nval->size - hcp->dup_len), DB_HAM_CURADJ_ADDMOD, 1)) != 0) return (ret); hcp->dup_tlen += (nval->size - hcp->dup_len); } else { if ((ret = __hamc_update(dbc, (hcp->dup_len - nval->size), DB_HAM_CURADJ_DELMOD, 1)) != 0) return (ret); hcp->dup_tlen -= (hcp->dup_len - nval->size); } hcp->dup_len = (db_indx_t)nval->size; } myval = &tmp_val; } else if (!F_ISSET(nval, DB_DBT_PARTIAL)) { /* Put/overwrite */ memcpy(&tmp_val, nval, sizeof(*nval)); F_SET(&tmp_val, DB_DBT_PARTIAL); tmp_val.doff = 0; hk = H_PAIRDATA(dbp, hcp->page, hcp->indx); if (HPAGE_PTYPE(hk) == H_OFFPAGE) memcpy(&tmp_val.dlen, HOFFPAGE_TLEN(hk), sizeof(u_int32_t)); else tmp_val.dlen = LEN_HDATA(dbp, hcp->page, hcp->hdr->dbmeta.pagesize, hcp->indx); myval = &tmp_val; } else /* Regular partial put */ myval = nval; return (__ham_replpair(dbc, myval, 0)); } /* * Given a key and a cursor, sets the cursor to the page/ndx on which * the key resides. If the key is found, the cursor H_OK flag is set * and the pagep, bndx, pgno (dpagep, dndx, dpgno) fields are set. * If the key is not found, the H_OK flag is not set. If the sought * field is non-0, the pagep, bndx, pgno (dpagep, dndx, dpgno) fields * are set indicating where an add might take place. If it is 0, * none of the cursor pointer field are valid. */ static int __ham_lookup(dbc, key, sought, mode, pgnop) DBC *dbc; const DBT *key; u_int32_t sought; db_lockmode_t mode; db_pgno_t *pgnop; { DB *dbp; HASH_CURSOR *hcp; db_pgno_t next_pgno; int match, ret; u_int8_t *dk; dbp = dbc->dbp; hcp = (HASH_CURSOR *)dbc->internal; /* * Set up cursor so that we're looking for space to add an item * as we cycle through the pages looking for the key. 
*/ if ((ret = __ham_item_reset(dbc)) != 0) return (ret); hcp->seek_size = sought; hcp->bucket = __ham_call_hash(dbc, (u_int8_t *)key->data, key->size); hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket); /* look though all pages in the bucket for the key */ if ((ret = __ham_get_cpage(dbc, mode)) != 0) return (ret); *pgnop = PGNO_INVALID; if (hcp->indx == NDX_INVALID) { hcp->indx = 0; F_CLR(hcp, H_ISDUP); } while (hcp->pgno != PGNO_INVALID) { /* Are we looking for space to insert an item. */ if (hcp->seek_size != 0 && hcp->seek_found_page == PGNO_INVALID && hcp->seek_size < P_FREESPACE(dbp, hcp->page)) { hcp->seek_found_page = hcp->pgno; hcp->seek_found_indx = NDX_INVALID; } if ((ret = __ham_getindex(dbc, hcp->page, key, H_KEYDATA, &match, &hcp->indx)) != 0) return (ret); /* * If this is the first page in the bucket with space for * inserting the requested item. Store the insert index to * save having to look it up again later. */ if (hcp->seek_found_page == hcp->pgno) hcp->seek_found_indx = hcp->indx; if (match == 0) { F_SET(hcp, H_OK); dk = H_PAIRDATA(dbp, hcp->page, hcp->indx); if (HPAGE_PTYPE(dk) == H_OFFDUP) memcpy(pgnop, HOFFDUP_PGNO(dk), sizeof(db_pgno_t)); return (0); } /* move the cursor to the next page. */ if (NEXT_PGNO(hcp->page) == PGNO_INVALID) break; next_pgno = NEXT_PGNO(hcp->page); hcp->indx = 0; if ((ret = __ham_next_cpage(dbc, next_pgno)) != 0) return (ret); } F_SET(hcp, H_NOMORE); return (DB_NOTFOUND); } /* * __ham_init_dbt -- * Initialize a dbt using some possibly already allocated storage * for items. 
* * PUBLIC: int __ham_init_dbt __P((ENV *, * PUBLIC: DBT *, u_int32_t, void **, u_int32_t *)); */ int __ham_init_dbt(env, dbt, size, bufp, sizep) ENV *env; DBT *dbt; u_int32_t size; void **bufp; u_int32_t *sizep; { int ret; memset(dbt, 0, sizeof(*dbt)); if (*sizep < size) { if ((ret = __os_realloc(env, size, bufp)) != 0) { *sizep = 0; return (ret); } *sizep = size; } dbt->data = *bufp; dbt->size = size; return (0); } /* * Adjust the cursor after an insert or delete. The cursor passed is * the one that was operated upon; we just need to check any of the * others. * * len indicates the length of the item added/deleted * add indicates if the item indicated by the cursor has just been * added (add == 1) or deleted (add == 0). * dup indicates if the addition occurred into a duplicate set. * * PUBLIC: int __hamc_update * PUBLIC: __P((DBC *, u_int32_t, db_ham_curadj, int)); */ int __hamc_update(dbc, len, operation, is_dup) DBC *dbc; u_int32_t len; db_ham_curadj operation; int is_dup; { DB *dbp, *ldbp; DBC *cp; DB_LSN lsn; DB_TXN *my_txn; ENV *env; HASH_CURSOR *hcp, *lcp; int found, ret, was_mod, was_add; u_int32_t order; dbp = dbc->dbp; env = dbp->env; hcp = (HASH_CURSOR *)dbc->internal; /* * Adjustment will only be logged if this is a subtransaction. * Only subtransactions can abort and effect their parent * transactions cursors. */ my_txn = IS_SUBTRANSACTION(dbc->txn) ? dbc->txn : NULL; found = 0; MUTEX_LOCK(env, env->mtx_dblist); switch (operation) { case DB_HAM_CURADJ_DEL: was_mod = 0; was_add = 0; break; case DB_HAM_CURADJ_ADD: was_mod = 0; was_add = 1; break; case DB_HAM_CURADJ_DELMOD: was_mod = 1; was_add = 0; break; case DB_HAM_CURADJ_ADDMOD: was_mod = 1; was_add = 1; break; default: return (EINVAL); } /* * Calculate the order of this deleted record. * This will be one greater than any cursor that is pointing * at this record and already marked as deleted. 
*/ order = 0; if (was_add == 0) { FIND_FIRST_DB_MATCH(env, dbp, ldbp); for (order = 1; ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; ldbp = TAILQ_NEXT(ldbp, dblistlinks)) { MUTEX_LOCK(env, dbp->mutex); TAILQ_FOREACH(cp, &ldbp->active_queue, links) { if (cp == dbc || cp->dbtype != DB_HASH) continue; lcp = (HASH_CURSOR *)cp->internal; if (F_ISSET(lcp, H_DELETED) && hcp->pgno == lcp->pgno && hcp->indx == lcp->indx && order <= lcp->order && (!is_dup || hcp->dup_off == lcp->dup_off) && !MVCC_SKIP_CURADJ(cp, lcp->pgno)) order = lcp->order + 1; } MUTEX_UNLOCK(env, dbp->mutex); } hcp->order = order; } FIND_FIRST_DB_MATCH(env, dbp, ldbp); for (; ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; ldbp = TAILQ_NEXT(ldbp, dblistlinks)) { MUTEX_LOCK(env, dbp->mutex); TAILQ_FOREACH(cp, &ldbp->active_queue, links) { if (cp == dbc || cp->dbtype != DB_HASH) continue; lcp = (HASH_CURSOR *)cp->internal; if (lcp->pgno != hcp->pgno || lcp->indx == NDX_INVALID || MVCC_SKIP_CURADJ(cp, lcp->pgno)) continue; if (my_txn != NULL && cp->txn != my_txn) found = 1; if (!is_dup) { if (was_add == 1) { /* * This routine is not called to add * non-dup records which are always put * at the end. It is only called from * recovery in this case and the * cursor will be marked deleted. * We are "undeleting" so unmark all * cursors with the same order. */ if (lcp->indx == hcp->indx && F_ISSET(lcp, H_DELETED)) { if (lcp->order == hcp->order) F_CLR(lcp, H_DELETED); else if (lcp->order > hcp->order) { /* * If we've moved this cursor's * index, split its order * number--i.e., decrement it by * enough so that the lowest * cursor moved has order 1. * cp_arg->order is the split * point, so decrement by it. 
*/ lcp->order -= hcp->order; lcp->indx += 2; } } else if (lcp->indx >= hcp->indx) lcp->indx += 2; } else { if (lcp->indx > hcp->indx) { lcp->indx -= 2; if (lcp->indx == hcp->indx && F_ISSET(lcp, H_DELETED)) lcp->order += order; } else if (lcp->indx == hcp->indx && !F_ISSET(lcp, H_DELETED)) { F_SET(lcp, H_DELETED); F_CLR(lcp, H_ISDUP); lcp->order = order; } } } else if (lcp->indx == hcp->indx) { /* * Handle duplicates. This routine is only * called for on page dups. Off page dups are * handled by btree/rtree code. */ if (was_add == 1) { lcp->dup_tlen += len; if (lcp->dup_off == hcp->dup_off && F_ISSET(hcp, H_DELETED) && F_ISSET(lcp, H_DELETED)) { /* Abort of a delete. */ if (lcp->order == hcp->order) F_CLR(lcp, H_DELETED); else if (lcp->order > hcp->order) { lcp->order -= (hcp->order -1); lcp->dup_off += len; } } else if (lcp->dup_off > hcp->dup_off || (!was_mod && lcp->dup_off == hcp->dup_off)) lcp->dup_off += len; } else { lcp->dup_tlen -= len; if (lcp->dup_off > hcp->dup_off) { lcp->dup_off -= len; if (lcp->dup_off == hcp->dup_off && F_ISSET(lcp, H_DELETED)) lcp->order += order; } else if (!was_mod && lcp->dup_off == hcp->dup_off && !F_ISSET(lcp, H_DELETED)) { F_SET(lcp, H_DELETED); lcp->order = order; } } } } MUTEX_UNLOCK(env, dbp->mutex); } MUTEX_UNLOCK(env, env->mtx_dblist); if (found != 0 && DBC_LOGGING(dbc)) { if ((ret = __ham_curadj_log(dbp, my_txn, &lsn, 0, hcp->pgno, hcp->indx, len, hcp->dup_off, (int)operation, is_dup, order)) != 0) return (ret); } return (0); } /* * __ham_get_clist -- * * Get a list of cursors either on a particular bucket or on a particular * page and index combination. The former is so that we can update * cursors on a split. The latter is so we can update cursors when we * move items off page. 
* * PUBLIC: int __ham_get_clist __P((DB *, db_pgno_t, u_int32_t, DBC ***)); */ int __ham_get_clist(dbp, pgno, indx, listp) DB *dbp; db_pgno_t pgno; u_int32_t indx; DBC ***listp; { DB *ldbp; DBC *cp; ENV *env; u_int nalloc, nused; int ret; *listp = NULL; env = dbp->env; nalloc = nused = 0; /* * Assume that finding anything is the exception, so optimize for * the case where there aren't any. */ MUTEX_LOCK(env, env->mtx_dblist); FIND_FIRST_DB_MATCH(env, dbp, ldbp); for (; ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; ldbp = TAILQ_NEXT(ldbp, dblistlinks)) { MUTEX_LOCK(env, dbp->mutex); TAILQ_FOREACH(cp, &ldbp->active_queue, links) /* * We match if cp->pgno matches the specified * pgno, and if either the cp->indx matches * or we weren't given an index. */ if (cp->internal->pgno == pgno && (indx == NDX_INVALID || cp->internal->indx == indx) && !MVCC_SKIP_CURADJ(cp, pgno)) { if (nused >= nalloc) { nalloc += 10; if ((ret = __os_realloc(dbp->env, nalloc * sizeof(HASH_CURSOR *), listp)) != 0) goto err; } (*listp)[nused++] = cp; } MUTEX_UNLOCK(dbp->env, dbp->mutex); } MUTEX_UNLOCK(env, env->mtx_dblist); if (listp != NULL) { if (nused >= nalloc) { nalloc++; if ((ret = __os_realloc(dbp->env, nalloc * sizeof(HASH_CURSOR *), listp)) != 0) return (ret); } (*listp)[nused] = NULL; } return (0); err: MUTEX_UNLOCK(dbp->env, dbp->mutex); MUTEX_UNLOCK(env, env->mtx_dblist); return (ret); } static int __hamc_writelock(dbc) DBC *dbc; { DB_LOCK tmp_lock; HASH_CURSOR *hcp; int ret; /* * All we need do is acquire the lock and let the off-page * dup tree do its thing. */ if (!STD_LOCKING(dbc)) return (0); hcp = (HASH_CURSOR *)dbc->internal; ret = 0; if ((!LOCK_ISSET(hcp->lock) || hcp->lock_mode != DB_LOCK_WRITE)) { tmp_lock = hcp->lock; if ((ret = __ham_lock_bucket(dbc, DB_LOCK_WRITE)) == 0 && tmp_lock.mode != DB_LOCK_WWRITE) ret = __LPUT(dbc, tmp_lock); } return (ret); }