/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996,2008 Oracle. All rights reserved.
 */
/*
 * Copyright (c) 1996
 * The President and Fellows of Harvard University. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $Id: txn_rec.c,v 12.29 2008/03/13 20:48:48 mbrey Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/lock.h"
#include "dbinc/txn.h"
#include "dbinc/db_am.h"

/*
 * __txn_regop_recover --
 *	Recovery function for a transaction commit record.
 *
 * PUBLIC: int __txn_regop_recover
 * PUBLIC:    __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 *
 * These records are only ever written for commits.  Normally, we redo any
 * committed transaction, however if we are doing recovery to a timestamp, then
 * we may treat transactions that committed after the timestamp as aborted.
 *
 * dbtp holds the raw log record, lsnp is the record's LSN on entry and is
 * set to the record's prev_lsn when the function ends with ret == 0, and
 * info is the transaction list head (DB_TXNHEAD).
 *
 * Returns 0 on success, EINVAL (after logging a message) when the
 * transaction list is in an unexpected state, or the error returned while
 * reading the record.
 */
int
__txn_regop_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__txn_regop_args *argp;
	DB_TXNHEAD *headp;
	int ret;
	u_int32_t status;

#ifdef DEBUG_RECOVER
	(void)__txn_regop_print(env, dbtp, lsnp, op, info);
#endif

	if ((ret = __txn_regop_read(env, dbtp->data, &argp)) != 0)
		return (ret);

	headp = info;
	/*
	 * We are only ever called during FORWARD_ROLL or BACKWARD_ROLL.
	 * We check for the former explicitly and the last two clauses
	 * apply to the BACKWARD_ROLL case.
	 */
	if (op == DB_TXN_FORWARD_ROLL) {
		/*
		 * If this was a 2-phase-commit transaction, then it
		 * might already have been removed from the list, and
		 * that's OK.  Ignore the return code from remove.
		 *
		 * NOTE(review): a DB_NOTFOUND result is not reset to 0
		 * here, so it is what this function returns and *lsnp is
		 * left untouched -- confirm callers expect that.
		 */
		if ((ret = __db_txnlist_remove(env,
		    info, argp->txnp->txnid)) != DB_NOTFOUND && ret != 0)
			goto err;
	} else if ((env->dbenv->tx_timestamp != 0 &&
	    argp->timestamp > (int32_t)env->dbenv->tx_timestamp) ||
	    (!IS_ZERO_LSN(headp->trunc_lsn) &&
	    LOG_COMPARE(&headp->trunc_lsn, lsnp) < 0)) {
		/*
		 * We failed either the timestamp check or the trunc_lsn check,
		 * so we treat this as an abort even if it was a commit record.
		 */
		if ((ret = __db_txnlist_update(env, info,
		    argp->txnp->txnid, TXN_ABORT, NULL, &status, 1)) != 0)
			goto err;
		else if (status != TXN_IGNORE && status != TXN_OK)
			goto err;
	} else {
		/* This is a normal commit; mark it appropriately. */
		if ((ret = __db_txnlist_update(env,
		    info, argp->txnp->txnid, argp->opcode, lsnp,
		    &status, 0)) == DB_NOTFOUND) {
			if ((ret = __db_txnlist_add(env,
			    info, argp->txnp->txnid,
			    argp->opcode == TXN_ABORT ?
			    TXN_IGNORE : argp->opcode, lsnp)) != 0)
				goto err;
		} else if (ret != 0 ||
		    (status != TXN_IGNORE && status != TXN_OK))
			goto err;
	}

	if (ret == 0)
		*lsnp = argp->prev_lsn;

	/* The error block is only reachable through the err label. */
	if (0) {
err:		__db_errx(env,
		    "txnid %lx commit record found, already on commit list",
		    (u_long)argp->txnp->txnid);
		ret = EINVAL;
	}
	__os_free(env, argp);

	return (ret);
}

/*
 * __txn_xa_regop_recover --
 *	Recovery function for a transaction prepare record.
 *
 * PUBLIC: int __txn_xa_regop_recover
 * PUBLIC:    __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 *
 * These records are only ever written for prepares.
 *
 * dbtp holds the raw log record, lsnp is the record's LSN on entry and is
 * set to the record's prev_lsn when the function ends with ret == 0, and
 * info is the transaction list head (DB_TXNHEAD).
 *
 * Returns 0 on success, EINVAL if the record's opcode is neither
 * TXN_PREPARE nor TXN_ABORT, DB_NOTFOUND (after logging a message) when the
 * transaction is unexpectedly missing from the list, or an error from the
 * list, lock or restore routines.
 */
int
__txn_xa_regop_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__txn_xa_regop_args *argp;
	DBT *lock_dbt;
	DB_TXNHEAD *headp;
	DB_LOCKTAB *lt;
	u_int32_t status;
	int ret;

#ifdef DEBUG_RECOVER
	(void)__txn_xa_regop_print(env, dbtp, lsnp, op, info);
#endif

	if ((ret = __txn_xa_regop_read(env, dbtp->data, &argp)) != 0)
		return (ret);

	if (argp->opcode != TXN_PREPARE && argp->opcode != TXN_ABORT) {
		ret = EINVAL;
		goto err;
	}
	headp = info;

	/*
	 * Look up the transaction's current status.  The return value of
	 * the lookup is not a result we want to hand back to our caller,
	 * so every case below overwrites ret before we return.  The status
	 * value cannot be relied on unless the lookup succeeded, so it is
	 * only examined when ret is 0.
	 */
	ret = __db_txnlist_find(env, info, argp->txnp->txnid, &status);

	/*
	 * If we are rolling forward, then an aborted prepare
	 * indicates that this may be the last record we'll see for
	 * this transaction ID, so we should remove it from the list.
	 */
	if (op == DB_TXN_FORWARD_ROLL) {
		if ((ret = __db_txnlist_remove(env,
		    info, argp->txnp->txnid)) != 0)
			goto txn_err;
	} else if (op == DB_TXN_BACKWARD_ROLL &&
	    ret == 0 && status == TXN_PREPARE) {
		/*
		 * On the backward pass, we have four possibilities:
		 * 1. The transaction is already committed, no-op.
		 * 2. The transaction is already aborted, no-op.
		 * 3. The prepare failed and was aborted, mark as abort.
		 * 4. The transaction is neither committed nor aborted.
		 *	Treat this like a commit and roll forward so that
		 *	the transaction can be resurrected in the region.
		 * We handle cases 3 and 4 here; cases 1 and 2
		 * are the final clause below.
		 */
		if (argp->opcode == TXN_ABORT) {
			if ((ret = __db_txnlist_update(env,
			    info, argp->txnp->txnid,
			    TXN_ABORT, NULL, &status, 0)) != 0 &&
			    status != TXN_PREPARE)
				goto txn_err;
			ret = 0;
		}
		/*
		 * This is prepared, but not yet committed transaction.  We
		 * need to add it to the transaction list, so that it gets
		 * rolled forward.  We also have to add it to the region's
		 * internal state so it can be properly aborted or committed
		 * after recovery (see txn_recover).
		 */
		else if ((ret = __db_txnlist_remove(env,
		    info, argp->txnp->txnid)) != 0) {
			/* Shared error exit for "not in list" failures. */
txn_err:		__db_errx(env,
			    "transaction not in list %lx",
			    (u_long)argp->txnp->txnid);
			ret = DB_NOTFOUND;
		} else if (IS_ZERO_LSN(headp->trunc_lsn) ||
		    LOG_COMPARE(&headp->trunc_lsn, lsnp) >= 0) {
			if ((ret = __db_txnlist_add(env, info,
			    argp->txnp->txnid, TXN_COMMIT, lsnp)) == 0) {
				/* Re-acquire the locks for this transaction. */
				lock_dbt = &argp->locks;
				if (LOCKING_ON(env)) {
					lt = env->lk_handle;
					if ((ret = __lock_getlocker(lt,
					    argp->txnp->txnid, 1,
					    &argp->txnp->locker)) != 0)
						goto err;
					if ((ret = __lock_get_list(env,
					    argp->txnp->locker, 0,
					    DB_LOCK_WRITE, lock_dbt)) != 0)
						goto err;
				}

				ret = __txn_restore_txn(env, lsnp, argp);
			}
		}
	} else
		ret = 0;

	if (ret == 0)
		*lsnp = argp->prev_lsn;

err:	__os_free(env, argp);

	return (ret);
}

/*
 * __txn_ckp_recover --
 *	Recovery function for a checkpoint record.
 *
 * PUBLIC: int __txn_ckp_recover
 * PUBLIC:    __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 *
 * On the backward pass the checkpoint's LSN is reported to the transaction
 * list.  In all cases *lsnp is set to the record's last_ckp and DB_TXN_CKP
 * is returned, unless the record cannot be read, in which case the read
 * error is returned.
 */
int
__txn_ckp_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__txn_ckp_args *argp;
	int ret;

#ifdef DEBUG_RECOVER
	(void)__txn_ckp_print(env, dbtp, lsnp, op, info);
#endif
	if ((ret = __txn_ckp_read(env, dbtp->data, &argp)) != 0)
		return (ret);

	if (op == DB_TXN_BACKWARD_ROLL)
		__db_txnlist_ckp(env, info, lsnp);

	*lsnp = argp->last_ckp;
	__os_free(env, argp);
	return (DB_TXN_CKP);
}

/*
 * __txn_child_recover
 *	Recover a commit record for a child transaction.
 *
 * PUBLIC: int __txn_child_recover
 * PUBLIC:    __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 *
 * dbtp holds the raw log record, lsnp is updated as described below, and
 * info is the transaction list.  Returns 0 on success or the error from
 * the failing transaction-list call.
 */
int
__txn_child_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__txn_child_args *argp;
	u_int32_t c_stat, p_stat, tmpstat;
	int ret, t_ret;

#ifdef DEBUG_RECOVER
	(void)__txn_child_print(env, dbtp, lsnp, op, info);
#endif
	if ((ret = __txn_child_read(env, dbtp->data, &argp)) != 0)
		return (ret);

	/*
	 * This is a record in a PARENT's log trail indicating that a
	 * child committed.  If we are aborting, return the child's last
	 * record's LSN.  If we are in recovery, then if the
	 * parent is committing, we set ourselves up to commit, else
	 * we do nothing.
	 */
	if (op == DB_TXN_ABORT) {
		*lsnp = argp->c_lsn;
		ret = __db_txnlist_lsnadd(env, info, &argp->prev_lsn);
		goto out;
	} else if (op == DB_TXN_BACKWARD_ROLL) {
		/* Child might exist -- look for it. */
		ret = __db_txnlist_find(env, info, argp->child, &c_stat);
		t_ret =
		    __db_txnlist_find(env, info, argp->txnp->txnid, &p_stat);
		if (ret != 0 && ret != DB_NOTFOUND)
			goto out;
		if (t_ret != 0 && t_ret != DB_NOTFOUND) {
			ret = t_ret;
			goto out;
		}

		/*
		 * When the parent lookup reported DB_NOTFOUND we cannot
		 * rely on p_stat having been written, so give it a definite
		 * value.  A missing parent is handled as an aborted one,
		 * which is what the first case below already does.
		 */
		if (t_ret == DB_NOTFOUND)
			p_stat = TXN_ABORT;

		/*
		 * If the parent is in state COMMIT or IGNORE, then we apply
		 * that to the child, else we need to abort the child.
		 */
		if (ret == DB_NOTFOUND ||
		    c_stat == TXN_OK || c_stat == TXN_COMMIT) {
			if (t_ret == DB_NOTFOUND ||
			    (p_stat != TXN_COMMIT && p_stat != TXN_IGNORE))
				c_stat = TXN_ABORT;
			else
				c_stat = p_stat;

			if (ret == DB_NOTFOUND)
				ret = __db_txnlist_add(env,
				    info, argp->child, c_stat, NULL);
			else
				ret = __db_txnlist_update(env, info,
				    argp->child, c_stat, NULL, &tmpstat, 0);
		} else if (c_stat == TXN_EXPECTED) {
			/*
			 * The open after this create succeeded.  If the
			 * parent succeeded, we don't want to redo; if the
			 * parent aborted, we do want to undo.
			 */
			switch (p_stat) {
			case TXN_COMMIT:
			case TXN_IGNORE:
				c_stat = TXN_IGNORE;
				break;
			default:
				c_stat = TXN_ABORT;
			}
			ret = __db_txnlist_update(env,
			    info, argp->child, c_stat, NULL, &tmpstat, 0);
		} else if (c_stat == TXN_UNEXPECTED) {
			/*
			 * The open after this create failed.  If the parent
			 * is rolling forward, we need to roll forward.  If
			 * the parent failed, then we do not want to abort
			 * (because the file may not be the one in which we
			 * are interested).
			 */
			ret = __db_txnlist_update(env, info, argp->child,
			    p_stat == TXN_COMMIT ? TXN_COMMIT : TXN_IGNORE,
			    NULL, &tmpstat, 0);
		}
	} else if (op == DB_TXN_OPENFILES) {
		/*
		 * If we have a partial subtransaction, then the whole
		 * transaction should be ignored.
		 */
		if ((ret = __db_txnlist_find(env,
		    info, argp->child, &c_stat)) == DB_NOTFOUND)
			ret = __db_txnlist_update(env, info,
			    argp->txnp->txnid, TXN_IGNORE,
			    NULL, &p_stat, 1);
	} else if (DB_REDO(op)) {
		/* Forward Roll */
		if ((ret =
		    __db_txnlist_remove(env, info, argp->child)) != 0)
			__db_errx(env,
			    "Transaction not in list %x", argp->child);
	}

	if (ret == 0)
		*lsnp = argp->prev_lsn;

out:	__os_free(env, argp);

	return (ret);
}

/*
 * __txn_restore_txn --
 *	Used only during XA recovery.  If we find any transactions that are
 * prepared, but not yet committed, then we need to restore the transaction's
 * state into the shared region, because the TM is going to issue an abort
 * or commit and we need to respond correctly.
 *
 * lsnp is stored as the restored transaction's last LSN (the prepare
 *	recovery routine passes the LSN of the prepare record)
 * argp is the prepare record (in an appropriate structure)
 *
 * Returns 0 on success (including when the record carries no XID, in which
 * case nothing is restored), EINVAL if the record's XID does not fit in the
 * transaction detail structure, or the error from the region allocator.
 *
 * PUBLIC: int __txn_restore_txn __P((ENV *, DB_LSN *, __txn_xa_regop_args *));
 */
int
__txn_restore_txn(env, lsnp, argp)
	ENV *env;
	DB_LSN *lsnp;
	__txn_xa_regop_args *argp;
{
	DB_TXNMGR *mgr;
	DB_TXNREGION *region;
	TXN_DETAIL *td;
	int ret;

	if (argp->xid.size == 0)
		return (0);

	/*
	 * The XID length comes from the log record; refuse anything that
	 * would overrun the destination before we touch the region.
	 */
	if (argp->xid.size > sizeof(td->xid)) {
		__db_errx(env,
		    "prepare record XID is too large: %lu bytes",
		    (u_long)argp->xid.size);
		return (EINVAL);
	}

	mgr = env->tx_handle;
	region = mgr->reginfo.primary;
	TXN_SYSTEM_LOCK(env);

	/* Allocate a new transaction detail structure. */
	if ((ret = __env_alloc(&mgr->reginfo, sizeof(TXN_DETAIL), &td)) != 0) {
		TXN_SYSTEM_UNLOCK(env);
		return (ret);
	}

	/* Place transaction on active transaction list. */
	SH_TAILQ_INSERT_HEAD(&region->active_txn, td, links, __txn_detail);

	td->txnid = argp->txnp->txnid;
	__os_id(env->dbenv, &td->pid, &td->tid);
	td->last_lsn = *lsnp;
	td->begin_lsn = argp->begin_lsn;
	td->parent = INVALID_ROFF;
	td->name = INVALID_ROFF;
	SH_TAILQ_INIT(&td->kids);
	MAX_LSN(td->read_lsn);
	MAX_LSN(td->visible_lsn);
	td->mvcc_ref = 0;
	td->mvcc_mtx = MUTEX_INVALID;
	td->status = TXN_PREPARED;
	td->flags = TXN_DTL_RESTORED;
	td->xa_status = TXN_XA_PREPARED;
	memcpy(td->xid, argp->xid.data, argp->xid.size);
	td->bqual = argp->bqual;
	td->gtrid = argp->gtrid;
	td->format = argp->formatID;
	td->nlog_dbs = 0;
	td->nlog_slots = TXN_NSLOTS;
	td->log_dbs = R_OFFSET(&mgr->reginfo, td->slots);

	/* The restore count is maintained even without HAVE_STATISTICS. */
	region->stat.st_nrestores++;
#ifdef HAVE_STATISTICS
	region->stat.st_nactive++;
	if (region->stat.st_nactive > region->stat.st_maxnactive)
		region->stat.st_maxnactive = region->stat.st_nactive;
#endif
	TXN_SYSTEM_UNLOCK(env);
	return (0);
}

/*
 * __txn_recycle_recover --
 *	Recovery function for recycle.
 *
 * Passes the record's min/max transaction ID range to the transaction
 * list's generation handling, with -1 when undoing and 1 otherwise.
 * lsnp is not used.  Returns 0 on success or the error from reading the
 * record or from the transaction list.
 *
 * PUBLIC: int __txn_recycle_recover
 * PUBLIC:    __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 */
int
__txn_recycle_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__txn_recycle_args *argp;
	int ret;

#ifdef DEBUG_RECOVER
	(void)__txn_recycle_print(env, dbtp, lsnp, op, info);
#endif
	if ((ret = __txn_recycle_read(env, dbtp->data, &argp)) != 0)
		return (ret);

	COMPQUIET(lsnp, NULL);

	ret = __db_txnlist_gen(env, info,
	    DB_UNDO(op) ? -1 : 1, argp->min, argp->max);

	/* Release the decoded record whether or not the call succeeded. */
	__os_free(env, argp);

	return (ret);
}

/*
 * __txn_regop_42_recover --
 *	Recovery function for a commit record in the "42" record layout.
 *
 * PUBLIC: int __txn_regop_42_recover
 * PUBLIC:    __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 *
 * These records are only ever written for commits.  Normally, we redo any
 * committed transaction, however if we are doing recovery to a timestamp, then
 * we may treat transactions that committed after the timestamp as aborted.
 *
 * The logic mirrors __txn_regop_recover; only the record type and its
 * read/print routines differ.  Returns 0 on success, EINVAL (after logging
 * a message) when the transaction list is in an unexpected state, or the
 * error returned while reading the record.
 */
int
__txn_regop_42_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__txn_regop_42_args *argp;
	DB_TXNHEAD *headp;
	u_int32_t status;
	int ret;

#ifdef DEBUG_RECOVER
	(void)__txn_regop_42_print(env, dbtp, lsnp, op, info);
#endif

	if ((ret = __txn_regop_42_read(env, dbtp->data, &argp)) != 0)
		return (ret);

	headp = info;
	/*
	 * We are only ever called during FORWARD_ROLL or BACKWARD_ROLL.
	 * We check for the former explicitly and the last two clauses
	 * apply to the BACKWARD_ROLL case.
	 */
	if (op == DB_TXN_FORWARD_ROLL) {
		/*
		 * If this was a 2-phase-commit transaction, then it
		 * might already have been removed from the list, and
		 * that's OK.  Ignore the return code from remove.
		 *
		 * NOTE(review): a DB_NOTFOUND result is not reset to 0
		 * here, so it is what this function returns and *lsnp is
		 * left untouched -- confirm callers expect that.
		 */
		if ((ret = __db_txnlist_remove(env,
		    info, argp->txnp->txnid)) != DB_NOTFOUND && ret != 0)
			goto err;
	} else if ((env->dbenv->tx_timestamp != 0 &&
	    argp->timestamp > (int32_t)env->dbenv->tx_timestamp) ||
	    (!IS_ZERO_LSN(headp->trunc_lsn) &&
	    LOG_COMPARE(&headp->trunc_lsn, lsnp) < 0)) {
		/*
		 * We failed either the timestamp check or the trunc_lsn check,
		 * so we treat this as an abort even if it was a commit record.
		 */
		if ((ret = __db_txnlist_update(env, info,
		    argp->txnp->txnid, TXN_ABORT, NULL, &status, 1)) != 0)
			goto err;
		else if (status != TXN_IGNORE && status != TXN_OK)
			goto err;
	} else {
		/* This is a normal commit; mark it appropriately. */
		if ((ret = __db_txnlist_update(env,
		    info, argp->txnp->txnid, argp->opcode, lsnp,
		    &status, 0)) == DB_NOTFOUND) {
			if ((ret = __db_txnlist_add(env,
			    info, argp->txnp->txnid,
			    argp->opcode == TXN_ABORT ?
			    TXN_IGNORE : argp->opcode, lsnp)) != 0)
				goto err;
		} else if (ret != 0 ||
		    (status != TXN_IGNORE && status != TXN_OK))
			goto err;
	}

	if (ret == 0)
		*lsnp = argp->prev_lsn;

	/* The error block is only reachable through the err label. */
	if (0) {
err:		__db_errx(env,
		    "txnid %lx commit record found, already on commit list",
		    (u_long)argp->txnp->txnid);
		ret = EINVAL;
	}
	__os_free(env, argp);

	return (ret);
}

/*
 * __txn_ckp_42_recover --
 *	Recovery function for a checkpoint record in the "42" record layout.
 *
 * PUBLIC: int __txn_ckp_42_recover
 * PUBLIC:    __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 *
 * On the backward pass the checkpoint's LSN is reported to the transaction
 * list.  In all cases *lsnp is set to the record's last_ckp and DB_TXN_CKP
 * is returned, unless the record cannot be read, in which case the read
 * error is returned.
 */
int
__txn_ckp_42_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__txn_ckp_42_args *argp;
	int ret;

#ifdef DEBUG_RECOVER
	(void)__txn_ckp_42_print(env, dbtp, lsnp, op, info);
#endif
	if ((ret = __txn_ckp_42_read(env, dbtp->data, &argp)) != 0)
		return (ret);

	if (op == DB_TXN_BACKWARD_ROLL)
		__db_txnlist_ckp(env, info, lsnp);

	*lsnp = argp->last_ckp;
	__os_free(env, argp);
	return (DB_TXN_CKP);
}