/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2004,2008 Oracle.  All rights reserved.
 *
 * $Id: rep_verify.c,v 12.69 2008/03/13 16:21:05 mbrey Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/db_am.h"
#include "dbinc/log.h"
#include "dbinc/txn.h"

static int __rep_dorecovery __P((ENV *, DB_LSN *, DB_LSN *));

/*
 * __rep_verify --
 *	Handle a REP_VERIFY message.
 *
 *	The record carried in the message (rec) is compared byte-for-byte
 *	with the local log record fetched through a log cursor.  On a match
 *	the work is handed to __rep_verify_match.  On a mismatch we back up
 *	to the previous identification record and ask for that one to be
 *	verified instead; if there is nothing left to back up to, we either
 *	fail the join (REP_C_NOAUTOINIT) or switch to the UPDATE state and
 *	send a REP_UPDATE_REQ.
 *
 *	env:      environment handle.
 *	rp:       control structure of the incoming message; rp->lsn is the
 *	          LSN handed to the log cursor.
 *	rec:      log record sent by the remote site.
 *	eid:      environment id that replies are sent to.
 *	savetime: timestamp passed through to __rep_verify_match.
 *
 *	Returns 0 when the message was handled or ignored,
 *	DB_REP_JOIN_FAILURE when auto-init is disabled and no matching
 *	record can be found, or the error from the failing log/cursor call.
 *
 * PUBLIC: int __rep_verify __P((ENV *, __rep_control_args *, DBT *,
 * PUBLIC:     int, time_t));
 */
int
__rep_verify(env, rp, rec, eid, savetime)
	ENV *env;
	__rep_control_args *rp;
	DBT *rec;
	int eid;
	time_t savetime;
{
	DBT mylog;
	DB_LOG *dblp;
	DB_LOGC *logc;
	DB_LSN lsn;
	DB_REP *db_rep;
	LOG *lp;
	REP *rep;
	u_int32_t logflag;
	int match, ret, t_ret;

	ret = 0;
	db_rep = env->rep_handle;
	rep = db_rep->region;
	dblp = env->lg_handle;
	lp = dblp->reginfo.primary;

	/* Do nothing if VERIFY flag is not set. */
	if (!F_ISSET(rep, REP_F_RECOVER_VERIFY))
		return (ret);

#ifdef DIAGNOSTIC
	/*
	 * We should not ever be in internal init with a lease granted.
	 */
	if (IS_USING_LEASES(env)) {
		REP_SYSTEM_LOCK(env);
		DB_ASSERT(env, __rep_islease_granted(env) == 0);
		REP_SYSTEM_UNLOCK(env);
	}
#endif

	if ((ret = __log_cursor(env, &logc)) != 0)
		return (ret);
	memset(&mylog, 0, sizeof(mylog));

	/* If verify_lsn of ZERO is passed in, get last log. */
	MUTEX_LOCK(env, rep->mtx_clientdb);
	logflag = IS_ZERO_LSN(lp->verify_lsn) ? DB_LAST : DB_SET;
	MUTEX_UNLOCK(env, rep->mtx_clientdb);
	if ((ret = __logc_get(logc, &rp->lsn, &mylog, logflag)) != 0)
		goto err;

	/* The records match only if both length and contents are equal. */
	match = 0;
	if (mylog.size == rec->size &&
	    memcmp(mylog.data, rec->data, rec->size) == 0)
		match = 1;

	/*
	 * If we don't have a match, backup to the previous
	 * identification record and try again.
	 */
	if (match == 0) {
		ZERO_LSN(lsn);
		if ((ret = __rep_log_backup(env, rep, logc, &lsn)) == 0) {
			/*
			 * Remember which LSN we are now verifying and reset
			 * the re-request timer before asking for it.
			 */
			MUTEX_LOCK(env, rep->mtx_clientdb);
			lp->verify_lsn = lsn;
			__os_gettime(env, &lp->rcvd_ts, 1);
			lp->wait_ts = rep->request_gap;
			MUTEX_UNLOCK(env, rep->mtx_clientdb);
			(void)__rep_send_message(env, eid, REP_VERIFY_REQ,
			    &lsn, NULL, 0, DB_REP_ANYWHERE);
		} else if (ret == DB_NOTFOUND) {
			/*
			 * We've either run out of records because
			 * logs have been removed or we've rolled back
			 * all the way to the beginning.
			 */
			STAT(rep->stat.st_outdated++);

			REP_SYSTEM_LOCK(env);
			if (FLD_ISSET(rep->config, REP_C_NOAUTOINIT))
				ret = DB_REP_JOIN_FAILURE;
			else {
				/* Move from the VERIFY to the UPDATE state. */
				F_CLR(rep, REP_F_RECOVER_VERIFY);
				F_SET(rep, REP_F_RECOVER_UPDATE);
				ZERO_LSN(rep->first_lsn);
				ZERO_LSN(rep->ckp_lsn);
				ret = 0;
			}
			REP_SYSTEM_UNLOCK(env);
			if (ret == 0)
				(void)__rep_send_message(env,
				    eid, REP_UPDATE_REQ, NULL, NULL, 0, 0);
		}
		/* Any other error from the backup is returned as is. */
	} else
		ret = __rep_verify_match(env, &rp->lsn, savetime);

	/* Always close the cursor; keep the first error seen. */
err:	if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}

/*
 * __rep_verify_fail --
 *	Handle a REP_VERIFY_FAIL message.
 *
 *	Depending on the current recovery state and on whether the failing
 *	LSN (rp->lsn) is one we care about, this either cleans up an
 *	internal init that is in progress and/or starts a new one by
 *	sending REP_UPDATE_REQ to eid, or ignores the message.
 *
 *	Returns 0 when the message was handled or ignored,
 *	DB_REP_JOIN_FAILURE when an internal init would be needed but
 *	REP_C_NOAUTOINIT is configured, or the error returned by
 *	__rep_lockout_msg / __rep_init_cleanup.
 *
 * PUBLIC: int __rep_verify_fail __P((ENV *, __rep_control_args *, int));
 */
int
__rep_verify_fail(env, rp, eid)
	ENV *env;
	__rep_control_args *rp;
	int eid;
{
	DB_LOG *dblp;
	DB_REP *db_rep;
	LOG *lp;
	REP *rep;
	int lockout, ret;

	/* lockout tracks whether we currently own the message lockout. */
	lockout = 0;
	ret = 0;
	db_rep = env->rep_handle;
	rep = db_rep->region;
	dblp = env->lg_handle;
	lp = dblp->reginfo.primary;

	/*
	 * If any recovery flags are set, but not LOG or VERIFY,
	 * then we ignore this message.  We are already
	 * in the middle of updating.
	 */
	if (F_ISSET(rep, REP_F_RECOVER_MASK) &&
	    !F_ISSET(rep, REP_F_RECOVER_LOG | REP_F_RECOVER_VERIFY))
		return (0);

	/* Lock order: client database mutex first, then the rep region. */
	MUTEX_LOCK(env, rep->mtx_clientdb);
	REP_SYSTEM_LOCK(env);
	/*
	 * We should not ever be in internal init with a lease granted.
	 */
	DB_ASSERT(env,
	    !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0);

	/*
	 * Update stats.
	 */
	STAT(rep->stat.st_outdated++);

	/*
	 * Clean up old internal init in progress if:
	 *  REP_C_NOAUTOINIT is not configured and
	 *  we are recovering LOG and this LSN is in the range we need.
	 */
	if (!FLD_ISSET(rep->config, REP_C_NOAUTOINIT) &&
	    (F_ISSET(rep, REP_F_RECOVER_LOG) &&
	    LOG_COMPARE(&rep->first_lsn, &rp->lsn) <= 0 &&
	    LOG_COMPARE(&rep->last_lsn, &rp->lsn) >= 0)) {
		/*
		 * Already locking out messages, give up.
		 */
		if (F_ISSET(rep, REP_F_READY_MSG))
			goto unlock;

		/*
		 * Lock out other messages to prevent race conditions.
		 */
		if ((ret = __rep_lockout_msg(env, rep, 1)) != 0)
			goto unlock;
		lockout = 1;

		/*
		 * Clean up internal init if one was in progress.
		 */
		if (F_ISSET(rep, REP_F_READY_API | REP_F_READY_OP)) {
			RPRINT(env, DB_VERB_REP_SYNC, (env,
			    "VERIFY_FAIL is cleaning up old internal init for missing log"));
			if ((ret =
			    __rep_init_cleanup(env, rep, DB_FORCE)) != 0) {
				RPRINT(env, DB_VERB_REP_SYNC, (env,
				    "VERIFY_FAIL error cleaning up internal init for missing log: %d",
				    ret));
				goto msglck;
			}
			F_CLR(rep, REP_F_RECOVER_MASK);
		}
		/* Cleanup is done; let other message threads back in. */
		F_CLR(rep, REP_F_READY_MSG);
		lockout = 0;
	}

	/*
	 * Commence an internal init if:
	 *  We are in VERIFY state and the failing LSN is the one we
	 *	were verifying or
	 *  we're recovering LOG and this LSN is in the range we need or
	 *  we are in normal state (no recovery flags set) and
	 *	the failing LSN is the one we're ready for.
	 */
	if (((F_ISSET(rep, REP_F_RECOVER_VERIFY)) &&
	    LOG_COMPARE(&rp->lsn, &lp->verify_lsn) == 0) ||
	    (F_ISSET(rep, REP_F_RECOVER_LOG) &&
	    LOG_COMPARE(&rep->first_lsn, &rp->lsn) <= 0 &&
	    LOG_COMPARE(&rep->last_lsn, &rp->lsn) >= 0) ||
	    (F_ISSET(rep, REP_F_RECOVER_MASK) == 0 &&
	    LOG_COMPARE(&rp->lsn, &lp->ready_lsn) >= 0)) {
		/*
		 * We don't want an old or delayed VERIFY_FAIL
		 * message to throw us into internal initialization
		 * when we shouldn't be.  If REP_C_NOAUTOINIT is configured,
		 * return DB_REP_JOIN_FAILURE instead of doing internal init.
		 */
		if (FLD_ISSET(rep->config, REP_C_NOAUTOINIT)) {
			ret = DB_REP_JOIN_FAILURE;
			goto unlock;
		}

		/*
		 * Do the internal init.
		 */
		F_CLR(rep, REP_F_RECOVER_VERIFY);
		F_SET(rep, REP_F_RECOVER_UPDATE);
		ZERO_LSN(rep->first_lsn);
		ZERO_LSN(rep->ckp_lsn);
		lp->wait_ts = rep->request_gap;
		/* Drop both locks before sending the request. */
		REP_SYSTEM_UNLOCK(env);
		MUTEX_UNLOCK(env, rep->mtx_clientdb);
		(void)__rep_send_message(env,
		    eid, REP_UPDATE_REQ, NULL, NULL, 0, 0);
	} else {
		/*
		 * Otherwise ignore this message.
		 *
		 * The labels below are also the common exit for the error
		 * paths above: msglck releases the message lockout if we
		 * still hold it, unlock releases both locks.
		 */
msglck:		if (lockout)
			F_CLR(rep, REP_F_READY_MSG);
unlock:		REP_SYSTEM_UNLOCK(env);
		MUTEX_UNLOCK(env, rep->mtx_clientdb);
	}
	return (ret);
}

/*
 * __rep_verify_req --
 *	Handle a REP_VERIFY_REQ message.
 *
 *	Looks up the log record at rp->lsn and sends it back to eid in a
 *	REP_VERIFY message.  If the lookup fails, the reply carries no
 *	record; if the failure is DB_NOTFOUND and the log file is outdated,
 *	the reply type is REP_VERIFY_FAIL instead.
 *
 *	Returns DB_NOTFOUND (without replying) when this site is a client
 *	and does not have the record, the error from __log_cursor if the
 *	cursor cannot be opened, and otherwise the result of closing the
 *	cursor.
 *
 * PUBLIC: int __rep_verify_req __P((ENV *, __rep_control_args *, int));
 */
int
__rep_verify_req(env, rp, eid)
	ENV *env;
	__rep_control_args *rp;
	int eid;
{
	DBT *d, data_dbt;
	DB_LOGC *logc;
	DB_REP *db_rep;
	REP *rep;
	u_int32_t type;
	int old, ret;

	ret = 0;
	db_rep = env->rep_handle;
	rep = db_rep->region;

	type = REP_VERIFY;
	if ((ret = __log_cursor(env, &logc)) != 0)
		return (ret);
	d = &data_dbt;
	memset(d, 0, sizeof(data_dbt));
	F_SET(logc, DB_LOG_SILENT_ERR);
	ret = __logc_get(logc, &rp->lsn, d, DB_SET);
	/*
	 * If the LSN was invalid, then we might get a DB_NOTFOUND
	 * we might get an EIO, we could get anything.
	 * If we get a DB_NOTFOUND, then there is a chance that
	 * the LSN comes before the first file present in which
	 * case we need to return a fail so that the client can
	 * perform an internal init or return a REP_JOIN_FAILURE.
	 *
	 * If we're a client servicing this request and we get a
	 * NOTFOUND, return it so the caller can rerequest from
	 * a better source.
	 */
	if (ret == DB_NOTFOUND) {
		if (F_ISSET(rep, REP_F_CLIENT)) {
			(void)__logc_close(logc);
			return (DB_NOTFOUND);
		}
		if (__log_is_outdated(env, rp->lsn.file, &old) == 0 &&
		    old != 0)
			type = REP_VERIFY_FAIL;
	}

	/* On any lookup failure the reply is sent without a record. */
	if (ret != 0)
		d = NULL;

	(void)__rep_send_message(env, eid, type, &rp->lsn, d, 0, 0);
	return (__logc_close(logc));
}

/*
 * __rep_dorecovery --
 *	Roll the local log back to lsnp.
 *
 *	Walks the log backward from the end to lsnp to decide whether full
 *	recovery (__db_apprec) is needed or whether the log can simply be
 *	truncated (__log_vtruncate).  The resulting end-of-log position is
 *	returned through trunclsnp.  On success DBREP_OPENFILES is set on
 *	the replication handle.
 *
 *	Returns 0 on success or the first error encountered.
 */
static int
__rep_dorecovery(env, lsnp, trunclsnp)
	ENV *env;
	DB_LSN *lsnp, *trunclsnp;
{
	DBT mylog;
	DB_LOGC *logc;
	DB_LSN last_ckp, lsn;
	DB_REP *db_rep;
	DB_THREAD_INFO *ip;
	REP *rep;
	int ret, skip_rec, t_ret, update;
	u_int32_t rectype, opcode;
	__txn_regop_args *txnrec;
	__txn_regop_42_args *txn42rec;

	db_rep = env->rep_handle;
	rep = db_rep->region;
	ENV_GET_THREAD_INFO(env, ip);

	/* Figure out if we are backing out any committed transactions. */
	if ((ret = __log_cursor(env, &logc)) != 0)
		return (ret);

	memset(&mylog, 0, sizeof(mylog));
	if (F_ISSET(rep, REP_F_RECOVER_LOG)) {
		/*
		 * Internal init can never skip recovery.
		 * Internal init must always update the timestamp and
		 * force dead handles.
		 */
		skip_rec = 0;
		update = 1;
	} else {
		skip_rec = 1;
		update = 0;
	}
	/*
	 * The scan stops as soon as update is set, the cursor fails, or we
	 * reach lsnp.  When update is already 1 the loop is not entered and
	 * ret is still 0 from the successful __log_cursor call.
	 */
	while (update == 0 &&
	    (ret = __logc_get(logc, &lsn, &mylog, DB_PREV)) == 0 &&
	    LOG_COMPARE(&lsn, lsnp) > 0) {
		LOGCOPY_32(env, &rectype, mylog.data);
		/*
		 * Find out if we can skip recovery completely.  If we
		 * are backing up over any record a client usually
		 * cares about, we must run recovery.
		 *
		 * Skipping sync-up recovery can be pretty scary!
		 * Here's why we can do it:
		 * If a master downgraded to client and is now running
		 * sync-up to a new master, that old master must have
		 * waited for any outstanding txns to resolve before
		 * becoming a client.  Also we are in lockout so there
		 * can be no other operations right now.
		 *
		 * If the client wrote a commit record to the log, but
		 * was descheduled before processing the txn, and then
		 * a new master was found, we must've let the txn get
		 * processed because right now we are the only message
		 * thread allowed to be running.
		 */
		DB_ASSERT(env, rep->op_cnt == 0);
		DB_ASSERT(env, rep->msg_th == 1);
		if (rectype == DB___txn_regop || rectype == DB___txn_ckp ||
		    rectype == DB___dbreg_register)
			skip_rec = 0;
		if (rectype == DB___txn_regop) {
			/*
			 * Pick the record reader that matches the
			 * replication version in use.
			 */
			if (rep->version >= DB_REPVERSION_44) {
				if ((ret = __txn_regop_read(
				    env, mylog.data, &txnrec)) != 0)
					goto err;
				opcode = txnrec->opcode;
				__os_free(env, txnrec);
			} else {
				if ((ret = __txn_regop_42_read(
				    env, mylog.data, &txn42rec)) != 0)
					goto err;
				opcode = txn42rec->opcode;
				__os_free(env, txn42rec);
			}
			/* Anything other than an abort forces an update. */
			if (opcode != TXN_ABORT)
				update = 1;
		}
	}
	/*
	 * Handle if the logc_get fails.
	 */
	if (ret != 0)
		goto err;

	/*
	 * If we successfully run recovery, we've opened all the necessary
	 * files.  We are guaranteed to be single-threaded here, so no mutex
	 * is necessary.
	 */
	if (skip_rec) {
		/* A missing stable LSN is not an error; use a zero LSN. */
		if ((ret = __log_get_stable_lsn(env, &last_ckp)) != 0) {
			if (ret != DB_NOTFOUND)
				goto err;
			ZERO_LSN(last_ckp);
		}
		RPRINT(env, DB_VERB_REP_SYNC, (env,
		    "Skip sync-up rec. Truncate log to [%lu][%lu], ckp [%lu][%lu]",
		    (u_long)lsnp->file, (u_long)lsnp->offset,
		    (u_long)last_ckp.file, (u_long)last_ckp.offset));
		ret = __log_vtruncate(env, lsnp, &last_ckp, trunclsnp);
	} else
		ret = __db_apprec(env, ip, lsnp, trunclsnp, update, 0);

	if (ret != 0)
		goto err;
	F_SET(db_rep, DBREP_OPENFILES);

	/* Always close the cursor; keep the first error seen. */
err:	if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}

/*
 * __rep_verify_match --
 *	We have just received a matching log record during verification.
 * Figure out if we're going to need to run recovery. If so, wait until
 * everything else has exited the library.  If not, set up the world
 * correctly and move forward.
* * PUBLIC: int __rep_verify_match __P((ENV *, DB_LSN *, time_t)); */ int __rep_verify_match(env, reclsnp, savetime) ENV *env; DB_LSN *reclsnp; time_t savetime; { DB_LOG *dblp; DB_LSN trunclsn; DB_REP *db_rep; DB_THREAD_INFO *ip; LOG *lp; REGENV *renv; REGINFO *infop; REP *rep; int done, master, ret; u_int32_t unused; dblp = env->lg_handle; db_rep = env->rep_handle; rep = db_rep->region; lp = dblp->reginfo.primary; ret = 0; infop = env->reginfo; renv = infop->primary; ENV_GET_THREAD_INFO(env, ip); /* * Check if the savetime is different than our current time stamp. * If it is, then we're racing with another thread trying to recover * and we lost. We must give up. */ MUTEX_LOCK(env, rep->mtx_clientdb); done = savetime != renv->rep_timestamp; if (done) { MUTEX_UNLOCK(env, rep->mtx_clientdb); return (0); } ZERO_LSN(lp->verify_lsn); MUTEX_UNLOCK(env, rep->mtx_clientdb); /* * Make sure the world hasn't changed while we tried to get * the lock. If it hasn't then it's time for us to kick all * operations out of DB and run recovery. */ REP_SYSTEM_LOCK(env); if (F_ISSET(rep, REP_F_READY_MSG) || (!F_ISSET(rep, REP_F_RECOVER_LOG) && F_ISSET(rep, REP_F_READY_API | REP_F_READY_OP))) { /* * We lost. The world changed and we should do nothing. */ STAT(rep->stat.st_msgs_recover++); goto errunlock; } /* * Lockout all message threads but ourselves. */ if ((ret = __rep_lockout_msg(env, rep, 1)) != 0) goto errunlock; /* * Lockout the API and wait for operations to complete. */ if ((ret = __rep_lockout_api(env, rep)) != 0) goto errunlock; /* OK, everyone is out, we can now run recovery. */ REP_SYSTEM_UNLOCK(env); if ((ret = __rep_dorecovery(env, reclsnp, &trunclsn)) != 0 || (ret = __rep_remove_init_file(env)) != 0) { REP_SYSTEM_LOCK(env); F_CLR(rep, REP_F_READY_API | REP_F_READY_MSG | REP_F_READY_OP); goto errunlock; } /* * The log has been truncated (either directly by us or by __db_apprec) * We want to make sure we're waiting for the LSN at the new end-of-log, * not some later point. 
*/ MUTEX_LOCK(env, rep->mtx_clientdb); lp->ready_lsn = trunclsn; ZERO_LSN(lp->waiting_lsn); ZERO_LSN(lp->max_wait_lsn); lp->max_perm_lsn = *reclsnp; lp->wait_ts = rep->request_gap; __os_gettime(env, &lp->rcvd_ts, 1); ZERO_LSN(lp->verify_lsn); /* * Discard any log records we have queued; we're about to re-request * them, and can't trust the ones in the queue. We need to set the * DB_AM_RECOVER bit in this handle, so that the operation doesn't * deadlock. */ if (db_rep->rep_db == NULL && (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) { MUTEX_UNLOCK(env, rep->mtx_clientdb); goto out; } F_SET(db_rep->rep_db, DB_AM_RECOVER); MUTEX_UNLOCK(env, rep->mtx_clientdb); ret = __db_truncate(db_rep->rep_db, ip, NULL, &unused); MUTEX_LOCK(env, rep->mtx_clientdb); F_CLR(db_rep->rep_db, DB_AM_RECOVER); REP_SYSTEM_LOCK(env); rep->stat.st_log_queued = 0; F_CLR(rep, REP_F_NOARCHIVE | REP_F_RECOVER_MASK | REP_F_READY_MSG); if (ret != 0) goto errunlock2; /* * If the master_id is invalid, this means that since * the last record was sent, something happened to the * master and we may not have a master to request * things of. * * This is not an error; when we find a new master, * we'll re-negotiate where the end of the log is and * try to bring ourselves up to date again anyway. */ master = rep->master_id; REP_SYSTEM_UNLOCK(env); if (master == DB_EID_INVALID) { MUTEX_UNLOCK(env, rep->mtx_clientdb); ret = 0; } else { /* * We're making an ALL_REQ. But now that we've * cleared the flags, we're likely receiving new * log records from the master, resulting in a gap * immediately. So to avoid multiple data streams, * set the wait_ts value high now to give the master * a chance to start sending us these records before * the gap code re-requests the same gap. Wait_recs * will get reset once we start receiving these * records. 
*/ lp->wait_ts = rep->max_gap; MUTEX_UNLOCK(env, rep->mtx_clientdb); (void)__rep_send_message(env, master, REP_ALL_REQ, reclsnp, NULL, 0, DB_REP_ANYWHERE); } if (0) { errunlock2: MUTEX_UNLOCK(env, rep->mtx_clientdb); errunlock: REP_SYSTEM_UNLOCK(env); } out: return (ret); } /* * __rep_log_backup -- * * In the verify handshake, we walk backward looking for * identification records. Those are the only record types * we verify and match on. * * PUBLIC: int __rep_log_backup __P((ENV *, REP *, DB_LOGC *, DB_LSN *)); */ int __rep_log_backup(env, rep, logc, lsn) ENV *env; REP *rep; DB_LOGC *logc; DB_LSN *lsn; { DBT mylog; u_int32_t rectype; int ret; ret = 0; memset(&mylog, 0, sizeof(mylog)); while ((ret = __logc_get(logc, lsn, &mylog, DB_PREV)) == 0) { /* * Determine what we look for based on version number. * Due to the contents of records changing between * versions we have to match based on criteria of that * particular version. */ LOGCOPY_32(env, &rectype, mylog.data); /* * In 4.4 and beyond we match checkpoint and commit. */ if (rep->version >= DB_REPVERSION_44 && (rectype == DB___txn_ckp || rectype == DB___txn_regop)) break; } return (ret); }