/*- * See the file LICENSE for redistribution information. * * Copyright (c) 1996-2003 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint static const char revid[] = "$Id: log.c,v 1.2 2004/03/30 01:23:43 jtownsen Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include #include #include #include #endif #include "db_int.h" #include "dbinc/crypto.h" #include "dbinc/hmac.h" #include "dbinc/log.h" #include "dbinc/txn.h" static int __log_init __P((DB_ENV *, DB_LOG *)); static int __log_recover __P((DB_LOG *)); static size_t __log_region_size __P((DB_ENV *)); static int __log_stat __P((DB_ENV *, DB_LOG_STAT **, u_int32_t)); static int __log_zero __P((DB_ENV *, DB_LSN *, DB_LSN *)); /* * __log_open -- * Internal version of log_open: only called from DB_ENV->open. * * PUBLIC: int __log_open __P((DB_ENV *)); */ int __log_open(dbenv) DB_ENV *dbenv; { DB_LOG *dblp; LOG *lp; int ret; /* Create/initialize the DB_LOG structure. */ if ((ret = __os_calloc(dbenv, 1, sizeof(DB_LOG), &dblp)) != 0) return (ret); dblp->dbenv = dbenv; /* Join/create the log region. */ dblp->reginfo.type = REGION_TYPE_LOG; dblp->reginfo.id = INVALID_REGION_ID; dblp->reginfo.mode = dbenv->db_mode; dblp->reginfo.flags = REGION_JOIN_OK; if (F_ISSET(dbenv, DB_ENV_CREATE)) F_SET(&dblp->reginfo, REGION_CREATE_OK); if ((ret = __db_r_attach( dbenv, &dblp->reginfo, __log_region_size(dbenv))) != 0) goto err; /* If we created the region, initialize it. */ if (F_ISSET(&dblp->reginfo, REGION_CREATE)) if ((ret = __log_init(dbenv, dblp)) != 0) goto err; /* Set the local addresses. */ lp = dblp->reginfo.primary = R_ADDR(&dblp->reginfo, dblp->reginfo.rp->primary); /* * If the region is threaded, then we have to lock both the handles * and the region, and we need to allocate a mutex for that purpose. */ if (F_ISSET(dbenv, DB_ENV_THREAD) && (ret = __db_mutex_setup(dbenv, &dblp->reginfo, &dblp->mutexp, MUTEX_ALLOC | MUTEX_NO_RLOCK)) != 0) goto err; /* Initialize the rest of the structure. */ dblp->bufp = R_ADDR(&dblp->reginfo, lp->buffer_off); /* * Set the handle -- we may be about to run recovery, which allocates * log cursors. Log cursors require logging be already configured, * and the handle being set is what demonstrates that. * * If we created the region, run recovery. If that fails, make sure * we reset the log handle before cleaning up, otherwise we will try * and clean up again in the mainline DB_ENV initialization code. */ dbenv->lg_handle = dblp; if (F_ISSET(&dblp->reginfo, REGION_CREATE)) { /* * We first take the log file size from the environment, if * specified. If that wasn't set, default it. Regardless, * recovery may set it from the persistent information in a * log file header. */ if (lp->log_size == 0) lp->log_size = LG_MAX_DEFAULT; if ((ret = __log_recover(dblp)) != 0) goto err; /* * If the next log file size hasn't been set yet, default it * to the current log file size. */ if (lp->log_nsize == 0) lp->log_nsize = lp->log_size; /* * If we haven't written any log files, write the first one * so that checkpoint gets a valid ckp_lsn value. */ if (IS_INIT_LSN(lp->lsn) && (ret = __log_newfile(dblp, NULL)) != 0) goto err; /* Initialize replication's next-expected LSN value. */ lp->ready_lsn = lp->lsn; } else { /* * A process joining the region may have reset the log file * size, too. If so, it only affects the next log file we * create. */ if (dbenv->lg_size != 0) lp->log_nsize = dbenv->lg_size; } R_UNLOCK(dbenv, &dblp->reginfo); return (0); err: dbenv->lg_handle = NULL; if (dblp->reginfo.addr != NULL) { if (F_ISSET(&dblp->reginfo, REGION_CREATE)) ret = __db_panic(dbenv, ret); R_UNLOCK(dbenv, &dblp->reginfo); (void)__db_r_detach(dbenv, &dblp->reginfo, 0); } if (dblp->mutexp != NULL) __db_mutex_free(dbenv, &dblp->reginfo, dblp->mutexp); __os_free(dbenv, dblp); return (ret); } /* * __log_init -- * Initialize a log region in shared memory. */ static int __log_init(dbenv, dblp) DB_ENV *dbenv; DB_LOG *dblp; { DB_MUTEX *flush_mutexp; LOG *region; int ret; void *p; #ifdef HAVE_MUTEX_SYSTEM_RESOURCES u_int8_t *addr; #endif if ((ret = __db_shalloc(dblp->reginfo.addr, sizeof(*region), 0, &dblp->reginfo.primary)) != 0) goto mem_err; dblp->reginfo.rp->primary = R_OFFSET(&dblp->reginfo, dblp->reginfo.primary); region = dblp->reginfo.primary; memset(region, 0, sizeof(*region)); region->fid_max = 0; SH_TAILQ_INIT(®ion->fq); region->free_fid_stack = INVALID_ROFF; region->free_fids = region->free_fids_alloced = 0; /* Initialize LOG LSNs. */ INIT_LSN(region->lsn); INIT_LSN(region->t_lsn); /* * It's possible to be waiting for an LSN of [1][0], if a replication * client gets the first log record out of order. An LSN of [0][0] * signifies that we're not waiting. */ ZERO_LSN(region->waiting_lsn); /* * Log makes note of the fact that it ran into a checkpoint on * startup if it did so, as a recovery optimization. A zero * LSN signifies that it hasn't found one [yet]. */ ZERO_LSN(region->cached_ckp_lsn); #ifdef HAVE_MUTEX_SYSTEM_RESOURCES /* Allocate room for the log maintenance info and initialize it. */ if ((ret = __db_shalloc(dblp->reginfo.addr, sizeof(REGMAINT) + LG_MAINT_SIZE, 0, &addr)) != 0) goto mem_err; __db_maintinit(&dblp->reginfo, addr, LG_MAINT_SIZE); region->maint_off = R_OFFSET(&dblp->reginfo, addr); #endif if ((ret = __db_mutex_setup(dbenv, &dblp->reginfo, ®ion->fq_mutex, MUTEX_NO_RLOCK)) != 0) return (ret); /* * We must create a place for the flush mutex separately; mutexes have * to be aligned to MUTEX_ALIGN, and the only way to guarantee that is * to make sure they're at the beginning of a shalloc'ed chunk. */ if ((ret = __db_shalloc(dblp->reginfo.addr, sizeof(DB_MUTEX), MUTEX_ALIGN, &flush_mutexp)) != 0) goto mem_err; if ((ret = __db_mutex_setup(dbenv, &dblp->reginfo, flush_mutexp, MUTEX_NO_RLOCK)) != 0) return (ret); region->flush_mutex_off = R_OFFSET(&dblp->reginfo, flush_mutexp); /* Initialize the buffer. */ if ((ret = __db_shalloc(dblp->reginfo.addr, dbenv->lg_bsize, 0, &p)) != 0) { mem_err: __db_err(dbenv, "Unable to allocate memory for the log buffer"); return (ret); } region->buffer_size = dbenv->lg_bsize; region->buffer_off = R_OFFSET(&dblp->reginfo, p); region->log_size = region->log_nsize = dbenv->lg_size; /* Initialize the commit Queue. */ SH_TAILQ_INIT(®ion->free_commits); SH_TAILQ_INIT(®ion->commits); region->ncommit = 0; /* * Fill in the log's persistent header. Don't fill in the log file * sizes, as they may change at any time and so have to be filled in * as each log file is created. */ region->persist.magic = DB_LOGMAGIC; region->persist.version = DB_LOGVERSION; region->persist.mode = (u_int32_t)dbenv->db_mode; return (0); } /* * __log_recover -- * Recover a log. */ static int __log_recover(dblp) DB_LOG *dblp; { DBT dbt; DB_ENV *dbenv; DB_LOGC *logc; DB_LSN lsn; LOG *lp; u_int32_t cnt, rectype; int ret; logfile_validity status; logc = NULL; dbenv = dblp->dbenv; lp = dblp->reginfo.primary; /* * Find a log file. If none exist, we simply return, leaving * everything initialized to a new log. */ if ((ret = __log_find(dblp, 0, &cnt, &status)) != 0) return (ret); if (cnt == 0) return (0); /* * If the last file is an old version, readable or no, start a new * file. Don't bother finding the end of the last log file; * we assume that it's valid in its entirety, since the user * should have shut down cleanly or run recovery before upgrading. */ if (status == DB_LV_OLD_READABLE || status == DB_LV_OLD_UNREADABLE) { lp->lsn.file = lp->s_lsn.file = cnt + 1; lp->lsn.offset = lp->s_lsn.offset = 0; goto skipsearch; } DB_ASSERT(status == DB_LV_NORMAL); /* * We have the last useful log file and we've loaded any persistent * information. Set the end point of the log past the end of the last * file. Read the last file, looking for the last checkpoint and * the log's end. */ lp->lsn.file = cnt + 1; lp->lsn.offset = 0; lsn.file = cnt; lsn.offset = 0; /* * Allocate a cursor and set it to the first record. This shouldn't * fail, leave error messages on. */ if ((ret = __log_cursor(dbenv, &logc)) != 0) return (ret); F_SET(logc, DB_LOG_LOCKED); memset(&dbt, 0, sizeof(dbt)); if ((ret = __log_c_get(logc, &lsn, &dbt, DB_SET)) != 0) goto err; /* * Read to the end of the file. This may fail at some point, so * turn off error messages. */ F_SET(logc, DB_LOG_SILENT_ERR); while (__log_c_get(logc, &lsn, &dbt, DB_NEXT) == 0) { if (dbt.size < sizeof(u_int32_t)) continue; memcpy(&rectype, dbt.data, sizeof(u_int32_t)); if (rectype == DB___txn_ckp) /* * If we happen to run into a checkpoint, cache its * LSN so that the transaction system doesn't have * to walk this log file again looking for it. */ lp->cached_ckp_lsn = lsn; } F_CLR(logc, DB_LOG_SILENT_ERR); /* * We now know where the end of the log is. Set the first LSN that * we want to return to an application and the LSN of the last known * record on disk. */ lp->lsn = lsn; lp->s_lsn = lsn; lp->lsn.offset += logc->c_len; lp->s_lsn.offset += logc->c_len; /* Set up the current buffer information, too. */ lp->len = logc->c_len; lp->b_off = 0; lp->w_off = lp->lsn.offset; skipsearch: if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY)) __db_err(dbenv, "Finding last valid log LSN: file: %lu offset %lu", (u_long)lp->lsn.file, (u_long)lp->lsn.offset); err: if (logc != NULL) (void)__log_c_close(logc); return (ret); } /* * __log_find -- * Try to find a log file. If find_first is set, valp will contain * the number of the first readable log file, else it will contain the number * of the last log file (which may be too old to read). * * PUBLIC: int __log_find __P((DB_LOG *, int, u_int32_t *, logfile_validity *)); */ int __log_find(dblp, find_first, valp, statusp) DB_LOG *dblp; int find_first; u_int32_t *valp; logfile_validity *statusp; { DB_ENV *dbenv; logfile_validity logval_status, status; u_int32_t clv, logval; int cnt, fcnt, ret; const char *dir; char *c, **names, *p, *q, savech; dbenv = dblp->dbenv; logval_status = status = DB_LV_NONEXISTENT; /* Return a value of 0 as the log file number on failure. */ *valp = 0; /* Find the directory name. */ if ((ret = __log_name(dblp, 1, &p, NULL, 0)) != 0) return (ret); if ((q = __db_rpath(p)) == NULL) { COMPQUIET(savech, 0); dir = PATH_DOT; } else { savech = *q; *q = '\0'; dir = p; } /* Get the list of file names. */ ret = __os_dirlist(dbenv, dir, &names, &fcnt); /* * !!! * We overwrote a byte in the string with a nul. Restore the string * so that the diagnostic checks in the memory allocation code work * and any error messages display the right file name. */ if (q != NULL) *q = savech; if (ret != 0) { __db_err(dbenv, "%s: %s", dir, db_strerror(ret)); __os_free(dbenv, p); return (ret); } /* Search for a valid log file name. */ for (cnt = fcnt, clv = logval = 0; --cnt >= 0;) { if (strncmp(names[cnt], LFPREFIX, sizeof(LFPREFIX) - 1) != 0) continue; /* * Names of the form log\.[0-9]* are reserved for DB. Other * names sharing LFPREFIX, such as "log.db", are legal. */ for (c = names[cnt] + sizeof(LFPREFIX) - 1; *c != '\0'; c++) if (!isdigit((int)*c)) break; if (*c != '\0') continue; /* * Use atol, not atoi; if an "int" is 16-bits, the largest * log file name won't fit. */ clv = atol(names[cnt] + (sizeof(LFPREFIX) - 1)); /* * If searching for the first log file, we want to return the * oldest log file we can read, or, if no readable log files * exist, the newest log file we can't read (the crossover * point between the old and new versions of the log file). * * If we're searching for the last log file, we want to return * the newest log file, period. * * Readable log files should never preceede unreadable log * files, that would mean the admin seriously screwed up. */ if (find_first) { if (logval != 0 && status != DB_LV_OLD_UNREADABLE && clv > logval) continue; } else if (logval != 0 && clv < logval) continue; if ((ret = __log_valid(dblp, clv, 1, NULL, 0, &status)) != 0) { __db_err(dbenv, "Invalid log file: %s: %s", names[cnt], db_strerror(ret)); goto err; } switch (status) { case DB_LV_NONEXISTENT: /* __log_valid never returns DB_LV_NONEXISTENT. */ DB_ASSERT(0); break; case DB_LV_INCOMPLETE: /* * The last log file may not have been initialized -- * it's possible to create a log file but not write * anything to it. If performing recovery (that is, * if find_first isn't set), ignore the file, it's * not interesting. If we're searching for the first * log record, return the file (assuming we don't find * something better), as the "real" first log record * is likely to be in the log buffer, and we want to * set the file LSN for our return. */ if (find_first) goto found; break; case DB_LV_OLD_UNREADABLE: /* * If we're searching for the first log file, then we * only want this file if we don't yet have a file or * already have an unreadable file and this one is * newer than that one. If we're searching for the * last log file, we always want this file because we * wouldn't be here if it wasn't newer than our current * choice. */ if (!find_first || logval == 0 || (status == DB_LV_OLD_UNREADABLE && clv > logval)) goto found; break; case DB_LV_NORMAL: case DB_LV_OLD_READABLE: found: logval = clv; logval_status = status; break; } } *valp = logval; err: __os_dirfree(dbenv, names, fcnt); __os_free(dbenv, p); *statusp = logval_status; return (ret); } /* * log_valid -- * Validate a log file. Returns an error code in the event of * a fatal flaw in a the specified log file; returns success with * a code indicating the currentness and completeness of the specified * log file if it is not unexpectedly flawed (that is, if it's perfectly * normal, if it's zero-length, or if it's an old version). * * PUBLIC: int __log_valid __P((DB_LOG *, u_int32_t, int, * PUBLIC: DB_FH **, int, logfile_validity *)); */ int __log_valid(dblp, number, set_persist, fhpp, flags, statusp) DB_LOG *dblp; u_int32_t number; int set_persist; DB_FH **fhpp; int flags; logfile_validity *statusp; { DB_CIPHER *db_cipher; DB_ENV *dbenv; DB_FH *fhp; HDR *hdr; LOG *region; LOGP *persist; logfile_validity status; size_t hdrsize, nw, recsize; int is_hmac, ret; u_int8_t *tmp; char *fname; dbenv = dblp->dbenv; db_cipher = dbenv->crypto_handle; fhp = NULL; persist = NULL; status = DB_LV_NORMAL; tmp = NULL; /* Return the file handle to our caller, on request */ if (fhpp != NULL) *fhpp = NULL; if (flags == 0) flags = DB_OSO_RDONLY | DB_OSO_SEQ; /* Try to open the log file. */ if ((ret = __log_name(dblp, number, &fname, &fhp, flags)) != 0) { __os_free(dbenv, fname); return (ret); } hdrsize = HDR_NORMAL_SZ; is_hmac = 0; recsize = sizeof(LOGP); if (CRYPTO_ON(dbenv)) { hdrsize = HDR_CRYPTO_SZ; recsize = sizeof(LOGP); recsize += db_cipher->adj_size(recsize); is_hmac = 1; } if ((ret = __os_calloc(dbenv, 1, recsize + hdrsize, &tmp)) != 0) goto err; hdr = (HDR *)tmp; persist = (LOGP *)(tmp + hdrsize); /* Try to read the header. */ if ((ret = __os_read(dbenv, fhp, tmp, recsize + hdrsize, &nw)) != 0 || nw != recsize + hdrsize) { if (ret == 0) status = DB_LV_INCOMPLETE; else /* * The error was a fatal read error, not just an * incompletely initialized log file. */ __db_err(dbenv, "Ignoring log file: %s: %s", fname, db_strerror(ret)); goto err; } /* * Now we have to validate the persistent record. We have * several scenarios we have to deal with: * * 1. User has crypto turned on: * - They're reading an old, unencrypted log file * . We will fail the record size match check below. * - They're reading a current, unencrypted log file * . We will fail the record size match check below. * - They're reading an old, encrypted log file [NOT YET] * . After decryption we'll fail the version check. [NOT YET] * - They're reading a current, encrypted log file * . We should proceed as usual. * 2. User has crypto turned off: * - They're reading an old, unencrypted log file * . We will fail the version check. * - They're reading a current, unencrypted log file * . We should proceed as usual. * - They're reading an old, encrypted log file [NOT YET] * . We'll fail the magic number check (it is encrypted). * - They're reading a current, encrypted log file * . We'll fail the magic number check (it is encrypted). */ if (CRYPTO_ON(dbenv)) { /* * If we are trying to decrypt an unencrypted log * we can only detect that by having an unreasonable * data length for our persistent data. */ if ((hdr->len - hdrsize) != sizeof(LOGP)) { __db_err(dbenv, "log record size mismatch"); goto err; } /* Check the checksum and decrypt. */ if ((ret = __db_check_chksum(dbenv, db_cipher, &hdr->chksum[0], (u_int8_t *)persist, hdr->len - hdrsize, is_hmac)) != 0) { __db_err(dbenv, "log record checksum mismatch"); goto err; } if ((ret = db_cipher->decrypt(dbenv, db_cipher->data, &hdr->iv[0], (u_int8_t *)persist, hdr->len - hdrsize)) != 0) goto err; } /* Validate the header. */ if (persist->magic != DB_LOGMAGIC) { __db_err(dbenv, "Ignoring log file: %s: magic number %lx, not %lx", fname, (u_long)persist->magic, (u_long)DB_LOGMAGIC); ret = EINVAL; goto err; } /* * Set our status code to indicate whether the log file * belongs to an unreadable or readable old version; leave it * alone if and only if the log file version is the current one. */ if (persist->version > DB_LOGVERSION) { /* This is a fatal error--the log file is newer than DB. */ __db_err(dbenv, "Ignoring log file: %s: unsupported log version %lu", fname, (u_long)persist->version); ret = EINVAL; goto err; } else if (persist->version < DB_LOGOLDVER) { status = DB_LV_OLD_UNREADABLE; /* This is a non-fatal error, but give some feedback. */ __db_err(dbenv, "Ignoring log file: %s: unreadable log version %lu", fname, (u_long)persist->version); /* * We don't want to set persistent info based on an * unreadable region, so jump to "err". */ goto err; } else if (persist->version < DB_LOGVERSION) status = DB_LV_OLD_READABLE; /* * Only if we have a current log do we verify the checksum. * We could not check the checksum before checking the magic * and version because old log hdrs have the length and checksum * in a different location. */ if (!CRYPTO_ON(dbenv) && ((ret = __db_check_chksum(dbenv, db_cipher, &hdr->chksum[0], (u_int8_t *)persist, hdr->len - hdrsize, is_hmac)) != 0)) { __db_err(dbenv, "log record checksum mismatch"); goto err; } /* * If the log is readable so far and we're doing system initialization, * set the region's persistent information based on the headers. * * Override the current log file size. * * XXX * Always use the persistent header's mode, regardless of what was set * in the current environment. We've always done it this way, but it's * probably a bug -- I can't think of a way not-changing the mode would * be a problem, though. */ if (set_persist) { region = dblp->reginfo.primary; region->log_size = persist->log_size; region->persist.mode = persist->mode; } err: if (fname != NULL) __os_free(dbenv, fname); if (ret == 0 && fhpp != NULL) *fhpp = fhp; else /* Must close on error or if we only used it locally. */ (void)__os_closehandle(dbenv, fhp); if (tmp != NULL) __os_free(dbenv, tmp); *statusp = status; return (ret); } /* * __log_dbenv_refresh -- * Clean up after the log system on a close or failed open. Called only * from __dbenv_refresh. (Formerly called __log_close.) * * PUBLIC: int __log_dbenv_refresh __P((DB_ENV *)); */ int __log_dbenv_refresh(dbenv) DB_ENV *dbenv; { DB_LOG *dblp; int ret, t_ret; dblp = dbenv->lg_handle; /* We may have opened files as part of XA; if so, close them. */ F_SET(dblp, DBLOG_RECOVER); ret = __dbreg_close_files(dbenv); /* Discard the per-thread lock. */ if (dblp->mutexp != NULL) __db_mutex_free(dbenv, &dblp->reginfo, dblp->mutexp); /* Detach from the region. */ if ((t_ret = __db_r_detach(dbenv, &dblp->reginfo, 0)) != 0 && ret == 0) ret = t_ret; /* Close open files, release allocated memory. */ if (dblp->lfhp != NULL) { if ((t_ret = __os_closehandle(dbenv, dblp->lfhp)) != 0 && ret == 0) ret = t_ret; dblp->lfhp = NULL; } if (dblp->dbentry != NULL) __os_free(dbenv, dblp->dbentry); __os_free(dbenv, dblp); dbenv->lg_handle = NULL; return (ret); } /* * __log_stat_pp -- * DB_ENV->log_stat pre/post processing. * * PUBLIC: int __log_stat_pp __P((DB_ENV *, DB_LOG_STAT **, u_int32_t)); */ int __log_stat_pp(dbenv, statp, flags) DB_ENV *dbenv; DB_LOG_STAT **statp; u_int32_t flags; { int rep_check, ret; PANIC_CHECK(dbenv); ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, "DB_ENV->log_stat", DB_INIT_LOG); if ((ret = __db_fchk(dbenv, "DB_ENV->log_stat", flags, DB_STAT_CLEAR)) != 0) return (ret); rep_check = IS_ENV_REPLICATED(dbenv) ? 1 : 0; if (rep_check) __env_rep_enter(dbenv); ret = __log_stat(dbenv, statp, flags); if (rep_check) __env_rep_exit(dbenv); return (ret); } /* * __log_stat -- * Return log statistics. */ static int __log_stat(dbenv, statp, flags) DB_ENV *dbenv; DB_LOG_STAT **statp; u_int32_t flags; { DB_LOG *dblp; DB_LOG_STAT *stats; LOG *region; int ret; *statp = NULL; dblp = dbenv->lg_handle; region = dblp->reginfo.primary; if ((ret = __os_umalloc(dbenv, sizeof(DB_LOG_STAT), &stats)) != 0) return (ret); /* Copy out the global statistics. */ R_LOCK(dbenv, &dblp->reginfo); *stats = region->stat; if (LF_ISSET(DB_STAT_CLEAR)) memset(®ion->stat, 0, sizeof(region->stat)); stats->st_magic = region->persist.magic; stats->st_version = region->persist.version; stats->st_mode = region->persist.mode; stats->st_lg_bsize = region->buffer_size; stats->st_lg_size = region->log_nsize; stats->st_region_wait = dblp->reginfo.rp->mutex.mutex_set_wait; stats->st_region_nowait = dblp->reginfo.rp->mutex.mutex_set_nowait; if (LF_ISSET(DB_STAT_CLEAR)) { dblp->reginfo.rp->mutex.mutex_set_wait = 0; dblp->reginfo.rp->mutex.mutex_set_nowait = 0; } stats->st_regsize = dblp->reginfo.rp->size; stats->st_cur_file = region->lsn.file; stats->st_cur_offset = region->lsn.offset; stats->st_disk_file = region->s_lsn.file; stats->st_disk_offset = region->s_lsn.offset; R_UNLOCK(dbenv, &dblp->reginfo); *statp = stats; return (0); } /* * __log_get_cached_ckp_lsn -- * Retrieve any last checkpoint LSN that we may have found on startup. * * PUBLIC: void __log_get_cached_ckp_lsn __P((DB_ENV *, DB_LSN *)); */ void __log_get_cached_ckp_lsn(dbenv, ckp_lsnp) DB_ENV *dbenv; DB_LSN *ckp_lsnp; { DB_LOG *dblp; LOG *lp; dblp = (DB_LOG *)dbenv->lg_handle; lp = (LOG *)dblp->reginfo.primary; R_LOCK(dbenv, &dblp->reginfo); *ckp_lsnp = lp->cached_ckp_lsn; R_UNLOCK(dbenv, &dblp->reginfo); } /* * __log_region_size -- * Return the amount of space needed for the log region. * Make the region large enough to hold txn_max transaction * detail structures plus some space to hold thread handles * and the beginning of the shalloc region and anything we * need for mutex system resource recording. */ static size_t __log_region_size(dbenv) DB_ENV *dbenv; { size_t s; s = dbenv->lg_regionmax + dbenv->lg_bsize; #ifdef HAVE_MUTEX_SYSTEM_RESOURCES if (F_ISSET(dbenv, DB_ENV_THREAD)) s += sizeof(REGMAINT) + LG_MAINT_SIZE; #endif return (s); } /* * __log_region_destroy * Destroy any region maintenance info. * * PUBLIC: void __log_region_destroy __P((DB_ENV *, REGINFO *)); */ void __log_region_destroy(dbenv, infop) DB_ENV *dbenv; REGINFO *infop; { __db_shlocks_destroy(infop, (REGMAINT *)R_ADDR(infop, ((LOG *)R_ADDR(infop, infop->rp->primary))->maint_off)); COMPQUIET(dbenv, NULL); COMPQUIET(infop, NULL); } /* * __log_vtruncate * This is a virtual truncate. We set up the log indicators to * make everyone believe that the given record is the last one in the * log. Returns with the next valid LSN (i.e., the LSN of the next * record to be written). This is used in replication to discard records * in the log file that do not agree with the master. * * PUBLIC: int __log_vtruncate __P((DB_ENV *, DB_LSN *, DB_LSN *, DB_LSN *)); */ int __log_vtruncate(dbenv, lsn, ckplsn, trunclsn) DB_ENV *dbenv; DB_LSN *lsn, *ckplsn, *trunclsn; { DBT log_dbt; DB_LOG *dblp; DB_LOGC *logc; DB_LSN end_lsn; DB_MUTEX *flush_mutexp; LOG *lp; u_int32_t bytes, c_len; int ret, t_ret; /* Need to find out the length of this soon-to-be-last record. */ if ((ret = __log_cursor(dbenv, &logc)) != 0) return (ret); memset(&log_dbt, 0, sizeof(log_dbt)); ret = __log_c_get(logc, lsn, &log_dbt, DB_SET); c_len = logc->c_len; if ((t_ret = __log_c_close(logc)) != 0 && ret == 0) ret = t_ret; if (ret != 0) return (ret); /* Now do the truncate. */ dblp = (DB_LOG *)dbenv->lg_handle; lp = (LOG *)dblp->reginfo.primary; R_LOCK(dbenv, &dblp->reginfo); /* * Flush the log so we can simply initialize the in-memory buffer * after the truncate. */ if ((ret = __log_flush_int(dblp, NULL, 0)) != 0) goto err; end_lsn = lp->lsn; lp->lsn = *lsn; lp->len = c_len; lp->lsn.offset += lp->len; /* * I am going to assume that the number of bytes written since * the last checkpoint doesn't exceed a 32-bit number. */ DB_ASSERT(lp->lsn.file >= ckplsn->file); bytes = 0; if (ckplsn->file != lp->lsn.file) { bytes = lp->log_size - ckplsn->offset; if (lp->lsn.file > ckplsn->file + 1) bytes += lp->log_size * (lp->lsn.file - ckplsn->file - 1); bytes += lp->lsn.offset; } else bytes = lp->lsn.offset - ckplsn->offset; lp->stat.st_wc_mbytes += bytes / MEGABYTE; lp->stat.st_wc_bytes += bytes % MEGABYTE; /* * If the saved lsn is greater than our new end of log, reset it * to our current end of log. */ flush_mutexp = R_ADDR(&dblp->reginfo, lp->flush_mutex_off); MUTEX_LOCK(dbenv, flush_mutexp); if (log_compare(&lp->s_lsn, lsn) > 0) lp->s_lsn = lp->lsn; MUTEX_UNLOCK(dbenv, flush_mutexp); /* Initialize the in-region buffer to a pristine state. */ ZERO_LSN(lp->f_lsn); lp->w_off = lp->lsn.offset; if (trunclsn != NULL) *trunclsn = lp->lsn; /* Truncate the log to the new point. */ if ((ret = __log_zero(dbenv, &lp->lsn, &end_lsn)) != 0) goto err; err: R_UNLOCK(dbenv, &dblp->reginfo); return (ret); } /* * __log_is_outdated -- * Used by the replication system to identify if a client's logs * are too old. The log represented by dbenv is compared to the file * number passed in fnum. If the log file fnum does not exist and is * lower-numbered than the current logs, the we return *outdatedp non * zero, else we return it 0. * * PUBLIC: int __log_is_outdated __P((DB_ENV *, u_int32_t, int *)); */ int __log_is_outdated(dbenv, fnum, outdatedp) DB_ENV *dbenv; u_int32_t fnum; int *outdatedp; { DB_LOG *dblp; LOG *lp; char *name; int ret; u_int32_t cfile; dblp = dbenv->lg_handle; *outdatedp = 0; if ((ret = __log_name(dblp, fnum, &name, NULL, 0)) != 0) return (ret); /* If the file exists, we're just fine. */ if (__os_exists(name, NULL) == 0) goto out; /* * It didn't exist, decide if the file number is too big or * too little. If it's too little, then we need to indicate * that the LSN is outdated. */ R_LOCK(dbenv, &dblp->reginfo); lp = (LOG *)dblp->reginfo.primary; cfile = lp->lsn.file; R_UNLOCK(dbenv, &dblp->reginfo); if (cfile > fnum) *outdatedp = 1; out: __os_free(dbenv, name); return (ret); } /* * __log_zero -- * Zero out the tail of a log after a truncate. */ static int __log_zero(dbenv, from_lsn, to_lsn) DB_ENV *dbenv; DB_LSN *from_lsn, *to_lsn; { DB_FH *fhp; DB_LOG *dblp; size_t nbytes, len, nw; u_int8_t buf[4096]; u_int32_t mbytes, bytes; int fn, ret; char *fname; dblp = dbenv->lg_handle; DB_ASSERT(log_compare(from_lsn, to_lsn) <= 0); if (log_compare(from_lsn, to_lsn) > 0) { __db_err(dbenv, "Warning: truncating to point beyond end of log"); return (0); } /* Close any open file handles so unlinks don't fail. */ if (dblp->lfhp != NULL) { (void)__os_closehandle(dbenv, dblp->lfhp); dblp->lfhp = NULL; } /* Throw away any extra log files that we have around. */ for (fn = from_lsn->file + 1;; fn++) { if (__log_name(dblp, fn, &fname, &fhp, DB_OSO_RDONLY) != 0) { __os_free(dbenv, fname); break; } (void)__os_closehandle(dbenv, fhp); ret = __os_unlink(dbenv, fname); __os_free(dbenv, fname); if (ret != 0) return (ret); } /* We removed some log files; have to 0 to end of file. */ if ((ret = __log_name(dblp, from_lsn->file, &fname, &dblp->lfhp, 0)) != 0) return (ret); __os_free(dbenv, fname); if ((ret = __os_ioinfo(dbenv, NULL, dblp->lfhp, &mbytes, &bytes, NULL)) != 0) goto err; DB_ASSERT((mbytes * MEGABYTE + bytes) >= from_lsn->offset); len = (mbytes * MEGABYTE + bytes) - from_lsn->offset; memset(buf, 0, sizeof(buf)); /* Initialize the write position. */ if ((ret = __os_seek(dbenv, dblp->lfhp, 0, 0, from_lsn->offset, 0, DB_OS_SEEK_SET)) != 0) goto err; while (len > 0) { nbytes = len > sizeof(buf) ? sizeof(buf) : len; if ((ret = __os_write(dbenv, dblp->lfhp, buf, nbytes, &nw)) != 0) goto err; len -= nbytes; } err: (void)__os_closehandle(dbenv, dblp->lfhp); dblp->lfhp = NULL; return (ret); } /* * __log_autoremove -- * Delete any non-essential log files. * * PUBLIC: void __log_autoremove __P((DB_ENV *)); */ void __log_autoremove(dbenv) DB_ENV *dbenv; { char **begin, **list; if (__log_archive(dbenv, &list, DB_ARCH_ABS) != 0) return; if (list != NULL) { for (begin = list; *list != NULL; ++list) (void)__os_unlink(dbenv, *list); __os_ufree(dbenv, begin); } return; }