fop_rec.c   [plain text]


/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2001,2008 Oracle.  All rights reserved.
 *
 * $Id: fop_rec.c,v 12.27 2008/01/31 18:40:43 bostic Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/fop.h"
#include "dbinc/db_am.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"

static int __fop_rename_recover_int
    __P((ENV *, DBT *, DB_LSN *, db_recops, void *, int));

/*
 * The transactional guarantees Berkeley DB provides for file
 * system level operations (database physical file create, delete,
 * rename) are based on our understanding of current file system
 * semantics; a system that does not provide these semantics and
 * guarantees could be in danger.
 *
 * First, as in standard database changes, fsync and fdatasync must
 * work: when applied to the log file, the records written into the
 * log must be transferred to stable storage.
 *
 * Second, it must not be possible for the log file to be removed
 * without previous file system level operations being flushed to
 * stable storage.  Berkeley DB applications write log records
 * describing file system operations into the log, then perform the
 * file system operation, then commit the enclosing transaction
 * (which flushes the log file to stable storage).  Subsequently,
 * a database environment checkpoint may make it possible for the
 * application to remove the log file containing the record of the
 * file system operation.  DB's transactional guarantees for file
 * system operations require the log file removal not succeed until
 * all previous filesystem operations have been flushed to stable
 * storage.  In other words, the flush of the log file, or the
 * removal of the log file, must block until all previous
 * filesystem operations have been flushed to stable storage.  This
 * semantic is not, as far as we know, required by any existing
 * standards document, but we have never seen a filesystem where
 * it does not apply.
 */

/*
 * __fop_create_recover --
 *	Recovery function for create.
 *
 * PUBLIC: int __fop_create_recover
 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 */
int
__fop_create_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__fop_create_args *argp;
	DB_FH *fhp;
	DBMETA *meta;
	u_int8_t mbuf[DBMETASIZE];
	int ret;
	char *real_name;

	COMPQUIET(info, NULL);

	real_name = NULL;
	REC_PRINT(__fop_create_print);
	REC_NOOP_INTRO(__fop_create_read);
	meta = (DBMETA *)mbuf;

	if ((ret = __db_appname(env, (APPNAME)argp->appname,
	    (const char *)argp->name.data, 0, NULL, &real_name)) != 0)
		goto out;

	if (DB_UNDO(op)) {
		/*
		 * If the file was opened in mpool, we must mark it as
		 * dead via nameop which will also unlink the file.
		 */
		if (__os_open(env, real_name, 0, 0, 0, &fhp) == 0) {
			if (__fop_read_meta(env,
			    real_name, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 &&
			    __db_chk_meta(env, NULL, meta, 1) == 0) {
				if ((ret = __memp_nameop(env,
				    meta->uid, NULL, real_name, NULL, 0)) != 0)
					goto out;
			} else
				goto do_unlink;
			(void)__os_closehandle(env, fhp);
		} else
do_unlink:		(void)__os_unlink(env, real_name, 0);
	} else if (DB_REDO(op)) {
		if ((ret = __os_open(env, real_name, 0,
		    DB_OSO_CREATE, (int)argp->mode, &fhp)) == 0)
			(void)__os_closehandle(env, fhp);
		else
			goto out;
	}

	*lsnp = argp->prev_lsn;

out: if (real_name != NULL)
		__os_free(env, real_name);

	REC_NOOP_CLOSE;
}

/*
 * __fop_remove_recover --
 *	Recovery function for remove.
 *
 * PUBLIC: int __fop_remove_recover
 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 */
int
__fop_remove_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__fop_remove_args *argp;
	int ret;
	char *real_name;

	COMPQUIET(info, NULL);

	real_name = NULL;
	REC_PRINT(__fop_remove_print);
	REC_NOOP_INTRO(__fop_remove_read);

	if ((ret = __db_appname(env, (APPNAME)argp->appname,
	    (const char *)argp->name.data, 0, NULL, &real_name)) != 0)
		goto out;

	/* Its ok if the file is not there. */
	if (DB_REDO(op))
		(void)__memp_nameop(env,
		    (u_int8_t *)argp->fid.data, NULL, real_name, NULL, 0);

	*lsnp = argp->prev_lsn;
out:	if (real_name != NULL)
		__os_free(env, real_name);
	REC_NOOP_CLOSE;
}

/*
 * __fop_write_recover --
 *	Recovery function for writechunk.
 *
 * PUBLIC: int __fop_write_recover
 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 */
int
__fop_write_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__fop_write_args *argp;
	int ret;

	COMPQUIET(info, NULL);

	REC_PRINT(__fop_write_print);
	REC_NOOP_INTRO(__fop_write_read);

	ret = 0;
	if (DB_UNDO(op))
		DB_ASSERT(env, argp->flag != 0);
	else if (DB_REDO(op))
		ret = __fop_write(env,
		    argp->txnp, argp->name.data, (APPNAME)argp->appname,
		    NULL, argp->pgsize, argp->pageno, argp->offset,
		    argp->page.data, argp->page.size, argp->flag, 0);

	if (ret == 0)
		*lsnp = argp->prev_lsn;
	REC_NOOP_CLOSE;
}

/*
 * __fop_rename_recover --
 *	Recovery functions for rename.  There are two variants that
 * both use the same utility function.  Had we known about this on day
 * one, we would have simply added a parameter.  However, since we need
 * to retain old records for backward compatibility (online-upgrade)
 * wrapping the two seems like the right solution.
 *
 * PUBLIC: int __fop_rename_recover
 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 *
 * PUBLIC: int __fop_rename_noundo_recover
 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 */
int
__fop_rename_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	return (__fop_rename_recover_int(env, dbtp, lsnp, op, info, 1));
}

int
__fop_rename_noundo_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	return (__fop_rename_recover_int(env, dbtp, lsnp, op, info, 0));
}

static int
__fop_rename_recover_int(env, dbtp, lsnp, op, info, undo)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
	int undo;
{
	__fop_rename_args *argp;
	DB_FH *fhp;
	DBMETA *meta;
	u_int8_t *fileid, mbuf[DBMETASIZE];
	int ret;
	char *real_new, *real_old, *src;

	COMPQUIET(info, NULL);

	fhp = NULL;
	meta = (DBMETA *)&mbuf[0];
	ret = 0;
	real_new = real_old = NULL;

	REC_PRINT(__fop_rename_print);
	REC_NOOP_INTRO(__fop_rename_read);
	fileid = argp->fileid.data;

	if ((ret = __db_appname(env, (APPNAME)argp->appname,
	    (const char *)argp->newname.data, 0, NULL, &real_new)) != 0)
		goto out;
	if ((ret = __db_appname(env, (APPNAME)argp->appname,
	    (const char *)argp->oldname.data, 0, NULL, &real_old)) != 0)
		goto out;

	/*
	 * Verify that we are manipulating the correct file.  We should always
	 * be OK on an ABORT or an APPLY, but during recovery, we have to
	 * check.
	 */
	if (op != DB_TXN_ABORT && op != DB_TXN_APPLY) {
		src = DB_UNDO(op) ? real_new : real_old;
		/*
		 * Interpret any error as meaning that the file either doesn't
		 * exist, doesn't have a meta-data page, or is in some other
		 * way, shape or form, incorrect, so that we should not restore
		 * it.
		 */
		if (__os_open(env, src, 0, 0, 0, &fhp) != 0)
			goto done;
		if (__fop_read_meta(env,
		    src, mbuf, DBMETASIZE, fhp, 1, NULL) != 0)
			goto done;
		if (__db_chk_meta(env, NULL, meta, 1) != 0)
			goto done;
		if (memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0)
			goto done;
		(void)__os_closehandle(env, fhp);
		fhp = NULL;
		if (DB_REDO(op)) {
			/*
			 * Check to see if the target file exists.  If it
			 * does and it does not have the proper id then
			 * it is a later version.  We just remove the source
			 * file since the state of the world is beyond this
			 * point.
			 */
			if (__os_open(env, real_new, 0, 0, 0, &fhp) == 0 &&
			    __fop_read_meta(env, src, mbuf,
			    DBMETASIZE, fhp, 1, NULL) == 0 &&
			    __db_chk_meta(env, NULL, meta, 1) == 0 &&
			    memcmp(argp->fileid.data,
			    meta->uid, DB_FILE_ID_LEN) != 0) {
				(void)__memp_nameop(env,
				    fileid, NULL, real_old, NULL, 0);
				goto done;
			}
		}
	}

	if (undo && DB_UNDO(op))
		(void)__memp_nameop(env, fileid,
		    (const char *)argp->oldname.data, real_new, real_old, 0);
	if (DB_REDO(op))
		(void)__memp_nameop(env, fileid,
		    (const char *)argp->newname.data, real_old, real_new, 0);

done:	*lsnp = argp->prev_lsn;
out:	if (real_new != NULL)
		__os_free(env, real_new);
	if (real_old != NULL)
		__os_free(env, real_old);
	if (fhp != NULL)
		(void)__os_closehandle(env, fhp);

	REC_NOOP_CLOSE;
}

/*
 * __fop_file_remove_recover --
 *	Recovery function for file_remove.  On the REDO pass, we need to
 * make sure no one recreated the file while we weren't looking.  On an
 * undo pass must check if the file we are interested in is the one that
 * exists and then set the status of the child transaction depending on
 * what we find out.
 *
 * PUBLIC: int __fop_file_remove_recover
 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 */
int
__fop_file_remove_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__fop_file_remove_args *argp;
	DBMETA *meta;
	DB_FH *fhp;
	size_t len;
	u_int8_t mbuf[DBMETASIZE];
	u_int32_t cstat, ret_stat;
	int is_real, is_tmp, ret;
	char *real_name;

	fhp = NULL;
	meta = (DBMETA *)&mbuf[0];
	is_real = is_tmp = 0;
	real_name = NULL;
	REC_PRINT(__fop_file_remove_print);
	REC_NOOP_INTRO(__fop_file_remove_read);

	/*
	 * This record is only interesting on the backward, forward, and
	 * apply phases.
	 */
	if (op != DB_TXN_BACKWARD_ROLL &&
	    op != DB_TXN_FORWARD_ROLL && op != DB_TXN_APPLY)
		goto done;

	if ((ret = __db_appname(env,
	    (APPNAME)argp->appname, argp->name.data, 0, NULL, &real_name)) != 0)
		goto out;

	/* Verify that we are manipulating the correct file.  */
	len = 0;
	if (__os_open(env, real_name, 0, 0, 0, &fhp) != 0 ||
	    (ret = __fop_read_meta(env, real_name,
	    mbuf, DBMETASIZE, fhp, 1, &len)) != 0) {
		/*
		 * If len is non-zero, then the file exists and has something
		 * in it, but that something isn't a full meta-data page, so
		 * this is very bad.  Bail out!
		 */
		if (len != 0)
			goto out;

		/* File does not exist. */
		cstat = TXN_EXPECTED;
	} else {
		/*
		 * We can ignore errors here since we'll simply fail the
		 * checks below and assume this is the wrong file.
		 */
		(void)__db_chk_meta(env, NULL, meta, 1);
		is_real =
		    memcmp(argp->real_fid.data, meta->uid, DB_FILE_ID_LEN) == 0;
		is_tmp =
		    memcmp(argp->tmp_fid.data, meta->uid, DB_FILE_ID_LEN) == 0;

		if (!is_real && !is_tmp)
			/* File exists, but isn't what we were removing. */
			cstat = TXN_IGNORE;
		else
			/* File exists and is the one that we were removing. */
			cstat = TXN_COMMIT;
	}
	if (fhp != NULL) {
		(void)__os_closehandle(env, fhp);
		fhp = NULL;
	}

	if (DB_UNDO(op)) {
		/* On the backward pass, we leave a note for the child txn. */
		if ((ret = __db_txnlist_update(env,
		    info, argp->child, cstat, NULL, &ret_stat, 1)) != 0)
			goto out;
	} else if (DB_REDO(op)) {
		/*
		 * On the forward pass, check if someone recreated the
		 * file while we weren't looking.
		 */
		if (cstat == TXN_COMMIT)
			(void)__memp_nameop(env,
			    is_real ? argp->real_fid.data : argp->tmp_fid.data,
			    NULL, real_name, NULL, 0);
	}

done:	*lsnp = argp->prev_lsn;
	ret = 0;

out:	if (real_name != NULL)
		__os_free(env, real_name);
	if (fhp != NULL)
		(void)__os_closehandle(env, fhp);
	REC_NOOP_CLOSE;
}