/* os_map.c */


/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
 *
 * $Id: os_map.c,v 12.25 2008/05/07 12:27:35 bschmeck Exp $
 */

#include "db_config.h"

#include "db_int.h"

/*
 * Forward declarations for the file-local helpers defined below:
 * __os_map creates a file mapping and maps a view of it, and
 * __os_unique_name builds the name used for a named mapping object.
 */
static int __os_map
  __P((ENV *, char *, REGINFO *, DB_FH *, size_t, int, int, int, void **));
static int __os_unique_name __P((_TCHAR *, HANDLE, _TCHAR *, size_t));

/*
 * __os_attach --
 *	Create or join the shared memory region described by infop/rp.
 *
 *	The region's backing file (infop->name) is opened -- and created when
 *	REGION_CREATE_OK is set on infop -- then mapped with rp->size bytes;
 *	the mapped address is stored in infop->addr.
 *
 *	Returns 0 on success, otherwise the error from the failing step.
 */
int
__os_attach(env, infop, rp)
	ENV *env;
	REGINFO *infop;
	REGION *rp;
{
	DB_FH *region_fh;
	int status, system_mem;

	/*
	 * Files opened by multiple processes do not share data correctly on
	 * Windows/9X, so DB_PRIVATE is mandatory on that platform.
	 */
	if (!F_ISSET(env, ENV_PRIVATE) && __os_is_winnt() == 0) {
		__db_err(env,
		    EINVAL, "Windows 9X systems must specify DB_PRIVATE");
		return (EINVAL);
	}

	/*
	 * Open (or create) the backing file.  Our caller has already taken
	 * care of ordering threads/processes that try to create the region
	 * simultaneously, so nothing needs to be done about that here.
	 */
	status = __os_open(env, infop->name, 0, DB_OSO_REGION |
	    (F_ISSET(infop, REGION_CREATE_OK) ? DB_OSO_CREATE : 0),
	    env->db_mode, &region_fh);
	if (status != 0) {
		__db_err(env, status, "%s", infop->name);
		return (status);
	}

	/*
	 * Map the file in.  For an in-system-memory region, set a segment ID
	 * (which is never used again) so that the calling code writes out the
	 * REGENV_REF structure to the primary environment file.
	 */
	system_mem = F_ISSET(env, ENV_SYSTEM_MEM);
	status = __os_map(env, infop->name,
	    infop, region_fh, rp->size, 1, system_mem, 0, &infop->addr);
	if (status == 0 && system_mem)
		rp->segid = 1;

	/* The file handle is closed whether or not the mapping succeeded. */
	(void)__os_closehandle(env, region_fh);

	return (status);
}

/*
 * __os_detach --
 *	Detach from a shared memory region.
 *
 *	Closes the mapping handle kept in infop->wnt_handle (if any), unmaps
 *	the view at infop->addr and, when destroy is non-zero and the
 *	environment is not in system memory, unlinks infop->name.
 *
 *	Returns 0 on success.  If both the unmap and the unlink fail, the
 *	unmap error is the one returned.
 */
int
__os_detach(env, infop, destroy)
	ENV *env;
	REGINFO *infop;
	int destroy;
{
	int ret, t_ret;

	/* Release the handle that __os_map left open for this region. */
	if (infop->wnt_handle != NULL) {
		(void)CloseHandle(infop->wnt_handle);
		infop->wnt_handle = NULL;
	}

	ret = !UnmapViewOfFile(infop->addr) ? __os_get_syserr() : 0;
	if (ret != 0) {
		/* Report the system error, then hand back its POSIX form. */
		__db_syserr(env, ret, "UnmapViewOfFile");
		ret = __os_posix_err(ret);
	}

	/* The unlink is attempted even when the unmap above failed. */
	if (!F_ISSET(env, ENV_SYSTEM_MEM) && destroy &&
	    (t_ret = __os_unlink(env, infop->name, 1)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}

/*
 * __os_mapfile --
 *	Map in a shared memory file.
 *
 *	Maps len bytes of the file open on fhp (read-only when is_rdonly is
 *	non-zero) and stores the mapped address in *addr.  path is used for
 *	the verbose "fileops" message and passed through to __os_map.
 */
int
__os_mapfile(env, path, fhp, len, is_rdonly, addr)
	ENV *env;
	char *path;
	DB_FH *fhp;
	size_t len;
	int is_rdonly;
	void **addr;
{
#ifdef DB_WINCE
	/*
	 * Windows CE has special requirements for file mapping to work:
	 *  - the input handle needs to be opened using CreateFileForMapping;
	 *  - concurrent access via a non mapped file is not supported.
	 * So memory mapping of files is disabled on Windows CE.  It is
	 * currently only used as an optimization in mpool for small read
	 * only databases.
	 */
	return (EFAULT);
#else
	/* Trace the operation when file-operation verbosity is enabled. */
	if (env != NULL && env->dbenv != NULL && FLD_ISSET(
	    env->dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
		__db_msg(env, "fileops: mmap %s", path);

	/* Not a region and not in system memory: a plain file mapping. */
	return (__os_map(env, path, NULL, fhp, len, 0, 0, is_rdonly, addr));
#endif
}

/*
 * __os_unmapfile --
 *	Unmap the shared memory file.
 *
 *	Unmaps the view that starts at addr.  The len argument is not used by
 *	this implementation.  Returns 0 on success, otherwise the POSIX
 *	translation of the system error.
 */
int
__os_unmapfile(env, addr, len)
	ENV *env;
	void *addr;
	size_t len;
{
	/* Trace the operation when file-operation verbosity is enabled. */
	if (env != NULL && env->dbenv != NULL && FLD_ISSET(
	    env->dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
		__db_msg(env, "fileops: munmap");

	if (UnmapViewOfFile(addr))
		return (0);

	return (__os_posix_err(__os_get_syserr()));
}

/*
 * __os_unique_name --
 *	Create a unique identifying name from a pathname (may be absolute or
 *	relative) and/or a file descriptor.
 *
 *	The name returned must be unique (different files map to different
 *	names), and repeatable (same files, map to same names).  It's not
 *	so easy to do by name.  Should handle not only:
 *
 *		foo.bar == ./foo.bar == c:/whatever_path/foo.bar
 *
 *	but also understand that:
 *
 *		foo.bar == Foo.Bar	(FAT file system)
 *		foo.bar != Foo.Bar	(NTFS)
 *
 *	The best solution is to use the file index, found in the file
 *	information structure (similar to UNIX inode #).
 *
 *	When a file is deleted, its file index may be reused,
 *	but if the unique name has not gone from its namespace,
 *	we may get a conflict.  So to ensure some tie in to the
 *	original pathname, we also use the creation time and the
 *	file basename.  This is not a perfect system, but it
 *	should work for all but anomalous test cases.
 *
 *	orig_path	pathname of the file; only its basename is used.
 *	hfile		open handle on the file, queried for its identity.
 *	result_path	buffer that receives the generated name.
 *	result_path_len	capacity of result_path, in _TCHAR elements.
 *
 *	Returns 0 on success, the POSIX translation of the system error if
 *	the file information cannot be read, or EINVAL if the generated name
 *	(including its terminator) does not fit in result_path.
 */
static int
__os_unique_name(orig_path, hfile, result_path, result_path_len)
	_TCHAR *orig_path, *result_path;
	HANDLE hfile;
	size_t result_path_len;
{
	BY_HANDLE_FILE_INFORMATION fileinfo;
	_TCHAR *basename, *p;
	int nchars;

	/*
	 * In Windows, pathname components are delimited by '/' or '\', and
	 * if neither is present, we need to strip off leading drive letter
	 * (e.g. c:foo.txt).
	 */
	basename = _tcsrchr(orig_path, '/');
	p = _tcsrchr(orig_path, '\\');
	if (basename == NULL || (p != NULL && p > basename))
		basename = p;
	if (basename == NULL)
		basename = _tcsrchr(orig_path, ':');

	/* Step past the delimiter that was found, if there was one. */
	if (basename == NULL)
		basename = orig_path;
	else
		basename++;

	if (!GetFileInformationByHandle(hfile, &fileinfo))
		return (__os_posix_err(__os_get_syserr()));

	/*
	 * Both halves of the creation time go into the name.  Every process
	 * joining the same region must generate the identical name, so all
	 * of them have to be built with the same version of this function.
	 */
	nchars = _sntprintf(result_path, result_path_len,
	    _T("__db_shmem.%8.8lx.%8.8lx.%8.8lx.%8.8lx.%8.8lx.%s"),
	    fileinfo.dwVolumeSerialNumber,
	    fileinfo.nFileIndexHigh,
	    fileinfo.nFileIndexLow,
	    fileinfo.ftCreationTime.dwHighDateTime,
	    fileinfo.ftCreationTime.dwLowDateTime,
	    basename);

	/*
	 * _sntprintf does not terminate the buffer when the output fills or
	 * overflows it, and a truncated name would no longer be unique, so
	 * treat both cases as a failure rather than return a bad name.
	 */
	if (nchars < 0 || (size_t)nchars >= result_path_len)
		return (EINVAL);

	return (0);
}

/*
 * __os_map --
 *	The mmap(2) function for Windows.
 */
static int
__os_map(env, path, infop, fhp, len, is_region, is_system, is_rdonly, addr)
	ENV *env;
	REGINFO *infop;
	char *path;
	DB_FH *fhp;
	int is_region, is_system, is_rdonly;
	size_t len;
	void **addr;
{
	HANDLE hMemory;
	int ret, use_pagefile;
	_TCHAR *tpath, shmem_name[DB_MAXPATHLEN];
	void *pMemory;
	unsigned __int64 len64;

	ret = 0;
	if (infop != NULL)
		infop->wnt_handle = NULL;

	/*
	 * On 64 bit systems, len is already a 64 bit value.
	 * On 32 bit systems len is a 32 bit value.
	 * Always convert to a 64 bit value, so that the high order
	 * DWORD can be simply extracted on 64 bit platforms.
	 */
	len64 = len;

	use_pagefile = is_region && is_system;

	/*
	 * If creating a region in system space, get a matching name in the
	 * paging file namespace.
	 */
	if (use_pagefile) {
#ifdef DB_WINCE
		__db_errx(env, "Unable to memory map regions using system "
		    "memory on WinCE.");
		return (EFAULT);
#endif
		TO_TSTRING(env, path, tpath, ret);
		if (ret != 0)
			return (ret);
		ret = __os_unique_name(tpath, fhp->handle,
		    shmem_name, sizeof(shmem_name));
		FREE_STRING(env, tpath);
		if (ret != 0)
			return (ret);
	}

	/*
	 * XXX
	 * DB: We have not implemented copy-on-write here.
	 *
	 * If this is an region in system memory, we try to open it using the
	 * OpenFileMapping() first, and only call CreateFileMapping() if we're
	 * really creating the section.  There are two reasons:
	 *
	 * 1) We only create the mapping if we have newly created the region.
	 *    This avoids a long-running problem caused by Windows reference
	 *    counting, where regions that are closed by all processes are
	 *    deleted.  It turns out that just checking for a zeroed region
	 *    is not good enough. See [#4882] and [#7127] for the details.
	 *
	 * 2) CreateFileMapping seems to mess up making the commit charge to
	 *    the process. It thinks, incorrectly, that when we want to join a
	 *    previously existing section, that it should make a commit charge
	 *    for the whole section.  In fact, there is no new committed memory
	 *    whatever.  The call can fail if there is insufficient memory free
	 *    to handle the erroneous commit charge.  So, we find that the
	 *    bogus commit is not made if we call OpenFileMapping.
	 */
	hMemory = NULL;
	if (use_pagefile) {
#ifndef DB_WINCE
		hMemory = OpenFileMapping(
		    is_rdonly ? FILE_MAP_READ : FILE_MAP_ALL_ACCESS,
		    0, shmem_name);

		if (hMemory == NULL && F_ISSET(infop, REGION_CREATE_OK))
			hMemory = CreateFileMapping((HANDLE)-1, 0,
			    is_rdonly ? PAGE_READONLY : PAGE_READWRITE,
			    (DWORD)(len64 >> 32), (DWORD)len64, shmem_name);
#endif
	} else {
		hMemory = CreateFileMapping(fhp->handle, 0,
		    is_rdonly ? PAGE_READONLY : PAGE_READWRITE,
		    (DWORD)(len64 >> 32), (DWORD)len64, NULL);
#ifdef DB_WINCE
		/*
		 * WinCE automatically closes the handle passed in.
		 * Ensure DB does not attempt to close the handle again.
		 */
		fhp->handle = INVALID_HANDLE_VALUE;
		F_CLR(fhp, DB_FH_OPENED);
#endif
	}

	if (hMemory == NULL) {
		ret = __os_get_syserr();
		__db_syserr(env, ret, "OpenFileMapping");
		return (__env_panic(env, __os_posix_err(ret)));
	}

	pMemory = MapViewOfFile(hMemory,
	    (is_rdonly ? FILE_MAP_READ : FILE_MAP_ALL_ACCESS), 0, 0, len);
	if (pMemory == NULL) {
		ret = __os_get_syserr();
		__db_syserr(env, ret, "MapViewOfFile");
		return (__env_panic(env, __os_posix_err(ret)));
	}

	/*
	 * XXX
	 * It turns out that the kernel object underlying the named section
	 * is reference counted, but that the call to MapViewOfFile() above
	 * does NOT increment the reference count! So, if we close the handle
	 * here, the kernel deletes the object from the kernel namespace.
	 * When a second process comes along to join the region, the kernel
	 * happily creates a new object with the same name, but completely
	 * different identity. The two processes then have distinct isolated
	 * mapped sections, not at all what was wanted. Not closing the handle
	 * here fixes this problem.  We carry the handle around in the region
	 * structure so we can close it when unmap is called.
	 */
	if (use_pagefile && infop != NULL)
		infop->wnt_handle = hMemory;
	else
		CloseHandle(hMemory);

	*addr = pMemory;
	return (ret);
}