/* * ntfs_vnops.c - NTFS kernel vnode operations. * * Copyright (c) 2006-2011 Anton Altaparmakov. All Rights Reserved. * Portions Copyright (c) 2006-2011 Apple Inc. All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * 3. Neither the name of Apple Inc. ("Apple") nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ALTERNATIVELY, provided that this notice and licensing terms are retained in * full, this file may be redistributed and/or modified under the terms of the * GNU General Public License (GPL) Version 2, in which case the provisions of * that version of the GPL will apply to you instead of the license terms * above. 
You can obtain a copy of the GPL Version 2 at * http://developer.apple.com/opensource/licenses/gpl-2.txt. */ #include <sys/attr.h> #include <sys/buf.h> #include <sys/errno.h> #include <sys/param.h> #include <sys/stat.h> #include <sys/syslimits.h> #include <sys/time.h> #include <sys/ubc.h> #include <sys/ucred.h> #include <sys/uio.h> #include <sys/unistd.h> #include <sys/vnode.h> #include <sys/vnode_if.h> #include <sys/xattr.h> #include <string.h> #include <mach/kern_return.h> #include <mach/memory_object_types.h> #include <kern/debug.h> #include <kern/locks.h> #include <vfs/vfs_support.h> #include "ntfs.h" #include "ntfs_attr.h" #include "ntfs_bitmap.h" #include "ntfs_compress.h" #include "ntfs_debug.h" #include "ntfs_dir.h" #include "ntfs_endian.h" #include "ntfs_hash.h" #include "ntfs_inode.h" #include "ntfs_layout.h" #include "ntfs_lcnalloc.h" #include "ntfs_mft.h" #include "ntfs_mst.h" #include "ntfs_page.h" #include "ntfs_sfm.h" #include "ntfs_time.h" #include "ntfs_unistr.h" #include "ntfs_vnops.h" #include "ntfs_volume.h" /* Global ntfs vnode operations. */ vnop_t **ntfs_vnodeop_p; /** * ntfs_cluster_iodone - complete i/o on a memory region * @cbp: cluster head buffer for which i/o is being completed * @arg: callback argument, we do not use it at present * * In the read case: * * For an mst protected attribute we do the post read mst deprotection and for * an encrypted attribute we do the decryption (not supported at present). * Note we ignore mst fixup errors as those are detected when * ntfs_mft_record_map() is called later which gives us per record granularity. * * In the write case: * * For an mst protected attribute we do the post write mst deprotection. * Writing to encrypted attributes is not supported at present. * * Return 0 on success and errno on error. 
*/ int ntfs_cluster_iodone(buf_t cbp, void *arg __unused) { long size; ntfs_inode *ni; u8 *kend, *kaddr; errno_t err, err2; BOOL is_read = buf_flags(cbp) & B_READ; ni = NTFS_I(buf_vnode(cbp)); size = buf_count(cbp); if (size & (ni->block_size - 1)) panic("%s(): Called with size not a multiple of the inode " "block size.\n", __FUNCTION__); err = buf_map(cbp, (caddr_t*)&kaddr); if (err) { ntfs_error(ni->vol->mp, "Failed to map buffer (error %d).", err); goto err; } kend = kaddr + size; if (NInoMstProtected(ni)) { s64 ofs, data_size, init_size; u32 rec_size = ni->block_size; NTFS_RECORD_TYPE magic = 0; if (!is_read) { if (ni->type == AT_INDEX_ALLOCATION) magic = magic_INDX; else panic("%s(): Unknown mst protected inode " "0x%llx, type 0x%x, name_len " "0x%x.", __FUNCTION__, (unsigned long long)ni->mft_no, (unsigned)le32_to_cpu(ni->type), (unsigned)ni->name_len); } /* The offset in the attribute at which this buffer begins. */ ofs = (s64)buf_lblkno(cbp) << PAGE_SHIFT; lck_spin_lock(&ni->size_lock); data_size = ni->data_size; init_size = ni->initialized_size; lck_spin_unlock(&ni->size_lock); /* * Limit mst deprotection to the initialized size as beyond * that the data is zero and deprotection will fail. And worse * in the write case it will lead to a kernel panic. */ if (ofs + size > init_size) { if (ofs > data_size) { ntfs_error(ni->vol->mp, "Buffer begins past " "the end of the data of the " "attribute (mft_no 0x%llx).", (unsigned long long)ni->mft_no); err = EINVAL; goto unm_err; } if (ofs > init_size) { ntfs_debug("Buffer begins past the end of the " "initialized data of the " "attribute (mft_no 0x%llx).", (unsigned long long)ni->mft_no); goto unm_err; } size = init_size - ofs; kend = kaddr + size; } /* * Do the mst deprotection ignoring errors and make sure we do * not go past the initialized size should an error somehow * have caused the last record to straddle the initialized * size. 
*/ while (kaddr + rec_size <= kend) { if (is_read) (void)ntfs_mst_fixup_post_read( (NTFS_RECORD*)kaddr, rec_size); else if (__ntfs_is_magic(((NTFS_RECORD*)kaddr)->magic, magic)) ntfs_mst_fixup_post_write((NTFS_RECORD*)kaddr); kaddr += rec_size; } } else if (NInoEncrypted(ni)) { // TODO: Need to decrypt the encrypted sectors here. This // cannot happen at present as we deny opening/reading/writing/ // paging encrypted vnodes. panic("%s(): Called for encrypted vnode.\n", __FUNCTION__); } else panic("%s(): Called for normal vnode.\n", __FUNCTION__); unm_err: err2 = buf_unmap(cbp); if (err2) { if (!err) err = err2; ntfs_error(ni->vol->mp, "Failed to unmap buffer (error %d).", err2); } err: return err; } /** * ntfs_buf_iodone - remove the MST fixups when i/o is complete on a buffer * @buf: buffer for which to remove the MST fixups * @arg: unused, always NULL * * ntfs_buf_iodone() is an i/o completion handler which is called when i/o is * completed on a buffer belonging to $MFT/$DATA. It removes the MST fixups * and returns after which the buffer busy state (BL_BUSY flag) is cleared and * others can access the buffer again. * * ntfs_buf_iodone() is called both when the i/o was successful and when it * failed thus we have to deal with that as appropriate. * * Note that ntfs_buf_iodone() is called deep from within the driver stack and * thus there are limitations on what it is allowed to do. In particular it is * not allowed to initiate new i/o operations nor to allocate/free memory. * * WARNING: This function can be called whilst an unmount is in progress and * thus it may not look up nor use the ntfs_volume structure to which the inode * belongs. 
*/
static void ntfs_buf_iodone(buf_t buf, void *arg __unused)
{
	s64 ofs, data_size, init_size;
	vnode_t vn;
	mount_t mp;
	ntfs_inode *ni;
	unsigned size, b_flags;
	errno_t err;

	/*
	 * Note the mount is obtained from the vnode rather than via the
	 * ntfs_volume as the latter may not be used here (see WARNING above).
	 */
	vn = buf_vnode(buf);
	mp = vnode_mount(vn);
	ni = NTFS_I(vn);
	ntfs_debug("Entering for mft_no 0x%llx, lblkno 0x%llx.",
			(unsigned long long)ni->mft_no,
			(unsigned long long)buf_lblkno(buf));
	/* This handler is only ever installed on buffers of $MFT/$DATA. */
	if (!NInoMstProtected(ni) || ni->mft_no || NInoAttr(ni))
		panic("%s(): Called not for $MFT!\n", __FUNCTION__);
	/* The size and offset in the attribute at which this buffer begins. */
	size = buf_count(buf);
	/* Each buffer must describe exactly one inode block. */
	if (size != ni->block_size)
		panic("%s(): size != ni->block_size\n", __FUNCTION__);
	ofs = (s64)buf_lblkno(buf) << ni->block_size_shift;
	/* Take a consistent snapshot of both sizes under the size lock. */
	lck_spin_lock(&ni->size_lock);
	data_size = ni->data_size;
	init_size = ni->initialized_size;
	lck_spin_unlock(&ni->size_lock);
	b_flags = buf_flags(buf);
	/*
	 * Limit mst deprotection to the initialized size as beyond that the
	 * data is zero and deprotection will fail. And worse in the write
	 * case it will lead to a kernel panic.
	 *
	 * Note that both out of range cases below are treated as errors and
	 * that a buffer which merely straddles the initialized size is let
	 * through unmodified.
	 */
	if (ofs + size > init_size) {
		if (ofs > data_size) {
			ntfs_error(mp, "Buffer begins past the end of the "
					"data of the attribute (mft_no "
					"0x%llx).",
					(unsigned long long)ni->mft_no);
			err = EINVAL;
			goto err;
		}
		if (ofs > init_size) {
			ntfs_error(mp, "Buffer begins past the end of the "
					"initialized data of the attribute "
					"(mft_no 0x%llx).",
					(unsigned long long)ni->mft_no);
			err = EINVAL;
			goto err;
		}
	}
	/*
	 * Do not try to remove the fixups if a read failed as there will be
	 * nothing to remove.
	 *
	 * For writes the fixups are always removed, i.e. also when the write
	 * failed.
	 */
	if (!buf_error(buf) || !(b_flags & B_READ)) {
		NTFS_RECORD *rec;

		err = buf_map(buf, (caddr_t*)&rec);
		if (err) {
			ntfs_error(mp, "Failed to map buffer (error %d).", err);
			goto err;
		}
		if (b_flags & B_READ) {
			err = ntfs_mst_fixup_post_read(rec, size);
			if (err) {
				ntfs_error(mp, "Multi sector transfer error "
						"detected in mft_no 0x%llx "
						"(error %d). Run chkdsk",
						(unsigned long long)ni->mft_no,
						err);
				/*
				 * Record the error on the buffer but carry on
				 * so the buffer still gets unmapped below.
				 */
				buf_seterror(buf, err);
			}
		} else
			ntfs_mst_fixup_post_write(rec);
		err = buf_unmap(buf);
		if (err) {
			ntfs_error(mp, "Failed to unmap buffer (error %d).",
					err);
			goto err;
		}
	}
	ntfs_debug("Done.");
	return;
err:
	/* Do not overwrite an error that is already set on the buffer. */
	if (!buf_error(buf))
		buf_seterror(buf, err);
	ntfs_debug("Failed.");
	return;
}

/**
 * ntfs_vnop_strategy - prepare and issue the i/o described by a buffer
 * @a:		arguments to strategy function
 *
 * @a contains:
 *	buf_t a_bp;	buffer for which to prepare and issue the i/o
 *
 * Prepare and issue the i/o described by the buffer @a->a_bp. Adapted from
 * buf_strategy().
 *
 * In NTFS, we only ever get called for buffers which have a page list
 * attached. The page list is mapped and the address of the mapping is stored
 * in (u8*)buf_dataptr(@a->a_bp). The exception to this is i/o for $MFT/$DATA
 * and $MFTMirr/$DATA which is issued via buf_meta_bread(), etc, and thus does
 * not involve a page list at all.
 *
 * On all error paths in this function the error is set on the buffer and the
 * buffer is completed with buf_biodone() before returning.
 *
 * Return 0 on success and errno on error.
 */
static int ntfs_vnop_strategy(struct vnop_strategy_args *a)
{
	s64 ofs, max_end_io;
	daddr64_t lblkno;
	buf_t buf = a->a_bp;
	vnode_t vn = buf_vnode(buf);
	ntfs_inode *ni;
	ntfs_volume *vol;
	void (*old_iodone)(buf_t, void *);
	void *old_transact;
	unsigned b_flags;
	errno_t err, err2;
	BOOL do_fixup;

	/* Same checks as in buf_strategy(). */
	if (!vn || vnode_ischr(vn) || vnode_isblk(vn))
		panic("%s(): !vn || vnode_ischr(vn) || vnode_isblk(vn)\n",
				__FUNCTION__);
	ni = NTFS_I(vn);
	if (!ni) {
		/*
		 * Note @vol is not set up yet at this point which is fine as
		 * the err label does not use it.
		 */
		err = EIO;
		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
		goto err;
	}
	ntfs_debug("Entering for mft_no 0x%llx, type 0x%x, name_len 0x%x, "
			"logical block 0x%llx.", (unsigned long long)ni->mft_no,
			le32_to_cpu(ni->type), (unsigned)ni->name_len,
			(unsigned long long)buf_lblkno(buf));
	if (S_ISDIR(ni->mode))
		panic("%s(): Called for directory vnode.\n", __FUNCTION__);
	vol = ni->vol;
	b_flags = buf_flags(buf);
	/*
	 * If we are called from cluster_io() then pass the request down to the
	 * underlying device containing the NTFS volume. We have no KPI way of
	 * doing this directly so we invoke buf_strategy() and rely on the fact
	 * that it does not do anything other than associate the physical
	 * device with the buffer and then pass the buffer down to the device.
	 */
	if (b_flags & B_CLUSTER)
		goto done;
	/*
	 * If this i/o is for $MFTMirr/$DATA send it through straight without
	 * modifications. This is because we keep the $MFTMirr/$DATA buffers
	 * in memory with the fixups applied for simplicity.
	 */
	if (ni->mft_no == FILE_MFTMirr && !NInoAttr(ni))
		goto done;
	/*
	 * Except for $MFT/$DATA we never do i/o via file system buffers thus
	 * we should never get here.
	 */
	if (ni->mft_no != FILE_MFT || NInoAttr(ni))
		panic("%s(): Called for non-cluster i/o buffer.\n",
				__FUNCTION__);
	/*
	 * We are reading/writing $MFT/$DATA.
	 *
	 * For reads, i/o is allowed up to the data_size whilst for writes, i/o
	 * is only allowed up to the initialized_size.
	 *
	 * Further when reading past the initialized size we do not need to do
	 * i/o at all as we can simply clear the buffer and return success.
	 */
	lblkno = buf_lblkno(buf);
	ofs = lblkno << ni->block_size_shift;
	lck_spin_lock(&ni->size_lock);
	/* Start with the limit for writes; reads raise it further below. */
	max_end_io = ni->initialized_size;
	do_fixup = FALSE;
	if (b_flags & B_READ) {
		if (ofs >= max_end_io) {
			if (max_end_io > ni->data_size)
				panic("%s() initialized_size > data_size\n",
						__FUNCTION__);
			if (ofs < ni->data_size) {
				/*
				 * The spin lock must be dropped before
				 * completing the buffer.
				 */
				lck_spin_unlock(&ni->size_lock);
				buf_clear(buf);
				buf_biodone(buf);
				ntfs_debug("Read past initialized size. "
						"Clearing buffer.");
				return 0;
			}
		}
		max_end_io = ni->data_size;
		/* Reads always need the fixups removed after the i/o. */
		do_fixup = TRUE;
	}
	lck_spin_unlock(&ni->size_lock);
	if (ofs >= max_end_io) {
		/* I/o is out of range. This should never happen. */
		ntfs_error(vol->mp, "Trying to %s buffer for $MFT/$DATA which "
				"is out of range, aborting.",
				b_flags & B_READ ? "read" : "write");
		err = EIO;
		goto err;
	}
	/*
	 * For writes we need to apply the MST fixups before calling
	 * buf_strategy() which will perform the i/o and if the write is for an
	 * mft record that is also in the mft mirror we now need to write it to
	 * the mft mirror as well.
	 *
	 * Note B_WRITE is a pseudo flag and cannot be used for checking thus
	 * check that B_READ is not set which implies it is a write.
	 */
	if (!(b_flags & B_READ)) {
		NTFS_RECORD *rec;
		NTFS_RECORD_TYPE magic;
		BOOL need_mirr_sync;

		err = buf_map(buf, (caddr_t*)&rec);
		if (err) {
			ntfs_error(vol->mp, "Failed to map buffer (error %d).",
					err);
			goto err;
		}
		if (!rec)
			panic("%s(): buf_map() returned NULL.\n", __FUNCTION__);
		/*
		 * The generic variant below is compiled out. As we only get
		 * here for $MFT/$DATA the record magic is always that of an
		 * mft record and only the mirror test remains.
		 */
#if 0
		need_mirr_sync = FALSE;
		if (ni->type == AT_INDEX_ALLOCATION)
			magic = magic_INDX;
		else if (ni == mft_ni || ni == vol->mftmirr_ni) {
			magic = magic_FILE;
			if (ni == mft_ni)
				need_mirr_sync = (lblkno < vol->mftmirr_size);
		} else
			panic("%s(): Unknown mst protected inode 0x%llx, type "
					"0x%x, name_len 0x%x.", __FUNCTION__,
					(unsigned long long)ni->mft_no,
					(unsigned)le32_to_cpu(ni->type),
					(unsigned)ni->name_len);
#else
		need_mirr_sync = (lblkno < vol->mftmirr_size);
		magic = magic_FILE;
#endif
		/*
		 * Only apply fixups if the record has the correct magic. We
		 * may have detected a multi sector transfer error and are thus
		 * now writing a BAAD record in which case we do not want to
		 * touch its contents.
		 *
		 * Further, if there is an error do not sync the record to the
		 * mft mirror as that may still be intact and we do not want to
		 * overwrite the correct data with corrupt data.
		 */
		if (__ntfs_is_magic(rec->magic, magic)) {
			err = ntfs_mst_fixup_pre_write(rec, ni->block_size);
			if (err) {
				/* The record is corrupt, do not write it. */
				ntfs_error(vol->mp, "Failed to apply mst "
						"fixups (mft_no 0x%llx, type "
						"0x%x, offset 0x%llx).",
						(unsigned long long)ni->mft_no,
						(unsigned)le32_to_cpu(ni->type),
						(unsigned long long)ofs);
				err = EIO;
				goto unm_err;
			}
			/* Fixups are now applied so they must be removed. */
			do_fixup = TRUE;
			if (need_mirr_sync) {
				/*
				 * Note we continue despite an error as we may
				 * succeed to write the actual mft record.
				 */
				err = ntfs_mft_mirror_sync(vol, lblkno,
						(MFT_RECORD*)rec,
						!(b_flags & B_ASYNC));
				if (err)
					ntfs_error(vol->mp, "Failed to sync "
							"mft mirror (error "
							"%d). Run chkdsk.",
							err);
			}
		}
		/*
		 * A failure to unmap is only logged, i.e. the i/o is still
		 * issued below.
		 */
		err = buf_unmap(buf);
		if (err)
			ntfs_error(vol->mp, "Failed to unmap buffer (error "
					"%d).", err);
	}
	/*
	 * For both reads and writes we need to register our i/o completion
	 * handler which will be called after i/o is complete (including on i/o
	 * failure) and in which we will remove the MST fixups so the buffer in
	 * memory never has MST fixups applied unless it is under i/o in which
	 * case it is BL_BUSY and thus cannot be accessed by anyone so it is
	 * safe to have the MST fixups applied whilst i/o is in flight.
	 *
	 * Note for writes of records without the expected magic no fixups were
	 * applied above, thus @do_fixup is false and no handler is registered.
	 */
	if (do_fixup) {
		buf_setfilter(buf, ntfs_buf_iodone, NULL, &old_iodone,
				&old_transact);
		if (old_iodone || old_transact)
			panic("%s(): Buffer for $MFT/$DATA already had an i/o "
					"completion handler assigned!\n",
					__FUNCTION__);
	}
	/*
	 * Everything is set up. Pass the i/o onto the buffer layer.
	 *
	 * When the i/o is done it will call our i/o completion handler which
	 * will remove the mst fixups.
	 */
done:
	return buf_strategy(vol->dev_vn, a);
unm_err:
	/* Error with the buffer still mapped: unmap it before completing. */
	err2 = buf_unmap(buf);
	if (err2)
		ntfs_error(vol->mp, "Failed to unmap buffer in error code "
				"path (error %d).", err2);
err:
	/* Complete the buffer with the error so waiters are released. */
	buf_seterror(buf, err);
	buf_biodone(buf);
	return err;
}

/**
 * ntfs_vnop_lookup - find a vnode inside an ntfs directory given its name
 * @a:		arguments to lookup function
 *
 * @a contains:
 *	vnode_t a_dvp;			directory vnode in which to search
 *	vnode_t *a_vpp;			destination pointer for the found vnode
 *	struct componentname *a_cnp;	name to find in the directory vnode
 *	vfs_context_t a_context;
 *
 * In short, ntfs_vnop_lookup() looks for the vnode represented by the name
 * @a->a_cnp in the directory vnode @a->a_dvp and if found returns the vnode in
 * *@a->a_vpp.
 *
 * Return 0 on success and the error code on error. A return value of ENOENT
 * does not signify an error as such but merely the fact that the name
 * @a->a_cnp is not present in the directory @a->a_dvp. When the lookup is
 * done for purposes of create, including for the destination of a rename, we
 * return EJUSTRETURNED instead of ENOENT when the name is not found. This
 * allows the VFS to proceed with the create/rename.
 *
 * To simplify matters for us, we do not treat the DOS and WIN32 filenames as
 * two hard links but instead if the lookup matches a DOS filename, we return
 * the corresponding WIN32 filename instead.
 *
 * There are three cases we need to distinguish here:
 *
 * 1) The name perfectly matches (i.e. including case) a directory entry with a
 *    filename in the WIN32 or POSIX namespaces. In this case
 *    ntfs_lookup_inode_by_name() will return with name set to NULL and we
 *    just use the name as supplied in @a->a_cnp.
 * 2) The name matches (not including case) a directory entry with a filename
 *    in the WIN32 or POSIX namespaces. In this case
 *    ntfs_lookup_inode_by_name() will return with name set to point to an
 *    allocated ntfs_dir_lookup_name structure containing the properly cased
 *    little endian Unicode name.
We convert the name to decomposed UTF-8 and * use that name. * 3) The name matches either perfectly or not (i.e. we do not care about case) * a directory entry with a filename in the DOS namespace. In this case * ntfs_lookup_inode_by_name() will return with name set to point to an * allocated ntfs_dir_lookup_name structure which just tells us that the * name is in the DOS namespace. We read the inode and find the filename in * the WIN32 namespace corresponding to the matched DOS name. We then * convert the name to decomposed UTF-8 and use that name to update the * vnode identity with. */ static int ntfs_vnop_lookup(struct vnop_lookup_args *a) { MFT_REF mref; ino64_t mft_no; unsigned long op; struct componentname *name_cn, *cn; ntfs_inode *ni, *dir_ni = NTFS_I(a->a_dvp); vnode_t vn; ntfs_volume *vol; ntfschar *ntfs_name; ntfs_dir_lookup_name *name = NULL; u8 *utf8_name = NULL; size_t ntfs_name_size, utf8_size; signed ntfs_name_len; int err; /* * This is rather gross but several other file systems do it so perhaps * the large stack (16kiB I believe) in the OS X kernel is big enough. * If we do not want to do the static allocation then simply set * ntfs_name to NULL and utf8_to_ntfs() will allocate the memory for * us. (We then have to free it, see utf8_to_ntfs() description for * details.) */ ntfschar ntfs_name_buf[NTFS_MAX_NAME_LEN]; struct componentname cn_buf; #ifdef DEBUG static const char *ops[4] = { "LOOKUP", "CREATE", "DELETE", "RENAME" }; #endif if (!dir_ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return EINVAL; } vol = dir_ni->vol; name_cn = cn = a->a_cnp; op = cn->cn_nameiop; ntfs_debug("Looking up %.*s in directory inode 0x%llx for %s, flags " "0x%lx.", (int)cn->cn_namelen, cn->cn_nameptr, (unsigned long long)dir_ni->mft_no, op < 4 ? ops[op] : "UNKNOWN", (unsigned long)cn->cn_flags); /* * Ensure we are being called for a directory in case we are not being * called from the VFS. 
*/ if (!S_ISDIR(dir_ni->mode)) { ntfs_error(vol->mp, "Not a directory."); return ENOTDIR; } lck_rw_lock_shared(&dir_ni->lock); /* Do not allow messing with the inode once it has been deleted. */ if (NInoDeleted(dir_ni)) { /* Remove the inode from the name cache. */ cache_purge(dir_ni->vn); lck_rw_unlock_shared(&dir_ni->lock); ntfs_debug("Parent directory is deleted."); return ENOENT; } /* * First, look for the name in the name cache. cache_lookup() returns * -1 if found and @vn is set to the vnode, ENOENT if found and it is a * negative entry thus @vn is not set to anything, or 0 if the lookup * failed in which case we need to do a file system based lookup. * * Note that if @op is CREATE and there is a negative entry in the name * cache cache_lookup() will discard that name and return 0, i.e. the * lookup failed. In this case we will automatically fall through and * do the right thing during the real lookup. */ err = cache_lookup(dir_ni->vn, &vn, cn); if (err) { if (err == -1) { ni = NTFS_I(vn); lck_rw_lock_shared(&ni->lock); /* * Do not allow messing with the inode once it has been * deleted. */ if (!NInoDeleted(ni)) { lck_rw_unlock_shared(&ni->lock); lck_rw_unlock_shared(&dir_ni->lock); *a->a_vpp = vn; ntfs_debug("Done (cached)."); return 0; } lck_rw_unlock_shared(&ni->lock); /* Remove the inode from the name cache. */ cache_purge(vn); vnode_put(vn); ntfs_warning(vol->mp, "Cached but deleted vnode " "found, purged from cache and doing " "real lookup."); } else { lck_rw_unlock_shared(&dir_ni->lock); if (err == ENOENT) { ntfs_debug("Done (cached, negative)."); return err; } ntfs_error(vol->mp, "cache_lookup() failed (error " "%d).", err); return err; } } /* We special case "." and ".." as they are emulated on NTFS. */ if (cn->cn_namelen == 1 && cn->cn_nameptr[0] == '.') { /* "." is not cached. 
*/ cn->cn_flags &= ~MAKEENTRY; if (op == RENAME) { lck_rw_unlock_shared(&dir_ni->lock); ntfs_debug("Op is RENAME but name is \".\", returning " "EISDIR."); return EISDIR; } err = vnode_get(dir_ni->vn); lck_rw_unlock_shared(&dir_ni->lock); if (err) { ntfs_error(vol->mp, "Failed to get iocount reference " "on current directory (error %d).", err); return err; } ntfs_debug("Got \".\" directory 0x%llx.", (unsigned long long)dir_ni->mft_no); *a->a_vpp = dir_ni->vn; return 0; } else if (cn->cn_flags & ISDOTDOT) { /* ".." is not cached. */ cn->cn_flags &= ~MAKEENTRY; vn = vnode_getparent(dir_ni->vn); if (vn) { lck_rw_unlock_shared(&dir_ni->lock); ntfs_debug("Got \"..\" directory 0x%llx of directory " "0x%llx.", (unsigned long long)NTFS_I(vn)->mft_no, (unsigned long long)dir_ni->mft_no); *a->a_vpp = vn; return 0; } /* * Look up a filename attribute in the mft record of the * directory @dir_ni and use its parent mft reference to run an * ntfs_inode_get() on it to obtain an inode for "..". */ err = ntfs_inode_get_name_and_parent_mref(dir_ni, FALSE, &mref, NULL); lck_rw_unlock_shared(&dir_ni->lock); if (err) { ntfs_error(vol->mp, "Failed to obtain parent mft " "reference for directory 0x%llx " "(error %d).", (unsigned long long)dir_ni->mft_no, err); return err; } mft_no = MREF(mref); err = ntfs_inode_get(vol, mft_no, FALSE, LCK_RW_TYPE_SHARED, &ni, NULL, NULL); if (err) { ntfs_error(vol->mp, "Failed to obtain parent inode " "0x%llx for directory 0x%llx (error " "%d).", (unsigned long long)mft_no, (unsigned long long)dir_ni->mft_no, err); return err; } /* Consistency check. */ if (MSEQNO(mref) != ni->seq_no) { lck_rw_unlock_shared(&ni->lock); (void)vnode_put(ni->vn); ntfs_error(vol->mp, "Found stale parent mft reference " "in filename of directory 0x%llx. " "Volume is corrupt. 
Run chkdsk.", (unsigned long long)dir_ni->mft_no); return EIO; } if (!S_ISDIR(ni->mode)) { lck_rw_unlock_shared(&ni->lock); (void)vnode_put(ni->vn); ntfs_error(vol->mp, "Found non-directory parent for " "filename of directory 0x%llx. " "Volume is corrupt. Run chkdsk.", (unsigned long long)dir_ni->mft_no); return EIO; } ntfs_debug("Got \"..\" directory 0x%llx of directory 0x%llx.", (unsigned long long)mft_no, (unsigned long long)dir_ni->mft_no); *a->a_vpp = ni->vn; lck_rw_unlock_shared(&ni->lock); return 0; } /* Convert the name from utf8 to Unicode. */ ntfs_name = ntfs_name_buf; ntfs_name_size = sizeof(ntfs_name_buf); ntfs_name_len = utf8_to_ntfs(vol, (u8*)cn->cn_nameptr, cn->cn_namelen, &ntfs_name, &ntfs_name_size); if (ntfs_name_len < 0) { lck_rw_unlock_shared(&dir_ni->lock); err = -ntfs_name_len; if (err == ENAMETOOLONG) ntfs_debug("Failed (name is too long)."); else ntfs_error(vol->mp, "Failed to convert name to " "Unicode (error %d).", err); return err; } /* Look up the converted name in the directory index. */ err = ntfs_lookup_inode_by_name(dir_ni, ntfs_name, ntfs_name_len, &mref, &name); if (err) { lck_rw_unlock_shared(&dir_ni->lock); if (err != ENOENT) { ntfs_error(vol->mp, "Failed to find name in directory " "(error %d).", err); return err; } not_found: /* * The name does not exist in the directory @dir_ni. * * If creating (or renaming and the name is the destination * name) and we are at the end of a pathname we can consider * allowing the file to be created so return EJUSTRETURN * instead of ENOENT. */ if (cn->cn_flags & ISLASTCN && (op == CREATE || op == RENAME)) { ntfs_debug("Done (not found but for CREATE or RENAME, " "returning EJUSTRETURN)."); return EJUSTRETURN; } /* * Insert a negative entry into the name cache if caching of * this name is desired unless this is a create operation in * which case we do not want to do that. 
*/ if (cn->cn_flags & MAKEENTRY && op != CREATE) cache_enter(dir_ni->vn, NULL, cn); /* * Prevent the caller from trying to add the name to the cache * as well. */ cn->cn_flags &= ~MAKEENTRY; ntfs_debug("Done (not found%s).", cn->cn_flags & MAKEENTRY ? "adding negative name cache entry" : ""); return err; } /* The lookup succeeded. */ mft_no = MREF(mref); ntfs_debug("Name matches inode number 0x%llx.", (unsigned long long)mft_no); /* * Remove all NTFS core system files from the name space so we do not * need to worry about users damaging a volume by writing to them or * deleting/renaming them and so that we can return fsRtParID (1) as * the inode number of the parent of the volume root directory and * fsRtDirID (2) as the inode number of the volume root directory which * are both expected by Carbon and various applications. */ if (mft_no < FILE_first_user) { lck_rw_unlock_shared(&dir_ni->lock); if (name) OSFree(name, sizeof(*name), ntfs_malloc_tag); ntfs_debug("Removing core NTFS system file (mft_no 0x%x) " "from name space.", (unsigned)mft_no); err = ENOENT; goto not_found; } /* * If the name is at the end of a pathname and is about to be deleted * either directly or as a consequence of a rename with the name as the * target, do not cache it. */ if (cn->cn_flags & ISLASTCN && (op == DELETE || op == RENAME)) cn->cn_flags &= ~MAKEENTRY; /* * If a name was returned from the lookup and it is in the POSIX or * WIN32 namespaces we need to convert it into a componentname so we * can use it instead of the existing componentname @cn when getting * the inode. * * If the returned name is in the DOS namespace we have to get the * inode without a name as we need the inode in order to be able to * find the WIN32 name corresponding to the DOS name. Once we have the * name we will update the vnode identity with it. * * If no name was returned, the match was perfect and we just use the * componentname that was passed in by the caller. 
*/ if (name) { if (name->type == FILENAME_DOS) { name_cn = NULL; /* * We do not need @name any more but do not set it to * NULL because we use that fact to distinguish between * the DOS and WIN32/POSIX cases. */ OSFree(name, sizeof(*name), ntfs_malloc_tag); } else { signed res_size; res_size = ntfs_to_utf8(vol, name->name, name->len << NTFSCHAR_SIZE_SHIFT, &utf8_name, &utf8_size); OSFree(name, sizeof(*name), ntfs_malloc_tag); if (res_size < 0) { lck_rw_unlock_shared(&dir_ni->lock); /* Failed to convert name. */ err = -res_size; ntfs_error(vol->mp, "Failed to convert inode " "name to decomposed UTF-8 " "(error %d).", err); return err; } name = NULL; cn_buf = (struct componentname) { .cn_flags = cn->cn_flags, .cn_nameptr = (char*)utf8_name, .cn_namelen = res_size, }; name_cn = &cn_buf; } } /* * @name_cn now contains the correct name of the inode or is NULL. * * If @name_cn is not NULL and its cn_flags indicate that the name is * to be entered into the name cache, ntfs_inode_get() will do this and * clear the MAKEENTRY bit in the cn_flags. * * Note we only drop the directory lock after obtaining the inode * otherwise someone could delete it under our feet. */ err = ntfs_inode_get(vol, mft_no, FALSE, LCK_RW_TYPE_SHARED, &ni, dir_ni->vn, name_cn); lck_rw_unlock_shared(&dir_ni->lock); if (name_cn == &cn_buf) { /* Pick up any modifications to the cn_flags. */ cn->cn_flags = cn_buf.cn_flags; OSFree(utf8_name, utf8_size, ntfs_malloc_tag); } if (!err) { /* Consistency check. */ // FIXME: I cannot remember why we need the "mft_no != // FILE_MFT" test... if (MSEQNO(mref) != ni->seq_no && mft_no != FILE_MFT) { lck_rw_unlock_shared(&ni->lock); (void)vnode_put(ni->vn); ntfs_debug("Inode was deleted and reused under our " "feet."); err = ENOENT; goto not_found; } /* * We found it. Before we can return it, we have to check if * returning this inode is a valid response to the requested * lookup. 
To be more specific, if the lookup was for an * intermediate path component and the inode is not a directory * or symbolic link, it is not a valid response because it * cannot be part of an intermediate path component. In that * case return an error. */ if (cn->cn_flags & ISLASTCN || S_ISDIR(ni->mode) || S_ISLNK(ni->mode)) { /* * Perfect WIN32/POSIX match or wrong case WIN32/POSIX * match, i.e. cases 1 and 2, respectively. */ if (!name) { *a->a_vpp = ni->vn; ntfs_debug("Done (case %d).", name_cn == &cn_buf ? 2 : 1); lck_rw_unlock_shared(&ni->lock); return 0; } /* * We are too indented. Handle DOS matches further * below. */ goto handle_dos_name; } lck_rw_unlock_shared(&ni->lock); (void)vnode_put(ni->vn); ntfs_debug("Done (intermediate path component requested but " "found inode is not a directory or symbolic " "link, returning ENOTDIR)."); err = ENOTDIR; } else { if (err == ENOENT) { ntfs_debug("Inode was deleted under our feet."); goto not_found; } ntfs_error(vol->mp, "Failed to get inode 0x%llx (error %d).", (unsigned long long)mft_no, err); } return err; // TODO: Consider moving this lot to a separate function. handle_dos_name: { MFT_RECORD *m; ntfs_attr_search_ctx *ctx; FILENAME_ATTR *fn; const char *old_name; signed res_size; vn = ni->vn; /* * DOS match. -- Case 3. * * Find the WIN32 name corresponding to the matched DOS name. * * At present @ni is guaranteed to be a base inode. */ err = ntfs_mft_record_map(ni, &m); if (err) { ntfs_error(vol->mp, "Failed to map mft record (error %d).", err); goto err; } ctx = ntfs_attr_search_ctx_get(ni, m); if (!ctx) { ntfs_error(vol->mp, "Failed to allocate search context."); err = ENOMEM; goto unm_err; } do { ATTR_RECORD *attr; u32 val_len; u16 val_ofs; err = ntfs_attr_lookup(AT_FILENAME, AT_UNNAMED, 0, 0, NULL, 0, ctx); if (err) { if (err == ENOENT) { ntfs_error(vol->mp, "WIN32 namespace name is " "missing from inode. 
Run " "chkdsk."); err = EIO; } else ntfs_error(vol->mp, "Failed to find WIN32 " "namespace name in inode " "(error %d).", err); goto put_err; } /* Consistency checks. */ attr = ctx->a; if (attr->non_resident || attr->flags) goto attr_err; val_len = le32_to_cpu(attr->value_length); val_ofs = le16_to_cpu(attr->value_offset); if (val_ofs + val_len > le32_to_cpu(attr->length)) goto attr_err; fn = (FILENAME_ATTR*)((u8*)attr + val_ofs); if ((u32)(sizeof(FILENAME_ATTR) + (fn->filename_length << NTFSCHAR_SIZE_SHIFT)) > val_len) goto attr_err; } while (fn->filename_type != FILENAME_WIN32); /* Convert the name to decomposed UTF-8. */ res_size = ntfs_to_utf8(vol, fn->filename, fn->filename_length << NTFSCHAR_SIZE_SHIFT, &utf8_name, &utf8_size); ntfs_attr_search_ctx_put(ctx); ntfs_mft_record_unmap(ni); if (res_size < 0) { /* Failed to convert name. */ err = -res_size; ntfs_error(vol->mp, "Failed to convert inode name to " "decomposed UTF-8 (error %d).", err); goto err; } /* Update the vnode with the new name if it differs from the old one. */ old_name = vnode_getname(vn); if (!old_name || (ni->link_count > 1 && ((long)strlen(old_name) != res_size || bcmp(old_name, utf8_name, res_size)))) { vnode_update_identity(vn, NULL, (char*)utf8_name, res_size, 0, VNODE_UPDATE_NAME | VNODE_UPDATE_CACHE); } if (old_name) vnode_putname(old_name); /* * Enter the name into the cache (if it is already there this is a * no-op) and prevent the caller from trying to add the name to the * cache as well. */ cn_buf = (struct componentname) { .cn_flags = cn->cn_flags, .cn_nameptr = (char*)utf8_name, .cn_namelen = res_size, }; cache_enter(dir_ni->vn, vn, &cn_buf); cn->cn_flags &= ~MAKEENTRY; OSFree(utf8_name, utf8_size, ntfs_malloc_tag); *a->a_vpp = ni->vn; lck_rw_unlock_shared(&ni->lock); ntfs_debug("Done (case 3)."); return 0; attr_err: ntfs_error(vol->mp, "Filename attribute is corrupt. 
Run chkdsk."); err = EIO; put_err: ntfs_attr_search_ctx_put(ctx); unm_err: ntfs_mft_record_unmap(ni); err: lck_rw_unlock_shared(&ni->lock); (void)vnode_put(vn); return err; } } // TODO: Rename to ntfs_inode_create and move to ntfs_inode.[hc]? /** * ntfs_create - create an inode on an ntfs volume * @dir_vn: vnode of directory in which to create the new inode * @vn: destination pointer for the vnode of the created inode * @cn: componentname specifying name of the inode to create * @va: vnode attributes to assign to the new inode * @lock: if true the ntfs inode of the returned vnode *@vn is locked * * Create an inode with name as specified in @cn in the directory specified by * the vnode @dir_vn. Assign the attributes @va to the created inode. Finally * return the vnode of the created inode in *@vn. * * @va is used to determine which type of inode is to be created, i.e. if * @va->va_type if VDIR create a directory, etc. * * If @lock is true the ntfs inode of the returned vnode is locked for writing * (NTFS_I(@vn)->lock). * * Called by the various inode creation ntfs functions (ntfs_vnop_create(), * ntfs_vnop_mkdir(), ntfs_vnop_symlink(), ntfs_vnop_mknod(), etc) which are * called by the VFS. * * Return 0 on success and errno on error. * * Note we always create inode names in the POSIX namespace. 
*/ static errno_t ntfs_create(vnode_t dir_vn, vnode_t *vn, struct componentname *cn, struct vnode_attr *va, const BOOL lock) { ntfs_inode *ni, *dir_ni = NTFS_I(dir_vn); ntfs_volume *vol; FILENAME_ATTR *fn; ntfschar *ntfs_name; MFT_RECORD *m; ATTR_RECORD *a; size_t ntfs_name_size; signed ntfs_name_len; unsigned fn_alloc, fn_size; errno_t err, err2; if (!dir_ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return EINVAL; } vol = dir_ni->vol; if (!S_ISDIR(dir_ni->mode)) { ntfs_debug("Parent inode is not a directory, returning " "ENOTDIR."); return ENOTDIR; } if (dir_ni->file_attributes & FILE_ATTR_REPARSE_POINT) { ntfs_error(vol->mp, "Parent inode is a reparse point and not " "a regular directory, returning ENOTSUP."); return ENOTDIR; } /* * Create a temporary copy of the filename attribute so we can release * the mft record before we add the directory entry. This is needed * because when we hold the mft record for the newly created inode and * we call ntfs_dir_entry_add() this would cause the mft record for the * directory to be mapped which would result in a deadlock in the event * that both mft records are in the same page. */ fn_alloc = sizeof(FILENAME_ATTR) + NTFS_MAX_NAME_LEN * sizeof(ntfschar); fn = OSMalloc(fn_alloc, ntfs_malloc_tag); if (!fn) { ntfs_error(vol->mp, "Failed to allocate memory for temporary " "filename attribute."); return ENOMEM; } bzero(fn, fn_alloc); /* Begin setting up the temporary filename attribute. */ fn->parent_directory = MK_LE_MREF(dir_ni->mft_no, dir_ni->seq_no); /* FILENAME_POSIX is zero and the attribute is already zeroed. */ /* fn->filename_type = FILENAME_POSIX; */ /* Convert the name from utf8 to Unicode. 
*/ ntfs_name = fn->filename; ntfs_name_size = NTFS_MAX_NAME_LEN * sizeof(ntfschar); ntfs_name_len = utf8_to_ntfs(vol, (u8*)cn->cn_nameptr, cn->cn_namelen, &ntfs_name, &ntfs_name_size); if (ntfs_name_len < 0) { err = -ntfs_name_len; if (err == ENAMETOOLONG) ntfs_debug("Failed (name is too long)."); else ntfs_error(vol->mp, "Failed to convert name to " "Unicode (error %d).", err); goto err; } /* Set the filename length in the temporary filename attribute. */ fn->filename_length = ntfs_name_len; fn_size = sizeof(FILENAME_ATTR) + ntfs_name_len * sizeof(ntfschar); /* If no vnode type is specified default to VREG, i.e. regular file. */ if (va->va_type == VNON) va->va_type = VREG; /* * We support regular files, directories, symbolic links, sockets, * fifos, and block and character device special filesr. */ switch (va->va_type) { case VBLK: case VCHR: if (!VATTR_IS_ACTIVE(va, va_rdev)) { ntfs_error(vol->mp, "va_type is %s but va_rdev is not " "specified!", va->va_type == VBLK ? "VBLK" : "VCHR"); err = EINVAL; goto err; } case VREG: case VDIR: case VLNK: case VSOCK: case VFIFO: break; default: ntfs_error(vol->mp, "Tried to create inode of type 0x%x which " "is not supported at present.", va->va_type); err = ENOTSUP; goto err; } va->va_mode |= VTTOIF(va->va_type); /* If no create time is supplied default it to the current time. */ if (!VATTR_IS_ACTIVE(va, va_create_time)) nanotime(&va->va_create_time); /* * Round the time down to the nearest 100-nano-second interval as * needed for NTFS. */ va->va_create_time.tv_nsec -= va->va_create_time.tv_nsec % 100; /* Set the times in the temporary filename attribute. */ fn->last_access_time = fn->last_mft_change_time = fn->last_data_change_time = fn->creation_time = utc2ntfs(va->va_create_time); /* Set the bits for all the supported fields at once. 
*/ va->va_supported |= VNODE_ATTR_BIT(va_mode) | VNODE_ATTR_BIT(va_flags) | VNODE_ATTR_BIT(va_create_time) | VNODE_ATTR_BIT(va_type); again: /* Lock the target directory and check that it has not been deleted. */ lck_rw_lock_exclusive(&dir_ni->lock); if (!dir_ni->link_count) { /* Remove the target directory from the name cache. */ cache_purge(dir_vn); err = ENOENT; goto unl_err; } /* Allocate and map a new mft record. */ err = ntfs_mft_record_alloc(vol, va, cn, dir_ni, &ni, &m, &a); if (err) { if (err != ENOSPC) ntfs_error(vol->mp, "Failed to allocate a new on-disk " "inode (error %d).", err); goto unl_err; } /* * If requested by the caller, take the ntfs inode lock on the * allocated ntfs inode for writing so no-one can start using it before * it is ready. For example if it is a symbolic link we cannot allow * anyone to look at it until we have set the data size to the symbolic * link target size otherwise a concurrent ntfs_vnop_readlink() would * return EINVAL as it would see a target size of zero. * * Also, if the inode is a symbolic link we need to take the lock so * that we can create the AFP_AfpInfo attribute when we have finished * setting up the inode. */ if (lock || S_ISLNK(ni->mode)) lck_rw_lock_exclusive(&ni->lock); /* * @a now points to the location in the allocated mft record at which * we need to insert the filename attribute so we can insert it without * having to do a lookup first. * * Insert the filename attribute and initialize the value to zero. * This cannot fail as we are dealing with a newly allocated mft record * so there must be enough space for a filename attribute even if the * filename is of the maximum allowed length. */ err = ntfs_resident_attr_record_insert_internal(m, a, AT_FILENAME, NULL, 0, fn_size); if (err) panic("%s(): err\n", __FUNCTION__); /* Finish setting up the filename attribute value. 
*/ fn->file_attributes = ni->file_attributes; /* * Directories need the FILE_ATTR_DUP_FILENAME_INDEX_PRESENT flag set * in their filename attributes both in their mft records and in the * index entries pointing to them but not in the standard information * attribute which is why it is not set in @ni->file_attributes. */ if (va->va_type == VDIR) fn->file_attributes |= FILE_ATTR_DUP_FILENAME_INDEX_PRESENT; /* * Update the data_size in the temporary filename attribute from the * created ntfs inode. This will not be zero for fifos and block and * character device special files for example. */ fn->data_size = ni->data_size; /* * Copy the created filename attribute into place in the attribute * record. */ memcpy((u8*)a + le16_to_cpu(a->value_offset), fn, fn_size); /* * Set the link count to one to indicate there is one filename * attribute inside the mft record. */ m->link_count = const_cpu_to_le16(1); ni->link_count = 1; /* * Ensure the mft record is written to disk. * * Note we do not set any of the NInoDirty*() flags because we have * just created the inode thus all the fields are in sync between the * ntfs_inode @ni and its mft record @m. */ NInoSetMrecNeedsDirtying(ni); /* * Release the mft record. It is safe to do so even though the * directory entry has not been added yet because the inode is still * locked and marked new thus it is not a candidate for syncing yet. */ ntfs_mft_record_unmap(ni); /* * If the inode is a symbolic link now create the AFP_AfpInfo attribute * with the Finder Info specifying that this is a symbolic link. */ if (S_ISLNK(ni->mode)) { err = ntfs_inode_afpinfo_write(ni); /* * If the caller has not requested that the inode be returned * locked unlock it now. */ if (!lock) lck_rw_unlock_exclusive(&ni->lock); if (err) { ntfs_error(vol->mp, "Failed to create AFP_AfpInfo " "attribute in allocated inode 0x%llx " "(error %d).", (unsigned long long)ni->mft_no, err); goto rm_err; } } /* Add the created filename attribute to the parent directory index. 
*/ err = ntfs_dir_entry_add(dir_ni, fn, fn_size, MK_LE_MREF(ni->mft_no, ni->seq_no)); if (!err) { /* Free the temporary filename attribute. */ OSFree(fn, fn_alloc, ntfs_malloc_tag); /* * Invalidate negative cache entries in the directory. We need * to do this because there may be negative cache entries * which would match the name of the just created inode but in * a different case. Such negative cache entries would now be * incorrect thus we need to throw away all negative cache * entries to ensure there cannot be any incorrectly negative * entries in the name cache. */ cache_purge_negatives(dir_vn); /* * Add the inode to the name cache. Note that * ntfs_vnop_lookup() will have caused the name to not be * cached because it will have cleared the MAKEENTRY flag. */ cache_enter(dir_ni->vn, ni->vn, cn); /* We are done with the directory so unlock it. */ lck_rw_unlock_exclusive(&dir_ni->lock); /* * We can finally unlock and unmark as new the new ntfs inode * thus rendering the inode a full member of society. */ ntfs_inode_unlock_alloc(ni); ntfs_debug("Done (new mft_no 0x%llx).", (unsigned long long)ni->mft_no); *vn = ni->vn; return 0; } /* * We failed to add the directory entry thus we have to effectively * delete the created inode again. To do this we need to map the mft * record and mark it as no longer in use. * * We then also need to set the link count in the ntfs inode to zero to * reflect that it is deleted and to ensure that the subsequent * vnode_put() results in ntfs_delete_inode() being called (via * VNOP_INACTIVE() and ntfs_vnop_inactive() respectively). * * But first, unlock the allocated ntfs inode if we locked it above. * No-one can get to it now as it does not have a directory entry * pointing to it. */ rm_err: if (lock) lck_rw_unlock_exclusive(&ni->lock); err2 = ntfs_mft_record_map(ni, &m); if (err2) { ntfs_error(vol->mp, "Failed to map mft record in error code " "path (error %d). 
Run chkdsk to recover the " "lost mft record.", err2); NVolSetErrors(vol); } else { m->flags &= ~MFT_RECORD_IN_USE; NInoSetMrecNeedsDirtying(ni); ntfs_mft_record_unmap(ni); } ni->link_count = 0; lck_rw_unlock_exclusive(&dir_ni->lock); ntfs_inode_unlock_alloc(ni); cache_purge(ni->vn); (void)vnode_put(ni->vn); if (err == EEXIST) { /* * There are two possible reasons why the directory entry * already exists. Either someone created it under our feet in * which case we try to look up the existing vnode and retrn * that instead and failing that we try to create the inode * again or the name really does exist but we have removed it * from the name space thus ntfs_vnop_lookup() will always * return ENOENT/EJUSTRETURN for it. This is the case for the * core system files for example. This would cause an infinite * loop thus we need to check for this case by checking that * the name being created does not match one of the core system * filenames and if it does we return EEXIST. */ if (dir_ni == vol->root_ni) { /* Catch the "." entry. */ if (cn->cn_namelen == 1 && cn->cn_nameptr[0] == '.') goto is_system; /* * Catch the core system files which all start with the * '$' character. */ if (cn->cn_nameptr[0] == '$') { char *n = (char*)cn->cn_nameptr + 1; int l = cn->cn_namelen; if ((l == 4 && !strncmp(n, "MFT", 3)) || (l == 5 && !strncmp(n, "Boot", 4)) || (l == 6 && !strncmp(n, "Quota", 5)) || (l == 7 && ( !strncmp(n, "Volume", 6) || !strncmp(n, "Bitmap", 6) || !strncmp(n, "Secure", 6) || !strncmp(n, "UpCase", 6) || !strncmp(n, "Extend", 6))) || (l == 8 && ( !strncmp(n, "MFTMirr", 7) || !strncmp(n, "LogFile", 7) || !strncmp(n, "AttrDef", 7) || !strncmp(n, "BadClus", 7)))) goto is_system; } } ntfs_debug("Inode was created under our feet."); /* * If the inode was created under our feet, we are creating a * regular file, and the caller did not want an exclusive * create, simply look up the inode and return that. 
*/ if (va->va_type == VREG && !(va->va_vaflags & VA_EXCLUSIVE)) { struct vnop_lookup_args la; cn->cn_nameiop = LOOKUP; la = (struct vnop_lookup_args) { .a_desc = &vnop_lookup_desc, .a_dvp = dir_vn, .a_vpp = vn, .a_cnp = cn, }; err = ntfs_vnop_lookup(&la); cn->cn_nameiop = CREATE; /* * If the inode that was created under our feet was * also deleted under our feet, repeat the whole * process. */ if (err == ENOENT || err == EJUSTRETURN) { *vn = NULL; goto again; } /* * Make sure the vnode we looked up is a regular file * as we would not want to return a directory instead * of a file for example. */ if (!err && vnode_vtype(*vn) != VREG) { (void)vnode_put(*vn); *vn = NULL; err = EEXIST; } } } else ntfs_error(vol->mp, "Failed to add directory entry (error " "%d).", err); err: OSFree(fn, fn_alloc, ntfs_malloc_tag); return err; unl_err: lck_rw_unlock_exclusive(&dir_ni->lock); goto err; is_system: ntfs_error(vol->mp, "Cannot create inode with name %.*s in the volume " "root directory as the name clashes with the name of " "a core system file. Returning EEXIST.", (int)cn->cn_namelen, cn->cn_nameptr); err = EEXIST; *vn = NULL; goto err; } /** * ntfs_vnop_create - create a regular file * @a: arguments to create function * * @a contains: * vnode_t a_dvp; directory in which to create the file * vnode_t *a_vpp; destination pointer for the created file * struct componentname *a_cnp; name of the file to create * struct vnode_attr *a_vap; attributes to set on the created file * vfs_context_t a_context; * * Create a regular file with name as specified in @a->a_cnp in the directory * specified by the vnode @a->a_dvp. Assign the attributes @a->a_vap to the * created file. Finally return the vnode of the created file in *@a->a_vpp. * * Return 0 on success and errno on error. * * Note we always create filenames in the POSIX namespace. 
*/
static int ntfs_vnop_create(struct vnop_create_args *a)
{
	errno_t error;

#ifdef DEBUG
	{
		/* Only used to report the parent directory in the debug log. */
		ntfs_inode *dir_ni = NTFS_I(a->a_dvp);

		if (dir_ni) {
			ntfs_debug("Creating a file named %.*s in directory mft_no "
					"0x%llx.", (int)a->a_cnp->cn_namelen,
					a->a_cnp->cn_nameptr,
					(unsigned long long)dir_ni->mft_no);
		}
	}
#endif
	/* All the work is done by ntfs_create(); FALSE is its @lock argument. */
	error = ntfs_create(a->a_dvp, a->a_vpp, a->a_cnp, a->a_vap, FALSE);
	ntfs_debug("Done (error %d).", (int)error);
	return error;
}

/**
 * ntfs_vnop_mknod - create a special file node
 * @a:		arguments to mknod function
 *
 * @a contains:
 *	vnode_t a_dvp;			directory in which to create the file
 *	vnode_t *a_vpp;			destination pointer for the created file
 *	struct componentname *a_cnp;	name of the file to create
 *	struct vnode_attr *a_vap;	attributes to set on the created file
 *	vfs_context_t a_context;
 *
 * Create a special file node with name as specified in @a->a_cnp in the
 * directory specified by the vnode @a->a_dvp. Assign the attributes @a->a_vap
 * to the created node. Finally return the vnode of the created file in
 * *@a->a_vpp.
 *
 * The type of special file node to create is specified by the caller in
 * @a->a_vap->va_type and can be one of:
 *	VSOCK - create a socket
 *	VFIFO - create a fifo
 *	VBLK - create a block special device
 *	VCHR - create a character special device
 *
 * Return 0 on success and errno on error.
 *
 * Note we always create filenames in the POSIX namespace.
*/ static int ntfs_vnop_mknod(struct vnop_mknod_args *a) { errno_t err; #ifdef DEBUG ntfs_inode *ni = NTFS_I(a->a_dvp); if (ni) ntfs_debug("Creating a special inode of type 0x%x named %.*s " "in directory mft_no 0x%llx.", a->a_vap->va_type, (int)a->a_cnp->cn_namelen, a->a_cnp->cn_nameptr, (unsigned long long)ni->mft_no); #endif err = ntfs_create(a->a_dvp, a->a_vpp, a->a_cnp, a->a_vap, FALSE); ntfs_debug("Done (error %d).", (int)err); return err; } /** * ntfs_vnop_open - open a vnode * @a: arguments to open function * * @a contains: * vnode_t a_vp; vnode to open * int a_mode; mode to open the file with * vfs_context_t a_context; * * Open the vnode @a->a_vp with mode @a->a_mode. * * Note the VFS does a lot of checking before ntfs_vnop_open() is called * including permissions and checking for a read-only file system thus we do * not need to worry about the case where the driver is compiled read-only as * the volume is then mounted read-only so the vfs catches all write accesses * very early on and denies them. * * Return 0 on success and errno on error. */ static int ntfs_vnop_open(struct vnop_open_args *a) { ntfs_inode *base_ni, *ni = NTFS_I(a->a_vp); errno_t err = 0; if (!ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return EINVAL; } ntfs_debug("Entering for mft_no 0x%llx, mode 0x%x.", (unsigned long long)ni->mft_no, (unsigned)a->a_mode); base_ni = ni; if (NInoAttr(ni)) base_ni = ni->base_ni; /* * All the core system files cannot possibly be opened because they are * removed from the name space thus it is impossible for a process to * obtain a vnode to them thus VNOP_OPEN() can never be called for * them. The only exception is the root directory which we of course * allow access to. */ if (ni->mft_no < FILE_first_user && ni != ni->vol->root_ni) panic("%s(): Called for a system inode. This is not " "possible.\n", __FUNCTION__); lck_rw_lock_shared(&ni->lock); /* Do not allow messing with the inode once it has been deleted. 
*/ if (NInoDeleted(ni)) { lck_rw_unlock_shared(&ni->lock); /* Remove the inode from the name cache. */ cache_purge(ni->vn); ntfs_debug("Cannot open deleted mft_no 0x%llx, returning " "ENOENT.", (unsigned long long)ni->mft_no); return ENOENT; } /* * Do not allow opening encrpyted files as we do not support reading, * writing, nor mmap()ing them. */ if (NInoEncrypted(ni)) { lck_rw_unlock_shared(&ni->lock); ntfs_debug("Cannot open encrypted mft_no 0x%llx, returning " "EACCES.", (unsigned long long)ni->mft_no); return EACCES; } lck_rw_unlock_shared(&ni->lock); /* * We keep track of how many times the base vnode has been opened and * we count other vnodes towards the base vnode open count to ensure * we do the right thing in ntfs_unlink(). */ OSIncrementAtomic(&base_ni->nr_opens); ntfs_debug("Done (error %d).", (int)err); return err; } /** * ntfs_vnop_close - close a vnode * @a: arguments to close function * * @a contains: * vnode_t a_vp; vnode to close * int a_fflag; close flags (FREAD and/or FWRITE for example) * vfs_context_t a_context; * * Close the vnode @a->a_vp with flags @a->a_fflag. * * Return 0 on success and errno on error. */ static int ntfs_vnop_close(struct vnop_close_args *a) { vnode_t vn = a->a_vp; ntfs_inode *base_ni, *ni = NTFS_I(vn); if (!ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return 0; } ntfs_debug("Entering for mft_no 0x%llx, fflag 0x%x.", (unsigned long long)ni->mft_no, a->a_fflag); base_ni = ni; if (NInoAttr(ni)) base_ni = ni->base_ni; /* * We keep track of how many times the base vnode has been opened and * we count other vnodes towards the base vnode open count to ensure * we do the right thing in ntfs_unlink(). */ OSDecrementAtomic(&base_ni->nr_opens); /* * If the vnode is still in use release any expired directory hints. * * If the vnode is no longer in use release all directory hints. * * Note we check for presence of directory hints outside the locks as * an optimization. 
It is not a disaster if we miss any as all will be * released in ntfs_inode_free() before the inode is thrown away at the * latest. */ if (ni != base_ni && ni->type == AT_INDEX_ALLOCATION && ni->nr_dirhints) { int busy; busy = vnode_isinuse(vn, ni->nr_refs + 1); lck_rw_lock_exclusive(&ni->lock); ntfs_dirhints_put(ni, busy); lck_rw_unlock_exclusive(&ni->lock); } ntfs_debug("Done."); return 0; } /** * ntfs_vnop_access - * */ static int ntfs_vnop_access(struct vnop_access_args *a) { errno_t err; ntfs_debug("Entering."); // TODO: err = ENOTSUP; ntfs_debug("Done (error %d).", (int)err); return err; } /** * ntfs_vnop_getattr - get attributes about a vnode or about the mounted volume * @a: arguments to getattr function * * @a contains: * vnode_t a_vp; vnode for which to return attributes * struct vnode_attr *a_vap; attributes to return and destination * vfs_context_t a_context; * * Return the attributes described in @a_vap about the vnode @a_vp. Some * attributes are intercepted by the VFS in getattrlist() and getvolattrlist() * so we do not bother with them. * * At present we do not support all attributes. We declare what we support to * the world in our VFS_GETATTR() function (ntfs_vfsops.c::ntfs_getattr()) so * do not forget to update that when support for further attributes is added * here. * * Return 0 on success and errno on error. * * TODO: Implement more attributes. 
*/
static int ntfs_vnop_getattr(struct vnop_getattr_args *a)
{
	MFT_REF parent_mref;
	ino64_t mft_no;
	s64 on_disk_size;
	struct vnode_attr *va = a->a_vap;
	ntfs_inode *ni, *base_ni;
	ntfs_volume *vol;
	const char *name;
	FILE_ATTR_FLAGS file_attributes;
	unsigned flags;
	errno_t err;
	/* Tracks how @base_ni->lock is currently held (shared or exclusive). */
	lck_rw_type_t lock;
	BOOL is_root, name_is_done, have_parent;

	ni = NTFS_I(a->a_vp);
	if (!ni) {
		/*
		 * NOTE(review): this returns 0 (success) without filling in
		 * any attributes, whereas ntfs_vnop_open(),
		 * ntfs_vnop_setattr() and ntfs_create() return EINVAL for a
		 * NULL ntfs_inode - confirm that this is intended.
		 */
		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
		return 0;
	}
	vol = ni->vol;
	mft_no = ni->mft_no;
	have_parent = name_is_done = is_root = FALSE;
	ntfs_debug("Entering for mft_no 0x%llx.", (unsigned long long)mft_no);
	/*
	 * For an attribute inode the base inode lock is taken first and then
	 * the lock of the attribute inode itself.
	 */
	base_ni = ni;
	if (NInoAttr(ni)) {
		base_ni = ni->base_ni;
		lck_rw_lock_shared(&base_ni->lock);
	}
	lck_rw_lock_shared(&ni->lock);
	lock = LCK_RW_TYPE_SHARED;
	/* Do not allow messing with the inode once it has been deleted. */
	if (NInoDeleted(ni)) {
		/* Remove the inode from the name cache. */
		cache_purge(ni->vn);
		err = ENOENT;
		goto err;
	}
	/*
	 * If this is the root directory, leave it to the VFS to get the name
	 * from the mountpoint (see below).
	 */
	if (base_ni == vol->root_ni)
		name_is_done = is_root = TRUE;
	/* For directories always return a link count of 1. */
	va->va_nlink = 1;
	if (!S_ISDIR(ni->mode))
		va->va_nlink = ni->link_count;
	va->va_rdev = (dev_t)0;
	switch (ni->mode & S_IFMT) {
	case S_IFBLK:
	case S_IFCHR:
		/*
		 * For block and character device special inodes return the
		 * device.
		 */
		va->va_rdev = ni->rdev;
		/* FALLTHROUGH - device inodes also report zero sizes. */
	case S_IFIFO:
	case S_IFSOCK:
		/*
		 * For fifos, sockets, block and character device special files
		 * return all sizes set to zero.
		 */
		va->va_total_alloc = va->va_data_alloc = va->va_total_size =
				va->va_data_size = 0;
		break;
	default:
		lck_spin_lock(&ni->size_lock);
		/*
		 * We cheat for both the total size and the total allocated
		 * size and just return the attribute size rather than looping
		 * over all ($DATA?) attributes and adding up their sizes.
		 */
		va->va_total_size = va->va_data_size = ni->data_size;
		/*
		 * Resident attributes reside inside the on-disk inode and thus
		 * have no on-disk allocation because the on-disk inode itself
		 * is already accounted for in the allocated size of the $MFT
		 * system file which contains the table of on-disk inodes.
		 * Perhaps more importantly, if we delete a resident file no
		 * space would be freed up on the volume, thus we definitely
		 * need to return zero for the allocated size of such resident
		 * files.
		 */
		on_disk_size = 0;
		if (NInoNonResident(ni)) {
			if (ni->type == AT_DATA && (NInoCompressed(ni) ||
					NInoSparse(ni)))
				on_disk_size = ni->compressed_size;
			else
				on_disk_size = ni->allocated_size;
		}
		va->va_total_alloc = va->va_data_alloc = on_disk_size;
		lck_spin_unlock(&ni->size_lock);
	}
	va->va_iosize = ubc_upl_maxbufsize();
	va->va_uid = ni->uid;
	va->va_gid = ni->gid;
	va->va_mode = ni->mode;
	file_attributes = base_ni->file_attributes;
	/*
	 * Do not allow the volume root directory to be read-only or hidden and
	 * do not allow directories in general to be read-only as Windows uses
	 * the read-only bit on directories for completely different purposes
	 * like customized/specialized folder views which are lost when you
	 * clear the read-only bit.
	 */
	if (S_ISDIR(base_ni->mode)) {
		file_attributes &= ~FILE_ATTR_READONLY;
		if (is_root)
			file_attributes &= ~FILE_ATTR_HIDDEN;
	}
	flags = 0;
	/*
	 * if (NInoCompressed(ni))
	 *	flags |= SF_COMPRESSED;
	 */
	if (file_attributes & FILE_ATTR_READONLY)
		flags |= UF_IMMUTABLE;
	if (file_attributes & FILE_ATTR_HIDDEN)
		flags |= UF_HIDDEN;
	/*
	 * Windows does not set the "needs archiving" bit on directories
	 * except for encrypted directories where it does set the bit.
	 */
	if ((!S_ISDIR(base_ni->mode) ||
			file_attributes & FILE_ATTR_ENCRYPTED) &&
			!(file_attributes & FILE_ATTR_ARCHIVE))
		flags |= SF_ARCHIVED;
	va->va_flags = flags;
	va->va_create_time = base_ni->creation_time;
	va->va_access_time = base_ni->last_access_time;
	va->va_modify_time = base_ni->last_data_change_time;
	va->va_change_time = base_ni->last_mft_change_time;
	/*
	 * NTFS does not distinguish between the inode and its hard links.
	 *
	 * We have to remap the root directory inode to inode number 2, i.e.
	 * fsRtDirID, for compatibility with Carbon.
	 */
	if (!is_root)
		va->va_fileid = mft_no;
	else
		va->va_fileid = 2;
	va->va_fsid = vol->dev;
	/* FIXME: What is the difference between the below two? */
	va->va_filerev = base_ni->seq_no;
	va->va_gen = base_ni->seq_no;
	va->va_encoding = 0x7e; /* = kTextEncodingMacUnicode */
	/* Declare every attribute that was filled in above as supported. */
	va->va_supported |=
			VNODE_ATTR_BIT(va_rdev) |
			VNODE_ATTR_BIT(va_nlink) |
			VNODE_ATTR_BIT(va_total_size) |
			VNODE_ATTR_BIT(va_total_alloc) |
			VNODE_ATTR_BIT(va_data_size) |
			VNODE_ATTR_BIT(va_data_alloc) |
			VNODE_ATTR_BIT(va_iosize) |
			VNODE_ATTR_BIT(va_uid) |
			VNODE_ATTR_BIT(va_gid) |
			VNODE_ATTR_BIT(va_mode) |
			VNODE_ATTR_BIT(va_flags) |
			VNODE_ATTR_BIT(va_create_time) |
			VNODE_ATTR_BIT(va_access_time) |
			VNODE_ATTR_BIT(va_modify_time) |
			VNODE_ATTR_BIT(va_change_time) |
			VNODE_ATTR_BIT(va_fileid) |
			VNODE_ATTR_BIT(va_fsid) |
			VNODE_ATTR_BIT(va_filerev) |
			VNODE_ATTR_BIT(va_gen) |
			VNODE_ATTR_BIT(va_encoding) |
			0;
	/*
	 * Return va_parentid, i.e. the mft record number of the parent of the
	 * inode, if it was requested.
	 *
	 * We have to return 1, i.e. fsRtParID, for the parent inode number of
	 * the root directory inode for compatibility with Carbon.  Similarly
	 * we have to return 2, i.e. fsRtDirID, if the parent inode is the root
	 * directory inode.
	 *
	 * For all other inodes we try to get the parent from the vnode and if
	 * it does not have the vnode cached then if the inode is an attribute
	 * inode we return the inode number of the base inode (in line with how
	 * named streams work on Mac OS X) and otherwise we obtain the parent
	 * mft reference by looking up a filename attribute record in the mft
	 * record of the inode and obtaining the parent mft record reference
	 * from there.
	 *
	 * There is one pitfall with this approach for files and that is that a
	 * file may have multiple parents and we are returning a random one but
	 * that is the best we can do.
	 *
	 * To make this a little better we get the name at the same time as we
	 * get the parent mft reference so we can at least return a parent id
	 * and name that match, i.e. the name is present in the parent id.
	 *
	 * And to make this even better, when the parent is requested and a
	 * name is cached in the vnode, we use the name in the vnode to find
	 * the parent that matches that name if it exists.  If it does not
	 * exist we revert to finding a random parent.
	 */
	if (VATTR_IS_ACTIVE(va, va_parentid)) {
		ino64_t parent_mft_no;
		vnode_t parent_vn;

		if (is_root && base_ni == ni)
			VATTR_RETURN(va, va_parentid, 1);
		else if ((parent_vn = vnode_getparent(ni->vn))) {
			parent_mft_no = NTFS_I(parent_vn)->mft_no;
			(void)vnode_put(parent_vn);
			have_parent = TRUE;
			if (parent_mft_no == FILE_root)
				parent_mft_no = 2;
			VATTR_RETURN(va, va_parentid, parent_mft_no);
		} else if (ni != base_ni) {
			parent_mft_no = base_ni->mft_no;
			if (parent_mft_no == FILE_root)
				parent_mft_no = 2;
			VATTR_RETURN(va, va_parentid, parent_mft_no);
		} else /* if (ni == base_ni) */ {
			/*
			 * The name, if requested, is obtained together with
			 * the parent here so the va_name handling further
			 * below is skipped.
			 */
			name_is_done = TRUE;
			name = NULL;
			if (VATTR_IS_ACTIVE(va, va_name))
				name = va->va_name;
			err = ntfs_inode_get_name_and_parent_mref(base_ni,
					FALSE, &parent_mref, name);
			if (err) {
				ntfs_error(base_ni->vol->mp, "Failed to obtain "
						"parent mft reference for "
						"mft_no 0x%llx (error %d).",
						(unsigned long long)
						base_ni->mft_no, err);
				goto err;
			}
			parent_mft_no = MREF(parent_mref);
			if (parent_mft_no == FILE_root)
				parent_mft_no = 2;
			va->va_parentid = parent_mft_no;
			va->va_supported |= VNODE_ATTR_BIT(va_parentid) |
					(name ? VNODE_ATTR_BIT(va_name) : 0);
		}
	}
	/*
	 * Return va_name, i.e. the name of the inode, if it was requested.
	 *
	 * If this is the root directory of the volume, leave it to the VFS to
	 * find the mounted-on name, which is different from the real volume
	 * root directory name of "." (this is ensured by the fact that
	 * @name_is_done was set to TRUE for the root directory earlier).
	 *
	 * For all other inodes we try to get the name from the vnode and if it
	 * does not have the name cached we obtain the name by looking up a
	 * filename attribute record in the mft record of the inode and using
	 * that.
	 *
	 * Note we do not need to do anything if we dealt with the name as part
	 * of dealing with va_parentid above.  In this case @name_is_done will
	 * be set to true.
	 *
	 * Also we do not need to do anything if we tried to deal with
	 * va_parentid above and failed as we would only fail again here.  This
	 * means that if @err is not zero we skip the call to
	 * ntfs_inode_get_name_and_parent_mref().
	 *
	 * TODO: What do we return for attribute inodes?  Shall we exclude them
	 * from VNOP_GETATTR() altogether?  For now we simply do not return a
	 * name for them.
	 */
	if (!name_is_done && VATTR_IS_ACTIVE(va, va_name) && ni == base_ni) {
		name = vnode_getname(base_ni->vn);
		if (name) {
			(void)strlcpy(va->va_name, name, MAXPATHLEN - 1);
			VATTR_SET_SUPPORTED(va, va_name);
			(void)vnode_putname(name);
		} else {
			err = ntfs_inode_get_name_and_parent_mref(base_ni,
					have_parent, &parent_mref, va->va_name);
			if (err) {
				ntfs_error(base_ni->vol->mp, "Failed to obtain "
						"parent mft reference for "
						"mft_no 0x%llx (error %d).",
						(unsigned long long)
						base_ni->mft_no, err);
				goto err;
			}
			/*
			 * We forcibly overwrite the parent id with the
			 * possibly new parent id here to be consistent with
			 * the name, i.e. we want the name we return to
			 * actually exist in the returned parent.
			 *
			 * If we already had the parent id from before then
			 * ntfs_inode_get_name_and_parent_mref() will have
			 * found the name matching this parent id thus our
			 * setting of the parent id here will be a no-op.
			 */
			va->va_parentid = MREF(parent_mref);
			if (va->va_parentid == FILE_root)
				va->va_parentid = 2;
			va->va_supported |= VNODE_ATTR_BIT(va_parentid) |
					VNODE_ATTR_BIT(va_name);
		}
	}
	/*
	 * Unlock the attribute inode as we do not need it any more and so we
	 * cannot deadlock with converting the lock on the base inode to
	 * exclusive and with the call to ntfs_inode_afpinfo_read() below.
	 */
	if (ni != base_ni)
		lck_rw_unlock_shared(&ni->lock);
	if (VATTR_IS_ACTIVE(va, va_backup_time)) {
		if (!NInoValidBackupTime(base_ni)) {
			/*
			 * When the upgrade attempt fails we take the lock
			 * exclusive from scratch and then recheck that the
			 * inode has not been deleted before carrying on.
			 */
			if (!lck_rw_lock_shared_to_exclusive(&base_ni->lock)) {
				lck_rw_lock_exclusive(&base_ni->lock);
				if (NInoDeleted(base_ni)) {
					cache_purge(base_ni->vn);
					lck_rw_unlock_exclusive(&base_ni->lock);
					return ENOENT;
				}
			}
			lock = LCK_RW_TYPE_EXCLUSIVE;
			/*
			 * Load the AFP_AfpInfo stream and initialize the
			 * backup time and Finder Info (if they are not already
			 * valid).
			 */
			err = ntfs_inode_afpinfo_read(base_ni);
			if (err) {
				ntfs_error(base_ni->vol->mp, "Failed to "
						"read AFP_AfpInfo attribute "
						"from inode 0x%llx (error "
						"%d).", (unsigned long long)
						base_ni->mft_no, err);
				lck_rw_unlock_exclusive(&base_ni->lock);
				return err;
			}
			if (!NInoValidBackupTime(base_ni))
				panic("%s(): !NInoValidBackupTime(base_ni)\n",
						__FUNCTION__);
		}
		VATTR_RETURN(va, va_backup_time, base_ni->backup_time);
	}
	/* Drop the base inode lock in whichever mode it is held now. */
	if (lock == LCK_RW_TYPE_SHARED)
		lck_rw_unlock_shared(&base_ni->lock);
	else
		lck_rw_unlock_exclusive(&base_ni->lock);
	ntfs_debug("Done.");
	return 0;
err:
	/*
	 * Every jump to this label happens while both locks are still held
	 * shared, i.e. before the attribute inode is unlocked above.
	 */
	lck_rw_unlock_shared(&ni->lock);
	if (ni != base_ni)
		lck_rw_unlock_shared(&base_ni->lock);
	return err;
}

/**
 * ntfs_vnop_setattr - set attributes of a vnode or of the mounted volume
 * @a:		arguments to setattr function
 *
 * @a contains:
 *	vnode_t a_vp;			vnode of which to set attributes
 *	struct vnode_attr *a_vap;	attributes to set and source
 *	vfs_context_t a_context;
 *
 * Set the attributes described by @a_vap in the vnode @a_vp. Some attributes
 * are intercepted by the VFS in setattrlist() and setvolattrlist() so we do
 * not bother with them.
 *
 * At present we do not support all attributes. We declare what we support to
 * the world in our VFS_GETATTR() function (ntfs_vfsops.c::ntfs_getattr()) so
 * do not forget to update that when support for further attributes is added
 * here.
 *
 * Return 0 on success and errno on error.
 *
 * TODO: Implement more attributes.
*/ static int ntfs_vnop_setattr(struct vnop_setattr_args *a) { ntfs_inode *base_ni, *ni = NTFS_I(a->a_vp); ntfs_volume *vol; struct vnode_attr *va = a->a_vap; errno_t err = 0; BOOL dirty_times = FALSE; if (!ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return EINVAL; } vol = ni->vol; ntfs_debug("Entering for mft_no 0x%llx.", (unsigned long long)ni->mft_no); base_ni = ni; if (NInoAttr(ni)) { base_ni = ni->base_ni; lck_rw_lock_exclusive(&base_ni->lock); } lck_rw_lock_exclusive(&ni->lock); /* Do not allow messing with the inode once it has been deleted. */ if (NInoDeleted(ni)) { /* Remove the inode from the name cache. */ cache_purge(ni->vn); err = ENOENT; goto unl_err; } if (VATTR_IS_ACTIVE(va, va_data_size)) { ntfs_debug("Changing size for mft_no 0x%llx to 0x%llx.", (unsigned long long)ni->mft_no, (unsigned long long)va->va_data_size); #if 1 // TODO: Remove this when sparse support is done... if (NInoSparse(ni)) { err = ENOTSUP; goto unl_err; } #endif /* * Do not allow calling for $MFT/$DATA as it would destroy the * volume. * * Also only allow setting the size of VREG vnodes as that * covers both regular files and named streams whilst excluding * symbolic links for example. */ if (vnode_vtype(ni->vn) != VREG || (!ni->mft_no && !NInoAttr(ni))) err = EPERM; else err = ntfs_attr_resize(ni, va->va_data_size, va->va_vaflags & 0xffff, NULL); if (err) { ntfs_error(vol->mp, "Failed to set inode size (error " "%d).", err); goto unl_err; } VATTR_SET_SUPPORTED(va, va_data_size); } /* * Unlock the attribute inode as we do not need it any more and so we * cannot deadlock with the call to ntfs_inode_afpinfo_write() below. */ if (ni != base_ni) lck_rw_unlock_exclusive(&ni->lock); if (VATTR_IS_ACTIVE(va, va_flags)) { u32 flags = va->va_flags; BOOL dirty_flags = FALSE; /* * Only allow changing of supported flags. 
There are two * exceptions and those are the archived flag and read-only bit * on directories which are not supported on NTFS but we have * to ignore them or too many things break such as "cp -pr" * from a more sensible file system. */ if (flags & ~(SF_ARCHIVED | SF_IMMUTABLE | UF_IMMUTABLE | UF_HIDDEN /* | SF_COMPRESSED */)) { ntfs_error(vol->mp, "Cannot set unsupported flags " "0x%x.", (unsigned)(flags & ~(SF_ARCHIVED | SF_IMMUTABLE | UF_IMMUTABLE | UF_HIDDEN))); err = EINVAL; goto err; } /* * We do not allow modification for any of the core NTFS * system files which we want to remain as they are except that * we silently ignore changes to the root directory. */ if (base_ni->mft_no < FILE_first_user && base_ni != vol->root_ni) { ntfs_error(vol->mp, "Refusing to change flags on core " "NTFS system file (mft_no 0x%llx).", (unsigned long long)base_ni->mft_no); err = EPERM; goto err; } /* * We currently do not support changing the compression state * of a vnode. * * Further, only the base inode may be compressed. */ /* * if (((flags & SF_COMPRESSED) && !NInoCompressed(ni)) || * (!(flags & SF_COMPRESSED) && * NInoCompressed(ni))) { * if (ni != base_ni) { * ntfs_error(vol->mp, "Only regular files and " * "directories may be " * "compressed, aborting."); * err = EINVAL; * goto err; * } * ntfs_warning(vol->mp, "Changing the compression state " * "is not supported at present, " * "returning ENOTSUP."); * err = ENOTSUP; * goto err; * } */ /* * The root directory of a volume always has the hidden bit set * but we pretend that it is not hidden to OS X and we do not * allow this bit to be modified for the root directory. */ if (base_ni != vol->root_ni) { /* * If the Finder info is valid need to update it as * well. Note setting or clearing the hidden flag in * the Finder info does not cause the Finder info to * become dirty as the hidden bit is not stored on disk * in the Finder info. 
*/ if (flags & UF_HIDDEN) { base_ni->file_attributes |= FILE_ATTR_HIDDEN; if (NInoValidFinderInfo(base_ni)) base_ni->finder_info.attrs |= FINDER_ATTR_IS_HIDDEN; } else { base_ni->file_attributes &= ~FILE_ATTR_HIDDEN; if (NInoValidFinderInfo(base_ni)) base_ni->finder_info.attrs &= ~FINDER_ATTR_IS_HIDDEN; } dirty_flags = TRUE; } /* * Windows does not allow users to set/clear the read-only bit * on directories. In fact Windows uses the read-only bit on a * directory to signify that a customized or specialized folder * view is in effect thus we do not allow setting/clearing the * read-only bit on directories from OS X. * * Windows does not set the "needs archiving" bit on * directories. * * The only exception are encrypted directories which do have * the "needs archiving" bit set but we do not want to allow * this bit to be cleared so ignore them, too. */ if (!S_ISDIR(base_ni->mode)) { if (flags & (SF_IMMUTABLE | UF_IMMUTABLE)) base_ni->file_attributes |= FILE_ATTR_READONLY; else base_ni->file_attributes &= ~FILE_ATTR_READONLY; if (flags & SF_ARCHIVED) base_ni->file_attributes &= ~FILE_ATTR_ARCHIVE; else base_ni->file_attributes |= FILE_ATTR_ARCHIVE; dirty_flags = TRUE; } if (dirty_flags) NInoSetDirtyFileAttributes(base_ni); VATTR_SET_SUPPORTED(va, va_flags); } if (VATTR_IS_ACTIVE(va, va_create_time)) { base_ni->creation_time = va->va_create_time; VATTR_SET_SUPPORTED(va, va_create_time); dirty_times = TRUE; } if (VATTR_IS_ACTIVE(va, va_modify_time)) { base_ni->last_data_change_time = va->va_modify_time; VATTR_SET_SUPPORTED(va, va_modify_time); dirty_times = TRUE; /* * The following comment came from the HFS code: * * <quote>The utimes system call can reset the modification * time but it doesn't know about HFS create times. 
So we need * to ensure that the creation time is always at least as old * as the modification time.</quote> * * SMB also follows this behaviour and it also adds the * following comment: * * <quote>The HFS code also checks to make sure it was not the * root vnode. Don Brady said that the SMB code should not use * that part of the check.</quote> * * I assume the root vnode check is there in HFS as it does not * support times on the root vnode at all so the check is * needed for HFS only. * * The same applies for NTFS so follow the HFS/SMB behaviour. * * One salient point is that we only do the above if the * creation time is not being explicitly set already. */ if (!VATTR_IS_ACTIVE(va, va_create_time) && (va->va_modify_time.tv_sec < base_ni->creation_time.tv_sec || (va->va_modify_time.tv_sec == base_ni->creation_time.tv_sec && va->va_modify_time.tv_nsec < base_ni->creation_time.tv_nsec))) base_ni->creation_time = va->va_modify_time; } if (VATTR_IS_ACTIVE(va, va_change_time)) { base_ni->last_mft_change_time = va->va_change_time; VATTR_SET_SUPPORTED(va, va_change_time); dirty_times = TRUE; } if (VATTR_IS_ACTIVE(va, va_access_time)) { base_ni->last_access_time = va->va_access_time; VATTR_SET_SUPPORTED(va, va_access_time); dirty_times = TRUE; } if (dirty_times) NInoSetDirtyTimes(base_ni); if (VATTR_IS_ACTIVE(va, va_backup_time)) { base_ni->backup_time = va->va_backup_time; NInoSetValidBackupTime(base_ni); NInoSetDirtyBackupTime(base_ni); /* * Now write (if needed creating) the AFP_AfpInfo attribute * with the specified backup time. 
*/ err = ntfs_inode_afpinfo_write(base_ni); if (err) { ntfs_error(vol->mp, "Failed to write/create " "AFP_AfpInfo attribute in inode " "0x%llx (error %d).", (unsigned long long)base_ni->mft_no, err); goto err; } VATTR_SET_SUPPORTED(va, va_backup_time); } ntfs_debug("Done."); err: lck_rw_unlock_exclusive(&base_ni->lock); return err; unl_err: if (ni != base_ni) lck_rw_unlock_exclusive(&ni->lock); goto err; } /* Limit the internal i/o size so we can represent it in a 32-bit int. */ #define NTFS_MAX_IO_REQUEST_SIZE (1024 * 1024 * 256) /** * ntfs_vnop_read_compressed - read from a compressed attribute * @ni: ntfs inode describing the compressed attribute to read * @uio: destination in which to return the read data * @data_size: data size of the compressed attribute * @ioflags: flags further describing the read request (see ntfs_vnop_read()) * * This is a helper function for ntfs_vnop_read() (see below). It is called * when a read request for a compressed attribute is received by * ntfs_vnop_read(). * * This function is somewhat similar to cluster_read() or to be more precise to * cluster_read_copy() in that it breaks up large i/os into smaller manageable * chunks, and for each chunk tries to get the data from the vm page cache and * return it in the destination buffer described by @uio and failing that, it * creates and maps a upl and causes it to be filled with data by calling * ntfs_read_compressed() which reads the compressed data via the raw inode and * decompresses it into our mapped upl and once that is done we now have the * data in the vm page cache and copy it into the destination buffer described * by @uio. * * Return 0 on success and errno on error. 
 */
static inline int ntfs_vnop_read_compressed(ntfs_inode *ni, uio_t uio,
		const s64 data_size, int ioflags)
{
	s64 size;
	/*
	 * Bytes still to be read at the start of each outer loop iteration.
	 * Inside the slow path it is reused to remember the size in bytes of
	 * the current upl so the commit/abort code knows how many pages the
	 * upl covers.
	 */
	user_ssize_t start_count;
	off_t ofs;
	vnode_t vn = ni->vn;
	ntfs_inode *raw_ni;
	upl_t upl;
	upl_page_info_t *pl;
	kern_return_t kerr;
	int count, err, align_mask, cur_pg, last_pg;
	int max_upl_size = ubc_upl_maxbufsize();

	ofs = uio_offset(uio);
	start_count = uio_resid(uio);
	ntfs_debug("Entering for compressed file inode 0x%llx, offset 0x%llx, "
			"count 0x%llx, ioflags 0x%x.",
			(unsigned long long)ni->mft_no,
			(unsigned long long)ofs,
			(unsigned long long)start_count, ioflags);
	/*
	 * We can only read from regular files and named streams that are
	 * compressed and non-resident.  We should never be called for anything
	 * else.
	 */
	if (ni->type != AT_DATA || !NInoCompressed(ni) ||
			!NInoNonResident(ni) || NInoEncrypted(ni) ||
			NInoRaw(ni))
		panic("%s(): Called for inappropriate inode.\n", __FUNCTION__);
	/*
	 * Get the raw inode.  We take the inode lock shared to protect against
	 * concurrent writers as the compressed data is invalid whilst a write
	 * is in progress.
	 *
	 * On success the raw inode lock and a reference on the raw vnode are
	 * held until the err label below, where both are released.
	 */
	err = ntfs_raw_inode_get(ni, LCK_RW_TYPE_SHARED, &raw_ni);
	if (err) {
		ntfs_error(ni->vol->mp, "Failed to get raw inode (error %d).",
				err);
		return err;
	}
	if (!NInoRaw(raw_ni))
		panic("%s(): Requested raw inode but got non-raw one.\n",
				__FUNCTION__);
	/* Sanity check: the ubc size of the raw vnode must match its data size. */
	lck_spin_lock(&raw_ni->size_lock);
	size = ubc_getsize(raw_ni->vn);
	if (size != raw_ni->data_size)
		panic("%s(): size != raw_ni->data_size\n", __FUNCTION__);
	lck_spin_unlock(&raw_ni->size_lock);
	/*
	 * If nothing was requested or the request starts at or beyond the end
	 * of the attribute, we do not need to do anything.
	 */
	if (!start_count || ofs >= data_size) {
		err = 0;
		goto err;
	}
	/* Cannot read from a negative offset. */
	if (ofs < 0) {
		err = EINVAL;
		goto err;
	}
	/* Per-vnode no-cache/no-read-ahead settings apply to this i/o, too. */
	if (vnode_isnocache(vn) || vnode_isnocache(raw_ni->vn))
		ioflags |= IO_NOCACHE;
	if (vnode_isnoreadahead(vn) || vnode_isnoreadahead(raw_ni->vn))
		ioflags |= IO_RAOFF;
	/*
	 * Slow path i/o is aligned to whichever is larger, the compression
	 * block size or the page size.
	 */
	align_mask = ni->compression_block_size - 1;
	if (align_mask < PAGE_MASK)
		align_mask = PAGE_MASK;
	/*
	 * Loop until we have finished the whole request or reached the end of
	 * the attribute.
	 *
	 * FIXME: We do not bother with read-ahead on the uncompressed vnode
	 * for now except to the extent that we always decompress full
	 * compression blocks which may be larger than the current i/o request
	 * so the next i/o request will find the whole compression block
	 * decompressed in the vm page cache thus small reads will in effect
	 * experience a certain amount of read-ahead in this way.
	 */
	do {
		u8 *kaddr;
		int delta, next_pg, orig_count;

		/* Clamp this chunk to the end of the attribute. */
		size = data_size - ofs;
		if (size > start_count)
			size = start_count;
		count = size;
		/*
		 * Break up the i/o in chunks that fit into a 32-bit int so
		 * we can call cluster_copy_ubc_data(), etc.
		 */
		if (size > NTFS_MAX_IO_REQUEST_SIZE)
			count = NTFS_MAX_IO_REQUEST_SIZE;
		/*
		 * First of all, try to copy the data from the vm page cache.
		 * This will work on the second and all later reads so this is
		 * the hot path.  If the attribute has not been accessed at all
		 * before or its cached pages were dropped due to vm pressure
		 * this will fail to copy any data due to the lack of a valid
		 * page and we will drop into the slow path.
		 */
		if (!(ioflags & IO_NOCACHE)) {
			err = cluster_copy_ubc_data(vn, uio, &count, 0);
			if (err) {
				/*
				 * The copying (uiomove()) failed with an
				 * error, abort.
				 */
				ntfs_error(ni->vol->mp,
						"cluster_copy_ubc_data() "
						"failed (error %d).", err);
				goto err;
			}
			/*
			 * @count is now set to the number of bytes remaining
			 * to be transferred.  If it is zero, it means all the
			 * pages were in the vm page cache so we can skip onto
			 * the next part of the i/o.
			 *
			 * Note the continue jumps to the condition of the
			 * outer loop, which refreshes @start_count and @ofs
			 * from @uio.
			 */
			if (!count)
				continue;
			ofs = uio_offset(uio);
		}
		/*
		 * Only some or none of the pages were in the vm page cache or
		 * this is not a cached i/o.  First align this i/o request to
		 * compression block boundaries and to PAGE_SIZE boundaries and
		 * truncate it to the maximum upl size then create and map a
		 * page list so we can fill it with the data.
		 *
		 * @delta is the distance from the aligned start of the upl to
		 * the first byte the caller actually wants and @orig_count is
		 * the number of bytes the caller wants from this chunk.
		 */
		delta = ofs & align_mask;
		ofs -= delta;
		orig_count = count;
		count += delta;
		count = (count + align_mask) & ~(off_t)align_mask;
		if (count > max_upl_size)
			count = max_upl_size;
		/*
		 * Do not exceed the attribute size except for a final partial
		 * page.
		 */
		size = (data_size - ofs + PAGE_MASK) & ~PAGE_MASK_64;
		if (count > size)
			count = size;
		/* Remember the upl size for the commit/abort code below. */
		start_count = count;
		kerr = ubc_create_upl(vn, ofs, count, &upl, &pl, UPL_SET_LITE);
		if (kerr != KERN_SUCCESS)
			panic("%s(): Failed to get page list (error %d).\n",
					__FUNCTION__, (int)kerr);
		kerr = ubc_upl_map(upl, (vm_offset_t*)&kaddr);
		if (kerr != KERN_SUCCESS) {
			ntfs_error(ni->vol->mp, "Failed to map page list "
					"(error %d).", (int)kerr);
			err = EIO;
			goto abort_err;
		}
		/*
		 * We know @ofs starts on both a compression block and a page
		 * boundary.  We read from the compressed raw vnode
		 * decompressing the data into our mapped page list.  Any
		 * already valid pages are automatically skipped.
		 */
		err = ntfs_read_compressed(ni, raw_ni, ofs, count, kaddr, pl,
				ioflags);
		if (err) {
			ntfs_error(ni->vol->mp, "Failed to decompress data "
					"(error %d).", err);
			goto unm_err;
		}
		/*
		 * We now have the entire page list filled with valid pages,
		 * thus we can now copy from the mapped page list into the
		 * destination buffer using uiomove().  We just need to make
		 * sure not to copy past the end of the attribute.
		 *
		 * Undo the alignment first so @ofs and @count describe what
		 * the caller asked for rather than what the upl covers.
		 */
		ofs += delta;
		count -= delta;
		if (count > orig_count)
			count = orig_count;
		if (ofs + count > data_size)
			count = data_size - ofs;
		err = uiomove((caddr_t)(kaddr + delta), count, uio);
		if (err) {
			ntfs_error(ni->vol->mp, "uiomove() failed (error %d).",
					err);
			goto unm_err;
		}
		kerr = ubc_upl_unmap(upl);
		if (kerr != KERN_SUCCESS) {
			ntfs_error(ni->vol->mp, "ubc_upl_unmap() failed "
					"(error %d).", (int)kerr);
			err = EIO;
			goto abort_err;
		}
		/*
		 * We are done with the page list, commit and/or abort the
		 * pages.
		 *
		 * The pages are processed in runs [@cur_pg, @next_pg) of
		 * pages that share the same valid/dirty state.
		 */
		next_pg = 0;
		last_pg = start_count >> PAGE_SHIFT;
		do {
			int commit_flags;
			BOOL was_valid, was_dirty;

			cur_pg = next_pg;
			/* Determine the state of the current first page. */
			was_valid = upl_valid_page(pl, cur_pg);
			was_dirty = (was_valid && upl_dirty_page(pl, cur_pg));
			/* Find sequential pages of the same state. */
			for (next_pg = cur_pg + 1; next_pg < last_pg;
					next_pg++) {
				if (was_valid != upl_valid_page(pl, next_pg))
					break;
				if (was_valid) {
					if (was_dirty != upl_dirty_page(pl,
							next_pg))
						break;
				}
			}
			/* Size in bytes of the run of pages just found. */
			count = (next_pg - cur_pg) << PAGE_SHIFT;
			/*
			 * For a set of pages that were invalid and hence we
			 * just filled them with data we commit and clean them
			 * unless no caching is requested in which case we dump
			 * them.
			 *
			 * For a set of pages that were already valid and hence
			 * we did not touch we commit them taking care to
			 * preserve any dirty state unless the pages were clean
			 * and no caching is requested in which case we dump
			 * them.
			 */
			if (ioflags & IO_NOCACHE && !was_dirty) {
				ubc_upl_abort_range(upl, cur_pg << PAGE_SHIFT,
						count, UPL_ABORT_DUMP_PAGES |
						UPL_ABORT_FREE_ON_EMPTY);
				continue;
			}
			commit_flags = UPL_COMMIT_FREE_ON_EMPTY |
					UPL_COMMIT_INACTIVATE;
			if (!was_valid)
				commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
			else if (was_dirty)
				commit_flags |= UPL_COMMIT_SET_DIRTY;
			ubc_upl_commit_range(upl, cur_pg << PAGE_SHIFT, count,
					commit_flags);
		} while (next_pg < last_pg);
		/*
		 * Continue whilst there is more to read and we have not
		 * reached the end of the attribute.  The condition reloads
		 * @start_count and @ofs from @uio.
		 */
	} while ((start_count = uio_resid(uio)) &&
			(ofs = uio_offset(uio)) < data_size);
	ntfs_debug("Done.");
err:
	/* Release the raw inode lock and the reference on the raw vnode. */
	lck_rw_unlock_shared(&raw_ni->lock);
	(void)vnode_put(raw_ni->vn);
	return err;
unm_err:
	/* The upl is still mapped; unmap it before aborting its pages. */
	kerr = ubc_upl_unmap(upl);
	if (kerr != KERN_SUCCESS)
		ntfs_error(ni->vol->mp, "ubc_upl_unmap() failed (error %d).",
				(int)kerr);
abort_err:
	/*
	 * We handle each page independently for simplicity.  We do not care
	 * for performance given this is an error code path.
	 *
	 * For a page that was not valid, we dump it as it still does not
	 * contain valid data.  For a page that was valid, we release it
	 * without modification as we have not touched it unless no caching is
	 * requested and the page was clean in which case we dump it.
	 *
	 * Note @start_count holds the size of the upl at this point.
	 */
	last_pg = start_count >> PAGE_SHIFT;
	for (cur_pg = 0; cur_pg < last_pg; cur_pg++) {
		int abort_flags;

		abort_flags = UPL_ABORT_FREE_ON_EMPTY;
		if (!upl_valid_page(pl, cur_pg) ||
				(ioflags & IO_NOCACHE &&
				!upl_dirty_page(pl, cur_pg)))
			abort_flags |= UPL_ABORT_DUMP_PAGES;
		ubc_upl_abort_range(upl, cur_pg << PAGE_SHIFT, PAGE_SIZE,
				abort_flags);
	}
	goto err;
}

// TODO: Rename to ntfs_inode_read and move to ntfs_inode.[hc]?
/** * ntfs_read - read a number of bytes from an inode into memory * @ni: ntfs inode whose data to read into memory * @uio: destination in which to return the read data * @ioflags: flags further describing the read request * @locked: if true the ntfs inode lock is already taken for reading * * Read uio_resid(@uio) bytes from the ntfs inode @ni, starting at byte offset * uio_offset(@uio) into the inode into the destination buffer pointed to by * @uio. * * The flags in @ioflags further describe the read request. The following * ioflags are currently defined in OS X kernel (a lot of them are not * applicable to VNOP_READ() however): * IO_UNIT - Do i/o as atomic unit. * IO_APPEND - Append write to end. * IO_SYNC - Do i/o synchronously. * IO_NODELOCKED - Underlying node already locked. * IO_NDELAY - FNDELAY flag set in file table. * IO_NOZEROFILL - F_SETSIZE fcntl uses this to prevent zero filling. * IO_TAILZEROFILL - Zero fills at the tail of write. * IO_HEADZEROFILL - Zero fills at the head of write. * IO_NOZEROVALID - Do not zero fill if valid page. * IO_NOZERODIRTY - Do not zero fill if page is dirty. * IO_CLOSE - The i/o was issued from close path. * IO_NOCACHE - Same effect as VNOCACHE_DATA, but only for this i/o. * IO_RAOFF - Same effect as VRAOFF, but only for this i/o. * IO_DEFWRITE - Defer write if vfs.defwrite is set. * IO_PASSIVE - This is background i/o so do not throttle other i/o. * * For encrypted attributes we abort for now as we do not support them yet. * * For non-resident attributes we use cluster_read_ext() which deals with both * normal and multi sector transfer protected attributes and * ntfs_vnop_read_compressed() which deals with compressed attributes. * * For resident attributes we read the data from the vm page cache and if it is * not there we cause the vm page cache to be populated by reading the buffer * at offset 0 in the attribute. * * Return 0 on success and errno on error. 
* * Note it is up to the caller to verify that reading from the inode @ni makes * sense. We cannot do the verification inside ntfs_read() as it is called * from various VNOPs which all have different requirements. For example * VNOP_READLINK(), i.e. ntfs_vnop_readlink(), needs to only allow S_ISLNK() * inodes whilst VNOP_READ(), i.e. ntfs_vnop_read(), needs to not allow * S_ISLNK() but needs to allow S_IFREG() instead but only if it is not a * system file. */ static errno_t ntfs_read(ntfs_inode *ni, uio_t uio, const int ioflags, const BOOL locked) { s64 size; user_ssize_t start_count; off_t ofs; vnode_t vn = ni->vn; ntfs_inode *base_ni; upl_t upl; upl_page_info_array_t pl; u8 *kaddr; int err, count; ofs = uio_offset(uio); start_count = uio_resid(uio); base_ni = ni; if (NInoAttr(ni)) base_ni = ni->base_ni; ntfs_debug("Entering for file inode 0x%llx, offset 0x%llx, count " "0x%llx, ioflags 0x%x, locked is %s.", (unsigned long long)ni->mft_no, (unsigned long long)ofs, (unsigned long long)start_count, ioflags, locked ? "true" : "false"); /* * Protect against changes in initialized_size and thus against * truncation also. */ if (!locked) lck_rw_lock_shared(&ni->lock); /* Do not allow messing with the inode once it has been deleted. */ if (NInoDeleted(ni)) { if (!locked) lck_rw_unlock_shared(&ni->lock); /* Remove the inode from the name cache. */ cache_purge(ni->vn); return ENOENT; } /* * TODO: This check may no longer be necessary now that we lock against * changes in initialized size and thus truncation... Revisit this * issue when the write code has been written and remove the check if * appropriate simply using ubc_getsize(vn); without the size_lock. */ lck_spin_lock(&ni->size_lock); size = ubc_getsize(vn); if (size > ni->data_size) size = ni->data_size; lck_spin_unlock(&ni->size_lock); /* * If nothing was requested or the request starts at or beyond the end * of the attribute, we do not need to do anything. 
*/ if (!start_count || ofs >= size) { err = 0; goto err; } /* Cannot read from a negative offset. */ if (ofs < 0) { err = EINVAL; goto err; } /* TODO: Deny access to encrypted attributes, just like NT4. */ if (NInoEncrypted(ni)) { ntfs_warning(ni->vol->mp, "Denying access to encrypted " "attribute (EACCES)."); err = EACCES; goto err; } if (NInoNonResident(ni)) { int (*callback)(buf_t, void *); if (NInoCompressed(ni) && !NInoRaw(ni)) { err = ntfs_vnop_read_compressed(ni, uio, size, ioflags); if (!err) ntfs_debug("Done (ntfs_vnop_read_compressed()" ")."); else ntfs_error(ni->vol->mp, "Failed (" "ntfs_vnop_read_compressed(), " "error %d).", err); goto err; } callback = NULL; if (NInoMstProtected(ni) || NInoEncrypted(ni)) callback = ntfs_cluster_iodone; err = cluster_read_ext(vn, uio, size, ioflags, callback, NULL); if (!err) ntfs_debug("Done (cluster_read_ext())."); else ntfs_error(ni->vol->mp, "Failed for file inode " "0x%llx, start offset 0x%llx, start " "count 0x%llx, now offset 0x%llx, " "now count 0x%llx, ioflags 0x%x " "(cluster_read_ext(), error %d).", (unsigned long long)ni->mft_no, (unsigned long long)ofs, (unsigned long long)start_count, (unsigned long long)uio_offset(uio), (unsigned long long)uio_resid(uio), ioflags, err); goto err; } /* else if (!NInoNonResident(ni)) */ /* * That attribute is resident thus we have to deal with it by * ourselves. First of all, try to copy the data from the vm page * cache. This will work on the second and all later reads so this is * the hot path. If the attribute has not been accessed at all before * or its cached pages were dropped due to vm pressure this will fail * to copy any data due to the lack of a valid page and we will drop * into the slow path. 
*/ size -= ofs; if (size > start_count) size = start_count; if (size > PAGE_SIZE) { ntfs_warning(ni->vol->mp, "Unexpected count 0x%llx > PAGE_SIZE " "0x%x, overriding it to PAGE_SIZE.", (unsigned long long)size, PAGE_SIZE); size = PAGE_SIZE; } count = size; err = cluster_copy_ubc_data(vn, uio, &count, 0); if (err) { /* The copying (uiomove()) failed with an error, abort. */ ntfs_error(ni->vol->mp, "cluster_copy_ubc_data() failed " "(error %d).", err); goto err; } /* * @count is now set to the number of bytes remaining to be * transferred. If it is zero, it means we are done. Note it is * possible that there is more data requested, i.e. uio_resid(uio) > 0, * but that just means the request goes beyond the end of the * attribute. */ if (!count) { ntfs_debug("Done (resident, cached, returned 0x%llx bytes).", (unsigned long long)size); goto err; } /* * We failed to transfer everything. That really means we failed to * transfer anything at all as we are guaranteed that a resident * attribute is smaller than a page thus either the page is there and * valid and we transfer everything or it is not and we transfer * nothing. */ if (count != size) { ntfs_warning(ni->vol->mp, "Unexpected partial transfer from " "cached page (size 0x%llx, count 0x%x).", (unsigned long long)size, count); ofs = uio_offset(uio); } /* * The page is not in cache or is not valid. We need to bring it into * cache and make it valid so we can then copy the data out. The * easiest way to do this is to just map the page which will take care * of everything for us. We can than uiomove() straight out of the * page into the @uio and then unmap the page again. * * Note this will take the inode lock again but this is ok as in both * cases the lock is taken shared. 
*/ err = ntfs_page_map(ni, 0, &upl, &pl, &kaddr, FALSE); if (err) { ntfs_error(ni->vol->mp, "Failed to map page (error %d).", err); goto err; } err = uiomove((caddr_t)(kaddr + ofs), count, uio); ntfs_page_unmap(ni, upl, pl, FALSE); if (!err) ntfs_debug("Done (resident, not cached, returned 0x%llx " "bytes).", (unsigned long long)size - uio_resid(uio)); else ntfs_error(ni->vol->mp, "uiomove() failed (error %d).", err); err: /* * Update the last_access_time (atime) if something was read and this * is the base ntfs inode or it is a named stream (this is what HFS+ * does, too). * * Skip the update if atime updates are disabled via the noatime mount * option or the volume is read only or this is a symbolic link. * * Also, skip the core system files except for the root directory. */ if (uio_resid(uio) < start_count && !NVolReadOnly(ni->vol) && !(vfs_flags(ni->vol->mp) & MNT_NOATIME) && !S_ISLNK(base_ni->mode) && (ni == base_ni || ni->type == AT_DATA)) { BOOL need_update_time; need_update_time = TRUE; if (ni->vol->major_ver > 1) { if (base_ni->mft_no <= FILE_Extend && base_ni != ni->vol->root_ni) need_update_time = FALSE; } else { if (base_ni->mft_no <= FILE_UpCase && base_ni != ni->vol->root_ni) need_update_time = FALSE; } if (need_update_time) { base_ni->last_access_time = ntfs_utc_current_time(); NInoSetDirtyTimes(base_ni); } } if (!locked) lck_rw_unlock_shared(&ni->lock); return err; } /** * ntfs_vnop_read - read a number of bytes from a file into memory * @a: arguments to read function * * @a contains: * vnode_t a_vp; vnode of file whose data to read into memory * uio_t a_uio; destination in which to return the read data * int a_ioflag; flags further describing the read request * vfs_context_t a_context; * * Read uio_resid(@a->a_uio) bytes from the vnode @a-a_vp, starting at byte * offset uio_offset(@a->a_uio) into the vnode into the destination buffer * pointed to by @uio. * * The flags in @a->a_ioflag further describe the read request. 
The following * ioflags are currently defined in OS X kernel (a lot of them are not * applicable to VNOP_READ() however): * IO_UNIT - Do i/o as atomic unit. * IO_APPEND - Append write to end. * IO_SYNC - Do i/o synchronously. * IO_NODELOCKED - Underlying node already locked. * IO_NDELAY - FNDELAY flag set in file table. * IO_NOZEROFILL - F_SETSIZE fcntl uses this to prevent zero filling. * IO_TAILZEROFILL - Zero fills at the tail of write. * IO_HEADZEROFILL - Zero fills at the head of write. * IO_NOZEROVALID - Do not zero fill if valid page. * IO_NOZERODIRTY - Do not zero fill if page is dirty. * IO_CLOSE - The i/o was issued from close path. * IO_NOCACHE - Same effect as VNOCACHE_DATA, but only for this i/o. * IO_RAOFF - Same effect as VRAOFF, but only for this i/o. * IO_DEFWRITE - Defer write if vfs.defwrite is set. * IO_PASSIVE - This is background i/o so do not throttle other i/o. * * For encrypted attributes we abort for now as we do not support them yet. * * For non-resident attributes we use cluster_read_ext() which deals with both * normal and multi sector transfer protected attributes and * ntfs_vnop_read_compressed() which deals with compressed attributes. * * For resident attributes we read the data from the vm page cache and if it is * not there we cause the vm page cache to be populated by reading the buffer * at offset 0 in the attribute. * * Return 0 on success and errno on error. */ static int ntfs_vnop_read(struct vnop_read_args *a) { vnode_t vn = a->a_vp; ntfs_inode *ni = NTFS_I(vn); if (!ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return EINVAL; } /* * We can only read from regular files and named streams. * * Also, do not allow reading from system files or mst protected * attributes. 
*/ if (vnode_issystem(vn) || NInoMstProtected(ni) || (!S_ISREG(ni->mode) && !(NInoAttr(ni) && ni->type == AT_DATA))) { if (S_ISDIR(ni->mode)) return EISDIR; return EPERM; } return (int)ntfs_read(ni, a->a_uio, a->a_ioflag, FALSE); } // TODO: Rename to ntfs_inode_write and move to ntfs_inode.[hc]? /** * ntfs_write - write a number of bytes from a memory buffer into a file * @ni: ntfs inode to write to * @uio: source containing the data to write * @ioflags: flags further describing the write request * @write_locked: if true the ntfs inode lock is already taken for writing * * Write uio_resid(@uio) bytes from the source buffer specified by @uio to the * ntfs inode @ni, starting at byte offset uio_offset(@uio) into the inode. * * The flags in @ioflags further describe the write request. The following * ioflags are currently defined in OS X kernel (not all of them are applicable * to VNOP_WRITE() however): * IO_UNIT - Do i/o as atomic unit. * IO_APPEND - Append write to end. * IO_SYNC - Do i/o synchronously. * IO_NODELOCKED - Underlying node already locked. * IO_NDELAY - FNDELAY flag set in file table. * IO_NOZEROFILL - F_SETSIZE fcntl uses this to prevent zero filling. * IO_TAILZEROFILL - Zero fills at the tail of write. * IO_HEADZEROFILL - Zero fills at the head of write. * IO_NOZEROVALID - Do not zero fill if valid page. * IO_NOZERODIRTY - Do not zero fill if page is dirty. * IO_CLOSE - The i/o was issued from close path. * IO_NOCACHE - Same effect as VNOCACHE_DATA, but only for this i/o. * IO_RAOFF - Same effect as VRAOFF, but only for this i/o. * IO_DEFWRITE - Defer write if vfs.defwrite is set. * IO_PASSIVE - This is background i/o so do not throttle other i/o. * * For compressed and encrypted attributes we abort for now as we do not * support them yet. * * For non-resident attributes we use cluster_write_ext() which deals with * normal attributes. * * Return 0 on success and errno on error. 
* * Note it is up to the caller to verify that writing to the inode @ni makes * sense. We cannot do the verification inside ntfs_write() as it is called * from various VNOPs which all have different requirements. For example * VNOP_SYMLINK(), i.e. ntfs_vnop_symlink(), needs to write to S_ISLNK() inodes * whilst VNOP_WRITE(), i.e. ntfs_vnop_write(), needs to not allow S_ISLNK() * but needs to allow S_IFREG() instead but only if it is not a system file. */ static errno_t ntfs_write(ntfs_inode *ni, uio_t uio, int ioflags, BOOL write_locked) { s64 old_size, size, end, nr_truncated; user_ssize_t old_count, count; off_t old_ofs, ofs; vnode_t vn = ni->vn; ntfs_inode *base_ni; upl_t upl; upl_page_info_array_t pl; u8 *kaddr; int cnt; errno_t err; BOOL was_locked, need_uptodate; /* Do not allow writing if mounted read-only. */ if (NVolReadOnly(ni->vol)) return EROFS; nr_truncated = 0; ofs = old_ofs = uio_offset(uio); count = old_count = uio_resid(uio); ntfs_debug("Entering for file inode 0x%llx, offset 0x%llx, count " "0x%llx, ioflags 0x%x, write_locked is %s.", (unsigned long long)ni->mft_no, (unsigned long long)ofs, (unsigned long long)count, ioflags, write_locked ? "true" : "false"); /* If nothing to do return success. */ if (!count) return 0; /* Cannot write to a negative offset. */ if (ofs < 0) return EINVAL; /* TODO: Deny access to encrypted attributes, just like NT4. */ if (NInoEncrypted(ni)) { ntfs_warning(ni->vol->mp, "Denying write to encrypted " "attribute (EACCES)."); return EACCES; } /* TODO: We do not support writing to compressed files. */ if (NInoCompressed(ni)) { ntfs_error(ni->vol->mp, "Writing to compressed files is not " "implemented yet. Sorry."); return ENOTSUP; } #if 1 // TODO: Remove this when sparse support is done... if (NInoSparse(ni)) return ENOTSUP; #endif base_ni = ni; if (NInoAttr(ni)) base_ni = ni->base_ni; /* The first byte after the write. 
*/ end = ofs + count; /* * If we are going to extend the initialized size take the inode lock * for writing and take it for reading otherwise. * * Appending will always cause the initialized size to be extended thus * always take the lock for writing. * * Writing into holes requires us to take the lock for writing thus if * this is a sparse file take the lock for writing just in case. */ was_locked = write_locked; if (ioflags & IO_APPEND) { if (!was_locked) { lck_rw_lock_exclusive(&ni->lock); write_locked = TRUE; } /* * Do not allow messing with the inode once it has been * deleted. */ if (NInoDeleted(ni)) { if (!was_locked) lck_rw_unlock_exclusive(&ni->lock); /* Remove the inode from the name cache. */ cache_purge(ni->vn); return ENOENT; } lck_spin_lock(&ni->size_lock); ofs = ni->data_size; lck_spin_unlock(&ni->size_lock); uio_setoffset(uio, ofs); ntfs_debug("Write to mft_no 0x%llx, IO_APPEND flag is set, " "setting uio_offset() to file size 0x%llx.", (unsigned long long)ni->mft_no, (unsigned long long)ofs); /* Update the first byte after the write with the new offset. */ end = ofs + count; } else { if (!was_locked) { if (NInoSparse(ni)) { lck_rw_lock_exclusive(&ni->lock); write_locked = TRUE; } else { lck_rw_lock_shared(&ni->lock); write_locked = FALSE; } } recheck_deleted: /* * Do not allow messing with the inode once it has been * deleted. */ if (NInoDeleted(ni)) { if (!was_locked) { if (write_locked) lck_rw_unlock_exclusive(&ni->lock); else lck_rw_unlock_shared(&ni->lock); } /* Remove the inode from the name cache. */ cache_purge(ni->vn); return ENOENT; } lck_spin_lock(&ni->size_lock); size = ni->initialized_size; lck_spin_unlock(&ni->size_lock); if (!write_locked && end > size) { /* If we fail to convert the lock, take it. */ if (!lck_rw_lock_shared_to_exclusive(&ni->lock)) lck_rw_lock_exclusive(&ni->lock); write_locked = TRUE; goto recheck_deleted; } ntfs_debug("Mft_no 0x%llx, inode lock taken for %s.", (unsigned long long)ni->mft_no, write_locked ? 
"writing" : "reading"); } /* * We do not want any form of zero filling to happen at the starting * offset of the write as we sort this out ourselves. * * Further, we never want to zero fill at the end of the write as this * is pointless. We automatically get zero filling at the end of the * page when a page is read in and when the initialized size is * extended. */ ioflags &= ~(IO_HEADZEROFILL | IO_TAILZEROFILL); /* * We do not want to zero any valid/dirty pages as they could already * have new data written via mmap() for example and we do not want to * lose that. */ ioflags |= IO_NOZEROVALID | IO_NOZERODIRTY; lck_spin_lock(&ni->size_lock); old_size = ni->data_size; size = ni->allocated_size; lck_spin_unlock(&ni->size_lock); /* * If this is a sparse attribute and the write overlaps the existing * allocated size we need to fill any holes overlapping the write. We * can skip resident attributes as they cannot have sparse regions. * * As allocated size goes in units of clusters we need to round down * the start offset to the nearest cluster boundary and we need to * round up the end offset to the next cluster boundary. */ if (NInoSparse(ni) && NInoNonResident(ni) && (ofs & ~ni->vol->cluster_size_mask) < size) { s64 aligned_end, new_end; if (!write_locked) panic("%s(): !write_locked\n", __FUNCTION__); aligned_end = (end + ni->vol->cluster_size_mask) & ~ni->vol->cluster_size_mask; /* * Only need to instantiate holes up to the allocated size * itself. Everything else is an extension and will be dealt * with by ntfs_attr_extend_allocation() below. 
*/ if (aligned_end > size) aligned_end = size; err = ntfs_attr_instantiate_holes(ni, ofs & ~ni->vol->cluster_size_mask, aligned_end, &new_end, ioflags & IO_UNIT); if (err) { ntfs_error(ni->vol->mp, "Cannot perform write to " "mft_no 0x%llx because instantiation " "of sparse regions failed (error %d).", (unsigned long long)ni->mft_no, err); uio_setoffset(uio, old_ofs); uio_setresid(uio, old_count); if (!was_locked) lck_rw_unlock_exclusive(&ni->lock); return err; } /* If the instantiation was partial, truncate the write. */ if (new_end < aligned_end) { s64 new_count; if (ioflags & IO_UNIT) panic("%s(): new_end < aligned_end && " "ioflags & IO_UNIT\n", __FUNCTION__); ntfs_debug("Truncating write to mft_no 0x%llx because " "instantiation of sparse regions was " "only partially completed.", (unsigned long long)ni->mft_no); if (new_end > end) panic("%s(): new_end > end\n", __FUNCTION__); end = new_end; new_count = new_end - ofs; if (new_count >= count) panic("%s(): new_count >= count\n", __FUNCTION__); nr_truncated += count - new_count; count = new_count; uio_setresid(uio, new_count); } } /* * If the write goes beyond the allocated size, extend the allocation * to cover the whole of the write, rounded up to the nearest cluster. */ if (end > size) { if (!write_locked) panic("%s(): !write_locked\n", __FUNCTION__); /* Extend the allocation without changing the data size. */ err = ntfs_attr_extend_allocation(ni, end, -1, ofs, NULL, &size, ioflags & IO_UNIT); if (!err) { if (ofs >= size) panic("%s(): ofs >= size\n", __FUNCTION__); /* If the extension was partial truncate the write. 
*/ if (end > size) { s64 new_count; if (ioflags & IO_UNIT) panic("%s(): end > size && " "ioflags & IO_UNIT\n", __FUNCTION__); ntfs_debug("Truncating write to mft_no 0x%llx " "because the allocation was " "only partially extended.", (unsigned long long)ni->mft_no); end = size; new_count = size - ofs; if (new_count >= count) panic("%s(): new_count >= count\n", __FUNCTION__); nr_truncated += count - new_count; count = new_count; uio_setresid(uio, new_count); } } else /* if (err) */ { lck_spin_lock(&ni->size_lock); size = ni->allocated_size; lck_spin_unlock(&ni->size_lock); /* Perform a partial write if possible or fail. */ if (ofs < size && !(ioflags & IO_UNIT)) { s64 new_count; ntfs_debug("Truncating write to mft_no 0x%llx " "because extending the " "allocation failed (error %d).", (unsigned long long)ni->mft_no, err); end = size; new_count = size - ofs; if (new_count >= count) panic("%s(): new_count >= count\n", __FUNCTION__); nr_truncated += count - new_count; count = new_count; uio_setresid(uio, new_count); } else { ntfs_error(ni->vol->mp, "Cannot perform write " "to mft_no 0x%llx because " "extending the allocation " "failed (error %d).", (unsigned long long)ni->mft_no, err); goto abort; } } } /* * If the write starts beyond the initialized size, extend it up to the * beginning of the write and initialize all non-sparse space between * the old initialized size and the new one. This automatically also * increments the data size as well as the ubc size to keep it above or * equal to the initialized size. 
*/ lck_spin_lock(&ni->size_lock); size = ni->initialized_size; lck_spin_unlock(&ni->size_lock); if (ofs > size) { if (!write_locked) panic("%s(): !write_locked 2\n", __FUNCTION__); err = ntfs_attr_extend_initialized(ni, ofs); if (err) { ntfs_error(ni->vol->mp, "Cannot perform write to " "mft_no 0x%llx because extending the " "initialized size failed (error %d).", (unsigned long long)ni->mft_no, err); goto abort; } size = ofs; } if (NInoNonResident(ni)) { int (*callback)(buf_t, void *); if (NInoCompressed(ni) && !NInoRaw(ni)) { #if 0 err = ntfs_vnop_write_compressed(ni, uio, size, ioflags); if (!err) ntfs_debug("Done (ntfs_vnop_write_compressed()" ")."); else ntfs_error(ni->vol->mp, "Failed (" "ntfs_vnop_write_compressed(), " "error %d).", err); #endif /* * TODO: At present we should never get here for * compressed files as this case is aborted at the * start of the function. */ panic("%s(): NInoCompressed(ni) && !NInoRaw(ni)\n", __FUNCTION__); } callback = NULL; if (NInoEncrypted(ni)) { callback = ntfs_cluster_iodone; /* * TODO: At present we should never get here for * encrypted files as this case is aborted at the start * of the function. */ panic("%s(): NInoEncrypted(ni)\n", __FUNCTION__); } /* Determine the new file size. */ size = ubc_getsize(vn); if (end > size) size = end; /* * Note the first size is the original file size and the second * file size is the new file size when the write is complete. */ err = cluster_write_ext(vn, uio, ubc_getsize(vn), size, 0, 0, ioflags, callback, NULL); if (err) { /* * There was an error. We do not know where. Ensure * everything is set up as if the write never happened. */ ntfs_error(ni->vol->mp, "Failed (cluster_write_ext(), " "error %d).", err); goto abort; } goto done; } /* * The attribute is resident thus we have to deal with it by ourselves. * First of all, try to copy the data to the vm page cache. This will * work on the second and all later writes so this is the hot path. 
If * the attribute has not been accessed at all before or its cached * pages were dropped due to vm pressure this will fail to copy any * data due to the lack of a valid page and we will drop into the slow * path. */ if (ofs > PAGE_SIZE) panic("%s(): ofs > PAGE_SIZE\n", __FUNCTION__); cnt = (int)count; if (count > PAGE_SIZE - ofs) { cnt = PAGE_SIZE - ofs; ntfs_warning(ni->vol->mp, "Unexpected count (0x%llx) > " "PAGE_SIZE - ofs (0x%x), overriding it to " "PAGE_SIZE - ofs.", (unsigned long long)count, cnt); } /* * Note we pass mark_dirty = 1 (the last parameter) which means the * pages that are written to will be marked dirty. */ err = cluster_copy_ubc_data(vn, uio, &cnt, 1); if (err) { /* * The copying (uiomove()) failed with an error. Ensure * everything is set up as if the write never happened. */ ntfs_error(ni->vol->mp, "cluster_copy_ubc_data() failed " "(error %d).", err); goto abort; } /* * @cnt is now set to the number of bytes remaining to be transferred. * If it is zero, it means we are done. */ if (!cnt) goto done; /* * We failed to transfer everything. That really means we failed to * transfer anything at all as we are guaranteed that a resident * attribute is smaller than a page thus either the page is there and * valid and we transfer everything or it is not and we transfer * nothing. */ if (cnt != count) { ntfs_warning(ni->vol->mp, "Unexpected partial transfer to " "cached page (count 0x%llx, cnt 0x%x).", (unsigned long long)count, cnt); /* Ensure everything is as it was before. */ uio_setoffset(uio, old_ofs); uio_setresid(uio, old_count - nr_truncated); } /* * The page is not in cache or is not valid. We need to bring it into * cache and make it valid so we can then copy the data in. The * easiest way to do this is to just map the page which will take care * of everything for us. We can then uiomove() straight into the page * from the @uio and then mark the page dirty and unmap it again. 
* * As an optimization, if the write covers the whole existing attribute * we grab the page without bringing it uptodate if it is not valid * already thus saving a pagein from disk. */ need_uptodate = (ofs || end < size); err = ntfs_page_map_ext(ni, 0, &upl, &pl, &kaddr, need_uptodate, TRUE); if (err) { ntfs_error(ni->vol->mp, "Failed to map page (error %d).", err); goto abort; } err = uiomove((caddr_t)(kaddr + ofs), cnt, uio); if (err) { /* * If we just caused the page to exist and did not bring it * up-to-date or caching is disabled on the vnode or for this * i/o, dump the page. Otherwise release it back to the VM. */ if (upl_valid_page(pl, 0) || (need_uptodate && !vnode_isnocache(vn) && !(ioflags & IO_NOCACHE))) ntfs_page_unmap(ni, upl, pl, FALSE); else ntfs_page_dump(ni, upl, pl); /* * The copying (uiomove()) failed with an error. Ensure * everything is set up as if the write never happened. */ ntfs_error(ni->vol->mp, "uiomove() failed (error %d).", err); goto abort; } /* * If the page is not uptodate and we did not bring it up-to-date when * mapping it, zero the remainder of the page now thus bringing it * up-to-date. */ if (!need_uptodate && !upl_valid_page(pl, 0)) { const off_t cur_ofs = uio_offset(uio); if (cur_ofs > PAGE_SIZE) panic("%s(): cur_ofs > PAGE_SIZE\n", __FUNCTION__); bzero(kaddr + cur_ofs, PAGE_SIZE - cur_ofs); } /* * Unmap the page marking it dirty. * * Note we leave the page cached even if no caching is requested for * simplicity. That way we do not need to touch the mft record at all * and can instead rely on the next sync to propagate the dirty data * from the page into the mft record and then to disk. In the sync i/o * case we will call ntfs_inode_sync() at the end of this function. */ ntfs_page_unmap(ni, upl, pl, TRUE); done: /* * If the write went past the end of the initialized size update it * both in the ntfs inode and in the base attribute record. 
* * Also update the data size and the ubc size if the write went past * the end of the data size. Note this is automatically done by * ntfs_attr_set_initialized_size() so we do not need to do it here. */ size = uio_offset(uio); lck_spin_lock(&ni->size_lock); if (size > ni->initialized_size) { lck_spin_unlock(&ni->size_lock); if (!write_locked) panic("%s(): !write_locked 3\n", __FUNCTION__); err = ntfs_attr_set_initialized_size(ni, size); if (err) { ntfs_error(ni->vol->mp, "Failed to update the " "initialized size of mft_no 0x%llx " "(error %d).", (unsigned long long)ni->mft_no, err); /* * If the write was meant to be atomic, the write * started beyond the end of the initialized size, or * nothing was written ensure everything is set up as * if the write never happened. */ lck_spin_lock(&ni->size_lock); size = ni->initialized_size; lck_spin_unlock(&ni->size_lock); if (ioflags & IO_UNIT || old_ofs >= size || uio_resid(uio) >= old_count) goto abort; /* * Something was written before the initialized size * thus turn the error into a partial, successful write * up to the initialized size. */ uio_setoffset(uio, size); uio_setresid(uio, size - old_ofs); err = 0; } } else lck_spin_unlock(&ni->size_lock); // TODO: If we wrote anything at all we have to clear the S_ISUID and // S_ISGID bits in the file mode as a precaution against tampering // (see xnu/bsd/hfs/hfs_readwrite.c::hfs_vnop_write()). /* * Update the last_data_change_time (mtime) and last_mft_change_time * (ctime) on the base ntfs inode @base_ni unless this is an attribute * inode update in which case only update the ctime as named stream/ * extended attribute semantics expect on OS X. */ base_ni->last_mft_change_time = ntfs_utc_current_time(); if (ni == base_ni) base_ni->last_data_change_time = base_ni->last_mft_change_time; NInoSetDirtyTimes(base_ni); /* * If this is not a directory or it is an encrypted directory, set the * needs archiving bit except for the core system files. 
*/ if (!S_ISDIR(base_ni->mode) || NInoEncrypted(base_ni)) { BOOL need_set_archive_bit = TRUE; if (ni->vol->major_ver >= 2) { if (ni->mft_no <= FILE_Extend) need_set_archive_bit = FALSE; } else { if (ni->mft_no <= FILE_UpCase) need_set_archive_bit = FALSE; } if (need_set_archive_bit) { base_ni->file_attributes |= FILE_ATTR_ARCHIVE; NInoSetDirtyFileAttributes(base_ni); } } /* * If we truncated the write add back the number of truncated bytes to * the number of bytes remaining. */ if (nr_truncated > 0) { if (ioflags & IO_UNIT) panic("%s(): ioflags & IO_UNIT\n", __FUNCTION__); uio_setresid(uio, uio_resid(uio) + nr_truncated); } /* * If the write was partial we need to trim off any extra allocated * space by truncating the attribute to its old size. We can only have * extended the allocation if we hold the inode lock for writing so do * not bother going through this code if we only hold the lock for * reading. * * There is one exception and that is that if the write was meant to be * atomic a partial write is not acceptable thus we need to abort the * write completely in this case. */ size = uio_resid(uio); if (write_locked && size > nr_truncated) { s64 truncate_size; errno_t err2; int rflags; /* * If the write was meant to be atomic or nothing was written * reset everything as if the write never happened thus * releasing any extra space we may have allocated. */ if (ioflags & IO_UNIT || size >= old_count) { if (size > old_count) panic("%s(): size > old_count\n", __FUNCTION__); abort: uio_setoffset(uio, old_ofs); uio_setresid(uio, old_count); if (!write_locked) { if (!err) panic("%s(): !err\n", __FUNCTION__); goto skip_truncate; } truncate_size = old_size; } else /* if (uio_resid(uio) < old_count) */ { /* * At least something was written. Truncate the * attribute to the successfully written size thus * releasing any extra space we allocated but ensure we * do not truncate to less than the old size. 
*/ truncate_size = uio_offset(uio); if (truncate_size < old_size) truncate_size = old_size; } /* * Truncate the attribute to @truncate_size. * * The truncate must be complete or no need to bother at all so * set the IO_UNIT flag. Also remove unwanted flags. */ rflags = (ioflags | IO_UNIT) & ~(IO_APPEND | IO_SYNC | IO_NOZEROFILL); err2 = ntfs_attr_resize(ni, truncate_size, rflags, NULL); if (err2) { BOOL is_dirty; /* * If no other error has occured failing the truncate * will at worst mean that we have too much allocated * space which is not a disaster so carry on in this * case. * * If another error has occured any of a number of * things can now be wrong and in particular if the * data size is not equal to @truncate_size this is * very bad news so mark the volume dirty and warn the * user about it. */ is_dirty = (err); if (is_dirty) { lck_spin_lock(&ni->size_lock); if (truncate_size == ni->data_size) is_dirty = FALSE; lck_spin_unlock(&ni->size_lock); } ntfs_error(ni->vol->mp, "Truncate failed (error %d).%s", err2, is_dirty ? " Leaving " "inconsistent data on disk. Unmount " "and run chkdsk." : ""); if (is_dirty) NVolSetErrors(ni->vol); } } skip_truncate: if (!was_locked) { if (!write_locked) lck_rw_unlock_shared(&ni->lock); else lck_rw_unlock_exclusive(&ni->lock); /* * If the write was successful and synchronous i/o was * requested, sync all changes to the backing store. We * dropped the inode lock already to be able to call * ntfs_inode_sync() thus if it fails we cannot do anything * about it so we just return the error even though the * operation has otherwise been performed. * * Note we cannot do this if the inode was already locked or * the call to ntfs_inode_sync() would cause a deadlock. */ if (!err && ioflags & IO_SYNC) { /* Mask out undersired @ioflags. 
*/ ioflags &= ~(IO_UNIT | IO_APPEND | IO_DEFWRITE); err = ntfs_inode_sync(ni, ioflags, FALSE); } } return err; } /** * ntfs_vnop_write - write a number of bytes from a memory buffer into a file * @a: arguments to write function * * @a contains: * vnode_t a_vp; vnode of file to write to * uio_t a_uio; source containing the data to write * int a_ioflag; flags further describing the write request * vfs_context_t a_context; * * Write uio_resid(@a->a_uio) bytes from the source buffer specified by * @a->a_uio to the vnode @a-a_vp, starting at byte offset * uio_offset(@a->a_uio) into the vnode. * * The flags in @a->a_ioflag further describe the write request. The following * ioflags are currently defined in OS X kernel (not all of them are applicable * to VNOP_WRITE() however): * IO_UNIT - Do i/o as atomic unit. * IO_APPEND - Append write to end. * IO_SYNC - Do i/o synchronously. * IO_NODELOCKED - Underlying node already locked. * IO_NDELAY - FNDELAY flag set in file table. * IO_NOZEROFILL - F_SETSIZE fcntl uses this to prevent zero filling. * IO_TAILZEROFILL - Zero fills at the tail of write. * IO_HEADZEROFILL - Zero fills at the head of write. * IO_NOZEROVALID - Do not zero fill if valid page. * IO_NOZERODIRTY - Do not zero fill if page is dirty. * IO_CLOSE - The i/o was issued from close path. * IO_NOCACHE - Same effect as VNOCACHE_DATA, but only for this i/o. * IO_RAOFF - Same effect as VRAOFF, but only for this i/o. * IO_DEFWRITE - Defer write if vfs.defwrite is set. * IO_PASSIVE - This is background i/o so do not throttle other i/o. * * For compressed and encrypted attributes we abort for now as we do not * support them yet. * * For non-resident attributes we use cluster_write_ext() which deals with * normal attributes. * * Return 0 on success and errno on error. 
*/ static int ntfs_vnop_write(struct vnop_write_args *a) { vnode_t vn = a->a_vp; ntfs_inode *ni = NTFS_I(vn); if (!ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return EINVAL; } /* * We can only write to regular files and named streams. * * Also, do not allow writing to system files and mst protected * attributes. */ if (vnode_issystem(vn) || NInoMstProtected(ni) || (!S_ISREG(ni->mode) && !(NInoAttr(ni) && ni->type == AT_DATA))) { if (S_ISDIR(ni->mode)) return EISDIR; return EPERM; } return (int)ntfs_write(ni, a->a_uio, a->a_ioflag, FALSE); } /** * ntfs_vnop_ioctl - * */ static int ntfs_vnop_ioctl(struct vnop_ioctl_args *a) { errno_t err; ntfs_debug("Entering."); // TODO: err = ENOTSUP; ntfs_debug("Done (error %d).", (int)err); return err; } /** * ntfs_vnop_select - * */ static int ntfs_vnop_select(struct vnop_select_args *a) { errno_t err; ntfs_debug("Entering."); // TODO: err = ENOTSUP; ntfs_debug("Done (error %d).", (int)err); return err; } /** * ntfs_vnop_exchange - * */ static int ntfs_vnop_exchange(struct vnop_exchange_args *a) { errno_t err; ntfs_debug("Entering."); // TODO: err = ENOTSUP; ntfs_debug("Done (error %d).", (int)err); return err; } /** * ntfs_vnop_mmap - map a file (vnode) into memory * @a: arguments to mmap function * * @a contains: * vnode_t a_vp; file vnode which to map into memory * int a_fflags; mapping flags for the vnode * vfs_context_t a_context; * * Map the file vnode @a->a_vp into memory applying the mapping flags * @a->a_fflags which are a combination of one or more of PROT_READ, * PROT_WRITE, and PROT_EXEC. * * VNOP_MMAP() and hence ntfs_vnop_mmap() gets called from ubc_map() which in * turn gets called from the mmap() system call when a file is being mapped * into memory. * * The mmap() system call does the necessary permission checking and in fact * ignores the return value from ubc_map() and relies on things not working * later on for error handling. 
* * ubc_map() on the other hand does look at the return value of VNOP_MMAP() but * it only cares for one error code and that is EPERM. All other errors are * ignored and not passed to its caller. Thus for any return value not equal * to EPERM, ubc_map() takes an extra reference on the vnode and sets the flags * UI_ISMAPPED and UI_WASMAPPED in the ubc info of the vnode and for EPERM it * does not do anything and just returns EPERM to the caller. * * In effect neither class of return value (EPERM or not EPERM) actually has * any effect at all so we do not bother doing any checking here and defer all * checks to VNOP_PAGEIN() and hence ntfs_vnop_pagein(). * * FIXME: This is a huge problem because it means that anyone can use mmap() on * a system file and then write rubbish into the mapped memory and then trash * the metadata in the mapped memory by calling msync() to write the rubbish * out into the system file on disk! This will need to be fixed in the kernel * I think, i.e. the mmap() system call must fail if VNOP_MMAP() fails. This * is because we have no way to tell who is causing a page{in,out} at * ntfs_vnop_page{in,out}() time and for what reason so we have to always * permit page{in,out} to be called. * * Return 0 on success and EPERM on error. */ static int ntfs_vnop_mmap(struct vnop_mmap_args *a) { #ifdef DEBUG ntfs_inode *ni = NTFS_I(a->a_vp); if (ni) ntfs_debug("Mapping mft_no 0x%llx, type 0x%x, name_len 0x%x, " "mapping flags 0x%x.", (unsigned long long)ni->mft_no, le32_to_cpu(ni->type), (unsigned)ni->name_len, a->a_fflags); #endif /* Nothing to do. */ return 0; } /** * ntfs_vnop_mnomap - unmap a file (vnode) from memory * @a: arguments to mnomap function * * @a contains: * vnode_t a_vp; file vnode which to unmap from memory * vfs_context_t a_context; * * Remove the memory mapping of the file vnode @a->a_vp that was previously * established via ntfs_vnop_mmap(). 
* * VNOP_MNOMAP() and hence ntfs_vnop_mnomap() gets called from ubc_unmap() when * a file is being unmapped from memory via the munmap() system call. * * ubc_unmap() only calls VNOP_MNOMAP() if the previous VNOP_MMAP() call did * not return EPERM. * * ubc_unmap() completely ignores the return value from VNOP_MNOMAP(). * * Always return 0 as the return value is always ignored. */ static int ntfs_vnop_mnomap(struct vnop_mnomap_args *a) { #ifdef DEBUG ntfs_inode *ni = NTFS_I(a->a_vp); if (ni) ntfs_debug("Unmapping mft_no 0x%llx, type 0x%x, name_len " "0x%x.", (unsigned long long)ni->mft_no, le32_to_cpu(ni->type), (unsigned)ni->name_len); #endif /* Nothing to do. */ return 0; } /** * ntfs_vnop_fsync - synchronize a vnode's in-core state with that on disk * @a: arguments to fsync function * * @a contains: * vnode_t a_vp; vnode which to sync * int a_waitfor; if MNT_WAIT wait for i/o to complete * vfs_context_t a_context; * * Write all dirty cached data belonging/related to the vnode @a->a_vp to disk. * * If @a->a_waitfor is MNT_WAIT, wait for all i/o to complete before returning. * * Note: When called from reclaim, the vnode has a zero v_iocount and * v_usecount and vnode_isrecycled() is true. * * Return 0 on success and the error code on error. */ static int ntfs_vnop_fsync(struct vnop_fsync_args *a) { vnode_t vn = a->a_vp; ntfs_inode *ni = NTFS_I(vn); int sync, err; if (!ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return 0; } /* If we are mounted read-only, we do not need to sync anything. */ if (NVolReadOnly(ni->vol)) return 0; sync = (a->a_waitfor == MNT_WAIT) ? IO_SYNC : 0; ntfs_debug("Entering for inode 0x%llx, waitfor 0x%x, %ssync i/o.", (unsigned long long)ni->mft_no, a->a_waitfor, (sync == IO_SYNC) ? "a" : ""); /* * We need to allow ENOENT errors since the unlink system call can call * VNOP_FSYNC() during vclean(). 
*/ err = ntfs_inode_sync(ni, sync, FALSE); if (err == ENOENT) err = 0; ntfs_debug("Done (error %d).", err); return err; } /** * ntfs_unlink_internal - unlink and ntfs inode from its parent directory * @dir_ni: directory ntfs inode from which to unlink the ntfs inode * @ni: base ntfs inode to unlink * @name: Unicode name of the inode to unlink * @name_len: length of the name in Unicode characters * @name_type: Namespace the name is in (i.e. FILENAME_{DOS,WIN32,POSIX,etc}) * @is_rename: if true ntfs_unlink_internal() is called for a rename operation * * Unlink an inode with the ntfs inode @ni and name @name with length @name_len * Unicode characters and of namespace @name_type from the directory with ntfs * inode @dir_ni. * * If @is_rename is true the caller was ntfs_vnop_rename() in which case the * link count of the inode to unlink @ni will be one higher than the link count * in the mft record. * * Return 0 on success and the error code on error. * * Note that if the name of the inode to be removed is in the WIN32 or DOS * namespaces, both the WIN32 and the corresponding DOS names are removed. * * Note that for a hard link this function simply removes the name and its * directory entry and decrements the hard link count whilst for the last name, * i.e. the last link to an inode, it only removes the directory entry, i.e. it * does not remove the name, however it does decrement the hard link count to * zero. This is so that the inode can be undeleted and its original name * restored. In any case, we do not actually delete the inode here as it may * still be open and UNIX semantics require an unlinked inode to be still * accessible through already opened file descriptors. When the last file * descriptor is closed, we causes the inode to be deleted when the VFS * notifies us of the last close by calling VNOP_INACTIVE(), i.e. * ntfs_vnop_inactive(). 
*/ static errno_t ntfs_unlink_internal(ntfs_inode *dir_ni, ntfs_inode *ni, ntfschar *name, signed name_len, FILENAME_TYPE_FLAGS name_type, const BOOL is_rename) { ntfs_volume *vol; ntfs_inode *objid_o_ni; ntfschar *ntfs_name; MFT_RECORD *m; ntfs_attr_search_ctx *actx; ATTR_RECORD *a; ntfs_index_context *ictx; FILENAME_ATTR *fn, *tfn; signed ntfs_name_len; unsigned fn_count, tfn_alloc; errno_t err; BOOL seen_dos; FILENAME_TYPE_FLAGS seek_type, fn_type; vol = ni->vol; objid_o_ni = vol->objid_o_ni; ntfs_debug("Unlinking mft_no 0x%llx from directory mft_no 0x%llx, " "name type 0x%x.", (unsigned long long)ni->mft_no, (unsigned long long)dir_ni->mft_no, (unsigned)name_type); if (NInoAttr(ni)) panic("%s(): Target inode is an attribute inode.\n", __FUNCTION__); /* Start the unlink by evicting the target from the name cache. */ cache_purge(ni->vn); /* * We now need to look up the target name in the target mft record. * * If @name_type is FILENAME_POSIX then @name and @name_len contain the * correctly cased name and length in Unicode characters, respectively * so we simply set @ntfs_name and @ntfs_name_len to @name and * @name_len, respectively. * * If @name_type is anything else, i.e. FILENAME_WIN32, FILENAME_DOS, * or FILENAME_WIN32_AND_DOS we simply need to look for that type of * name in the target mft record as there can only be one filename * attribute of this type thus the name is uniquely identified by type * so the lookup can be optimized that way. */ seek_type = 0; if (name_type == FILENAME_POSIX) { ntfs_name = name; ntfs_name_len = name_len; } else { /* * Set @ntfs_name to NULL so we know to do the look up based on * the filename namespace @seek_type instead. */ ntfs_name = NULL; ntfs_name_len = 0; seek_type = name_type; /* * If the target name is the WIN32 name we first need to delete * the DOS name thus re-set @seek_type accordingly (see below * for details). 
*/ if (seek_type == FILENAME_WIN32) seek_type = FILENAME_DOS; } /* * We know this is the base inode since we bailed out for attribute * inodes above. */ err = ntfs_mft_record_map(ni, &m); if (err) { ntfs_error(vol->mp, "Failed to map mft record 0x%llx (error " "%d).", (unsigned long long)ni->mft_no, err); goto err; } /* * Sanity check that the inode link count is in step with the mft * record link count. */ if ((!is_rename && ni->link_count != le16_to_cpu(m->link_count)) || (is_rename && ni->link_count != (unsigned)le16_to_cpu(m->link_count) + 1)) panic("%s(): ni->link_count != le16_to_cpu(m->link_count)\n", __FUNCTION__); actx = ntfs_attr_search_ctx_get(ni, m); if (!actx) { err = ENOMEM; goto unm_err; } /* * Find the name in the target mft record. * * If it is a name in the WIN32 or DOS namespace (but not both), we * remove the DOS name from both the directory index it is in and from * the mft record and we decrement the link count both in the base mft * record and in the ntfs inode. In the case of a WIN32 name, we find * the corresponding DOS name first and proceed as described. * * If the removal of the DOS name from the directory index is * successful, we change the namespace of the remaining WIN32 name to * the POSIX namespace, thus if we fail to remove the remaining name * after successfully removing the DOS name, we still have a consistent * file system. This also has the side effect of allowing undelete to * work properly as otherwise the undelete would restore a WIN32 name * without a corresponding DOS name which would result in an illegal * inode. * * We thus reduce the problem to a normal single name unlink and we can * now determine whether this unlink is just a hard link removal or the * final name removal, i.e. the inode is being deleted. */ seen_dos = FALSE; restart_name: /* * Before looking for the last name and removing it from its directory * index entry, i.e. 
before unlinking the inode and targeting it for * deletion, we need to check if the inode has an object id and if so * we need to remove it from the object id index on the volume (present * in $O index of $Extend/$ObjId system file), so that the inode cannot * be found via its object id any more either. Also, when the deleted * inode gets reused for different purposes, we do not want the old * object id to still point at it. * * If the volume is pre-NTFS 3.0, i.e. it does not support object ids, * @vol->objid_o_ni will be NULL. It will also be NULL if the volume * is NTFS 3.0+ but no object ids are present on the volume, thus we * can make the check conditional on @objid_o_ni not being NULL. * * We do this before deleting the last directory entry so that we can * abort the unlink if we fail to remove the object id from the index * to ensure the volume does not become inconsistent. */ if (objid_o_ni && ni->link_count <= 1) { err = ntfs_attr_lookup(AT_OBJECT_ID, AT_UNNAMED, 0, 0, NULL, 0, actx); if (err) { if (err != ENOENT) { ntfs_error(vol->mp, "Failed to look up object " "id in mft_no 0x%llx (error " "%d).", (unsigned long long)ni->mft_no, err); goto put_err; } /* * The object id was not found which is fine. The * inode simply does not have an object id assigned to * it so there is nothing for us to do. */ ntfs_debug("Target mft_no 0x%llx does not have an " "object id assigned to it.", (unsigned long long)ni->mft_no); } else /* if (!err) */ { INDEX_ENTRY *ie; GUID object_id; /* The inode has an object id assigned to it. */ ntfs_debug("Deleting object id from target mft_no " "0x%llx.", (unsigned long long)ni->mft_no); a = actx->a; /* * We need to make a copy of the object id and release * the mft record before looking up the object id in * the $ObjID/$O index otherwise we could deadlock if * the currently mapped mft record is in the same page * as one of the mft records of $ObjId. 
*/ memcpy(&object_id, &((OBJECT_ID_ATTR*)((u8*)a + le16_to_cpu(a->value_offset)))-> object_id, sizeof(object_id)); ntfs_attr_search_ctx_put(actx); ntfs_mft_record_unmap(ni); err = vnode_get(objid_o_ni->vn); if (err) { ntfs_error(vol->mp, "Failed to get index " "vnode for $ObjId/$O."); goto err; } lck_rw_lock_exclusive(&objid_o_ni->lock); ictx = ntfs_index_ctx_get(objid_o_ni); if (!ictx) { ntfs_error(vol->mp, "Failed to get index " "context."); err = ENOMEM; goto iput_err; } restart_ictx: /* Get the index entry matching the object id. */ err = ntfs_index_lookup(&object_id, sizeof(object_id), &ictx); if (err) { if (err == ENOENT) { ntfs_error(vol->mp, "Failed to delete " "object id of target " "inode 0x%llx from " "object id index " "because the object " "id was not found in " "the object id " "index. Volume is " "corrupt. Run " "chkdsk.", (unsigned long long) ni->mft_no); NVolSetErrors(vol); err = EIO; } else ntfs_error(vol->mp, "Failed to delete " "object id of target " "inode 0x%llx from " "object id index " "because looking up " "the object id in the " "object id index " "failed (error %d)." , (unsigned long long) ni->mft_no, err); goto iput_err; } ie = ictx->entry; /* We now have the index entry, delete it. */ err = ntfs_index_entry_delete(ictx); if (err) { if (err == -EAGAIN) { ntfs_debug("Restarting object id " "delete as tree was " "rearranged."); ntfs_index_ctx_reinit(ictx, objid_o_ni); goto restart_ictx; } ntfs_error(vol->mp, "Failed to delete object " "id of target inode 0x%llx " "from object id index (error " "%d).", (unsigned long long)ni->mft_no, err); goto iput_err; } ntfs_index_ctx_put(ictx); lck_rw_unlock_exclusive(&objid_o_ni->lock); (void)vnode_put(objid_o_ni->vn); /* * Now get back the mft record so we can re-look up the * object id attribute so we can delete it. * * This means we do not need to worry about * inconsistencies to do with the object id in our * error handling code paths later on. 
*/ err = ntfs_mft_record_map(ni, &m); if (err) { ntfs_error(vol->mp, "Failed to re-map mft " "record 0x%llx (error %d). " "Leaving inconstent " "metadata. Run chkdsk.", (unsigned long long)ni->mft_no, err); NVolSetErrors(vol); goto err; } actx = ntfs_attr_search_ctx_get(ni, m); if (!actx) { ntfs_error(vol->mp, "Failed to re-get " "attribute search context for " "mft record 0x%llx (error " "%d). Leaving inconstent " "metadata. Run chkdsk.", (unsigned long long)ni->mft_no, err); NVolSetErrors(vol); err = ENOMEM; goto unm_err; } err = ntfs_attr_lookup(AT_OBJECT_ID, AT_UNNAMED, 0, 0, NULL, 0, actx); if (err) { ntfs_error(vol->mp, "Failed to re-look up " "object id in mft_no 0x%llx " "(error %d). Leaving " "inconsistent metadata. Run " "chkdsk.", (unsigned long long)ni->mft_no, err); NVolSetErrors(ni->vol); err = EIO; goto put_err; } /* * Remove the object id attribute from the mft record * and mark the mft record dirty. */ err = ntfs_attr_record_delete(ni, actx); if (err) { ntfs_error(vol->mp, "Failed to delete object " "id in mft_no 0x%llx (error " "%d). Leaving inconsistent " "metadata. Run chkdsk.", (unsigned long long)ni->mft_no, err); goto put_err; } } /* Reinit the search context for the AT_FILENAME lookup. */ ntfs_attr_search_ctx_reinit(actx); } /* Use label and goto instead of a loop to reduce indentation. */ fn_count = 0; next_name: /* Increment the filename attribute counter. */ fn_count++; err = ntfs_attr_lookup(AT_FILENAME, AT_UNNAMED, 0, 0, NULL, 0, actx); if (err) { if (err == ENOENT) { /* * If the name we are looking for is not found there is * either some corruption or a bug given that a call to * ntfs_lookup_inode_by_name() just found the name in * the directory index. */ ntfs_error(vol->mp, "The target filename was not " "found in the mft record 0x%llx. " "This is not possible. This is " "either due to corruption or due to a " "driver bug. 
Run chkdsk.", (unsigned long long)ni->mft_no); NVolSetErrors(vol); err = EIO; } else ntfs_error(vol->mp, "Failed to look up target " "filename in the mft record 0x%llx " "(error %d).", (unsigned long long)ni->mft_no, err); goto put_err; } a = actx->a; fn = (FILENAME_ATTR*)((u8*)a + le16_to_cpu(a->value_offset)); fn_type = fn->filename_type; /* * If this is a specific DOS or WIN32 or combined name lookup, no need * to compare the actual name as there can only be one DOS and one * WIN32 name or only one combined name in an inode. */ if (seek_type && seek_type != FILENAME_POSIX) { /* * If this filename attribute does not match the target name * try the next one. */ if (seek_type != fn_type) goto next_name; /* We found the filename attribute matching the target name. */ if (fn_type == FILENAME_WIN32) { /* * We were looking for the WIN32 name so we can remove * it after having removed the DOS name. We now found * it, so switch it to the POSIX namespace as described * above and then go ahead and delete it. */ ntfs_debug("Switching namespace of filename attribute " "from WIN32 to POSIX."); fn_type = fn->filename_type = FILENAME_POSIX; NInoSetMrecNeedsDirtying(actx->ni); } goto found_name; } /* If this is the DOS name, note that we have seen it. */ if (fn_type == FILENAME_DOS) seen_dos = TRUE; /* If the names do not match, continue searching. */ if (fn->filename_length != ntfs_name_len) goto next_name; if (MREF_LE(fn->parent_directory) != dir_ni->mft_no) goto next_name; if (bcmp(fn->filename, ntfs_name, ntfs_name_len * sizeof(ntfschar))) goto next_name; /* Found the matching name. */ if (fn_type == FILENAME_WIN32) { /* * Pure WIN32 name. Repeat the lookup but for the DOS name * this time so we can remove that first. */ seek_type = FILENAME_DOS; /* * If @seen_dos is true, then restart the lookup from the * beginning and if not then continue the lookup where we left * off. 
*/ if (seen_dos) { ntfs_attr_search_ctx_reinit(actx); fn_count = 0; } goto next_name; } if (fn_type == FILENAME_DOS) { /* * This cannot happen as ntfs_lookup_inode_by_name() always * returns @name for pure DOS names and hence we would have * @seek_type == FILENAME_DOS and thus would have picked this * filename attribute up above without ever doing a name based * match. */ ntfs_error(vol->mp, "Filename is in DOS namespace. This is " "not possible. This is either due to " "corruption or due to a driver bug. Run " "chkdsk."); NVolSetErrors(vol); err = EIO; goto put_err; } found_name: /* * We found the target filename attribute and can now remove it from * the directory index. But before we can do that we need to make a * copy of the filename attribute value so we can release the mft * record before we delete the directory index entry. This is needed * because when we hold the target mft record and we call * ntfs_dir_entry_delete() this would cause the mft record for the * directory to be mapped which could result in a deadlock in the event * that both mft records are in the same page. */ tfn_alloc = le32_to_cpu(a->value_length); tfn = OSMalloc(tfn_alloc, ntfs_malloc_tag); if (!tfn) { /* * TODO: If @seek_type == FILENAME_WIN32 && * @fn->filename_type == FILENAME_POSIX we need to update the * directory entry filename_type to FILENAME_POSIX. See below * for how this is done for the error case in * ntfs_dir_entry_delete(). Given a memory allocation just * failed it is highly unlikely we would succeed in trying to * look up the directory entry so that we could change the * filename_type in it so at least for now just set the volume * has errors flag instead. */ ntfs_error(vol->mp, "Failed to allocate memory for temporary " "filename attribute. Leaving inconsistent " "metadata. 
Run chkdsk."); NVolSetErrors(vol); err = EIO; goto put_err; } memcpy(tfn, fn, tfn_alloc); ntfs_attr_search_ctx_put(actx); ntfs_mft_record_unmap(ni); /* * We copied the name and can now remove it from the directory index. * If the name is in the POSIX namespace, we may have converted it from * a pure WIN32 name after removing the corresponding DOS name, in * which case we need to update the index entry to reflect the * conversion should we fail to remove it from the directory index. * ntfs_dir_entry_delete() takes care of this for us. */ err = ntfs_dir_entry_delete(dir_ni, ni, tfn, tfn_alloc); if (err) { ntfs_error(vol->mp, "Failed to delete directory index entry " "(error %d).", err); goto err; } /* * Now get back the mft record. * * If getting back the mft record fails there is nothing we can do to * recover and must bail out completely leaving inconsistent metadata. * * TODO: We could try to add the dir entry back again in an attempt to * recover but as above we likely fail a memory allocation it is highly * unlikely we would succeed in trying to do the lookup and addition of * the directory entry. */ err = ntfs_mft_record_map(ni, &m); if (err) { ntfs_error(vol->mp, "Failed to re-map mft record 0x%llx " "(error %d). Leaving inconsistent metadata. " "Run chkdsk.", (unsigned long long)ni->mft_no, err); NVolSetErrors(vol); goto err; } actx = ntfs_attr_search_ctx_get(ni, m); if (!actx) { ntfs_error(vol->mp, "Failed to re-get attribute search " "context for mft record 0x%llx (error %d). " "Leaving inconsitent metadata. Run chkdsk.", (unsigned long long)ni->mft_no, err); NVolSetErrors(vol); err = EIO; goto unm_err; } /* * If the name is in the DOS namespace or this is not the last name we * also need to remove the name from the mft record it is in and * decrement the link count in the base mft record. */ if (fn_type == FILENAME_DOS || ni->link_count > 1) { /* Now need to re-lookup the target filename attribute. 
*/ while (fn_count > 0) { fn_count--; err = ntfs_attr_lookup(AT_FILENAME, AT_UNNAMED, 0, 0, NULL, 0, actx); if (!err) continue; ntfs_error(vol->mp, "Failed to re-look up target " "filename in mft_no 0x%llx (error %d).", (unsigned long long)ni->mft_no, err); NVolSetErrors(vol); err = EIO; goto put_err; } a = actx->a; if (a->type != AT_FILENAME) panic("%s(): a->type (0x%x) != AT_FILENAME (0x30)\n", __FUNCTION__, le32_to_cpu(a->type)); fn = (FILENAME_ATTR*)((u8*)a + le16_to_cpu(a->value_offset)); if (fn_type != fn->filename_type) panic("%s(): fn_type != fn->filename_type\n", __FUNCTION__); /* Remove the filename from the mft record, too. */ err = ntfs_attr_record_delete(ni, actx); if (err) { ntfs_error(vol->mp, "Failed to delete filename " "attribute from mft_no 0x%llx (error " "%d).", (unsigned long long)ni->mft_no, err); NVolSetErrors(vol); err = EIO; goto put_err; } /* * Update the hard link count in the base mft record. Note we * subtract one from the inode link count if this is a rename * as the link count has been elevated by one by the caller. */ m->link_count = cpu_to_le16(ni->link_count - 1 - (is_rename ? 1 : 0)); } else /* if (fn_type != FILENAME_DOS && ni->link_count <= 1) */ { /* * This is the last name, so we need to mark the mft record as * unused in the mft record flags so no-one can open it by * accident and so that, in case of a crash between now and the * deletion of the inode, ntfsck will know that we meant to * delete the inode rather than that we were in the process of * allocating or renaming it so it will do the Right Thing(TM) * and complete the deletion process. */ m->flags &= ~MFT_RECORD_IN_USE; /* Ensure the base mft record gets written out. */ NInoSetMrecNeedsDirtying(ni); } /* * We have either deleted the filename completely or we only removed * the directory index entry if this is the last name. * * In either case, we need to update the hard link count and the ctime * in the ntfs inode (the ctime is the last_mft_change_time on NTFS). 
*/ ni->link_count--; ni->last_mft_change_time = dir_ni->last_mft_change_time; NInoSetDirtyTimes(ni); /* * If this is the DOS name, we now need to find the WIN32 name, so it * can be deleted, too. Otherwise we are done. */ if (fn_type == FILENAME_DOS) { seek_type = FILENAME_WIN32; /* * We looked up the DOS name above thus we need to reinitialize * the search context for the WIN32 name lookup. */ ntfs_attr_search_ctx_reinit(actx); fn_count = 0; goto restart_name; } /* * If we removed a hard link but the inode is not deleted yet we need * to remove the parent vnode from the vnode as this association may no * longer exist. * * The same is true for the vnode name as we have just unlinked it. * * Note we skip this for the rename case because the subsequent call to * ntfs_link_internal() is going to update the vnode identity with the * new name and parent so no need to do wipe them here. */ if (ni->link_count > 0 && !is_rename) vnode_update_identity(ni->vn, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME); ntfs_debug("Done."); put_err: ntfs_attr_search_ctx_put(actx); unm_err: ntfs_mft_record_unmap(ni); err: return err; iput_err: if (ictx) ntfs_index_ctx_put(ictx); lck_rw_unlock_exclusive(&objid_o_ni->lock); (void)vnode_put(objid_o_ni->vn); return err; } /** * ntfs_unlink - unlink and ntfs inode from its parent directory * @dir_ni: directory ntfs inode from which to unlink the ntfs inode * @ni: base ntfs inode to unlink * @cn: name of the inode to unlink * @flags: flags describing the unlink request * @is_rmdir: true if called from VNOP_RMDIR() and hence ntfs_vnop_rmdir() * * Unlink an inode with the ntfs inode @ni and name as specified in @cn from * the directory with ntfs inode @dir_ni. * * The flags in @flags further describe the unlink request. The following * flags are currently defined in OS X kernel: * VNODE_REMOVE_NODELETEBUSY - Do not delete busy files, i.e. use * Carbon delete semantics). 
* * If @is_rmdir is true the caller is VNOP_RMDIR() and hence ntfs_vnop_rmdir() * and if @is_rmdir is false the caller is VNOP_REMOVE() and hence * ntfs_vnop_remove(). Note @flags is always zero if @is_rmdir is true. * * Return 0 on success and the error code on error. * * Note that if the name of the inode to be removed is in the WIN32 or DOS * namespaces, both the WIN32 and the corresponding DOS names are removed. * * Note that for a hard link this function simply removes the name and its * directory entry and decrements the hard link count whilst for the last name, * i.e. the last link to an inode, it only removes the directory entry, i.e. it * does not remove the name, however it does decrement the hard link count to * zero. This is so that the inode can be undeleted and its original name * restored. In any case, we do not actually delete the inode here as it may * still be open and UNIX semantics require an unlinked inode to be still * accessible through already opened file descriptors. When the last file * descriptor is closed, we causes the inode to be deleted when the VFS * notifies us of the last close by calling VNOP_INACTIVE(), i.e. * ntfs_vnop_inactive(). */ static errno_t ntfs_unlink(ntfs_inode *dir_ni, ntfs_inode *ni, struct componentname *cn, const int flags, const BOOL is_rmdir) { MFT_REF mref; ntfs_volume *vol; ntfs_inode *objid_o_ni; ntfschar *ntfs_name; ntfs_dir_lookup_name *name = NULL; size_t ntfs_name_size; signed ntfs_name_len; errno_t err; FILENAME_TYPE_FLAGS ntfs_name_type; ntfschar ntfs_name_buf[NTFS_MAX_NAME_LEN]; vol = ni->vol; objid_o_ni = vol->objid_o_ni; ntfs_debug("Unlinking %s%.*s with mft_no 0x%llx from directory " "mft_no 0x%llx, flags 0x%x.", is_rmdir ? "directory " : "", (int)cn->cn_namelen, cn->cn_nameptr, (unsigned long long)ni->mft_no, (unsigned long long)dir_ni->mft_no, flags); /* * Do not allow attribute inodes or raw inodes to be deleted. Note * raw inodes are always attribute inodes, too. 
*/ if (NInoAttr(ni)) { ntfs_debug("Target %.*s, mft_no 0x%llx is a%s inode, " "returning EPERM.", (int)cn->cn_namelen, cn->cn_nameptr, (unsigned long long)ni->mft_no, NInoAttr(ni) ? "n attribute" : " raw"); return EPERM; } /* The parent inode must be a directory. */ if (!S_ISDIR(dir_ni->mode)) { ntfs_debug("Parent mft_no 0x%llx is not a directory, " "returning ENOTDIR.", (unsigned long long)dir_ni->mft_no); return ENOTDIR; } /* Check for "." removal. */ if (ni == dir_ni) { ntfs_debug("Target %.*s, mft_no 0x%llx is the same as its " "parent directory, returning EINVAL.", (int)cn->cn_namelen, cn->cn_nameptr, (unsigned long long)ni->mft_no); return EINVAL; } /* Lock both the parent directory and the target inode for writing. */ lck_rw_lock_exclusive(&dir_ni->lock); lck_rw_lock_exclusive(&ni->lock); /* Ensure the parent directory has not been deleted. */ if (!dir_ni->link_count) { ntfs_debug("Parent directory mft_no 0x%llx has been deleted, " "returning ENOENT.", (unsigned long long)dir_ni->mft_no); /* * If the directory is somehow still in the name cache remove * it now. */ cache_purge(dir_ni->vn); err = ENOENT; goto err; } /* Ensure tha target has not been deleted by someone else already. */ if (!ni->link_count) { ntfs_debug("Target %.*s, mft_no 0x%llx has been deleted, " "returning ENOENT.", (int)cn->cn_namelen, cn->cn_nameptr, (unsigned long long)ni->mft_no); /* * If the target is somehow still in the name cache remove it * now. */ cache_purge(ni->vn); err = ENOENT; goto err; } /* * If this is a directory removal, i.e. rmdir, need to check that the * directory is empty. * * Note we already checked for "." removal and we do not need to check * for ".." removal because that would fail the directory is empty * check as the parent directory would at least have one entry and that * is the current directory. 
*/ if (is_rmdir) { err = ntfs_dir_is_empty(ni); if (err) { if (err == ENOTEMPTY) ntfs_debug("Target directory %.*s, mft_no " "0x%llx is not empty, " "returning ENOTEMPTY.", (int)cn->cn_namelen, cn->cn_nameptr, (unsigned long long)ni->mft_no); else ntfs_error(vol->mp, "Failed to determine if " "target directory %.*s, " "mft_no 0x%llx is empty " "(error %d).", (int)cn->cn_namelen, cn->cn_nameptr, (unsigned long long)ni->mft_no, err); goto err; } } else { /* Do not allow directories to be unlinked. */ if (S_ISDIR(ni->mode)) { ntfs_debug("Target %.*s, mft_no 0x%llx is a " "directory, returning EPERM.", (int)cn->cn_namelen, cn->cn_nameptr, (unsigned long long)ni->mft_no); err = EPERM; goto err; } } /* * Do not allow any of the system files to be deleted. * * For NTFS 3.0+ volumes do not allow any of the extended system files * to be deleted, either. * * Note we specifically blacklist all system files that we make use of * except for the transaction log $UsnJrnl as that is allowed to be * deleted and its deletion means that transaction logging is disabled. * * Note that if the transaction log is present it will be held busy by * the NTFS driver thus unlinking the $UsnJrnl will not actually delete * it until the driver is unmounted. FIXME: Should we leave it like * this or should we detach the $UsnJrnl vnodes from the volume and * release them so they can be deleted immediately? * * TODO: What about all the new metadata files introduced with Windows * Vista? We are currently ignoring them and allowing them to be * deleted... 
*/ if (ni->file_attributes & FILE_ATTR_SYSTEM) { BOOL is_system = FALSE; if (vol->major_ver <= 1) { if (ni->mft_no < FILE_Extend) is_system = TRUE; } else { if (ni->mft_no <= FILE_Extend) is_system = TRUE; if (dir_ni == vol->extend_ni) { if (ni == vol->objid_ni || ni == vol->quota_ni) is_system = TRUE; } } if (is_system) { ntfs_debug("Target %.*s, mft_no 0x%llx is a%s system " "file, returning EPERM.", (int)cn->cn_namelen, cn->cn_nameptr, (unsigned long long)ni->mft_no, (dir_ni == vol->extend_ni) ? "n extended" : ""); err = EPERM; goto err; } } /* * Ensure the file is not read-only (the read-only bit is ignored for * directories. */ if (!S_ISDIR(ni->mode) && ni->file_attributes & FILE_ATTR_READONLY) { ntfs_debug("Target %.*s, mft_no 0x%llx is marked read-only, " "returning EPERM.", (int)cn->cn_namelen, cn->cn_nameptr, (unsigned long long)ni->mft_no); err = EPERM; goto err; } /* * If the inode is a reparse point or if the inode is offline we cannot * remove a name from it yet. TODO: Implement this. */ if (ni->file_attributes & (FILE_ATTR_REPARSE_POINT | FILE_ATTR_OFFLINE)) { ntfs_error(vol->mp, "Target %.*s, mft_no 0x%llx is %s. " "Deleting names from such inodes is not " "supported yet, returning ENOTSUP.", (int)cn->cn_namelen, cn->cn_nameptr, (unsigned long long)ni->mft_no, ni->file_attributes & FILE_ATTR_REPARSE_POINT ? "a reparse point" : "offline"); err = ENOTSUP; goto err; } /* * If Carbon delete semantics are requested, do not allow busy files to * be unlinked. Note we do not use vnode_isinuse() as that accounts * for open named streams/extended attributes as well which we do not * care about. We only care for actually opened files thus we keep * track of them ourselves. 
*/ if (flags & VNODE_REMOVE_NODELETEBUSY && ni->nr_opens) { ntfs_debug("Target %.*s, mft_no 0x%llx is busy (nr_opens " "0x%x) and Carbon delete semantics were " "requested, returning EBUSY.", (int)cn->cn_namelen, cn->cn_nameptr, (unsigned long long)ni->mft_no, (unsigned)ni->nr_opens); err = EBUSY; goto err; } /* * We need to make sure the target still has the name specified in @cn * that is being unlinked. It could have been unlinked or renamed * before we took the locks on the parent directory and the target. * * To do this, first convert the name of the target from utf8 to * Unicode then look up the converted name in the directory index. */ ntfs_name = ntfs_name_buf; ntfs_name_size = sizeof(ntfs_name_buf); ntfs_name_len = utf8_to_ntfs(vol, (u8*)cn->cn_nameptr, cn->cn_namelen, &ntfs_name, &ntfs_name_size); if (ntfs_name_len < 0) { err = -ntfs_name_len; if (err == ENAMETOOLONG) ntfs_debug("Failed (name is too long)."); else ntfs_error(vol->mp, "Failed to convert name to " "Unicode (error %d).", err); goto err; } err = ntfs_lookup_inode_by_name(dir_ni, ntfs_name, ntfs_name_len, &mref, &name); if (err) { if (err != ENOENT) { ntfs_error(vol->mp, "Failed to find name in directory " "(error %d).", err); goto err; } enoent: /* * The name does not exist in the directory @dir_ni. * * This means someone renamed or deleted the name from the * directory before we managed to take the locks. */ ntfs_debug("Target %.*s, mft_no 0x%llx has been renamed or " "deleted already, returning ENOENT.", (int)cn->cn_namelen, cn->cn_nameptr, (unsigned long long)ni->mft_no); /* * If the target is somehow still in the name cache remove it * now. */ cache_purge(ni->vn); err = ENOENT; goto err; } /* * We found the target name in the directory index but does it still * point to the same mft record? The sequence number check ensures the * inode was not deleted and recreated with the same name and the same * mft record number. 
*/ if (mref != MK_MREF(ni->mft_no, ni->seq_no)) goto enoent; /* * We are going to go ahead with unlinking the target. * * There are several different types of outcome from the above lookup * that need to be handled. * * If @name is NULL @ntfs_name contains the correctly cased name thus * we can simply look for that. In this case we set the name type to 0 * as we do not know which namespace the name is in. * * If @name is not NULL the correctly cased name is in @name->name thus * we look for that. In this case we do know which namespace the name * is in as it is @name->type. */ ntfs_name_type = 0; if (name) { ntfs_name = name->name; ntfs_name_len = name->len; ntfs_name_type = name->type; } /* Now we can perform the actual unlink. */ err = ntfs_unlink_internal(dir_ni, ni, ntfs_name, ntfs_name_len, ntfs_name_type, FALSE); if (err) ntfs_error(vol->mp, "Failed to unlink %.*s with mft_no 0x%llx " "from directory mft_no 0x%llx (error %d).", (int)cn->cn_namelen, cn->cn_nameptr, (unsigned long long)ni->mft_no, (unsigned long long)dir_ni->mft_no, err); else ntfs_debug("Done."); err: if (name) OSFree(name, sizeof(*name), ntfs_malloc_tag); lck_rw_unlock_exclusive(&ni->lock); lck_rw_unlock_exclusive(&dir_ni->lock); return err; } /** * ntfs_vnop_remove - unlink a file * @a: arguments to remove function * * @a contains: * vnode_t a_dvp; directory from which to unlink the file * vnode_t a_vp; file to unlink * struct componentname *a_cnp; name of the file to unlink * int a_flags; flags describing the unlink request * vfs_context_t a_context; * * Unlink a file with vnode @a->a_vp and name as specified in @a->a_cnp form * the directory with vnode @a->a_dvp. * * The flags in @a->a_flags further describe the unlink request. The following * flags are currently defined in OS X kernel: * VNODE_REMOVE_NODELETEBUSY - Do not delete busy files, i.e. use * Carbon delete semantics). * * Return 0 on success and errno on error. 
* * Note that if the name of the inode to be removed is in the WIN32 or DOS * namespaces, both the WIN32 and the corresponding DOS names are removed. * * Note that for a hard link this function simply removes the name and its * directory entry and decrements the hard link count whilst for the last name, * i.e. the last link to an inode, it only removes the directory entry, i.e. it * does not remove the name, however it does decrement the hard link count to * zero. This is so that the inode can be undeleted and its original name * restored. In any case, we do not actually delete the inode here as it may * still be open and UNIX semantics require an unlinked inode to be still * accessible through already opened file descriptors. When the last file * descriptor is closed, we causes the inode to be deleted when the VFS * notifies us of the last close by calling VNOP_INACTIVE(), i.e. * ntfs_vnop_inactive(). */ static int ntfs_vnop_remove(struct vnop_remove_args *a) { ntfs_inode *dir_ni = NTFS_I(a->a_dvp); ntfs_inode *ni = NTFS_I(a->a_vp); errno_t err; if (!dir_ni || !ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return EINVAL; } ntfs_debug("Entering."); err = ntfs_unlink(NTFS_I(a->a_dvp), NTFS_I(a->a_vp), a->a_cnp, a->a_flags, FALSE); ntfs_debug("Done (error %d).", (int)err); return err; } /** * ntfs_link_internal - create a hard link to an inode * @ni: base ntfs inode to create hard link to * @dir_ni: directory ntfs inode in which to create the hard link * @cn: componentname specifying name of the hard link to create * @is_rename: if true ntfs_link_internal() is called for a rename * @name: Unicode name of the inode to unlink * @name_len: length of the name in Unicode characters * * Create a hard link to the ntfs inode @ni with name as specified in @cn in * the directory ntfs inode @dir_ni. 
* * If @is_rename is true the caller was ntfs_vnop_rename() in which case the * link count of the inode to link to will be one higher than the link count in * the mft record and @name and @name_len specify the Unicode name and length * in Unicode characters corresponding to @cn, respectively so we do not have * to convert @cn to Unicode in this case. * * If @is_rename is false then @name and @name_len are undefined. * * Return 0 on success and errno on error. * * Note we always create filenames in the POSIX namespace. */ static errno_t ntfs_link_internal(ntfs_inode *ni, ntfs_inode *dir_ni, struct componentname *cn, const BOOL is_rename, const ntfschar *name, const signed name_len) { ntfs_volume *vol; FILENAME_ATTR *fn; ntfschar *ntfs_name; MFT_RECORD *m; ntfs_attr_search_ctx *ctx; size_t ntfs_name_size; signed ntfs_name_len; unsigned fn_alloc, fn_size; errno_t err, err2; BOOL is_dir; vol = ni->vol; ntfs_debug("Creating a hard link to mft_no 0x%llx, named %.*s in " "directory mft_no 0x%llx.", (unsigned long long)ni->mft_no, (int)cn->cn_namelen, cn->cn_nameptr, (unsigned long long)dir_ni->mft_no); if (NInoAttr(ni)) panic("%s(): Inode to link to is an attribute/raw inode.\n", __FUNCTION__); is_dir = S_ISDIR(ni->mode); /* * Create a temporary filename attribute so we can find the correct * place to insert it into. We also need a temporary copy so we can * release the mft record before we add the directory entry. This is * needed because when we hold the mft record for the inode and we call * ntfs_dir_entry_add() this would cause the mft record for the * directory to be mapped which would result in a deadlock in the event * that both mft records are in the same page. 
*/ fn_alloc = sizeof(FILENAME_ATTR) + NTFS_MAX_NAME_LEN * sizeof(ntfschar); fn = OSMalloc(fn_alloc, ntfs_malloc_tag); if (!fn) { ntfs_error(vol->mp, "Failed to allocate memory for temporary " "filename attribute."); err = ENOMEM; goto err; } bzero(fn, fn_alloc); /* Begin setting up the temporary filename attribute. */ fn->parent_directory = MK_LE_MREF(dir_ni->mft_no, dir_ni->seq_no); /* FILENAME_POSIX is zero and the attribute is already zeroed. */ /* fn->filename_type = FILENAME_POSIX; */ /* * If this is not a rename then convert the name from utf8 to Unicode. * If this is a rename on the other hand then we have the name in * Unicode already so just copy that over. */ ntfs_name = fn->filename; ntfs_name_size = NTFS_MAX_NAME_LEN * sizeof(ntfschar); if (!is_rename) { ntfs_name_len = utf8_to_ntfs(vol, (u8*)cn->cn_nameptr, cn->cn_namelen, &ntfs_name, &ntfs_name_size); if (ntfs_name_len < 0) { err = -ntfs_name_len; if (err == ENAMETOOLONG) ntfs_debug("Failed (name is too long)."); else ntfs_error(vol->mp, "Failed to convert name to " "Unicode (error %d).", err); goto err; } } else { memcpy(ntfs_name, name, name_len * sizeof(ntfschar)); ntfs_name_len = name_len; } /* Set the filename length in the temporary filename attribute. */ fn->filename_length = ntfs_name_len; fn_size = sizeof(FILENAME_ATTR) + ntfs_name_len * sizeof(ntfschar); /* * Copy the times from the standard information attribute which we have * cached in the ntfs inode. */ fn->creation_time = utc2ntfs(ni->creation_time); fn->last_data_change_time = utc2ntfs(ni->last_data_change_time); fn->last_mft_change_time = utc2ntfs(ni->last_mft_change_time); fn->last_access_time = utc2ntfs(ni->last_access_time); if (!is_dir) { lck_spin_lock(&ni->size_lock); fn->allocated_size = cpu_to_sle64(NInoNonResident(ni) && (NInoSparse(ni) || NInoCompressed(ni)) ? 
ni->compressed_size : ni->allocated_size); fn->data_size = cpu_to_sle64(ni->data_size); lck_spin_unlock(&ni->size_lock); } else { /* * Directories use 0 for the sizes in the filename attribute * and the attribute is already zeroed. */ /* fn->data_size = fn->allocated_size = 0; */ } /* * If this is not a directory or it is an encrypted directory, set the * needs archiving bit except for the core system files. */ fn->file_attributes = ni->file_attributes; if (!is_dir || NInoEncrypted(ni)) { BOOL need_set_archive_bit = TRUE; if (vol->major_ver >= 2) { if (ni->mft_no <= FILE_Extend) need_set_archive_bit = FALSE; } else { if (ni->mft_no <= FILE_UpCase) need_set_archive_bit = FALSE; } if (need_set_archive_bit) { ni->file_attributes |= FILE_ATTR_ARCHIVE; fn->file_attributes = ni->file_attributes; NInoSetDirtyFileAttributes(ni); } } /* * Directories need the FILE_ATTR_DUP_FILENAME_INDEX_PRESENT flag set * in their filename attributes both in their mft records and in the * index entries pointing to them but not in the standard information * attribute which is why it is not set in @ni->file_attributes. */ if (is_dir) fn->file_attributes |= FILE_ATTR_DUP_FILENAME_INDEX_PRESENT; /* * TODO: We need to find out whether it is true that ea_length takes * precedence over reparse_tag, i.e. we need to check that if both EAs * are present and this is a reparse point, we need to set the * ea_length rather than the reparse_tag. So far I have not been able * to create EAs on a reparse point and vice versa so perhaps the two * are mutually exclusive in which case we are fine... * * The attribute is already zeroed so no need to set anything to zero. */ #if 0 if (ni->ea_length) { fn->ea_length = cpu_to_le16(ni->ea_length); /* fn->reserved = 0; */ } else if (ni->file_attributes & FILE_ATTR_REPARSE_POINT) { // TODO: Instead of zero use actual value if/when we enable // creating hard links to reparse points... 
/* fn->reparse_tag = 0; */ } else { /* * We need to initialize the unused field to zero but as we * have already zeroed the attribute we do not need to do * anything now. */ /* fn->reparse_tag = 0; */ } #endif /* * Add the created filename attribute to the parent directory index. * * We know @ni is the base inode since we bailed out for attribute * inodes above so we can use it to generate the mft reference. */ err = ntfs_dir_entry_add(dir_ni, fn, fn_size, MK_LE_MREF(ni->mft_no, ni->seq_no)); if (err) goto err; /* * The ea_length and reparse_tag are only set in the directory index * entries and not in filename attributes in the mft record so zero * them here, before adding the filename attribute to the mft record. */ fn->reparse_tag = 0; /* * Add the created filename attribute to the mft record as well. * * Again, we know @ni is the base inode. */ err = ntfs_mft_record_map(ni, &m); if (err) { ntfs_error(vol->mp, "Failed to map mft record 0x%llx (error " "%d).", (unsigned long long)ni->mft_no, err); goto rm_err; } ctx = ntfs_attr_search_ctx_get(ni, m); if (!ctx) { err = ENOMEM; goto unm_err; } err = ntfs_attr_lookup(AT_FILENAME, AT_UNNAMED, 0, 0, fn, fn_size, ctx); if (err != ENOENT) { if (!err) { ntfs_debug("Failed (filename already present in " "inode."); err = EEXIST; } else ntfs_error(vol->mp, "Failed to add filename to mft_no " "0x%llx because looking up the " "filename in the mft record failed " "(error %d).", (unsigned long long)ni->mft_no, err); goto put_err; } /* * The current implementation of ntfs_attr_lookup() will always return * pointing into the base mft record when an attribute was not found. */ if (ni != ctx->ni) panic("%s(): ni != ctx->ni\n", __FUNCTION__); if (m != ctx->m) panic("%s(): m != ctx->m\n", __FUNCTION__); /* * @ctx->a now points to the location in the mft record at which we * need to insert the filename attribute, so insert it now. 
* * Note we ignore the case where @ctx->is_error is true because we do * not need the attribute any more for anything after it has been * inserted so we do not care that we failed to map its mft record. */ err = ntfs_resident_attr_record_insert(ni, ctx, AT_FILENAME, NULL, 0, fn, fn_size); if (err) { ntfs_error(vol->mp, "Failed to add filename to mft_no 0x%llx " "because inserting the filename attribute " "failed (error %d).", (unsigned long long)ni->mft_no, err); goto put_err; } /* * Update the hard link count in the mft record. Note we subtract one * from the inode link count if this is a rename as the link count has * been elevated by one by the caller. */ ni->link_count++; m->link_count = cpu_to_le16(ni->link_count - (is_rename ? 1 : 0)); /* * Update the ctime in the inode by copying it from the target * directory inode where it will have been updated by the above call to * ntfs_dir_entry_add(). */ ni->last_mft_change_time = dir_ni->last_mft_change_time; NInoSetDirtyTimes(ni); /* * Invalidate negative cache entries in the directory. We need to do * this because there may be negative cache entries which would match * the name of the just created inode but in a different case. Such * negative cache entries would now be incorrect thus we need to throw * away all negative cache entries to ensure there cannot be any * incorrectly negative entries in the name cache. */ cache_purge_negatives(dir_ni->vn); /* * We should add the new hard link to the name cache. Problem is that * this is likely not to be a useful thing to do as the original name * is likely in the name cache already and the OS X name cache only * allows one name per vnode and cache_enter() simply returns without * doing anything if a name is already present in the name cache for * the vnode. Thus we could use vnode_update_identity() instead to * switch the cached name from the original name to the new hard link. * * FIXME: The question is whether this is a useful thing to do. 
On the * one hand people creating a hard link are likely to want to then * access the inode via the new name but on the other hand hard links * are often used in applications for locking purposes and in this case * after the hard link is created the application is likely to unlink * the original name thus it would be beneficial if that remains in the * cache until this happens which will automatically remove the name * from the name cache and the next lookup of the new name will insert * the new one. Thus it is best if we do nothing at all now. If OS X * ever allows multiple name links per vnode we can uncomment the below * cache_enter() call. * * For the rename case we have just removed the original name, thus it * makes sense to add the new name now and whilst at it also update the * vnode identity with the new name and parent as the old ones are no * longer valid. */ if (is_rename) { vnode_update_identity(ni->vn, dir_ni->vn, cn->cn_nameptr, cn->cn_namelen, cn->cn_hash, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME); cache_enter(dir_ni->vn, ni->vn, cn); cn->cn_flags &= ~MAKEENTRY; } /* * Ensure the base mft record is written to disk. * * Note we do not set any of the NInoDirty*() flags because we have * just created the inode thus all the fields are in sync between the * ntfs_inode @ni and its mft record @m. * * Also note we defer the unmapping of the mft record to here so that * we do not get racing time updates, etc during concurrent runs of * link(2) and rename(2) where the source inode for the rename is the * inode that has a new hardlink created to it at the same time. This * case can happen because we do not lock the source inode in * ntfs_vnop_rename(). */ NInoSetMrecNeedsDirtying(ni); /* We are done with the mft record. */ ntfs_attr_search_ctx_put(ctx); ntfs_mft_record_unmap(ni); /* Free the temporary filename attribute. 
*/ OSFree(fn, fn_alloc, ntfs_malloc_tag); ntfs_debug("Done."); return 0; put_err: ntfs_attr_search_ctx_put(ctx); unm_err: ntfs_mft_record_unmap(ni); rm_err: #if 0 if (ni->ea_length) { fn->ea_length = cpu_to_le16(ni->ea_length); /* fn->reserved = 0; */ } else if (ni->file_attributes & FILE_ATTR_REPARSE_POINT) { // TODO: Instead of zero use actual value if/when we enable // creating hard links to reparse points... /* fn->reparse_tag = 0; */ } else { /* * We need to initialize the unused field to zero but as we * have already zeroed the attribute we do not need to do * anything now. */ /* fn->reparse_tag = 0; */ } #endif err2 = ntfs_dir_entry_delete(dir_ni, ni, fn, fn_size); if (err2) { ntfs_error(vol->mp, "Failed to rollback index entry creation " "in error handling code path (error %d). " "Leaving inconsistent metadata. Run chkdsk.", err2); NVolSetErrors(vol); } err: if (fn) OSFree(fn, fn_alloc, ntfs_malloc_tag); if (err != EEXIST) ntfs_error(vol->mp, "Failed (error %d).", err); else ntfs_debug("Failed (error EEXIST)."); return err; } /** * ntfs_vnop_link - create a hard link to an inode * @a: arguments to link function * * @a contains: * vnode_t a_vp; vnode to create hard link to * vnode_t a_tdvp; destination directory for the hard link * struct componentname *a_cnp; name of the hard link to create * vfs_context_t a_context; * * Create a hard link to the inode specified by the vnode @a->a_vp with name as * specified in @a->a_cnp in the directory specified by the vnode @a->a_tdvp. * * Return 0 on success and errno on error. * * Note we always create filenames in the POSIX namespace. 
*/
static int ntfs_vnop_link(struct vnop_link_args *a)
{
	ntfs_inode *ni, *dir_ni;
	ntfs_volume *vol;
	struct componentname *cn;
	errno_t err;

	ni = NTFS_I(a->a_vp);
	dir_ni = NTFS_I(a->a_tdvp);
	if (!dir_ni || !ni) {
		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
		return EINVAL;
	}
	/*
	 * Only dereference @ni now that we know it is not NULL.  @vol is used
	 * by the system file checks and by the error reporting below.
	 */
	vol = ni->vol;
	cn = a->a_cnp;
	ntfs_debug("Creating a hard link to mft_no 0x%llx, named %.*s in "
			"directory mft_no 0x%llx.",
			(unsigned long long)ni->mft_no, (int)cn->cn_namelen,
			cn->cn_nameptr, (unsigned long long)dir_ni->mft_no);
	/* Do not allow attribute/raw inodes to be linked to. */
	if (NInoAttr(ni)) {
		ntfs_debug("Mft_no 0x%llx is a%s inode, returning EPERM.",
				(unsigned long long)ni->mft_no,
				NInoRaw(ni) ? " raw" : "n attribute");
		return EPERM;
	}
	/* The target inode must be a directory. */
	if (!S_ISDIR(dir_ni->mode)) {
		ntfs_debug("Target mft_no 0x%llx is not a directory, "
				"returning ENOTDIR.",
				(unsigned long long)dir_ni->mft_no);
		return ENOTDIR;
	}
	/* Lock the target directory inode for writing. */
	lck_rw_lock_exclusive(&dir_ni->lock);
	/*
	 * The inode being linked to must not be a directory.  Only the
	 * directory lock is held at this point so that is all we drop before
	 * returning.
	 */
	if (S_ISDIR(ni->mode)) {
		lck_rw_unlock_exclusive(&dir_ni->lock);
		ntfs_debug("Mft_no 0x%llx to link to is a directory, cannot "
				"create hard link %.*s to it, returning "
				"EPERM.", (unsigned long long)ni->mft_no,
				(int)cn->cn_namelen, cn->cn_nameptr);
		return EPERM;
	}
	/*
	 * Lock the inode to link to for writing.  From here on both locks are
	 * held thus all failures must leave via the err label which drops
	 * them again.
	 */
	lck_rw_lock_exclusive(&ni->lock);
	/* Ensure the target directory has not been deleted. */
	if (!dir_ni->link_count) {
		ntfs_debug("Target directory mft_no 0x%llx has been deleted, "
				"returning ENOENT.",
				(unsigned long long)dir_ni->mft_no);
		/*
		 * If the directory is somehow still in the name cache remove
		 * it now.
		 */
		cache_purge(dir_ni->vn);
		err = ENOENT;
		goto err;
	}
	/*
	 * Ensure the inode has not been deleted.  Note we really should be
	 * checking that the source of the hard link has not been unlinked yet
	 * but we do not know what the source name was as the caller does not
	 * provide it to us and we do not know which name we were called for
	 * from just looking at the source vnode/inode.
	 */
	if (!ni->link_count) {
		ntfs_debug("Inode %.*s, mft_no 0x%llx has been deleted, "
				"returning ENOENT.", (int)cn->cn_namelen,
				cn->cn_nameptr,
				(unsigned long long)ni->mft_no);
		/*
		 * If the target is somehow still in the name cache remove it
		 * now.
		 */
		cache_purge(ni->vn);
		err = ENOENT;
		goto err;
	}
	/*
	 * The inode being linked to must not be a directory or device special
	 * file.  TODO: Extend the checks when we support device special files.
	 *
	 * Note this repeats the directory check done before @ni was locked,
	 * this time with the lock on @ni held.
	 */
	if (S_ISDIR(ni->mode)) {
		ntfs_debug("Mft_no 0x%llx to link to is a directory, cannot "
				"create hard link %.*s to it, returning "
				"EPERM.", (unsigned long long)ni->mft_no,
				(int)cn->cn_namelen, cn->cn_nameptr);
		err = EPERM;
		goto err;
	}
	/*
	 * Do not allow any of the system files to be linked to.
	 *
	 * For NTFS 3.0+ volumes do not allow any of the extended system files
	 * to be linked to, either.
	 *
	 * Note we specifically blacklist all system files that we make use of.
	 *
	 * TODO: What about all the new metadata files introduced with Windows
	 * Vista?  We are currently ignoring them and allowing them to be
	 * linked to...
	 */
	if (ni->file_attributes & FILE_ATTR_SYSTEM) {
		BOOL is_system = FALSE;

		if (vol->major_ver <= 1) {
			if (ni->mft_no < FILE_Extend)
				is_system = TRUE;
		} else {
			if (ni->mft_no <= FILE_Extend)
				is_system = TRUE;
			if (ni == vol->objid_ni || ni == vol->quota_ni ||
					ni == vol->usnjrnl_ni)
				is_system = TRUE;
		}
		if (is_system) {
			ntfs_debug("Mft_no 0x%llx is a%s system file, "
					"returning EPERM.",
					(unsigned long long)ni->mft_no,
					(ni->mft_no > FILE_Extend) ?
					"n extended" : "");
			err = EPERM;
			goto err;
		}
	}
	/*
	 * Ensure the inode to link to is not read-only (we already checked
	 * that @ni is not a directory).
	 */
	if (ni->file_attributes & FILE_ATTR_READONLY) {
		ntfs_debug("Mft_no 0x%llx is marked read-only, returning "
				"EPERM.", (unsigned long long)ni->mft_no);
		err = EPERM;
		goto err;
	}
	/*
	 * TODO: Test if Windows is happy with a reparse point having a hard
	 * link and if so remove this check and copy in the reparse point tag
	 * into the filename attribute below.  For mount point reparse points
	 * the reparse point is a directory so the link attempt would already
	 * have been aborted.
	 *
	 * TODO: Test if Windows is happy with an offline inode having a hard
	 * link and if so remove this check.
	 */
	if (ni->file_attributes & (FILE_ATTR_REPARSE_POINT |
			FILE_ATTR_OFFLINE)) {
		ntfs_debug("Mft_no 0x%llx is %s.  Creating hard links to such "
				"inodes is not allowed, returning EPERM.",
				(unsigned long long)ni->mft_no,
				(ni->file_attributes &
				FILE_ATTR_REPARSE_POINT) ?
				"a reparse point" : "offline");
		err = EPERM;
		goto err;
	}
	/* Check if the maximum link count is already reached. */
	if (ni->link_count >= NTFS_MAX_HARD_LINKS) {
		ntfs_debug("Cannot create hard link to mft_no 0x%llx because "
				"it already has too many hard links.",
				(unsigned long long)ni->mft_no);
		err = EMLINK;
		goto err;
	}
	/*
	 * Go ahead and create the hard link.  EEXIST is only reported at debug
	 * level whilst all other failures are logged as errors.
	 */
	err = ntfs_link_internal(ni, dir_ni, cn, FALSE, NULL, 0);
	if (err) {
		if (err != EEXIST)
			ntfs_error(vol->mp, "Failed to create hard link to "
					"mft_no 0x%llx, named %.*s, in "
					"directory mft_no 0x%llx (error %d).",
					(unsigned long long)ni->mft_no,
					(int)cn->cn_namelen, cn->cn_nameptr,
					(unsigned long long)dir_ni->mft_no,
					err);
		else
			ntfs_debug("Failed to create hard link to mft_no "
					"0x%llx, named %.*s, in directory "
					"mft_no 0x%llx (error EEXIST).",
					(unsigned long long)ni->mft_no,
					(int)cn->cn_namelen, cn->cn_nameptr,
					(unsigned long long)dir_ni->mft_no);
	} else
		ntfs_debug("Done.");
err:
	/* We are done, unlock the inode and the target directory. */
	lck_rw_unlock_exclusive(&ni->lock);
	lck_rw_unlock_exclusive(&dir_ni->lock);
	return err;
}

/**
 * ntfs_vnop_rename - rename an inode (file/directory/symbolic link/etc)
 * @a:		arguments to rename function
 *
 * @a contains:
 *	vnode_t a_fdvp;			directory containing source inode
 *	vnode_t a_fvp;			source inode to be renamed
 *	struct componentname *a_fcnp;	name of the inode to rename
 *	vnode_t a_tdvp;			target directory to move the source to
 *	vnode_t a_tvp;			target inode to be deleted
 *	struct componentname *a_tcnp;	name of the inode to delete
 *	vfs_context_t a_context;
 *
 * Rename the inode @a_fvp with name as specified in @a->a_fcnp located in the
 * directory @a->a_fdvp to the new name specified in a->a_tcnp placing it in
 * the target directory @a->a_tdvp.
 *
 * If @a->a_tvp is not NULL it means that the rename target already exists
 * which means we have to delete the rename target before we can perform the
 * rename.  In this case @a->a_tvp is the existing target inode and its name is
 * the rename target name specified in @a->a_tcnp and it is located in the
 * target directory @a->a_tdvp.
 *
 * Return 0 on success and errno on error.
 *
 * Note we always create the target name @a->a_tcnp in the POSIX namespace.
 *
 * Rename is a complicated operation because there are several special cases
 * that need consideration:
 *
 * First of all unchecked renaming can create directory loops which are not
 * attached to the file system root, e.g. take the directory tree /a/b/c and
 * perform a rename of /a/b to /a/b/c/ which if allowed to proceed would create
 * /a and b/c/b where the latter is a loop in that b points back to c which
 * points back to b.  Also this loop no longer is attached to the file system
 * directory tree and there is no way to access it any more as there is no link
 * from /a to b or c any more.  Thus we have to check for this case and return
 * EINVAL error instead of doing the rename.
Also a concurrent rename could * reshape the tree after our check so that our case would result in a loop * after all thus all tree reshaping renames must be done under a rename lock. * Note the VFS already holds the mnt_renamelock mutex for some renames but it * does not hold it in all cases we need it to be held so we still need our own * NTFS rename lock. * * Further VNOP_RENAME() must observe the following rules: * * - Source and destination must either both be directories, or both not be * directories. If this is not the case return ENOTDIR if the target is not * a directory and EISDIR if the target is a directory. * * - If the target is a directory, it must be empty. Return ENOTEMPTY if not. * * - It is not allowed to rename "/", ".", or "..". Return EINVAL if this is * attempted. * * - If the source inode and the target inode are the same and the mount is * case sensitive or the parent directories are also the same and the names * are the same do not do anything at all and return success, i.e. 0. Note * this is a violation of POSIX but it is needed to allow renaming of files * from one case to another, i.e. when a mount is not case sensitive but case * preserving (this is the default for NTFS) and the source and target inodes * and their parent directories match but the names do not match we want to * perform the rename rather than just return success. If we still find that * the target exists as a hard link rather than this being a case changing * rename we still need to abort and return success to comply with POSIX. * * FIXME: There is a bug in the VFS in that it never calls VNOP_RENAME() at * all when it is called with source and target strings being the same. This * is wrong when the string matches the name but does not have the same case, * i.e. the rename would normally succeed switching the case to the new case. * The VFS is currently forbidding this to happen. 
<rdar://problem/5485782> */ static int ntfs_vnop_rename(struct vnop_rename_args *a) { MFT_REF src_mref, dst_mref; ntfs_inode *src_dir_ni, *src_ni, *dst_dir_ni, *dst_ni; struct componentname *src_cn, *dst_cn; ntfs_volume *vol; ntfschar *ntfs_name_buf, *orig_ntfs_name, *dst_ntfs_name; ntfschar *src_ntfs_name, *target_ntfs_name; ntfs_dir_lookup_name *src_name, *dst_name; size_t orig_ntfs_name_size, dst_ntfs_name_size; signed orig_ntfs_name_len, dst_ntfs_name_len, src_ntfs_name_len; signed target_ntfs_name_len; errno_t err, err2; FILENAME_TYPE_FLAGS src_ntfs_name_type, target_ntfs_name_type; BOOL have_unlinked = FALSE; dst_name = src_name = NULL; src_dir_ni = NTFS_I(a->a_fdvp); src_ni = NTFS_I(a->a_fvp); src_cn = a->a_fcnp; dst_dir_ni = NTFS_I(a->a_tdvp); if (!src_dir_ni || !src_ni || !dst_dir_ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return EINVAL; } vol = src_dir_ni->vol; dst_cn = a->a_tcnp; if (a->a_tvp) { dst_ni = NTFS_I(a->a_tvp); if (!dst_ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return EINVAL; } ntfs_debug("Entering for source mft_no 0x%llx, name %.*s, " "parent directory mft_no 0x%llx and " "destination mft_no 0x%llx, name %.*s, parent " "directory mft_no 0x%llx.", (unsigned long long)src_ni->mft_no, (int)src_cn->cn_namelen, src_cn->cn_nameptr, (unsigned long long)src_dir_ni->mft_no, (unsigned long long)dst_ni->mft_no, (int)dst_cn->cn_namelen, dst_cn->cn_nameptr, (unsigned long long)dst_dir_ni->mft_no); if (src_ni == dst_ni && NVolCaseSensitive(vol)) { ntfs_debug("Source and destination inodes are the " "same and the volume is case " "sensitive. 
Returning success " "without doing anything as required " "by POSIX."); return 0; } } else { dst_ni = NULL; ntfs_debug("Entering for source mft_no 0x%llx, name %.*s, " "parent directory mft_no 0x%llx and no " "destination mft_no, destination name %.*s, " "parent directory mft_no 0x%llx.", (unsigned long long)src_ni->mft_no, (int)src_cn->cn_namelen, src_cn->cn_nameptr, (unsigned long long)src_dir_ni->mft_no, (int)dst_cn->cn_namelen, dst_cn->cn_nameptr, (unsigned long long)dst_dir_ni->mft_no); } /* * The source and target parent inodes must be directories which * implies they are base inodes. */ if (!S_ISDIR(src_dir_ni->mode) || !S_ISDIR(dst_dir_ni->mode)) { ntfs_debug("%s parent inode 0x%llx is not a directory, " "returning ENOTDIR.", !S_ISDIR(src_dir_ni->mode) ? "Source" : "Destination", (unsigned long long) (!S_ISDIR(src_dir_ni->mode) ? src_dir_ni->mft_no : dst_dir_ni->mft_no)); return ENOTDIR; } /* * All inodes must be locked in parent -> child order so we need to * check whether the source and target parent inodes have a * parent/child relationship with each other. * * If both are the same we have the easiest case and we just lock the * single directory inode. * * If the two are not the same we need to exclude all other tree * reshaping renames from happening as they could change the * relationship between the parent directory inodes under our feet. To * do this we use a per ntfs volume lock so we can then go on to * determine their parent/child relationship. * * Once we have established if there is a parent/child relationship we * lock the parent followed by the child and if the two are completely * unrelated the order of locking does not matter so we just lock the * destination followed by the source. 
* * Note that we take this opportunity of walking the directory tree up * to the root starting from @dst_dir_ni to also check whether @src_ni * is either equal to or a parent of @dst_dir_ni in which case a * directory loop would be caused by the rename so we have to abort it * with EINVAL error. */ if (src_dir_ni == dst_dir_ni) lck_rw_lock_exclusive(&src_dir_ni->lock); else { BOOL is_parent; lck_mtx_lock(&vol->rename_lock); err = ntfs_inode_is_parent(src_dir_ni, dst_dir_ni, &is_parent, src_ni); if (err) { lck_mtx_unlock(&vol->rename_lock); /* * @err == EINVAL means @src_ni matches or is a parent * of @dst_dir_ni. This would create a directory * loop so abort the rename but do not emit an error * message as there is no error as such. */ if (err != EINVAL) ntfs_error(vol->mp, "Failed to determine " "whether source directory " "mft_no 0x%llx is a parent of " "destination directory mft_no " "0x%llx (error %d).", (unsigned long long) src_dir_ni->mft_no, (unsigned long long) dst_dir_ni->mft_no, err); return err; } /* * If @src_dir_ni is a parent of @dst_dir_ni, lock @src_dir_ni * followed by @dst_dir_ni. * * Otherwise either @dst_dir_ni is a parent of @src_dir_ni, in * which case we have to lock @dst_dir_ni followed by * @src_dir_ni, or they are unrelated in which case lock * ordering does not matter thus we do not need to distinguish * those two cases and can simply lock @dst_dir_ni followed by * @src_dir_ni. */ if (is_parent) { lck_rw_lock_exclusive(&src_dir_ni->lock); lck_rw_lock_exclusive(&dst_dir_ni->lock); } else { lck_rw_lock_exclusive(&dst_dir_ni->lock); lck_rw_lock_exclusive(&src_dir_ni->lock); } } /* * The source cannot be the source directory and the destination cannot * be the destination directory. Also as we are about to lock the * target ensure it does not equal the source directory either. We * have already checked for the source being equal to the target * directory above so no need to check again. 
*/ if (dst_ni && dst_ni == src_dir_ni) { ntfs_debug("The source parent directory equals the target, " "returning ENOTEMPTY."); err = ENOTEMPTY; /* Set @dst_ni to NULL so we do not try to unlock it. */ dst_ni = NULL; goto err; } if (src_ni == src_dir_ni || (dst_ni && dst_ni == dst_dir_ni)) { ntfs_debug("The source and/or the target is/are equal to " "their parent directories, returning EINVAL."); err = EINVAL; /* Set @dst_ni to NULL so we do not try to unlock it. */ dst_ni = NULL; goto err; } /* * If the destination inode exists lock it so it can be unlinked * safely. For example if it is a directory we need to ensure that it * is empty and that no-one creates an entry in it whilst the delete is * in progress which requires us to hold an exclusive lock on it. */ if (dst_ni) lck_rw_lock_exclusive(&dst_ni->lock); /* * Because we have locked the parent inode of the source inode there is * no need to lock the source inode itself. We are not going to unlink * it completely, just move it from one location/name to another name * and/or place in the directory tree and the mft record will be mapped * and thus locked for exclusive access whenever we modify the inode * which will serialize any potential concurrent operations on the * inode. The only concurrent operation to watch out for is when the * source inode is a directory and someone calls VNOP_REMOVE() or * VNOP_RMDIR() on any of its child inodes. 
This can end up in the * situation where the index root node is locked in * ntfs_index_entry_delete() and hence the mft record is mapped whilst * the free space in the mft record is evaluated but then before this * information is used the mft record is unmapped and then mapped again * as part of a call to ntfs_index_entry_lock_two() and if our * VNOP_RENAME() manages to map the mft record whilst it is temporarily * unmapped during the ntfs_index_entry_lock_two() we can cause the * free space in the mft record to decrease and thus the * ntfs_index_entry_delete() may then encounter an out of space * condition when it thought it had determined the amount of free space * already and thus assume something has gone wrong and panic(). We * overcome this problem inside ntfs_index_entry_delete() by rechecking * the free space after reacquiring the lock and dealing with it as * appropriate. * * First, ensure the parent directories have not been deleted. */ if (!src_dir_ni->link_count || !dst_dir_ni->link_count) { ntfs_debug("One or both of the parent directories mft_no " "0x%llx and mft_no 0x%llx has/have been " "deleted, returning ENOENT.", (unsigned long long)src_dir_ni->mft_no, (unsigned long long)dst_dir_ni->mft_no); /* * If the directory is somehow still in the name cache remove * it now. */ if (!src_dir_ni->link_count) cache_purge(src_dir_ni->vn); if (!dst_dir_ni->link_count) cache_purge(dst_dir_ni->vn); err = ENOENT; goto err; } /* Rename is not allowed on attribute/raw inodes. */ if (NInoAttr(src_ni) || (dst_ni && NInoAttr(dst_ni))) { ntfs_debug("Source and/or target inode is/are attribute/raw " "inodes, returning EPERM."); err = EPERM; goto err; } /* Ensure the source has not been deleted by someone else already. 
*/ if (!src_ni->link_count) { ntfs_debug("Source %.*s, mft_no 0x%llx has been deleted, " "returning ENOENT.", (int)src_cn->cn_namelen, src_cn->cn_nameptr, (unsigned long long)src_ni->mft_no); /* * If the source is somehow still in the name cache remove it * now. */ cache_purge(src_ni->vn); err = ENOENT; goto err; } /* * Ensure the target has not been deleted by someone else already. If * it has been deleted pretend the caller did not specify a target. * This is what HFS+ does, too. */ if (dst_ni && !dst_ni->link_count) { ntfs_debug("Target %.*s, mft_no 0x%llx has been deleted, " "pretending no target was specified.", (int)dst_cn->cn_namelen, dst_cn->cn_nameptr, (unsigned long long)dst_ni->mft_no); /* * If the target is somehow still in the name cache remove it * now. */ cache_purge(dst_ni->vn); lck_rw_unlock_exclusive(&dst_ni->lock); dst_ni = NULL; } /* * If the destination exists need to ensure that it is a directory if * the source is a directory or that it is not a directory if the * source is not a directory. * * Also, need to ensure the target directory is empty. * * If the source and destination are the same none of these checks * apply so skip them. */ if (dst_ni && src_ni != dst_ni) { if (S_ISDIR(src_ni->mode)) { if (!S_ISDIR(dst_ni->mode)) { ntfs_debug("Source is a directory but " "destination is not, " "returning ENOTDIR"); err = ENOTDIR; goto err; } /* The target is a directory, but is it empty? 
*/ err = ntfs_dir_is_empty(dst_ni); if (err) { if (err == ENOTEMPTY) ntfs_debug("Target directory %.*s, " "mft_no 0x%llx is not " "empty, returning " "ENOTEMPTY.", (int)dst_cn->cn_namelen, dst_cn->cn_nameptr, (unsigned long long) dst_ni->mft_no); else { ntfs_error(vol->mp, "Failed to " "determine if target " "directory %.*s, " "mft_no 0x%llx is " "empty (error %d).", (int)dst_cn->cn_namelen, dst_cn->cn_nameptr, (unsigned long long) dst_ni->mft_no, err); err = EIO; } goto err; } } else /* if (!S_ISDIR(src_ni->mode)) */ { if (S_ISDIR(dst_ni->mode)) { ntfs_debug("Source is not a directory but " "destination is, returning " "EISDIR"); err = EISDIR; goto err; } } } /* Ensure none of the inodes are read-only. */ if ((!S_ISDIR(src_ni->mode) && src_ni->file_attributes & FILE_ATTR_READONLY) || (dst_ni && !S_ISDIR(dst_ni->mode) && dst_ni->file_attributes & FILE_ATTR_READONLY)) { ntfs_debug("One of the inodes involved in the rename is " "read-only, returning EPERM."); err = EPERM; goto err; } /* * Do not allow any of the system files to be renamed/deleted. * * For NTFS 3.0+ volumes do not allow any of the extended system files * to be renamed/deleted, either. * * Note we specifically blacklist all system files that we make use of. * * TODO: What about all the new metadata files introduced with Windows * Vista? We are currently ignoring them and allowing them to be * renamed/deleted... 
*/ if (src_ni->file_attributes & FILE_ATTR_SYSTEM || (dst_ni && dst_ni->file_attributes & FILE_ATTR_SYSTEM)) { BOOL is_system = FALSE; if (vol->major_ver <= 1) { if (src_ni->mft_no < FILE_Extend || (dst_ni && dst_ni->mft_no < FILE_Extend)) is_system = TRUE; } else { if (src_ni->mft_no <= FILE_Extend || (dst_ni && dst_ni->mft_no <= FILE_Extend)) is_system = TRUE; if (src_dir_ni == vol->extend_ni) { if (src_ni == vol->objid_ni || src_ni == vol->quota_ni || src_ni == vol->usnjrnl_ni) is_system = TRUE; } if (dst_dir_ni == vol->extend_ni) { if (dst_ni == vol->objid_ni || dst_ni == vol->quota_ni || dst_ni == vol->usnjrnl_ni) is_system = TRUE; } } if (is_system) { ntfs_debug("Source and/or target inode is a system " "file, returning EPERM."); err = EPERM; goto err; } } /* * If the source/target inodes are reparse points or if they are * offline we cannot rename/delete them yet. TODO: Implement this. */ if (src_ni->file_attributes & (FILE_ATTR_REPARSE_POINT | FILE_ATTR_OFFLINE) || (dst_ni && dst_ni->file_attributes & (FILE_ATTR_REPARSE_POINT | FILE_ATTR_OFFLINE))) { ntfs_error(vol->mp, "Source or target inode is a reparse " "point or offline, renaming such indoes is " "notsupported yet, returning ENOTSUP."); err = ENOTSUP; goto err; } /* * To proceed further we need to convert both the source and target * names from utf8 to Unicode. This is a good time to do both as the * conversion also checks for invalid names, too long names, etc. * * Note we allocate both source and target names with a single buffer * so we only have to call once into the allocator. 
*/ ntfs_name_buf = OSMalloc(NTFS_MAX_NAME_LEN * 2, ntfs_malloc_tag); if (!ntfs_name_buf) { ntfs_debug("Not enough memory to allocate name buffer."); err = ENOMEM; goto err; } orig_ntfs_name = ntfs_name_buf; dst_ntfs_name = (ntfschar*)((u8*)ntfs_name_buf + NTFS_MAX_NAME_LEN); dst_ntfs_name_size = orig_ntfs_name_size = NTFS_MAX_NAME_LEN; orig_ntfs_name_len = utf8_to_ntfs(vol, (u8*)src_cn->cn_nameptr, src_cn->cn_namelen, &orig_ntfs_name, &orig_ntfs_name_size); if (orig_ntfs_name_len < 0) { err = -orig_ntfs_name_len; if (err == ENAMETOOLONG) ntfs_debug("Failed (source name is too long)."); else ntfs_error(vol->mp, "Failed to convert name to " "Unicode (error %d).", err); goto free_err; } dst_ntfs_name_len = utf8_to_ntfs(vol, (u8*)dst_cn->cn_nameptr, dst_cn->cn_namelen, &dst_ntfs_name, &dst_ntfs_name_size); if (dst_ntfs_name_len < 0) { err = -dst_ntfs_name_len; if (err == ENAMETOOLONG) ntfs_debug("Failed (target name is too long)."); else ntfs_error(vol->mp, "Failed to convert target name to " "Unicode (error %d).", err); goto free_err; } /* * We need to make sure the source still has the name specified in * @src_cn. It could have been unlinked or renamed before we took the * lock on the parent directory. * * To do this, look up the converted source name in the source parent * directory index. */ err = ntfs_lookup_inode_by_name(src_dir_ni, orig_ntfs_name, orig_ntfs_name_len, &src_mref, &src_name); if (err) { if (err != ENOENT) { ntfs_error(vol->mp, "Failed to find source name in " "directory (error %d).", err); goto free_err; } src_enoent: /* * The source name does not exist in the source parent * directory. * * This means someone renamed or deleted the name from the * directory before we managed to take the locks. */ ntfs_debug("Source has been renamed or deleted already, " "returning ENOENT."); /* * If the source is somehow still in the name cache remove it * now. 
*/ cache_purge(src_ni->vn); err = ENOENT; goto free_err; } /* * We found the source name in the directory index but does it still * point to the same mft record? The sequence number check ensures the * inode was not deleted and recreated with the same name and the same * mft record number. */ if (src_mref != MK_MREF(src_ni->mft_no, src_ni->seq_no)) goto src_enoent; /* * We now have verified everything to do with the source. Set the * source name to be the correctly cased name (unless it was correctly * cased already in which case @src_name will be NULL and * @orig_ntfs_name contains the correcly cased name). */ if (src_name) { src_ntfs_name = src_name->name; src_ntfs_name_len = src_name->len; src_ntfs_name_type = src_name->type; } else { src_ntfs_name = orig_ntfs_name; src_ntfs_name_len = orig_ntfs_name_len; src_ntfs_name_type = 0; } /* * Now we need to verify the target. In an ideal world, either it has * to be specified in @dst_ni in which case it also has to exist in the * destination parent directory @dst_dir_ni, or @dst_ni has to be NULL * in which case the target name must not exist in the destination * parent directory. * * But because the VFS obtains the target before we take the necessary * locks it is possible for the above ideal not to be true. There are * several possible cases: * * - Target was specified but deleted. We have detected this case * above and have set @dst_ni to NULL thus we do not need to worry * about this case any more. * - Target was not specified but another inode was created with the * same name. In this case we return EEXIST which is what HFS+ does, * too. * - Target was specified but renamed. This means we may or may not * find a directory entry of the same name. If we do not find a * matching directory entry we know the target has been renamed thus * we can simply set @dst_ni to NULL and pretend it does not exist. 
* If we do find a directory entry that matches in name but does not * point to the same mft reference we know the target was renamed and * another inode was created with the same name. In this case we * return EEXIST which is what HFS+ does, too. */ err = ntfs_lookup_inode_by_name(dst_dir_ni, dst_ntfs_name, dst_ntfs_name_len, &dst_mref, &dst_name); if (err) { if (err != ENOENT) { ntfs_error(vol->mp, "Failed to find target name in " "directory (error %d).", err); goto free_err; } /* * The destination name does not exist in the destination * parent directory which means that the target must have been * renamed to something else before we took the locks. We * treat this the same as if had been deleted, i.e. we pretend * the caller did not specify a target. */ if (dst_ni) { ntfs_debug("Target %.*s, mft_no 0x%llx has been " "renamed, pretending no target was " "specified.", (int)dst_cn->cn_namelen, dst_cn->cn_nameptr, (unsigned long long)dst_ni->mft_no); lck_rw_unlock_exclusive(&dst_ni->lock); dst_ni = NULL; } } else /* if (!err) */ { /* * The destination name exists in the directory index. * * If the caller did not specify it in @dst_ni or the * destination inode has been deleted (in which case we set * @dst_ni to NULL above) or the target was renamed and another * inode was created with the same name return error EEXIST * which is what HFS+ does, too. * * FIXME: Technically it would probably be more correct to get * the new target ntfs inode and restart the function but at * least for now stick with the same behaviour as HFS+. */ if (!dst_ni || dst_mref != MK_MREF(dst_ni->mft_no, dst_ni->seq_no)) { ntfs_debug("Target name %.*s exists but %s, returning " "EEXIST.", (int)dst_cn->cn_namelen, dst_cn->cn_nameptr, !dst_ni ? 
"target inode was not specified or it " "was already deleted" : "does not match specified target " "inode (it must have been renamed and " "a new inode created with the same " "name)"); err = EEXIST; goto free_err; } /* * We still need the destination name thus use a new variable * to store the correctly cased target name. */ if (!dst_name) { target_ntfs_name = dst_ntfs_name; target_ntfs_name_len = dst_ntfs_name_len; target_ntfs_name_type = 0; } else { target_ntfs_name = dst_name->name; target_ntfs_name_len = dst_name->len; target_ntfs_name_type = dst_name->type; } /* * We have verified everything to do with the target. We now * need to unlink it unless the source and the target are the * same, i.e. we are changing the case of an existing filename. * We need to distinguish two cases. If the volume is mounted * case sensitive or it is not case sensitive and the source * and destination names do not match (i.e. they are different * hard links to the same inode) we do not proceed and return * success (this is required by POSIX). Otherwise the volume * is not case sensitive and the source and destination names * match (i.e. they are the same hard link) and we can either * return success when the source and destination names are * identical (same case) or we can proceed with the rename when * the case differs. * * Note we have caught the case of the inodes being equal and * the volume being mounted case sensitive earlier on so we now * know that the volume is not mounted case sensitive. */ if (src_ni == dst_ni) { /* * If the two names are not the same hardlink return * success not doing anything as required by POSIX. * * Note we do not need to care about case when * comparing because we are comparing the correctly * cased names. 
*/ if (src_ntfs_name_len != target_ntfs_name_len || bcmp(src_ntfs_name, target_ntfs_name, src_ntfs_name_len * sizeof(ntfschar))) { ntfs_debug("Source and target inodes are the " "same but the source and " "target names are different " "hard links. Returning " "success without doing " "anything as required by " "POSIX."); goto done; } /* * The names are the same hard link. If the existing * name is the same as the destination name (i.e. the * target name before case correction) there is * nothing to do and we can return success. */ if (src_ntfs_name_len == dst_ntfs_name_len && !bcmp(src_ntfs_name, dst_ntfs_name, src_ntfs_name_len * sizeof(ntfschar))) { ntfs_debug("Source and destination are " "identical so no need to do " "anything. Returning " "success."); goto done; } /* * The names are the same hard link but they differ in * case thus there is no target to be removed as it * will be removed as part of the actual rename when * the source name is removed. */ } else /* if (dst_ni && src_ni != dst_ni) */ { /* * The source and the target are not the same thus now * unlink the target. We can do this atomically before * adding the new entry because both the parent * directory inode and the target inode are locked for * writing thus no-one can access either until we have * finished. FIXME: The only pitfal is what happens if * the rename fails after we have removed the target? * We just ignore this problem for now and let the * target disappear. This is what HFS does also so at * least we are not the only non-POSIX conformant file * system on OS X... In fact as long as we return EIO * on error once we have unlinked the target POSIX * still considers this ok. (This is what HFS does, * too.) * * Note we do not set @is_rename to true here as this * is just a normal unlink operation. 
*/ err = ntfs_unlink_internal(dst_dir_ni, dst_ni, target_ntfs_name, target_ntfs_name_len, target_ntfs_name_type, FALSE); if (err) { ntfs_error(vol->mp, "Rename failed because " "the target mft_no 0x%llx " "could not be removed from " "directory mft_no 0x%llx " "(error %d).", (unsigned long long) dst_ni->mft_no, (unsigned long long) dst_dir_ni->mft_no, err); goto free_err; } /* * Set @have_unlinked to true so that we know that we * have to return error EIO from now on if we fail to * complete the rename. */ have_unlinked = TRUE; } /* * Release the lock on the destination inode and set it to NULL * so we assume it does not exist from now on. */ lck_rw_unlock_exclusive(&dst_ni->lock); dst_ni = NULL; } /* * We dealt with the target if there was one thus now we can begin the * actual rename. * * To start with we lock the source inode for writing which allows us * to split the removal of the source name and the addition of the * destination name into two events. * * Note we cheat a little and set @dst_ni to @src_ni so that @src_ni is * unlocked at the end of the function/on error. */ if (dst_ni) panic("%s(): dst_ni\n", __FUNCTION__); dst_ni = src_ni; lck_rw_lock_exclusive(&src_ni->lock); /* * As the source inode is now locked for writing we can perform the * rename in two stages. First we remove the source name and then we * add the destination name both to the mft record of the inode and to * the parent directory indexes. We can do this atomically because * both the parent directory and the source inode are locked for * writing thus no-one can access either until we are finished. * * As removal of the source name can leave the source inode with a zero * link count we artificially increment the link count here to ensure * it cannot reach zero. This is required to guarantee that the unlink * of the source name will remove the filename attribute and to ensure * that the object id is not deleted. 
Finally, this also ensures * no-one can ever see the inode in a deleted state (although this * should never happen anyway as we have the inode locked for writing). * * Note the link count in the ntfs inode is unsigned int type, i.e. at * least 32-bit, to allow us to overflow 16-bits here if needed. In * this way we do not need to worry about the link count overflowing * here which makes the code simpler. * * We set @is_rename to true as we have elevated the link count by one. */ src_ni->link_count++; err = ntfs_unlink_internal(src_dir_ni, src_ni, src_ntfs_name, src_ntfs_name_len, src_ntfs_name_type, TRUE); if (err) { ntfs_error(vol->mp, "Rename failed because the source name, " "%.*s mft_no 0x%llx could not be removed from " "directory mft_no 0x%llx (error %d).", (int)src_cn->cn_namelen, src_cn->cn_nameptr, (unsigned long long)src_ni->mft_no, (unsigned long long)src_dir_ni->mft_no, err); goto dec_err; } /* * The source name is now removed both from the source parent directory * index and from the mft record of the source inode. * * Now add the destination name as a hard link to the mft record of the * source inode and to the destination parent directory index. * * Calling ntfs_link_internal() also sets the "needs to be archived" * bit on the ntfs inode unless we are renaming an unencrypted * directory inode so we do not need to worry about setting it * ourselves. */ err = ntfs_link_internal(src_ni, dst_dir_ni, dst_cn, TRUE, dst_ntfs_name, dst_ntfs_name_len); if (err) goto link_err; /* We are done, decrement the link count back to its correct value. */ src_ni->link_count--; done: if (src_name) OSFree(src_name, sizeof(*src_name), ntfs_malloc_tag); if (dst_name) OSFree(dst_name, sizeof(*dst_name), ntfs_malloc_tag); OSFree(ntfs_name_buf, NTFS_MAX_NAME_LEN * 2, ntfs_malloc_tag); err: /* If the destination inode existed we locked it so unlock it now. */ if (dst_ni) lck_rw_unlock_exclusive(&dst_ni->lock); /* Drop the source and destination parent directory inode locks. 
*/ lck_rw_unlock_exclusive(&src_dir_ni->lock); if (src_dir_ni != dst_dir_ni) { lck_rw_unlock_exclusive(&dst_dir_ni->lock); lck_mtx_unlock(&vol->rename_lock); } ntfs_debug("Done (error %d).", (int)err); return err; link_err: ntfs_error(vol->mp, "Rename failed because the destination name %.*s, " "mft_ni 0x%llx could not be added to directory mft_no " "0x%llx (error %d).", (int)dst_cn->cn_namelen, dst_cn->cn_nameptr, (unsigned long long)src_ni->mft_no, (unsigned long long)dst_dir_ni->mft_no, err); /* * Try to roll back the unlink of the source by creating a new hard * link with the old name. */ err2 = ntfs_link_internal(src_ni, src_dir_ni, src_cn, TRUE, orig_ntfs_name, orig_ntfs_name_len); if (err2) { ntfs_error(vol->mp, "Failed to roll back partially completed " "rename (error %d). Leaving corrupt " "metadata and returning EIO. Unmount and run " "chkdsk.", err2); NVolSetErrors(vol); err = EIO; } else ntfs_debug("Re-linking of source name succeeded."); dec_err: src_ni->link_count--; free_err: if (have_unlinked) { /* We unlinked an existing target, need to re-link it now. */ ntfs_debug("Rename failed but the target was already unlinked " "and relinking it is not implemented (yet), " "returning EIO. (Given you were renaming " "over it chances are you did not care about " "the target anyway.)"); err = EIO; } goto done; } /** * ntfs_vnop_mkdir - create a directory * @a: arguments to mkdir function * * @a contains: * vnode_t a_dvp; directory in which to create the dir * vnode_t *a_vpp; destination pointer for the created dir * struct componentname *a_cnp; name of the directory to create * struct vnode_attr *a_vap; attributes to set on the created dir * vfs_context_t a_context; * * Create a directory with name as specified in @a->a_cnp in the directory * specified by the vnode @a->a_dvp. Assign the attributes @a->a_vap to the * created directory. Finally return the vnode of the created directory in * *@a->a_vpp. * * Return 0 on success and errno on error. 
*
 * Note we always create directory names in the POSIX namespace.
 */
static int ntfs_vnop_mkdir(struct vnop_mkdir_args *a)
{
	errno_t err;

#ifdef DEBUG
	{
		/* The parent inode is only looked up for the debug message. */
		ntfs_inode *parent_ni = NTFS_I(a->a_dvp);

		if (parent_ni)
			ntfs_debug("Creating a directory named %.*s in "
					"directory mft_no 0x%llx.",
					(int)a->a_cnp->cn_namelen,
					a->a_cnp->cn_nameptr,
					(unsigned long long)parent_ni->mft_no);
	}
#endif
	/* All the real work is delegated to ntfs_create(). */
	err = ntfs_create(a->a_dvp, a->a_vpp, a->a_cnp, a->a_vap, FALSE);
	ntfs_debug("Done (error %d).", (int)err);
	return err;
}

/**
 * ntfs_vnop_rmdir - remove an empty directory
 * @a:		arguments to rmdir function
 *
 * @a contains:
 *	vnode_t a_dvp;			parent directory to remove from
 *	vnode_t a_vp;			directory to remove
 *	struct componentname *a_cnp;	name of the directory to remove
 *	vfs_context_t a_context;
 *
 * Make sure that the directory with vnode @a->a_vp and name as specified in
 * @a->a_cnp is empty and if so remove it from its parent directory with vnode
 * @a->a_dvp.
 *
 * Return 0 on success and errno on error.
 *
 * Note that if the name of the directory to be removed is in the WIN32 or DOS
 * namespaces, both the WIN32 and the corresponding DOS names are removed.
 *
 * Note that this function only removes the directory entry, i.e. it does not
 * remove the name, however it does decrement the hard link count to zero.
 * This is so that the directory can be undeleted and its original name
 * restored.  In any case, we do not actually delete the inode here as it may
 * still be open and UNIX semantics require an unlinked inode to be still
 * accessible through already opened file descriptors.  When the last file
 * descriptor is closed, we cause the inode to be deleted when the VFS
 * notifies us of the last close by calling VNOP_INACTIVE(), i.e.
 * ntfs_vnop_inactive().
*/
static int ntfs_vnop_rmdir(struct vnop_rmdir_args *a)
{
	ntfs_inode *parent_ni = NTFS_I(a->a_dvp);
	ntfs_inode *victim_ni = NTFS_I(a->a_vp);
	errno_t err;

	ntfs_debug("Entering.");
	if (parent_ni && victim_ni) {
		/* Both inodes are attached, hand over to ntfs_unlink(). */
		err = ntfs_unlink(parent_ni, victim_ni, a->a_cnp, 0, TRUE);
		ntfs_debug("Done (error %d).", (int)err);
	} else {
		/* One of the vnodes has no ntfs inode attached to it. */
		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
		err = EINVAL;
	}
	return err;
}

/**
 * ntfs_vnop_symlink - create a symbolic link
 * @a:		arguments to symlink function
 *
 * @a contains:
 *	vnode_t a_dvp;			directory to create the symlink in
 *	vnode_t *a_vpp;			destination pointer for the new symlink
 *	struct componentname *a_cnp;	name of the symlink to create
 *	struct vnode_attr *a_vap;	attributes to set on the new symlink
 *	char *a_target;			path to point the created symlink at
 *	vfs_context_t a_context;
 *
 * Create a symbolic link to the path string @a->a_target with name as
 * specified in @a->a_cnp in directory specified by the vnode @a->a_dvp.
 * Assign the attributes @a->a_vap to the created symlink.  Finally return the
 * vnode of the created symlink in *@a->a_vpp.
 *
 * We implement symbolic links the same way as SFM, i.e. a symbolic link is a
 * regular file as far as NTFS is concerned with an AFP_AfpInfo named stream
 * containing the finder info with the type set to 'slnk' and the creator set
 * to 'rhap'.  This is basically how HFS+ stores symbolic links, too.
 *
 * Return 0 on success and errno on error.
 *
 * Note, since IEEE Std 1003.1-2001 does not require any association of file
 * times with symbolic links, there is no requirement that file times be
 * updated by symlink(). - This is what POSIX says about updating times in
 * symlink() thus we do not update any of the times except as an indirect
 * result of calling ntfs_write() on the symbolic link inode.
*/
static int ntfs_vnop_symlink(struct vnop_symlink_args *a)
{
	uio_t uio;
	ntfs_inode *dir_ni, *ni, *raw_ni;
	int err, err2;
	unsigned len;

	dir_ni = NTFS_I(a->a_dvp);
	if (!dir_ni) {
		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
		return EINVAL;
	}
	ntfs_debug("Creating a symbolic link named %.*s in directory mft_no "
			"0x%llx and pointing it at path \"%s\".",
			(int)a->a_cnp->cn_namelen, a->a_cnp->cn_nameptr,
			(unsigned long long)dir_ni->mft_no, a->a_target);
	len = strlen(a->a_target);
	/* Zero length symbolic links are not allowed. */
	if (!len || len > MAXPATHLEN) {
		err = EINVAL;
		if (len)
			err = ENAMETOOLONG;
		/* @len is unsigned thus it is printed using %u. */
		ntfs_error(dir_ni->vol->mp, "Invalid symbolic link target "
				"length %u, returning %s.", len,
				len ? "ENAMETOOLONG" : "EINVAL");
		return err;
	}
retry:
	/* Create the symbolic link inode. */
	err = ntfs_create(dir_ni->vn, a->a_vpp, a->a_cnp, a->a_vap, TRUE);
	if (err) {
		if (err != EEXIST)
			ntfs_error(dir_ni->vol->mp, "Failed to create "
					"symbolic link named %.*s in "
					"directory mft_no 0x%llx and pointing "
					"to path \"%s\" (error %d).",
					(int)a->a_cnp->cn_namelen,
					a->a_cnp->cn_nameptr,
					(unsigned long long)dir_ni->mft_no,
					a->a_target, err);
		else
			ntfs_debug("Failed to create symbolic link named %.*s "
					"in directory mft_no 0x%llx and "
					"pointing to path \"%s\" (error "
					"EEXIST).", (int)a->a_cnp->cn_namelen,
					a->a_cnp->cn_nameptr,
					(unsigned long long)dir_ni->mft_no,
					a->a_target);
		return err;
	}
	/* Note the ntfs inode @ni is locked for writing. */
	ni = NTFS_I(*a->a_vpp);
	/* Make sure no-one deleted it under our feet. */
	if (NInoDeleted(ni)) {
		/* Remove the inode from the name cache. */
		cache_purge(ni->vn);
		/* Release the vnode and try the create again. */
		lck_rw_unlock_exclusive(&ni->lock);
		vnode_put(ni->vn);
		goto retry;
	}
	/*
	 * Create a uio and attach the target path to it so we can use
	 * ntfs_write() to do the work.
	 */
	uio = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE);
	if (!uio) {
		err = ENOMEM;
		ntfs_error(dir_ni->vol->mp, "Failed to allocate UIO.");
		goto err;
	}
	err = uio_addiov(uio, (uintptr_t)a->a_target, len);
	if (err)
		panic("%s(): Failed to attach target path buffer to UIO "
				"(error %d).", __FUNCTION__, err);
	/*
	 * FIXME: At present the kernel does not allow VLNK vnodes to use the
	 * UBC (<rdar://problem/5794900>) thus we need to use a shadow VREG
	 * vnode to do the actual write of the symbolic link data.  Fortunately
	 * we already implemented this functionality for compressed files where
	 * we need to read the compressed data using a shadow vnode so we use
	 * the same implementation here, thus our shadow vnode is a raw inode.
	 */
	err = ntfs_raw_inode_get(ni, LCK_RW_TYPE_EXCLUSIVE, &raw_ni);
	if (err) {
		ntfs_error(ni->vol->mp, "Failed to get raw inode (error %d).",
				err);
		/*
		 * The uio is only freed further down on the path that performs
		 * the write thus we have to free it here or it is leaked.
		 */
		uio_free(uio);
		goto err;
	}
	if (!NInoRaw(raw_ni))
		panic("%s(): Requested raw inode but got non-raw one.\n",
				__FUNCTION__);
	/*
	 * Write the symbolic link target to the created inode.  We pass in
	 * IO_UNIT as we want an atomic i/o operation.
	 *
	 * FIXME: ntfs_write() does not always honour the IO_UNIT flag so we
	 * still have to test for partial writes.
	 */
	err = ntfs_write(raw_ni, uio, IO_UNIT, TRUE);
	/*
	 * Update the sizes in the base inode.  Note there is no need to lock
	 * @raw_ni->size_lock as the values cannot change at present as we are
	 * holding the inode lock @raw_ni->lock for write.
	 */
	lck_spin_lock(&ni->size_lock);
	ni->initialized_size = raw_ni->initialized_size;
	ni->data_size = raw_ni->data_size;
	ni->allocated_size = raw_ni->allocated_size;
	ni->compressed_size = raw_ni->compressed_size;
	lck_spin_unlock(&ni->size_lock);
	if (NInoNonResident(raw_ni))
		NInoSetNonResident(ni);
	lck_rw_unlock_exclusive(&raw_ni->lock);
	vnode_put(raw_ni->vn);
	/* Check for write errors. */
	if (uio_resid(uio) && !err)
		err = EIO;
	/* We no longer need the uio. */
	uio_free(uio);
	if (!err) {
		lck_rw_unlock_exclusive(&ni->lock);
		ntfs_debug("Done.");
		return 0;
	}
	/* Write failed or was partial, unlink the created symbolic link. */
	ntfs_error(dir_ni->vol->mp, "Failed to write target path to symbolic "
			"link inode (error %d).", err);
err:
	/*
	 * Drop the inode lock before unlinking the half-made symbolic link
	 * again.  A failure to unlink is logged and flagged on the volume but
	 * the error returned to the caller is always the original one.
	 */
	lck_rw_unlock_exclusive(&ni->lock);
	err2 = ntfs_unlink(dir_ni, ni, a->a_cnp, 0, FALSE);
	if (err2) {
		ntfs_error(dir_ni->vol->mp, "Failed to unlink symbolic link "
				"inode in error code path (error %d).  Run "
				"chkdsk.", err2);
		NVolSetErrors(dir_ni->vol);
	}
	vnode_put(ni->vn);
	return err;
}

/**
 * ntfs_vnop_readdir - read directory entries into a supplied buffer
 * @a:		arguments to readdir function
 *
 * @a contains:
 *	vnode_t a_vp;		directory vnode to read directory entries from
 *	uio_t a_uio;		destination in which to return the entries
 *	int a_flags;		flags describing the entries to return
 *	int *a_eofflag;		return end of file status (can be NULL)
 *	int *a_numdirent;	return number of entries returned (can be NULL)
 *	vfs_context_t a_context;
 *
 * See ntfs_dir.c::ntfs_readdir() for a description of the implemented
 * features.  In addition to those described features VNOP_READDIR() should
 * also implement the below features.
* * @a->a_flags can have the following bits set: * VNODE_READDIR_EXTENDED use extended directory entries * VNODE_READDIR_REQSEEKOFF requires seek offset (cookies) * VNODE_READDIR_SEEKOFF32 seek offset values should be 32-bit * * When VNODE_READDIR_EXTENDED is set, the format of the returned directory * entry structures changes to the direntry structure which is defined as: * * u64 d_ino; inode number of entry * u64 d_seekoff; seek offset (optional, used by servers) * u16 d_reclen; length of this record * u16 d_namlen; length of string in d_name * u8 d_type; inode type (one of DT_DIR, DT_REG, etc) * char d_name[MAXPATHLEN]; null terminated filename * * If VNODE_READDIR_REQSEEKOFF is set, VNODE_READDIR_EXTENDED must also be set, * and it means that the seek offset (d_seekoff) in the direntry structure must * be set. If VNODE_READDIR_REQSEEKOFF is not set, the seek offset can be set * to zero as the caller will ignore it. * * If VNODE_READDIR_SEEKOFF32 is set, both VNODE_READDIR_EXTENDED and * VNODE_READDIR_REQSEEKOFF must be set and it means that the seek offset must * be at most 32-bits, i.e. the most significant 32-bits of d_seekoff must be * zero. * * All the VNODE_READDIR_* flags are only ever set by the NFS server and given * we do not yet support NFS exporting of NTFS volumes we just abort if any of * them are set. * * If the directory is deleted-but-in-use, we do not synthesize entries for "." * and "..". * * Return 0 on success and the error code on error. */ static int ntfs_vnop_readdir(struct vnop_readdir_args *a) { user_ssize_t start_count; ntfs_inode *dir_ni = NTFS_I(a->a_vp); errno_t err; if (!dir_ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return EINVAL; } ntfs_debug("Entering for directory inode 0x%llx.", (unsigned long long)dir_ni->mft_no); /* * FIXME: Is this check necessary? Can we ever get here for * non-directories? All current callers (except the NFS server) ensure * that @dir_ni is a directory. 
We do not currently support NFS * exporting so this should indeed definitely never trigger but leave * it here as a kind of debug assertion. */ if (!S_ISDIR(dir_ni->mode)) { ntfs_debug("Not a directory, returning ENOTDIR."); return ENOTDIR; } if (a->a_flags) { ntfs_error(dir_ni->vol->mp, "None of the VNODE_READDIR_* " "flags are supported yet, sorry."); return ENOTSUP; } lck_rw_lock_shared(&dir_ni->lock); /* Do not allow messing with the inode once it has been deleted. */ if (NInoDeleted(dir_ni)) { /* Remove the inode from the name cache. */ cache_purge(dir_ni->vn); lck_rw_unlock_shared(&dir_ni->lock); ntfs_debug("Directory is deleted."); return ENOENT; } start_count = uio_resid(a->a_uio); err = ntfs_readdir(dir_ni, a->a_uio, a->a_eofflag, a->a_numdirent); /* * Update the last_access_time (atime) if something was read. * * Skip the update if atime updates are disabled via the noatime mount * option or the volume is read only. */ if (uio_resid(a->a_uio) < start_count && !NVolReadOnly(dir_ni->vol) && !(vfs_flags(dir_ni->vol->mp) & MNT_NOATIME)) { dir_ni->last_access_time = ntfs_utc_current_time(); NInoSetDirtyTimes(dir_ni); } lck_rw_unlock_shared(&dir_ni->lock); ntfs_debug("Done (error %d).", (int)err); return err; } /** * ntfs_vnop_readdirattr - * */ static int ntfs_vnop_readdirattr(struct vnop_readdirattr_args *a) { errno_t err; ntfs_debug("Entering."); (void)nop_readdirattr(a); // TODO: err = ENOTSUP; ntfs_debug("Done (error %d).", (int)err); return err; } /** * ntfs_vnop_readlink - read the contents of a symbolic link * @a: arguments to readlink function * * @a contains: * vnode_t a_vp; vnode of symbolic link whose data to read * uio_t *a_uio; destination in which to return the read data * vfs_context_t a_context; * * Read the path stored in the symbolic link vnode @a->a_vp and return it in * the destination buffer pointed to by @a->a_uio. * * uio_resid(@a->a_uio) is the maximum number of bytes to read and * uio_offset(@a->a_uio) must be zero. 
*
 * We implement symbolic links the same way as SFM, i.e. a symbolic link is a
 * regular file as far as NTFS is concerned with an AFP_AfpInfo named stream
 * containing the finder info with the type set to 'slnk' and the creator set
 * to 'rhap'.  This is basically how HFS+ stores symbolic links, too.
 *
 * Thus obtaining the symbolic link target is a simple matter of calling
 * ntfs_read() on the symbolic link inode.
 *
 * TODO: We may wish to add support for other symbolic link types found on NTFS
 * volumes such as the methods used by:
 *	- Windows Services for Unix (SFU) and the userspace ntfsmount driver,
 *	- SMB/Samba (when run on a file system without native symbolic links)
 *	- Cygwin
 *
 * It may also be worth supporting reparse point based symbolic links but those
 * are a lot trickier if at all possible as they contain information that
 * cannot be resolved without access to the Windows registry and potentially
 * without access to the Windows Domain/Active Directory.
 *
 * Return 0 on success and errno on error.
 *
 * Note, since IEEE Std 1003.1-2001 does not require any association of file
 * times with symbolic links, there is no requirement that file times be
 * updated by readlink().
 */
static int ntfs_vnop_readlink(struct vnop_readlink_args *a)
{
	uio_t uio = a->a_uio;
	ntfs_inode *ni, *raw_ni;
	s64 link_size;
	user_ssize_t resid_before;
	errno_t err;

	ni = NTFS_I(a->a_vp);
	if (!ni) {
		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
		return EINVAL;
	}
	ntfs_debug("Entering for mft_no 0x%llx.",
			(unsigned long long)ni->mft_no);
	/*
	 * Protect against changes in initialized_size and thus against
	 * truncation also and against deletion/rename.
	 */
	lck_rw_lock_shared(&ni->lock);
	/* Do not allow messing with the inode once it has been deleted. */
	if (!ni->link_count || NInoDeleted(ni)) {
		/* Remove the inode from the name cache. */
		cache_purge(ni->vn);
		err = ENOENT;
		goto unlock;
	}
	if (!S_ISLNK(ni->mode)) {
		ntfs_debug("Not a symbolic link, returning EINVAL.");
		err = EINVAL;
		goto unlock;
	}
	if (uio_offset(uio)) {
		ntfs_error(ni->vol->mp, "uio_offset(uio) is not zero, "
				"returning EINVAL.");
		err = EINVAL;
		goto unlock;
	}
	/*
	 * FIXME: At present the kernel does not allow VLNK vnodes to use the
	 * UBC (<rdar://problem/5794900>) thus we need to use a shadow VREG
	 * vnode to do the actual read of the symbolic link data.  Fortunately
	 * we already implemented this functionality for compressed files where
	 * we need to read the compressed data using a shadow vnode so we use
	 * the same implementation here, thus our shadow vnode is a raw inode.
	 *
	 * Doing this has the unfortunate consequence that if the symbolic link
	 * inode is compressed or encrypted we cannot read it as we are already
	 * using the raw inode and we can only have one raw inode.
	 */
	lck_spin_lock(&ni->size_lock);
	link_size = ni->data_size;
	lck_spin_unlock(&ni->size_lock);
	/* Zero length symbolic links are not allowed. */
	if (!link_size || link_size > MAXPATHLEN) {
		ntfs_error(ni->vol->mp, "Invalid symbolic link size %lld in "
				"mft_no 0x%llx, returning EINVAL.",
				(long long)link_size,
				(unsigned long long)ni->mft_no);
		err = EINVAL;
		goto unlock;
	}
	/* Needed later both to detect and to undo a partial read. */
	resid_before = uio_resid(uio);
	err = ntfs_raw_inode_get(ni, LCK_RW_TYPE_SHARED, &raw_ni);
	if (err) {
		ntfs_error(ni->vol->mp, "Failed to get raw inode (error %d).",
				err);
		goto unlock;
	}
	if (!NInoRaw(raw_ni))
		panic("%s(): Requested raw inode but got non-raw one.\n",
				__FUNCTION__);
	/* The raw inode has to agree with the base inode about the size. */
	lck_spin_lock(&raw_ni->size_lock);
	if (link_size > ubc_getsize(raw_ni->vn) ||
			link_size != raw_ni->data_size)
		panic("%s(): size (0x%llx) > ubc_getsize(raw_ni->vn, 0x%llx) "
				"|| size != raw_ni->data_size (0x%llx)\n",
				__FUNCTION__, (unsigned long long)link_size,
				(unsigned long long)ubc_getsize(raw_ni->vn),
				(unsigned long long)raw_ni->data_size);
	lck_spin_unlock(&raw_ni->size_lock);
	/* Perform the actual read of the symbolic link data into the uio. */
	err = ntfs_read(raw_ni, uio, 0, TRUE);
	lck_rw_unlock_shared(&raw_ni->lock);
	vnode_put(raw_ni->vn);
	/*
	 * A successful read that neither filled the uio nor returned the whole
	 * of the symbolic link data is a partial read and is turned into an
	 * error.  If the uio was simply too small to hold all of the data, it
	 * is full at this point and we return the truncated result.
	 *
	 * FIXME: Should we be trying to continue a partial read in case we can
	 * complete it with multiple calls to ntfs_read()?
	 */
	if (!err && uio_resid(uio) &&
			resid_before - uio_resid(uio) != link_size) {
		ntfs_debug("ntfs_read() returned a partial read, "
				"pretending the read never happened.");
		err = EIO;
	}
	/* On any failure reset @uio pretending that the read never happened. */
	if (err) {
		uio_setoffset(uio, 0);
		uio_setresid(uio, resid_before);
		ntfs_error(ni->vol->mp, "Failed to read symbolic link "
				"data (error %d).", err);
	}
	ntfs_debug("Done (error %d).", (int)err);
unlock:
	lck_rw_unlock_shared(&ni->lock);
	return err;
}

/**
 * ntfs_mft_record_free_all - free clusters referenced by an mft record
 * @base_ni:	base ntfs inode to which the (extent) inode @ni and @m belong
 * @ni:		ntfs inode for which to free all clusters
 * @m:		mft record for which to free all clusters
 *
 * For the ntfs inode @ni and its mft record @m, iterate over all attributes in
 * the mft record and free all clusters referenced by the attributes.  @base_ni
 * is the base ntfs inode to which @ni and @m belong.
 *
 * Also, mark the mft record as not in use, increment its sequence number and
 * mark it dirty to ensure it gets written out later.
 *
 * When any operations fail this function notifies the user about it and marks
 * the volume dirty but does not return an error code as the caller can proceed
 * regardless without caring if some clusters failed to be freed.  A later
 * chkdsk will find them and free them and in the mean time they just waste
 * some space on the volume.
*/
static void ntfs_mft_record_free_all(ntfs_inode *base_ni, ntfs_inode *ni,
		MFT_RECORD *m)
{
	ntfs_volume *vol = base_ni->vol;
	ATTR_RECORD *a;
	errno_t err;
	ntfs_runlist rl;

	/*
	 * Note the loop statement itself has no termination condition.  The
	 * attribute pointer @a is derived from on-disk length fields and thus
	 * must be validated before it is dereferenced.  Therefore both the test
	 * for the AT_END marker and the corruption checks are done at the top
	 * of the loop body, in that order of safety.
	 */
	for (a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset)); ;
			a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
		BOOL in_bounds;

		/* Does @a lie inside the used part of a sane mft record? */
		in_bounds = (u8*)a >= (u8*)m &&
				(u8*)a <= (u8*)m +
				le32_to_cpu(m->bytes_in_use) &&
				le32_to_cpu(m->bytes_in_use) <=
				le32_to_cpu(m->bytes_allocated);
		/* Reaching the end marker is the normal way out of the loop. */
		if (in_bounds && a->type == AT_END)
			break;
		/*
		 * A zero length attribute would make us loop forever.  This
		 * is checked after the AT_END test so that the length field
		 * of the end marker is never examined.
		 */
		if (!in_bounds || !a->length) {
			ntfs_warning(vol->mp, "Found corrupt attribute whilst "
					"releasing deleted mft_no 0x%llx.  "
					"Run chkdsk to recover lost space and "
					"fix any other inconsistencies.",
					(unsigned long long)ni->mft_no);
			NVolSetErrors(vol);
			break;
		}
		/*
		 * For most resident attribute records, there is nothing we
		 * need to do as they do not reference any clusters outside the
		 * mft record itself.
		 */
		if (!a->non_resident) {
			STANDARD_INFORMATION *si;

			/*
			 * We only need to deal with the standard information
			 * attribute.
			 */
			if (a->type != AT_STANDARD_INFORMATION)
				continue;
			/*
			 * We need to update the {a,m,c}times from the ntfs
			 * inode into the corresponding times in the standard
			 * information attribute.  The inode ctime, i.e. the
			 * last_mft_change_time in the standard information
			 * attribute, gives us a de facto deleted time that can
			 * be used by ntfsck and ntfsundelete for example.
			 */
			si = (STANDARD_INFORMATION*)((u8*)a +
					le16_to_cpu(a->value_offset));
			si->last_data_change_time = utc2ntfs(
					base_ni->last_data_change_time);
			si->last_mft_change_time = utc2ntfs(
					base_ni->last_mft_change_time);
			si->last_access_time = utc2ntfs(
					base_ni->last_access_time);
			/* Whilst here also update the file attributes. */
			si->file_attributes = base_ni->file_attributes;
			/*
			 * We need to take care to handle NTFS 1.x style
			 * standard information attributes on NTFS 3.0+ volumes
			 * as they are lazily updated on write after a volume
			 * has been upgraded from 1.x and after a volume has
			 * been accessed by an older NTFS driver such as the
			 * one in Windows NT4.
			 */
#if 0
			if (vol->major_ver <= 3 || le32_to_cpu(a->value_length) <
					sizeof(STANDARD_INFORMATION))
				continue;
#endif
			/*
			 * We have an NTFS 3.0+ style, extended standard
			 * information attribute.
			 */
			/*
			 * TODO: When we implement support for $UsnJrnl, we
			 * will need to journal the delete event and update the
			 * usn field in the standard information attribute.
			 * For now this is not needed as we stamp the
			 * transaction log thus telling applications querying
			 * the transaction log that it does not contain
			 * uptodate information.  We cannot do this at unlink
			 * time because there may still be writes and truncates
			 * happening due to existing open file descriptors and
			 * the delete event has to come last.
			 */
			/*
			 * TODO: When we implement support for quotas, we will
			 * need to update the quota control entry belonging to
			 * the user_id specified in the owner_id field in the
			 * standard information attribute by updating its
			 * change_time field to the current time and
			 * decrementing its bytes_used field by the amount
			 * specified in the quota_charged field in the standard
			 * information attribute as well as setting the
			 * exceeded_time to 0 if we go from over the soft quota
			 * specified in the limit of the quota control entry.
			 * For now this is not needed as we mark all quotas as
			 * invalid when we mount a volume read-write.  We
			 * cannot do the quota update at unlink time because
			 * there may still be writes and truncates happening
			 * due to existing open file descriptors which will
			 * affect the quota related fields.
			 */
			continue;
		}
		/*
		 * For non-resident attribute records, we need to free all the
		 * clusters specified in their mapping pairs array.
		 *
		 * If this is the base extent, we only need to do this if the
		 * allocated size is not zero.  If this is not the base extent
		 * then by definition the allocated size cannot be zero and
		 * more importantly an extent mft record does not have the
		 * allocated_size field set thus it is always zero.
		 */
		if (!a->lowest_vcn && !a->allocated_size)
			continue;
		rl.rl = NULL;
		rl.alloc = rl.elements = 0;
		err = ntfs_mapping_pairs_decompress(vol, a, &rl);
		if (!err) {
			VCN lowest_vcn;

			/*
			 * We need to supply the correct start and count values
			 * otherwise freeing the clusters fails when an
			 * attribute has multiple extent records because the
			 * runlist contains unmapped elements.
			 */
			lowest_vcn = sle64_to_cpu(a->lowest_vcn);
			err = ntfs_cluster_free_from_rl(vol, rl.rl, lowest_vcn,
					sle64_to_cpu(a->highest_vcn) + 1 -
					lowest_vcn, NULL);
			if (err) {
				ntfs_warning(vol->mp, "Failed to free some "
						"allocated clusters belonging "
						"to mft_no 0x%llx (error "
						"%d).  Run chkdsk to recover "
						"the lost space.",
						(unsigned long long)ni->mft_no,
						err);
				NVolSetErrors(vol);
			}
			OSFree(rl.rl, rl.alloc, ntfs_malloc_tag);
		} else {
			ntfs_error(vol->mp, "Cannot free some allocated space "
					"belonging to mft_no 0x%llx because "
					"the decompression of the mapping "
					"pairs array failed (error %d).  Run "
					"chkdsk to recover the lost space.",
					(unsigned long long)ni->mft_no, err);
			NVolSetErrors(vol);
		}
	}
	/*
	 * We have processed all attributes in the base mft record thus we can
	 * mark it as not in use, increment its sequence number, and mark it
	 * dirty for later writeout.
	 *
	 * Note a sequence number of 0xffff wraps to 1 rather than to 0.
	 */
	m->flags &= ~MFT_RECORD_IN_USE;
	if (m->sequence_number != const_cpu_to_le16(0xffff))
		m->sequence_number = cpu_to_le16(
				le16_to_cpu(m->sequence_number) + 1);
	else
		m->sequence_number = const_cpu_to_le16(1);
	ni->seq_no = le16_to_cpu(m->sequence_number);
	NInoSetMrecNeedsDirtying(ni);
}

/**
 * ntfs_vnop_inactive - the last reference to a vnode has been dropped
 * @args:	arguments to inactive function
 *
 * @args contains:
 *	vnode_t a_vp;		vnode whose last reference has been dropped
 *	vfs_context_t a_context;
 *
 * Last reference to a vnode has been dropped or a forced unmount is in
 * progress.
 *
 * Note: When called from reclaim, the vnode has a zero v_iocount and
 * v_usecount and vnode_isrecycled() is true.
 *
 * Return 0 on success and errno on error.
* * Note the current OS X VFS ignores the return value from VNOP_INACTIVE() and * hence ntfs_vnop_inactive(). */ static int ntfs_vnop_inactive(struct vnop_inactive_args *args) { leMFT_REF mref; vnode_t vn = args->a_vp; ntfs_inode *base_ni, *mftbmp_ni, *ni = NTFS_I(vn); ntfs_volume *vol; MFT_RECORD *m; leMFT_REF *mrefs; unsigned nr_mrefs; errno_t err; BOOL is_delete; if (!ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return 0; } is_delete = !ni->link_count; vol = ni->vol; ntfs_debug("Entering for mft_no 0x%llx, type 0x%x, name_len 0x%x%s.", (unsigned long long)ni->mft_no, (unsigned)le32_to_cpu(ni->type), (unsigned)ni->name_len, is_delete ? ", is delete" : ""); base_ni = ni; if (NInoAttr(ni)) base_ni = ni->base_ni; /* * This is the last close thus remove any directory hints. * * Note we check for presence of directory hints outside the locks as * an optimization. It is not a disaster if we miss any as all will be * released in ntfs_inode_free() before the inode is thrown away at the * latest. */ if (ni != base_ni && ni->type == AT_INDEX_ALLOCATION && ni->nr_dirhints) { lck_rw_lock_exclusive(&ni->lock); ntfs_dirhints_put(ni, 0); lck_rw_unlock_exclusive(&ni->lock); } /* * If the inode is not being deleted or this is a raw inode sync it and * we are done. */ if (!is_delete || NInoRaw(ni)) { sync: /* * Commit dirty data to disk unless mounted read-only. * * WARNING: Please see <rdar://problem/7202356> why this causes * stack exhaustion and kernel panics by creating a loop where * the VNOP_INACTIVE() calls ntfs_inode_sync() which ends up * doing ntfs_inode_get() which in turn triggers another * VNOP_INACTIVE() which in turn calls ntfs_inode_sync() and * thus ntfs_inode_get() which in turns calls VNOP_INACTIVE() * and so on until the stack overflows. 
*/ err = 0; if (!NVolReadOnly(vol)) err = ntfs_inode_sync(ni, IO_SYNC | IO_CLOSE, FALSE); if (!err) ntfs_debug("Done."); else ntfs_error(vol->mp, "Failed to sync mft_no 0x%llx, " "type 0x%x, name_len 0x%x (error %d).", (unsigned long long)ni->mft_no, (unsigned)le32_to_cpu(ni->type), (unsigned)ni->name_len, err); return err; } if (ni != base_ni) lck_rw_lock_exclusive(&base_ni->lock); lck_rw_lock_exclusive(&ni->lock); /* Do not allow messing with the inode once it has been deleted. */ if (NInoDeleted(ni)) { /* Remove the inode from the name cache. */ cache_purge(vn); lck_rw_unlock_exclusive(&ni->lock); if (ni != base_ni) lck_rw_unlock_exclusive(&base_ni->lock); ntfs_debug("Done (was already deleted)."); return 0; } /* * If someone else re-instantiated the inode whilst we were waiting for * the inode lock sync the inode instead of deleting it. */ if (ni->link_count) { lck_rw_unlock_exclusive(&ni->lock); if (ni != base_ni) lck_rw_unlock_exclusive(&base_ni->lock); ntfs_debug("Someone re-instantiated the inode."); goto sync; } /* * The inode has been unlinked, delete it now freeing all allocated * space on disk as well as all related resources on disk. Note we * proceed on errors because there is not much we can do about them. * We have to carry on regardless as the inode is about to be * terminated in any case. * * On a metadata affecting error, we mark the volume dirty and leave it * to a subsequent chkdsk to clean up after us. This is not a disaster * since there are no directory entries pointing to the inode @ni any * more, thus us failing just means that we will keep some on disk * resources allocated so chkdsk will just find this file and delete * it. * * First, remove the inode from the inode cache so it cannot be found * any more. */ lck_mtx_lock(&ntfs_inode_hash_lock); /* * Mark the inode as having been deleted so we do not try to remove it * from the ntfs inode hash again in ntfs_inode_reclaim(). 
*/ NInoSetDeleted(ni); /* * Remove the ntfs_inode from the inode hash so it cannot be looked up * any more. */ ntfs_inode_hash_rm_nolock(ni); lck_mtx_unlock(&ntfs_inode_hash_lock); /* Remove the inode from the name cache if it is still in it. */ cache_purge(vn); /* * The inode/vnode are no longer reachable at all so drop the inode * lock. Anyone waiting on the lock should test for NInoDeleted() and * abort once they have taken the lock. */ lck_rw_unlock_exclusive(&ni->lock); /* In case someone is waiting on the inode do a wakeup. */ ntfs_inode_wakeup(ni); /* Invalidate all buffers to do with the vnode. */ err = buf_invalidateblks(vn, 0, 0, 0); if (err) ntfs_error(vol->mp, "Failed to invalidate cached buffers " "(error %d).", err); /* * Invalidate all cached pages in the VM. * * This will fail for non-regular (VREG) nodes as they do not have UBC * info attached to them and ubc_msync() returns error in this case. */ if (vnode_isreg(vn)) { err = ubc_msync(vn, 0, ubc_getsize(vn), NULL, UBC_INVALIDATE); if (err) ntfs_error(vol->mp, "Failed to invalidate cached " "pages (error %d).", err); } /* * Cause the vnode to be reused immediately when we return rather than * sitting around in the vnode cache. */ vnode_recycle(vn); /* * ntfs_unlink() and ntfs_vnop_rename() bail out for attribute inodes * so we cannot get here with an attribute inode unless something has * gone badly wrong. * * When a named stream is deleted via VNOP_REMOVENAMEDSTREAM() its * link_count is set to zero so we get here on the last close. We have * to perform the actual freeing of allocated space if the attribute is * non-resident as well as the removal of the attribute record here. 
*/ if (ni != base_ni) { ntfs_attr_search_ctx *ctx; if (ni->type != AT_DATA || !ni->name_len) panic("%s(): ni != base_ni && (ni->type != AT_DATA || " "!ni->name_len)\n", __FUNCTION__); /* * For simplicity, if the attribute is non-resident, we * truncate the attribute to zero size first as that causes * both the allocated clusters to be freed as well as all * extent attribute records to be deleted. * * We then only need to remove the base attribute record and we * are done. */ if (NInoNonResident(ni)) { err = ntfs_attr_resize(ni, 0, 0, NULL); if (err) { ntfs_error(vol->mp, "Cannot delete named " "stream from mft_no 0x%llx " "because truncating the " "stream inode to zero size " "failed (error %d).", (unsigned long long)ni->mft_no, err); goto err; } } /* Remove the named stream. */ err = ntfs_mft_record_map(base_ni, &m); if (err) { ntfs_error(vol->mp, "Failed to delete named stream " "because mapping the mft record " "0x%llx failed (error %d).", (unsigned long long)ni->mft_no, err); goto err; } ctx = ntfs_attr_search_ctx_get(base_ni, m); if (!ctx) { ntfs_error(vol->mp, "Failed to delete named stream " "because allocating an attribute " "search context failed."); goto unm_err; } err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 0, NULL, 0, ctx); if (err) { ntfs_error(vol->mp, "Failed to delete named stream " "because looking up the named $DATA " "attribute in the mft record 0x%llx " "failed (error %d).", (unsigned long long)ni->mft_no, err); goto put_err; } err = ntfs_attr_record_delete(base_ni, ctx); if (err) { ntfs_error(vol->mp, "Failed to delete named stream " "because deleting the named $DATA " "attribute from its mft record 0x%llx " "failed (error %d).", (unsigned long long)ctx->ni->mft_no, err); goto put_err; } ntfs_debug("Done (deleted attribute inode)."); put_err: ntfs_attr_search_ctx_put(ctx); unm_err: ntfs_mft_record_unmap(base_ni); err: lck_rw_unlock_exclusive(&base_ni->lock); return err; } /* * We only need to be concerned with the allocated space 
on disk which * we need to deallocate and any related resources on disk, which we * also need to deallocate and/or mark unused. To do this, we map the * base mft record and iterate over all its attributes and deal with * each of them in sequence. */ err = ntfs_mft_record_map(ni, &m); if (err) { ntfs_warning(vol->mp, "Cannot release deleted mft_no 0x%llx " "because the mapping of the base mft record " "failed (error %d). Run chkdsk to recover " "lost resources.", (unsigned long long)ni->mft_no, err); NVolSetErrors(vol); return 0; } /* * Make sure the mft record was marked as not in use in * ntfs_unlink_internal(). */ if (m->flags & MFT_RECORD_IN_USE) panic("%s(): m->flags & MFT_RECORD_IN_USE\n", __FUNCTION__); /* * We will need the mft reference of the base mft record below but we * are about to change it thus make a note of the old one now. */ mref = MK_LE_MREF(ni->mft_no, ni->seq_no); /* * Release all clusters allocated to attribute records located in the * extent mft record. */ ntfs_mft_record_free_all(ni, ni, m); /* * We are finished with the base mft record, if there is an attribute * list attribute, we iterate over its entries and each time we * encounter an extent mft record that we have not done yet, we map it * and iterate over all its attributes as we did above for the base mft * record, followed by marking the extent mft record as not in use, * incrementing its sequence number, and marking it dirty, again as we * did above for the base mft record. Finally, we add it to our list * of mft records to deallocate from the $MFT/$BITMAP attribute. * * As an optimization, we reuse the attribute list buffer as our list * of mft records to deallocate from the $MFT/$BITMAP attribute. 
This * works because each ATTR_LIST_ENTRY record in the attribute list * attribute is at least 24 bytes long and we only need to store 8 * bytes for each mft reference in our list of mft records to * deallocate so we are guaranteed to have enough space in the buffer * for our needs and we are also guaranteed that we will never * overwrite part of the attribute list attribute data that we have not * dealt with yet. */ nr_mrefs = 1; mrefs = &mref; if (NInoAttrList(ni)) { ATTR_LIST_ENTRY *entry, *next_entry, *end; ntfs_inode *eni; if (!ni->attr_list || ni->attr_list_size < sizeof(leMFT_REF) || !ni->attr_list_alloc) panic("%s(): !ni->attr_list || !ni->attr_list_size || " "!ni->attr_list_alloc\n", __FUNCTION__); entry = (ATTR_LIST_ENTRY*)ni->attr_list; mrefs = (leMFT_REF*)entry; next_entry = (ATTR_LIST_ENTRY*)((u8*)entry + le16_to_cpu(entry->length)); end = (ATTR_LIST_ENTRY*)(ni->attr_list + ni->attr_list_size); /* * Add the mft reference of the base mft record as the first * element in our list as we have already dealt with it. */ *mrefs = mref; while (entry < end) { unsigned i; mref = entry->mft_reference; for (i = 0; i < nr_mrefs; i++) { if (mref == mrefs[i]) goto do_next; } /* * This mft reference has not been encountered before. * Add it to the list of mft references and free all * disk storage associated with all the attribute * records stored in the mft record with this mft * reference. */ mrefs[nr_mrefs++] = mref; err = ntfs_extent_mft_record_map(ni, le64_to_cpu(mref), &eni, &m); if (!err) { /* * Release all clusters allocated to attribute * records located in the extent mft record and * mark the mft record as not in use. * * We need to ensure the mft record is marked * as in use. It can happen that it is not * marked in use after a system crash occurs * whilst a file is being extended. 
*/ if (m->flags & MFT_RECORD_IN_USE) ntfs_mft_record_free_all(ni, eni, m); else { ntfs_warning(vol->mp, "Extent mft_no " "0x%llx, base mft_no " "0x%llx is marked as " "not in use. Cannot " "release allocated " "clusters. Unmount " "and run chkdsk to " "recover the lost " "clusters.", (unsigned long long) MREF_LE(mref), (unsigned long long) ni->mft_no); NVolSetErrors(vol); } /* Unmap the mft record again. */ ntfs_extent_mft_record_unmap(eni); } else { ntfs_warning(vol->mp, "Failed to release " "allocated clusters because " "mapping extent mft_no 0x%llx, " "base mft_no 0x%llx failed " "(error %d). Unmount and run " "chkdsk to recover the lost " "clusters.", (unsigned long long)MREF_LE(mref), (unsigned long long)ni->mft_no, err); NVolSetErrors(vol); } do_next: entry = next_entry; next_entry = (ATTR_LIST_ENTRY*)((u8*)entry + le16_to_cpu(entry->length)); } } ntfs_mft_record_unmap(ni); /* * Mark the base mft record and all extent mft records (if any) as * unused in the mft bitmap. * * Note that this means that ntfs_inode_reclaim() may run when someone * else has already reused one of the mft records we are freeing now. * This is ok because all ntfs_inode_reclaim() does is to do some * memory freeing. And we have already removed the inode from the * inode cache thus there are no problems from that point of view * either. */ lck_rw_lock_exclusive(&vol->mftbmp_lock); mftbmp_ni = vol->mftbmp_ni; err = vnode_get(mftbmp_ni->vn); if (err) ntfs_warning(vol->mp, "Failed to get vnode for $MFT/$BITMAP " "(error %d) thus cannot release mft " "record(s). Run chkdsk to recover the lost " "mft record(s).", err); else { lck_rw_lock_shared(&mftbmp_ni->lock); while (nr_mrefs > 0) { nr_mrefs--; err = ntfs_bitmap_clear_bit(mftbmp_ni, MREF_LE(mrefs[nr_mrefs])); if (!err) { /* * We cleared a bit in the mft bitmap thus we * need to reflect this in the cached number of * free mft records. 
*/ vol->nr_free_mft_records++; if (vol->nr_free_mft_records >= vol->nr_mft_records) panic("%s(): vol->nr_free_mft_records " "> vol->nr_mft_records" "\n", __FUNCTION__); } else { ntfs_error(vol->mp, "Failed to free mft_no " "0x%llx (error %d). Run " "chkdsk to recover the lost " "mft record.", (unsigned long long) MREF_LE(mrefs[nr_mrefs]), err); NVolSetErrors(vol); } } lck_rw_unlock_shared(&mftbmp_ni->lock); (void)vnode_put(mftbmp_ni->vn); } lck_rw_unlock_exclusive(&vol->mftbmp_lock); ntfs_debug("Done (deleted base inode)."); return 0; } /** * ntfs_vnop_reclaim - free ntfs specific parts of a vnode so it can be reused * @a: arguments to reclaim function * * @a contains: * vnode_t a_vp; vnode to be reclaimed * vfs_context_t a_context; * * Reclaim a vnode so it can be used for other purposes. * * Note: This is called from reclaim. The vnode has a zero v_iocount and * v_usecount and vnode_isrecycled() is true. * * Return 0 on success and errno on error. * * Note the current OS X VFS panic()s the machine if VNOP_RECLAIM() and hence * ntfs_vnop_reclaim() returns an error. */ static int ntfs_vnop_reclaim(struct vnop_reclaim_args *a) { vnode_t vn = a->a_vp; ntfs_inode *ni = NTFS_I(vn); errno_t err; /* Do not dereference @ni if it is NULL. */ #ifdef DEBUG if (ni) ntfs_debug("Entering for mft_no 0x%llx, type 0x%x, name_len " "0x%x.", (unsigned long long)ni->mft_no, le32_to_cpu(ni->type), (unsigned)ni->name_len); else ntfs_debug("Entering for already reclaimed vnode!"); #endif vnode_removefsref(vn); err = ntfs_inode_reclaim(ni); ntfs_debug("Done (error %d).", (int)err); return err; } /** * ntfs_vnop_pathconf - get configurable pathname variables * @a: arguments to pathconf function * * @a contains: * vnode_t a_vp; vnode for which to return pathconf information * int a_name; the pathconf variable to be queried * register_t *a_retval; destination for result of query * vfs_context_t a_context; * * Return POSIX pathconf information applicable to ntfs file system. 
Some * @a_name values are intercepted by the VFS in vn_pathconf (pathconf(2) -> * vn_pathconf() -> VNOP_PATHCONF() -> ntfs_vnop_pathconf()) so we do not * bother with them. * * Return 0 on success and EINVAL if an unsupported @a_name was queried for. */ static int ntfs_vnop_pathconf(struct vnop_pathconf_args *a) { ntfs_inode *ni = NTFS_I(a->a_vp); ntfs_volume *vol = NTFS_MP(vnode_mount(a->a_vp)); errno_t err = 0; ntfs_debug("Entering for pathconf variable number %d.", a->a_name); if (ni) { lck_rw_lock_shared(&ni->lock); /* * Do not allow messing with the inode once it has been * deleted. */ if (NInoDeleted(ni)) { /* Remove the inode from the name cache. */ cache_purge(ni->vn); lck_rw_unlock_shared(&ni->lock); ntfs_debug("Directory is deleted."); return ENOENT; } } switch (a->a_name) { case _PC_LINK_MAX: /* * The maximum file link count. For ntfs, the link count is * stored in the mft record in the link_count field which is of * type le16, thus 16 bits. For attribute inodes and * directories however, no hard links are allowed and thus the * maximum link count is 1. */ if (!ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return EINVAL; } *a->a_retval = NTFS_MAX_HARD_LINKS; if (NInoAttr(ni) || S_ISDIR(ni->mode)) *a->a_retval = 1; break; case _PC_NAME_MAX: /* * The maximum number of bytes in a filename. For ntfs, this * is stored in the attribute record in the name_length field * which is of type u8, thus 8 bits. */ *a->a_retval = NTFS_MAX_NAME_LEN; /* 255 */ break; case _PC_PATH_MAX: /* * The maximum number of bytes in a path name. Ntfs imposes no * restrictions so use the system limit. */ *a->a_retval = PATH_MAX; /* 1024 */ break; case _PC_PIPE_BUF: /* * The maximum number of bytes which will be written atomically * to a pipe, again ntfs imposes no restrictions so use the * system limit. */ *a->a_retval = PIPE_BUF; /* 512 */ break; case _PC_CHOWN_RESTRICTED: /* * Non-zero if appropriate privileges are required for the * chown(2) system call. 
For ntfs, this is always the case. */ *a->a_retval = 200112; /* unistd.h: _POSIX_CHOWN_RESTRICTED */ break; case _PC_NO_TRUNC: /* * Non-zero if accessing filenames longer than _POSIX_NAME_MAX * (which we specified above to be NTFS_MAX_NAME_LEN) generates * an error. For ntfs, this is always the case. */ *a->a_retval = 200112; /* unistd.h: _POSIX_NO_TRUNC */ break; case _PC_NAME_CHARS_MAX: /* * The maximum number of characters in a filename. This is * the same as _PC_NAME_MAX, above. */ *a->a_retval = NTFS_MAX_NAME_LEN; /* 255 */ break; case _PC_CASE_SENSITIVE: /* * Return 1 if case sensitive and 0 if not. For ntfs, this * depends on the mount options. */ if (vol) *a->a_retval = (NVolCaseSensitive(vol) ? 1 : 0); else err = EINVAL; break; case _PC_CASE_PRESERVING: /* * Return 1 if case preserving and 0 if not. For ntfs, this is * always 1, i.e. ntfs always preserves case. */ *a->a_retval = 1; break; case _PC_FILESIZEBITS: /* * The number of bits to represent file size. For ntfs, the * file size is stored in the attribute record in the data_size * field which is of type sle64, thus 63 bits. 
*/ *a->a_retval = 63; break; default: err = EINVAL; } if (ni) lck_rw_unlock_shared(&ni->lock); ntfs_debug("Done (error %d).", (int)err); return err; } /** * ntfs_vnop_allocate - */ static int ntfs_vnop_allocate(struct vnop_allocate_args *a) { errno_t err; ntfs_debug("Entering."); // TODO: (void)nop_allocate(a); err = ENOTSUP; ntfs_debug("Done (error %d).", (int)err); return err; } /** * ntfs_vnop_pagein - read a range of pages into memory * @a: arguments to pagein function * * @a contains: * vnode_t a_vp; vnode whose data to read into the page range * upl_t a_pl; page list describing destination page range * upl_offset_t a_pl_offset; byte offset into page list at which to start * off_t a_f_offset; byte offset in the vnode at which to start * size_t a_size; number of bytes to read from the vnode * int a_flags; flags further describing the pagein request * vfs_context_t a_context; * * Read @a->a_size bytes from the vnode @a-a_vp, starting at byte offset * @a->a_f_offset into the vnode, into the range of pages specified by the page * list @a->a_pl, starting at byte offset @a->a_pl_offset into the page list. * * The flags in @a->a_flags further describe the pagein request. The following * pagein flags are currently defined in OS X kernel: * UPL_IOSYNC - Perform synchronous i/o. * UPL_NOCOMMIT - Do not commit/abort the page range. * UPL_NORDAHEAD - Do not perform any speculative read-ahead. * IO_PASSIVE - This is background i/o so do not throttle other i/o. * * For encrypted attributes we abort for now as we do not support them yet. * * For non-resident, non-compressed attributes we use cluster_pagein_ext() * which deals with both normal and multi sector transfer protected attributes. 
*
 * For resident attributes and non-resident, compressed attributes we read the
 * data ourselves by mapping the page list, and in the resident case, mapping
 * the mft record, looking up the attribute in it, and copying the requested
 * data from the mapped attribute into the page list, then unmapping the mft
 * record, whilst for non-resident, compressed attributes, we get the raw inode
 * and use it with ntfs_read_compressed() to read and decompress the data into
 * our mapped page list.  We then unmap the page list and finally, if
 * UPL_NOCOMMIT is not specified, we commit (success) or abort (error) the page
 * range.
 *
 * Return 0 on success and errno on error.
 *
 * Note the pages in the page list are marked busy on entry and the busy bit is
 * cleared when we commit the page range.  Thus it is perfectly safe for us to
 * fill the pages with encrypted or mst protected data and to decrypt or mst
 * deprotect in place before committing the page range.
 *
 * Adapted from cluster_pagein_ext().
 */
static int ntfs_vnop_pagein(struct vnop_pagein_args *a)
{
	ntfs_inode *base_ni, *ni = NTFS_I(a->a_vp);
	int err;

	if (!ni) {
		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
		/*
		 * We own the page range unless UPL_NOCOMMIT is set, so abort
		 * it (flagging an error) before bailing out.
		 */
		if (!(a->a_flags & UPL_NOCOMMIT) && a->a_pl)
			ubc_upl_abort_range(a->a_pl, a->a_pl_offset, a->a_size,
					UPL_ABORT_FREE_ON_EMPTY |
					UPL_ABORT_ERROR);
		return EINVAL;
	}
	/* For attribute inodes the times live in the base inode. */
	base_ni = ni;
	if (NInoAttr(ni))
		base_ni = ni->base_ni;
	ntfs_debug("Entering for mft_no 0x%llx, offset 0x%llx, size 0x%llx, "
			"pagein flags 0x%x, page list offset 0x%llx.",
			(unsigned long long)ni->mft_no,
			(unsigned long long)a->a_f_offset,
			(unsigned long long)a->a_size, a->a_flags,
			(unsigned long long)a->a_pl_offset);
	/* The actual work is done by ntfs_pagein(). */
	err = ntfs_pagein(ni, a->a_f_offset, a->a_size, a->a_pl,
			a->a_pl_offset, a->a_flags);
	/*
	 * Update the last_access_time (atime) if something was read and this
	 * is the base ntfs inode or it is a named stream (this is what HFS+
	 * does, too).
	 *
	 * Skip the update if atime updates are disabled via the noatime mount
	 * option or the volume is read only or this is a symbolic link.
	 *
	 * Also, skip the core system files except for the root directory.
	 */
	if (!err && !NVolReadOnly(ni->vol) &&
			!(vfs_flags(ni->vol->mp) & MNT_NOATIME) &&
			!S_ISLNK(base_ni->mode) &&
			(ni == base_ni || ni->type == AT_DATA)) {
		BOOL need_update_time;

		need_update_time = TRUE;
		/*
		 * Which mft records count as core system files depends on the
		 * NTFS major version: up to and including FILE_Extend for
		 * major versions above 1, up to and including FILE_UpCase
		 * otherwise.
		 */
		if (ni->vol->major_ver > 1) {
			if (base_ni->mft_no <= FILE_Extend &&
					base_ni != ni->vol->root_ni)
				need_update_time = FALSE;
		} else {
			if (base_ni->mft_no <= FILE_UpCase &&
					base_ni != ni->vol->root_ni)
				need_update_time = FALSE;
		}
		if (need_update_time) {
			/* Only mark the times dirty; nothing is written here. */
			base_ni->last_access_time = ntfs_utc_current_time();
			NInoSetDirtyTimes(base_ni);
		}
	}
	return err;
}

// TODO: Move to ntfs_page.[hc].
/**
 * ntfs_mst_pageout - write out pages of a multi sector transfer protected attribute
 * @ni:		ntfs inode of the mst protected attribute to write to
 * @upl:	page list describing the source page range
 * @upl_ofs:	byte offset into the page list at which to start
 * @size:	number of bytes to write
 * @attr_ofs:	byte offset in the attribute at which to start writing
 * @attr_size:	size of the attribute in bytes, the write is clipped to this
 * @flags:	pageout flags (only UPL_NOCOMMIT is examined here, the rest
 *		is passed on to cluster_pageout_ext())
 *
 * Apply the mst fixups to all ntfs records in the page range that carry the
 * expected magic, write the range out synchronously using
 * cluster_pageout_ext() and, unless UPL_NOCOMMIT is set in @flags, commit the
 * page range on success or abort it on error.
 *
 * On error any fixups that were applied are removed again before returning.
 *
 * Only index allocation attributes (AT_INDEX_ALLOCATION) are supported.  We
 * panic() if called for any other attribute type, for a non-mst protected or
 * resident attribute, or if the write is not aligned to the ntfs record size.
 *
 * Return 0 on success and errno on error: EINVAL if the offsets/size are out
 * of range or not page aligned, EIO if mapping or unmapping the page list
 * failed, otherwise the error returned by ntfs_mst_fixup_pre_write() or
 * cluster_pageout_ext().
 */
static int ntfs_mst_pageout(ntfs_inode *ni, upl_t upl, upl_offset_t upl_ofs,
		unsigned size, s64 attr_ofs, s64 attr_size, int flags)
{
	ntfs_volume *vol = ni->vol;
	u8 *kaddr;
	kern_return_t kerr;
	unsigned rec_size, rec_shift, nr_recs, i;
	int err;
	NTFS_RECORD_TYPE magic = 0;
	BOOL do_commit;

	/*
	 * Remember whether the caller wants us to commit/abort the page range
	 * as we force UPL_NOCOMMIT on in @flags further down.
	 */
	do_commit = !(flags & UPL_NOCOMMIT);
	if (ni->type == AT_INDEX_ALLOCATION)
		magic = magic_INDX;
	else
		panic("%s(): Unknown mst protected inode 0x%llx, type 0x%x, "
				"name_len 0x%x.", __FUNCTION__,
				(unsigned long long)ni->mft_no,
				(unsigned)le32_to_cpu(ni->type),
				(unsigned)ni->name_len);
	ntfs_debug("Entering for mft_no 0x%llx, page list offset 0x%llx, size "
			"0x%x, offset 0x%llx, pageout flags 0x%x, magic is "
			"0x%x.", (unsigned long long)ni->mft_no,
			(unsigned long long)upl_ofs, size,
			(unsigned long long)attr_ofs, flags,
			(unsigned)le32_to_cpu(magic));
	/*
	 * The write must start inside the attribute and the attribute offset,
	 * the size and the page list offset must all be page aligned.
	 */
	if (attr_ofs < 0 || attr_ofs >= attr_size || attr_ofs & PAGE_MASK_64 ||
			size & PAGE_MASK || upl_ofs & PAGE_MASK) {
		err = EINVAL;
		goto err;
	}
	if (!NInoMstProtected(ni))
		panic("%s(): Called for non-mst protected attribute.\n",
				__FUNCTION__);
	if (!NInoNonResident(ni))
		panic("%s(): Resident mst protected attribute.\n",
				__FUNCTION__);
	/* The inode block size is used as the ntfs record size. */
	rec_size = ni->block_size;
	if (attr_ofs & (rec_size - 1) || size & (rec_size - 1))
		panic("%s(): Write not aligned to NTFS record boundary.\n",
				__FUNCTION__);
	rec_shift = ni->block_size_shift;
	/* Clip the number of records to the size of the attribute. */
	nr_recs = size >> rec_shift;
	if (attr_ofs + size > attr_size) {
		unsigned to_write;

		/* Abort any pages outside the end of the attribute. */
		to_write = attr_size - attr_ofs;
		nr_recs = to_write >> rec_shift;
		/* Round up to a page boundary as we can only abort pages. */
		to_write = (to_write + PAGE_MASK) & ~PAGE_MASK;
		if (size != to_write) {
			if (size < to_write)
				panic("%s(): size less than to_write.\n",
						__FUNCTION__);
			ntfs_debug("Truncating write past end of attribute.");
			if (do_commit)
				ubc_upl_abort_range(upl, upl_ofs + to_write,
						size - to_write,
						UPL_ABORT_FREE_ON_EMPTY);
			size = to_write;
		}
	}
	if (!nr_recs)
		panic("%s(): NTFS record size greater than write size.\n",
				__FUNCTION__);
	/*
	 * Need to apply the mst fixups and abort on errors.  To apply the
	 * fixups need to map the page list so we can access its contents.
	 */
	kerr = ubc_upl_map(upl, (vm_offset_t*)&kaddr);
	if (kerr != KERN_SUCCESS) {
		ntfs_error(vol->mp, "ubc_upl_map() failed (error %d).",
				(int)kerr);
		err = EIO;
		goto err;
	}
	/*
	 * Loop over the records in the page list and for each apply the mst
	 * fixups.  On any fixup errors, remove all the applied fixups and
	 * abort the write completely.
	 *
	 * NOTE(review): the records are addressed relative to @kaddr with no
	 * @upl_ofs term.  Presumably this relies on @upl_ofs being zero or on
	 * the mapping semantics of ubc_upl_map() - confirm against callers.
	 */
	for (i = 0; i < nr_recs; i++) {
		NTFS_RECORD *rec = (NTFS_RECORD*)(kaddr + (i << rec_shift));
		/* Records without the expected magic are left untouched. */
		if (__ntfs_is_magic(rec->magic, magic)) {
			err = ntfs_mst_fixup_pre_write(rec, rec_size);
			if (err) {
				ntfs_error(vol->mp, "Failed to apply mst "
						"fixups (mft_no 0x%llx, type "
						"0x%x, offset 0x%llx).",
						(unsigned long long)ni->mft_no,
						(unsigned)le32_to_cpu(ni->type),
						(unsigned long long)attr_ofs +
						(i << rec_shift));
				/* @i records have had fixups applied so far. */
				goto mst_err;
			}
		}
	}
	/* Unmap the page list again so we can call cluster_pageout_ext(). */
	// FIXME: Can we leave the page list mapped throughout the
	// cluster_pageout_ext() call?  That would be a lot more efficient and
	// simplify error handling.
	kerr = ubc_upl_unmap(upl);
	if (kerr != KERN_SUCCESS) {
		ntfs_error(vol->mp, "ubc_upl_unmap() failed (error %d).",
				(int)kerr);
		err = EIO;
		goto mst_err;
	}
	/*
	 * We need the write to be synchronous so we do not leave the metadata
	 * with the fixups applied for too long.
	 *
	 * We also need to set the no commit flag so we can still recover from
	 * errors by removing the fixups.
	 */
	flags |= UPL_IOSYNC | UPL_NOCOMMIT;
	/*
	 * On success the fixups will have been removed by the
	 * ntfs_cluster_iodone() callback.
	 */
	err = cluster_pageout_ext(ni->vn, upl, upl_ofs, attr_ofs, size,
			attr_size, flags, ntfs_cluster_iodone, NULL);
	if (!err) {
		if (do_commit) {
			/* Commit the page range we wrote out. */
			ubc_upl_commit_range(upl, upl_ofs, size,
					UPL_COMMIT_FREE_ON_EMPTY |
					UPL_COMMIT_CLEAR_DIRTY);
		}
		ntfs_debug("Done.");
		return err;
	}
	ntfs_error(vol->mp, "Failed (cluster_pageout_ext() returned error "
			"%d).", err);
	/*
	 * We may have some records left with applied fixups thus remove them
	 * again.  It does not matter if it is done twice as this is an error
	 * code path and the only side effect is a little slow down.
	 */
	kerr = ubc_upl_map(upl, (vm_offset_t*)&kaddr);
	if (kerr != KERN_SUCCESS) {
		ntfs_error(vol->mp, "ubc_upl_map() failed (error %d), cannot "
				"remove mst fixups. Unmount and run chkdsk.",
				(int)kerr);
		NVolSetErrors(vol);
		/* @err still holds the cluster_pageout_ext() error. */
		goto err;
	}
mst_err:
	/*
	 * Remove the applied fixups, unmap the page list and abort.
	 *
	 * @i is the number of records for which fixups were applied, i.e. it
	 * equals @nr_recs unless we got here from the fixup loop above.
	 */
	while (i > 0) {
		NTFS_RECORD *rec = (NTFS_RECORD*)(kaddr + (--i << rec_shift));
		if (__ntfs_is_magic(rec->magic, magic))
			ntfs_mst_fixup_post_write(rec);
	}
	kerr = ubc_upl_unmap(upl);
	if (kerr != KERN_SUCCESS)
		ntfs_error(vol->mp, "ubc_upl_unmap() failed (error %d).",
				(int)kerr);
err:
	/* Return the pages to the VM unless the caller owns the commit. */
	if (do_commit)
		ubc_upl_abort_range(upl, upl_ofs, size,
				UPL_ABORT_FREE_ON_EMPTY);
	return err;
}

/**
 * ntfs_vnop_pageout - write a range of pages to storage
 * @a:		arguments to pageout function
 *
 * @a contains:
 *	vnode_t a_vp;		vnode whose data to write from the page range
 *	upl_t a_pl;		page list describing the source page range
 *	upl_offset_t a_pl_offset; byte offset into page list at which to start
 *	off_t a_f_offset;	byte offset in the vnode at which to start
 *	size_t a_size;		number of bytes to write to the vnode
 *	int a_flags;		flags further describing the pageout request
 *	vfs_context_t a_context;
 *
 * If UPL_NESTED_PAGEOUT is set in the flags (a->a_flags) we are called from
 * cluster_io() which is in turn called from cluster_write() which is in turn
 * called from ntfs_vnop_write() which means we are already holding the inode
 * lock (@ni->lock).  Alternatively cluster_io() can be called from
 * cluster_push() which can be called from various places in NTFS.
 *
 * Write @a->a_size bytes to the vnode @a->a_vp, starting at byte offset
 * @a->a_f_offset into the vnode, from the range of pages specified by the page
 * list @a->a_pl, starting at byte offset @a->a_pl_offset into the page list.
 *
 * The flags in @a->a_flags further describe the pageout request.  The
 * following pageout flags are currently defined in OS X kernel:
 *	UPL_IOSYNC	- Perform synchronous i/o.
 *	UPL_NOCOMMIT	- Do not commit/abort the page range.
 *	UPL_KEEPCACHED	- Data is already cached in memory, keep it cached.
 *	IO_PASSIVE	- This is background i/o so do not throttle other i/o.
 *
 * For encrypted attributes we abort for now as we do not support them yet.
* * For non-resident, non-compressed attributes we use cluster_pageout_ext() * which deals with both normal and multi sector transfer protected attributes. * * In the case of multi sector transfer protected attributes we apply the * fixups and then submit the i/o synchronously by setting the UPL_IOSYNC flag. * * For resident attributes and non-resident, compressed attributes we write the * data ourselves by mapping the page list, and in the resident case, mapping * the mft record, looking up the attribute in it, and copying the data to the * mapped attribute from the page list, then unmapping the mft record, whilst * for non-resident, compressed attributes, we get the raw inode and use it * with ntfs_write_compressed() to compress and write the data from our mapped * page list. We then unmap the page list and finally, if UPL_NOCOMMIT is not * specified, we commit (success) or abort (error) the page range. * * Return 0 on success and errno on error. * * Note the pages in the page list are marked busy on entry and the busy bit is * cleared when we commit the page range. Thus it is perfectly safe for us to * apply the mst fixups and write out the data which will then also take away * the fixups again before committing the page range. * * Adapted from cluster_pageout_ext(). 
*/ static int ntfs_vnop_pageout(struct vnop_pageout_args *a) { s64 attr_ofs, attr_size, alloc_size, bytes; ntfs_inode *base_ni, *ni = NTFS_I(a->a_vp); upl_t upl = a->a_pl; ntfs_volume *vol; u8 *kaddr; upl_offset_t upl_ofs = a->a_pl_offset; kern_return_t kerr; unsigned to_write, size = a->a_size; int err, flags = a->a_flags; lck_rw_type_t lock_type = LCK_RW_TYPE_SHARED; BOOL locked = FALSE; /* * Without an ntfs inode there is nothing we can write to. Abort the * pages (unless UPL_NOCOMMIT is set or there is no page list) and fail * with EINVAL. */ if (!ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); if (!(flags & UPL_NOCOMMIT) && upl) ubc_upl_abort_range(upl, upl_ofs, size, UPL_ABORT_FREE_ON_EMPTY); return EINVAL; } vol = ni->vol; attr_ofs = a->a_f_offset; /* * For an attribute inode use its base inode for the archive bit and * time stamp updates done at the end of this function. */ base_ni = ni; if (NInoAttr(ni)) base_ni = ni->base_ni; ntfs_debug("Entering for mft_no 0x%llx, offset 0x%llx, size 0x%x, " "pageout flags 0x%x, page list offset 0x%llx.", (unsigned long long)ni->mft_no, (unsigned long long)attr_ofs, size, flags, (unsigned long long)upl_ofs); /* * If the caller did not specify any i/o, then we are done. We cannot * issue an abort because we do not have a upl or we do not know its * size. * * Note @size is declared unsigned so the size test below can only ever * trigger for a size of zero. */ if (!upl || size <= 0) { ntfs_error(vol->mp, "NULL page list passed in or request size " "is below zero (error EINVAL)."); return EINVAL; } if (S_ISDIR(ni->mode)) { ntfs_error(vol->mp, "Called for directory vnode."); err = EISDIR; goto err; } if (NVolReadOnly(vol)) { err = EROFS; goto err; } /* * Need to clip i/o at maximum file size of 2^63-1 bytes in case * someone creates a sparse file and is playing silly with seek + write * note we only need to check for this for sparse files as non-sparse * files can never reach 2^63-1 because that is also the maximum space * on the volume thus the write would simply get an ENOSPC when the * volume is full. */ if (NInoSparse(ni) && (u64)attr_ofs + size > NTFS_MAX_ATTRIBUTE_SIZE) { err = EFBIG; goto err; } #if 1 // TODO: Remove this when sparse support is done... 
if (NInoSparse(ni)) { err = ENOTSUP; goto err; } #endif /* * Protect against changes in initialized_size and thus against * truncation also but only if the VFS is not calling back into the * NTFS driver after the NTFS driver called it in which case we are * already holding the lock. * * There is a complication in that the UPL is already created by the * caller thus us taking the lock here is a case of lock reversal wrt * the UPL keeping the pages locked for exclusive access thus we can * deadlock with a concurrent file create for example when it holds the * ntfs inode lock @ni->lock for exclusive access on the index vnode of * the parent directory and then calls ntfs_page_map() to map a page * from the index as we already hold the same UPL that ntfs_page_map() * will try to get thus if we go to sleep on the ntfs inode lock that * is held exclusive by the create code path we would now deadlock. * * To avoid the deadlock, we do a try-lock for the ntfs inode lock and * if that fails we simply abort the pages returning them to the VM * without modification thus they should remain dirty and they should * be paged out at a later point in time. * * We then return ENXIO to indicate that this is a temporary failure to * the caller. * * FIXME: There is a complication and that is that we really need to * hold the inode lock for writing if we are writing to a hole and/or * writing past the initialized size as we would then be modifying the * initialized_size. But if UPL_NESTED_PAGEOUT is set we have no idea * whether the caller is holding the lock for write or not and we * cannot safely drop/retake the lock in any case... For now we ignore * the problem and just emit a warning in this case. */ if (!(flags & UPL_NESTED_PAGEOUT)) { if (NInoSparse(ni)) lock_type = LCK_RW_TYPE_EXCLUSIVE; if (!lck_rw_try_lock(&ni->lock, lock_type)) { ntfs_debug("Failed to take ni->lock for %s for mft_no " "0x%llx, type 0x%x. 
Aborting with " "ENXIO to avoid deadlock.", (lock_type == LCK_RW_TYPE_SHARED) ? "reading" : "writing", (unsigned long long)ni->mft_no, (unsigned)le32_to_cpu(ni->type)); if (!(flags & UPL_NOCOMMIT)) ubc_upl_abort_range(upl, upl_ofs, size, UPL_ABORT_FREE_ON_EMPTY); return ENXIO; } locked = TRUE; } else { if (NInoSparse(ni)) ntfs_warning(vol->mp, "flags & UPL_NESTED_PAGEOUT && " "NINoSparse(ni), need inode lock " "exclusive but caller holds the lock " "so we do not know if it is exclusive " "or not."); } /* Do not allow messing with the inode once it has been deleted. */ if (NInoDeleted(ni)) { /* Remove the inode from the name cache. */ cache_purge(ni->vn); err = ENOENT; goto err; } retry_pageout: /* * TODO: This check may no longer be necessary now that we lock against * changes in initialized size and thus truncation... Revisit this * issue when the write code has been written and remove the check if * appropriate simply using ubc_getsize(vn); without the size_lock. */ lck_spin_lock(&ni->size_lock); attr_size = ubc_getsize(a->a_vp); if (attr_size > ni->data_size) attr_size = ni->data_size; /* * Cannot pageout to a negative offset or if we are starting beyond the * end of the attribute or if the attribute offset is not page aligned * or the size requested is not a multiple of PAGE_SIZE. */ if (attr_ofs < 0 || attr_ofs >= attr_size || attr_ofs & PAGE_MASK_64 || size & PAGE_MASK || upl_ofs & PAGE_MASK) { lck_spin_unlock(&ni->size_lock); err = EINVAL; goto err; } // TODO: HERE: // FIXME: For now abort writes beyond initialized size... // TODO: This causes a problem and that is in ntfs_vnop_write() we only // update the initialized size after calling cluster_write() which // means we cannot zero up to the initialized size here or we could // trample over data that has just been written out. 
Also this causes // our check here to trigger even though we are not really outside the // initialized size at all and in fact this page out may be part of the // write itself so it has to succeed. But on the other hand if this is // a genuine mmap()-based write we do need to do the zeroing. We need // to somehow be able to tell the difference between the two... // If the initialized size equals attr_ofs then we can safely perform // the write and then update the initialized size to attr_ofs + size // but need to be careful to update the data size appropriately and // also need to make sure not to exceed the end of the write otherwise // we would cause a file extension here when we should not do so. In // fact if this is not part of an extending write then we should not // modify the data size and only the initialized size instead. if (attr_ofs + size > ni->initialized_size && ni->initialized_size != ni->data_size) { lck_spin_unlock(&ni->size_lock); ntfs_error(vol->mp, "Writing beyond the initialized size of " "an attribute is not implemented yet."); err = ENOTSUP; goto err; } alloc_size = ni->allocated_size; lck_spin_unlock(&ni->size_lock); /* * If this is a sparse attribute we need to fill any holes overlapping * the write. We can skip resident attributes as they cannot have * sparse regions. * * As allocated size goes in units of clusters we need to round down * the start offset to the nearest cluster boundary and we need to * round up the end offset to the next cluster boundary. */ if (NInoSparse(ni) && NInoNonResident(ni) && ni->type == AT_DATA) { s64 aligned_end, new_end; aligned_end = (attr_ofs + size + vol->cluster_size_mask) & ~vol->cluster_size_mask; /* * Only need to instantiate holes up to the allocated size * itself. Everything else would be an extension which is not * allowed from VNOP_PAGEOUT(). 
*/ if (aligned_end > alloc_size) aligned_end = alloc_size; err = ntfs_attr_instantiate_holes(ni, attr_ofs & ~vol->cluster_size_mask, aligned_end, &new_end, TRUE); if (err) { ntfs_error(vol->mp, "Cannot perform pageout of mft_no " "0x%llx because instantiation of " "sparse regions failed (error %d).", (unsigned long long)ni->mft_no, err); goto err; } /* The instantiation may not be partial. */ if (new_end < aligned_end) panic("%s(): new_end < aligned_end\n", __FUNCTION__); } /* * Only $DATA attributes can be encrypted/compressed. Index root can * have the flags set but this means to create compressed/encrypted * files, not that the attribute is compressed/encrypted. Note we need * to check for AT_INDEX_ALLOCATION since this is the type of directory * index inodes. */ if (ni->type != AT_INDEX_ALLOCATION) { /* TODO: Deny access to encrypted attributes, just like NT4. */ if (NInoEncrypted(ni)) { if (ni->type != AT_DATA) panic("%s(): Encrypted non-data attribute.\n", __FUNCTION__); ntfs_warning(vol->mp, "Denying write to encrypted " "attribute (EACCES)."); err = EACCES; goto err; } /* Compressed data streams need special handling. */ if (NInoNonResident(ni) && NInoCompressed(ni) && !NInoRaw(ni)) { if (ni->type != AT_DATA) panic("%s(): Compressed non-data attribute.\n", __FUNCTION__); goto compressed; } } /* * NInoNonResident() == NInoIndexAllocPresent() * * Non-resident attributes that did not take the compressed path above * are written out here, via ntfs_mst_pageout() if they are mst * protected and via cluster_pageout_ext() otherwise, and then skip * straight to the common exit code at the done label. */ if (NInoNonResident(ni)) { if (NInoMstProtected(ni)) err = ntfs_mst_pageout(ni, upl, upl_ofs, size, attr_ofs, attr_size, flags); else { err = cluster_pageout_ext(a->a_vp, upl, upl_ofs, attr_ofs, size, attr_size, flags, NULL, NULL); if (!err) ntfs_debug("Done (cluster_pageout_ext())."); else ntfs_error(vol->mp, "Failed " "(cluster_pageout_ext(), " "error %d).", err); } goto done; } compressed: /* The attribute is resident and/or compressed. 
*/ to_write = size; bytes = attr_size - attr_ofs; if (to_write > bytes) to_write = bytes; /* * Calculate the number of bytes available in the attribute starting at * offset @attr_ofs up to a maximum of the number of bytes to be * written rounded up to a multiple of the system page size. */ bytes = (to_write + PAGE_MASK) & ~PAGE_MASK; /* Abort any pages outside the end of the attribute. */ if (size > bytes && !(flags & UPL_NOCOMMIT)) { ubc_upl_abort_range(upl, upl_ofs + bytes, size - bytes, UPL_ABORT_FREE_ON_EMPTY); /* Update @size. */ size = bytes; } /* To access the page list contents, we need to map the page list. */ kerr = ubc_upl_map(upl, (vm_offset_t*)&kaddr); if (kerr != KERN_SUCCESS) { ntfs_error(vol->mp, "ubc_upl_map() failed (error %d).", (int)kerr); err = EIO; goto err; } if (!NInoNonResident(ni)) { /* * Write the data from the page list into the resident * attribute in its mft record. */ err = ntfs_resident_attr_write(ni, kaddr + upl_ofs, to_write, attr_ofs); // TODO: If !err and synchronous i/o, write the mft record now. // This should probably happen in ntfs_resident_attr_write(). if (err && err != EAGAIN) ntfs_error(vol->mp, "ntfs_resident_attr_write() " "failed (error %d).", err); } else if (NInoCompressed(ni)) { ntfs_error(vol->mp, "Writing to compressed files is not " "implemented yet, sorry."); err = ENOTSUP; #if 0 ntfs_inode *raw_ni; int ioflags; /* * Get the raw inode and lock it for writing to protect against * concurrent readers and writers as the compressed data is * invalid whilst a write is in progress. 
*/ err = ntfs_raw_inode_get(ni, LCK_RW_TYPE_EXCLUSIVE, &raw_ni); if (err) ntfs_error(vol->mp, "Failed to get raw inode (error " "%d).", err); else { if (!NInoRaw(raw_ni)) panic("%s(): Requested raw inode but got " "non-raw one.\n", __FUNCTION__); ioflags = 0; if (vnode_isnocache(ni->vn) || vnode_isnocache(raw_ni->vn)) ioflags |= IO_NOCACHE; if (vnode_isnoreadahead(ni->vn) || vnode_isnoreadahead(raw_ni->vn)) ioflags |= IO_RAOFF; err = ntfs_write_compressed(ni, raw_ni, attr_ofs, size, kaddr + upl_ofs, NULL, ioflags); if (err) ntfs_error(vol->mp, "ntfs_write_compressed() " "failed (error %d).", err); lck_rw_unlock_exclusive(&raw_ni->lock); (void)vnode_put(raw_ni->vn); } #endif } else { /* * The attribute was converted to non-resident under our nose * we need to retry the pageout. * * TODO: This may no longer be possible to happen now that we * lock against changes in initialized size and thus * truncation... Revisit this issue when the write code has * been finished and replace this with a panic(). */ err = EAGAIN; } kerr = ubc_upl_unmap(upl); if (kerr != KERN_SUCCESS) { ntfs_error(vol->mp, "ubc_upl_unmap() failed (error %d).", (int)kerr); if (!err) err = EIO; } if (!err) { if (!(flags & UPL_NOCOMMIT)) { /* Commit the page range we wrote out. */ ubc_upl_commit_range(upl, upl_ofs, size, UPL_COMMIT_FREE_ON_EMPTY); } // TODO: If we wrote anything at all we have to clear the // setuid and setgid bits as a precaution against tampering // (see xnu/bsd/hfs/hfs_readwrite.c::hfs_vnop_pageout()). ntfs_debug("Done (%s).", !NInoNonResident(ni) ? "ntfs_resident_attr_write()" : "ntfs_write_compressed()"); } else /* if (err) */ { /* * If the attribute was converted to non-resident under our * nose, retry the pageout. * * TODO: This may no longer be possible to happen now that we * lock against changes in initialized size and thus * truncation... Revisit this issue when the write code has * been finished and remove the check and goto if appropriate. 
*/ if (err == EAGAIN) goto retry_pageout; err: /* * Common failure exit which is also reached by goto from the * checks further up in this function. Abort the pages unless * UPL_NOCOMMIT is set, log the error, and fall through to the * done label where the inode lock is dropped if we hold it. */ if (!(flags & UPL_NOCOMMIT)) ubc_upl_abort_range(upl, upl_ofs, size, UPL_ABORT_FREE_ON_EMPTY); ntfs_error(vol->mp, "Failed (error %d).", err); } done: // TODO: If we wrote anything at all we have to clear the setuid and // setgid bits as a precaution against tampering (see // xnu/bsd/hfs/hfs_readwrite.c::hfs_vnop_pageout()). /* * If this is not a directory or it is an encrypted directory, set the * needs archiving bit except for the core system files. */ if (!err && (!S_ISDIR(base_ni->mode) || NInoEncrypted(base_ni))) { BOOL need_set_archive_bit = TRUE; if (vol->major_ver > 1) { if (base_ni->mft_no <= FILE_Extend) need_set_archive_bit = FALSE; } else { if (base_ni->mft_no <= FILE_UpCase) need_set_archive_bit = FALSE; } if (need_set_archive_bit) { base_ni->file_attributes |= FILE_ATTR_ARCHIVE; NInoSetDirtyFileAttributes(base_ni); } } /* * Update the last_data_change_time (mtime) and last_mft_change_time * (ctime) on the base ntfs inode @base_ni but not on the core system * files. However do set it on the root directory. * * Do not update the times on symbolic links. 
*/ if (!err && !S_ISLNK(base_ni->mode)) { BOOL need_update_time = TRUE; if (vol->major_ver > 1) { if (base_ni->mft_no <= FILE_Extend && base_ni != vol->root_ni) need_update_time = FALSE; } else { if (base_ni->mft_no <= FILE_UpCase && base_ni != vol->root_ni) need_update_time = FALSE; } if (need_update_time) { base_ni->last_mft_change_time = base_ni->last_data_change_time = ntfs_utc_current_time(); NInoSetDirtyTimes(base_ni); } } /* * Drop @ni->lock but only if we took it ourselves above, using the * unlock call that matches the mode we took it in. */ if (locked) { if (lock_type == LCK_RW_TYPE_SHARED) lck_rw_unlock_shared(&ni->lock); else lck_rw_unlock_exclusive(&ni->lock); } return err; } /** * ntfs_vnop_searchfs - searchfs vnode operation (no NTFS specific code yet) * @a: arguments to searchfs function * * There is no NTFS specific implementation at present (see the TODO in the * function body). We simply pass the request on to err_searchfs() and return * whatever it returns. */ static int ntfs_vnop_searchfs(struct vnop_searchfs_args *a) { errno_t err; ntfs_debug("Entering."); // TODO: err = err_searchfs(a); ntfs_debug("Done (error %d).", (int)err); return err; } /** * ntfs_vnop_getxattr - get the data of an extended attribute of an ntfs inode * @a: arguments to getxattr function * * @a contains: * vnode_t a_vp; vnode whose extended attribute to get * char *a_name; name of extended attribute to get in utf8 * uio_t a_uio; destination in which to return the extended attribute * size_t *a_size; size of the extended attribute in bytes * int a_options; flags controlling how the attribute is obtained * vfs_context_t a_context; * * Get the named stream with the name @a->a_name (we map named streams 1:1 with * extended attributes for NTFS as the NTFS native EAs are useless) contained * in the vnode @a->a_vp and return its data in the destination specified by * @a->a_uio. * * If there was not enough space to return the whole extended attribute in the * destination @a->a_uio we return error ERANGE. The only exception to this is * the resource fork (@a->a_name is XATTR_RESOURCEFORK_NAME) for which we just * return up to uio_resid(@a->a_uio) bytes (or up to the end of the resource * fork if that is smaller). 
* * Note that uio_offset(@a->a_uio) must be zero except for the resource fork * where it can specify the offset into the resource fork at which to begin * returning the data. * * If @a->a_uio is NULL, do not return the data of the attribute and instead * return the current data size of the named stream in *@a->a_size. Note that * when @a->a_uio is not NULL @a->a_size is ignored as the size of the named * stream is implicitly returned in the @a->a_uio and it can be obtained by * taking the original buffer size and subtracting uio_resid(@a->a_uio) from * it. * * The flags in @a->a_options control how the attribute is obtained. The * following flags are currently defined in OS X kernel: * XATTR_NOFOLLOW - Do not follow symbolic links. * XATTR_CREATE - Set the value, fail if already exists (setxattr only). * XATTR_REPLACE - Set the value, fail if does not exist (setxattr only). * XATTR_NOSECURITY- Bypass authorization checking. * XATTR_NODEFAULT - Bypass default extended attribute file ('._' file). * * Return 0 on success and errno on error. */ static int ntfs_vnop_getxattr(struct vnop_getxattr_args *a) { s64 size; user_ssize_t start_count; off_t start_ofs; ntfs_inode *ani, *ni = NTFS_I(a->a_vp); const char *name = a->a_name; uio_t uio = a->a_uio; ntfs_volume *vol; ntfschar *ntfs_name; size_t ntfs_name_size; signed ntfs_name_len; errno_t err; ntfschar ntfs_name_buf[NTFS_MAX_ATTR_NAME_LEN]; if (!ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return EINVAL; } vol = ni->vol; /* Check for invalid names. */ if (!name || name[0] == '\0') return EINVAL; /* * Remember the starting offset and byte count so that a partial read * can be undone further down. * * NOTE(review): @uio may be NULL here when the caller only wants the * size (see the !uio tests below). This assumes uio_offset() and * uio_resid() tolerate a NULL uio - confirm. */ start_ofs = uio_offset(uio); start_count = uio_resid(uio); ntfs_debug("Entering for mft_no 0x%llx, extended attribute name %s, " "offset 0x%llx, size 0x%llx, options 0x%x.", (unsigned long long)ni->mft_no, name, start_ofs, start_count, a->a_options); lck_rw_lock_shared(&ni->lock); /* Do not allow messing with the inode once it has been deleted. */ if (NInoDeleted(ni)) { /* Remove the inode from the name cache. 
*/ cache_purge(ni->vn); ntfs_debug("Mft_no 0x%llx is deleted.", (unsigned long long)ni->mft_no); err = ENOENT; goto err; } /* * Only regular files, directories, and symbolic links can have * extended attributes. (Specifically named streams cannot have them.) * * Thus the check is for attribute inodes as all base inodes are * allowed. Raw inodes are also attribute inodes so they are excluded * automatically, too. */ if (NInoAttr(ni)) { ntfs_debug("Mft_no 0x%llx is an attribute inode.", (unsigned long long)ni->mft_no); err = EPERM; goto err; } /* * First of all deal with requests for the Finder info as that is * special because we cache it in the base ntfs inode @ni and we only * want to return it if the Finder info is non-zero. This is what HFS * does, too. * * Thus we need to check the status of the cache in the ntfs inode * first and if that is valid we can use it to check the content of the * Finder info for being zero. And if it is not valid then we need to * read it into the cache in the ntfs inode and then we can check the * Finder info in the cache for being zero. In fact we do this the * other way round, i.e. if the Finder info cache is not valid we read * the Finder info into the cache first and then the cache is * definitely valid thus we can check the Finder info for being * non-zero and the Finder info data if so. * * A further complication is in the event of symbolic links where we do * not return the type and creator and instead return zero for them as * that is what HFS+ does, too. * * FIXME: This comparison is case sensitive. 
*/ if (!bcmp(name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME))) { FINDER_INFO fi; if (!NInoValidFinderInfo(ni)) { if (!lck_rw_lock_shared_to_exclusive(&ni->lock)) { /* * The upgrade failed. Presumably the shared * lock was dropped in the process so take the * lock for exclusive access from scratch and * recheck that the inode was not deleted in * the meantime. */ lck_rw_lock_exclusive(&ni->lock); if (NInoDeleted(ni)) { cache_purge(ni->vn); lck_rw_unlock_exclusive(&ni->lock); ntfs_debug("Mft_no 0x%llx is deleted.", (unsigned long long) ni->mft_no); return ENOENT; } } /* * Load the AFP_AfpInfo stream and initialize the * backup time and Finder info (if they are not already * valid). */ err = ntfs_inode_afpinfo_read(ni); if (err) { ntfs_error(vol->mp, "Failed to obtain AfpInfo " "for mft_no 0x%llx (error %d).", (unsigned long long)ni->mft_no, err); lck_rw_unlock_exclusive(&ni->lock); return err; } /* * Go back to holding the lock shared so that the * common exit code at the err label, which does a * shared unlock, remains correct. */ lck_rw_lock_exclusive_to_shared(&ni->lock); if (!NInoValidFinderInfo(ni)) panic("%s(): !NInoValidFinderInfo(ni)\n", __FUNCTION__); } /* * Make a copy of the Finder info and mask out the hidden bit * if this is the root directory and the type and creator if * this is a symbolic link. */ memcpy(&fi, &ni->finder_info, sizeof(fi)); if (ni == vol->root_ni) fi.attrs &= ~FINDER_ATTR_IS_HIDDEN; if (S_ISLNK(ni->mode)) { fi.type = 0; fi.creator = 0; } /* If the Finder info is zero, pretend it does not exist. */ if (!bcmp(&fi, &ntfs_empty_finder_info, sizeof(ni->finder_info))) { ntfs_debug("Mft_no 0x%llx has zero Finder info, " "returning ENOATTR.", (unsigned long long)ni->mft_no); err = ENOATTR; goto err; } /* The Finder info is not zero, return it. 
*/ if (!uio) { *a->a_size = sizeof(FINDER_INFO); err = 0; } else if (start_ofs) err = EINVAL; else if (uio_resid(uio) < (user_ssize_t)sizeof(FINDER_INFO)) err = ERANGE; else { err = uiomove((caddr_t)&fi, sizeof(fi), uio); if (err) ntfs_error(vol->mp, "uiomove() failed (error " "%d).", err); } goto err; } /* * Now deal with requests for the resource fork as that is special * because on one hand we need to translate its name from * XATTR_RESOURCEFORK_NAME to AFP_Resource so we do not need to convert * the utf8 name @name to Unicode and on the other hand the offset * @start_ofs may be non-zero and the read may be only from a partial * region of the resource fork. * * FIXME: This comparison is case sensitive. */ if (!bcmp(name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME))) { ntfs_name = NTFS_SFM_RESOURCEFORK_NAME; /* Presumably the length of AFP_Resource in characters. */ ntfs_name_len = 12; } else { /* * The request is not for the resource fork (nor for the Finder * info). This means that the offset @start_ofs must be zero. */ if (start_ofs) { err = EINVAL; goto err; } /* Convert the requested name from utf8 to Unicode. */ ntfs_name = ntfs_name_buf; ntfs_name_size = sizeof(ntfs_name_buf); ntfs_name_len = utf8_to_ntfs(vol, (const u8*)name, strlen(name), &ntfs_name, &ntfs_name_size); if (ntfs_name_len < 0) { err = -ntfs_name_len; if (err == ENAMETOOLONG) ntfs_debug("Failed (name is too long)."); else ntfs_error(vol->mp, "Failed to convert name to " "Unicode (error %d).", err); goto err; } /* * If this is one of the SFM named streams, skip it, as they * contain effectively metadata information so should not be * exposed directly. 
*/ if (ntfs_is_sfm_name(vol, ntfs_name, ntfs_name_len)) { ntfs_debug("Not allowing access to protected SFM name " "(returning EINVAL)."); err = EINVAL; goto err; } } /* * We now have the name of the requested attribute in @ntfs_name and it * is @ntfs_name_len characters long and we have verified that the * start offset is zero (unless this is the resource fork in which case * a non-zero start offset is fine). * * Start by getting the ntfs inode for the $DATA:@ntfs_name attribute. */ err = ntfs_attr_inode_get(ni, AT_DATA, ntfs_name, ntfs_name_len, FALSE, LCK_RW_TYPE_SHARED, &ani); if (err) { if (err == ENOENT) err = ENOATTR; else if (err != ENOATTR) ntfs_error(vol->mp, "Failed to get $DATA/%s attribute " "inode mft_no 0x%llx (error %d).", name, (unsigned long long)ni->mft_no, err); goto err; } /* * TODO: This check may no longer be necessary now that we lock against * changes in initialized size and thus truncation... Revisit this * issue when the write code has been written and remove the check if * appropriate simply using ubc_getsize(ni->vn); without the size_lock. */ lck_spin_lock(&ani->size_lock); size = ubc_getsize(ani->vn); if (size > ani->data_size) size = ani->data_size; lck_spin_unlock(&ani->size_lock); if (!uio) *a->a_size = size; else if (ntfs_name != NTFS_SFM_RESOURCEFORK_NAME && start_count < size) { /* Partial reads are only allowed for the resource fork. */ err = ERANGE; } else { /* * Perform the actual read from the attribute inode. We pass * in IO_UNIT as we want an atomic i/o operation. * * FIXME: ntfs_read() currently ignores the IO_UNIT flag so we * still have to test for partial reads. */ err = ntfs_read(ani, uio, IO_UNIT, TRUE); /* * If the read was partial, reset @uio pretending that the read * never happened. This is because extended attribute i/o is * meant to be atomic, i.e. either we get it all or we do not * get anything. 
* * Note we also accept the case where uio_resid() has gone to * zero as this covers the exception of the resource fork for * which we do not need to return the whole resource fork in * one go. */ if (uio_resid(uio) && start_count - uio_resid(uio) != size - start_ofs) { /* * FIXME: Should we be trying to continue a partial * read in case we can complete it with multiple calls * to ntfs_read()? If we do that we could also drop * the IO_UNIT flag above. */ if (!err) { ntfs_debug("ntfs_read() returned a partial " "read, pretending the read " "never happened."); err = EIO; } uio_setoffset(uio, start_ofs); uio_setresid(uio, start_count); } } /* Release the attribute inode obtained by ntfs_attr_inode_get() above. */ lck_rw_unlock_shared(&ani->lock); (void)vnode_put(ani->vn); err: /* * Common exit. All paths jumping here hold @ni->lock shared (the * paths that still hold it exclusive unlock and return directly). */ lck_rw_unlock_shared(&ni->lock); ntfs_debug("Done (error %d).", err); return err; } /** * ntfs_vnop_setxattr - set the data of an extended attribute of an ntfs inode * @a: arguments to setxattr function * * @a contains: * vnode_t a_vp; vnode whose extended attribute to set * char *a_name; name of extented attribute to set in utf8 * uio_t a_uio; source data to which to set the exteneded attribute * int a_options; flags controlling how the attribute is set * vfs_context_t a_context; * * Get the named stream with the name @a->a_name (we map named streams 1:1 with * extended attributes for NTFS as the NTFS native EAs are useless) contained * in the vnode @a->a_vp and set its data to the source specified by @a->a_uio. * * If @a->a_options does not specify XATTR_CREATE nor XATTR_REPLACE the * attribute will be created if it does not exist already and if it exists * already the old value will be replaced with the new one, i.e. if the old * value does not have the same size as the new value the attribute is * truncated to the new size. * * If @a->a_options specifies XATTR_CREATE the call will fail if the attribute * already exists, i.e. the existing attribute will not be replaced. 
* * If @a->a_options specifies XATTR_REPLACE the call will fail if the attribute * does not exist, i.e. the new attribute will not be created. * * An exception is the resource fork (@a->a_name is XATTR_RESOURCEFORK_NAME) * for which we do not replace the existing attribute and instead we write over * the existing attribute starting at offset uio_offset(@a->a_uio) and writing * uio_resid(@a->a_uio) bytes. Writing past the end of the resource fork will * cause the resource fork to be extended just like a regular file write would * do but a write to any existing part of the attribute will not cause the * attribute to be shrunk. * * Simillar to other extended attributes, if @a->a_options specifies * XATTR_CREATE the call will fail if the resource fork already exists, i.e. * the write to the existing resource fork will be denied and if @a->a_options * specified XATTR_REPLACE the call will fail if the resource fork does not yet * exist, i.e. the new resource fork will not be created. * * Note that uio_offset(@a->a_uio) must be zero except for the resource fork * where it can specify the offset into the resource fork at which to begin * writing the data. * * The flags in @a->a_options control how the attribute is set. The following * flags are currently defined in OS X kernel: * XATTR_NOFOLLOW - Do not follow symbolic links. * XATTR_CREATE - Set the value, fail if already exists (setxattr only). * XATTR_REPLACE - Set the value, fail if does not exist (setxattr only). * XATTR_NOSECURITY- Bypass authorization checking. * XATTR_NODEFAULT - Bypass default extended attribute file ('._' file). * * Return 0 on success and errno on error. 
*/ static int ntfs_vnop_setxattr(struct vnop_setxattr_args *a) { s64 size; user_ssize_t start_count; off_t start_ofs; ntfs_inode *ani, *ni = NTFS_I(a->a_vp); ntfs_volume *vol; const char *name = a->a_name; uio_t uio = a->a_uio; ntfschar *ntfs_name; size_t ntfs_name_size; signed ntfs_name_len; const int options = a->a_options; errno_t err; ntfschar ntfs_name_buf[NTFS_MAX_ATTR_NAME_LEN]; if (!ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return EINVAL; } vol = ni->vol; /* Check for invalid names. */ if (!name || name[0] == '\0') return EINVAL; start_ofs = uio_offset(uio); start_count = uio_resid(uio); ntfs_debug("Entering for mft_no 0x%llx, extended attribute name %s, " "offset 0x%llx, size 0x%llx, options 0x%x.", (unsigned long long)ni->mft_no, name, start_ofs, start_count, options); /* * Access to extended attributes must be atomic which we ensure by * locking the base ntfs inode for writing. */ lck_rw_lock_exclusive(&ni->lock); /* Do not allow messing with the inode once it has been deleted. */ if (NInoDeleted(ni)) { /* Remove the inode from the name cache. */ cache_purge(ni->vn); ntfs_debug("Mft_no 0x%llx is deleted.", (unsigned long long)ni->mft_no); err = ENOENT; goto err; } /* * Only regular files, directories, and symbolic links can have * extended attributes. (Specifically named streams cannot have them.) * * Thus the check is for attribute inodes as all base inodes are * allowed. Raw inodes are also attribute inodes so they are excluded * automatically, too. */ if (NInoAttr(ni)) { ntfs_debug("Mft_no 0x%llx is an attribute inode.", (unsigned long long)ni->mft_no); err = EPERM; goto err; } /* * XATTR_CREATE and XATTR_REPLACE may not be specified at the same time * or weird things would happen so test for and abort this case here. 
*/ if ((options & (XATTR_CREATE | XATTR_REPLACE)) == (XATTR_CREATE | XATTR_REPLACE)) { ntfs_debug("Either XATTR_CREATE or XATTR_REPLACE but not both " "may be specified."); err = EINVAL; goto err; } /* * First of all deal with requests to set the Finder info as that is * special because we cache it in the base ntfs inode @ni thus we need * to copy the new Finder info into the cache and then write the * changes out to the AFP_AfpInfo attribute (creating it if it did not * exist before). * * The only exception to the above description is when the XATTR_CREATE * or XATTR_REPLACE flags are set in @options in which case we need to * know whether the Finder info extists already or not and thus if the * Finder info cache is not valid we need to make it valid first and * then we can check it against being zero to determine whether the * Finder info exists already or not and then we know whether or not to * proceed with setting the Finder info. * * FIXME: This comparison is case sensitive. */ if (!bcmp(name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME))) { FINDER_INFO fi; if (start_count != sizeof(ni->finder_info)) { ntfs_debug("Number of bytes to write (%lld) does not " "equal Finder info size (%ld), " "returning ERANGE.", (unsigned long long)start_count, sizeof(ni->finder_info)); err = ERANGE; goto err; } /* * If @options does not specify XATTR_CREATE nor XATTR_REPLACE * there is no need to bring the Finder info up-to-date before * the write. */ if (options & (XATTR_CREATE | XATTR_REPLACE)) { if (!NInoValidFinderInfo(ni)) { /* * Load the AFP_AfpInfo stream and initialize * the backup time and Finder info (at least * the Finder info is not yet valid). 
*/ err = ntfs_inode_afpinfo_read(ni); if (err) { ntfs_error(vol->mp, "Failed to obtain " "AfpInfo for mft_no " "0x%llx (error %d).", (unsigned long long) ni->mft_no, err); goto err; } if (!NInoValidFinderInfo(ni)) panic("%s(): !NInoValidFinderInfo(ni)" "\n", __FUNCTION__); } /* * Make a copy of the Finder info and mask out the * hidden bit if this is the root directory and the * type and creator if this is a symbolic link. */ memcpy(&fi, &ni->finder_info, sizeof(fi)); if (ni == vol->root_ni) fi.attrs &= ~FINDER_ATTR_IS_HIDDEN; if (S_ISLNK(ni->mode)) { fi.type = 0; fi.creator = 0; } if (bcmp(&ni->finder_info, &ntfs_empty_finder_info, sizeof(ni->finder_info))) { /* * Finder info is non-zero, i.e. it exists, and * XATTR_CREATE was specified. */ if (options & XATTR_CREATE) { ntfs_debug("Mft_no 0x%llx has " "non-zero Finder info " "and XATTR_CREATE was " "specified, returning " "EEXIST.", (unsigned long long) ni->mft_no); err = EEXIST; goto err; } } else { /* * Finder info is zero, i.e. it does not exist, * and XATTR_REPLACE was specified. */ if (options & XATTR_REPLACE) { ntfs_debug("Mft_no 0x%llx has zero " "Finder info and " "XATTR_REPLACE was " "specified, returning " "ENOATTR.", (unsigned long long) ni->mft_no); err = ENOATTR; goto err; } } } /* Copy the new Finder info value to our buffer. */ err = uiomove((caddr_t)&fi, sizeof(fi), uio); if (!err) { /* * Set the Finder info to the new value after masking * out the hidden bit if this is the root directory and * enforcing the type and creator if this is a symbolic * link to be our private values for symbolic links. */ if (ni == vol->root_ni) fi.attrs &= ~FINDER_ATTR_IS_HIDDEN; if (S_ISLNK(ni->mode)) { fi.type = FINDER_TYPE_SYMBOLIC_LINK; fi.creator = FINDER_CREATOR_SYMBOLIC_LINK; } memcpy((u8*)&ni->finder_info, (u8*)&fi, sizeof(fi)); NInoSetValidFinderInfo(ni); NInoSetDirtyFinderInfo(ni); /* * If the file is not hidden but the Finder info hidden * bit is being set, we need to cause the file to be * hidden, i.e. 
we need to set the FILE_ATTR_HIDDEN bit * in the file_attributes of the $STANDARD_INFORMATION * attribute. */ if (fi.attrs & FINDER_ATTR_IS_HIDDEN && !(ni->file_attributes & FILE_ATTR_HIDDEN)) { ni->file_attributes |= FILE_ATTR_HIDDEN; NInoSetDirtyFileAttributes(ni); } /* * Updating the Finder info causes both the * last_data_change_time (mtime) and * last_mft_change_time (ctime) to be updated. */ ni->last_mft_change_time = ni->last_data_change_time = ntfs_utc_current_time(); NInoSetDirtyTimes(ni); /* * Now write (if needed creating) the AFP_AfpInfo * attribute with the specified Finder Info. */ err = ntfs_inode_afpinfo_write(ni); if (err) ntfs_error(vol->mp, "Failed to write/create " "AFP_AfpInfo attribute in " "inode 0x%llx (error %d).", (unsigned long long)ni->mft_no, err); } else ntfs_error(vol->mp, "uiomove() failed (error %d).", err); goto err; } /* * Now deal with requests to write to the resource fork as that is * special because on one hand we need to translate its name from * XATTR_RESOURCEFORK_NAME to AFP_Resource so we do not need to convert * the utf8 name @name to Unicode and on the other hand the offset * @start_ofs may be non-zero, the write may be only to a partial * region of the resource fork, and the write may not shrink the * resource fork though it may extend it. * * FIXME: This comparison is case sensitive. */ if (!bcmp(name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME))) { ntfs_name = NTFS_SFM_RESOURCEFORK_NAME; ntfs_name_len = 12; } else { /* * The request is not for the resource fork (nor for the Finder * info). This means that the offset @start_ofs must be zero. */ if (start_ofs) { err = EINVAL; goto err; } /* Convert the requested name from utf8 to Unicode. 
*/ ntfs_name = ntfs_name_buf; ntfs_name_size = sizeof(ntfs_name_buf); ntfs_name_len = utf8_to_ntfs(vol, (const u8*)name, strlen(name), &ntfs_name, &ntfs_name_size); if (ntfs_name_len < 0) { err = -ntfs_name_len; if (err == ENAMETOOLONG) ntfs_debug("Failed (name is too long)."); else ntfs_error(vol->mp, "Failed to convert name to " "Unicode (error %d).", err); goto err; } /* * If this is one of the SFM named streams, skip it, as they * contain effectively metadata information so should not be * exposed directly. */ if (ntfs_is_sfm_name(vol, ntfs_name, ntfs_name_len)) { ntfs_debug("Not allowing access to protected SFM name " "(returning EINVAL)."); err = EINVAL; goto err; } } /* * We now have the name of the requested attribute in @ntfs_name and it * is @ntfs_name_len characters long and we have verified that the * start offset is zero (unless this is the resource fork in which case * a non-zero start offset is fine). * * Get the ntfs attribute inode of the $DATA:@ntfs_name attribute * (unless XATTR_CREATE is specified in @options) and if it does not * exist create it first (unless XATTR_REPLACE is specified in * @options). */ err = ntfs_attr_inode_get_or_create(ni, AT_DATA, ntfs_name, ntfs_name_len, FALSE, FALSE, options, LCK_RW_TYPE_EXCLUSIVE, &ani); if (err) { if (err == ENOENT) err = ENOATTR; else if (err != ENOATTR && err != EEXIST) ntfs_error(vol->mp, "Failed to get or create $DATA/%s " "attribute inode mft_no 0x%llx (error " "%d).", name, (unsigned long long)ni->mft_no, err); goto err; } /* * TODO: This check may no longer be necessary now that we lock against * changes in initialized size and thus truncation... Revisit this * issue when the write code has been written and remove the check if * appropriate simply using ubc_getsize(ni->vn); without the size_lock. */ lck_spin_lock(&ani->size_lock); size = ubc_getsize(ani->vn); if (size > ani->data_size) size = ani->data_size; lck_spin_unlock(&ani->size_lock); /* * Perform the actual write to the attribute inode. 
We pass in IO_UNIT * as we want an atomic i/o operation. * * FIXME: ntfs_write() does not always honour the IO_UNIT flag so we * still have to test for partial writes. */ err = ntfs_write(ani, uio, IO_UNIT, TRUE); /* * If the write was successful, need to shrink the attribute if the new * size is smaller than the old size. * * If the write was partial or failed, reset @uio pretending that the * write never happened. This is because extended attribute i/o is * meant to be atomic, i.e. either we get it all or we do not get * anything. * * In the partial/failed case, if @options specifies XATTR_REPLACE we * know the extended attribute existed already thus we truncate it to * zero size to simulate that the old value has been replaced. And if * @options specifies XATTR_CREATE we know we created the extended * attribute thus we delete it again. And if @options does not specify * XATTR_REPLACE nor XATTR_CREATE then we do not know whether we * created it or not and in this case we assume the caller does not * care so we delete it to conserve disk space. */ if (!err && !uio_resid(uio)) { /* * Shrink the attribute if the new value is smaller than the * old value. We do not do this for the resource fork as that * is a special case. */ if (ntfs_name != NTFS_SFM_RESOURCEFORK_NAME) { if (size > start_count) { err = ntfs_attr_resize(ani, start_count, 0, NULL); if (err) { ntfs_error(vol->mp, "Failed to resize " "extended attribute " "to its new size " "(error %d).", err); goto undo_err; } } } } else { /* * FIXME: Should we be trying to continue a partial write in * case we can complete it with multiple calls to ntfs_write()? 
*/ if (!err) { ntfs_debug("ntfs_write() returned a partial write, " "pretending the write never happened " "and removing or truncating to zero " "size the old attribute value."); err = EIO; } undo_err: uio_setoffset(uio, start_ofs); uio_setresid(uio, start_count); if (options & XATTR_REPLACE) { errno_t err2; err2 = ntfs_attr_resize(ani, 0, 0, NULL); if (err2) { ntfs_error(vol->mp, "Failed to truncate " "extended attribute to zero " "size in error code path " "(error %d), attempting to " "delete it instead.", err2); goto rm_err; } } else { rm_err: /* * Unlink the named stream. The last close will cause * the VFS to call ntfs_vnop_inactive() which will do * the actual removal. */ ani->link_count = 0; /* * Update the last_mft_change_time (ctime) in the inode * as named stream/extended attribute semantics expect * on OS X. */ ni->last_mft_change_time = ntfs_utc_current_time(); NInoSetDirtyTimes(ni); /* * If this is not a directory or it is an encrypted * directory, set the needs archiving bit except for * the core system files. 
*/ if (!S_ISDIR(ni->mode) || NInoEncrypted(ni)) { BOOL need_set_archive_bit = TRUE; if (ni->vol->major_ver >= 2) { if (ni->mft_no <= FILE_Extend) need_set_archive_bit = FALSE; } else { if (ni->mft_no <= FILE_UpCase) need_set_archive_bit = FALSE; } if (need_set_archive_bit) { ni->file_attributes |= FILE_ATTR_ARCHIVE; NInoSetDirtyFileAttributes(ni); } } } } lck_rw_unlock_exclusive(&ani->lock); (void)vnode_put(ani->vn); err: lck_rw_unlock_exclusive(&ni->lock); ntfs_debug("Done (error %d).", err); return err; } /** * ntfs_vnop_removexattr - remove an extended attribute from an ntfs inode * @a: arguments to removexattr function * * @a contains: * vnode_t a_vp; vnode whose extended attribute to remove * char *a_name; name of extented attribute to remove in utf8 * int a_options; flags controlling how the attribute is removed * vfs_context_t a_context; * * Remove the named stream with the name @a->a_name (we map named streams 1:1 * with extended attributes for NTFS as the NTFS native EAs are useless) from * the vnode @a->a_vp. * * The flags in @a->a_options control how the attribute is set. The following * flags are currently defined in OS X kernel: * XATTR_NOFOLLOW - Do not follow symbolic links. * XATTR_CREATE - Set the value, fail if already exists (setxattr only). * XATTR_REPLACE - Set the value, fail if does not exist (setxattr only). * XATTR_NOSECURITY- Bypass authorization checking. * XATTR_NODEFAULT - Bypass default extended attribute file ('._' file). * * Return 0 on success and errno on error. */ static int ntfs_vnop_removexattr(struct vnop_removexattr_args *a) { ntfs_inode *ani, *ni = NTFS_I(a->a_vp); const char *name = a->a_name; ntfs_volume *vol; ntfschar *ntfs_name; size_t ntfs_name_size; signed ntfs_name_len; errno_t err; ntfschar ntfs_name_buf[NTFS_MAX_ATTR_NAME_LEN]; if (!ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return EINVAL; } vol = ni->vol; /* Check for invalid names. 
*/ if (!name || name[0] == '\0') return EINVAL; ntfs_debug("Entering for mft_no 0x%llx, extended attribute name %s, " "options 0x%x.", (unsigned long long)ni->mft_no, name, a->a_options); /* * Access to extended attributes must be atomic which we ensure by * locking the base ntfs inode for writing. */ lck_rw_lock_exclusive(&ni->lock); /* Do not allow messing with the inode once it has been deleted. */ if (NInoDeleted(ni)) { /* Remove the inode from the name cache. */ cache_purge(ni->vn); ntfs_debug("Mft_no 0x%llx is deleted.", (unsigned long long)ni->mft_no); err = ENOENT; goto err; } /* * Only regular files, directories, and symbolic links can have * extended attributes. (Specifically named streams cannot have them.) * * Thus the check is for attribute inodes as all base inodes are * allowed. Raw inodes are also attribute inodes so they are excluded * automatically, too. */ if (NInoAttr(ni)) { ntfs_debug("Mft_no 0x%llx is an attribute inode.", (unsigned long long)ni->mft_no); err = EPERM; goto err; } /* * First of all deal with requests to remove the Finder info as that is * special because we cache it in the base ntfs inode @ni thus we need * to zero the cached Finder info and then write the changes out to the * AFP_AfpInfo attribute (deleting it if it is no longer needed). This * is sufficient as a zero Finder info is treated the same as * non-existent Finder info and vice versa. * * Note if the Finder info is already zero it does not exist thus we * need to return ENOATTR instead thus we may need to load the Finder * info first to find out whether it is zero or not. * * FIXME: This comparison is case sensitive. */ if (!bcmp(name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME))) { FINDER_INFO fi; if (!NInoValidFinderInfo(ni)) { /* * Load the AFP_AfpInfo stream and initialize the * backup time and Finder info (at least the Finder * info is not yet valid). 
*/ err = ntfs_inode_afpinfo_read(ni); if (err) { ntfs_error(vol->mp, "Failed to obtain AfpInfo " "for mft_no 0x%llx (error %d).", (unsigned long long)ni->mft_no, err); goto err; } if (!NInoValidFinderInfo(ni)) panic("%s(): !NInoValidFinderInfo(ni)\n", __FUNCTION__); } /* * Make a copy of the Finder info and mask out the hidden bit * if this is the root directory and the type and creator if * this is a symbolic link. */ memcpy(&fi, &ni->finder_info, sizeof(fi)); if (ni == vol->root_ni) fi.attrs &= ~FINDER_ATTR_IS_HIDDEN; if (S_ISLNK(ni->mode)) { fi.type = 0; fi.creator = 0; } if (!bcmp(&fi, &ntfs_empty_finder_info, sizeof(fi))) { /* Finder info is zero, i.e. it does not exist. */ ntfs_debug("Mft_no 0x%llx has zero Finder info, " "returning ENOATTR.", (unsigned long long)ni->mft_no); err = ENOATTR; goto err; } /* Zero the Finder info. */ bzero(&ni->finder_info, sizeof(ni->finder_info)); /* * If the file is hidden, we need to reflect this fact in the * Finder info, too. */ if (ni->file_attributes & FILE_ATTR_HIDDEN) ni->finder_info.attrs |= FINDER_ATTR_IS_HIDDEN; /* * Also, enforce the type and creator if this is a symbolic * link to be our private values for symbolic links. This in * fact causes the Finder info not to be deleted on disk and we * cannot allow that to happen as we would then no longer know * that this is a symbolic link. */ if (S_ISLNK(ni->mode)) { ni->finder_info.type = FINDER_TYPE_SYMBOLIC_LINK; ni->finder_info.creator = FINDER_CREATOR_SYMBOLIC_LINK; } NInoSetValidFinderInfo(ni); NInoSetDirtyFinderInfo(ni); /* * Updating the Finder info causes both the * last_data_change_time (mtime) and last_mft_change_time * (ctime) to be updated. */ ni->last_mft_change_time = ni->last_data_change_time = ntfs_utc_current_time(); NInoSetDirtyTimes(ni); /* Now write (if needed deleting) the AFP_AfpInfo attribute. 
*/ err = ntfs_inode_afpinfo_write(ni); if (!err) ntfs_debug("Deleted Finder info from mft_no 0x%llx.", (unsigned long long)ni->mft_no); else ntfs_error(vol->mp, "Failed to write/delete " "AFP_AfpInfo attribute in inode " "0x%llx (error %d).", (unsigned long long)ni->mft_no, err); goto err; } /* * Now deal with requests to remove the resource fork as that is * special because we need to translate its name from * XATTR_RESOURCEFORK_NAME to AFP_Resource so we do not need to convert * the utf8 name @name to Unicode. * * FIXME: This comparison is case sensitive. */ if (!bcmp(name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME))) { ntfs_name = NTFS_SFM_RESOURCEFORK_NAME; ntfs_name_len = 12; } else { /* * The request is not for the resource fork (nor for the Finder * info). * * Convert the requested name from utf8 to Unicode. */ ntfs_name = ntfs_name_buf; ntfs_name_size = sizeof(ntfs_name_buf); ntfs_name_len = utf8_to_ntfs(vol, (const u8*)name, strlen(name), &ntfs_name, &ntfs_name_size); if (ntfs_name_len < 0) { err = -ntfs_name_len; if (err == ENAMETOOLONG) ntfs_debug("Failed (name is too long)."); else ntfs_error(vol->mp, "Failed to convert name to " "Unicode (error %d).", err); goto err; } /* * If this is one of the SFM named streams, skip it, as they * contain effectively metadata information so should not be * exposed directly. */ if (ntfs_is_sfm_name(vol, ntfs_name, ntfs_name_len)) { ntfs_debug("Not allowing access to protected SFM name " "%s in mft_no 0x%llx (returning " "EINVAL).", name, (unsigned long long)ni->mft_no); err = EINVAL; goto err; } } /* * We now have the name of the requested attribute in @ntfs_name and it * is @ntfs_name_len characters long. * * Get the ntfs attribute inode of the $DATA:@ntfs_name attribute. 
*/ err = ntfs_attr_inode_get(ni, AT_DATA, ntfs_name, ntfs_name_len, FALSE, LCK_RW_TYPE_EXCLUSIVE, &ani); if (err) { if (err == ENOENT) err = ENOATTR; else if (err != ENOATTR) ntfs_error(vol->mp, "Failed to get $DATA/%s attribute " "inode mft_no 0x%llx (error %d).", name, (unsigned long long)ni->mft_no, err); goto err; } /* * Unlink the named stream. The last close will cause the VFS to call * ntfs_vnop_inactive() which will do the actual removal. */ ani->link_count = 0; /* * Update the last_mft_change_time (ctime) in the inode as named * stream/extended attribute semantics expect on OS X. */ ni->last_mft_change_time = ntfs_utc_current_time(); NInoSetDirtyTimes(ni); /* * If this is not a directory or it is an encrypted directory, set the * needs archiving bit except for the core system files. */ if (!S_ISDIR(ni->mode) || NInoEncrypted(ni)) { BOOL need_set_archive_bit = TRUE; if (ni->vol->major_ver >= 2) { if (ni->mft_no <= FILE_Extend) need_set_archive_bit = FALSE; } else { if (ni->mft_no <= FILE_UpCase) need_set_archive_bit = FALSE; } if (need_set_archive_bit) { ni->file_attributes |= FILE_ATTR_ARCHIVE; NInoSetDirtyFileAttributes(ni); } } ntfs_debug("Done."); lck_rw_unlock_exclusive(&ani->lock); (void)vnode_put(ani->vn); err: lck_rw_unlock_exclusive(&ni->lock); return err; } /** * ntfs_vnop_listxattr - list the names of the extended attributes of an inode * @args: arguments to listxattr function * * @args contains: * vnode_t a_vp; vnode whose extended attributes to list * uio_t a_uio; destination in which to return the list * size_t *a_size; size of the list of extended attributes in bytes * int a_options; flags controlling how the attribute list is generated * vfs_context_t a_context; * * Iterate over the list of named streams (which we map 1:1 with extended * attributes for NTFS as the NTFS native EAs are useless) in the vnode * @args->a_vp and for each encountered stream copy its name (converted to an * NULL-terminated utf8 string) to the destination as 
specified by * @args->a_uio. * * If @args->a_uio is NULL, do not copy anything and simply iterate over all * named streams and add up the number of bytes needed to create a full list of * their names and return that in *@args->a_size. Note that when @args->a_uio * is not NULL @args->a_size is ignored as the number of bytes is implicitly * returned in the @args->a_uio and it can be obtained by taking the original * buffer size and subtracting uio_resid(@args->a_uio) from it. * * The flags in @args->a_options control how the attribute list is generated. * The following flags are currently defined in OS X kernel: * XATTR_NOFOLLOW - Do not follow symbolic links. * XATTR_CREATE - Set the value, fail if already exists (setxattr only). * XATTR_REPLACE - Set the value, fail if does not exist (setxattr only). * XATTR_NOSECURITY- Bypass authorization checking. * XATTR_NODEFAULT - Bypass default extended attribute file ('._' file). * * Return 0 on success and errno on error. */ static int ntfs_vnop_listxattr(struct vnop_listxattr_args *args) { ntfs_inode *ni = NTFS_I(args->a_vp); uio_t uio = args->a_uio; ntfs_volume *vol; MFT_RECORD *m; ntfs_attr_search_ctx *ctx; u8 *utf8_name; ntfschar *upcase; unsigned upcase_len; size_t size, utf8_size; errno_t err; BOOL case_sensitive; FINDER_INFO fi; if (!ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return EINVAL; } vol = ni->vol; upcase = vol->upcase; upcase_len = vol->upcase_len; case_sensitive = NVolCaseSensitive(vol); ntfs_debug("Entering."); lck_rw_lock_shared(&ni->lock); /* Do not allow messing with the inode once it has been deleted. */ if (NInoDeleted(ni)) { /* Remove the inode from the name cache. */ cache_purge(ni->vn); ntfs_debug("Mft_no 0x%llx is deleted.", (unsigned long long)ni->mft_no); err = ENOENT; goto err; } /* * Only regular files, directories, and symbolic links can have * extended attributes. (Specifically named streams cannot have them.) 
* * Thus the check is for attribute inodes as all base inodes are * allowed. Raw inodes are also attribute inodes so they are excluded * automatically, too. */ if (NInoAttr(ni)) { ntfs_debug("Mft_no 0x%llx is an attribute inode.", (unsigned long long)ni->mft_no); err = EPERM; goto err; } size = 0; /* * First of all deal with the Finder info as that is special because we * cache it in the base ntfs inode @ni and we only want to export the * name for the Finder info, XATTR_FINDERINFO_NAME, if the Finder info * is non-zero. This is what HFS does, too. * * Thus we need to check the status of the cache in the ntfs inode * first and if that it valid we can use it to check the content of the * Finder info for being zero. And if it is not valid then it must be * non-resident in which case we need to read it into the cache in the * ntfs inode and then we can check the Finder info in the cache for * being zero. In fact we do this the other way round, i.e. if the * Finder info cache is not valid we read the Finder info into the * cache first and then the cache is definitely valid thus we can check * the Finder info for being non-zero and export XATTR_FINDERINFO_NAME * if so. */ if (!NInoValidFinderInfo(ni)) { if (!lck_rw_lock_shared_to_exclusive(&ni->lock)) { lck_rw_lock_exclusive(&ni->lock); if (NInoDeleted(ni)) { cache_purge(ni->vn); lck_rw_unlock_exclusive(&ni->lock); ntfs_debug("Mft_no 0x%llx is deleted.", (unsigned long long)ni->mft_no); return ENOENT; } } /* * Load the AFP_AfpInfo stream and initialize the backup time * and Finder info (if they are not already valid). 
*/ err = ntfs_inode_afpinfo_read(ni); if (err) { ntfs_error(vol->mp, "Failed to obtain AfpInfo for " "mft_no 0x%llx (error %d).", (unsigned long long)ni->mft_no, err); lck_rw_unlock_exclusive(&ni->lock); return err; } if (!NInoValidFinderInfo(ni)) panic("%s(): !NInoValidFinderInfo(ni)\n", __FUNCTION__); lck_rw_lock_exclusive_to_shared(&ni->lock); } /* * Make a copy of the Finder info and mask out the hidden bit if this * is the root directory and the type and creator if this is a symbolic * link. */ memcpy(&fi, &ni->finder_info, sizeof(fi)); if (ni == vol->root_ni) fi.attrs &= ~FINDER_ATTR_IS_HIDDEN; if (S_ISLNK(ni->mode)) { fi.type = 0; fi.creator = 0; } if (bcmp(&fi, &ntfs_empty_finder_info, sizeof(fi))) { if (!uio) size += sizeof(XATTR_FINDERINFO_NAME); else if (uio_resid(uio) < (user_ssize_t)sizeof(XATTR_FINDERINFO_NAME)) { err = ERANGE; goto err; } else { err = uiomove((caddr_t)XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME), uio); if (err) { ntfs_error(vol->mp, "uiomove() failed (error " "%d).", err); goto err; } } ntfs_debug("Exporting Finder info name %s.", XATTR_FINDERINFO_NAME); } /* Iterate over all the named $DATA attributes. */ err = ntfs_mft_record_map(ni, &m); if (err) { ntfs_error(vol->mp, "Failed to map mft record (error %d).", err); goto err; } ctx = ntfs_attr_search_ctx_get(ni, m); if (!ctx) { ntfs_error(vol->mp, "Failed to allocate search context."); err = ENOMEM; goto unm_err; } /* * Allocate a buffer we can use when converting the names of the named * $DATA attributes to utf8. We want enough space to definitely be * able to convert the name as well as a byte for the NULL terminator. */ utf8_size = NTFS_MAX_ATTR_NAME_LEN * 4 + 1; utf8_name = OSMalloc(utf8_size, ntfs_malloc_tag); if (!utf8_name) { ntfs_error(vol->mp, "Failed to allocate name buffer."); err = ENOMEM; goto put_err; } do { ntfs_inode *ani; ATTR_RECORD *a; ntfschar *name; unsigned name_len; signed utf8_len; /* Get the next $DATA attribute. 
*/ err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, NULL, 0, ctx); if (err) { if (err == ENOENT) { err = 0; break; } ntfs_error(vol->mp, "Failed to iterate over named " "$DATA attributes (error %d).", err); goto free_err; } /* Got the next attribute, deal with it. */ a = ctx->a; /* If this is the unnamed $DATA attribute, skip it. */ if (!a->name_length) { ntfs_debug("Skipping unnamed $DATA attribute."); continue; } name = (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)); name_len = a->name_length; if ((u8*)name < (u8*)a || (u8*)name + name_len > (u8*)a + le32_to_cpu(a->length)) { ntfs_error(vol->mp, "Found corrupt named $DATA " "attribute. Run chkdsk."); NVolSetErrors(vol); err = EIO; goto free_err; } /* * Check if this attribute currently has a cached inode/vnode * and if so check if it has been unlinked/deleted and if so * skip it. */ err = ntfs_attr_inode_lookup(ni, a->type, name, name_len, FALSE, &ani); if (err != ENOENT) { BOOL skip_it; if (err) panic("%s() inode lookup failed (error %d).\n", __FUNCTION__, err); /* Got the cached attribute inode. */ skip_it = FALSE; if (NInoDeleted(ani) || !ani->link_count || (ntfs_are_names_equal(name, name_len, NTFS_SFM_RESOURCEFORK_NAME, 12, case_sensitive, upcase, upcase_len) && !ubc_getsize(ani->vn))) skip_it = TRUE; if (skip_it) { if (NInoDeleted(ani) || !ani->link_count) ntfs_debug("Skipping deleted/unlinked " "attribute."); else ntfs_debug("Mft_no 0x%llx has zero " "size resource fork, " "pretending it does " "not exist.", (unsigned long long) ani->mft_no); (void)vnode_put(ani->vn); continue; } (void)vnode_put(ani->vn); } /* * If AFP_Resource named stream exists, i.e. the resource fork * is present, and it is non-empty export the name * XATTR_RESOURCEFORK_NAME. This is what HFS does, too. 
*/ if (ntfs_are_names_equal(name, name_len, NTFS_SFM_RESOURCEFORK_NAME, 12, case_sensitive, upcase, upcase_len)) { if (!ntfs_attr_size(a)) { ntfs_debug("Skipping empty resource fork " "name %s.", XATTR_RESOURCEFORK_NAME); continue; } if (!uio) size += sizeof(XATTR_RESOURCEFORK_NAME); else if (uio_resid(uio) < (user_ssize_t)sizeof( XATTR_RESOURCEFORK_NAME)) { err = ERANGE; goto free_err; } else { err = uiomove((caddr_t)XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME), uio); if (err) { ntfs_error(vol->mp, "uiomove() failed " "(error %d).", err); goto free_err; } } ntfs_debug("Exporting resource fork name %s.", XATTR_RESOURCEFORK_NAME); continue; } /* * If this is one of the SFM named streams, skip it, as they * contain effectively metadata information so should not be * exposed directly. */ if (ntfs_is_sfm_name(vol, name, name_len)) { ntfs_debug("Skipping protected SFM name."); continue; } /* Convert the name to utf8. */ utf8_len = ntfs_to_utf8(vol, name, name_len << NTFSCHAR_SIZE_SHIFT, &utf8_name, &utf8_size); if (utf8_len < 0) { ntfs_warning(vol->mp, "Skipping unrepresentable name " "in mft_no 0x%llx (error %d).", (unsigned long long)ni->mft_no, -utf8_len); continue; } /* * If this is a protected attribute, skip it. * * FIXME: xattr_protected() is case sensitive so it does not * exclude protected attributes when they are not correctly * cased on disk. * * However we do call it to be consistent with HFS and SMB but * it is pointless as anyone can call getxattr() for a case * variant and the getxattr() system call would use * xattr_protected() which would not filter it out so the * VNOP_GETXATTR() call would happen and we would return the * attribute just fine. Simillarly anyone could set and remove * such "protected" attributes by just calling the system call * with a case variant even when they are correctly filtered * out here. 
*/ if (xattr_protected((char*)utf8_name)) { ntfs_debug("Skipping protected name %.*s.", utf8_len, utf8_name); continue; } /* * Increment the length of the name by one for the NULL * terminator. */ utf8_len++; /* Export the utf8_name. */ if (!uio) size += utf8_len; else if (uio_resid(uio) < utf8_len) { err = ERANGE; goto free_err; } else { err = uiomove((caddr_t)utf8_name, utf8_len, uio); if (err) { ntfs_error(vol->mp, "uiomove() failed (error " "%d).", err); goto free_err; } } ntfs_debug("Exporting name %.*s.", utf8_len, utf8_name); /* Continue to the next name. */ } while (1); if (!uio) *args->a_size = size; ntfs_debug("Done."); free_err: OSFree(utf8_name, utf8_size, ntfs_malloc_tag); put_err: ntfs_attr_search_ctx_put(ctx); unm_err: ntfs_mft_record_unmap(ni); err: lck_rw_unlock_shared(&ni->lock); return err; } /** * ntfs_vnop_blktooff - map a logical block number to its byte offset * @a: arguments to blktooff function * * @a contains: * vnode_t a_vp; vnode to which the logical block number belongs * daddr64_t a_lblkno; logical block number to map * off_t *a_offset; destination for returning the result * * Map the logical block number @a->a_lblkno belonging to the vnode @a->a_vp to * the corresponding byte offset, i.e. the offset in the vnode in bytes and * return the result in @a->a_offset. * * Return 0 on success and EINVAL if no vnode was specified in @a->a_vp. 
*/
/*
 * Implementation note for ntfs_vnop_blktooff(): EINVAL is also returned when
 * the vnode has no ntfs inode attached or when the vnode is a directory.  The
 * block size used for the conversion is PAGE_SIZE for everything except
 * $MFT/$DATA and $MFTMirr/$DATA for which it is the mft record size.
 */
static int ntfs_vnop_blktooff(struct vnop_blktooff_args *a)
{
	vnode_t vn = a->a_vp;
	ntfs_inode *ni;
	ntfs_volume *vol;
	unsigned shift;
	BOOL is_mft;

	if (!vn) {
		ntfs_warning(NULL, "Called with NULL vnode!");
		return EINVAL;
	}
	ni = NTFS_I(vn);
	if (!ni) {
		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
		return EINVAL;
	}
	/* Directories are not accessed through logical blocks. */
	if (S_ISDIR(ni->mode)) {
		ntfs_error(ni->vol->mp, "Called for directory vnode.");
		return EINVAL;
	}
	ntfs_debug("Entering for logical block 0x%llx, mft_no 0x%llx, type "
			"0x%x, name_len 0x%x.",
			(unsigned long long)a->a_lblkno,
			(unsigned long long)ni->mft_no,
			le32_to_cpu(ni->type), (unsigned)ni->name_len);
	vol = ni->vol;
	/*
	 * For $MFT/$DATA and $MFTMirr/$DATA the logical block number is the
	 * mft record number and the block size is the mft record size which is
	 * also in @ni->block_size{,_shift}.  Everything else uses pages.
	 */
	is_mft = (ni == vol->mft_ni) || (ni == vol->mftmirr_ni);
	if (is_mft)
		shift = ni->block_size_shift;
	else
		shift = PAGE_SHIFT;
	*a->a_offset = a->a_lblkno << shift;
	ntfs_debug("Done (byte offset 0x%llx).",
			(unsigned long long)*a->a_offset);
	return 0;
}

/** * ntfs_vnop_offtoblk - map a byte offset to its logical block number * @a: arguments to offtoblk function * * @a contains: * vnode_t a_vp; vnode to which the byte offset belongs * off_t a_offset; byte offset to map * daddr64_t *a_lblkno; destination for returning the result * * Map the byte offset @a->a_offset belonging to the vnode @a->a_vp to the * corresponding logical block number, i.e. the offset in the vnode in units of * the vnode block size and return the result in @a->a_lblkno. * * Return 0 on success and EINVAL if no vnode was specified in @a->a_vp.
*/
/*
 * Implementation note for ntfs_vnop_offtoblk(): EINVAL is also returned when
 * the vnode has no ntfs inode attached or when the vnode is a directory.  The
 * block size used for the conversion is PAGE_SIZE for everything except
 * $MFT/$DATA and $MFTMirr/$DATA for which it is the mft record size.
 */
static int ntfs_vnop_offtoblk(struct vnop_offtoblk_args *a)
{
	vnode_t vn = a->a_vp;
	ntfs_inode *ni;
	ntfs_volume *vol;
	unsigned shift;
	BOOL is_mft;

	if (!vn) {
		ntfs_warning(NULL, "Called with NULL vnode.");
		return EINVAL;
	}
	ni = NTFS_I(vn);
	if (!ni) {
		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
		return EINVAL;
	}
	/* Directories are not accessed through logical blocks. */
	if (S_ISDIR(ni->mode)) {
		ntfs_error(ni->vol->mp, "Called for directory vnode.");
		return EINVAL;
	}
	ntfs_debug("Entering for byte offset 0x%llx, mft_no 0x%llx, type "
			"0x%x, name_len 0x%x.",
			(unsigned long long)a->a_offset,
			(unsigned long long)ni->mft_no,
			le32_to_cpu(ni->type), (unsigned)ni->name_len);
	vol = ni->vol;
	/*
	 * For $MFT/$DATA and $MFTMirr/$DATA the logical block number is the
	 * mft record number and the block size is the mft record size which is
	 * also in @ni->block_size{,_shift}.  Everything else uses pages.
	 */
	is_mft = (ni == vol->mft_ni) || (ni == vol->mftmirr_ni);
	if (is_mft)
		shift = ni->block_size_shift;
	else
		shift = PAGE_SHIFT;
	*a->a_lblkno = a->a_offset >> shift;
	ntfs_debug("Done (logical block 0x%llx).",
			(unsigned long long)*a->a_lblkno);
	return 0;
}

/** * ntfs_vnop_blockmap - map a file offset to its physical block number * @a: arguments to blockmap function * * @a contains: * vnode_t a_vp; vnode to which the byte offset belongs * off_t a_foffset; starting byte offset to map * size_t a_size; number of bytes to map starting at @a_foffset * daddr64_t *a_bpn; destination for starting physical block number * size_t *a_run; destination for contiguous bytes from @a_bpn * void *a_poff; physical offset into @a_bpn * int a_flags; reason for map (VNODE_READ, VNODE_WRITE, or 0) * vfs_context_t a_context; * * Map @a->a_size bytes starting at the file offset @a->a_foffset to the * corresponding physical block number and return the result in @a->a_bpn * (starting block number), @a->a_run (number of contiguous bytes starting at * @a->a_bpn), and @a->a_poff (byte offset into @a->a_bpn corresponding to the * file offset @a->a_foffset, this will be zero if @a_foffset is block
aligned * and non-zero otherwise). * * FIXME: At present the OS X kernel completely ignores @a->a_poff and in fact * it is always either NULL on entry or the returned value is ignored. Thus, * for now, if @a->a_foffset is not aligned to the physical block size, we * always return error (EINVAL) unless @a->a_foffset equals the initialized * size in the ntfs inode in which case we return a block number of -1 in * @a->a_bpn thus alignment to the block and hence @a->a_poff are not relevant. * Thus we always return 0 in @a->a_poff. * * @a->a_flags is either VNODE_READ or VNODE_WRITE but can be 0 in certain call * paths such as the system call fcntl(F_LOG2PHYS) for example. * * Note, all the return pointers (@a->a_bpn, @a->a_run, @a->a_poff) are NULL in * some code paths in xnu (one or more of them at a time), thus all of them * need to be checked for being NULL before writing to them. If @a->a_bpn is * NULL then there is nothing to do and success is returned immediately. * * For ntfs mapping to physical blocks is special because some attributes do * not have block aligned data. This is the case for all resident attributes * as well as for all non-resident attributes which are compressed or * encrypted. For all of those it would be logical to return an error however * this leads to a kernel panic in current xnu because a buf_bread() can cause * ntfs_vnop_blockmap() to be called when an uptodate page is in memory but no * buffer is in memory. This can happen under memory pressure when the buffer * has been recycled for something else but the page has not been reused yet. * In that case ntfs_vnop_blockmap() is only called to recreate the physical * mapping of the buffer and is not actually used for anything as the data is * already present in the uptodate page. Thus, instead of returning error, we * set the physical block @a->a_bpn to equal the logical block corresponding to * the byte offset @a->a_foffset and return success. 
Doing this signals to the * VFS that the physical mapping cannot be cached in the buffer and all is * well. Note this call path always has a non-zero @a->a_flags whilst other * "weird" code paths like fcntl(F_LOG2PHYS) set @a->a_flags to zero, thus we * can do the above workaround when @a->a_flags is not zero and return error * EINVAL when @a->a_flags is zero. * * In the read case and when @a->a_flags is zero, if @a->a_foffset is beyond * the end of the attribute, return error ERANGE. HFS returns ERANGE in this * case so we follow suit. Although some other OS X file systems return EFBIG * and some E2BIG instead so it does not seem to be very standardized, so maybe * we should return the IMHO more correct "invalid seek" (ESPIPE), instead. (-; * * In the write case we need to allow the mapping of blocks beyond the end of * the attribute as we will already have extended the allocated size but not * yet the data size nor the initialized size. Thus in this case we only * return ERANGE if the requested @a->a_foffset is beyond the end of the * allocated size. * * Return 0 on success and errno on error. */ static int ntfs_vnop_blockmap(struct vnop_blockmap_args *a) { const s64 byte_offset = a->a_foffset; const s64 byte_size = a->a_size; s64 max_size, data_size, init_size, clusters, bytes = 0; VCN vcn; LCN lcn; ntfs_inode *ni = NTFS_I(a->a_vp); ntfs_volume *vol; unsigned vcn_ofs; BOOL is_write = (a->a_flags & VNODE_WRITE); if (!ni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return EINVAL; } vol = ni->vol; ntfs_debug("Entering for mft_no 0x%llx, type 0x%x, name_len 0x%x, " "offset 0x%llx, size 0x%llx, for %s operation.", (unsigned long long)ni->mft_no, (unsigned)le32_to_cpu(ni->type), (unsigned)ni->name_len, (unsigned long long)byte_offset, (unsigned long long)byte_size, a->a_flags ? (is_write ? 
"write" : "read") : "unspecified"); if (S_ISDIR(ni->mode)) { ntfs_error(vol->mp, "Called for directory vnode."); return EINVAL; } if (is_write && NVolReadOnly(vol)) { ntfs_warning(vol->mp, "Called for VNODE_WRITE but mount is " "read-only."); return EROFS; } if (!a->a_bpn) { ntfs_debug("Called with a_bpn == NULL, nothing to do. " "Returning success (0)."); return 0; } /* * We cannot take the inode lock as it may be held already so we just * check the deleted bit and abort if it is set which is better than * nothing. */ if (NInoDeleted(ni)) { /* Remove the inode from the name cache. */ cache_purge(ni->vn); ntfs_debug("Inode has been deleted."); return ENOENT; } /* * Note it does not matter if we are racing with truncate because that * will be detected during the runlist lookup below. */ lck_spin_lock(&ni->size_lock); if (is_write) max_size = ni->allocated_size; else max_size = ni->data_size; data_size = ni->data_size; init_size = ni->initialized_size; lck_spin_unlock(&ni->size_lock); if (byte_offset >= max_size) { eof: ntfs_error(vol->mp, "Called for inode 0x%llx, size 0x%llx, " "byte offset 0x%llx, for %s operation, which " "is beyond the end of the inode %s size " "0x%llx. Returning error: ERANGE.", (unsigned long long)ni->mft_no, (unsigned long long)byte_size, (unsigned long long)byte_offset, a->a_flags ? (is_write ? "write" : "read") : "unspecified", is_write ? "allocated" : "data", (unsigned long long)max_size); return ERANGE; } if (byte_offset & vol->sector_size_mask && byte_offset != init_size) { ntfs_error(vol->mp, "Called for inode 0x%llx, byte offset " "0x%llx. This is not a multiple of the " "physical block size %u thus the mapping " "cannot be performed. Returning error: " "EINVAL.", (unsigned long long)ni->mft_no, (unsigned long long)byte_offset, (unsigned)vol->sector_size); return EINVAL; } /* * In the read case, if the requested byte offset is at or beyond the * initialized size simply return a hole. 
We already checked for being * at or beyond the data size so we know we are in an uninitialized * region in this case rather than at or beyond the end of the * attribute. */ if (!is_write && byte_offset >= init_size) { *a->a_bpn = -1; /* -1 means hole. */ /* * Set the size of the block to the number of uninitialized * bytes in the attribute starting at the requested byte offset * @a->a_foffset. */ bytes = data_size - byte_offset; goto done; } /* * Blockmap does not make sense for resident attributes and neither * does it make sense for non-resident, compressed or encrypted * attributes. The only special case is for directory inodes because * their flags are only defaults to be used when creating new files * rather than having any meaning for their actual data contents. */ if (!NInoNonResident(ni) || (ni->type != AT_INDEX_ALLOCATION && (NInoCompressed(ni) || NInoEncrypted(ni)) && !NInoRaw(ni))) { if (!a->a_flags) { ntfs_error(vol->mp, "Called for inode 0x%llx, which " "is resident, compressed, or " "encrypted and VNOP_BLOCKMAP() does " "not make sense for such inodes. " "Returning error: EINVAL.", (unsigned long long)ni->mft_no); return EINVAL; } *a->a_bpn = byte_offset >> PAGE_SHIFT; bytes = ni->block_size; ntfs_debug("Called for inode 0x%llx which is resident, " "compressed, or encrypted and VNOP_BLOCKMAP() " "does not make sense for such inodes. " "Returning success and setting physical == " "logical block number to signal to VFS that " "the mapping cannot be cached in the buffer.", (unsigned long long)ni->mft_no); goto done; } /* * All is ok, do the mapping. First, work out the vcn and vcn offset * corresponding to the @a->a_foffset. */ vcn = byte_offset >> vol->cluster_size_shift; vcn_ofs = (u32)byte_offset & vol->cluster_size_mask; /* * Convert the vcn to the corresponding lcn and obtain the number of * contiguous clusters starting at the vcn. */ lck_rw_lock_shared(&ni->rl.lock); lcn = ntfs_attr_vcn_to_lcn_nolock(ni, vcn, FALSE, a->a_run ? 
&clusters : 0); if (lcn < LCN_HOLE) { errno_t err; /* Error: deal with it. */ lck_rw_unlock_shared(&ni->rl.lock); switch (lcn) { case LCN_ENOENT: /* * Raced with a concurrent truncate which caused the * byte offset @a->a_foffset to become outside the * attribute size. */ goto eof; case LCN_ENOMEM: ntfs_error(vol->mp, "Not enough memory to complete " "mapping for inode 0x%llx. " "Returning error: ENOMEM.", (unsigned long long)ni->mft_no); err = ENOMEM; break; default: ntfs_error(vol->mp, "Failed to complete mapping for " "inode 0x%llx. Run chkdsk. " "Returning error: EIO.", (unsigned long long)ni->mft_no); err = EIO; break; } return err; } if (lcn < 0) { /* * It is a hole, return it. If this is a VNODE_WRITE request, * output a warning as this should never happen. Both * VNOP_WRITE() and VNOP_PAGEOUT() should have instantiated the * hole before performing the write. * * Note we could potentially fill the hole here in the write * case. However this is quite hard to do as the caller will * likely have pages around the hole locked in UBC UPLs thus we * would have difficulties zeroing the surrounding regions when * the cluster size is larger than the page size. Also a * problem is what happens if the write fails for some reason * but we have instantiated the hole here and not zeroed it * completely (because we are expecting the write to go into * the allocated clusters). We would have no way of fixing up * in this case and we would end up exposing stale data. This * all is why we choose not to fill the hole here but to do it * in advance in ntfs_vnop_write() and ntfs_vnop_pageout(). * * The only thing that will happen when we return a hole in the * write case is that when the caller is cluster_io(), it will * page out page by page and this will fill the hole in pieces * which will degrade performance. */ if (is_write) ntfs_warning(vol->mp, "Returning hole but flags " "specify VNODE_WRITE. 
This causes " "very inefficient allocation and I/O " "patterns."); /* Return the hole. */ lck_rw_unlock_shared(&ni->rl.lock); *a->a_bpn = -1; /* -1 means hole. */ if (a->a_run) { bytes = (clusters << vol->cluster_size_shift) - vcn_ofs; /* * If the run overlaps the initialized size, extend the * run length so it goes up to the data size thus * merging the hole with the uninitialized region. * * Note, do not do this in the write case as we want to * return the real clusters even beyond the initialized * size as the initialized size will only be updated * after the write has completed. */ if (!is_write && byte_offset + bytes > init_size) bytes = data_size - byte_offset; } goto done; } else lck_rw_unlock_shared(&ni->rl.lock); /* The vcn was mapped successfully to a physical lcn, return it. */ *a->a_bpn = ((lcn << vol->cluster_size_shift) + vcn_ofs) >> vol->sector_size_shift; if (a->a_run) { bytes = (clusters << vol->cluster_size_shift) - vcn_ofs; /* * In the read case, if the run overlaps the initialized size, * truncate the run length so it only goes up to the * initialized size. The caller will then be able to access * this region on disk directly and will then call us again * with a byte offset equal to the initialized size and we will * then return the entire initialized region as a hole. Thus * the caller does not need to know about the fact that NTFS * has such a thing as the initialized_size. * * We already handled the case where the byte offset is beyond * the initialized size so no need to check for that here. * * However do not do this if the initialized size is equal to * the data size. The caller is responsible for not returning * data beyond the attribute size to user space. If this is * not done the last page of an attribute read is broken into * two separate i/os, one with a read and one with a hole. * cluster_io() will zero beyond the end of attribute in any * case so it is faster to do it with a single call. 
*/ if (!is_write && byte_offset + bytes > init_size && init_size < data_size) bytes = init_size - byte_offset; } done: if (a->a_run) { if (bytes > byte_size) bytes = byte_size; *a->a_run = bytes; } if (a->a_poff) *(int*)a->a_poff = 0; ntfs_debug("Done (a_bpn 0x%llx, a_run 0x%lx, a_poff 0x%x).", (unsigned long long)*a->a_bpn, a->a_run ? (unsigned long)*a->a_run : 0, a->a_poff ? *(int*)a->a_poff : 0); return 0; } /** * ntfs_vnop_getnamedstream - find a named stream in an inode given its name * @a: arguments to getnamedstream function * * @a contains: * vnode_t a_vp; vnode containing the named stream * vnode_t *a_svpp; destination for the named stream vnode * const char *a_name; name of the named stream to get * enum nsoperation a_operation; reason for getnamedstream * int a_flags; flags describing the request * vfs_context_t a_context; * * Find the named stream with name @a->a_name in the vnode @a->a_vp and return * the vnode of the named stream in *@a->a_svpp if it was found. * * @a->a_operation specifies the reason for the lookup of the named stream. * The following operations are currently defined in OS X kernel: * NS_OPEN - Want to open the named stream for access. * NS_CREATE - Want to create the named stream so checking it does not * exist already. * NS_DELETE - Want to delete the named stream so making sure it exists. * * The flags in @a->a_flags further describe the getnamedstream request. At * present no flags are defined in OS X kernel. * * Note that at present Mac OS X only supports the "com.apple.ResourceFork" * stream so we follow suit. * * Return 0 on success and the error code on error. A return value of ENOATTR * does not signify an error as such but merely the fact that the named stream * @name is not present in the vnode @a->a_vp. 
*/
static int ntfs_vnop_getnamedstream(struct vnop_getnamedstream_args *a)
{
	vnode_t vn = a->a_vp;
	ntfs_inode *sni, *ni = NTFS_I(vn);
	const char *name = a->a_name;
	int options;
	const enum nsoperation op = a->a_operation;
	errno_t err;

	if (!ni) {
		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
		return EINVAL;
	}
	ntfs_debug("Entering for mft_no 0x%llx, stream name %s, operation %s "
			"(0x%x), flags 0x%x.",
			(unsigned long long)ni->mft_no, name,
			op == NS_OPEN ? "NS_OPEN" :
			(op == NS_CREATE ? "NS_CREATE" :
			(op == NS_DELETE ? "NS_DELETE" : "unknown")),
			op, a->a_flags);
	/*
	 * Mac OS X only supports the resource fork stream.
	 * Note that this comparison is case sensitive.
	 *
	 * We use strncmp() rather than a fixed length bcmp() because @name may
	 * be shorter than XATTR_RESOURCEFORK_NAME and a fixed length memory
	 * comparison is allowed to read beyond the NUL terminator of @name.
	 * The size passed in includes the NUL terminator of the constant so
	 * only an exact match of the whole name compares equal.
	 */
	if (strncmp(name, XATTR_RESOURCEFORK_NAME,
			sizeof(XATTR_RESOURCEFORK_NAME))) {
		ntfs_warning(ni->vol->mp, "Unsupported named stream %s "
				"specified, only the resource fork named "
				"stream (%s) is supported at present. "
				"Returning ENOATTR.", name,
				XATTR_RESOURCEFORK_NAME);
		return ENOATTR;
	}
	/* Only regular files may have a resource fork stream. */
	if (!S_ISREG(ni->mode)) {
		ntfs_warning(ni->vol->mp, "The resource fork may only be "
				"attached to regular files and mft_no 0x%llx "
				"is not a regular file. Returning EPERM.",
				(unsigned long long)ni->mft_no);
		return EPERM;
	}
	/*
	 * Attempt to get the inode for the named stream.  For the resource
	 * fork we need to return it even if it is zero size if the caller has
	 * specified @op == NS_OPEN so we set @options to zero in this case.
	 * Otherwise we want to treat a zero size resource fork as a
	 * non-existent resource fork so we set @options to XATTR_REPLACE which
	 * is the behaviour of ntfs_attr_inode_get().
	 *
	 * The inode lock is taken exclusive for NS_OPEN and shared otherwise
	 * and every exit path below has to drop it in the matching mode.
	 */
	if (op == NS_OPEN) {
		options = 0;
		lck_rw_lock_exclusive(&ni->lock);
	} else {
		options = XATTR_REPLACE;
		lck_rw_lock_shared(&ni->lock);
	}
	/* Do not allow messing with the inode once it has been deleted. */
	if (NInoDeleted(ni)) {
		/* Remove the inode from the name cache. */
		cache_purge(vn);
		if (op == NS_OPEN)
			lck_rw_unlock_exclusive(&ni->lock);
		else
			lck_rw_unlock_shared(&ni->lock);
		ntfs_debug("Mft_no 0x%llx is deleted.",
				(unsigned long long)ni->mft_no);
		return ENOENT;
	}
	/*
	 * NOTE(review): the 12 is presumably the length in characters of
	 * NTFS_SFM_RESOURCEFORK_NAME - confirm against ntfs_sfm.h.
	 *
	 * On success the stream inode is handed back with its lock held
	 * shared (LCK_RW_TYPE_SHARED) which is why we drop it with
	 * lck_rw_unlock_shared() once we have picked up the vnode.
	 */
	err = ntfs_attr_inode_get_or_create(ni, AT_DATA,
			NTFS_SFM_RESOURCEFORK_NAME, 12, FALSE, FALSE, options,
			LCK_RW_TYPE_SHARED, &sni);
	if (!err) {
		/* We have successfully opened the named stream. */
		*a->a_svpp = sni->vn;
		lck_rw_unlock_shared(&sni->lock);
		ntfs_debug("Done.");
	} else {
		if (err == ENOENT) {
			/* A missing stream is reported to the VFS as ENOATTR. */
			err = ENOATTR;
			ntfs_debug("Done (named stream %s does not exist in "
					"mft_no 0x%llx.", name,
					(unsigned long long)ni->mft_no);
		} else
			ntfs_error(ni->vol->mp, "Failed to get named stream "
					"%s, mft_no 0x%llx (error %d).", name,
					(unsigned long long)ni->mft_no, err);
	}
	if (op == NS_OPEN)
		lck_rw_unlock_exclusive(&ni->lock);
	else
		lck_rw_unlock_shared(&ni->lock);
	return err;
}

/**
 * ntfs_vnop_makenamedstream - create a named stream in an ntfs inode
 * @a:		arguments to makenamedstream function
 *
 * @a contains:
 *	vnode_t a_vp;		vnode in which to create the named stream
 *	vnode_t *a_svpp;	destination for the named stream vnode
 *	const char *a_name;	name of the named stream to create
 *	int a_flags;		flags describing the request
 *	vfs_context_t a_context;
 *
 * Create the named stream with name @a->a_name in the vnode @a->a_vp and
 * return the created vnode of the named stream in *@a->a_svpp.  If the named
 * stream already exists then it is obtained instead, i.e. if the named stream
 * already exists then ntfs_vnop_makenamedstream() does exactly the same thing
 * as ntfs_vnop_getnamedstream().
 *
 * The flags in @a->a_flags further describe the makenamedstream request.  At
 * present no flags are defined in OS X kernel.
 *
 * Note that at present Mac OS X only supports the "com.apple.ResourceFork"
 * stream so we follow suit.
 *
 * Return 0 on success and the error code on error.
*/
static int ntfs_vnop_makenamedstream(struct vnop_makenamedstream_args *a)
{
	vnode_t vn = a->a_vp;
	ntfs_inode *sni, *ni = NTFS_I(vn);
	const char *name = a->a_name;
	errno_t err;

	if (!ni) {
		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
		return EINVAL;
	}
	ntfs_debug("Entering for mft_no 0x%llx, stream name %s, flags 0x%x.",
			(unsigned long long)ni->mft_no, name, a->a_flags);
	/*
	 * Mac OS X only supports the resource fork stream.
	 * Note that this comparison is case sensitive.
	 *
	 * We use strncmp() rather than a fixed length bcmp() because @name may
	 * be shorter than XATTR_RESOURCEFORK_NAME and a fixed length memory
	 * comparison is allowed to read beyond the NUL terminator of @name.
	 * The size passed in includes the NUL terminator of the constant so
	 * only an exact match of the whole name compares equal.
	 */
	if (strncmp(name, XATTR_RESOURCEFORK_NAME,
			sizeof(XATTR_RESOURCEFORK_NAME))) {
		ntfs_warning(ni->vol->mp, "Unsupported named stream %s "
				"specified, only the resource fork named "
				"stream (%s) is supported at present. "
				"Returning ENOATTR.", name,
				XATTR_RESOURCEFORK_NAME);
		return ENOATTR;
	}
	/* Only regular files may have a resource fork stream. */
	if (!S_ISREG(ni->mode)) {
		ntfs_warning(ni->vol->mp, "The resource fork may only be "
				"attached to regular files and mft_no 0x%llx "
				"is not a regular file. Returning EPERM.",
				(unsigned long long)ni->mft_no);
		return EPERM;
	}
	/* The inode lock is held exclusive until we return. */
	lck_rw_lock_exclusive(&ni->lock);
	/* Do not allow messing with the inode once it has been deleted. */
	if (NInoDeleted(ni)) {
		/* Remove the inode from the name cache. */
		cache_purge(vn);
		lck_rw_unlock_exclusive(&ni->lock);
		ntfs_debug("Mft_no 0x%llx is deleted.",
				(unsigned long long)ni->mft_no);
		return ENOENT;
	}
	/*
	 * Attempt to create the named stream.
	 *
	 * HFS allows an existing resource fork to be opened.  We want to
	 * follow suit so we specify 0 for @options when calling
	 * ntfs_attr_inode_get_or_create().
	 *
	 * FIXME: I think this is actually wrong behaviour.  If I am right and
	 * this is one day fixed in HFS, then we can trivially fix the
	 * behaviour here by setting @options to XATTR_CREATE.
	 *
	 * NOTE(review): the 12 is presumably the length in characters of
	 * NTFS_SFM_RESOURCEFORK_NAME - confirm against ntfs_sfm.h.
	 */
	err = ntfs_attr_inode_get_or_create(ni, AT_DATA,
			NTFS_SFM_RESOURCEFORK_NAME, 12, FALSE, FALSE, 0,
			LCK_RW_TYPE_SHARED, &sni);
	if (!err) {
		/*
		 * We have successfully opened the (created) named stream.  We
		 * asked for the stream inode lock to be taken shared so that
		 * is how we drop it now that we have picked up the vnode.
		 */
		*a->a_svpp = sni->vn;
		lck_rw_unlock_shared(&sni->lock);
		ntfs_debug("Done.");
	} else {
		if (err == EEXIST)
			ntfs_debug("Named stream %s already exists in mft_no "
					"0x%llx.", name,
					(unsigned long long)ni->mft_no);
		else
			ntfs_error(ni->vol->mp, "Failed to create named "
					"stream %s in mft_no 0x%llx (error "
					"%d).", name,
					(unsigned long long)ni->mft_no, err);
	}
	lck_rw_unlock_exclusive(&ni->lock);
	return err;
}

/**
 * ntfs_vnop_removenamedstream - remove a named stream from an ntfs inode
 * @a:		arguments to removenamedstream function
 *
 * @a contains:
 *	vnode_t a_vp;		vnode from which to remove the named stream
 *	vnode_t a_svp;		vnode of named stream to remove
 *	const char *a_name;	name of the named stream to remove
 *	int a_flags;		flags describing the request
 *	vfs_context_t a_context;
 *
 * Delete the named stream described by the vnode @a->a_svp with name
 * @a->a_name from the vnode @a->a_vp.
 *
 * The flags in @a->a_flags further describe the removenamedstream request.  At
 * present no flags are defined in OS X kernel.
 *
 * Note we obey POSIX open unlink semantics thus an open named stream will
 * remain accessible for read/write/lseek purposes until the last open
 * instance is closed when the VFS will call ntfs_vnop_inactive() which will in
 * turn actually remove the named stream.
 *
 * Note that at present Mac OS X only supports the "com.apple.ResourceFork"
 * stream so we follow suit.
 *
 * Return 0 on success and the error code on error.  A return value of ENOATTR
 * does not signify an error as such but merely the fact that the named stream
 * @name is not present in the vnode @a->a_vp.
*/ static int ntfs_vnop_removenamedstream(struct vnop_removenamedstream_args *a) { vnode_t svn, vn = a->a_vp; ntfs_inode *sni, *ni = NTFS_I(vn); const char *vname, *name = a->a_name; svn = a->a_svp; sni = NTFS_I(svn); if (!ni || !sni) { ntfs_debug("Entered with NULL ntfs_inode, aborting."); return EINVAL; } vname = vnode_getname(svn); ntfs_debug("Entering for mft_no 0x%llx, stream mft_no 0x%llx, stream " "name %s, flags 0x%x, stream vnode name %s.", (unsigned long long)ni->mft_no, (unsigned long long)sni->mft_no, name, a->a_flags, vname ? vname : "not present"); if (vname) (void)vnode_putname(vname); /* * Mac OS X only supports the resource fork stream. * Note that this comparison is case sensitive. */ if (bcmp(name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME))) { ntfs_warning(ni->vol->mp, "Unsupported named stream %s " "specified, only the resource fork named " "stream (%s) is supported at present. " "Returning ENOATTR.", name, XATTR_RESOURCEFORK_NAME); return ENOATTR; } /* Only regular files may have a resource fork stream. */ if (!S_ISREG(ni->mode)) { ntfs_warning(ni->vol->mp, "The resource fork may only be " "attached to regular files and mft_no 0x%llx " "is not a regular file. Returning EPERM.", (unsigned long long)ni->mft_no); return EPERM; } lck_rw_lock_exclusive(&ni->lock); /* Do not allow messing with the inode once it has been deleted. */ if (NInoDeleted(ni)) { /* Remove the inode from the name cache. */ cache_purge(vn); lck_rw_unlock_exclusive(&ni->lock); ntfs_debug("Mft_no 0x%llx is deleted.", (unsigned long long)ni->mft_no); return ENOATTR; } lck_rw_lock_exclusive(&sni->lock); /* Do not allow messing with the stream once it has been deleted. */ if (NInoDeleted(sni)) { /* Remove the inode from the name cache. 
*/ cache_purge(svn); lck_rw_unlock_exclusive(&sni->lock); lck_rw_unlock_exclusive(&ni->lock); ntfs_debug("Stream mft_no 0x%llx, name %s is deleted.", (unsigned long long)sni->mft_no, name); return ENOATTR; } /* * The base inode of the stream inode must be the same as the parent * inode specified by the caller. */ if (!NInoAttr(sni) || sni->base_ni != ni) panic("%s(): !NInoAttr(sni) || sni->base_ni != ni\n", __FUNCTION__); /* * Unlink the named stream. The last close will cause the VFS to call * ntfs_vnop_inactive() which will do the actual removal. * * And if the named stream is already unlinked there is nothing to do. * This is what HFS does so we follow suit. */ if (sni->link_count) { sni->link_count = 0; /* * Update the last_mft_change_time (ctime) in the inode as * named stream/extended attribute semantics expect on OS X. */ ni->last_mft_change_time = ntfs_utc_current_time(); NInoSetDirtyTimes(ni); /* * If this is not a directory or it is an encrypted directory, * set the needs archiving bit except for the core system * files. */ if (!S_ISDIR(ni->mode) || NInoEncrypted(ni)) { BOOL need_set_archive_bit = TRUE; if (ni->vol->major_ver >= 2) { if (ni->mft_no <= FILE_Extend) need_set_archive_bit = FALSE; } else { if (ni->mft_no <= FILE_UpCase) need_set_archive_bit = FALSE; } if (need_set_archive_bit) { ni->file_attributes |= FILE_ATTR_ARCHIVE; NInoSetDirtyFileAttributes(ni); } } ntfs_debug("Done."); } else ntfs_debug("$DATA/%s attribute has already been unlinked from " "mft_no 0x%llx.", name, (unsigned long long)sni->mft_no); lck_rw_unlock_exclusive(&sni->lock); lck_rw_unlock_exclusive(&ni->lock); return 0; } static struct vnodeopv_entry_desc ntfs_vnodeop_entries[] = { /* * Set vn_default_error() to be our default vnop, thus any vnops we do * not specify (or specify as NULL) will be set to it and this function * just returns ENOTSUP. 
*/
	{ &vnop_default_desc, (vnop_t*)vn_default_error },
	{ &vnop_strategy_desc, (vnop_t*)ntfs_vnop_strategy },
	/*
	 * vn_bwrite() is a simple wrapper for buf_bwrite() which in turn uses
	 * VNOP_STRATEGY() and hence ntfs_vnop_strategy() to do the i/o and the
	 * latter handles all NTFS specifics thus we can simply use the generic
	 * vn_bwrite() for our VNOP_BWRITE() method.
	 */
	{ &vnop_bwrite_desc, (vnop_t*)vn_bwrite },
	{ &vnop_lookup_desc, (vnop_t*)ntfs_vnop_lookup },
	{ &vnop_create_desc, (vnop_t*)ntfs_vnop_create },
	{ &vnop_mknod_desc, (vnop_t*)ntfs_vnop_mknod },
	{ &vnop_open_desc, (vnop_t*)ntfs_vnop_open },
	{ &vnop_close_desc, (vnop_t*)ntfs_vnop_close },
	{ &vnop_access_desc, (vnop_t*)ntfs_vnop_access },
	{ &vnop_getattr_desc, (vnop_t*)ntfs_vnop_getattr },
	{ &vnop_setattr_desc, (vnop_t*)ntfs_vnop_setattr },
	{ &vnop_read_desc, (vnop_t*)ntfs_vnop_read },
	{ &vnop_write_desc, (vnop_t*)ntfs_vnop_write },
	{ &vnop_ioctl_desc, (vnop_t*)ntfs_vnop_ioctl },
	{ &vnop_select_desc, (vnop_t*)ntfs_vnop_select },
	{ &vnop_exchange_desc, (vnop_t*)ntfs_vnop_exchange },
	/* Let the VFS deal with revoking a vnode. */
	{ &vnop_revoke_desc, (vnop_t*)nop_revoke },
	{ &vnop_mmap_desc, (vnop_t*)ntfs_vnop_mmap },
	{ &vnop_mnomap_desc, (vnop_t*)ntfs_vnop_mnomap },
	{ &vnop_fsync_desc, (vnop_t*)ntfs_vnop_fsync },
	{ &vnop_remove_desc, (vnop_t*)ntfs_vnop_remove },
	{ &vnop_link_desc, (vnop_t*)ntfs_vnop_link },
	{ &vnop_rename_desc, (vnop_t*)ntfs_vnop_rename },
	{ &vnop_mkdir_desc, (vnop_t*)ntfs_vnop_mkdir },
	{ &vnop_rmdir_desc, (vnop_t*)ntfs_vnop_rmdir },
	{ &vnop_symlink_desc, (vnop_t*)ntfs_vnop_symlink },
	{ &vnop_readdir_desc, (vnop_t*)ntfs_vnop_readdir },
	{ &vnop_readdirattr_desc, (vnop_t*)ntfs_vnop_readdirattr },
	{ &vnop_readlink_desc, (vnop_t*)ntfs_vnop_readlink },
	{ &vnop_inactive_desc, (vnop_t*)ntfs_vnop_inactive },
	{ &vnop_reclaim_desc, (vnop_t*)ntfs_vnop_reclaim },
	{ &vnop_pathconf_desc, (vnop_t*)ntfs_vnop_pathconf },
	/*
	 * Let the VFS deal with advisory locking for us, so our advlock method
	 * should never get called and if it were to get called for some
	 * reason, we make sure to return error (ENOTSUP).
	 */
	{ &vnop_advlock_desc, (vnop_t*)err_advlock },
	{ &vnop_allocate_desc, (vnop_t*)ntfs_vnop_allocate },
	{ &vnop_pagein_desc, (vnop_t*)ntfs_vnop_pagein },
	{ &vnop_pageout_desc, (vnop_t*)ntfs_vnop_pageout },
	{ &vnop_searchfs_desc, (vnop_t*)ntfs_vnop_searchfs },
	/*
	 * Nothing supports copyfile in current xnu and it is not documented so
	 * we do not support it either.
	 */
	{ &vnop_copyfile_desc, (vnop_t*)err_copyfile },
	/* Extended attribute vnops. */
	{ &vnop_getxattr_desc, (vnop_t*)ntfs_vnop_getxattr },
	{ &vnop_setxattr_desc, (vnop_t*)ntfs_vnop_setxattr },
	{ &vnop_removexattr_desc, (vnop_t*)ntfs_vnop_removexattr },
	{ &vnop_listxattr_desc, (vnop_t*)ntfs_vnop_listxattr },
	/* Logical block/byte offset conversion and block mapping vnops. */
	{ &vnop_blktooff_desc, (vnop_t*)ntfs_vnop_blktooff },
	{ &vnop_offtoblk_desc, (vnop_t*)ntfs_vnop_offtoblk },
	{ &vnop_blockmap_desc, (vnop_t*)ntfs_vnop_blockmap },
	/*
	 * Named stream vnops.  Their implementations only accept the resource
	 * fork stream name (XATTR_RESOURCEFORK_NAME).
	 */
	{ &vnop_getnamedstream_desc, (vnop_t*)ntfs_vnop_getnamedstream },
	{ &vnop_makenamedstream_desc, (vnop_t*)ntfs_vnop_makenamedstream },
	{ &vnop_removenamedstream_desc, (vnop_t*)ntfs_vnop_removenamedstream },
	/*
	 * NOTE(review): the all NULL entry is presumably the end of table
	 * marker the VFS looks for when it walks this array - confirm against
	 * the xnu vnode operations vector initialization code.
	 */
	{ NULL, (vnop_t*)NULL }
};

/*
 * Descriptor pairing the address of the global ntfs vnode operations vector
 * pointer (ntfs_vnodeop_p) with the table of entries above.
 */
struct vnodeopv_desc ntfs_vnodeopv_desc = {
	&ntfs_vnodeop_p, ntfs_vnodeop_entries
};