/* * ntfs_index.c - NTFS kernel index handling. * * Copyright (c) 2006-2008 Anton Altaparmakov. All Rights Reserved. * Portions Copyright (c) 2006-2008 Apple Inc. All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * 3. Neither the name of Apple Inc. ("Apple") nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ALTERNATIVELY, provided that this notice and licensing terms are retained in * full, this file may be redistributed and/or modified under the terms of the * GNU General Public License (GPL) Version 2, in which case the provisions of * that version of the GPL will apply to you instead of the license terms * above. You can obtain a copy of the GPL Version 2 at * http://developer.apple.com/opensource/licenses/gpl-2.txt. */ #include <sys/errno.h> #include <string.h> #include <libkern/libkern.h> #include <libkern/OSMalloc.h> #include <kern/debug.h> #include <kern/locks.h> #include "ntfs.h" #include "ntfs_attr.h" #include "ntfs_attr_list.h" #include "ntfs_bitmap.h" #include "ntfs_collate.h" #include "ntfs_debug.h" #include "ntfs_dir.h" #include "ntfs_endian.h" #include "ntfs_index.h" #include "ntfs_inode.h" #include "ntfs_layout.h" #include "ntfs_lcnalloc.h" #include "ntfs_mft.h" #include "ntfs_page.h" #include "ntfs_runlist.h" #include "ntfs_types.h" #include "ntfs_unistr.h" #include "ntfs_volume.h" /** * ntfs_index_ctx_unlock - unlock an index context * @ictx: index context to unlock * * Unlock the index context @ictx. We also unmap the mft record (in index root * case) or the page (in index allocation block case) thus all pointers into * the index node need to be revalidated when the mft record or page is mapped * again in ntfs_index_ctx_relock(). * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode. * - The index context @ictx must be locked. */ void ntfs_index_ctx_unlock(ntfs_index_context *ictx) { if (!ictx) panic("%s(): !ictx\n", __FUNCTION__); if (!ictx->is_locked) panic("%s(): !ictx->is_locked\n", __FUNCTION__); if (ictx->is_root) { if (ictx->actx) { ntfs_attr_search_ctx_put(ictx->actx); if (!ictx->base_ni) panic("%s(): !ictx->base_ni\n", __FUNCTION__); ntfs_mft_record_unmap(ictx->base_ni); ictx->actx = NULL; } } else { if (ictx->upl) { ntfs_page_unmap(ictx->idx_ni, ictx->upl, ictx->pl, ictx->is_dirty); ictx->upl = NULL; ictx->is_dirty = 0; } } ictx->is_locked = 0; } /** * ntfs_index_ctx_path_unlock - unlock an entire B+tree index context path * @index_ctx: index context whose whole path to unlock * * Unlock all index contexts attached to the B+tree path to which @index_ctx * belongs. * * This function is only called in error handling to ensure nothing is held * busy so the error handling code cannot deadlock. * * Locking: Caller must hold @index_ctx->idx_ni->lock on the index inode. */ static void ntfs_index_ctx_path_unlock(ntfs_index_context *index_ctx) { ntfs_index_context *ictx; ictx = index_ctx; if (!ictx) panic("%s(): !ictx\n", __FUNCTION__); /* * Note we traverse the tree path backwards (upwards) because @up is * the first element in the index_context structure thus doing things * this way generates faster/better machine code. */ do { if (ictx->is_locked) ntfs_index_ctx_unlock(ictx); } while ((ictx = ictx->up) != index_ctx); } /** * ntfs_index_ctx_relock - relock an index context that was unlocked earlier * @ictx: index context to relock * * Relock the index context @ictx after it was unlocked with * ntfs_index_ctx_unlock(). We also remap the mft record (in index root case) * or the page (in index allocation block case) after which we revalidate all * pointers into the index node because the page may have been mapped into a * different virtual address and the mft record may have been changed with the * result that the index root attribute is moved within the mft record or even * to a completely different mft record. * * Note the check whether to revalidate or not is very simple because the index * node content cannot have changed thus all points change by a fixed offset * delta which once determined can be applied to all pointers. * * In the index root case, there is also a non-pointer index context field that * can have changed (and it does so irrespective of the index root position). * This is @ictx->bytes_free as that is dependent on the other attributes in * the mft record which can have changed legitimately under our feet which of * course is the reason why the index root can have moved about, too. * * Return 0 on success and errno on error. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode. * - The index context @ictx must not be locked. */ errno_t ntfs_index_ctx_relock(ntfs_index_context *ictx) { long delta; errno_t err; if (!ictx) panic("%s(): !ictx\n", __FUNCTION__); if (ictx->is_locked) panic("%s(): ictx->is_locked\n", __FUNCTION__); if (ictx->is_root) { MFT_RECORD *m; ntfs_attr_search_ctx *actx; if (ictx->actx) panic("%s(): ictx->actx\n", __FUNCTION__); if (!ictx->base_ni) panic("%s(): !ictx->base_ni\n", __FUNCTION__); err = ntfs_mft_record_map(ictx->base_ni, &m); if (err) { ntfs_error(ictx->idx_ni->vol->mp, "Failed to lock " "index root because " "ntfs_mft_record_map() failed (error " "%d).", err); return err; } ictx->actx = actx = ntfs_attr_search_ctx_get(ictx->base_ni, m); if (!actx) { ntfs_error(ictx->idx_ni->vol->mp, "Failed to allocate " "attribute search context."); err = ENOMEM; goto actx_err; } err = ntfs_attr_lookup(AT_INDEX_ROOT, ictx->idx_ni->name, ictx->idx_ni->name_len, 0, NULL, 0, actx); if (err) { if (err == ENOENT) panic("%s(): err == ENOENT\n", __FUNCTION__); ntfs_error(ictx->idx_ni->vol->mp, "Failed to look up " "index root attribute (error %d).", err); goto actx_err; } ictx->bytes_free = le32_to_cpu(actx->m->bytes_allocated) - le32_to_cpu(actx->m->bytes_in_use); /* Get to the index root value. */ ictx->ir = (INDEX_ROOT*)((u8*)actx->a + le16_to_cpu(actx->a->value_offset)); delta = (u8*)&ictx->ir->index - (u8*)ictx->index; ictx->index = &ictx->ir->index; } else { u8 *addr; if (ictx->upl) panic("%s(): ictx->upl\n", __FUNCTION__); if (ictx->is_dirty) panic("%s(): ictx->is_dirty\n", __FUNCTION__); err = ntfs_page_map(ictx->idx_ni, ictx->upl_ofs, &ictx->upl, &ictx->pl, &addr, TRUE); if (err) { ntfs_error(ictx->idx_ni->vol->mp, "Failed to map " "index page (error %d).", err); ictx->upl = NULL; return err; } /* Get to the index allocation block. */ delta = addr - ictx->addr; if (delta) { ictx->addr = addr; ictx->ia = (INDEX_ALLOCATION*)((u8*)ictx->ia + delta); ictx->index = &ictx->ia->index; } } if (delta) { INDEX_ENTRY **entries; unsigned u; /* * The index node has moved thus we have to update all stored * pointers so they point into the new memory location. */ ictx->entry = (INDEX_ENTRY*)((u8*)ictx->entry + delta); entries = ictx->entries; for (u = 0; u < ictx->nr_entries; u++) entries[u] = (INDEX_ENTRY*)((u8*)entries[u] + delta); } ictx->is_locked = 1; return 0; actx_err: if (ictx->actx) { ntfs_attr_search_ctx_put(ictx->actx); ictx->actx = NULL; } ntfs_mft_record_unmap(ictx->base_ni); return err; } /** * ntfs_index_ctx_get - allocate and initialize a new index context * @idx_ni: ntfs index inode with which to initialize the context * * Allocate a new index context, initialize it with @idx_ni and return it. * * Return NULL if the allocation failed. * * Locking: Caller must hold @idx_ni->lock on the index inode. */ ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni) { ntfs_index_context *ictx; ictx = ntfs_index_ctx_alloc(); if (ictx) ntfs_index_ctx_init(ictx, idx_ni); return ictx; } /** * ntfs_index_ctx_put_reuse_single - release an index context but do not free it * @ictx: index context to release * * Release the index context @ictx, releasing all associated resources but keep * the index context itself allocated so it can be reused with a call to * ntfs_index_ctx_init(). * * This function ignores the tree path which this entry may be a part of and is * only a helper function for ntfs_index_ctx_put_reuse(). * * Locking: Caller must hold @ictx->idx_ni->lock on the index inode. */ void ntfs_index_ctx_put_reuse_single(ntfs_index_context *ictx) { if (ictx->entry && ictx->is_locked) ntfs_index_ctx_unlock(ictx); if (ictx->entries) OSFree(ictx->entries, ictx->max_entries * sizeof(INDEX_ENTRY*), ntfs_malloc_tag); } /** * ntfs_index_ctx_put_reuse - release an index context but do not free it * @index_ctx: index context to release * * Release the index context @index_ctx, releasing all associated resources but * keep the index context itself allocated so it can be reused with a call to * ntfs_index_ctx_init(). * * Locking: Caller must hold @index_ctx->idx_ni->lock on the index inode. */ void ntfs_index_ctx_put_reuse(ntfs_index_context *index_ctx) { ntfs_index_context *ictx, *next; /* * Destroy all tree path components except @index_ctx itself. We need * the temporary index context pointer @next because we deallocate the * current index context in each iteration of the loop thus we would no * longer have any means of finding the next index context in the tree * path if we had not already stored a pointer to it in @next. Note we * actually traverse the tree path backwards (upwards) because @up is * the first element in the index_context structure thus doing things * this way generates faster/better machine code. */ for (ictx = index_ctx->up, next = ictx->up; ictx != index_ctx; ictx = next, next = next->up) { /* * Disconnect the current index context from the tree and * release it and all its resources. */ ntfs_index_ctx_put_single(ictx); } /* Reuse the only remaining, bottom entry. */ ntfs_index_ctx_put_reuse_single(index_ctx); } /** * ntfs_index_get_entries - get the entries for the index node into the context * @ictx: index context for which to get the entries * * Loop through the entries in the index node described by @ictx and gather all * the index entries into the @ictx->entries array which is also allocated by * this function. * * Return 0 on success and errno on error. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode. * - The index context @ictx must be locked. */ static errno_t ntfs_index_get_entries(ntfs_index_context *ictx) { u8 *index_end; INDEX_HEADER *index; INDEX_ENTRY *ie, **entries; unsigned nr_entries, max_entries; errno_t err; BOOL is_view; ntfs_debug("Entering."); if (!ictx->is_locked) panic("%s(): !ictx->is_locked\n", __FUNCTION__); is_view = FALSE; if (ictx->idx_ni->name != I30) is_view = TRUE; nr_entries = 0; max_entries = ictx->max_entries; /* Allocate memory for the index entry pointers in the index node. */ entries = OSMalloc(max_entries * sizeof(INDEX_ENTRY*), ntfs_malloc_tag); if (!entries) { ntfs_error(ictx->idx_ni->vol->mp, "Failed to allocate index " "entry pointer array."); return ENOMEM; } index = ictx->index; index_end = (u8*)index + le32_to_cpu(index->index_length); /* The first index entry. */ ie = (INDEX_ENTRY*)((u8*)index + le32_to_cpu(index->entries_offset)); /* * Loop until we exceed valid memory (corruption case) or until we * reach the last entry and for each entry place a pointer to it into * our array of entry pointers. */ for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { /* Bounds checks. */ if ((u8*)ie < (u8*)index || (u8*)ie + sizeof(INDEX_ENTRY_HEADER) > index_end || (u8*)ie + le16_to_cpu(ie->length) > index_end || (u32)sizeof(INDEX_ENTRY_HEADER) + le16_to_cpu(ie->key_length) > le16_to_cpu(ie->length)) goto err; /* Add this entry to the array of entry pointers. */ if (nr_entries >= max_entries) panic("%s(): nr_entries >= max_entries\n", __FUNCTION__); entries[nr_entries++] = ie; if (ie->flags & INDEX_ENTRY_END) break; /* Further bounds checks for view indexes. */ if (is_view && ((u32)sizeof(INDEX_ENTRY_HEADER) + le16_to_cpu(ie->key_length) > le16_to_cpu(ie->data_offset) || (u32)le16_to_cpu(ie->data_offset) + le16_to_cpu(ie->data_length) > le16_to_cpu(ie->length))) goto err; } /* Except for the index root, leaf nodes are not allowed to be empty. */ if (nr_entries < 2 && !ictx->is_root && !(index->flags & INDEX_NODE)) { ntfs_error(ictx->idx_ni->vol->mp, "Illegal empty leaf node " "found in index."); err = EIO; goto err; } ictx->entries = entries; ictx->nr_entries = nr_entries; ntfs_debug("Done."); return 0; err: OSFree(entries, max_entries * sizeof(INDEX_ENTRY*), ntfs_malloc_tag); ntfs_error(ictx->idx_ni->vol->mp, "Corrupt index in inode 0x%llx. " "Run chkdsk.", (unsigned long long)ictx->idx_ni->mft_no); NVolSetErrors(ictx->idx_ni->vol); return EIO; } /** * ntfs_index_lookup_init - prepare an index context for a lookup * @ictx: [IN] index context to prepare * @key_len: [IN] size of the key ntfs_index_lookup() is called with in bytes * * Prepare the index context @ictx for a call to ntfs_index_lookup() or * ntfs_index_lookup_by_position(). * * @key_len is the length of the key ntfs_index_lookup() will be called with. * If the index @ictx is a directory index this is ignored. * * Return 0 on success and errno on error. */ static errno_t ntfs_index_lookup_init(ntfs_index_context *ictx, const int key_len) { ntfs_inode *idx_ni; MFT_RECORD *m; ntfs_attr_search_ctx *actx; INDEX_ROOT *ir; unsigned min_entry_size, max_entries; errno_t err; ntfs_debug("Entering."); if (!ictx->base_ni) panic("%s(): !ictx->base_ni\n", __FUNCTION__); idx_ni = ictx->idx_ni; if (idx_ni->type != AT_INDEX_ALLOCATION) panic("%s(): idx_ni->type != AT_INDEX_ALLOCATION\n", __FUNCTION__); /* * Ensure the index context is still uninitialized, i.e. it is not * legal to call ntfs_index_lookup*() with a search context that has * been used already. */ if (ictx->up != ictx || ictx->max_entries) panic("%s(): Called for already used index context.\n", __FUNCTION__); if (!ntfs_is_collation_rule_supported(idx_ni->collation_rule)) { ntfs_error(idx_ni->vol->mp, "Index uses unsupported collation " "rule 0x%x. Aborting.", le32_to_cpu(idx_ni->collation_rule)); return ENOTSUP; } /* Get hold of the mft record for the index inode. */ err = ntfs_mft_record_map(ictx->base_ni, &m); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to map mft record (error " "%d.", err); return err; } actx = ntfs_attr_search_ctx_get(ictx->base_ni, m); if (!actx) { err = ENOMEM; goto err; } /* Find the index root attribute in the mft record. */ err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len, 0, NULL, 0, actx); if (err) { if (err == ENOENT) { ntfs_error(idx_ni->vol->mp, "Index root attribute " "missing in inode 0x%llx. Inode is " "corrupt. Run chkdsk.", (unsigned long long)idx_ni->mft_no); /* * This will cause the returned error to be EIO and the * volume to be marked as containing errors. */ err = 0; } goto err; } /* * The entry size is made up of the index entry header plus the index * key plus the index data plus optionally the sub-node vcn pointer. * * For most index types the index key is constant size and is simply * given by the caller supplied @key_len. * * The only collation types with variable sized keys are * COLLATION_FILENAME and COLLATION_NTOFS_SID. * * Determining the index data size is more complicated than that so we * simply ignore it. This means we waste some memory but it is not too * bad as only directory indexes appear more than once on a volume thus * only they can have more than one lookup in progress at any one time. */ min_entry_size = sizeof(INDEX_ENTRY_HEADER) + sizeof(FILENAME_ATTR); if (idx_ni->collation_rule != COLLATION_FILENAME) { if (idx_ni->collation_rule == COLLATION_NTOFS_SID) min_entry_size = sizeof(INDEX_ENTRY_HEADER) + sizeof(SID); else min_entry_size = sizeof(INDEX_ENTRY_HEADER) + key_len; } /* * Work out the absolute maximum number of entries there can be in an * index allocation block. We add one for the end entry which does not * contain a key. */ max_entries = 1 + ((idx_ni->block_size - sizeof(INDEX_BLOCK) - sizeof(INDEX_ENTRY_HEADER)) / min_entry_size); /* * Should the mft record size exceed the size of an index block (this * should never really happen) then calculate the maximum number of * entries there can be in the index root and if they are more than the * ones in the index block use that as the maximum value. */ if (idx_ni->vol->mft_record_size > idx_ni->block_size) { unsigned max_root_entries; max_root_entries = 1 + ((idx_ni->vol->mft_record_size - le16_to_cpu(m->attrs_offset) - offsetof(ATTR_RECORD, reservedR) - sizeof(((ATTR_RECORD*)NULL)->reservedR) - sizeof(INDEX_ROOT) - sizeof(INDEX_ENTRY_HEADER)) / min_entry_size); if (max_root_entries > max_entries) max_entries = max_root_entries; } ictx->max_entries = max_entries; /* * Get to the index root value (it has been verified when the inode was * read in ntfs_index_inode_read()). */ ir = (INDEX_ROOT*)((u8*)actx->a + le16_to_cpu(actx->a->value_offset)); ictx->index = &ir->index; /* * Gather the index entry pointers and finish setting up the index * context. */ ictx->is_root = 1; ictx->is_locked = 1; err = ntfs_index_get_entries(ictx); if (err) { ictx->is_locked = 0; goto err; } ictx->ir = ir; ictx->actx = actx; ictx->bytes_free = le32_to_cpu(actx->m->bytes_allocated) - le32_to_cpu(actx->m->bytes_in_use); ntfs_debug("Done."); return 0; err: if (actx) ntfs_attr_search_ctx_put(actx); ntfs_mft_record_unmap(ictx->base_ni); ntfs_error(idx_ni->vol->mp, "Failed (error %d).", err); return err; } /** * ntfs_index_descend_into_child_node - index node whose child node to return * @index_ctx: pointer to index context whose child node to return * * Descend into the child node pointed to by (*@index_ctx)->entry and return * its fully set up index context in *@index_ctx. * * Return 0 on success and errno on error. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode. * - The index context @ictx must be locked. */ static errno_t ntfs_index_descend_into_child_node( ntfs_index_context **index_ctx) { VCN vcn; s64 ofs; ntfs_index_context *ictx, *new_ictx; ntfs_inode *idx_ni; upl_t upl; upl_page_info_array_t pl; u8 *addr; INDEX_ALLOCATION *ia; errno_t err = 0; int is_dirty = 0; static const char es[] = "%s. Inode 0x%llx is corrupt. Run chkdsk."; ntfs_debug("Entering."); if (!index_ctx) panic("%s(): !index_ctx\n", __FUNCTION__); ictx = *index_ctx; if (!ictx) panic("%s(): !ictx\n", __FUNCTION__); idx_ni = ictx->idx_ni; if (!ictx->is_locked) panic("%s(): !ictx->is_locked\n", __FUNCTION__); if (!(ictx->entry->flags & INDEX_ENTRY_NODE)) panic("%s(): !(ictx->entry->flags & INDEX_ENTRY_NODE)\n", __FUNCTION__); /* * Since INDEX_NODE == LARGE_INDEX this check is ok for the index root * as well. */ if (!(ictx->index->flags & INDEX_NODE)) { ntfs_error(idx_ni->vol->mp, es, "Index entry with child node " "found in a leaf node", (unsigned long long)idx_ni->mft_no); goto err; } /* Get the starting vcn of the child index block to descend into. */ vcn = sle64_to_cpup((sle64*)((u8*)ictx->entry + le16_to_cpu(ictx->entry->length) - sizeof(VCN))); if (vcn < 0) { ntfs_error(idx_ni->vol->mp, es, "Negative child node VCN", (unsigned long long)idx_ni->mft_no); goto err; } /* Determine the offset of the page containing the child index block. */ ofs = (vcn << idx_ni->vcn_size_shift) & ~PAGE_MASK_64; /* * If the entry whose sub-node we are descending into is in the index * root, release the index root unlocking its node or we can deadlock * with ntfs_page_map(). */ if (ictx->is_root) { /* * As a sanity check verify that the index allocation attribute * exists. */ if (!NInoIndexAllocPresent(idx_ni)) { ntfs_error(idx_ni->vol->mp, "No index allocation " "attribute but index root entry " "requires one. Inode 0x%llx is " "corrupt. Run chkdsk.", (unsigned long long)idx_ni->mft_no); goto err; } ntfs_index_ctx_unlock(ictx); } else /* if (!ictx->is_root) */ { /* * If @vcn is in the same VM page as the existing page we reuse * the mapped page, otherwise we release the page so we can get * the new one. */ upl = ictx->upl; pl = ictx->pl; addr = ictx->addr; is_dirty = ictx->is_dirty; ictx->upl = NULL; ictx->is_dirty = 0; ictx->is_locked = 0; if (ofs == ictx->upl_ofs) goto have_page; ntfs_page_unmap(idx_ni, upl, pl, is_dirty); is_dirty = 0; } /* We did not reuse the old page, get the new page. */ err = ntfs_page_map(idx_ni, ofs, &upl, &pl, &addr, TRUE); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to map index page (error " "%d).", err); goto err; } have_page: /* Get to the index allocation block. */ ia = (INDEX_ALLOCATION*)(addr + ((unsigned)(vcn << idx_ni->vcn_size_shift) & PAGE_MASK)); /* Bounds checks. */ if ((u8*)ia < addr || (u8*)ia > addr + PAGE_SIZE) { ntfs_error(idx_ni->vol->mp, es, "Out of bounds check failed", (unsigned long long)idx_ni->mft_no); goto unm_err; } /* Catch multi sector transfer fixup errors. */ if (!ntfs_is_indx_record(ia->magic)) { ntfs_error(idx_ni->vol->mp, "Index record with VCN 0x%llx is " "corrupt. Inode 0x%llx is corrupt. Run " "chkdsk.", (unsigned long long)vcn, (unsigned long long)idx_ni->mft_no); goto unm_err; } if (sle64_to_cpu(ia->index_block_vcn) != vcn) { ntfs_error(idx_ni->vol->mp, "Actual VCN (0x%llx) of index " "buffer is different from expected VCN " "(0x%llx). Inode 0x%llx is corrupt. Run " "chkdsk.", (unsigned long long) sle64_to_cpu(ia->index_block_vcn), (unsigned long long)vcn, (unsigned long long)idx_ni->mft_no); goto unm_err; } if (offsetof(INDEX_BLOCK, index) + le32_to_cpu( ia->index.allocated_size) != idx_ni->block_size) { ntfs_error(idx_ni->vol->mp, "Index buffer (VCN 0x%llx) of " "inode 0x%llx has a size (%u) differing from " "the index specified size (%u). Inode is " "corrupt. Run chkdsk.", (unsigned long long)vcn, (unsigned long long)idx_ni->mft_no, (unsigned)offsetof(INDEX_BLOCK, index) + le32_to_cpu(ia->index.allocated_size), (unsigned)idx_ni->block_size); goto unm_err; } if ((u8*)ia + idx_ni->block_size > addr + PAGE_SIZE) panic("%s(): (u8*)ia + idx_ni->block_size > kaddr + " "PAGE_SIZE\n", __FUNCTION__); if ((u8*)&ia->index + le32_to_cpu(ia->index.index_length) > (u8*)ia + idx_ni->block_size) { ntfs_error(idx_ni->vol->mp, "Size of index buffer (VCN " "0x%llx) of inode 0x%llx exceeds maximum " "size.", (unsigned long long)vcn, (unsigned long long)idx_ni->mft_no); goto unm_err; } /* Allocate a new index context. */ new_ictx = ntfs_index_ctx_alloc(); if (!new_ictx) { err = ENOMEM; ntfs_error(idx_ni->vol->mp, "Failed to allocate index " "context."); goto unm_err; } /* * Attach the new index context between the current index context * (which is the bottom of the tree) and the index context below it * (which is the top of the tree), i.e. place the new index context at * the bottom of the tree. * * Gcc is broken so it fails to see both the members of the anonymous * union(s) and the bitfield inside the C99 initializer. Work around * this by setting @is_locked, @is_dirty, @ia, and @vcn afterwards. */ *new_ictx = (ntfs_index_context) { .up = ictx, .down = ictx->down, .idx_ni = idx_ni, .base_ni = ictx->base_ni, .index = &ia->index, .bytes_free = le32_to_cpu(ia->index.allocated_size) - le32_to_cpu(ia->index.index_length), .max_entries = ictx->max_entries, }; new_ictx->is_locked = 1; new_ictx->is_dirty = is_dirty; new_ictx->ia = ia; new_ictx->vcn = vcn; new_ictx->upl_ofs = ofs; new_ictx->upl = upl; new_ictx->pl = pl; new_ictx->addr = addr; ictx->down->up = new_ictx; ictx->down = new_ictx; /* * Gather the index entry pointers and finish setting up the new index * context. */ err = ntfs_index_get_entries(new_ictx); if (!err) { *index_ctx = new_ictx; ntfs_debug("Done."); return 0; } new_ictx->is_locked = 0; new_ictx->is_dirty = 0; new_ictx->upl = NULL; unm_err: ntfs_page_unmap(idx_ni, upl, pl, is_dirty); err: if (!err) { NVolSetErrors(idx_ni->vol); err = EIO; } return err; } /** * ntfs_index_lookup_in_node - search for an entry in an index node * @ictx: index context in which to search for the entry * @match_key: index entry key data to search for * @match_key_len: length of @match_key in bytes * @key: index entry key to search for * @key_len: length of @key in bytes * * Perform a binary search through the index entries in the index node * described by @ictx looking for the correct entry or if not found for the * index entry whose sub-node pointer to descend into. * * @key and @key_len is the complete key to search for. This is used when * doing the full blown collation. * * When doing exact matching for filenames we need to compare the actual * filenames rather than the filename attributes whereas for view indexes we * need to compare the whole key. To make this function simpler we let the * caller specify the appropriate data to use for exact matching in @match_key * and @match_key_len. For view indexes @match_key and @match_key_len are the * same as @key and @key_len respectively. * * Return 0 on success and errno on error. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode. * - The index context @ictx must be locked. */ static errno_t ntfs_index_lookup_in_node(ntfs_index_context *ictx, const void *match_key, const int match_key_len, const void *key, const int key_len) { ntfs_inode *idx_ni; INDEX_ENTRY *ie, **entries; unsigned min_left, max_right, cur_entry; int rc; BOOL is_view; ntfs_debug("Entering."); idx_ni = ictx->idx_ni; is_view = FALSE; if (idx_ni->name != I30) is_view = TRUE; entries = ictx->entries; /* * If there is only one entry, i.e. the index node is empty, we need to * descend into the sub-node pointer of the end entry if present thus * return the end entry. */ if (ictx->nr_entries == 1) { cur_entry = 0; goto not_found; } /* * Now do a binary search through the index entries looking for the * correct entry or if not found for the index entry whose sub-node * pointer to descend into. * * Note we exclude the end entry from the search as it does not include * a key we can compare. That makes the search algorithm simpler and * more efficient. */ min_left = 0; max_right = ictx->nr_entries - 2; cur_entry = max_right / 2; do { void *ie_match_key; int ie_match_key_len; ie = entries[cur_entry]; if (!is_view) { ie_match_key_len = ie->key.filename.filename_length << NTFSCHAR_SIZE_SHIFT; ie_match_key = &ie->key.filename.filename; } else { ie_match_key_len = le16_to_cpu(ie->key_length); ie_match_key = &ie->key; } /* * If the keys match perfectly, we setup @ictx to point to the * matching entry and return. */ if ((match_key_len == ie_match_key_len) && !bcmp(match_key, ie_match_key, ie_match_key_len)) { found: ictx->entry = ie; ictx->entry_nr = cur_entry; ictx->is_match = 1; ntfs_debug("Done (found)."); return 0; } /* * Not a perfect match, need to do full blown collation so we * know which way in the B+tree we have to go. */ rc = ntfs_collate(idx_ni->vol, idx_ni->collation_rule, key, key_len, &ie->key, le16_to_cpu(ie->key_length)); /* * If @key collates before the key of the current entry, need * to search on the left. */ if (rc == -1) { /* * If this is the left-most entry we need to descend * into it if it has a sub-node. */ if (cur_entry == min_left) break; /* Exclude the wrong half or remaining entries. */ max_right = cur_entry - 1; /* Find the middle of the remaining entries. */ cur_entry = (min_left + max_right) / 2; continue; } /* * A match should never happen as the bcmp() call should have * caught it, but we still treat it correctly. */ if (!rc) goto found; /* * @key collates after the key of the current entry, need to * search on the right. * * If this is the right-most entry we need to descend into the * end entry on the right if it has a sub-node. */ if (cur_entry == max_right) { cur_entry++; break; } /* Exclude the wrong half or remaining entries. */ min_left = cur_entry + 1; /* Find the middle of the remaining entries. */ cur_entry = (min_left + max_right) / 2; } while (1); not_found: ictx->follow_entry = entries[cur_entry]; ictx->entry_nr = cur_entry; ntfs_debug("Done (not found)."); return ENOENT; } /** * ntfs_index_lookup - find a key in an index and return its index entry * @key: [IN] key for which to search in the index * @key_len: [IN] length of @key in bytes * @index_ctx: [IN/OUT] context describing the index and the returned entry * * Before calling ntfs_index_lookup(), *@index_ctx must have been obtained * from a call to ntfs_index_ctx_get(). * * Look for the @key in the index specified by the index lookup context * @index_ctx. ntfs_index_lookup() walks the contents of the index looking for * the @key. As it does so, it records the path taken through the B+tree * together with other useful information it obtains along the way. This is * needed if the lookup is going to be followed by a complex index tree * operation such as an index entry addition requiring one or more index block * splits for example. * * If the @key is found in the index, 0 is returned and *@index_ctx is set up * to describe the index entry containing the matching @key. The matching * entry is pointed to by *@index_ctx->entry. * * If the @key is not found in the index, ENOENT is returned and *@index_ctx is * setup to describe the index entry whose key collates immediately after the * search @key, i.e. this is the position in the index at which an index entry * with a key of @key would need to be inserted. * * If an error occurs return the error code. In this case *@index_ctx is * undefined and must be freed via a call to ntfs_index_ctx_put() or * reinitialized via a call to ntfs_index_ctx_put_reuse(). * * When finished with the entry and its data, call ntfs_index_ctx_put() to free * the context and other associated resources. * * If the index entry was modified, call ntfs_index_entry_mark_dirty() before * the call to ntfs_index_ctx_put() to ensure that the changes are written to * disk. * * Locking: - Caller must hold @index_ctx->idx_ni->lock on the index inode. * - Caller must hold an iocount reference on the index inode. * * TODO: An optimization would be to take two new parameters, say @add and * @del, which allow our caller to tell us if the search is for purposes of * adding an entry (@add is true) or for removing an entry (@del is true) or if * it is a simple lookup (read-only or overwrite without change in length, both * @add and @del are false). For the lookup case we do not need to record the * path so we can just use one single index context and only one array of index * entry pointers and we keep reusing both instead of getting new ones each * time we descend into a sub-node. Alternatively take a single parameter * @reason or @intent or something and define some constants like NTFS_LOOKUP, * NTFS_ADD, and NTFS_DEL or something to go with it to serve the same purpose * as above. */ errno_t ntfs_index_lookup(const void *key, const int key_len, ntfs_index_context **index_ctx) { ntfs_index_context *ictx; const void *match_key; int match_key_len; errno_t err; ntfs_debug("Entering."); if (!key) panic("%s(): !key\n", __FUNCTION__); if (key_len <= 0) panic("%s(): key_len <= 0\n", __FUNCTION__); ictx = *index_ctx; /* * When doing exact matching for filenames we need to compare the * actual filenames rather than the filename attributes whereas for * view indexes we need to compare the whole key. */ if (ictx->idx_ni->name == I30) { match_key_len = ((FILENAME_ATTR*)key)->filename_length << NTFSCHAR_SIZE_SHIFT; match_key = &((FILENAME_ATTR*)key)->filename; } else { match_key_len = key_len; match_key = key; } /* Prepare the search context for its first lookup. */ err = ntfs_index_lookup_init(ictx, key_len); if (err) goto err; do { /* * Look for the @key in the current index node. If found, the * index context points to the found entry so we are done. If * not found, the index context points to the entry whose * sub-node pointer we need to descend into if it is present * and if not we have failed to find the entry and are also * done. */ err = ntfs_index_lookup_in_node(ictx, match_key, match_key_len, key, key_len); if (err && err != ENOENT) panic("%s(): err && err != ENOENT\n", __FUNCTION__); if (!err || !(ictx->entry->flags & INDEX_ENTRY_NODE)) break; /* Not found but child node present, descend into it. */ err = ntfs_index_descend_into_child_node(&ictx); if (err) goto err; /* * Replace the caller's index context with the new one so the * caller always gets the bottom-most index context. */ *index_ctx = ictx; } while (1); ntfs_debug("Done (%s in index %s).", !err ? "found matching entry" : "entry not found", ictx->is_root ? "root" : "allocation block"); return err; err: ntfs_error(ictx->idx_ni->vol->mp, "Failed (error %d).", err); return err; } /** * ntfs_index_lookup_by_position - find an entry by its position in the B+tree * @pos: [IN] position of index entry to find in the B+tree * @key_len: [IN] size of the key ntfs_index_lookup() is called with in bytes * @index_ctx: [IN/OUT] context describing the index and the returned entry * * Before calling ntfs_index_lookup_by_position(), *@index_ctx must have been * obtained from a call to ntfs_index_ctx_get(). * * Start searching at the beginning of the B+tree, counting each of the entries * and when the entry with position number @pos has been reached return that in * *@index_ctx. * * @key_len is the length of the key ntfs_index_lookup() would be called with. * If the index *@index_ictx is a directory index this is ignored. * * As the search progresses, the path taken through the B+tree is recorded * together with other useful information obtained along the way. This is * needed so the lookup can proceed to the next entry if/when desired, for * example during a ntfs_readdir() call. * * If the position @pos is found in the index, 0 is returned and *@index_ctx is * set up to describe the index entry containing at position @pos. The * matching entry is pointed to by *@index_ctx->entry. * * If the position @pos is not found in the index, ENOENT is returned. * * If an error occurs the error code is returned (cannot be ENOENT). * * If any values other than 0 are returned, *@index_ctx is undefined and must * be freed via a call to ntfs_index_ctx_put() or reinitialized via a call to * ntfs_index_ctx_put_reuse(). * * When finished with the entry and its data, call ntfs_index_ctx_put() to free * the context and other associated resources. * * If the index entry was modified, call ntfs_index_entry_mark_dirty() before * the call to ntfs_index_ctx_put() to ensure that the changes are written to * disk. * * Locking: - Caller must hold @index_ctx->idx_ni->lock on the index inode. * - Caller must hold an iocount reference on the index inode. */ errno_t ntfs_index_lookup_by_position(const s64 pos, const int key_len, ntfs_index_context **index_ctx) { s64 current_pos; ntfs_index_context *ictx; errno_t err; ntfs_debug("Entering for position 0x%llx.", (unsigned long long)pos); if (pos < 0) panic("%s(): pos < 0\n", __FUNCTION__); ictx = *index_ctx; /* Prepare the search context for its first lookup. */ err = ntfs_index_lookup_init(ictx, key_len); if (err) goto err; /* Start at the first index entry in the index root. */ ictx->entry = ictx->entries[0]; ictx->entry_nr = 0; current_pos = 0; /* * Iterate over the entries in the B+tree counting them up until we * reach entry position @pos in which case we are done. */ do { /* * Keep descending into the sub-node pointers until a leaf node * is found. */ while (ictx->entry->flags & INDEX_ENTRY_NODE) { /* Child node present, descend into it. */ err = ntfs_index_descend_into_child_node(&ictx); if (err) goto err; /* Start at the first index entry in the index node. */ ictx->entry = ictx->entries[0]; ictx->entry_nr = 0; } /* * We have reached the next leaf node. If @pos is in this node * skip to the correct entry and we are done. * * Note we need the -1 because @ictx->nr_entries counts the end * entry which does not contain any data thus we do not count * it as a real entry. */ if (current_pos + ictx->nr_entries - 1 > pos) { /* * No need to skip any entries if the current entry is * the one matching @pos. */ if (current_pos < pos) { s64 nr; nr = ictx->entry_nr + (pos - current_pos); if (nr >= ictx->nr_entries - 1) panic("%s(): nr >= ictx->nr_entries - " "1\n", __FUNCTION__); ictx->entry_nr = nr; ictx->entry = ictx->entries[nr]; current_pos = pos; } else if (current_pos > pos) panic("%s(): current_pos > pos\n", __FUNCTION__); break; } /* * @pos is not in this node. Skip the whole node, i.e. skip * @ictx->nr_entries - 1 real index entries. */ current_pos += ictx->nr_entries - 1; /* * Keep moving up the B+tree until we find an index node that * we have not finished yet. */ do { ntfs_index_context *itmp; /* If we are in the index root, we are done. */ if (ictx->is_root) goto not_found; /* Save the current index context so we can free it. */ itmp = ictx; /* Move up to the parent node. */ ictx = ictx->up; /* * Disconnect the old index context from its path and * free it. */ ntfs_index_ctx_disconnect(itmp); ntfs_index_ctx_put_reuse_single(itmp); ntfs_index_ctx_free(itmp); } while (ictx->entry_nr == ictx->nr_entries - 1); /* * We have reached a node which we have not finished with yet. * Lock it so we can work on it. */ err = ntfs_index_ctx_relock(ictx); if (err) goto err; /* * Check if the current entry is the entry matching @pos and if * so we are done. */ if (current_pos == pos) break; /* * Move to the next entry in this node and continue descending * into the sub-node pointers until a leaf node is found. The * first entry in the leaf node will be the next entry thus * increment @current_pos now. */ ictx->entry = ictx->entries[++ictx->entry_nr]; current_pos++; } while (1); /* * Indicate that the current entry is the one the caller was looking * for and return the index context containing it to the caller. */ ictx->is_match = 1; *index_ctx = ictx; ntfs_debug("Done (entry with position 0x%llx found in index %s).", (unsigned long long)pos, ictx->is_root ? "root" : "allocation block"); return 0; not_found: ntfs_debug("Done (entry with position 0x%llx not found in index, " "returning ENOENT).", (unsigned long long)pos); /* Ensure we return a valid index context. */ *index_ctx = ictx; return ENOENT; err: ntfs_error(ictx->idx_ni->vol->mp, "Failed (error %d).", err); /* Ensure we return a valid index context. */ *index_ctx = ictx; return err; } /** * ntfs_index_lookup_next - find the next entry in the B+tree * @index_ctx: [IN/OUT] context describing the index and the returned entry * * Before calling ntfs_index_lookup_next(), *@index_ctx must have been obtained * from a call to ntfs_index_lookup() or ntfs_index_lookup_by_position(). * * If the next entry is found in the index, 0 is returned and *@index_ctx is * set up to describe the next index entry. The matching entry is pointed to * by *@index_ctx->entry. * * If the position @pos is not found in the index, ENOENT is returned. * * If an error occurs the error code is returned (cannot be ENOENT). * * If any values other than 0 are returned, *@index_ctx is undefined and must * be freed via a call to ntfs_index_ctx_put() or reinitialized via a call to * ntfs_index_ctx_put_reuse(). * * When finished with the entry and its data, call ntfs_index_ctx_put() to free * the context and other associated resources. * * If the index entry was modified, call ntfs_index_entry_mark_dirty() before * the call to ntfs_index_ctx_put() to ensure that the changes are written to * disk. * * Locking: - Caller must hold @index_ctx->idx_ni->lock on the index inode. * - Caller must hold an iocount reference on the index inode. */ errno_t ntfs_index_lookup_next(ntfs_index_context **index_ctx) { ntfs_index_context *ictx; errno_t err; ntfs_debug("Entering."); ictx = *index_ctx; if (!ictx->base_ni) panic("%s(): !ictx->base_ni\n", __FUNCTION__); /* * Ensure the index context is initialized, i.e. it is not legal to * call ntfs_index_lookup_next() with a search context that has not * been used for a call to ntfs_index_lookup_by_position() or * ntfs_index_lookup() yet. */ if (!ictx->max_entries) panic("%s(): Called for uninitialized index context.\n", __FUNCTION__); /* * @ictx must currently point to real entry thus @ictx->nr_entries must * be greater or equal to 2 as it includes the end entry which cannot * contain any data. */ if (ictx->nr_entries < 2) panic("%s(): ictx->nr_entries < 2\n", __FUNCTION__); if (!(ictx->entry->flags & INDEX_ENTRY_NODE)) { /* * The current entry is in a leaf node. * * If it is not the last entry in the node return the next * entry in the node. */ if (ictx->entry_nr < ictx->nr_entries - 2) { ictx->entry = ictx->entries[++ictx->entry_nr]; ntfs_debug("Done (same leaf node)."); return 0; } /* * The current entry is in a leaf node and it is the last entry * in the node. The next entry is the first real entry above * the current node thus keep moving up the B+tree until we * find a real entry. */ do { ntfs_index_context *itmp; /* If we are in the index root, we are done. */ if (ictx->is_root) { ntfs_debug("Done (was at last entry already, " "returning ENOENT)."); /* Ensure we return a valid index context. */ *index_ctx = ictx; return ENOENT; } /* Save the current index context so we can free it. */ itmp = ictx; /* Move up to the parent node. */ ictx = ictx->up; /* * Disconnect the old index context from its path and * free it. */ ntfs_index_ctx_disconnect(itmp); ntfs_index_ctx_put_reuse_single(itmp); ntfs_index_ctx_free(itmp); } while (ictx->entry_nr == ictx->nr_entries - 1); /* * We have reached a node with a real index entry. Lock it so * we can work on it. */ err = ntfs_index_ctx_relock(ictx); if (err) goto err; /* * Indicate that the current entry is the next entry and return * the index context containing it to the caller. */ ictx->is_match = 1; *index_ctx = ictx; ntfs_debug("Done (upper index node)."); return 0; } /* * The current entry is in an index node. * * To find the next entry we need to switch to the next entry in this * node and then descend into the sub-node pointers until a leaf node * is found. The first entry of that leaf node is the next entry. * * First, unmark this node as being the matching one and switch to the * next entry in this node. */ ictx->is_match = 0; ictx->entry = ictx->entries[++ictx->entry_nr]; /* * Keep descending into the sub-node pointers until a leaf node is * found. */ while (ictx->entry->flags & INDEX_ENTRY_NODE) { /* Child node present, descend into it. */ err = ntfs_index_descend_into_child_node(&ictx); if (err) goto err; /* Start at the first index entry in the index node. */ ictx->entry = ictx->entries[0]; ictx->entry_nr = 0; } /* * We have reached the next leaf node. The next entry is the first * entry in this node which we have already set up to be the current * entry. * * Indicate that the current entry is the one the caller was looking * for and return the index context containing it to the caller. */ ictx->is_match = 1; *index_ctx = ictx; ntfs_debug("Done (next leaf node)."); return 0; err: ntfs_error(ictx->idx_ni->vol->mp, "Failed (error %d).", err); /* Ensure we return a valid index context. */ *index_ctx = ictx; return err; } /** * ntfs_index_entry_mark_dirty - mark an index entry dirty * @ictx: ntfs index context describing the index entry * * Mark the index entry described by the index entry context @ictx dirty. * * If the index entry is in the index root attribute, simply mark the mft * record containing the index root attribute dirty. This ensures the mft * record, and hence the index root attribute, will be written out to disk * later. * * If the index entry is in an index block belonging to the index allocation * attribute, mark the page the index block is in dirty. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for * writing. * - The index context @ictx must be locked. */ void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx) { if (!ictx->is_locked) panic("%s(): Index context is not locked.\n", __FUNCTION__); if (ictx->is_root) { if (!ictx->actx) panic("%s(): Attribute context is missing from index " "context.\n", __FUNCTION__); NInoSetMrecNeedsDirtying(ictx->actx->ni); } else { if (!ictx->upl) panic("%s(): Page list is missing from index " "context.\n", __FUNCTION__); ictx->is_dirty = 1; } } /** * ntfs_insert_index_allocation_attribute - add the index allocation attribute * @ni: base ntfs inode to which the attribute is being added * @ctx: search context describing where to insert the attribute * @idx_ni: index inode for which to add the index allocation attribute * * Insert a new index allocation attribute in the base ntfs inode @ni at the * position indicated by the attribute search context @ctx and add an attribute * list attribute entry for it if the inode uses the attribute list attribute. * * The new attribute type and name are given by the index inode @idx_ni. * * If @idx_ni contains a runlist, it is used to create the mapping pairs array * for the non-resident index allocation attribute. * * Return 0 on success and errno on error. * * WARNING: Regardless of whether success or failure is returned, you need to * check @ctx->is_error and if true the @ctx is no longer valid, i.e. * you need to either call ntfs_attr_search_ctx_reinit() or * ntfs_attr_search_ctx_put() on it. In that case @ctx->error will * give the error code for why the mapping of the inode failed. * * Locking: Caller must hold @idx_ni->lock on the index inode for writing. */ static errno_t ntfs_insert_index_allocation_attribute(ntfs_inode *ni, ntfs_attr_search_ctx *ctx, ntfs_inode *idx_ni) { ntfs_volume *vol; MFT_RECORD *base_m, *m; ATTR_RECORD *a; ATTR_LIST_ENTRY *al_entry; unsigned mp_ofs, mp_size, al_entry_used, al_entry_len; unsigned new_al_size, new_al_alloc; errno_t err; BOOL al_entry_added; ntfs_debug("Entering for mft_no 0x%llx, name_len 0x%x.", (unsigned long long)ni->mft_no, idx_ni->name_len); vol = idx_ni->vol; /* * Calculate the offset into the new attribute at which the mapping * pairs array begins. The mapping pairs array is placed after the * name aligned to an 8-byte boundary which in turn is placed * immediately after the non-resident attribute record itself. */ mp_ofs = offsetof(ATTR_RECORD, compressed_size) + (((idx_ni->name_len << NTFSCHAR_SIZE_SHIFT) + 7) & ~7); /* Work out the size for the mapping pairs array. */ err = ntfs_get_size_for_mapping_pairs(vol, idx_ni->rl.elements ? idx_ni->rl.rl : NULL, 0, -1, &mp_size); if (err) panic("%s(): err\n", __FUNCTION__); /* * Work out the size for the attribute record. We simply take the * offset to the mapping pairs array we worked out above and add the * size of the mapping pairs array in bytes aligned to an 8-byte * boundary. Note we do not need to do the alignment as * ntfs_attr_record_make_space() does it anyway. * * The current implementation of ntfs_attr_lookup() will always return * pointing into the base mft record when an attribute is not found. */ base_m = ctx->m; retry: if (ni != ctx->ni) panic("%s(): ni != ctx->ni\n", __FUNCTION__); m = ctx->m; a = ctx->a; err = ntfs_attr_record_make_space(m, a, mp_ofs + mp_size); if (err) { ntfs_inode *eni; if (err != ENOSPC) panic("%s(): err != ENOSPC\n", __FUNCTION__); /* * There was not enough space in the mft record to insert the * new attribute record which means we will need to insert it * into an extent mft record. * * To avoid bugs and impossible situations, check that the * attribute is not already the only attribute in the mft * record otherwise moving it would not give us anything. */ if (ntfs_attr_record_is_only_one(m, a)) panic("%s(): ntfs_attr_record_is_only_one()\n", __FUNCTION__); /* * Before we can allocate an extent mft record, we need to * ensure that the inode has an attribute list attribute. */ if (!NInoAttrList(ni)) { err = ntfs_attr_list_add(ni, m, NULL); if (err) { ntfs_error(vol->mp, "Failed to add attribute " "list attribute to mft_no " "0x%llx (error %d).", (unsigned long long)ni->mft_no, err); return err; } /* * Adding the attribute list attribute may have * generated enough space in the base mft record to * fit the attribute so try again. */ ntfs_attr_search_ctx_reinit(ctx); err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, idx_ni->name, idx_ni->name_len, 0, NULL, 0, ctx); if (err == ENOENT) { /* * The current implementation of * ntfs_attr_lookup() will always return * pointing into the base mft record when an * attribute is not found. */ if (m != ctx->m) panic("%s(): m != ctx->m\n", __FUNCTION__); goto retry; } /* * We cannot have found the attribute as we have * exclusive access and know that it does not exist * already. */ if (!err) panic("%s() !err\n", __FUNCTION__); /* * Something has gone wrong. Note we have to bail out * as a failing attribute lookup indicates corruption * and/or disk failure and/or not enough memory all of * which would prevent us from rolling back the * attribute list attribute addition. */ ntfs_error(vol->mp, "Failed to add index allocation " "attribute to mft_no 0x%llx because " "looking up the attribute failed " "(error %d).", (unsigned long long)ni->mft_no, err); return err; } /* * We now need to allocate a new extent mft record, attach it * to the base ntfs inode and set up the search context to * point to it, then insert the new attribute into it. */ err = ntfs_mft_record_alloc(vol, NULL, NULL, ni, &eni, &m, &a); if (err) { ntfs_error(vol->mp, "Failed to add index allocation " "attribute to mft_no 0x%llx because " "allocating a new extent mft record " "failed (error %d).", (unsigned long long)ni->mft_no, err); /* * TODO: If we added the attribute list attribute above * we could now remove it again but this may require * moving attributes back into the base mft record so * is not a trivial amount of work and in the end it * does not really matter if we leave an inode with an * attribute list attribute that does not really need * it. Especially so since this is error handling and * it is not an error to have an attribute list * attribute when not strictly required. */ return err; } ctx->ni = eni; ctx->m = m; ctx->a = a; /* * Make space for the new attribute. This cannot fail as we * now have an empty mft record which by definition can hold * a maximum size resident attribute record. */ err = ntfs_attr_record_make_space(m, a, mp_ofs + mp_size); if (err) panic("%s(): err (ntfs_attr_record_make_space())\n", __FUNCTION__); } /* * Now setup the new attribute record. The entire attribute has been * zeroed and the length of the attribute record has been set up by * ntfs_attr_record_make_space(). */ a->type = AT_INDEX_ALLOCATION; a->non_resident = 1; a->name_length = idx_ni->name_len; a->name_offset = const_cpu_to_le16(offsetof(ATTR_RECORD, compressed_size)); a->instance = m->next_attr_instance; /* * Increment the next attribute instance number in the mft record as we * consumed the old one. */ m->next_attr_instance = cpu_to_le16( (le16_to_cpu(m->next_attr_instance) + 1) & 0xffff); a->highest_vcn = cpu_to_sle64((idx_ni->allocated_size >> vol->cluster_size_shift) - 1); a->mapping_pairs_offset = cpu_to_le16(mp_ofs); lck_spin_lock(&idx_ni->size_lock); a->allocated_size = cpu_to_sle64(idx_ni->allocated_size); a->data_size = cpu_to_sle64(idx_ni->data_size); a->initialized_size = cpu_to_sle64(idx_ni->initialized_size); lck_spin_unlock(&idx_ni->size_lock); /* Copy the attribute name into place. */ if (idx_ni->name_len) memcpy((u8*)a + offsetof(ATTR_RECORD, compressed_size), idx_ni->name, idx_ni->name_len << NTFSCHAR_SIZE_SHIFT); /* Generate the mapping pairs array into place. */ err = ntfs_mapping_pairs_build(vol, (s8*)a + mp_ofs, mp_size, idx_ni->rl.elements ? idx_ni->rl.rl : NULL, 0, -1, NULL); if (err) panic("%s(): err (ntfs_mapping_pairs_build())\n", __FUNCTION__); /* * If the inode does not use the attribute list attribute we are done. * * If the inode uses the attribute list attribute (including the case * where we just created it), we need to add an attribute list * attribute entry for the attribute. */ if (!NInoAttrList(ni)) goto done; /* Add an attribute list attribute entry for the inserted attribute. */ al_entry = ctx->al_entry; al_entry_used = offsetof(ATTR_LIST_ENTRY, name) + (idx_ni->name_len << NTFSCHAR_SIZE_SHIFT); al_entry_len = (al_entry_used + 7) & ~7; new_al_size = ni->attr_list_size + al_entry_len; /* Out of bounds checks. */ if ((u8*)al_entry < ni->attr_list || (u8*)al_entry > ni->attr_list + new_al_size || (u8*)al_entry + al_entry_len > ni->attr_list + new_al_size) { /* Inode is corrupt. */ ntfs_error(vol->mp, "Mft_no 0x%llx is corrupt. Run chkdsk.", (unsigned long long)ni->mft_no); err = EIO; goto undo; } err = ntfs_attr_size_bounds_check(vol, AT_ATTRIBUTE_LIST, new_al_size); if (err) { if (err == ERANGE) { ntfs_error(vol->mp, "Cannot insert attribute into " "mft_no 0x%llx because the attribute " "list attribute would become too " "large. You need to defragment your " "volume and then try again.", (unsigned long long)ni->mft_no); err = ENOSPC; } else { ntfs_error(vol->mp, "Attribute list attribute is " "unknown on the volume. The volume " "is corrupt. Run chkdsk."); NVolSetErrors(vol); err = EIO; } goto undo; } new_al_alloc = (new_al_size + NTFS_ALLOC_BLOCK - 1) & ~(NTFS_ALLOC_BLOCK - 1); /* * Reallocate the memory buffer if needed and create space for the new * entry. */ if (new_al_alloc > ni->attr_list_alloc) { u8 *tmp, *al, *al_end; unsigned al_entry_ofs; tmp = OSMalloc(new_al_alloc, ntfs_malloc_tag); if (!tmp) { ntfs_error(vol->mp, "Not enough memory to extend " "attribute list attribute of mft_no " "0x%llx.", (unsigned long long)ni->mft_no); err = ENOMEM; goto undo; } al = ni->attr_list; al_entry_ofs = (u8*)al_entry - al; al_end = al + ni->attr_list_size; memcpy(tmp, al, al_entry_ofs); if ((u8*)al_entry < al_end) memcpy(tmp + al_entry_ofs + al_entry_len, al + al_entry_ofs, ni->attr_list_size - al_entry_ofs); al_entry = ctx->al_entry = (ATTR_LIST_ENTRY*)(tmp + al_entry_ofs); OSFree(ni->attr_list, ni->attr_list_alloc, ntfs_malloc_tag); ni->attr_list_alloc = new_al_alloc; ni->attr_list = tmp; } else if ((u8*)al_entry < ni->attr_list + ni->attr_list_size) memmove((u8*)al_entry + al_entry_len, al_entry, ni->attr_list_size - ((u8*)al_entry - ni->attr_list)); ni->attr_list_size = new_al_size; /* Set up the attribute list entry. */ al_entry->type = AT_INDEX_ALLOCATION; al_entry->length = cpu_to_le16(al_entry_len); al_entry->name_length = idx_ni->name_len; al_entry->name_offset = offsetof(ATTR_LIST_ENTRY, name); al_entry->lowest_vcn = 0; al_entry->mft_reference = MK_LE_MREF(ctx->ni->mft_no, ctx->ni->seq_no); al_entry->instance = a->instance; /* Copy the attribute name into place. */ if (idx_ni->name_len) memcpy((u8*)&al_entry->name, idx_ni->name, idx_ni->name_len << NTFSCHAR_SIZE_SHIFT); /* For tidyness, zero any unused space. */ if (al_entry_len != al_entry_used) { if (al_entry_len < al_entry_used) panic("%s(): al_entry_len < al_entry_used\n", __FUNCTION__); memset((u8*)al_entry + al_entry_used, 0, al_entry_len - al_entry_used); } /* * Extend the attribute list attribute and copy in the modified * value from the cache. */ err = ntfs_attr_list_sync_extend(ni, base_m, (u8*)al_entry - ni->attr_list, ctx); if (err) { ntfs_error(vol->mp, "Failed to extend attribute list " "attribute of mft_no 0x%llx (error %d).", (unsigned long long)ni->mft_no, err); al_entry_added = TRUE; goto undo_al; } done: ntfs_debug("Done."); return 0; undo: al_entry_added = FALSE; undo_al: /* * Need to remove the attribute again or free the extent mft record if * there are no attributes remaining in it. */ if (m == base_m || !ntfs_attr_record_is_only_one(m, a)) { ntfs_attr_record_delete_internal(m, a); /* * If the attribute was not in the base mft record mark the * extent mft record dirty so it gets written out later. If * the attribute was in the base mft record it will be marked * dirty later. * * We also unmap the extent mft record and we set @ctx->ni to * equal the base inode @ni so that the search context is * initialized from scratch or simply freed if the caller * reinitializes or releases the search context respectively. */ if (m != base_m) { NInoSetMrecNeedsDirtying(ctx->ni); ntfs_extent_mft_record_unmap(ctx->ni); ctx->ni = ni; } } else { errno_t err2; BOOL al_needed; err2 = ntfs_extent_mft_record_free(ni, ctx->ni, m); if (err2) { /* * Ignore the error as we just end up with an unused * mft record that is marked in use. */ ntfs_error(vol->mp, "Failed to free extent mft_no " "0x%llx (error %d). Unmount and run " "chkdsk to recover the lost inode.", (unsigned long long)ctx->ni->mft_no, err2); NVolSetErrors(vol); /* * Relese the extent mft record after dirtying it thus * simulating the effect of freeing it. */ NInoSetMrecNeedsDirtying(ctx->ni); ntfs_extent_mft_record_unmap(ctx->ni); } /* * The attribute search context still points to the no longer * mapped extent inode thus we need to change it to point to * the base inode instead so the context can be reinitialized * or released safely. */ ctx->ni = ni; /* * Check the attribute list attribute. If there are no other * attribute list attribute entries referencing extent mft * records delete the attribute list attribute altogether. * * If this fails it does not matter as we simply retain the * attribute list attribute so we ignore the error and go on to * delete the attribute list attribute entry instead. * * If there are other attribute list attribute entries * referencing extent mft records we still need the attribute * list attribute thus we go on to delete the attribute list * entry corresponding to the attribute record we just deleted * by freeing its extent mft record. */ err2 = ntfs_attr_list_is_needed(ni, al_entry_added ? al_entry : NULL, &al_needed); if (err2) ntfs_warning(vol->mp, "Failed to determine if " "attribute list attribute of mft_no " "0x%llx if still needed (error %d). " "Assuming it is still needed and " "continuing.", (unsigned long long)ni->mft_no, err2); else if (!al_needed) { /* * No more extent mft records are in use. Delete the * attribute list attribute. */ ntfs_attr_search_ctx_reinit(ctx); err2 = ntfs_attr_list_delete(ni, ctx); if (!err2) { /* * We deleted the attribute list attribute and * this will have updated the base inode * appropriately thus we have restored * everything as it was before. */ return err; } ntfs_warning(vol->mp, "Failed to delete attribute " "list attribute of mft_no 0x%llx " "(error %d). Continuing using " "alternative error recovery method.", (unsigned long long)ni->mft_no, err2); } } /* * Both @ctx and @ni are now invalid and cannot be used any more which * is fine as we have finished dealing with the attribute record. * * We now need to delete the corresponding attribute list attribute * entry if we created it. * * Then we need to rewrite the attribute list attribute again because * ntfs_attr_list_sync_extend() may have left it in an indeterminate * state. */ if (al_entry_added) { errno_t err2; ntfs_attr_list_entry_delete(ni, al_entry); ntfs_attr_search_ctx_reinit(ctx); err2 = ntfs_attr_list_sync_shrink(ni, 0, ctx); if (err2) { ntfs_error(vol->mp, "Failed to restore attribute list " "attribute in base mft_no 0x%llx " "(error %d). Leaving inconsistent " "metadata. Unmount and run chkdsk.", (unsigned long long)ni->mft_no, err2); NVolSetErrors(vol); } } /* Make sure any changes are written out. */ NInoSetMrecNeedsDirtying(ni); return err; } /** * ntfs_index_block_lay_out - lay out an index block into a memory buffer * @idx_ni: index inode to which the index block will belong * @vcn: vcn of index block * @ia: destination buffer of size >= @idx_ni->block_size bytes * * Lay out an empty index allocation block with the @vcn into the buffer @ia. * The index inode @idx_ni is needed because we need to know the size of an * index block and the @vcn is needed because we need to record it in the index * block. * * Locking: Caller must hold @idx_ni->lock on the index inode for writing. */ static void ntfs_index_block_lay_out(ntfs_inode *idx_ni, VCN vcn, INDEX_BLOCK *ia) { INDEX_HEADER *ih; INDEX_ENTRY *ie; u32 ie_ofs; bzero(ia, idx_ni->block_size); ia->magic = magic_INDX; ia->usa_ofs = const_cpu_to_le16(sizeof(INDEX_BLOCK)); ia->usa_count = cpu_to_le16(1 + (idx_ni->block_size / NTFS_BLOCK_SIZE)); /* Set the update sequence number to 1. */ *(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs)) = cpu_to_le16(1); ia->index_block_vcn = cpu_to_sle64(vcn); ih = &ia->index; ie_ofs = (sizeof(INDEX_HEADER) + (le16_to_cpu(ia->usa_count) << 1) + 7) & ~7; ih->entries_offset = cpu_to_le32(ie_ofs); ih->index_length = cpu_to_le32(ie_ofs + sizeof(INDEX_ENTRY_HEADER)); ih->allocated_size = cpu_to_le32(idx_ni->block_size - offsetof(INDEX_BLOCK, index)); ih->flags = LEAF_NODE; ie = (INDEX_ENTRY*)((u8*)ih + ie_ofs); ie->length = const_cpu_to_le16(sizeof(INDEX_ENTRY_HEADER)); ie->flags = INDEX_ENTRY_END; } /** * ntfs_index_block_alloc - allocate and return an index allocation block * @ictx: index context of the index for which to allocate a block * @dst_vcn: pointer in which to return the VCN of the allocated index block * @dst_ia: pointer in which to return the allocated index block * @dst_upl_ofs: pointer in which to return the mapped address of the page data * @dst_upl: pointer in which to return the page list containing @ia * @dst_pl: pointer in which to return the array of pages containing @ia * @dst_addr: pointer in which to return the mapped address * * Allocate an index allocation block for the index described by the index * context @ictx and return it in *@dst_ia. *@dst_vcn, *@dst_upl_ofs, * *@dst_upl, *@dst_pl, and *@dst_addr are set to the VCN of the allocated * index block and the mapped page list and array of pages containing the * returned index block respectively. * * Return 0 on success and errno on error. On error *@dst_vcn, *@dst_ia, * *@dst_upl_ofs, *@dst_upl, *@dst_pl, and *@dst_addr are left untouched. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for * writing. * - All of the index contexts in the index must be unlocked (this * includes @ictx, i.e. @ictx must not be locked). */ static errno_t ntfs_index_block_alloc(ntfs_index_context *ictx, VCN *dst_vcn, INDEX_BLOCK **dst_ia, s64 *dst_upl_ofs, upl_t *dst_upl, upl_page_info_array_t *dst_pl, u8 **dst_addr) { s64 bmp_pos, end_pos, init_size, upl_ofs; ntfs_inode *bmp_ni, *idx_ni = ictx->idx_ni; upl_t upl; upl_page_info_array_t pl; u8 *bmp, *bmp_end; INDEX_ALLOCATION *ia; errno_t err, err2; lck_rw_type_t lock; le16 usn; BOOL have_resized; u8 bit; ntfs_debug("Entering for inode 0x%llx.", (unsigned long long)idx_ni->mft_no); /* * Get the index bitmap inode. * * Note we do not lock the bitmap inode as the caller holds an * exclusive lock on the index inode thus the bitmap cannot change * under us as the only places the index bitmap is modified is as a * side effect of a locked index inode. * * The inode will be locked later if/when we are going to modify its * size to ensure any relevant VM activity that may be happening at the * same time is excluded. * * We need to take the ntfs inode lock on the bitmap inode for writing. * This causes a complication because there is a valid case in which we * can recurse (once) back into ntfs_index_block_alloc() by calling: * ntfs_attr_resize(bmp_ni) -> * ntfs_index_move_root_to_allocation_block() -> * ntfs_index_block_alloc() * In this case we have to not take the lock again or we will deadlock * (or with lock debugging enabled the kernel will panic()). Thus we * set @ictx->bmp_is_locked so that when we get back here we know not * to take the lock again. */ lock = 0; if (!ictx->bmp_is_locked) { lock = LCK_RW_TYPE_EXCLUSIVE; ictx->bmp_is_locked = 1; } err = ntfs_attr_inode_get(ictx->base_ni, AT_BITMAP, idx_ni->name, idx_ni->name_len, FALSE, lock, &bmp_ni); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to get index bitmap inode " "(error %d).", err); if (lock) ictx->bmp_is_locked = 0; goto err; } /* Find the first zero bit in the index bitmap. */ bmp_pos = 0; lck_spin_lock(&bmp_ni->size_lock); end_pos = bmp_ni->initialized_size; lck_spin_unlock(&bmp_ni->size_lock); for (bmp_pos = 0; bmp_pos < end_pos;) { err = ntfs_page_map(bmp_ni, bmp_pos, &upl, &pl, &bmp, TRUE); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to read index " "bitmap (error %d).", err); goto put_err; } bmp_end = bmp + PAGE_SIZE; if (bmp_pos + PAGE_SIZE > end_pos) bmp_end = bmp + (end_pos - bmp_pos); /* Check the next bit(s). */ for (; bmp < bmp_end; bmp++, bmp_pos++) { if (*bmp == 0xff) continue; /* * TODO: There does not appear to be a ffz() function * in the kernel. )-: If/when the kernel has an ffz() * function, switch the below code to use it. * * So emulate "ffz(x)" using "ffs(~x) - 1" which gives * the same result but incurs extra CPU overhead. */ bit = ffs(~(unsigned long)*bmp) - 1; if (bit < 8) goto allocated_bit; } ntfs_page_unmap(bmp_ni, upl, pl, FALSE); } /* * There are no zero bits in the initialized part of the bitmap. Thus * we extend it by 8 bytes and allocate the first bit of the extension. */ bmp_pos = end_pos; lck_spin_lock(&bmp_ni->size_lock); end_pos = bmp_ni->data_size; lck_spin_unlock(&bmp_ni->size_lock); /* If we are exceeding the bitmap size need to extend it. */ have_resized = FALSE; if (bmp_pos + 8 >= end_pos) { ntfs_debug("Extending index bitmap, old size 0x%llx, " "requested size 0x%llx.", (unsigned long long)end_pos, (unsigned long long)end_pos + 8); err = ntfs_attr_resize(bmp_ni, end_pos + 8, 0, ictx); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to extend index " "bitmap (error %d).", err); goto put_err; } have_resized = TRUE; } /* * Get the page containing the bit we are allocating. Note this has to * happen before we call ntfs_attr_set_initialized_size() as the latter * only sets the initialized size without zeroing the area between the * old initialized size and the new one thus we need to map the page * now so that its tail is zeroed due to the old value of the * initialized size. */ err = ntfs_page_map(bmp_ni, bmp_pos & ~PAGE_MASK_64, &upl, &pl, &bmp, TRUE); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to read index bitmap " "(error %d).", err); /* * There is no need to resize the bitmap back to its old size. * It does not change the metadata consistency to have a bigger * bitmap. */ goto put_err; } bmp += (unsigned)bmp_pos & PAGE_MASK; /* * Extend the initialized size if the bitmap is non-resident. If it is * resident this was already done by the ntfs_attr_resize() call. */ if (NInoNonResident(bmp_ni)) { #ifdef DEBUG lck_spin_lock(&bmp_ni->size_lock); init_size = bmp_ni->initialized_size; lck_spin_unlock(&bmp_ni->size_lock); ntfs_debug("Setting initialized size of index bitmap, old " "size 0x%llx, requested size 0x%llx.", (unsigned long long)init_size, (unsigned long long)end_pos + 8); #endif err = ntfs_attr_set_initialized_size(bmp_ni, end_pos + 8); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to update " "initialized size of index bitmap " "(error %d).", err); ntfs_page_unmap(bmp_ni, upl, pl, FALSE); goto put_err; } } /* Finally, allocate the bit in the page. */ bit = 0; /* * If we used ntfs_attr_resize() to extend the bitmap attribute it is * possible that this caused the index root node to be moved out to an * index allocation block which very likely will have allocated the * very bit we want to allocate so we test for this case and if it has * happened we allocate the next bit along which must be free or * something has gone wrong. */ if (have_resized && *bmp & 1) { if (*bmp & 2) panic("%s(): *bmp & 2\n", __FUNCTION__); bit++; } allocated_bit: *bmp |= 1 << bit; ntfs_page_unmap(bmp_ni, upl, pl, TRUE); /* Set @bmp_pos to the allocated index bitmap bit. */ bmp_pos = (bmp_pos << 3) + bit; /* * If we are caching the last set bit in the bitmap in the index inode * and we allocated beyond the last set bit, update the cached value. */ if (idx_ni->last_set_bit >= 0 && bmp_pos > idx_ni->last_set_bit) idx_ni->last_set_bit = bmp_pos; ntfs_debug("Allocated index bitmap bit 0x%llx.", bmp_pos); /* * We are done with the bitmap and have the index allocation to go. * * If the allocated bit is outside the data size need to extend it. */ lck_spin_lock(&idx_ni->size_lock); end_pos = idx_ni->data_size; lck_spin_unlock(&idx_ni->size_lock); if (bmp_pos >= end_pos >> idx_ni->block_size_shift) { ntfs_debug("Extending index allocation, old size 0x%llx, " "requested size 0x%llx.", (unsigned long long) end_pos, (unsigned long long)(bmp_pos + 1) << idx_ni->block_size_shift); err = ntfs_attr_resize(idx_ni, (bmp_pos + 1) << idx_ni->block_size_shift, 0, ictx); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to extend index " "allocation (error %d).", err); goto alloc_err; } } /* * Map the page containing the allocated index block. As above for the * bitmap we need to get the page before we set the initialized size so * that the tail of the page is zeroed for us. */ upl_ofs = (bmp_pos << idx_ni->block_size_shift) & ~PAGE_MASK_64; err = ntfs_page_map(idx_ni, upl_ofs, &upl, &pl, &bmp, TRUE); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to read index allocation " "block (error %d).", err); /* * There is no need to truncate the index allocation back to * its old size. It does not change the metadata consistency * to have a bigger index allocation. */ goto alloc_err; } /* Extend the initialized size if needed. */ lck_spin_lock(&idx_ni->size_lock); init_size = idx_ni->initialized_size; lck_spin_unlock(&idx_ni->size_lock); if (bmp_pos >= init_size >> idx_ni->block_size_shift) { ntfs_debug("Setting initialized size of index allocation, old " "size 0x%llx, requested size 0x%llx.", (unsigned long long)init_size, (unsigned long long)(bmp_pos + 1) << idx_ni->block_size_shift); err = ntfs_attr_set_initialized_size(idx_ni, (bmp_pos + 1) << idx_ni->block_size_shift); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to update " "initialized size of index allocation " "(error %d).", err); goto unm_err; } } if (lock) { ictx->bmp_is_locked = 0; lck_rw_unlock_exclusive(&bmp_ni->lock); } (void)vnode_put(bmp_ni->vn); /* Lay out an empty index block into the allocated space. */ ia = (INDEX_ALLOCATION*)(bmp + (((unsigned)bmp_pos << idx_ni->block_size_shift) & PAGE_MASK)); /* Preserve the update sequence number across the layout. */ usn = 0; if (le16_to_cpu(ia->usa_ofs) < NTFS_BLOCK_SIZE - sizeof(u16)) usn = *(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs)); /* Calculate the index block vcn from the index block number. */ *dst_vcn = bmp_pos = bmp_pos << idx_ni->block_size_shift >> idx_ni->vcn_size_shift; ntfs_index_block_lay_out(idx_ni, bmp_pos, ia); if (usn && usn != const_cpu_to_le16(0xffff)) *(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs)) = usn; *dst_ia = ia; *dst_upl_ofs = upl_ofs; *dst_upl = upl; *dst_pl = pl; *dst_addr = bmp; ntfs_debug("Done (allocated index block with vcn 0x%llx.", (unsigned long long)bmp_pos); return 0; unm_err: ntfs_page_unmap(idx_ni, upl, pl, FALSE); alloc_err: /* Free the index bitmap bit that we allocated. */ err2 = ntfs_bitmap_clear_bit(bmp_ni, bmp_pos); if (err2) { ntfs_error(idx_ni->vol->mp, "Failed to undo index block " "allocation in index bitmap (error %d). " "Leaving inconsistent metadata. Run chkdsk.", err2); NVolSetErrors(idx_ni->vol); } put_err: if (lock) { ictx->bmp_is_locked = 0; lck_rw_unlock_exclusive(&bmp_ni->lock); } (void)vnode_put(bmp_ni->vn); err: ntfs_error(idx_ni->vol->mp, "Failed (error %d).", err); return err; } /** * ntfs_index_ctx_disconnect_reinit - disconnect and reinit an index context * @ictx: index context to disconnect * * Disconnect the index context @ictx from its tree path and reinitialize its * @up and @down pointers to point to itself. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode. */ static inline void ntfs_index_ctx_disconnect_reinit(ntfs_index_context *ictx) { ntfs_index_ctx_disconnect(ictx); ictx->down = ictx->up = ictx; } /** * ntfs_index_move_root_to_allocation_block - move index root to allocation * @ictx: index lookup context describing index root to move * * Move the index root to an index allocation block in the process switching * the index from a small to a large index if necessary. * * If no index allocation and/or bitmap attributes exist they are created. * * An index block is then allocated and if necessary the index bitmap and/or * the allocation attributes are extended in the process. * * Finally all index entries contained in the index root attribute are moved * into the allocated index block in the index allocation attribute. * * We need to potentially create the index allocation and/or bitmap attributes * so we can move the entries from the index root attribute to the index * allocation attribute and then shrink the index root attribute. However, * there is not enough space in the mft record to do this. Also we already * have the index root attribute looked up so it makes sense to deal with it * first. * * Thus, if there is no index allocation attribute, we allocate the space to be * used by the index allocation attribute and setup the directory inode for a * large index including the allocated space but leaving the initialized size * to zero. We then map and lock the first page containing the now allocated * first index block and move the index entries from the index root into it. * We then shrink the index root and set it up to point to the allocated index * block. * * Having shrunk the index root attribute there is now hopefully enough space * in the mft record to create the index allocation attribute and the index * bitmap attribute in the mft record and the conversion is complete. * * If there is not enough space we create the index allocation and/or index * bitmap attributes in an extent mft record. TODO: This is not implemented * yet. * * If there is an index allocation attribute already, we allocate a temporary * buffer to hold the index block and then copy from there into the allocated * index block later. * * Return 0 on success and errno on error. On error the index context is no * longer usable and must be released or reinitialized. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for * writing. * - The index context @ictx must be locked. */ errno_t ntfs_index_move_root_to_allocation_block(ntfs_index_context *ictx) { VCN vcn; ntfs_inode *base_ni, *idx_ni; ntfs_volume *vol; ntfs_index_context *ir_ictx; upl_t upl; upl_page_info_array_t pl; INDEX_ALLOCATION *ia; INDEX_HEADER *ih; INDEX_ROOT *ir; INDEX_ENTRY *ie; ntfs_attr_search_ctx *actx; MFT_RECORD *m; u32 u, ia_ie_ofs, ir_ie_ofs, clusters = 0; errno_t err, err2; struct { unsigned is_large_index:1; } old; BOOL need_ubc_setsize = FALSE; ntfs_debug("Entering."); if (!ictx->is_locked) panic("%s(): !ictx->is_locked\n", __FUNCTION__); base_ni = ictx->base_ni; idx_ni = ictx->idx_ni; vol = base_ni->vol; /* * The current index context is going to turn from describing the index * root to describing the newly allocated index block. Thus, allocate * a new index context for the emptied index root. */ ir_ictx = ntfs_index_ctx_get(idx_ni); if (!ir_ictx) return ENOMEM; ir_ictx->entries = OSMalloc(ictx->max_entries * sizeof(INDEX_ENTRY*), ntfs_malloc_tag); if (!ir_ictx->entries) { err = ENOMEM; goto err; } /* * If there is no index allocation attribute, we need to allocate an * index block worth of clusters. Thus if the cluster size is less * than the index block size the number of clusters we need to allocate * is the index block size divided by the cluster size and if the * cluster size is greater than or equal to the index block size we * simply allocate one cluster. * * Otherwise we allocate a temporary buffer for the index block. */ if (!NInoIndexAllocPresent(idx_ni)) { clusters = idx_ni->block_size >> vol->cluster_size_shift; if (!clusters) clusters = 1; /* * We cannot lock the runlist as we are holding the mft record * lock. That is fortunately ok as we also have @idx_ni->lock * on the index inode and there is no index allocation * attribute at present so no-one can be using the runlist yet. */ err = ntfs_cluster_alloc(vol, 0, clusters, -1, DATA_ZONE, TRUE, &idx_ni->rl); if (err) { ntfs_error(vol->mp, "Failed to allocate %u clusters " "for the index allocation block " "(error %d).", (unsigned)clusters, err); goto err; } /* Allocate/get the first page and zero it out. */ ntfs_page_grab(idx_ni, 0, &upl, &pl, (u8**)&ia, TRUE); if (err) { ntfs_error(vol->mp, "Failed to grab first page."); goto page_err; } bzero(ia, PAGE_SIZE); } else { /* Allocate a temporary buffer and zero it out. */ ia = OSMalloc(idx_ni->block_size, ntfs_malloc_tag); if (!ia) { err = ENOMEM; goto err; } bzero(ia, idx_ni->block_size); upl = NULL; pl = NULL; } /* * Set up the index block and copy the index entries from the index * root. */ ia->magic = magic_INDX; ia->usa_ofs = const_cpu_to_le16(sizeof(INDEX_BLOCK)); ia->usa_count = cpu_to_le16(1 + (idx_ni->block_size / NTFS_BLOCK_SIZE)); /* Set the update sequence number to 1. */ *(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs)) = cpu_to_le16(1); ih = &ia->index; ia_ie_ofs = (sizeof(INDEX_HEADER) + (le16_to_cpu(ia->usa_count) << 1) + 7) & ~7; ih->entries_offset = cpu_to_le32(ia_ie_ofs); /* Work out the size of the index entries in the index root. */ ir = ictx->ir; ir_ie_ofs = le32_to_cpu(ir->index.entries_offset); /* * FIXME: Should we scan through the index root looking for the last * entry and use that to determine the size instead? Then we could fix * any space being wasted due to a too long index for its entries. */ u = le32_to_cpu(ir->index.index_length) - ir_ie_ofs; ih->index_length = cpu_to_le32(ia_ie_ofs + u); ih->allocated_size = cpu_to_le32(idx_ni->block_size - offsetof(INDEX_BLOCK, index)); ih->flags = LEAF_NODE; ie = (INDEX_ENTRY*)((u8*)&ir->index + ir_ie_ofs); memcpy((u8*)ih + ia_ie_ofs, ie, u); /* * If the index root is a large index, then the index block is an index * node, i.e. not a leaf node. */ if (ir->index.flags & LARGE_INDEX) { old.is_large_index = 1; ih->flags |= INDEX_NODE; } /* * Set up the index context to point into the index allocation block * instead of into the index root. */ ictx->entry = (INDEX_ENTRY*)((u8*)ih + ia_ie_ofs + ((u8*)ictx->entry - (u8*)ie)); ictx->is_root = 0; actx = ictx->actx; ictx->ia = ia; ictx->vcn = 0; ictx->index = ih; ictx->upl_ofs = 0; ictx->upl = upl; ictx->pl = pl; ictx->addr = (u8*)ia; ictx->bytes_free = le32_to_cpu(ia->index.allocated_size) - le32_to_cpu(ia->index.index_length); /* * We have copied the index entries and switched the index context so * we can go ahead and shrink the index root by moving the end entry on * top of the first entry. We only need to do the move if the first * index root entry is not the end entry, i.e. the index is not empty. */ ih = &ir->index; if (!(ie->flags & INDEX_ENTRY_END)) { u8 *index_end; INDEX_ENTRY *end_ie; index_end = (u8*)&ir->index + le32_to_cpu(ih->index_length); for (end_ie = ie;; end_ie = (INDEX_ENTRY*)((u8*)end_ie + le16_to_cpu(end_ie->length))) { /* Bounds checks. */ if ((u8*)end_ie < (u8*)ie || (u8*)end_ie + sizeof(INDEX_ENTRY_HEADER) > index_end || (u8*)end_ie + le16_to_cpu(end_ie->length) > index_end || (u32)sizeof(INDEX_ENTRY_HEADER) + le16_to_cpu(end_ie->key_length) > le16_to_cpu(end_ie->length)) { ntfs_error(vol->mp, "Corrupt index. Run " "chkdsk."); NVolSetErrors(vol); err = EIO; goto ictx_err; } /* Stop when we have found the last entry. */ if (end_ie->flags & INDEX_ENTRY_END) break; } memmove(ie, end_ie, le16_to_cpu(end_ie->length)); } /* * If the end entry is a leaf we need to extend it by 8 bytes in order * to turn it into a node entry. To do this we need to extend the * index root attribute itself in the case that the index root was * empty to start with or we need to do nothing if the index root was * not empty as we have not shrunk the index root attribute yet and we * moved the end index entry forward by at least one index entry which * is much bigger than 8 bytes. * * We do the index root attribute resize unconditionally because it * takes care of the size increase needed in the empty index root case * as well as of the size decrease needed in the non-empty index root * case. */ retry_resize: u = le16_to_cpu(ie->length); if (!(ie->flags & INDEX_ENTRY_NODE)) u += sizeof(VCN); err = ntfs_resident_attr_value_resize(actx->m, actx->a, offsetof(INDEX_ROOT, index) + le32_to_cpu(ih->entries_offset) + u); if (err) { leMFT_REF mref; ATTR_RECORD *a; ntfschar *a_name; ATTR_LIST_ENTRY *al_entry; u8 *al_end; ntfs_attr_search_ctx ctx; /* * This can only happen when the first entry is the last entry, * i.e. this is an empty directory and thus the above memmove() * has not been done, in which case we only need eight bytes * (sizeof(VCN)) more space which means that if we manage to * move out a single attribute out of the mft record we would * gain that much space. In the worst case scenario we can * always move out the index root attribute itself in which * case we are guaranteed to have enough space as an empty * index root is much smaller than an mft record. The only * complication is when there is no attribute list attribute as * we have to add it first. On the bright side that in itself * can cause some space to be freed up an we may have enough to * extend the index root by eight bytes. * * Add an attribute list attribute if it is not present. */ if (!NInoAttrList(base_ni)) { /* * Take a copy of the current attribute record pointer * so we can update all our pointers below. */ a = actx->a; err = ntfs_attr_list_add(base_ni, actx->m, actx); if (err || actx->is_error) { if (!err) err = actx->error; ntfs_error(vol->mp, "Failed to add attribute " "list attribute to mft_no " "0x%llx (error %d).", (unsigned long long) base_ni->mft_no, err); goto ictx_err; } update_and_retry_resize: /* * Need to update our cached pointers as the index root * attribute is likely to have moved. */ ir = (INDEX_ROOT*)((u8*)actx->a + le16_to_cpu(actx->a->value_offset)); ih = &ir->index; ir_ie_ofs = le32_to_cpu(ir->index.entries_offset); ie = (INDEX_ENTRY*)((u8*)&ir->index + ir_ie_ofs); /* * Retry the resize in case we freed eight bytes in the * mft record which is all we need. */ goto retry_resize; } /* * If this is the only attribute record in the mft record we * cannot gain anything by moving it or anything else. This * really cannot happen as we have emptied the index root thus * have made the attribute as small as possible thus it must * fit just fine in an otherwise empty mft record thus we must * have enough space to do the resize thus we cannot have * gotten here. */ if (ntfs_attr_record_is_only_one(actx->m, actx->a)) panic("%s(): ntfs_attr_record_is_only_one()\n", __FUNCTION__); /* * We now know we have an attribute list attribute and that we * still do not have enough space to extend the index root * attribute by eight bytes. * * First, if the index root attribute is already in an extent * mft record move it into a new, empty extent mft record. * This is guaranteed to give us the needed eight bytes. * * Second, look through the mft record to see if there are any * attribute records we can move out into an extent mft record * and if yes move one. That will also definitely give us the * needed eight bytes of space. * * Finally, if no attributes are available for moving then move * the index root attribute itself. */ if (actx->ni != base_ni) { move_idx_root: lck_rw_lock_shared(&base_ni->attr_list_rl.lock); err = ntfs_attr_record_move(actx); lck_rw_unlock_shared(&base_ni->attr_list_rl.lock); if (err) { ntfs_error(vol->mp, "Failed to move index " "root attribute from mft " "record 0x%llx to an extent " "mft record (error %d).", (unsigned long long) actx->ni->mft_no, err); goto ictx_err; } /* * Need to update our cached pointers as the index root * attribute has now moved after which we need to retry * the resize which will now succeed. */ goto update_and_retry_resize; } /* * We know the index root is in the base mft record. * * Look through the base mft record to see if there are any * attribute records we can move out into an extent mft record * and if yes move one. That will also definitely give us the * needed eight bytes of space. */ ntfs_attr_search_ctx_init(&ctx, base_ni, actx->m); ctx.is_iteration = 1; do { err = ntfs_attr_find_in_mft_record(AT_UNUSED, NULL, 0, NULL, 0, &ctx); if (err) { if (err == ENOENT) { /* * No attributes are available for * moving thus move the index root * attribute out of the base mft record * into a new extent. * * TODO: Need to get this case * triggered and then need to run * chkdsk to check for validity of * moving the index root attribute out * of the base mft record. */ goto move_idx_root; } ntfs_error(vol->mp, "Failed to iterate over " "attribute records in base " "mft record 0x%llx (error %d).", (unsigned long long) base_ni->mft_no, err); goto ictx_err; } /* * Skip the standard information attribute, the * attribute list attribute, and the index root * attribute as we are looking for lower priority * attributes to move out and the attribute list * attribute is of course not movable. */ a = ctx.a; } while (a->type == AT_STANDARD_INFORMATION || a->type == AT_ATTRIBUTE_LIST || a->type == AT_INDEX_ROOT); /* * Move the found attribute out to an extent mft record and * update its attribute list entry. * * But first find the attribute list entry matching the * attribute record so it can be updated. */ a_name = (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)); al_entry = (ATTR_LIST_ENTRY*)base_ni->attr_list; al_end = (u8*)al_entry + base_ni->attr_list_size; mref = MK_LE_MREF(base_ni->mft_no, base_ni->seq_no); do { if ((u8*)al_entry >= al_end || !al_entry->length) { ntfs_error(vol->mp, "Attribute list attribute " "in mft_no 0x%llx is " "corrupt. Run chkdsk.", (unsigned long long) base_ni->mft_no); err = EIO; goto ictx_err; } if (al_entry->mft_reference == mref && al_entry->instance == a->instance) { /* * We found the entry, stop looking but first * perform a quick sanity check that we really * do have the correct attribute record. */ if (al_entry->type == a->type && ntfs_are_names_equal( (ntfschar*)((u8*)al_entry + al_entry->name_offset), al_entry->name_length, a_name, a->name_length, TRUE, vol->upcase, vol->upcase_len)) break; ntfs_error(vol->mp, "Found corrupt attribute " "list attribute when looking " "for attribute type 0x%x in " "attribute list attribute of " "base mft record 0x%llx. Run " "chkdsk.", (unsigned)le32_to_cpu(a->type), (unsigned long long) base_ni->mft_no); NVolSetErrors(vol); err = EIO; goto ictx_err; } /* Go to the next attribute list entry. */ al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + le16_to_cpu(al_entry->length)); } while (1); /* Finally, move the attribute to an extent record. */ err = ntfs_attr_record_move_for_attr_list_attribute(&ctx, al_entry, actx, NULL); if (err) { ntfs_error(vol->mp, "Failed to move attribute type " "0x%x out of base mft record 0x%llx " "and into an extent mft record (error " "%d). Run chkdsk.", (unsigned)le32_to_cpu(a->type), (unsigned long long)base_ni->mft_no, err); NVolSetErrors(vol); goto ictx_err; } /* * Sync the modified attribute list attribute value to * metadata/disk. */ ntfs_attr_search_ctx_reinit(&ctx); err = ntfs_attr_list_sync(base_ni, (u8*)al_entry - base_ni->attr_list, &ctx); if (!err) { /* * Need to update our cached pointers as the index root * attribute has likely moved after which we need to * retry the resize which will now succeed. */ goto update_and_retry_resize; } /* * FIXME: We could try and revert the move of the attribute and * the attribute list attribute value change but that is a lot * of work and the only real errors that could happen at this * stage are either that we have run out of memory or that the * disk has gone bad or has been disconnected or there must be * bad memory corruption. In all those cases we are extremely * unlikely to be able to undo what we have done so far due to * further errors so at least for now we do not bother trying. */ ntfs_error(vol->mp, "Failed to sync attribute list attribute " "in mft_no 0x%llx (error %d). Leaving " "corrupt metadata. Run chkdsk.", (unsigned long long)base_ni->mft_no, err); /* * Need to update our cached pointers as the index root * attribute has likely moved. */ ir = (INDEX_ROOT*)((u8*)actx->a + le16_to_cpu(actx->a->value_offset)); ih = &ir->index; ir_ie_ofs = le32_to_cpu(ir->index.entries_offset); ie = (INDEX_ENTRY*)((u8*)&ir->index + ir_ie_ofs); goto ictx_err; } /* * Update the end index entry and the index header to reflect the * changes in size. */ ie->length = cpu_to_le16(u); ih->allocated_size = ih->index_length = cpu_to_le32( le32_to_cpu(ih->entries_offset) + u); /* * Update the index root and end index entry to reflect the fact that * the directory now is a large index and that the end index entry * points to a sub-node and set the sub-node pointer to vcn 0. */ ih->flags |= LARGE_INDEX; ie->flags |= INDEX_ENTRY_NODE; *(leVCN*)((u8*)ie + le16_to_cpu(ie->length) - sizeof(VCN)) = 0; /* * Now setup the new index context for the emptied index root and * attach it at the start of the tree path. */ ir_ictx->entries[0] = ir_ictx->entry = ie; ir_ictx->is_root = 1; ir_ictx->is_locked = 1; ir_ictx->ir = ir; ir_ictx->index = ih; ir_ictx->actx = actx; ir_ictx->bytes_free = le32_to_cpu(actx->m->bytes_allocated) - le32_to_cpu(actx->m->bytes_in_use); ir_ictx->max_entries = ictx->max_entries; ir_ictx->nr_entries = 1; /* Ensure the modified index root attribute is written to disk. */ ntfs_index_entry_mark_dirty(ir_ictx); /* * Attach the new index context between the current index context * (which is the top of the tree) and the index context above it (which * is the bottom of the tree), i.e. make the new index context the top * of the tree. */ ictx->up->down = ir_ictx; ir_ictx->down = ictx; ir_ictx->up = ictx->up; ictx->up = ir_ictx; /* * If the index allocation attribute is not present yet, we need to * create it and the index bitmap attribute. */ if (upl) { /* * The page is now done, mark the index context as dirty so it * gets written out later. */ ntfs_index_entry_mark_dirty(ictx); /* * Set up the index inode to reflect that it has an index * allocation attribute. */ NInoSetIndexAllocPresent(idx_ni); lck_spin_lock(&idx_ni->size_lock); idx_ni->allocated_size = (s64)clusters << vol->cluster_size_shift; idx_ni->initialized_size = idx_ni->data_size = idx_ni->block_size; lck_spin_unlock(&idx_ni->size_lock); if (idx_ni->name == I30) { lck_spin_lock(&base_ni->size_lock); base_ni->allocated_size = idx_ni->allocated_size; base_ni->initialized_size = base_ni->data_size = idx_ni->block_size; lck_spin_unlock(&base_ni->size_lock); } if (!ubc_setsize(idx_ni->vn, idx_ni->data_size)) panic("%s(): ubc_setsize() failed.\n", __FUNCTION__); /* * Find the position at which we need to insert the index * allocation attribute. */ err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, idx_ni->name, idx_ni->name_len, 0, NULL, 0, actx); if (err != ENOENT) { if (!err) { ntfs_error(vol->mp, "Index allocation " "attribute already present " "when it should not be. " "Corrupt index or driver " "bug. Run chkdsk."); NVolSetErrors(vol); err = EIO; } else ntfs_error(vol->mp, "Failed to look up " "position at which to insert " "the index allocation " "attribute (error %d).", err); goto ia_err; } /* Insert the index allocation attribute. */ err = ntfs_insert_index_allocation_attribute(base_ni, actx, idx_ni); if (err || actx->is_error) { ntfs_error(vol->mp, "Failed to %s mft_no 0x%llx " "(error %d).", actx->is_error ? "remap extent mft record of" : "insert index allocation attribute in ", (unsigned long long)base_ni->mft_no, err); goto ia_err; } /* * Ensure the created index allocation attribute is written to * disk. */ NInoSetMrecNeedsDirtying(actx->ni); /* * Find the position at which we need to insert the index * bitmap attribute. */ ntfs_attr_search_ctx_reinit(actx); err = ntfs_attr_lookup(AT_BITMAP, idx_ni->name, idx_ni->name_len, 0, NULL, 0, actx); if (err != ENOENT) { if (!err) { ntfs_error(vol->mp, "Index bitmap attribute " "attribute already present " "when it should not be. " "Corrupt index or driver " "bug. Run chkdsk."); NVolSetErrors(vol); err = EIO; } else ntfs_error(vol->mp, "Failed to look up " "position at which to insert " "the index bitmap attribute " "(error %d).", err); goto bmp_err; } /* * Insert the index bitmap attribute as a resident attribute * with a value length sufficient to cover the number of * allocated index blocks rounded up to a multiple of 8 bytes * which are initialized to zero. We then set the first bit in * the bitmap to reflect the fact that the first index * allocation block is in use. */ err = ntfs_resident_attr_record_insert(base_ni, actx, AT_BITMAP, idx_ni->name, idx_ni->name_len, NULL, (((idx_ni->allocated_size >> idx_ni->block_size_shift) + 63) >> 3) & ~(s64)7); if (err || actx->is_error) { if (!err) err = actx->error; ntfs_error(vol->mp, "Failed to %s mft_no 0x%llx " "(error %d).", actx->is_error ? "remap extent mft record of" : "insert index bitmap attribute in", (unsigned long long)base_ni->mft_no, err); goto bmp_err; } *((u8*)actx->a + le16_to_cpu(actx->a->value_offset)) = 1; /* * Ensure the created index bitmap attribute is written to * disk. */ NInoSetMrecNeedsDirtying(actx->ni); /* * We successfully created the index allocation and bitmap * attributes thus we can set up our cache of the last set bit * in the bitmap in the index inode to 0 to reflect the fact * that we just allocated the first index block. */ idx_ni->last_set_bit = 0; } /* * We are either completely done and can release the attribute search * context and the mft record or we now need to call functions which * will deadlock with us if we do not release the attribute search * context and the mft record so we have to release them now thus we * now unlock the index root context. */ ntfs_index_ctx_unlock(ir_ictx); if (upl) { INDEX_ENTRY **entries; long delta; unsigned i; update_ie_pointers: /* * Note we still hold the locked and referenced index * allocation page which is now attached to the index context * and will be released later when the index context is * released. * * We need to update the index entry pointers in the array to * point into the index block instead of the old index root. */ entries = ictx->entries; delta = (u8*)ictx->entry - (u8*)entries[ictx->entry_nr]; for (i = 0; i < ictx->nr_entries; i++) entries[i] = (INDEX_ENTRY*)((u8*)entries[i] + delta); if (ictx->entry != entries[ictx->entry_nr]) panic("%s(): ictx->entry != entries[ictx->entry_nr]\n", __FUNCTION__); ntfs_debug("Done."); return 0; } /* * We have an index allocation attribute already thus we need to * allocate an index block from it for us to transfer the temporary * index block to. * * We need to set @is_locked to zero because we are using the temporary * buffer for @ia and hence @upl and @pl are zero thus as far as the * index context code is concerned the context is not actually locked * at all. Not doing this can lead to a panic() in * ntfs_index_block_alloc() under some circumstances due to an * assertion check that verifies that @is_locked is zero in * ntfs_attr_resize(). */ ictx->is_locked = 0; err = ntfs_index_block_alloc(ictx, &vcn, &ia, &ictx->upl_ofs, &upl, &ictx->pl, &ictx->addr); if (err) { ntfs_error(vol->mp, "Failed to allocate index allocation " "block (error %d).", err); goto alloc_err; } /* Copy the update sequence number into our temporary buffer. */ *(le16*)((u8*)ictx->ia + le16_to_cpu(ictx->ia->usa_ofs)) = *(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs)); /* * Copy our temporary buffer into the allocated index block, free the * temporary buffer and setup the index context to point to the * allocated index block instead of the temporary one. */ memcpy(ia, ictx->ia, idx_ni->block_size); OSFree(ictx->ia, idx_ni->block_size, ntfs_malloc_tag); ictx->entry = ie = (INDEX_ENTRY*)((u8*)ia + ((u8*)ictx->entry - (u8*)ictx->ia)); ictx->ia = ia; ictx->index = &ia->index; ictx->upl = upl; ictx->is_locked = 1; /* * If the vcn of the allocated index block is not zero, we need to * update the vcn in the index block itself as well as the sub-node * vcn pointer in the index root attribute. */ if (vcn) { ictx->vcn = vcn; ia->index_block_vcn = cpu_to_sle64(vcn); ictx->upl_ofs = (vcn << idx_ni->vcn_size_shift) & ~PAGE_MASK_64; /* Get hold of the mft record for the index inode. */ err = ntfs_mft_record_map(base_ni, &m); if (err) { /* * The only thing that is now incorrect is the vcn * sub-node pointer in the empty index root attribute * but we cannot correct it as we are failing to map * the mft record which we need to be able to rollback. * We leave it to chkdsk to sort this out later. */ ntfs_error(vol->mp, "Cannot rollback partial index " "upgrade because " "ntfs_mft_record_map() failed (error " "%d). Leaving inconsistent " "metadata. Run chkdsk.", err); goto map_err; } actx = ntfs_attr_search_ctx_get(base_ni, m); if (!actx) { err = ENOMEM; ntfs_error(vol->mp, "Cannot rollback partial index " "upgrade because there was not enough " "memory to obtain an attribute search " "context. Leaving inconsistent " "metadata. Run chkdsk."); goto actx_err; } /* Find the index root attribute in the mft record. */ err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len, 0, NULL, 0, actx); if (err) goto lookup_err; /* Get to the index root value. */ ir = (INDEX_ROOT*)((u8*)actx->a + le16_to_cpu(actx->a->value_offset)); /* The first index entry which is also the last one. */ ie = (INDEX_ENTRY*)((u8*)&ir->index + le32_to_cpu(ir->index.entries_offset)); if (!(ie->flags & INDEX_ENTRY_END)) panic("%s(): !(ie->flags & INDEX_ENTRY_END)\n", __FUNCTION__); if (!(ie->flags & INDEX_ENTRY_NODE)) panic("%s(): !(ie->flags & INDEX_ENTRY_NODE)\n", __FUNCTION__); /* Finally, update the vcn pointer of the index entry. */ *(leVCN*)((u8*)ie + le16_to_cpu(ie->length) - sizeof(VCN)) = cpu_to_sle64(vcn); /* * Ensure the updated index entry is written to disk and * release the attribute search context and the mft record. */ NInoSetMrecNeedsDirtying(actx->ni); ntfs_attr_search_ctx_put(actx); ntfs_mft_record_unmap(base_ni); } /* * The page is now done, mark the index context as dirty so it gets * written out later. */ ntfs_index_entry_mark_dirty(ictx); goto update_ie_pointers; lookup_err: ntfs_error(vol->mp, "Cannot rollback partial index upgrade because we " "failed to look up the index root attribute (error " "%d). Leaving inconsistent metadata. Run chkdsk.", err); if (err == ENOENT) err = EIO; ntfs_attr_search_ctx_put(actx); actx_err: ntfs_mft_record_unmap(base_ni); map_err: /* * The page is now done, mark the index context as dirty so it gets * written out later. */ ntfs_index_entry_mark_dirty(ictx); goto err; alloc_err: /* * We need to get hold of the mft record and get a new search context * so we can restore the index root attribute. */ err2 = ntfs_mft_record_map(base_ni, &m); if (err2) { ntfs_error(vol->mp, "Cannot rollback partial index upgrade " "because ntfs_mft_record_map() failed (error " "%d). Leaving inconsistent metadata. Run " "chkdsk.", err2); undo_alloc_err2: /* * Mark the index context invalid given it neither has an index * block and page nor an index root and mft record attached. */ ictx->entry = NULL; goto undo_alloc_err; } actx = ntfs_attr_search_ctx_get(base_ni, m); if (!actx) { ntfs_error(vol->mp, "Cannot rollback partial index upgrade " "because there was not enough memory to " "obtain an attribute search context. Leaving " "inconsistent metadata. Run chkdsk."); ntfs_mft_record_unmap(base_ni); goto undo_alloc_err2; } goto restore_ir; bmp_err: ntfs_attr_search_ctx_reinit(actx); err2 = ntfs_attr_lookup(AT_INDEX_ALLOCATION, idx_ni->name, idx_ni->name_len, 0, NULL, 0, actx); if (err2) { ntfs_error(vol->mp, "Cannot rollback partial index upgrade " "because looking up the index allocation " "attribute failed (error %d). Leaving " "inconsistent metadata. Run chkdsk.", err2); bmp_err_err: NVolSetErrors(vol); /* * Everything is actually consistent except for the fact that * the index bitmap attribute is missing so it is better if we * do not do any kind of rollback. Hopefully, chkdsk will fix * this by adding the missing index bitmap attribute. */ goto err; } /* Remove the added index allocation attribute. */ err2 = ntfs_attr_record_delete(base_ni, actx); if (err2) { ntfs_error(vol->mp, "Cannot rollback partial index upgrade " "because deleting the index allocation " "attribute failed (error %d). Leaving " "inconsistent metadata. Run chkdsk.", err2); goto bmp_err_err; } ia_err: /* Reset the inode. */ lck_spin_lock(&idx_ni->size_lock); idx_ni->initialized_size = idx_ni->data_size = idx_ni->allocated_size = 0; lck_spin_unlock(&idx_ni->size_lock); if (idx_ni->name == I30) { lck_spin_lock(&base_ni->size_lock); base_ni->initialized_size = base_ni->data_size = base_ni->allocated_size = 0; lck_spin_unlock(&base_ni->size_lock); } NInoClearIndexAllocPresent(idx_ni); /* * Cannot call ubc_setsize() because we have the page pinned so delay * until we dump the page later on. */ need_ubc_setsize = TRUE; /* * Pretend the @ir_ictx is not locked as we are going to transfer the * index root back to @ictx. */ ir_ictx->is_locked = 0; ir_ictx->actx = NULL; /* Restore the index root attribute. */ ntfs_attr_search_ctx_reinit(actx); restore_ir: err2 = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len, 0, NULL, 0, actx); if (err2) { ntfs_error(vol->mp, "Cannot rollback partial index upgrade " "because looking up the index root attribute " "failed (error %d). Leaving inconsistent " "metadata. Run chkdsk.", err2); NVolSetErrors(vol); goto ictx_err; } ir = (INDEX_ROOT*)((u8*)actx->a + le16_to_cpu(actx->a->value_offset)); ih = &ir->index; ie = (INDEX_ENTRY*)((u8*)ih + ir_ie_ofs); u = ir_ie_ofs + (le32_to_cpu(ia->index.index_length) - ia_ie_ofs); err2 = ntfs_resident_attr_value_resize(actx->m, actx->a, offsetof(INDEX_ROOT, index) + u); if (err2) { ntfs_error(vol->mp, "Cannot rollback partial index upgrade " "because resizing the index root attribute " "failed (error %d). Leaving inconsistent " "metadata. Run chkdsk.", err2); NVolSetErrors(vol); goto ictx_err; } if (!old.is_large_index) ih->flags &= ~LARGE_INDEX; ih->allocated_size = ih->index_length = cpu_to_le32(u); memcpy(ie, (u8*)&ia->index + ia_ie_ofs, le32_to_cpu(ia->index.index_length) - ia_ie_ofs); /* Ensure the restored index root attribute is written to disk. */ NInoSetMrecNeedsDirtying(actx->ni); ictx_err: /* * Reset the index context. We may be setting @ictx->entry to a bogus * value but it does not matter because we are returning an error code * thus the caller must not use the index context and while the value * may be bogus it is correctly non-NULL thus the index context will be * cleaned up correctly when the caller releases it. */ ictx->entry = ictx->entries[ictx->entry_nr]; ictx->is_root = 1; ictx->is_locked = 1; ictx->ir = ir; ictx->index = ih; ictx->actx = actx; if (!actx->is_error) ir_ictx->bytes_free = le32_to_cpu(actx->m->bytes_allocated) - le32_to_cpu(actx->m->bytes_in_use); if (!upl) { undo_alloc_err: OSFree(ia, idx_ni->block_size, ntfs_malloc_tag); } else { /* Destroy the page. */ ntfs_page_dump(idx_ni, upl, pl); if (need_ubc_setsize && !ubc_setsize(idx_ni->vn, 0)) panic("%s(): ubc_setsize() failed.\n", __FUNCTION__); page_err: err2 = ntfs_cluster_free_from_rl(vol, idx_ni->rl.rl, 0, -1, NULL); if (err2) { ntfs_error(vol->mp, "Failed to rollback cluster " "allocation (error %d). Run chkdsk " "to recover the lost space.", err2); NVolSetErrors(vol); } err2 = ntfs_rl_truncate_nolock(vol, &idx_ni->rl, 0); if (err2) panic("%s(): err2\n", __FUNCTION__); } err: /* * Dissociate the allocated index root context from the tree path and * throw it away. We do this here unconditionally as it works both for * the case where we have not attached it to the tree path yet and for * the case where we have already attached it to the tree path. We * have to deal with the former case here so cannot defer the cleanup * to the ntfs_index_ctx_put() of the index context @ictx that will be * done by the caller. */ ntfs_index_ctx_disconnect_reinit(ir_ictx); ntfs_index_ctx_put(ir_ictx); ntfs_error(vol->mp, "Failed (error %d).", err); return err; } /** * ntfs_index_root_make_space - make space for an index entry in the index root * @ictx: index entry in front of which to make space * @ie_size: size of the index entry to make space for * * Return 0 on success and ENOSPC if there is not enough space in the mft * record to insert an index entry of size @ie_size in the index root * attribute. * * Note that we do not update the array of index entry pointers nor the number * of entries in the array because on success it means that the index entry * being added will be created and the function is then done, i.e. the array of * index entry pointers will not be used any more and on error we have not done * anything at all so there is no need to update any of the pointers. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for * writing. * - The index context @ictx must be locked. */ static errno_t ntfs_index_root_make_space(ntfs_index_context *ictx, const u32 ie_size) { MFT_RECORD *m = ictx->actx->m; const u32 muse = le32_to_cpu(m->bytes_in_use); const u32 new_muse = muse + ie_size; ntfs_debug("Entering."); if (new_muse <= le32_to_cpu(m->bytes_allocated)) { INDEX_ENTRY *ie = ictx->entry; ATTR_RECORD *a; INDEX_HEADER *ih; /* * Extend the index root attribute so it has enough space for * the new index entry. As an optimization we combine the * resizing of the index root attribute and the moving of index * entries within the attribute into a single operation. */ memmove((u8*)ie + ie_size, ie, muse - ((u8*)ie - (u8*)m)); /* Adjust the mft record to reflect the change in used space. */ m->bytes_in_use = cpu_to_le32(new_muse); /* * Adjust the attribute record to reflect the changes in the * size of the attribute record and in the size of the * attribute value. */ a = ictx->actx->a; a->length = cpu_to_le32(le32_to_cpu(a->length) + ie_size); a->value_length = cpu_to_le32(le32_to_cpu(a->value_length) + ie_size); /* Adjust the index header to reflect the change in length. */ ih = ictx->index; ih->allocated_size = ih->index_length = cpu_to_le32( le32_to_cpu(ih->index_length) + ie_size); ntfs_debug("Done."); return 0; } ntfs_debug("Failed (not enough space in mft record to insert index " "entry into index root attribute)."); return ENOSPC; } /** * ntfs_index_block_make_space - make space for an index entry in an index block * @ictx: index entry in front of which to make space * @ie_size: size of the index entry to make space for * * Return 0 on success and ENOSPC if there is not enough space in the index * block to insert an index entry of size @ie_size. * * Note that we do not update the array of index entry pointers nor the number * of entries in the array because on success it means that the index entry * being added will be created and the function is then done, i.e. the array of * index entry pointers will not be used any more and on error we have not done * anything at all so there is no need to update any of the pointers. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for * writing. * - The index context @ictx must be locked. */ static errno_t ntfs_index_block_make_space(ntfs_index_context *ictx, const u32 ie_size) { INDEX_HEADER *ih = ictx->index; const u32 ilen = le32_to_cpu(ih->index_length); const u32 new_ilen = ilen + ie_size; ntfs_debug("Entering."); if (new_ilen <= le32_to_cpu(ih->allocated_size)) { INDEX_ENTRY *ie = ictx->entry; /* Move the index entries to make space for the new entry. */ memmove((u8*)ie + ie_size, ie, ilen - ((u8*)ie - (u8*)ih)); /* Adjust the index header to reflect the change in length. */ ih->index_length = cpu_to_le32(new_ilen); ntfs_debug("Done."); return 0; } ntfs_debug("Failed (not enough space in index block to insert index " "entry)."); return ENOSPC; } /** * ntfs_index_node_make_space - make space for an index entry in an index node * @ictx: index entry in front of which to make space * @ie_size: size of the index entry to make space for * * Return 0 on success and ENOSPC if there is not enough space in the index * node to insert an index entry of size @ie_size. * * Note that we do not update the array of index entry pointers nor the number * of entries in the array because on success it means that the index entry * being added will be created and the function is then done, i.e. the array of * index entry pointers will not be used any more and on error we have not done * anything at all so there is no need to update any of the pointers. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for * writing. * - The index context @ictx must be locked. */ static errno_t ntfs_index_node_make_space(ntfs_index_context *ictx, const u32 ie_size) { errno_t err; if (ictx->is_root) err = ntfs_index_root_make_space(ictx, ie_size); else err = ntfs_index_block_make_space(ictx, ie_size); return err; } /** * ntfs_index_ctx_revalidate - revalidate the pointers in an index context * @new: index context with the new mapped virtual address * @old: index context to revalidate * * Revalidate all pointers in the index context @old and adjust them for the * changed mapped virtual address in the index context @new. * * Note both @new and @old must be in the same VM page. * * We need to revalidate all the pointers in @old just as if we were locking it * because @new may have been mapped into a different virtual address than @old * was last time thus the pointers in @old may be stale. * * To check if revalidation is needed is done by comparing @old->addr and * @new->addr. If they match all is ok and if not then need to revalidate @old * with the new address of @new->addr. * * Locking: - Caller must hold @new->idx_ni->lock on the index inode. * - The index context @old must be locked. * - The index context @new must not be locked. */ static void ntfs_index_ctx_revalidate(ntfs_index_context *new, ntfs_index_context *old) { long delta; INDEX_ENTRY **entries; unsigned u; if (!new->is_locked) panic("%s(): !new->is_locked\n", __FUNCTION__); if (new->is_root) panic("%s(): new->is_root\n", __FUNCTION__); if (old->is_locked) panic("%s(): old->is_locked\n", __FUNCTION__); if (old->is_root) panic("%s(): old->is_root\n", __FUNCTION__); delta = new->addr - old->addr; if (!delta) return; /* Revalidation is needed. */ old->entry = (INDEX_ENTRY*)((u8*)old->entry + delta); old->ia = (INDEX_ALLOCATION*)((u8*)old->ia + delta); old->index = &old->ia->index; old->addr = new->addr; entries = old->entries; for (u = 0; u < old->nr_entries; u++) entries[u] = (INDEX_ENTRY*)((u8*)entries[u] + delta); } /** * ntfs_index_ctx_lock_two - lock two index contexts in a deadlock-free fashion * @a: first index context to lock (this may be locked) * @b: second index context to lock (this must not be locked) * * Lock the two index contexts @a and @b. @a may already be locked and it may * need to be unlocked if it has to be locked after @b is locked. * * The following lock ordering rules are applied: * * - An index block node must be locked before an index root node. * * - Two index block nodes must be locked in ascending page offset order. * * - Two index block nodes that are physically in the same VM page must share * the lock. The way this is implemented is that @a is really locked and @b * is not actually locked but all its pointers are revalidated as if it were * locked. This is ok because @a is locked and therefore the VM page is * mapped, locked, and pinned and will not go anywhere until we are done. * The reason we need to do the revalidation is because @b when it was mapped * previously may have been mapped at a different virtual address than the * one @a is mapped at now. This means that when you are unlocking @b you * must check if @b is locked and only unlock it if so. * * Return 0 on success or errno on error. On error @a and @b may be both left * unlocked or one can be left locked and one unlocked. */ static errno_t ntfs_index_ctx_lock_two(ntfs_index_context *a, ntfs_index_context *b) { errno_t err; if (b->is_locked) panic("%s(): b->is_locked\n", __FUNCTION__); if (a->is_root) { /* * @a is the index root so it has to be locked second. * * Unlock @a if it is already locked. */ if (a->is_locked) ntfs_index_ctx_unlock(a); } else if (b->is_root) { /* @b is the index root. If @a is not locked, lock it. */ if (!a->is_locked) { err = ntfs_index_ctx_relock(a); if (err) return err; } } else { /* * Both @a and @b are index blocks. * * Do we need to share the lock because both index nodes are in * the same VM page? */ if (a->upl_ofs == b->upl_ofs) { if (!a->is_locked) { err = ntfs_index_ctx_relock(a); if (err) return err; } ntfs_index_ctx_revalidate(a, b); return 0; } if (a->is_locked) { /* Do we have to unlock @a before locking @b? */ if (a->upl_ofs > b->upl_ofs) ntfs_index_ctx_unlock(a); } else { /* Do we need to lock @a first? */ if (a->upl_ofs < b->upl_ofs) { err = ntfs_index_ctx_relock(a); if (err) return err; } } } /* Lock @b. */ err = ntfs_index_ctx_relock(b); /* If @a is currently locked or there was an error we are done. */ if (a->is_locked || err) return err; /* * We unlocked @a so we could lock @b or @a was not locked. * * Lock @a and we are done. */ return ntfs_index_ctx_relock(a); } /** * ntfs_index_entry_add_or_node_split - add a key to an index * @ictx: index context specifying the node to split/position to add at * @split_only: if true do not insert, only split the index node * @entry_size: size of the entry that will be added after the split * @key: key to add to the directory or view index * @key_len: size of key @key in bytes * @data: data to associate with the key @key if a view index * @data_len: size of data @data in bytes if a view index * * If @split_only is true, split the index node pointed to by @ictx. @ictx * also points to the entry in the index node at which an entry will be * inserted after the split is completed. @entry_size is the size of the entry * that will be added at that position. These are used so that the split is * performed with consideration with the insertion that is likely to come. And * if the insertion does not happen it does not matter much, it just means the * split was not quite on the median entry but off by one. * * In this case @key, @key_len, @data, and @data_len are ignored. * * If @split_only is false @entry_size is ignored and @key, @key_len, @data, * and @data_len are used. * * In this case, if @ictx belongs to a directory index, insert the filename * attribute @key of length @key_len bytes in the directory index at the * position specified by the index context @ictx and point the inserted index * entry at the mft reference *@data which is the mft reference of the inode to * which the filename @fn belongs. @data_len must be zero in this case. * * If @ictx belongs to a view index, insert the key @key of length @key_len * bytes in the view index at the position specified by the index context @ictx * and associate the data @data of size @data_len bytes with the key @key. * * Return 0 on success and errno on error. On error the index context is no * longer usable and must be released or reinitialized. * * Note that we do not update the array of index entry pointers nor the number * of entries in the array because on success it means that the index node has * been split and the function is done, i.e. the array of index entry pointers * will not be used any more and on error the index context becomes invalid so * there is no need to update any of the pointers. The caller is expected to * restart its operations by doing a new index lookup if it wants to continue * working on the index for that reason. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for * writing. * - The index context @ictx must be locked. */ errno_t ntfs_index_entry_add_or_node_split(ntfs_index_context *ictx, const BOOL split_only, u32 entry_size, const void *key, const u32 key_len, const void *data, const u32 data_len) { VCN vcn = 0; ntfs_index_context *cur_ictx, *child_ictx; ntfs_inode *bmp_ni, *idx_ni = ictx->idx_ni; u32 data_ofs = 0; errno_t err, err2; const BOOL is_view = (idx_ni->name != I30); ntfs_debug("Entering."); if (!ictx->is_locked) panic("%s(): !ictx->is_locked\n", __FUNCTION__); /* * If this is only a split @entry_size contains the size of the entry * and the @key and @data are not set (and not needed/used). * * If this is an addition, @entry_size is not set and we need to work * it out from the supplied @key and @data. */ if (!split_only) { /* * Calculate the size for the index entry to be added. For a * directory index, the mft reference @data is embedded inside * the directory index entry thus we only need space for the * index entry header and the filename attribute @key. * * For a view index, the data follows the key aligned to a * 4-byte boundary. * * As an additional complication, the $SDH index in the $Secure * system file contains an extra magic which is not counted in * the data length @data_len so we need to add it by hand here. */ entry_size = sizeof(INDEX_ENTRY_HEADER) + key_len; if (is_view) { data_ofs = (entry_size + 4) & ~4; entry_size = data_ofs + data_len; if (idx_ni == idx_ni->vol->secure_sdh_ni) entry_size += sizeof(((SDH_INDEX_DATA*)NULL)-> magic); } /* * Align the index entry size to an 8-byte boundary and add * another 8 bytes to the entry size if the insertion is to * happen in an index node. */ entry_size = ((entry_size + 7) & ~7) + ((ictx->index->flags & INDEX_NODE) ? sizeof(leVCN): 0); } /* Set the current entry to be the entry to be added to the index. */ cur_ictx = ictx; cur_ictx->promote_inserted = 0; cur_ictx->insert_to_add = 1; cur_ictx->insert_entry_size = entry_size; /* * We need to loop over each index node on the tree path starting at * the bottom. For each traversed node, we check if the current entry * to be inserted fits and if it does not we need to split that node. * We make the arrangements to be able to do so and then move onto the * next node up the tree and so on until we find a node which does have * enough space to fit the index entry to be inserted. We can then * break out of the loop and begin work on the actual addition of the * entry and the splitting of the so marked nodes. */ while (cur_ictx->insert_entry_size > cur_ictx->bytes_free) { ntfs_index_context *insert_ictx; unsigned median_entry_nr, insert_entry_nr, entry_nr; u32 insert_entry_size; BOOL insert_to_add; /* * The entry to be inserted into this node @cur_ictx is * specified by its @insert_entry and @insert_entry_size. */ if (!cur_ictx->is_locked) panic("%s(): !cur_ictx->is_locked\n", __FUNCTION__); /* * We do not have enough space in the current node to insert * the entry to be inserted. * * If the current node is the index root we need to move the * index root to an index block and if successful restart the * loop to determine if we now have enough space to insert the * entry. * * This causes the B+tree to grow in height by one and is in * fact the only operation that can cause the tree to grow in * height. All other operations can only cause the tree to * grow in width. */ if (cur_ictx->is_root) { ntfs_debug("Moving index root into index allocation " "block."); /* * If we speculatively locked the index node containing * the index entry to insert into the index root, * unlock it now so we can move the index root to an * index block. */ insert_ictx = cur_ictx->insert_ictx; if (insert_ictx && insert_ictx->is_locked) { if (insert_ictx->is_root) panic("%s(): insert_ictx->is_root\n", __FUNCTION__); ntfs_index_ctx_unlock(insert_ictx); } err = ntfs_index_move_root_to_allocation_block( cur_ictx); if (!err) continue; ntfs_error(idx_ni->vol->mp, "Failed to move index " "root to index allocation block " "(error %d).", err); goto err; } ntfs_debug("Need to split index block with VCN 0x%llx.", (unsigned long long)cur_ictx->vcn); /* * We do not have enough space in the current node which we now * know is an index allocation block. We have to split the * node and promote the median entry into the parent node which * may in turn involve a split. We do not perform the split * but instead work out what needs to be done and allocate any * needed index blocks in advance so we cannot run out of disk * space and/or memory half-way through and only then do we do * the actual work on the index nodes. This preparation * involves the following steps: * * 1. Work out the median entry to be promoted into the parent * node of the current node and unlock the current node. * 2. If the entry to be promoted is not the entry to be * inserted, make a note of the fact that the entry to be * inserted is to be inserted into the current node in the * index context. * 3. Allocate a new index block and make a note of it in the * current index context. * 4. Set the parent node as the current node. * 5. Set the median entry up as the current entry to be * inserted. * * Then go round the loop again checking whether there is * enough space to insert the now current entry into the now * current node... * * First of all, mark the node as needing to be split. */ cur_ictx->split_node = 1; /* * 1. Determine the median index entry. * * Note we exclude the end entry from the median calculation * because it is not eligible for promotion thus we should use * @ictx->nr_entries - 1 but we need to take into account the * not yet inserted entry for which we are doing the split in * the first place so that is a + 1 and we also start counting * entries at 0 and not 1 thus we apply a - 1 for that. Thus * we have - 1 + 1 - 1 = -1. */ median_entry_nr = (cur_ictx->nr_entries - 1) / 2; /* * @entry_nr is the index into the array of index entry * pointers of the index entry in front of which the entry to * be inserted @cur_ictx->insert_entry needs to be inserted. */ entry_nr = cur_ictx->entry_nr; /* * If the position at which to insert the entry to be inserted * is the median promote the entry to be inserted. If the * number of entries is even there are two possible medians. * We choose the first one for simplicity. * * If the entry to be inserted is before the median, subtract 1 * from the median. * * Otherwise promote the median entry. * * The only exception is when @split_only is true and the * current node is the node we are meant to split in which * case there is no entry to be inserted and thus it cannot be * promoted. In this case we recalculate the median ignoring * the entry to be inserted and promote that. */ if (entry_nr == median_entry_nr && (!split_only || cur_ictx != ictx)) { insert_to_add = cur_ictx->insert_to_add; insert_entry_nr = cur_ictx->insert_entry_nr; insert_entry_size = cur_ictx->insert_entry_size; insert_ictx = cur_ictx->insert_ictx; /* * 2. The entry to be promoted is the entry to be * inserted, make a note of that fact. */ cur_ictx->promote_inserted = 1; } else { if (entry_nr < median_entry_nr) median_entry_nr--; else if (entry_nr == median_entry_nr) { if (!split_only) panic("%s(): !split_only\n", __FUNCTION__); if (cur_ictx != ictx) panic("%s(): cur_ictx != ictx\n", __FUNCTION__); /* * We must have at least one real entry or * there is nothing to promote. This cannot * happen unless something has gone wrong. */ if (cur_ictx->nr_entries < 2) panic("%s(): cur_ictx->nr_entries < " "2\n", __FUNCTION__); median_entry_nr = (cur_ictx->nr_entries - 2) / 2; } insert_to_add = FALSE; insert_entry_nr = median_entry_nr; insert_entry_size = le16_to_cpu(cur_ictx-> entries[median_entry_nr]->length); insert_ictx = cur_ictx; } /* * If this is the very first promotion and we are promoting a * leaf entry we need to add 8 bytes to the size of the entry * being promoted to allow for the VCN sub-node pointer that * will be added at the end of the entry when it is inserted * into the parent index node. */ if (cur_ictx == ictx && (!(cur_ictx->index->flags & INDEX_NODE) ? sizeof(leVCN): 0)) insert_entry_size += sizeof(VCN); /* * Record which entry is being promoted, i.e. where we need to * perform the split of the node. */ cur_ictx->promote_entry_nr = median_entry_nr; // TODO: Possible optimization: Allow ntfs_index_block_alloc() // to do the unlocking and thus to consume the lock if the new // index block is in the same page as the current index block. if (!cur_ictx->is_locked) panic("%s(): !cur_ictx->is_locked\n", __FUNCTION__); ntfs_index_ctx_unlock(cur_ictx); /* * 3. Allocate a new index block and make a note of it in the * current index context. * * The call may cause the index root attribute to have its * entries moved out to an index block. That is fine as we * have not looked at any of our parent entries yet and the * index root must be above us given we are a child node. */ err = ntfs_index_block_alloc(cur_ictx, &cur_ictx->right_vcn, &cur_ictx->right_ia, &cur_ictx->right_upl_ofs, &cur_ictx->right_upl, &cur_ictx->right_pl, &cur_ictx->right_addr); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to allocate index " "allocation block (error %d).", err); goto err; } ntfs_debug("Allocated index block for right-hand node with " "VCN 0x%llx.", (unsigned long long) cur_ictx->right_vcn); /* We need to unmap the newly allocated index block. */ ntfs_page_unmap(idx_ni, cur_ictx->right_upl, cur_ictx->right_pl, TRUE); cur_ictx->right_upl = NULL; /* 4. Set the parent node as the current node and lock it. */ cur_ictx = cur_ictx->up; if (cur_ictx->is_locked) panic("%s(): cur_ictx->is_locked\n", __FUNCTION__); if (cur_ictx->is_root) { /* * We have reached the index root. We speculatively * lock the index node containing the index entry to * insert into the index root, as we cannot lock it * once we have locked the mft record (which happens * when we lock the current index context below) as * that would cause lock reversal and thus deadlocks. */ if (insert_ictx) { if (insert_ictx->is_root) panic("%s(): insert_ictx->is_root\n", __FUNCTION__); if (insert_ictx->is_locked) panic("%s(): insert_ictx->is_locked\n", __FUNCTION__); err = ntfs_index_ctx_relock(insert_ictx); if (err) goto err; } } /* Lock the current index context. */ err = ntfs_index_ctx_relock(cur_ictx); if (err) goto err; /* * 5. Set the median entry up as the current entry to be * inserted. */ cur_ictx->insert_to_add = insert_to_add; cur_ictx->insert_entry_nr = insert_entry_nr; cur_ictx->insert_entry_size = insert_entry_size; cur_ictx->insert_ictx = insert_ictx; } /* * Get the child node index context if we are not at the bottom of the * tree already or rather at the node for which we were called (which * may not actually be at the bottom of the tree). */ child_ictx = NULL; if (cur_ictx != ictx) { child_ictx = cur_ictx->down; if (child_ictx->is_root) panic("%s(): child_ictx->is_root\n", __FUNCTION__); } /* * If the node we were called for was the index root then it will have * been moved to an index allocation block which will likely have * created enough space to fit the entry to be inserted thus if this is * a split only call nothing further needs to be done. */ if (cur_ictx == ictx && split_only) { ntfs_debug("Done (index root was upgraded to index " "allocation, no further split needed)."); return 0; } /* * We now have prepared everything so we are guaranteed not to run out * of disk space and can begin doing the work. * * TODO: We can still fail if relocking of an index context fails but * that can really only happen under extreme memory pressure and there * is not much we can do about that without <rdar://problem/4992358> * being implemented. Thus for now we simply bomb out with an error * message and leave the index potentially badly corrupted and also we * potentially leave some allocated index blocks actually unused. This * is highly unsatisfactory but rolling back once the modifications * have begun is pretty much impossible without keeping a lot more * state so we do not implement rollback. Once the aforementioned * radar is implemented we will no longer suffer from this problem and * it will no longer be possible to fail once we get here. * * We start with the current node which is the top-most node we need to * deal with. We know from above it has enough space and does not need * to be split. */ do { ntfs_index_context *insert_ictx; INDEX_ENTRY *entry, *right_entry, *end_entry; INDEX_HEADER *right_index; unsigned u; if (!cur_ictx->is_locked) panic("%s(): !cur_ictx->is_locked\n", __FUNCTION__); /* * The current node is either the top-most node we need to deal * with thus we know it has enough space or we have just split * it and thus also know that it must have enough space. * * Thus the insertion is now a matter of doing a simple insert * into the node unless the current node has had its entry to * be inserted promoted into the parent node in which case * there is nothing to insert into this node. * * Note: Use goto to reduce indentation. */ insert_ictx = cur_ictx->insert_ictx; if (cur_ictx->promote_inserted) goto skip_insert; /* * We now need to begin to do the insertion but there is one * complication we need to deal with first and that is that if * we are inserting a promoted entry, we have to lock the index * node it is in also. * * If the current node is an index block we may need to unlock * it so that we can ensure to always take the node lock in * ascending page offset order. Also we have to consider the * case where the two index nodes are in the same page in which * case we need to share the index lock. * * If the current node is the index root and we are inserting a * promoted entry, we speculatively locked the index node * containing the entry in the above loop thus we do not need * to do anything here. */ if (!cur_ictx->insert_to_add) { if (!insert_ictx) panic("%s(): !insert_ictx\n", __FUNCTION__); if (cur_ictx->is_root) { if (!insert_ictx->is_locked) panic("%s(): !insert_ictx->" "is_locked\n", __FUNCTION__); } else { err = ntfs_index_ctx_lock_two(cur_ictx, insert_ictx); if (err) goto relock_err; } } entry_size = cur_ictx->insert_entry_size; /* * Everything we need is locked and we know there is enough * space in the index node thus make space for the entry to be * inserted at the appropriate place and then insert the index * entry by either copying it from the appropriate, locked * child node or by creating it in place. The latter happens * when the entry being inserted is the entry being added with * this call to ntfs_index_entry_add(), i.e. it is the reason * we have been called in the first place and thus there is no * index entry to copy from. */ err = ntfs_index_node_make_space(cur_ictx, entry_size); if (err) panic("%s(): err\n", __FUNCTION__); /* * We have modified the index node. Make sure it is written * out later. */ ntfs_index_entry_mark_dirty(cur_ictx); if (!cur_ictx->insert_to_add) { entry = insert_ictx->entries[ cur_ictx->insert_entry_nr]; /* Copy the index entry into the created space. */ memcpy(cur_ictx->entry, entry, le16_to_cpu(entry->length)); entry = cur_ictx->entry; } else { if (split_only) panic("%s(): split_only\n", __FUNCTION__); entry = cur_ictx->entry; /* * Clear the created space so we start with a clean * slate and do not need to worry about initializing * all the zero fields. */ bzero(entry, entry_size); /* Create the index entry in the created space. */ if (!is_view) entry->indexed_file = *(leMFT_REF*)data; else { u8 *new_data; new_data = (u8*)entry + data_ofs; entry->data_offset = cpu_to_le16(data_ofs); entry->data_length = cpu_to_le16(data_len); if (data_len) memcpy(new_data, data, data_len); /* * In the case of $Secure/$SDH we leave the * extra magic to zero rather than setting it * to "II" in Unicode. This could easily be * changed if deemed better and/or necessary by * uncommenting the below code. */ #if 0 if (idx_ni == idx_ni->vol->secure_sdh_ni) { static const ntfschar SDH_magic[2] = { const_cpu_to_le16('I'), const_cpu_to_le16('I') }; memcpy(((SDH_INDEX_DATA*)data)->magic, SDH_magic, sizeof(SDH_magic)); } #endif } entry->key_length = cpu_to_le16(key_len); memcpy(&entry->key, key, key_len); } /* * If the copied entry is a leaf entry and it is being inserted * into a non-leaf node, we need to rewrite its size with the * new size which includes the VCN sub-node pointer. * * We just overwrite the length unconditionally and use it to * set the length of the just created index entry, too. * * There is no harm in doing this as we are either updating the * size which we must do or we are overwriting the size with * the same value it already has which has no effect. */ entry->length = cpu_to_le16(entry_size); /* * If the current node is not a leaf node, we have to fix up * the VCN sub-node pointers both in the entry we just inserted * and in the entry in front of which we inserted. * * The inserted index entry needs to point to what will be the * left-hand index node after our child node is split. * * The index entry in front of which we inserted needs to point * to what will be the right-hand index node after our child * node is split. * * The INDEX_NODE check is fine for the index root, too, * because as it happens LARGE_INDEX == INDEX_NODE. */ if (cur_ictx->index->flags & INDEX_NODE) { if (!child_ictx) panic("%s(): !child_ictx\n", __FUNCTION__); if (entry->flags & INDEX_ENTRY_END) panic("%s(): entry->flags & INDEX_ENTRY_END\n", __FUNCTION__); entry->flags |= INDEX_ENTRY_NODE; *(leVCN*)((u8*)entry + entry_size - sizeof(VCN)) = cpu_to_sle64(child_ictx->vcn); ntfs_debug("Setting VCN sub-node pointer of inserted " "index entry to 0x%llx.", (unsigned long long)child_ictx->vcn); entry = (INDEX_ENTRY*)((u8*)entry + entry_size); if (!(entry->flags & INDEX_ENTRY_NODE)) panic("%s(): !(entry->flags & " "INDEX_ENTRY_NODE)\n", __FUNCTION__); *(leVCN*)((u8*)entry + le16_to_cpu(entry->length) - sizeof(VCN)) = cpu_to_sle64( child_ictx->right_vcn); ntfs_debug("Setting VCN sub-node pointer of index " "entry after inserted entry to " "0x%llx.", (unsigned long long) child_ictx->right_vcn); } skip_insert: /* * If there are no child nodes left we are done. We will dirty * the index node once we have broken out of the loop. When * the index context is released later all locked nodes will be * unlocked so no need to do it now. */ if (!child_ictx) break; if (!child_ictx->split_node) panic("%s(): !child_ictx->split_node\n", __FUNCTION__); /* * TODO: @child_ictx->right_upl and @child_ictx->right_pl are * currently not valid as @child_ictx is not locked. Once * <rdar://problem/4992358> is implemented we can re-enable * this check and change the code to leave the right_upl mapped * at all times. */ #if 0 if (!child_ictx->right_upl || !child_ictx->right_pl) panic("%s(): !child_ictx->right_upl || !child_ictx->right_pl\n", __FUNCTION__); #endif /* * We are done with the current node and have a child node. * Switch to the child node, and sort out the needed locks. * * First, unlock the @insert_ictx node if it exists and is * locked. * * Note we do not bother with trying to transfer the lock from * @insert_ictx onto @child_ictx or @child_ictx->right_* * because index blocks are 4096 bytes in size in majority of * cases and the PAGE_SIZE is 4096 bytes both on x86 and PPC * thus in majority of cases each page will contain a separate * index block thus no sharing will be possible and there is no * point in adding extra complexity for an extremely unlikely * event. */ if (insert_ictx && insert_ictx->is_locked) ntfs_index_ctx_unlock(insert_ictx); /* * Unlock the current node. Again we do not bother with trying * to share the lock with @child_ictx or @child_ictx->right_*. */ ntfs_index_ctx_unlock(cur_ictx); /* Set the child node to be the current node. */ cur_ictx = child_ictx; /* * We need to ensure both the current node and its right-hand * node are locked. Both are currently unlocked. * * If both nodes share the same page, lock the current node and * share the lock with the right one. */ if (cur_ictx->is_locked) panic("%s(): cur_ictx->is_locked\n", __FUNCTION__); if (cur_ictx->right_is_locked) panic("%s(): cur_ictx->right_is_locked\n", __FUNCTION__); if (cur_ictx->upl_ofs <= cur_ictx->right_upl_ofs) { err = ntfs_index_ctx_relock(cur_ictx); if (err) goto relock_err; } if (cur_ictx->upl_ofs == cur_ictx->right_upl_ofs) { cur_ictx->right_ia = (INDEX_ALLOCATION*)( (u8*)cur_ictx->right_ia + (cur_ictx->addr - cur_ictx->right_addr)); cur_ictx->right_addr = cur_ictx->addr; } else { u8 *addr; err = ntfs_page_map(idx_ni, cur_ictx->right_upl_ofs, &cur_ictx->right_upl, &cur_ictx->right_pl, &addr, TRUE); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to map " "index page (error %d).", err); cur_ictx->right_upl = NULL; goto relock_err; } cur_ictx->right_is_locked = 1; cur_ictx->right_ia = (INDEX_ALLOCATION*)( (u8*)cur_ictx->right_ia + (addr - cur_ictx->right_addr)); cur_ictx->right_addr = addr; } if (!cur_ictx->is_locked) { err = ntfs_index_ctx_relock(cur_ictx); if (err) { if (cur_ictx->right_is_locked) { ntfs_page_unmap(idx_ni, cur_ictx->right_upl, cur_ictx->right_pl, FALSE); cur_ictx->right_upl = NULL; cur_ictx->right_is_locked = 0; } goto relock_err; } } /* * Having obtained the needed locks, we can now split the * current node. * * We have recorded the split point in @promote_entry_nr and * @promote_inserted tells us whether to remove the entry * specified by @promote_entry_nr and move all entries after it * to the right-hand node (@promote_inserted is 0) or whether * to move the entry specified by @promote_entry_nr and all * entries after it to the right-hand node (@promote_inserted * is 1). * * The split results in the creation of a new end entry in the * left-hand node as its old end entry is moved to the right * hand node. This means we need to determine what we need to * set the VCN sub-node pointer to if this is an index node. * * If @promote_iserted is 0, i.e. we are promoting an existing * entry from this node, we need to use the VCN sub-node * pointer of the entry we are about to promote. Because we * are promoting the entry it is going to disappear altogether * from this node thus we need to make a note of its sub-node * VCN pointer if it has one now before doing the actual split. * * In this case we do not need to modify the VCN sub-node * pointer of the first entry in the (new) right-hand node as * it does not change. * * If @promote_inserted is 1, i.e. we are promoting the entry * that was going to be inserted into this node, we need to use * the VCN of our child node which is the VCN of the entry in * front of which we are inserting and splitting, i.e. the VCN * of the first entry we are about to move to the right-hand * node. * * In this case we also need to modify the VCN sub-node pointer * of the first entry in the (new) right-hand node to point to * the (new) right-hand child node. We do not know what that * is yet so we determine it later once we have obtained our * child node index context containing the needed information. * * First, copy the appropriate entries to the right-hand node * and switch the node to be an index, i.e. non-leaf, node if * the node being split is an index node. */ right_index = &cur_ictx->right_ia->index; right_entry = (INDEX_ENTRY*)((u8*)right_index + le32_to_cpu(right_index->entries_offset)); u = cur_ictx->promote_entry_nr; entry = cur_ictx->entries[u]; if (!cur_ictx->promote_inserted) { if (entry->flags & INDEX_ENTRY_NODE) { if (!(cur_ictx->index->flags & INDEX_NODE)) panic("%s(): !(cur_ictx->index->flags " "& INDEX_NODE)\n", __FUNCTION__); vcn = sle64_to_cpu(*(leVCN*)((u8*)entry + le16_to_cpu(entry->length) - sizeof(VCN))); } u++; entry = cur_ictx->entries[u]; } end_entry = cur_ictx->entries[cur_ictx->nr_entries - 1]; if (!(end_entry->flags & INDEX_ENTRY_END)) panic("%s(): !(end_entry->flags & INDEX_ENTRY_END)\n", __FUNCTION__); u = (u8*)end_entry - (u8*)entry + le16_to_cpu(end_entry->length); memcpy(right_entry, entry, u); right_index->index_length = cpu_to_le32( le32_to_cpu(right_index->entries_offset) + u); right_index->flags = cur_ictx->index->flags; /* * Move the end entry of the left-hand node forward thus * truncating the left-hand node. */ if (!cur_ictx->promote_inserted) entry = cur_ictx->entries[cur_ictx->promote_entry_nr]; if (entry != end_entry) { u = le16_to_cpu(end_entry->length); memmove(entry, end_entry, u); u += (u8*)entry - (u8*)cur_ictx->index; cur_ictx->index->index_length = cpu_to_le32(u); } /* * If the current, left-hand node is not a leaf node, we have * to replace the VCN sub-node pointer in its end entry. * * If @promote_iserted is 0, we use the VCN we made a note of * above. * * If @promote_inserted is 1, we take the VCN of the left-hand * node of our child node. * * A side effect of getting the child node in loop scope here * is that we do not need to reget it when we go round the loop * again which is when we need to know the child node in order * to be able to insert the appropriate entry into the current * node. */ child_ictx = NULL; if (entry->flags & INDEX_ENTRY_NODE) { if (cur_ictx != ictx) { child_ictx = cur_ictx->down; if (child_ictx->is_root) panic("%s(): child_ictx->is_root\n", __FUNCTION__); } if (cur_ictx->promote_inserted) { if (!child_ictx) panic("%s(): !child_ictx\n", __FUNCTION__); /* * As described take the VCN of the left-hand * node of our child node index context for the * new end entry. */ vcn = child_ictx->vcn; /* * Again, as described, update the VCN of the * first entry we just moved to the (new) right * hand node to the right-hand node of our * child node index context. */ *(leVCN*)((u8*)right_entry + le16_to_cpu( right_entry->length) - sizeof(VCN)) = cpu_to_sle64( child_ictx->right_vcn); ntfs_debug("Setting VCN sub-node pointer of " "first index entry of " "right-hand index block node " "after splitting it off from " "the left-hand node to " "0x%llx.", (unsigned long long) child_ictx->right_vcn); } if (!(cur_ictx->index->flags & INDEX_NODE)) panic("%s(): !(cur_ictx->index->flags & " "INDEX_NODE)\n", __FUNCTION__); *(leVCN*)((u8*)entry + le16_to_cpu(entry->length) - sizeof(VCN)) = cpu_to_sle64(vcn); ntfs_debug("Setting VCN sub-node pointer of end index " "entry of left-hand index block node " "after splitting off the right-hand " "node to 0x%llx.", (unsigned long long) vcn); } /* * The index context still describes a single node thus if we * are going to insert an entry into either of the two split * nodes we have to update the index context appropriately. * * If @cur_ictx->entry_nr <= @cur_ictx->promote_entry_nr, we * have to insert the entry into the left-hand node and if * @cur_ictx->entry_nr > @cur_ictx->promote_entry_nr we have to * insert the entry into the right-hand node. * * If inserting into the left-hand node we do not need to do * anything as the insertion process does not make use of * anything in the index context that has changed. * * If inserting into the right-hand node we have to switch the * index context to describe the right-hand node and place the * left-hand node into the place of the right-hand node, i.e. * swap the left- and right-hand nodes in the index context. * * If we are switching the left- and right-hand nodes, we also * have to update the index entry pointer to point to the * correct insertion location in the now current page. * * Note we do not bother to update the array of index entry * pointers as that is no longer used. */ if (!cur_ictx->promote_inserted && cur_ictx->entry_nr > cur_ictx->promote_entry_nr) { union { VCN vcn; INDEX_BLOCK *ia; s64 upl_ofs; upl_t upl; upl_page_info_array_t pl; u8 *addr; } tmp; tmp.vcn = cur_ictx->right_vcn; cur_ictx->right_vcn = cur_ictx->vcn; cur_ictx->vcn = tmp.vcn; tmp.ia = cur_ictx->right_ia; cur_ictx->ia = tmp.ia; cur_ictx->right_ia = cur_ictx->ia; cur_ictx->index = right_index = &tmp.ia->index; if (cur_ictx->right_is_locked) { tmp.upl_ofs = cur_ictx->right_upl_ofs; cur_ictx->right_upl_ofs = cur_ictx->upl_ofs; cur_ictx->upl_ofs = tmp.upl_ofs; tmp.upl = cur_ictx->right_upl; cur_ictx->right_upl = cur_ictx->upl; cur_ictx->upl = tmp.upl; tmp.pl = cur_ictx->right_pl; cur_ictx->right_pl = cur_ictx->pl; cur_ictx->pl = tmp.pl; tmp.addr = cur_ictx->right_addr; cur_ictx->right_addr = cur_ictx->addr; cur_ictx->addr = tmp.addr; } /* * Get the location in the left-hand page of the first * entry that was moved to the right-hand page. */ entry = cur_ictx->entries[cur_ictx->promote_entry_nr + 1]; cur_ictx->entry = (INDEX_ENTRY*)((u8*)right_index + le32_to_cpu(right_index-> entries_offset) + ((u8*)cur_ictx->entry - (u8*)entry)); } /* * We are done with the node that is now set up as the * right-hand node. Unless it is sharing the lock with the * left-hand node, unmap/release it marking it dirty so it gets * written out later. */ if (cur_ictx->right_is_locked) { ntfs_page_unmap(idx_ni, cur_ictx->right_upl, cur_ictx->right_pl, TRUE); cur_ictx->right_upl = NULL; cur_ictx->right_is_locked = 0; } /* * We may be done with the current node. Mark it dirty so it * gets written out later. */ ntfs_index_entry_mark_dirty(cur_ictx); /* * If we have reached the original node (@child_ictx will be * NULL) and we are only splitting it there is nothing to * insert and hence nothing at all left to do so we break out * of the loop. */ } while (child_ictx || !split_only); ntfs_debug("Done (%s).", cur_ictx->split_node ? "insert with split" : "simple insert"); return 0; err: if (!NInoIndexAllocPresent(idx_ni)) goto err_out; /* * Unlock all index contexts in the B+tree path otherwise the call to * ntfs_attr_inode_get() can deadlock. */ ntfs_index_ctx_path_unlock(ictx); /* Get the index bitmap inode. */ err2 = ntfs_attr_inode_get(ictx->base_ni, AT_BITMAP, idx_ni->name, idx_ni->name_len, FALSE, LCK_RW_TYPE_SHARED, &bmp_ni); if (err2) { ntfs_error(idx_ni->vol->mp, "Failed to get index bitmap inode " "(error %d). Cannot undo index block " "allocation. Leaving inconsistent metadata. " "Run chkdsk.", err2); NVolSetErrors(idx_ni->vol); goto err_out; } /* Free all the index block allocations we have done. */ do { if (cur_ictx->right_addr) { if (!cur_ictx->right_ia) panic("%s(): !cur_ictx->right_ia\n", __FUNCTION__); if (cur_ictx->right_vcn < 0) panic("%s(): cur_ictx->right_vcn < 0\n", __FUNCTION__); err2 = ntfs_bitmap_clear_bit(bmp_ni, cur_ictx->right_vcn << idx_ni->vcn_size_shift >> idx_ni->block_size_shift); if (err2) { ntfs_error(idx_ni->vol->mp, "Failed to undo " "index block allocation in " "(error %d). Leaving " "inconsistent metadata. Run " "chkdsk.", err2); NVolSetErrors(idx_ni->vol); } } /* When we have dealt with the bottom entry we are done. */ if (cur_ictx == ictx) break; cur_ictx = cur_ictx->down; } while (1); lck_rw_unlock_shared(&bmp_ni->lock); (void)vnode_put(bmp_ni->vn); err_out: ntfs_error(idx_ni->vol->mp, "Failed (error %d).", err); return err; relock_err: ntfs_error(idx_ni->vol->mp, "Failed to relock index context (error " "%d). Leaving corrupt index. Run chkdsk.", err); NVolSetErrors(idx_ni->vol); return err; } /** * ntfs_index_node_split - split an index node * @ictx: index context specifying the node to split * @entry_size: size of the entry that will be added after the split * * Split the index node pointed to by @ictx. @ictx also points to the entry * in the index node at which an entry will be inserted after the split is * completed. @entry_size is the size of the entry that will be added at that * position. These are used so that the split is performed with consideration * with the insertion that is likely to come. And if the insertion does not * happen it does not matter much, it just means the split was not quite on the * median entry but off by one. * * Return 0 on success and errno on error. On error the index context is no * longer usable and must be released or reinitialized. * * Note that we do not update the array of index entry pointers nor the number * of entries in the array because on success it means that the index node has * been split and the function is done, i.e. the array of index entry pointers * will not be used any more and on error the index context becomes invalid so * there is no need to update any of the pointers. The caller is expected to * restart its operations by doing a new index lookup for that reason. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for * writing. * - The index context @ictx must be locked. */ static inline errno_t ntfs_index_node_split(ntfs_index_context *ictx, const u32 entry_size) { return ntfs_index_entry_add_or_node_split(ictx, TRUE, entry_size, NULL, 0, NULL, 0); } /** * ntfs_index_lookup_predecessor - index node whose predecessor node to return * @ictx: index context whose predecessor node to return * @pred_ictx: pointer in which to return the found predecessor index context * * Descend into the child node pointed to by @ictx->entry and then keep * descending into the child node of the child node pointed to by the end entry * of the child node until we reach the bottom of the B+tree. * * On success return the predecessor entry, i.e. the last real (non-end) entry * in the found leaf index node in *@pred_ictx. * * Return 0 on success and errno on error. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode. * - The index context @ictx must be locked. */ static errno_t ntfs_index_lookup_predecessor(ntfs_index_context *ictx, ntfs_index_context **pred_ictx) { unsigned entry_nr; errno_t err; ntfs_debug("Entering."); if (!pred_ictx) panic("%s(): !pred_ictx\n", __FUNCTION__); if (!(ictx->entry->flags & INDEX_ENTRY_NODE)) panic("%s(): !(ictx->entry->flags & INDEX_ENTRY_NODE)\n", __FUNCTION__); /* * We must be at the bottom of the current tree path or things will get * very confused. */ if (!ictx->down->is_root) panic("%s(): !ictx->down->is_root\n", __FUNCTION__); do { err = ntfs_index_descend_into_child_node(&ictx); if (err) goto err; /* If this child node is a leaf node we are done. */ if (!(ictx->index->flags & INDEX_NODE)) break; /* * This child node is an index node, descend into its end * entry. */ ictx->entry_nr = entry_nr = ictx->nr_entries - 1; ictx->follow_entry = ictx->entries[entry_nr]; } while (1); /* * We found the leaf node thus the predecessor entry we are looking for * is the last entry before the end entry. */ if (ictx->nr_entries < 2) panic("%s(): ictx->nr_entries < 2\n", __FUNCTION__); ictx->entry_nr = entry_nr = ictx->nr_entries - 2; ictx->entry = ictx->entries[entry_nr]; ictx->is_match = 1; *pred_ictx = ictx; ntfs_debug("Done (found)."); return 0; err: ntfs_error(ictx->idx_ni->vol->mp, "Failed to descend into child node " "(error %d).", err); return err; } /** * ntfs_index_ctx_move - move an index context from its tree path to another one * @ictx: index context to move * @dst: destination index context below which to insert @ictx * * Disconnect the index context @ictx from its tree path and insert it into the * tree path to which @dst belongs positioning it immediately below the index * context @dst. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode. */ static inline void ntfs_index_ctx_move(ntfs_index_context *ictx, ntfs_index_context *dst) { ntfs_index_ctx_disconnect(ictx); dst->down->up = ictx; ictx->down = dst->down; ictx->up = dst; dst->down = ictx; } /** * ntfs_index_root_prepare_replace - prepare an index entry to be replaced * @ictx: existing index entry that is going to be replaced * @new_ie_size: size in bytes of the new index entry * * Resize the existing index entry to the size of the new index entry so that * the index root is all set up and ready to receive the new entry. * * Return 0 on success and ENOSPC if there is not enough space in the index mft * record for the new entry. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for * writing. * - The index context @ictx must be locked. */ static errno_t ntfs_index_root_prepare_replace(ntfs_index_context *ictx, const unsigned new_ie_size) { MFT_RECORD *m = ictx->actx->m; INDEX_ENTRY *ie = ictx->entry; const u32 muse = le32_to_cpu(m->bytes_in_use); const unsigned ie_size = le16_to_cpu(ie->length); const int size_change = (int)new_ie_size - (int)ie_size; const u32 new_muse = muse + size_change; ntfs_debug("Entering."); if (new_muse <= le32_to_cpu(m->bytes_allocated)) { u8 *vcn_addr; ATTR_RECORD *a; INDEX_HEADER *ih; /* * Resize the index root attribute so it has the appropriate * space for the new index entry to replace the existing entry. * As an optimization we combine the resizing of the index root * attribute and the moving of index entries within the * attribute into a single operation. */ vcn_addr = (u8*)ie + ie_size - sizeof(VCN); memmove((u8*)ie + new_ie_size - sizeof(VCN), vcn_addr, muse - (vcn_addr - (u8*)m)); /* Adjust the mft record to reflect the change in used space. */ m->bytes_in_use = cpu_to_le32(new_muse); /* * Adjust the attribute record to reflect the changes in the * size of the attribute record and in the size of the * attribute value. */ a = ictx->actx->a; a->length = cpu_to_le32(le32_to_cpu(a->length) + size_change); a->value_length = cpu_to_le32(le32_to_cpu(a->value_length) + size_change); /* Adjust the index header to reflect the change in length. */ ih = ictx->index; ih->allocated_size = ih->index_length = cpu_to_le32( le32_to_cpu(ih->index_length) + size_change); ntfs_debug("Done."); return 0; } ntfs_debug("Failed (not enough space in mft record to replace index " "entry in index root attribute)."); return ENOSPC; } /** * ntfs_index_block_prepare_replace - prepare an index entry to be replaced * @ictx: existing index entry that is going to be replaced * @new_ie_size: size in bytes of the new index entry * * Resize the existing index entry to the size of the new index entry so that * the index node is all set up and ready to receive the new entry. * * Return 0 on success and ENOSPC if there is not enough space in the index * node for the new entry. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for * writing. * - The index context @ictx must be locked. */ static errno_t ntfs_index_block_prepare_replace(ntfs_index_context *ictx, const unsigned new_ie_size) { INDEX_HEADER *ih = ictx->index; INDEX_ENTRY *ie = ictx->entry; const u32 ilen = le32_to_cpu(ih->index_length); const unsigned ie_size = le16_to_cpu(ie->length); const u32 new_ilen = ilen + new_ie_size - ie_size; ntfs_debug("Entering."); if (new_ilen <= le32_to_cpu(ih->allocated_size)) { u8 *vcn_addr; /* * Move the VCN of the index entry to be replaced and * everything that follows it to adapt the space for the new * entry. */ vcn_addr = (u8*)ie + ie_size - sizeof(VCN); memmove((u8*)ie + new_ie_size - sizeof(VCN), vcn_addr, ilen - (vcn_addr - (u8*)ih)); /* Adjust the index header to reflect the change in length. */ ih->index_length = cpu_to_le32(new_ilen); ntfs_debug("Done."); return 0; } ntfs_debug("Failed (not enough space in index block to replace index " "entry)."); return ENOSPC; } /** * ntfs_index_entry_replace - replace an existing index entry with a new one * @ictx: existing index entry to replace * @new_ictx: new index entry to replace the existing entry with * * Replace the existing node index entry @ictx->entry with the leaf index entry * @new_ictx->entry. * * Return 0 on success and ENOSPC if there is not enough space for the new * entry. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for * writing. * - The index context @ictx must be locked. * - The index context @new_ictx must be locked. */ static errno_t ntfs_index_entry_replace(ntfs_index_context *ictx, ntfs_index_context *new_ictx) { INDEX_ENTRY *new_ie = new_ictx->entry; INDEX_ENTRY *ie = ictx->entry; const unsigned new_ie_size = le16_to_cpu(new_ie->length) + sizeof(VCN); errno_t err; if (!ictx->is_match || !new_ictx->is_match) panic("%s(): !ictx->is_match || !new_ictx->is_match\n", __FUNCTION__); /* The destination entry is meant to be a node entry. */ if (!(ictx->entry->flags & INDEX_ENTRY_NODE)) panic("%s(): !(ictx->entry->flags & INDEX_ENTRY_NODE)\n", __FUNCTION__); /* The new entry is meant to be a leaf entry. */ if (new_ictx->entry->flags & INDEX_ENTRY_NODE) panic("%s(): new_ictx->entry->flags & INDEX_ENTRY_NODE\n", __FUNCTION__); if (ictx->is_root) err = ntfs_index_root_prepare_replace(ictx, new_ie_size); else err = ntfs_index_block_prepare_replace(ictx, new_ie_size); if (!err) { /* Copy the new index entry into the adapted space. */ memcpy(ie, new_ie, new_ie_size - sizeof(VCN)); /* * Update the copied index entry to reflect the fact that it is * now an index node entry and has the VCN sub-node pointer at * its tail. */ ie->length = cpu_to_le16(new_ie_size); ie->flags |= INDEX_ENTRY_NODE; /* Ensure the updates are written to disk. */ ntfs_index_entry_mark_dirty(ictx); } return err; } /** * ntfs_index_block_free - free an index allocation block * @ictx: index context of the index block to deallocate * * Deallocate the index allocation block for the index described by the index * context @ictx and invalidate the context so the caller can safely release * it. * * We also check if the index allocation attribute can be shrunk as a * consequence of the deallocation of the index allocation block and if so and * that would actually change the on-disk size of the attribute we shrink it * now. * * If we shrunk the index allocation attribute and the index bitmap attribute * is non-resident we shrink the index bitmap attribute also but again only if * it would actually change the on-disk size of the attribute. * * Return 0 on success and errno on error. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for * writing. * - All of the index contexts in the index must be unlocked (this * includes @ictx, i.e. @ictx must not be locked). */ static errno_t ntfs_index_block_free(ntfs_index_context *ictx) { s64 target_pos, bmp_pos, alloc_size; ntfs_inode *bmp_ni, *idx_ni = ictx->idx_ni; ntfs_volume *vol = idx_ni->vol; unsigned page_ofs; errno_t err; ntfs_debug("Entering."); if (ictx->is_locked) panic("%s(): ictx->is_locked\n", __FUNCTION__); /* Get the index bitmap inode. */ err = ntfs_attr_inode_get(ictx->base_ni, AT_BITMAP, idx_ni->name, idx_ni->name_len, FALSE, LCK_RW_TYPE_EXCLUSIVE, &bmp_ni); if (err) { ntfs_error(vol->mp, "Failed to get index bitmap inode (error " "%d).", err); return err; } /* * Zero the bit in the index bitmap corresponding to the index block * being deallocated. */ target_pos = ictx->vcn << idx_ni->vcn_size_shift >> idx_ni->block_size_shift; err = ntfs_bitmap_clear_bit(bmp_ni, target_pos); if (err) { ntfs_error(vol->mp, "Failed to deallocate index block in " "index bitmap (error %d).", err); lck_rw_unlock_exclusive(&bmp_ni->lock); (void)vnode_put(bmp_ni->vn); return err; } /* If this is not the last set bit, we are done. */ if (target_pos < idx_ni->last_set_bit) { done: ntfs_debug("Done (index records are allocated beyond the " "deallocated one)."); out: lck_rw_unlock_exclusive(&bmp_ni->lock); (void)vnode_put(bmp_ni->vn); return 0; } /* * Scan backwards through the entire bitmap looking for the last set * bit. If we do not know the old last set bit (@idx_ni->last_set_bit * is -1), start at the end of the bitmap and if we do know it but we * cleared it just now, start at the old last set bit. * * Note we ignore any errors as the truncation is just a disk saving * optimization and is not actually required. However if an error * occurs we invalidate the last set bit stored in the inode. */ if (idx_ni->last_set_bit >= 0) bmp_pos = idx_ni->last_set_bit >> 3; else { lck_spin_lock(&bmp_ni->size_lock); bmp_pos = bmp_ni->initialized_size - 1; lck_spin_unlock(&bmp_ni->size_lock); } do { upl_t upl; upl_page_info_array_t pl; u8 *bmp_start, *bmp; err = ntfs_page_map(bmp_ni, bmp_pos & ~PAGE_MASK_64, &upl, &pl, &bmp_start, FALSE); if (err) { ntfs_debug("Failed to read index bitmap (error %d).", err); idx_ni->last_set_bit = -1; goto out; } page_ofs = (unsigned)bmp_pos & PAGE_MASK; bmp = bmp_start + page_ofs; /* Scan backwards through the page. */ do { unsigned bit, byte = *bmp; /* If this byte is zero skip it. */ if (!byte) continue; /* * Determine the last set bit in the byte. * * TODO: There does not appear to be a fls() function * in the kernel. )-: If/when the kernel has an fls() * function, switch the below code to use it. * * So we do the "bit = fls(byte) - 1" by hand which is * not very efficient but works. */ bit = 0; if (byte & 0xf0) { byte >>= 4; bit += 4; } if (byte & 0x0c) { byte >>= 2; bit += 2; } if (byte & 0x02) bit++; ntfs_page_unmap(bmp_ni, upl, pl, FALSE); /* * @bit now contains the last set bit in the byte thus * we can determine the last set bit in the bitmap. */ idx_ni->last_set_bit = (((bmp_pos & ~PAGE_MASK_64) + (bmp - bmp_start)) << 3) + bit; if (target_pos < idx_ni->last_set_bit) goto done; goto was_last_set_bit; } while (--bmp >= bmp_start); ntfs_page_unmap(bmp_ni, upl, pl, FALSE); } while ((bmp_pos -= page_ofs + 1) >= 0); /* * We scanned the entire bitmap and it was all zero. We do not do * anything because truncation of indexes that become empty is done * elsewhere. */ idx_ni->last_set_bit = -1; ntfs_debug("Done (index bitmap has no set bits left)."); goto out; was_last_set_bit: /* * This was the last set bit. Check if we would save disk space by * truncating the index allocation attribute and if so do it. To do * this determine which the first unused cluster is and compare it * against the currently allocated last cluster. * * Note we ignore any errors because it is not essential to resize the * index allocation attribute. In fact Windows and chkdsk are * perfectly happy with it remaining allocated. It just means the * index is wasting space on disk and that will be reclaimed when the * index is deleted or when the index is filled again with entries. */ target_pos = (((idx_ni->last_set_bit + 1) << idx_ni->block_size_shift) + vol->cluster_size_mask) & ~(s64)vol->cluster_size_mask; lck_spin_lock(&idx_ni->size_lock); alloc_size = idx_ni->allocated_size; lck_spin_unlock(&idx_ni->size_lock); if (target_pos >= alloc_size) { ntfs_debug("Done (no space would be freed on disk by " "truncating the index allocation attribute)."); goto out; } err = ntfs_attr_resize(idx_ni, target_pos, 0, NULL); if (err) { ntfs_debug("Failed to truncate index allocation attribute " "(error %d) thus this index will be wasting " "space on disk until it is deleted or " "repopulated with entries.", err); goto out; } ntfs_debug("Truncated index allocation attribute to reclaim 0x%llx " "bytes of disk space.", (unsigned long long)(alloc_size - target_pos)); /* * If the bitmap attribute is non-resident check if we would save disk * space by truncating it, too, and if so do it. Again we ignore any * errors as it is ok for the bitmap attribute to be left as it is. */ if (NInoNonResident(bmp_ni)) { target_pos = ((((target_pos >> idx_ni->block_size_shift) + 7) >> 3) + vol->cluster_size_mask) & ~(s64)vol->cluster_size_mask; lck_spin_lock(&bmp_ni->size_lock); alloc_size = bmp_ni->allocated_size; lck_spin_unlock(&bmp_ni->size_lock); if (target_pos >= alloc_size) { ntfs_debug("Done (truncated index allocation to free " "space on disk but not truncating " "index bitmap as no space would be " "freed on disk by doing so)."); goto out; } err = ntfs_attr_resize(bmp_ni, target_pos, 0, NULL); if (err) { ntfs_debug("Failed to truncate index bitmap attribute " "(error %d) thus this index will be " "wasting space on disk until it is " "deleted or repopulated with entries.", err); goto out; } ntfs_debug("Truncated index bitmap attribute to reclaim " "0x%llx bytes of disk space.", (unsigned long long)(alloc_size - target_pos)); } ntfs_debug("Done."); goto out; } /** * ntfs_index_make_empty - make an index empty discarding all entries in it * @ictx: index context describing the index to empty * * Empty the index described by the index context @ictx. On failure, use the * tree described by @ictx to re-allocate the freed index blocks if any have * been freed at the point of failure. * * This is called when the last index entry in an index is being deleted. * * We need to remove the sub-node from the end entry of the index root and * switch the entry to be a leaf entry. * * We also need to deallocate all index blocks and if possible shrink the index * allocation attribute to zero size as well as the index bitmap attribute if * it is non-resident. * * Return 0 on success and errno on error. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for * writing. * - All of the index contexts in the index must be unlocked (this * includes @ictx, i.e. @ictx must not be locked). */ static errno_t ntfs_index_make_empty(ntfs_index_context *ictx) { s64 data_size; ntfs_inode *bmp_ni, *idx_ni = ictx->idx_ni; MFT_RECORD *m; ntfs_attr_search_ctx *actx; INDEX_ROOT *ir; INDEX_HEADER *ih; INDEX_ENTRY *ie; ntfs_index_context *start_ictx; u32 new_ilen; errno_t err; ntfs_debug("Entering."); if (ictx->is_locked) panic("%s(): ictx->is_locked\n", __FUNCTION__); /* * Start by zeroing the index bitmap bits corresponding to the index * blocks being deallocated. For simplicity, we just zero the entire * index bitmap attribute. */ err = ntfs_attr_inode_get(ictx->base_ni, AT_BITMAP, idx_ni->name, idx_ni->name_len, FALSE, LCK_RW_TYPE_EXCLUSIVE, &bmp_ni); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to get index bitmap inode " "(error %d).", err); return err; } lck_spin_lock(&bmp_ni->size_lock); data_size = bmp_ni->data_size; lck_spin_unlock(&bmp_ni->size_lock); err = ntfs_attr_set(bmp_ni, 0, data_size, 0); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to deallocate index " "block(s) in index bitmap."); goto err; } /* * We need to get hold of the index root attribute in order to convert * its end entry to a leaf node. */ err = ntfs_mft_record_map(ictx->base_ni, &m); if (err) { ntfs_error(idx_ni->vol->mp, "ntfs_mft_record_map() failed " "(error %d).", err); goto err; } actx = ntfs_attr_search_ctx_get(ictx->base_ni, m); if (!actx) { err = ENOMEM; goto unm_err; } /* Find the index root attribute in the mft record. */ err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len, 0, NULL, 0, actx); if (err) { if (err == ENOENT) { ntfs_error(idx_ni->vol->mp, "Index root attribute " "missing in inode 0x%llx. Run " "chkdsk.", (unsigned long long)idx_ni->mft_no); err = EIO; NVolSetErrors(idx_ni->vol); } goto put_err; } /* Get to the index root value. */ ir = (INDEX_ROOT*)((u8*)actx->a + le16_to_cpu(actx->a->value_offset)); ih = (INDEX_HEADER*)&ir->index; /* The first and last index entry. */ ie = (INDEX_ENTRY*)((u8*)ih + le32_to_cpu(ih->entries_offset)); if (!(ie->flags & INDEX_ENTRY_END)) panic("%s(): !(ie->flags & INDEX_ENTRY_END)\n", __FUNCTION__); if (!(ie->flags & INDEX_ENTRY_NODE)) panic("%s(): !(ie->flags & INDEX_ENTRY_NODE)\n", __FUNCTION__); /* * Remove the sub-node pointer from the index entry and shrink the * index root attribute appropriately. */ ie->length = cpu_to_le16(le16_to_cpu(ie->length) - sizeof(VCN)); ie->flags &= ~INDEX_ENTRY_NODE; new_ilen = le32_to_cpu(ih->index_length) - sizeof(VCN); ih->allocated_size = ih->index_length = cpu_to_le32(new_ilen); ih->flags &= ~LARGE_INDEX; err = ntfs_resident_attr_value_resize(actx->m, actx->a, offsetof(INDEX_ROOT, index) + new_ilen); /* We are shrinking the index root so the resize cannot fail. */ if (err) panic("%s(): err\n", __FUNCTION__); /* Ensure the changes are written to disk. */ NInoSetMrecNeedsDirtying(actx->ni); ntfs_attr_search_ctx_put(actx); ntfs_mft_record_unmap(ictx->base_ni); /* * Now deal with the index allocation attribute by truncating it to * zero length. * * Note we ignore any errors because it is not essential to resize the * index allocation attribute. In fact Windows and chkdsk are * perfectly happy with it remaining allocated. It just means the * index is wasting space on disk and that will be reclaimed when the * index is deleted or when the index is filled again with entries. */ err = ntfs_attr_resize(idx_ni, 0, 0, ictx); if (err) ntfs_debug("Failed to truncate index allocation attribute to " "zero size thus this index will be wasting " "space on disk until it is deleted or " "repopulated with entries."); /* * Finally, if the index bitmap attribute is non-resident truncate it * to zero length, too. Again we ignore any errors as it is ok for the * bitmap attribute to be left as it is. */ if (!err && NInoNonResident(bmp_ni)) { err = ntfs_attr_resize(bmp_ni, 0, 0, ictx); if (err) ntfs_debug("Failed to truncate index bitmap attribute " "to zero size thus this index will be " "wasting space on disk until it is " "deleted or repopulated with " "entries."); } lck_rw_unlock_exclusive(&bmp_ni->lock); (void)vnode_put(bmp_ni->vn); /* * We no longer have any index blocks allocated so invalidate our cache * of the last set bit. */ idx_ni->last_set_bit = -1; ntfs_debug("Done."); return 0; put_err: ntfs_attr_search_ctx_put(actx); unm_err: ntfs_mft_record_unmap(ictx->base_ni); err: /* * Re-allocate the deallocated index block(s). This is safe because * the index inode mutex is held throughout. */ start_ictx = ictx; do { int err2; /* * Skip the index root as it does not have a bit in the * bitmap. */ if (ictx->is_root) continue; err2 = ntfs_bitmap_set_bit(bmp_ni, ictx->vcn << idx_ni->vcn_size_shift >> idx_ni->block_size_shift); if (err2) { ntfs_error(idx_ni->vol->mp, "Failed to undo " "deallocation of index block in index " "bitmap (error %d) of inode 0x%llx. " "Leaving inconsistent metadata. Run " "chkdsk.", err2, (unsigned long long)idx_ni->mft_no); NVolSetErrors(idx_ni->vol); } } while ((ictx = ictx->up) != start_ictx); lck_rw_unlock_exclusive(&bmp_ni->lock); (void)vnode_put(bmp_ni->vn); ntfs_error(idx_ni->vol->mp, "Failed (error %d).", err); return err; } /** * ntfs_index_entry_delete - delete an index entry * @ictx: index context describing the index entry to delete * * Delete the index entry described by the index context @ictx from the index. * * Return 0 on success and errno on error. A special case is a return code of * -EAGAIN (negative, not EAGAIN) which means that the B+tree was rearranged * into a different consistent state to make the deletion possible but now the * lookup and delete has to be repeated as the index entry to be deleted may * have changed its position in the tree thus a new lookup is required to be * able to delete it. Doing this is not terribly efficient but we are only * talking about a handful of cases in a single delete of tens of thousands of * files so it does not matter if that is inefficient. On the plus side doing * things this way means we do not need to keep track of the entry to be * deleted when rearranging the tree which saves time and makes the code much * simpler. * * Note that we do not update the array of index entry pointers nor the number * of entries in the array because on success it means that the index entry has * been removed and the function is done, i.e. the array of index entry * pointers will not be used any more and on error the index contect becomes * invalid so there is no need to update any of the pointers. * * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for * writing. * - The index context @ictx must be locked. */ int ntfs_index_entry_delete(ntfs_index_context *ictx) { leVCN vcn; ntfs_inode *idx_ni = ictx->idx_ni; ntfs_index_context *pred_ictx, *parent_ictx, *suc_ictx, *put_ictx; ntfs_index_context *deallocate_ictx; INDEX_HEADER *ih; INDEX_ENTRY *ie, *next_ie, *pull_down_entry; MFT_RECORD *m; unsigned ie_size, pred_ie_size, pull_down_entry_nr, pull_down_ie_size; unsigned old_parent_entry_nr, old_parent_ie_size, move_ie_size; errno_t err; u32 new_ilen; BOOL is_root = ictx->is_root; ntfs_debug("Entering."); if (!ictx->is_locked) panic("%s(): !ictx->is_locked\n", __FUNCTION__); deallocate_ictx = parent_ictx = put_ictx = NULL; ih = ictx->index; ie = ictx->entry; /* We cannot delete end entries as they do not contain any key/data. */ if (ie->flags & INDEX_ENTRY_END) panic("%s(): ie->flags & INDEX_ENTRY_END\n", __FUNCTION__); ie_size = le16_to_cpu(ie->length); next_ie = (INDEX_ENTRY*)((u8*)ie + ie_size); /* * There are two types of index entry: Either it is a leaf entry, i.e. * it is in a leaf node and thus has no children or it is in an index * node entry, i.e. it is in an index block node and thus has children * which we need to deal with. If it is a leaf entry, skip onto leaf * entry deletion. */ if (!(ie->flags & INDEX_ENTRY_NODE)) goto delete_leaf_entry; /* * The node the entry to be deleted is in has sub-node(s), i.e. it is * not a leaf node, and the entry to be deleted is not a leaf entry. * We have to replace it with its predecessor (which by definition is a * leaf entry). There are three cases of increasing complexity: * * 1) The simplest case is when the predecessor is not the only entry * in its (leaf) node thus it can be simply removed without any tree * rebalancing needing to be done on its node and in addition the * predecessor entry can replace the entry to be deleted without * overflowing the node, i.e. there is enough space in the node we are * deleting the entry from to fit its predecessor entry thus it is a * matter of a simple replace without the need to change anything else * in the tree. * * 2) The slightly more complicated case is the same as above case 1) * except that there is not enough space in the node the entry to be * deleted is in to fit its predecessor entry, thus we need to split * the node the entry is being deleted from and promote the median * entry to the parent node which may then overflow the parent node and * so on up to the root node. A particular annoyance here is that * depending on the implementation details it is possible for the entry * to be deleted to be the median entry and thus be promoted to its * parent or alternatively it is possible for its predecessor entry * that is to replace the entry to be deleted to have to be promoted. * A further pitfall is that if the entry to be deleted is behind the * median entry, i.e. it is on the right of the median entry, then it * will be moved to a different node (the new right-hand node to the * old node being split) thus the replace needs to happen in a * different place. If we use ntfs_index_entry_add() as it is to take * care of the split&promote on insertion this last case would cause us * the problem that the pointers would be all out of date. We would * need ntfs_index_entry_add() to update the pointers and to switch the * right and left nodes in that case. Perhaps we need an * ntfs_index_entry_replace() that can share a lot of code with * ntfs_index_entry_add() perhaps even just a flag to * ntfs_index_entry_add() to indicate replace? Then we would not need * to worry about the pointers becoming out of date as the replace * would be done already for us. * * 3) The more complicated case is when the predecessor entry is the * last entry in its (leaf) node thus we cannot use it to replace the * entry to be deleted without rebalancing the tree. In this case we * have to rebalance the tree so that the predecessor entry is no * longer the last entry in its (leaf) node. Once that is successfully * done we have reduced the problem to either case 1) or case 2) above * or in fact to a completely different case (see below). The benefit * of doing the rebalancing first without regard to the delete and * replace that are to take place is that the tree ends up in a * consistent state, just a different one to what it was before, thus * we do not need to rollback to the original state any more thus error * handling is greatly simplified. The pitfall here is that doing the * balancing may profoundly change the tree to the extent that the * entry to be deleted may be moved to somewhere completely different * which we somehow need to be able to cope with. This can have * positive side effects, too, as it for example can lead to the entry * to be deleted being turned into a leaf entry thus its deletion turns * into a delete of a leaf entry thus the planned replace does not need * to happen at all. This is the "completely different case" mentioned * above. Usually this case will have been caused by a merge of two * neighbouring nodes with pulldown of the parent entry (which happens * to be the entry to be deleted) thus the entry to be deleted will not * only be a leaf entry but it will also not be the last entry in the * leaf node thus it can be deleted with a simple delete. Otherwise it * is not a disaster and we just need to rebalance the tree again as we * did just now but this time making sure that the entry to be deleted * is not the only entry in the node (compare to above where we were * making sure that the leaf predecessor entry is not the only entry in * its leaf node) and then we can proceed with a simple delete of the * leaf node. Because the tree is balanced at all steps we do not * actually care about handling the delete of the entry in the case * that it turned into a leaf entry and we just leave it to the leaf * entry deletion code to deal with. * * That is quite enough description now, let the games begin. First, * investigate the index entry to be deleted and its predecessor to * determine which of the above categories the deletion falls into. * * Locate the predecessor entry. Because the predecessor may be the * only entry in its (leaf) node in which case it would cause the tree * to be rebalanced, we record the path taken to the predecessor entry * by simply extending the existing tree path that points to the entry * to be deleted. */ err = ntfs_index_lookup_predecessor(ictx, &pred_ictx); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to look up predecessor of " "index node entry to be deleted (error %d).", err); return err; } /* * If the predecessor is not the only entry other than the end entry in * its node we can simply remove it from its node and use it to replace * the entry to be deleted. */ if (pred_ictx->nr_entries > 2) goto simple_replace; if (pred_ictx->nr_entries < 2) panic("%s(): pred_ictx->nr_entries < 2\n", __FUNCTION__); /* * The predecessor is the only entry (other than the end entry) in its * node thus we cannot simply remove it from its node as that would * cause the B+tree to become imbalanced. * * There are two different cases we need to consider for which we need * to look up the predecessor of the current predecessor leaf entry. * To do this we ascend the tree looking for a node with entries in it. * The first node containing any entries is the node containing the * predecessor entry (which is not a leaf entry). * * 1) If we reach the original node containing the entry to be deleted * without finding any non-empty nodes on the way the whole sub-tree * below the entry to be deleted will become empty after using the * predecessor leaf entry to replace the entry to be deleted. Thus we * can deallocate all nodes in this sub-tree and instead of replacing * the entry to be deleted we remove it altogether and we then move the * predecessor leaf entry that is now homeless in front of its new * successor leaf entry, thus effectively merging the node containing * the predecessor with its right-hand sibling. * * Note that the insertion of the predecessor leaf entry into the node * containing its successor leaf entry may require a split of the node * we are insert into. * * 2) If we find a predecessor for the predecessor entry before we * reach the original node containing the entry to be deleted we break * what we need to do into two operations. First, we rebalance the * tree so that the predecessor entry of the entry to be deleted is no * longer the only entry in its leaf node. This simplifies the * deletion of the entry to a case we have already solved as we can now * simply remove the predecessor entry from its node and use it to * replace the entry to be deleted. Thus, second, we use the already * existing code to do a simple replace. Should that fail it does not * matter as we are leaving a fully consistent B+tree tree behind. * Breaking this case up into two means we incur a little bit of * overhead for the extra locking of the predecessor node and for the * extra memmove()s and memcpy() of the predecessor entry. But this * simplifies error handling and makes the code so much simpler that it * is definitely worth paying the price in overhead. * * Enough discussion, time to do it. Start by going up the tree * looking for a non-empty node or the original node containing the * entry to be deleted so we can determine if we have the above case 1 * or 2. * * We must be at the bottom of the current tree path or things will get * very confused. */ if (!pred_ictx->down->is_root) panic("%s(): !pred_ictx->down->is_root\n", __FUNCTION__); parent_ictx = pred_ictx->up; put_ictx = deallocate_ictx = pred_ictx; ntfs_index_ctx_disconnect_reinit(pred_ictx); /* * Make a note of the size of the predecessor entry that we are going * to move and unlock it for now. */ move_ie_size = le16_to_cpu(pred_ictx->entry->length); ntfs_index_ctx_unlock(pred_ictx); /* Now ascend the tree until we find a non-empty node. */ while (parent_ictx->nr_entries <= 1) { if (parent_ictx->nr_entries != 1) panic("%s(): parent_ictx->nr_entries != 1\n", __FUNCTION__); /* * Empty index node. Move it to the list of index nodes to be * deallocated and proceed to its parent. */ pred_ictx = parent_ictx; parent_ictx = parent_ictx->up; ntfs_index_ctx_move(pred_ictx, deallocate_ictx); } /* * We have a non-empty parent node. Check whether this is the node * containing the entry to be deleted (case 1) or whether it is an * intervening node (case 2). In the latter case we need to rearrange * the tree. */ if (parent_ictx != ictx) goto rearrange_tree; /* * Now that we know we have case 1, we need to find the leaf node * containing the successor entry of the entry to be deleted. To do * this we repoint the tree path to the entry on the right-hand side of * the entry to be deleted and descend down into the first entry of * each node until we reach the leaf-node. * * We need to lock the node containing the entry to be deleted and make * a note of it as we are going to delete it later. */ err = ntfs_index_ctx_relock(parent_ictx); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to relock the parent " "index node (error %d).", err); goto put_err; } /* * NOTE: pull_down_* == to be deleted (we are just reusing existing * code and variables hence the (mis)naming... */ pull_down_entry = parent_ictx->follow_entry; pull_down_entry_nr = parent_ictx->entry_nr; /* * Repoint the path to the entry on the right-hand side of the entry to * be deleted so we can descend to the leaf node containing the * successor entry. */ parent_ictx->follow_entry = (INDEX_ENTRY*)((u8*)pull_down_entry + le16_to_cpu(pull_down_entry->length)); parent_ictx->entry_nr++; /* Finally descend to the leaf node containing the successor entry. */ suc_ictx = parent_ictx; do { err = ntfs_index_descend_into_child_node(&suc_ictx); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to descend to " "node containing successor entry " "(error %d).", err); goto put_err; } if (!suc_ictx->is_locked) panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__); suc_ictx->follow_entry = suc_ictx->entries[0]; } while (suc_ictx->index->flags & INDEX_NODE); if (suc_ictx->follow_entry->flags & INDEX_ENTRY_NODE) panic("%s(): suc_ictx->follow_entry->flags & " "INDEX_ENTRY_NODE\n", __FUNCTION__); /* * Can the predecessor entry simply be inserted in front of the * successor leaf entry without overflowing the node? If not we need * to split the node and then restart the delete. */ if (move_ie_size > suc_ictx->bytes_free) { split_and_restart_case_1: ntfs_debug("Splitting index node before add on delete (case " "1)."); ie_size = move_ie_size; goto split_and_restart; } /* * The predecessor entry can simply be inserted in front of the * successor leaf entry. * * We need to have locked both nodes to be able to do the insert thus * we may need to unlock the locked successor node to ensure that the * node lock is always taken in ascending page offset order so we avoid * deadlocks. Also we have to consider the case where the two index * nodes are in the same page in which case we need to share the index * lock. */ if (deallocate_ictx->is_locked) panic("%s(): deallocate_ictx->is_locked\n", __FUNCTION__); err = ntfs_index_ctx_lock_two(suc_ictx, deallocate_ictx); if (err) goto put_err; /* * We need to re-check the amount of free space as it can have changed * whilst we dropped the lock on @suc_ictx if that was necessary. When * this happens simply drop the lock on @deallocate_ictx if we took it * and go back to the previous code dealing with the not enough space * case. */ if (move_ie_size > suc_ictx->bytes_free) { if (deallocate_ictx->is_locked) ntfs_index_ctx_unlock(deallocate_ictx); goto split_and_restart_case_1; } /* * Both nodes are locked and this is a simple insert, so go ahead and * do it. We have already checked that there is enough space so it * cannot fail. * * Move all the entries out of the way to make space for the * predecessor entry to be copied in. */ err = ntfs_index_node_make_space(suc_ictx, move_ie_size); if (err) panic("%s(): err\n", __FUNCTION__); /* * Copy the predecessor index entry into the created space in front of * the successor entry. */ memcpy(suc_ictx->follow_entry, deallocate_ictx->entry, move_ie_size); /* Ensure the updates are written to disk. */ ntfs_index_entry_mark_dirty(suc_ictx); /* * We are done both with the predecessor and successor nodes so release * their locks. */ if (deallocate_ictx->is_locked) ntfs_index_ctx_unlock(deallocate_ictx); if (!suc_ictx->is_locked) panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__); ntfs_index_ctx_unlock(suc_ictx); /* * We now have to lock the original node containing the entry to be * deleted and we need to switch it to be the current index context and * to point to the entry to be deleted. Note this is a simple delete * just like for a leaf entry as we have taken care of its child leaf * node(s) already and will deallocate it(them) later. */ if (parent_ictx->is_locked) panic("%s(): parent_ictx->is_locked\n", __FUNCTION__); err = ntfs_index_ctx_relock(parent_ictx); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to relock the parent " "index node (error %d).", err); /* * Undo the insertion of the predecessor entry in front of the * successor entry by moving the successor and all following * entries back into their old place and resetting the index * length to the old size. */ err = ntfs_index_ctx_relock(suc_ictx); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to relock index " "context (error %d). Leaving corrupt " "index. Run chkdsk.", err); NVolSetErrors(idx_ni->vol); } else { /* * @suc_ictx cannot be the index root or the below * code would be wrong. */ if (suc_ictx->is_root) panic("%s(): suc_ctx->is_root\n", __FUNCTION__); ie = suc_ictx->follow_entry; ie_size = le16_to_cpu(ie->length); next_ie = (INDEX_ENTRY*)((u8*)ie + ie_size); ih = suc_ictx->index; new_ilen = le32_to_cpu(ih->index_length); memmove(ie, next_ie, new_ilen - ((u8*)next_ie - (u8*)ih)); ih->index_length = cpu_to_le32(new_ilen - ie_size); ntfs_index_entry_mark_dirty(suc_ictx); } goto put_err; } ictx = parent_ictx; parent_ictx->entry = parent_ictx->entries[pull_down_entry_nr]; parent_ictx->entry_nr = pull_down_entry_nr; goto prep_simple_delete; rearrange_tree: /* * We now know we have case 2, i.e. we have a non-empty, intervening * node containing the predecessor (non-leaf) entry of the predecessor * entry of the entry to be deleted and we need to use it to rearrange * the tree such that the (leaf) predecessor entry of the entry to be * deleted no longer is the only entry in its (leaf) node. * * To do this we merge the (leaf) node containing the predecessor entry * of the entry to be deleted with its left-hand sibling (leaf) node. * This involves pulling the (non-leaf) predecessor entry of the (leaf) * predecessor entry of the entry to be deleted down to behind its * (leaf) predecessor entry. * * Thus, we need to now locate the leaf node containing the predecessor * entry behind which we need to insert the two predecessor entries. * * To be able to do this we need to lock the node containing the (non- * leaf) predecessor of the predecessor of the entry to be deleted and * make a note of it as we are going to delete it later. We also need * to repoint the tree path to descend into the (non-leaf) predecessor * entry. */ if (parent_ictx->is_locked) panic("%s(): parent_ictx->is_locked\n", __FUNCTION__); if (parent_ictx->is_root) panic("%s(): parent_ictx->is_root\n", __FUNCTION__); err = ntfs_index_ctx_relock(parent_ictx); if (err) goto put_err; parent_ictx->entry_nr--; parent_ictx->follow_entry = parent_ictx->entries[parent_ictx->entry_nr]; pull_down_ie_size = le16_to_cpu(parent_ictx->follow_entry->length) - sizeof(VCN); /* Now descend to the leaf predecessor entry. */ suc_ictx = parent_ictx; do { err = ntfs_index_descend_into_child_node(&suc_ictx); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to descend to " "node containing leaf predecessor " "entry of non-leaf predecessor entry " "of leaf predecessor entry of entry " "to be deleted (error %d).", err); goto put_err; } if (!suc_ictx->is_locked) panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__); suc_ictx->entry_nr = suc_ictx->nr_entries - 1; suc_ictx->follow_entry = suc_ictx->entries[suc_ictx->entry_nr]; } while (suc_ictx->index->flags & INDEX_NODE); if (suc_ictx->follow_entry->flags & INDEX_ENTRY_NODE) panic("%s(): suc_ictx->follow_entry->flags & " "INDEX_ENTRY_NODE\n", __FUNCTION__); /* * Can the entry to be pulled down and the original predecessor entry * to be merged in simply be inserted after the located predecessor * entry without overflowing the node? If not we need to split the * node and then restart the delete. */ if (pull_down_ie_size + move_ie_size > suc_ictx->bytes_free) { split_and_restart_case_2: ntfs_debug("Splitting index node before add on delete (case " "2)."); ie_size = pull_down_ie_size + move_ie_size; goto split_and_restart; } /* * We know we have enough space so we cannot fail. Need to lock the * non-leaf predecessor that we want to pull down. */ if (parent_ictx->is_locked) panic("%s(): parent_ictx->is_locked\n", __FUNCTION__); err = ntfs_index_ctx_lock_two(suc_ictx, parent_ictx); if (err) goto put_err; /* * We need to re-check the amount of free space as it can have changed * whilst we dropped the lock on @suc_ictx if that was necessary. When * this happens simply drop the lock on @parent_ictx if we took it and * go back to the previous code dealing with the not enough space case. */ if (pull_down_ie_size + move_ie_size > suc_ictx->bytes_free) { if (parent_ictx->is_locked) ntfs_index_ctx_unlock(parent_ictx); goto split_and_restart_case_2; } /* * Make enough space in the destination leaf node to accomodate both * the entries. */ err = ntfs_index_node_make_space(suc_ictx, pull_down_ie_size + move_ie_size); if (err) panic("%s(): err\n", __FUNCTION__); /* * Copy the non-leaf predecessor entry into place and convert it to a * leaf entry. */ ie = suc_ictx->follow_entry; pull_down_entry = parent_ictx->follow_entry; memcpy(ie, pull_down_entry, pull_down_ie_size); ie->length = cpu_to_le16(pull_down_ie_size); ie->flags &= ~INDEX_ENTRY_NODE; /* * Now delete the entry from its original node and transfer its VCN * sub-node pointer to the next entry (which is the end entry). */ vcn = *(leVCN*)((u8*)pull_down_entry + pull_down_ie_size); ih = parent_ictx->index; new_ilen = le32_to_cpu(ih->index_length) - (pull_down_ie_size + sizeof(VCN)); memmove(pull_down_entry, (u8*)pull_down_entry + pull_down_ie_size + sizeof(VCN), new_ilen - ((u8*)pull_down_entry - (u8*)ih)); *(leVCN*)((u8*)pull_down_entry + le16_to_cpu(pull_down_entry->length) - sizeof(VCN)) = vcn; /* Update the index size, too. */ ih->index_length = cpu_to_le32(new_ilen); /* Ensure the updates are written to disk. */ ntfs_index_entry_mark_dirty(parent_ictx); /* Update the index context as well. */ parent_ictx->bytes_free += pull_down_ie_size + sizeof(VCN); parent_ictx->nr_entries--; /* We are done with the non-leaf predecessor node so unlock it. */ if (parent_ictx->is_locked) ntfs_index_ctx_unlock(parent_ictx); /* * Now need to move over the original leaf predecessor entry. Lock its * node again. */ if (deallocate_ictx->is_locked) panic("%s(): deallocate_ictx->is_locked\n", __FUNCTION__); err = ntfs_index_ctx_lock_two(suc_ictx, deallocate_ictx); if (err) goto put_err; /* Copy the original leaf predecessor entry into place. */ memcpy((u8*)suc_ictx->follow_entry + pull_down_ie_size, deallocate_ictx->follow_entry, move_ie_size); /* Ensure the updates are written to disk. */ ntfs_index_entry_mark_dirty(suc_ictx); /* We are finished rearranging the tree so unlock both nodes. */ ntfs_index_ctx_unlock(suc_ictx); if (deallocate_ictx->is_locked) ntfs_index_ctx_unlock(deallocate_ictx); /* * The last thing to do is to deallocate the disconnected index node(s) * and then we can restart the delete operation which now involves a * simple replace to complete the delete. */ parent_ictx = deallocate_ictx; do { if (parent_ictx->is_root) panic("%s(): parent_ictx->is_root\n", __FUNCTION__); err = ntfs_index_block_free(parent_ictx); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to deallocate no " "longer used index block vcn 0x%llx " "(error %d). Run chkdsk to recover " "the lost index block.", (unsigned long long)parent_ictx->vcn, err); NVolSetErrors(idx_ni->vol); } } while ((parent_ictx = parent_ictx->up) != deallocate_ictx); /* * Release the sub-tree paths that the caller no longer has a reference * to. */ ntfs_index_ctx_put(put_ictx); /* Reset the state machine to original values. */ put_ictx = deallocate_ictx = NULL; /* * The tree is now fully consistent, no nodes are locked, and @suc_ictx * is the leaf node containing the predecessor entry of the entry to * be deleted and the predecessor entry no longer is the only entry * other than the end entry thus it can simply be used to replace the * entry to be deleted. * * Setup everything so we can jump straight into the simple replace * code. Note we lock the original entry to be deleted as well so that * the simple replace code does not have to drop the lock on the * predecessor node in order to lock the original node which it may * have to do otherwise. * * The simple replace code requires the predecessor to be in * @pred_ictx. */ pred_ictx = suc_ictx; err = ntfs_index_ctx_lock_two(suc_ictx, ictx); if (err) return err; /* * Repoint the predecessor context to point to the correct entry and * update it to reflect the addition of the two index entries. */ pred_ictx->entry = (INDEX_ENTRY*)((u8*)pred_ictx->entry + pull_down_ie_size); pred_ictx->entry_nr++; pred_ictx->is_match = 1; pred_ictx->bytes_free -= pull_down_ie_size + move_ie_size; pred_ictx->nr_entries += 2; if (pred_ictx->nr_entries > pred_ictx->max_entries) panic("%s(): pred_ictx->nr_entries > pred_ictx->max_entries\n", __FUNCTION__); pred_ictx->entries[pred_ictx->entry_nr] = pred_ictx->entry; pred_ictx->entries[pred_ictx->entry_nr + 1] = (INDEX_ENTRY*)( (u8*)pred_ictx->entry + move_ie_size); /* Everything is set up. Do the simple replace. */ /* goto simple_replace; */ simple_replace: /* * The predecessor can simply be removed from its node. * * We need to have locked both nodes to be able to do the replace thus * we may need to unlock the locked predecessor node to ensure that * page locks are only taken in ascending page offset order so we avoid * deadlocks. */ if (!pred_ictx->is_locked) panic("%s(): !pred_ictx->is_locked\n", __FUNCTION__); if (!ictx->is_locked) { err = ntfs_index_ctx_lock_two(pred_ictx, ictx); if (err) return err; } /* * Can the predecessor simply replace the entry to be deleted without * overflowing the node containing the entry to be deleted? If not we * need to split the node and then restart the delete. */ pred_ie_size = le16_to_cpu(pred_ictx->entry->length); ie_size = le16_to_cpu(ictx->entry->length) - sizeof(VCN); if ((int)pred_ie_size - (int)ie_size > (int)ictx->bytes_free) { ntfs_debug("Splitting index node before add on delete (case " "3)."); /* * Drop the lock on the predecessor or transfer it to the node * containing the entry to be deleted if they share the same * page (in which case @ictx is not marked locked). */ if (!pred_ictx->is_locked) panic("%s(): !pred_ictx->is_locked\n", __FUNCTION__); if (ictx->is_locked) { /* * @ictx is locked thus it cannot be sharing the lock * with @pred_ictx. Unlock @pred_ictx. */ ntfs_index_ctx_unlock(pred_ictx); } else { /* * @ictx is not locked thus it must be sharing the lock * with @pred_ictx. Transfer the lock across. */ if (ictx->is_root) panic("%s(): ictx->is_root\n", __FUNCTION__); if (ictx->upl_ofs != pred_ictx->upl_ofs) panic("%s(): ictx->upl_ofs != " "pred_ictx->upl_ofs\n", __FUNCTION__); ictx->is_locked = 1; pred_ictx->is_locked = 0; } if (!ictx->is_locked) panic("%s(): !ictx->is_locked\n", __FUNCTION__); suc_ictx = ictx; ie_size = pred_ie_size - ie_size; goto split_and_restart; } /* * Both nodes are locked and this is a simple replace, so go ahead and * do it. We have already checked that there is enough space so it * cannot fail. */ err = ntfs_index_entry_replace(ictx, pred_ictx); if (err) panic("%s(): err\n", __FUNCTION__); /* We are done with the current entry so release it. */ if (ictx->is_locked) ntfs_index_ctx_unlock(ictx); /* * We have now replaced the entry to be deleted but at the moment we * have two copies of the predecessor entry, so delete the original * copy from its (leaf) node. We need to switch the predecessor index * context to be the current context so we can just pretend it is the * entry to be deleted. We only need to setup the fields relevant to * index block node deletion as the predecessor cannot be in the index * root node. */ ictx = pred_ictx; goto prep_simple_delete; delete_leaf_entry: /* * If the entry is in the index root or it is not the only entry (other * than the end entry) in the leaf node it can simply be removed. */ if (ictx->nr_entries > 2 || is_root) goto simple_delete; /* * The entry to be deleted is the only entry and it is not in the index * root. We have to rebalance the tree so that the entry to be deleted * is no longer the only entry in its node. There are several cases we * need to consider: * * 1) If the parent entry is not the end entry of its index node then * simply insert our parent entry at the front of the leaf node on * our right, i.e. the child node of the entry following our parent * entry. Then remove our parent entry from our parent node and * free the leaf node the entry to be deleted is in (thus deleting * the entry). * * What the above describes is effectively merging the leaf node * containing the entry to be deleted with its right-hand sibling * leaf node and in the process pulling down our parent entry into * the merged node and placing it in front of its successor entry. * * Our parent node is of course turned from an index node entry to a * leaf entry as it is pulled down. * * Note how if our parent entry is the only entry other than the end * entry in its node this results in our parent node being empty, * i.e. containing only the end entry. This is fine for NTFS * B+trees, i.e. index nodes may be empty. It is only leaf nodes * that may not be empty. * * 2) As point 1) above but the parent entry of the entry to be removed * does not fit in the leaf node containing its successor thus that * node needs to be split and the split may need to propagate up the * tree which is of course the job of ntfs_index_entry_add(). * * 3) If the parent entry is the end entry of its index node, i.e. * there is no right-hand sibling leaf node then simply insert the * entry on the left of our parent entry at the end of its own child * node, i.e. the leaf node on our left. Then remove the entry on * the left of our parent entry that we inserted into its child node * from our parent node and change the VCN sub-node pointer of our * parent node to point to our left-hand sibling. Finally free the * leaf node the entry to be deleted is in which is no longer * referenced from anywhere (thus deleting the entry). * * What the above describes is effectively merging the leaf node * containing the entry to be deleted with its left-hand sibling * leaf node and in the process pulling down the entry on the * left-hand side of our parent into the merged node and placing it * after its predecessor entry. * * The left-hand sibling of our parent node is of course turned from * and index node entry to a leaf entry as it is pulled down. * * Note how if the left-hand sibling of our parent entry is the only * entry other than the end entry, which is also our parent entry, * in its node this results in our parent node being empty, i.e. * containing only the end entry. This is fine for NTFS B+trees, * i.e. index nodes may be empty. It is only leaf nodes that may * not be empty. * * 4) As point 3) above but the left-hand sibling of the parent entry * of the entry to be removed does not fit in the leaf node * containing its predecessor thus that node needs to be split and * the split may need to propagate up the tree which is of course * the job of ntfs_index_entry_add(). * * 5) If the parent entry is the end entry of its index node, i.e. * there is no right-hand sibling leaf node, and there is no entry * on the left of our parent entry, i.e. there is no left-hand * sibling leaf node, i.e. the parent node is completely empty, we * cannot merge with a sibling as there is none thus we need to * record the current state and repeat a very similar procedure as * above points 1-4 except applied to the parent node which is not * a leaf node so we need to be careful with VCN sub-node pointers. * The aim of doing this is to merge the parent node with one of its * siblings so that it is no longer empty so that our leaf node * containing the entry to be deleted has at least one sibling so * that we can merge the leaf node with one of its siblings. If the * parent node of the parent node has no siblings either we need to * push the recorded state down into the stack of recorded states, * record the new current state, and then go to the next parent and * try to merge that with its sibling and so on until we find a * parent with a sibling that can be merged. There are two possible * outcomes (not counting errors): * a) The going up and merging will eventually succeed in which case * we go backwards through the stack of recorded states merging * nodes as we go along until we reach the last recorded state at * which point we have reduced the problem to one of the above * points 1-4 thus we repeat them to finish. * b) We reach the index root which by definition cannot have any * siblings. Also because we reached the index root it means * that the index root must be empty and the end entry is * pointing to the VCN sub-node we came from and because the * entry to be deleted is the last entry in its leaf-node and all * parent nodes including the index root are empty the entry to * be deleted is the very last entry of the directory thus it can * simply be removed by truncating the index allocation attribute * to zero size and if the index bitmap is non-resident * truncating it to zero size, too, and if the index bitmap is * resident zeroing its contents instead of truncating it and * finally the index root is switched to be a small index, which * is reflected in its index flags and in the removal of the VCN * sub-node pointer in the end entry of the index root. * NOTE: Instead of recursing, we do it in one go by simply freeing * all the index node blocks in the tree path until we find a * non-empty node, i.e. a node that can be merged, and we then * perform the merge by pulling down the appropriate index entry but * instead of pulling it down one level we pull it down to before its * successor (if merging to the right) or to after its predecessor (if * merging on the left) which by definition will be in a leaf node. * That give the same results as the recursion but it is much less * work to do and it affects less index nodes. * NOTE2: After implementing NOTE above, we ditch the code handling * cases 1-4 above as they are just special cases of case 5 when * implemented like NOTE. * * That is quite enough description now, let the games begin. First, * investigate the parent index entry of the leaf entry to be deleted * to see whether it is the end entry or not to determine which of the * above categories the deletion falls into. * * We must be at the bottom of the current tree path or things will get * very confused. */ if (!ictx->down->is_root) panic("%s(): !ictx->down->is_root\n", __FUNCTION__); /* * We are going to disconnect @ictx thus the tree starting at the index * root is no longer referenced by the caller so we need to free it * later. */ put_ictx = ictx->down; /* * Obtain the parent node, unlock the current node, disconnect it from * the tree, and mark it for deallocation later. */ parent_ictx = ictx->up; ntfs_index_ctx_unlock(ictx); ntfs_index_ctx_disconnect_reinit(ictx); deallocate_ictx = ictx; /* * Investigate the parent node. If it is not empty we can immediately * proceed with the merge to the right or left. If it is empty then we * need to ascend the tree until we find a non-empty node or until we * reach an empty index root in which case the index is becoming empty * so we delete its contents by resetting it to zero size. */ while (parent_ictx->nr_entries <= 1) { if (parent_ictx->nr_entries != 1) panic("%s(): parent_ictx->nr_entries != 1\n", __FUNCTION__); if (parent_ictx->is_root) { /* * The index is becoming empty. To simplify things * merge the two sub-trees back together and then empty * the index. */ deallocate_ictx->up->down = parent_ictx->down; parent_ictx->down->up = deallocate_ictx->up; deallocate_ictx->up = parent_ictx; parent_ictx->down = deallocate_ictx; return ntfs_index_make_empty(deallocate_ictx); } /* * Empty index node. Move it to the list of index nodes to be * deallocated and proceed to its parent. */ ictx = parent_ictx; parent_ictx = parent_ictx->up; ntfs_index_ctx_move(ictx, deallocate_ictx); } /* * We have a non-empty parent node which we need to lock as we need to * access its contents. */ if (parent_ictx->is_locked) panic("%s(): parent_ictx->is_locked\n", __FUNCTION__); err = ntfs_index_ctx_relock(parent_ictx); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to relock the parent " "index node (error %d).", err); goto put_err; } /* INDEX_NODE == LARGE_INDEX */ if (!(parent_ictx->follow_entry->flags & INDEX_ENTRY_NODE) || !(parent_ictx->index->flags & INDEX_NODE)) panic("%s(): !(parent_ictx->follow_entry->flags & " "INDEX_ENTRY_NODE) || " "!(parent_ictx->index->flags & INDEX_NODE)\n", __FUNCTION__); /* * If the parent entry is the end entry in its node we cannot merge to * the right so merge to the left instead. */ if (parent_ictx->follow_entry->flags & INDEX_ENTRY_END) goto merge_left; /* * The parent entry is not the end entry in its node so merge on the * right. To do this we need to find the successor (leaf) entry for * which we need to repoint the path so we descend into the sub-node of * the entry on the right-hand side of the parent entry. * * We also make a note of the parent entry as we are going to pull it * down in front of its successor entry and we then have to delete it * from this node later. */ pull_down_entry = parent_ictx->follow_entry; pull_down_entry_nr = parent_ictx->entry_nr; pull_down_ie_size = le16_to_cpu(pull_down_entry->length); parent_ictx->follow_entry = (INDEX_ENTRY*)((u8*)pull_down_entry + pull_down_ie_size); pull_down_ie_size -= sizeof(VCN); parent_ictx->entry_nr++; suc_ictx = parent_ictx; do { err = ntfs_index_descend_into_child_node(&suc_ictx); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to descend to " "node containing successor entry " "(error %d).", err); goto put_err; } if (!suc_ictx->is_locked) panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__); suc_ictx->follow_entry = suc_ictx->entries[0]; } while (suc_ictx->index->flags & INDEX_NODE); if (suc_ictx->follow_entry->flags & INDEX_ENTRY_NODE) panic("%s(): suc_ictx->follow_entry->flags & " "INDEX_ENTRY_NODE\n", __FUNCTION__); /* * Can the parent entry simply be inserted in front of its successor * leaf entry without overflowing the node? If not we need to split * the node and then restart the delete. */ if (pull_down_ie_size > suc_ictx->bytes_free) { split_and_restart_case_4: ntfs_debug("Splitting index node before add on delete (case " "4)."); ie_size = pull_down_ie_size; goto split_and_restart; } /* * The parent entry can simply be inserted in front of its successor * leaf entry. * * We need to have locked both nodes to be able to do the insert thus * we may need to unlock the locked successor node. */ if (parent_ictx->is_locked) panic("%s(): parent_ictx->is_locked\n", __FUNCTION__); err = ntfs_index_ctx_lock_two(suc_ictx, parent_ictx); if (err) goto put_err; /* * We need to re-check the amount of free space as it can have changed * whilst we dropped the lock on @suc_ictx if that was necessary. When * this happens simply drop the lock on @parent_ictx if we took it and * go back to the previous code dealing with the not enough space case. */ if (pull_down_ie_size > suc_ictx->bytes_free) { if (parent_ictx->is_locked) ntfs_index_ctx_unlock(parent_ictx); goto split_and_restart_case_4; } /* * Both nodes are locked and this is a simple insert, so go ahead and * do it. We have already checked that there is enough space so it * cannot fail. * * Move all the entries out of the way to make space for the parent * entry to be copied in. */ err = ntfs_index_node_make_space(suc_ictx, pull_down_ie_size); if (err) panic("%s(): err\n", __FUNCTION__); /* * Copy the parent index entry into the created space in front of its * successor and switch the inserted entry to being a leaf entry. */ ie = suc_ictx->follow_entry; pull_down_entry = parent_ictx->entries[pull_down_entry_nr]; memcpy(ie, pull_down_entry, pull_down_ie_size); ie->length = cpu_to_le16(pull_down_ie_size); ie->flags &= ~INDEX_ENTRY_NODE; /* Ensure the updates are written to disk. */ ntfs_index_entry_mark_dirty(suc_ictx); /* * We are done with the successor node so release it or transfer it to * the parent node if they share the same page (in which case * @parent_ictx is not marked locked). */ if (!suc_ictx->is_locked) panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__); if (parent_ictx->is_locked) { /* * @parent_ictx is locked thus it cannot be sharing the lock * with @suc_ictx. Unlock @suc_ictx. */ ntfs_index_ctx_unlock(suc_ictx); } else { /* * @parent_ictx is not locked thus it must be sharing the lock * with @suc_ictx. Transfer the lock across. */ if (parent_ictx->is_root) panic("%s(): parent_ictx->is_root\n", __FUNCTION__); if (parent_ictx->upl_ofs != suc_ictx->upl_ofs) panic("%s(): parent_ictx->upl_ofs != " "suc_ictx->upl_ofs\n", __FUNCTION__); parent_ictx->is_locked = 1; suc_ictx->is_locked = 0; } if (!parent_ictx->is_locked) panic("%s(): parent_ictx->is_locked\n", __FUNCTION__); /* * We have now inserted the parent entry in front of its successor but * at the moment we have two copies of the parent entry, so delete the * original copy from the parent node. We need to switch the parent * index context to be the current context and to point to the parent * entry we pulled down so we can just pretend it is the entry to be * deleted. Note this is a simple delete just like for a leaf entry as * we have taken care of its child leaf node(s) already and will * deallocate it(them) later. */ ictx = parent_ictx; parent_ictx->entry = pull_down_entry; parent_ictx->entry_nr = pull_down_entry_nr; goto prep_simple_delete; merge_left: /* * The parent entry is the end entry in its node so merge on the left. * To do this we need to find the predecessor (leaf) entry for which we * need to repoint the path so we descend into the sub-node of the * entry on the left-hand side of the parent entry. Once found we need * to pull down the new parent entry, i.e. the entry on the left of the * current parent entry, behind its predecessor entry. * * We also make a note of the original parent entry as we have to * change its VCN sub-node pointer later. */ old_parent_entry_nr = parent_ictx->entry_nr; old_parent_ie_size = le16_to_cpu(parent_ictx->follow_entry->length) - sizeof(VCN); parent_ictx->entry_nr--; parent_ictx->follow_entry = parent_ictx->entries[parent_ictx->entry_nr]; pull_down_ie_size = le16_to_cpu(parent_ictx->follow_entry->length) - sizeof(VCN); suc_ictx = parent_ictx; do { err = ntfs_index_descend_into_child_node(&suc_ictx); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to descend to " "node containing predecessor entry of " "left-hand sibling entry (error %d).", err); goto put_err; } if (!suc_ictx->is_locked) panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__); suc_ictx->entry_nr = suc_ictx->nr_entries - 1; suc_ictx->follow_entry = suc_ictx->entries[suc_ictx->entry_nr]; } while (suc_ictx->index->flags & INDEX_NODE); if (suc_ictx->follow_entry->flags & INDEX_ENTRY_NODE) panic("%s(): suc_ictx->follow_entry->flags & " "INDEX_ENTRY_NODE\n", __FUNCTION__); /* * Can the entry on the left-hand side of the parent entry simply be * inserted in after its predecessor leaf entry, i.e. before the end * entry in the leaf node containing the predecessor leaf entry, * without overflowing the node? If not we need to split the node and * then restart the delete. */ if (pull_down_ie_size > suc_ictx->bytes_free) { split_and_restart_case_5: ntfs_debug("Splitting index node before add on delete (case " "5)."); ie_size = pull_down_ie_size; goto split_and_restart; } /* * The entry on the left-hand side of the parent entry can simply be * inserted in front of the end entry of the leaf node containing its * predecessor. * * We need to have locked both nodes to be able to do the insert thus * we may need to unlock the locked predecessor node to ensure that * page locks are only taken in ascending page index order so we avoid * deadlocks. */ if (parent_ictx->is_locked) panic("%s(): parent_ictx->is_locked\n", __FUNCTION__); err = ntfs_index_ctx_lock_two(suc_ictx, parent_ictx); if (err) goto put_err; /* * We need to re-check the amount of free space as it can have changed * whilst we dropped the lock on @suc_ictx if that was necessary. When * this happens simply drop the lock on @parent_ictx if we took it and * go back to the previous code dealing with the not enough space case. */ if (pull_down_ie_size > suc_ictx->bytes_free) { if (parent_ictx->is_locked) ntfs_index_ctx_unlock(parent_ictx); goto split_and_restart_case_5; } /* * Both nodes are locked and this is a simple insert, so go ahead and * do it. We have already checked that there is enough space so it * cannot fail. * * Move the end entry out of the way to make space for the entry to be * copied in. */ err = ntfs_index_node_make_space(suc_ictx, pull_down_ie_size); if (err) panic("%s(): err\n", __FUNCTION__); /* * Copy the parent index entry into the created space in front of its * predecessor and switch the inserted entry to being a leaf entry. */ ie = suc_ictx->follow_entry; memcpy(ie, parent_ictx->follow_entry, pull_down_ie_size); ie->length = cpu_to_le16(pull_down_ie_size); ie->flags &= ~INDEX_ENTRY_NODE; /* Ensure the updates are written to disk. */ ntfs_index_entry_mark_dirty(suc_ictx); /* * We are done with the predecessor node so release it or transfer it * to the parent node if they share the same page (in which case * @parent_ictx is not marked locked). */ if (!suc_ictx->is_locked) panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__); if (parent_ictx->is_locked) { /* * @parent_ictx is locked thus it cannot be sharing the lock * with @suc_ictx. Unlock @suc_ictx. */ ntfs_index_ctx_unlock(suc_ictx); } else { /* * @parent_ictx is not locked thus it must be sharing the lock * with @suc_ictx. Transfer the lock across. */ if (parent_ictx->is_root) panic("%s(): parent_ictx->is_root\n", __FUNCTION__); if (parent_ictx->upl_ofs != suc_ictx->upl_ofs) panic("%s(): parent_ictx->upl_ofs != " "suc_ictx->upl_ofs\n", __FUNCTION__); parent_ictx->is_locked = 1; suc_ictx->is_locked = 0; } if (!parent_ictx->is_locked) panic("%s(): parent_ictx->is_locked\n", __FUNCTION__); /* * We have now inserted the left-hand sibling entry of the parent entry * in front of the end entry of the leaf node containing its * predecessor entry but at the moment we have two copies of the entry, * so delete the original copy from its index node. * * Before we do that, update the VCN sub-node pointer of the original * parent entry to point to the index node that is about to become * parentless. Note we do not mark the index node dirty as that will * happen when the original copy of the entry we pulled down is * deleted. */ *(leVCN*)((u8*)parent_ictx->entries[old_parent_entry_nr] + old_parent_ie_size) = *(leVCN*)( (u8*)parent_ictx->follow_entry + pull_down_ie_size); /* * All that is left now is to delete the original copy of the entry we * pulled down. We need to switch the parent index context to be the * current context as it already points to the entry we pulled down * thus we can just pretend it is the entry to be deleted. Note this * is a simple delete just like for a leaf entry as we have taken care * of transfering its child leaf node(s) to the original parent entry * already and will deallocate the origial parent entry's child leaf * node(s) later. */ ictx = parent_ictx; prep_simple_delete: ie = ictx->entry; is_root = ictx->is_root; ictx->is_match = 1; ih = ictx->index; ie_size = le16_to_cpu(ie->length); next_ie = (INDEX_ENTRY*)((u8*)ie + ie_size); /* goto simple_delete; */ simple_delete: new_ilen = le32_to_cpu(ih->index_length) - ie_size; if (is_root) { ATTR_RECORD *a; u32 muse; m = ictx->actx->m; muse = le32_to_cpu(m->bytes_in_use); /* * Move the index entries following @ie into @ie's position. * As an optimization we combine the moving of the index * entries and the resizing of the index root attribute into a * single operation. */ memmove(ie, next_ie, muse - ((u8*)next_ie - (u8*)m)); /* Adjust the mft record to reflect the change in used space. */ m->bytes_in_use = cpu_to_le32(muse - ie_size); /* * Adjust the attribute record to reflect the change in * attribute value and attribute record size. */ a = ictx->actx->a; a->length = cpu_to_le32(le32_to_cpu(a->length) - ie_size); a->value_length = cpu_to_le32(le32_to_cpu(a->value_length) - ie_size); /* Adjust the index header to reflect the change in length. */ ih->allocated_size = ih->index_length = cpu_to_le32(new_ilen); } else { /* Move index entries following @ie into @ie's position. */ memmove(ie, next_ie, new_ilen - ((u8*)ie - (u8*)ih)); /* Adjust the index header to reflect the change in length. */ ih->index_length = cpu_to_le32(new_ilen); } /* Ensure the updates are written to disk. */ ntfs_index_entry_mark_dirty(ictx); /* * If we have scheduled any index block nodes to be deallocated do it * now but first unlock the node we just deleted an entry from so we do * not deadlock. */ if (deallocate_ictx) { ntfs_index_ctx_unlock(ictx); ictx = deallocate_ictx; do { if (ictx->is_root) panic("%s(): ictx->is_root\n", __FUNCTION__); err = ntfs_index_block_free(ictx); if (err) { ntfs_error(idx_ni->vol->mp, "Failed to " "deallocate no longer used " "index block VCN 0x%llx " "(error %d). Run chkdsk to " "recover the lost index " "block.", (unsigned long long)ictx->vcn, err); NVolSetErrors(idx_ni->vol); } } while ((ictx = ictx->up) != deallocate_ictx); } /* * Release any sub-tree paths that the caller no longer has a reference * to. */ if (put_ictx) ntfs_index_ctx_put(put_ictx); ntfs_debug("Done."); return 0; split_and_restart: /* * Split the index node described by @suc_ictx taking into account the * fact that it is very likely that an entry of @ie_size will be added * to the index in front of the position pointed to by @suc_ictx. */ err = ntfs_index_node_split(suc_ictx, ie_size); if (!err) { /* * Signal to the caller that they need to restart the delete * from scratch. */ err = -EAGAIN; } put_err: if (put_ictx) ntfs_index_ctx_put(put_ictx); return err; }