#define ROOT_DISK_ONLY
#define DONT_CACHE_SSDS
static int BC_minimum_mem_size = 256;
#define BC_READAHEAD_WAIT_DEFAULT 10
#define BC_READAHEAD_WAIT_CDROM 45
static int BC_wait_for_readahead = BC_READAHEAD_WAIT_DEFAULT;
static int BC_chit_max_num_IOs_since_last_hit = 10000;
static int BC_chit_prelogin_timeout_seconds = 3600;
static int BC_history_timeout = 240 * 100;
#ifndef DBG_BOOTCACHE
#define DBG_BOOTCACHE 7
#endif
#define DBG_BC_TAG 1
#define DBG_BC_BATCH 2
static int dbg_tag_count = 0;
#ifdef BC_DEBUG
# define MACH_DEBUG
# define MACH_ASSERT 1
# define message(fmt, args...) \
do { \
microtime(&debug_currenttime); \
timersub(&debug_currenttime, &debug_starttime, &debug_currenttime); \
lck_mtx_lock(&debug_printlock); \
printf("BootCache: %5d.%03d %24s[%4d]: " fmt "\n", (u_int)(debug_currenttime.tv_sec), (u_int)(debug_currenttime.tv_usec / 1000), __FUNCTION__, __LINE__, ##args); \
lck_mtx_unlock(&debug_printlock); \
} while (0)
# define debug(fmt, args...) message("*** " fmt, ##args)
extern void Debugger(char *);
#else
# define debug(fmt, args...) do {} while (0)
# define message(fmt, args...) printf("BootCache: " fmt "\n" , ##args)
#endif
#ifdef BC_EXTRA_DEBUG
# define xdebug(fmt, args...) message("+++ " fmt, ##args)
#else
# define xdebug(fmt, args...) do {} while (0)
#endif
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/kdebug.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/ubc.h>
#include <sys/uio.h>
#include <uuid/uuid.h>
#include <mach/kmod.h>
#include <mach/mach_types.h>
#include <mach/vm_param.h>
#include <vm/vm_kern.h>
#include <libkern/libkern.h>
#include <libkern/OSAtomic.h>
#include <kern/assert.h>
#include <kern/host.h>
#include <kern/kalloc.h>
#include <kern/thread_call.h>
#include <kern/locks.h>
#include <IOKit/storage/IOMediaBSDClient.h>
#include <pexpert/pexpert.h>
#include "BootCache.h"
#ifdef BC_DEBUG
static struct timeval debug_starttime;
static struct timeval debug_currenttime;
static lck_mtx_t debug_printlock;
#endif
#define BC_MALLOC(_size, _type, _flags) ((ssize_t)(_size) < 0) ? NULL : _MALLOC((_size), (_type), (_flags))
struct BC_cache_extent {
lck_mtx_t ce_lock;
u_int64_t ce_diskoffset;
u_int64_t ce_length;
off_t ce_cacheoffset;
u_int16_t ce_batch;
u_int16_t ce_flags;
#define CE_ABORTED (1 << 0)
#define CE_IODONE (1 << 1)
#define CE_LOWPRI (1 << 2)
#define CE_SHARED (1 << 3)
int ce_mount_idx;
u_int32_t *ce_blockmap;
};
struct BC_cache_mount {
lck_rw_t cm_lock;
uuid_t cm_uuid;
int cm_nextents;
struct BC_cache_extent **cm_pextents;
int cm_state;
#define CM_SETUP (0)
#define CM_READY (1)
#define CM_ABORTED (2)
dev_t cm_dev;
struct buf *cm_bp;
u_int64_t cm_throttle_mask;
u_int64_t cm_blocksize;
u_int64_t cm_devsize;
u_int64_t cm_maxread;
struct BC_cache_disk* cm_disk;
};
struct BC_cache_disk {
lck_mtx_t cd_lock;
u_int64_t cd_disk_id;
int cd_disk_num;
int cd_flags;
#define CD_HAS_THREAD (1 << 0)
#define CD_ISSUE_LOWPRI (1 << 1)
#define CD_IS_SSD (1 << 2)
int cd_nmounts;
int cd_batch;
};
struct BC_history_mount_device {
u_int64_t hmd_disk_id;
struct BC_history_mount_device* hmd_next;
int hmd_is_ssd;
dev_t hmd_dev;
int hmd_blocksize;
struct BC_history_mount hmd_mount;
};
struct BC_history_entry_cluster {
struct BC_history_entry_cluster *hec_link;
int hec_nentries;
struct BC_history_entry hec_data[0];
};
#ifdef STATIC_HISTORY
# define BC_HISTORY_ALLOC 128000
# define BC_HISTORY_ENTRIES \
(((BC_HISTORY_ALLOC - sizeof(struct BC_history_entry_cluster)) / \
sizeof(struct BC_history_entry)) - 1)
# define BC_HISTORY_MAXCLUSTERS 1
#else
# define BC_HISTORY_ALLOC 16000
# define BC_HISTORY_ENTRIES \
(((BC_HISTORY_ALLOC - sizeof(struct BC_history_entry_cluster)) / \
sizeof(struct BC_history_entry)) - 1)
# define BC_HISTORY_MAXCLUSTERS ((BC_MAXENTRIES / BC_HISTORY_ENTRIES) + 1)
#endif
#ifdef READ_HISTORY_BUFFER
struct BC_read_history_entry {
struct BC_cache_extent *rh_extent;
u_int64_t rh_blkno;
u_int64_t rh_bcount;
int rh_result;
# define BC_RHISTORY_OK 0
# define BC_RHISTORY_FAIL 1
};
#endif
struct BC_cache_control {
lck_grp_t *c_lckgrp;
struct bdevsw *c_devsw;
strategy_fcn_t *c_strategy;
open_close_fcn_t *c_close;
lck_mtx_t c_handlers_lock;
u_int64_t c_cachesize;
u_int64_t c_root_disk_id;
int c_nmounts;
struct BC_cache_mount *c_mounts;
int c_nextentlists;
int *c_nextents;
struct BC_cache_extent **c_extentlists;
int c_batch_count;
vnode_t c_vp;
uint32_t c_vid;
lck_rw_t c_cache_lock;
int c_history_num_clusters;
struct BC_history_entry_cluster *c_history;
struct BC_history_mount_device *c_history_mounts;
uint64_t c_history_size;
lck_rw_t c_history_lock;
UInt32 c_flags;
#define BC_FLAG_SETUP (1 << 0)
#define BC_FLAG_CACHEACTIVE (1 << 1)
#define BC_FLAG_HISTORYACTIVE (1 << 2)
#define BC_FLAG_HTRUNCATED (1 << 3)
#define BC_FLAG_SHUTDOWN (1 << 4)
SInt32 c_strategycalls;
uint32_t c_readahead_throttles_cutthroughs;
uint32_t c_num_ios_since_last_hit;
uint32_t c_num_reader_threads;
lck_mtx_t c_reader_threads_lock;
int c_take_detailed_stats;
struct BC_statistics c_stats;
struct timeval c_loadtime;
struct timeval c_cache_starttime;
struct timeval c_history_starttime;
#ifdef READ_HISTORY_BUFFER
int c_rhistory_idx;
struct BC_read_history_entry *c_rhistory;
#endif
};
#define CB_PAGEBLOCKS(cm) (PAGE_SIZE / (cm)->cm_blocksize)
#define CB_BLOCK_TO_PAGE(cm, block) ((block) / CB_PAGEBLOCKS(cm))
#define CB_PAGE_TO_BLOCK(cm, page) ((page) * CB_PAGEBLOCKS(cm))
#define CB_BLOCK_TO_BYTE(cm, block) ((block) * (cm)->cm_blocksize)
#define HB_BLOCK_TO_BYTE(hmd, block) ((block) * (hmd)->hmd_blocksize)
#define CB_BYTE_TO_BLOCK(cm, byte) ((byte) / (cm)->cm_blocksize)
#define CB_MAPFIELDBITS 32
#define CB_MAPFIELDBYTES 4
#define CB_MAPIDX(x) ((x) / CB_MAPFIELDBITS)
#define CB_MAPBIT(x) (1 << ((x) % CB_MAPFIELDBITS))
#define CB_BLOCK_PRESENT(ce, block) \
((ce)->ce_blockmap != NULL && ((ce)->ce_blockmap[CB_MAPIDX(block)] & CB_MAPBIT(block)))
#define CB_MARK_BLOCK_PRESENT(ce, block) \
((ce)->ce_blockmap[CB_MAPIDX(block)] |= CB_MAPBIT(block))
#define CB_MARK_BLOCK_VACANT(ce, block) \
((ce)->ce_blockmap[CB_MAPIDX(block)] &= ~CB_MAPBIT(block))
#define CB_PAGE_VACANT(cm, ce, page) \
(!((ce)->ce_blockmap[CB_MAPIDX(CB_PAGE_TO_BLOCK(cm, page))] & \
(((1 << CB_PAGEBLOCKS(cm)) - 1) << (CB_PAGE_TO_BLOCK(cm, page) % \
CB_MAPFIELDBITS))))
#define BC_MAX_SIZE (max_mem / 2)
#define _FREE_ZERO(p, c) \
do { \
if (p != NULL) { \
_FREE(p, c); \
p = NULL; \
} \
} while (0);
#define LOCK_HISTORY_R() lck_rw_lock_shared(&BC_cache->c_history_lock)
#define UNLOCK_HISTORY_R() lck_rw_unlock_shared(&BC_cache->c_history_lock)
#define LOCK_HISTORY_W() lck_rw_lock_exclusive(&BC_cache->c_history_lock)
#define UNLOCK_HISTORY_W() lck_rw_unlock_exclusive(&BC_cache->c_history_lock)
#define LOCK_EXTENT(e) lck_mtx_lock(&(e)->ce_lock)
#define UNLOCK_EXTENT(e) lck_mtx_unlock(&(e)->ce_lock)
#define LOCK_MOUNT_R(m) lck_rw_lock_shared(&(m)->cm_lock)
#define UNLOCK_MOUNT_R(m) lck_rw_unlock_shared(&(m)->cm_lock)
#define LOCK_MOUNT_W(m) lck_rw_lock_exclusive(&(m)->cm_lock)
#define UNLOCK_MOUNT_W(m) lck_rw_unlock_exclusive(&(m)->cm_lock)
#define LOCK_MOUNT_W_TO_R(m) lck_rw_lock_exclusive_to_shared(&(m)->cm_lock)
#define LOCK_MOUNT_R_TO_W(m) lck_rw_lock_shared_to_exclusive(&(m)->cm_lock)
#define LOCK_DISK(d) lck_mtx_lock(&(d)->cd_lock)
#define UNLOCK_DISK(d) lck_mtx_unlock(&(d)->cd_lock)
#define LOCK_READERS() lck_mtx_lock(&BC_cache->c_reader_threads_lock)
#define UNLOCK_READERS() lck_mtx_unlock(&BC_cache->c_reader_threads_lock)
#define LOCK_HANDLERS() lck_mtx_lock(&BC_cache->c_handlers_lock)
#define UNLOCK_HANDLERS() lck_mtx_unlock(&BC_cache->c_handlers_lock)
#define LOCK_CACHE_R() lck_rw_lock_shared(&BC_cache->c_cache_lock)
#define UNLOCK_CACHE_R() lck_rw_unlock_shared(&BC_cache->c_cache_lock)
#define LOCK_CACHE_W() lck_rw_lock_exclusive(&BC_cache->c_cache_lock)
#define UNLOCK_CACHE_W() lck_rw_unlock_exclusive(&BC_cache->c_cache_lock)
#define LOCK_CACHE_TRY_W() lck_rw_try_lock(&BC_cache->c_cache_lock, LCK_RW_TYPE_EXCLUSIVE)
#define LOCK_CACHE_TRY_R() lck_rw_try_lock(&BC_cache->c_cache_lock, LCK_RW_TYPE_SHARED)
#define LOCK_CACHE_W_TO_R() lck_rw_lock_exclusive_to_shared(&BC_cache->c_cache_lock)
#define LOCK_CACHE_R_TO_W() lck_rw_lock_shared_to_exclusive(&BC_cache->c_cache_lock)
#define BC_ADD_STAT(stat, inc) OSAddAtomic((inc), ((SInt32 *)&BC_cache->c_stats.ss_##stat))
static struct BC_cache_control BC_cache_softc;
static struct BC_cache_control *BC_cache = &BC_cache_softc;
extern kmod_info_t kmod_info;
static struct BC_playlist_header *BC_preloaded_playlist;
static int BC_preloaded_playlist_size;
static int BC_check_intersection(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length);
static int BC_find_cache_mount(dev_t dev);
static struct BC_cache_extent** BC_find_extent(struct BC_cache_mount* cm, u_int64_t offset, u_int64_t length, int contained, int* pnum_extents);
static int BC_discard_bytes(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length);
static int BC_handle_discards(struct BC_cache_mount *cm, u_int64_t offset, u_int64_t length);
static int BC_blocks_present(struct BC_cache_extent *ce, int base, int nblk);
static void BC_reader_thread(void *param0, wait_result_t param1);
static int BC_strategy(struct buf *bp);
static int BC_close(dev_t dev, int flags, int devtype, struct proc *p);
static int BC_terminate_readahead(void);
static int BC_terminate_cache(void);
static int BC_terminate_history(void);
static void BC_check_handlers(void);
static void BC_next_valid_range(struct BC_cache_mount *cm, struct BC_cache_extent *ce, uint32_t from,
uint32_t *nextpage, uint32_t *nextoffset, uint32_t *nextlength);
static int BC_setup_disk(struct BC_cache_disk *cd, u_int64_t disk_id, int is_ssd);
static void BC_teardown_disk(struct BC_cache_disk *cd);
static void BC_mount_available(struct BC_cache_mount *cm);
static int BC_setup_mount(struct BC_cache_mount *cm, struct BC_playlist_mount* pm);
static int BC_fill_in_mount(struct BC_cache_mount *cm, mount_t mount, vfs_context_t context);
static void BC_teardown_mount(struct BC_cache_mount *ce);
static int BC_setup_extent(struct BC_cache_extent *ce, const struct BC_playlist_entry *pe, int batch_adjustment, int cache_mount_idx);
static void BC_teardown_extent(struct BC_cache_extent *ce);
static int BC_copyin_playlist(size_t mounts_size, user_addr_t mounts, size_t entries_size, user_addr_t entries);
static int BC_init_cache(size_t mounts_size, user_addr_t mounts, size_t entries_size, user_addr_t entries, int record_history);
static void BC_update_mounts(void);
static void BC_timeout_history(void *);
static struct BC_history_mount_device * BC_get_history_mount_device(dev_t dev, int* pindex);
static int BC_add_history(daddr64_t blkno, u_int64_t length, int pid, int hit, int write, int tag, int shared, dev_t dev);
static int BC_size_history_mounts(void);
static int BC_size_history_entries(void);
static int BC_copyout_history_mounts(user_addr_t uptr);
static int BC_copyout_history_entries(user_addr_t uptr);
static void BC_discard_history(void);
static void BC_auto_start(void);
static int BC_setup(void);
static int fill_in_bc_cache_mounts(mount_t mount, void* arg);
static int check_for_new_mount_itr(mount_t mount, void* arg);
static int extents_status(struct BC_cache_extent **pce, int nextents) ;
static void wait_for_extent(struct BC_cache_extent *ce, struct timeval starttime) ;
#ifndef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET
static int compare_cache_extents(const void *vfirst, const void *vsecond);
extern void qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *));
#endif
static int BC_sysctl SYSCTL_HANDLER_ARGS;
SYSCTL_PROC(_kern, OID_AUTO, BootCache, CTLFLAG_RW | CTLFLAG_LOCKED,
0, 0, BC_sysctl,
"S,BC_command",
"BootCache command interface");
extern int netboot_root(void);
extern void (* mountroot_post_hook)(void);
extern void (* unmountroot_pre_hook)(void);
static int BC_alloc_pagebuffer();
static void BC_free_pagebuffer();
static void BC_free_page(struct BC_cache_extent *ce, int page);
int ubc_range_op(vnode_t, off_t, off_t, int, int *);
#define MACH_KERNEL 1
#include <mach-o/loader.h>
extern void *getsectdatafromheader(struct mach_header *, char *, char *, int *);
static inline UInt32
BC_set_flag(UInt32 flag)
{
return flag & OSBitOrAtomic(flag, &BC_cache->c_flags);
}
static inline UInt32
BC_clear_flag(UInt32 flag)
{
return flag & OSBitAndAtomic(~(flag), &BC_cache->c_flags);
}
static inline const char* uuid_string(uuid_t uuid)
{
static uuid_string_t uuidString;
uuid_unparse(uuid, uuidString);
return (char*)uuidString;
}
static inline int
BC_check_intersection(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length)
{
if ((offset < (ce->ce_diskoffset + ce->ce_length)) &&
((offset + length) > ce->ce_diskoffset))
return 1;
return 0;
}
static int
BC_find_cache_mount(dev_t dev)
{
int i;
if (dev == nulldev())
return(-1);
if (!(BC_cache->c_flags & BC_FLAG_CACHEACTIVE) || (BC_cache->c_mounts == NULL))
return(-1);
for (i = 0; i < BC_cache->c_nmounts; i++) {
if (BC_cache->c_mounts[i].cm_state == CM_READY) {
if (BC_cache->c_mounts[i].cm_dev == dev) {
return i;
}
}
}
return(-1);
}
static struct BC_cache_extent **
BC_find_extent(struct BC_cache_mount *cm, u_int64_t offset, u_int64_t length, int contained, int *pnum_extents)
{
struct BC_cache_extent **p, **base, **last, **next, **prev;
size_t lim;
assert(length > 0);
base = cm->cm_pextents;
last = base + cm->cm_nextents - 1;
if (!(BC_cache->c_flags & BC_FLAG_CACHEACTIVE) || (base == NULL)) {
return(NULL);
}
if (((offset + length) <= (*base)->ce_diskoffset) ||
(offset >= ((*last)->ce_diskoffset + (*last)->ce_length))) {
return(NULL);
}
for (lim = cm->cm_nextents; lim != 0; lim >>= 1) {
p = base + (lim >> 1);
if (BC_check_intersection((*p), offset, length)) {
if (contained == 0)
return(p);
if (pnum_extents) {
*pnum_extents = 1;
}
if ((offset + length) > ((*p)->ce_diskoffset + (*p)->ce_length)) {
prev = p;
for (next = prev + 1;
next <= last;
next++) {
assert(((*prev)->ce_diskoffset + (*prev)->ce_length) <= (*next)->ce_diskoffset);
if (((*prev)->ce_diskoffset + (*prev)->ce_length) < (*next)->ce_diskoffset) {
if (BC_cache->c_take_detailed_stats) {
xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx,%lld (missed last %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*p)->ce_diskoffset, ((*prev)->ce_diskoffset + (*prev)->ce_length), (offset + length) - ((*prev)->ce_diskoffset + (*prev)->ce_length));
}
return(NULL);
}
if (pnum_extents) {
(*pnum_extents)++;
}
if ((offset + length) <= ((*next)->ce_diskoffset + (*next)->ce_length)) {
break;
}
prev = next;
}
if (next > last) {
if (BC_cache->c_take_detailed_stats) {
xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx,%lld (missed last %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*p)->ce_diskoffset, ((*prev)->ce_diskoffset + (*prev)->ce_length), (offset + length) - ((*prev)->ce_diskoffset + (*prev)->ce_length));
}
return (NULL);
}
}
if (offset < (*p)->ce_diskoffset) {
next = p;
for (prev = next - 1;
prev >= base;
prev--) {
assert(((*prev)->ce_diskoffset + (*prev)->ce_length) <= (*next)->ce_diskoffset);
if (((*prev)->ce_diskoffset + (*prev)->ce_length) < (*next)->ce_diskoffset) {
if (BC_cache->c_take_detailed_stats) {
xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx:%lld (missed first %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*next)->ce_diskoffset, ((*p)->ce_diskoffset + (*p)->ce_length), (*next)->ce_diskoffset - offset);
}
return(NULL);
}
if (pnum_extents) {
(*pnum_extents)++;
}
if (offset >= (*prev)->ce_diskoffset) {
return(prev);
}
next = prev;
}
assert(prev < base);
if (prev < base) {
if (BC_cache->c_take_detailed_stats) {
xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx:%lld (missed first %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*next)->ce_diskoffset, ((*p)->ce_diskoffset + (*p)->ce_length), (*next)->ce_diskoffset - offset);
}
return (NULL);
}
}
return(p);
}
if (offset > (*p)->ce_diskoffset) {
base = p + 1;
lim--;
}
}
return(NULL);
}
static int
BC_discard_bytes(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length)
{
struct BC_cache_mount *cm;
u_int64_t estart, eend, dstart, dend, nblk, base, page;
int i, discarded;
assert(length > 0);
#ifdef BC_DEBUG
lck_mtx_assert(&ce->ce_lock, LCK_MTX_ASSERT_OWNED);
#endif
if (ce->ce_flags & CE_ABORTED ||
ce->ce_length == 0)
return 0;
cm = BC_cache->c_mounts + ce->ce_mount_idx;
dstart = offset;
dend = offset + length;
assert(dend > dstart);
estart = ce->ce_diskoffset;
eend = ce->ce_diskoffset + ce->ce_length;
assert(eend > estart);
if (dend <= estart)
return(0);
if (eend <= dstart)
return(0);
if (dstart < estart)
dstart = estart;
if (dend > eend)
dend = eend;
nblk = CB_BYTE_TO_BLOCK(cm, dend - dstart);
base = CB_BYTE_TO_BLOCK(cm, dstart - ce->ce_diskoffset);
assert(base >= 0);
discarded = 0;
assert((base + nblk - 1) < howmany(ce->ce_length, cm->cm_blocksize));
for (i = 0; i < nblk; i++) {
if (CB_BLOCK_PRESENT(ce, base + i)) {
CB_MARK_BLOCK_VACANT(ce, base + i);
discarded++;
page = CB_BLOCK_TO_PAGE(cm, base + i);
if (CB_PAGE_VACANT(cm, ce, page))
BC_free_page(ce, (int) page);
}
}
return(discarded * cm->cm_blocksize);
}
static void
BC_next_valid_range(struct BC_cache_mount *cm, struct BC_cache_extent *ce, uint32_t fromoffset,
uint32_t *nextpage, uint32_t *nextoffset, uint32_t *nextlength)
{
int maxblks, i, nextblk = 0;
int found = 0;
maxblks = howmany(ce->ce_length, cm->cm_blocksize);
i = CB_BYTE_TO_BLOCK(cm, fromoffset);
for (; i < maxblks; i++) {
if (CB_BLOCK_PRESENT(ce, i)) {
if (found == 0) {
nextblk = i;
}
found++;
} else {
if (found != 0)
break;
}
}
if (found) {
*nextpage = CB_BLOCK_TO_PAGE(cm, nextblk);
*nextoffset = CB_BLOCK_TO_BYTE(cm, nextblk) % PAGE_SIZE;
*nextlength = MIN(CB_BLOCK_TO_BYTE(cm, found), cm->cm_maxread);
} else {
*nextpage = -1;
}
return;
}
static int
BC_blocks_present(struct BC_cache_extent *ce, int base, int nblk)
{
int blk;
assert(base >= 0);
assert((base + nblk) <= howmany(ce->ce_length, BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize));
for (blk = 0; blk < nblk; blk++) {
if (!CB_BLOCK_PRESENT(ce, base + blk)) {
return(0);
}
}
return(1);
}
static void
BC_reader_thread(void *param0, wait_result_t param1)
{
struct BC_cache_disk *cd = NULL;
struct BC_cache_mount *cm = NULL;
struct BC_cache_extent *ce = NULL;
int count, num_mounts, ce_idx, cel_idx;
upl_t upl;
kern_return_t kret;
struct timeval batchstart, batchend;
int error = 0;
int issuing_lowpriority_extents = 0;
cd = (struct BC_cache_disk*) param0;
num_mounts = cd->cd_nmounts;
if (BC_cache->c_flags & BC_FLAG_SHUTDOWN)
goto out;
for (;;) {
if (issuing_lowpriority_extents) {
debug("disk %d starting low priority batch", cd->cd_disk_num);
} else {
debug("disk %d starting batch %d", cd->cd_disk_num, cd->cd_batch);
KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, DBG_BC_BATCH) |
DBG_FUNC_START,
cd->cd_batch, cd->cd_disk_num, 0, 0, 0);
}
microtime(&batchstart);
LOCK_CACHE_R();
for (cel_idx = 0;
cel_idx < BC_cache->c_nextentlists;
cel_idx++) {
for (ce_idx = 0;
ce_idx < BC_cache->c_nextents[cel_idx];
ce_idx++) {
ce = BC_cache->c_extentlists[cel_idx] + ce_idx;
cm = BC_cache->c_mounts + ce->ce_mount_idx;
if (cm->cm_state != CM_READY) {
continue;
}
if (cm->cm_disk != cd){
continue;
}
if (ce->ce_batch > cd->cd_batch && !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
continue;
}
if (ce->ce_flags & (CE_ABORTED | CE_IODONE)) {
continue;
}
if (!(cd->cd_flags & CD_ISSUE_LOWPRI) &&
(ce->ce_flags & CE_LOWPRI)) {
continue;
}
if((cd->cd_flags & CD_ISSUE_LOWPRI) &&
!(ce->ce_flags & CE_LOWPRI)) {
debug("Saw a regular extent while issuing throttled IO");
}
LOCK_EXTENT(ce);
if (ce->ce_flags & (CE_ABORTED | CE_IODONE)) {
UNLOCK_EXTENT(ce);
continue;
}
uint32_t fromoffset = 0;
LOCK_MOUNT_R(cm);
if (cm->cm_state != CM_READY) {
UNLOCK_MOUNT_R(cm);
UNLOCK_EXTENT(ce);
continue;
}
for (;;) {
uint32_t nextpage;
uint32_t nextoffset;
uint32_t nextlength;
if (BC_cache->c_flags & BC_FLAG_SHUTDOWN) {
UNLOCK_MOUNT_R(cm);
goto out;
}
BC_next_valid_range(cm, ce, fromoffset, &nextpage, &nextoffset, &nextlength);
if (nextpage == -1)
break;
fromoffset = (nextpage * PAGE_SIZE) + nextoffset + nextlength;
kret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid);
if (kret != KERN_SUCCESS) {
UNLOCK_MOUNT_R(cm);
message("reader thread: vnode_getwithvid failed - %d\n", kret);
goto out;
}
kret = ubc_create_upl(BC_cache->c_vp,
ce->ce_cacheoffset + (nextpage * PAGE_SIZE),
(int) roundup(nextoffset + nextlength, PAGE_SIZE),
&upl,
NULL,
UPL_SET_LITE|UPL_FILE_IO);
if (kret != KERN_SUCCESS) {
UNLOCK_MOUNT_R(cm);
message("ubc_create_upl returned %d\n", kret);
(void) vnode_put(BC_cache->c_vp);
goto out;
}
buf_setblkno(cm->cm_bp, CB_BYTE_TO_BLOCK(cm, ce->ce_diskoffset + nextoffset) + CB_PAGE_TO_BLOCK(cm, nextpage));
buf_setcount(cm->cm_bp, (unsigned int) nextlength);
buf_setupl(cm->cm_bp, upl, (unsigned int) nextoffset);
buf_setresid(cm->cm_bp, buf_count(cm->cm_bp));
buf_reset(cm->cm_bp, B_READ);
throttle_info_handle_t throttle_info_handle;
if ((error = throttle_info_ref_by_mask(cm->cm_throttle_mask, &throttle_info_handle)) == 0) {
throttle_info_update_by_mask(throttle_info_handle, 0x0);
throttle_info_rel_by_mask(throttle_info_handle);
} else {
debug("Unable to update throttle info for mount %s: %d", cm->cm_uuid, error);
}
BC_ADD_STAT(initiated_reads, 1);
if (cd->cd_disk_num < STAT_DISKMAX) {
if (ce->ce_flags & CE_LOWPRI) {
BC_ADD_STAT(disk_initiated_reads_lowpri[cd->cd_disk_num], 1);
} else {
BC_ADD_STAT(disk_initiated_reads[cd->cd_disk_num], 1);
}
}
KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, DKIO_READ) | DBG_FUNC_NONE,
(uintptr_t) cm->cm_bp, buf_device(cm->cm_bp), (long)buf_blkno(cm->cm_bp), buf_count(cm->cm_bp), 0);
BC_cache->c_strategy(cm->cm_bp);
buf_biowait(cm->cm_bp);
kret = ubc_upl_commit(upl);
(void) vnode_put(BC_cache->c_vp);
if (kret != KERN_SUCCESS || buf_error(cm->cm_bp) || (buf_resid(cm->cm_bp) != 0)) {
debug("read error: extent %lu %lu/%lu "
"(error buf %ld/%ld flags %08x resid %d)",
(unsigned long) ce_idx,
(unsigned long)ce->ce_diskoffset, (unsigned long)ce->ce_length,
(long)buf_blkno(cm->cm_bp), (long)buf_count(cm->cm_bp),
buf_flags(cm->cm_bp), buf_resid(cm->cm_bp));
count = BC_discard_bytes(ce, ce->ce_diskoffset + nextoffset, nextlength);
debug("read error: discarded %d bytes", count);
BC_ADD_STAT(read_errors, 1);
BC_ADD_STAT(read_errors_bytes, count);
}
#ifdef READ_HISTORY_BUFFER
if (BC_cache->c_rhistory && BC_cache->c_rhistory_idx < READ_HISTORY_BUFFER) {
BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_extent = ce;
BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_blkno = buf_blkno(cm->cm_bp);
BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_bcount = buf_count(cm->cm_bp);
if (buf_error(cm->cm_bp) || (buf_resid(cm->cm_bp) != 0)) {
BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_result = BC_RHISTORY_FAIL;
BC_cache->c_rhistory_idx++;
} else {
# ifdef READ_HISTORY_ALL
BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_result = BC_RHISTORY_OK;
BC_cache->c_rhistory_idx++;
# endif
}
}
#endif
}
BC_ADD_STAT(read_blocks, (u_int) CB_BYTE_TO_BLOCK(cm, ce->ce_length));
BC_ADD_STAT(read_bytes, (u_int) ce->ce_length);
if (ce->ce_flags & CE_LOWPRI) {
BC_ADD_STAT(read_bytes_lowpri, (u_int) ce->ce_length);
if (cd->cd_disk_num < STAT_DISKMAX) {
BC_ADD_STAT(batch_bytes_lowpri[cd->cd_disk_num], (u_int) ce->ce_length);
}
} else {
if (cd->cd_disk_num < STAT_DISKMAX) {
BC_ADD_STAT(batch_bytes[cd->cd_disk_num][MIN(cd->cd_batch, STAT_BATCHMAX)], (u_int) ce->ce_length);
if (ce->ce_batch < cd->cd_batch) {
BC_ADD_STAT(batch_late_bytes[cd->cd_disk_num][MIN(cd->cd_batch, STAT_BATCHMAX)], ce->ce_length);
}
}
}
if (ce->ce_flags & CE_SHARED) {
BC_ADD_STAT(shared_bytes, (u_int) ce->ce_length);
}
UNLOCK_MOUNT_R(cm);
ce->ce_flags |= CE_IODONE;
UNLOCK_EXTENT(ce);
wakeup(ce);
UNLOCK_CACHE_R();
if (issuing_lowpriority_extents) {
throttle_lowpri_io((ce->ce_flags & CE_LOWPRI) ? 1 : 0);
}
LOCK_CACHE_R();
if (issuing_lowpriority_extents && !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
debug("New extents, interrupting low priority readahead");
break;
}
if (cel_idx >= BC_cache->c_nextentlists ||
ce_idx >= BC_cache->c_nextents[cel_idx]) {
break;
}
}
ce = NULL;
if (issuing_lowpriority_extents && !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
break;
}
}
UNLOCK_CACHE_R();
microtime(&batchend);
timersub(&batchend, &batchstart, &batchend);
if (cd->cd_disk_num < STAT_DISKMAX) {
if (issuing_lowpriority_extents) {
BC_cache->c_stats.ss_batch_time_lowpri[cd->cd_disk_num] +=
(u_int) batchend.tv_sec * 1000 +
(u_int) batchend.tv_usec / 1000;
} else {
BC_cache->c_stats.ss_batch_time[cd->cd_disk_num][MIN(cd->cd_batch, STAT_BATCHMAX)] +=
(u_int) batchend.tv_sec * 1000 +
(u_int) batchend.tv_usec / 1000;
}
}
if (issuing_lowpriority_extents) {
} else {
KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, DBG_BC_BATCH) |
DBG_FUNC_END,
cd->cd_batch, cd->cd_disk_num, 0, 0, 0);
}
LOCK_DISK(cd);
if (issuing_lowpriority_extents && !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
issuing_lowpriority_extents = 0;
throttle_set_thread_io_policy(IOPOL_NORMAL);
cd->cd_batch = 0;
} else if (cd->cd_batch + 1 < BC_cache->c_batch_count && !issuing_lowpriority_extents) {
cd->cd_batch++;
} else {
if (num_mounts == cd->cd_nmounts) {
if ( !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
cd->cd_flags |= CD_ISSUE_LOWPRI;
throttle_set_thread_io_policy(IOPOL_THROTTLE);
issuing_lowpriority_extents = 1;
} else {
cd->cd_flags &= (~CD_HAS_THREAD);
cd->cd_batch = 0;
UNLOCK_DISK(cd);
goto out;
}
} else {
cd->cd_batch = 0;
debug("disk %d more mounts detected (%d total)", cd->cd_disk_num, cd->cd_nmounts);
}
}
num_mounts = cd->cd_nmounts;
UNLOCK_DISK(cd);
}
out:
if (ce != NULL) {
UNLOCK_EXTENT(ce);
LOCK_DISK(cd);
BC_set_flag(BC_FLAG_SHUTDOWN);
cd->cd_flags &= (~CD_HAS_THREAD);
UNLOCK_DISK(cd);
int tmpcel_idx, tmpce_idx;
for (tmpcel_idx = 0;
tmpcel_idx < BC_cache->c_nextentlists;
tmpcel_idx++) {
for (tmpce_idx = 0;
tmpce_idx < BC_cache->c_nextents[tmpcel_idx];
tmpce_idx++) {
ce = BC_cache->c_extentlists[tmpcel_idx] + tmpce_idx;
LOCK_EXTENT(ce);
if (!(ce->ce_flags & CE_IODONE)) {
ce->ce_flags |= CE_ABORTED;
UNLOCK_EXTENT(ce);
wakeup(ce);
} else {
UNLOCK_EXTENT(ce);
}
}
}
UNLOCK_CACHE_R();
}
debug("disk %d done", cd->cd_disk_num);
wakeup(&cd->cd_flags);
LOCK_READERS();
BC_cache->c_num_reader_threads--;
if (BC_cache->c_num_reader_threads == 0) {
wakeup(&BC_cache->c_num_reader_threads);
}
UNLOCK_READERS();
}
static int
BC_close(dev_t dev, int flags, int devtype, struct proc *p)
{
struct BC_cache_mount *cm;
int i, cm_idx;
LOCK_CACHE_R();
if (BC_cache->c_flags & BC_FLAG_CACHEACTIVE) {
cm_idx = BC_find_cache_mount(dev);
if (-1 != cm_idx) {
cm = BC_cache->c_mounts + cm_idx;
debug("Tearing down closing mount %s with dev 0x%x", uuid_string(cm->cm_uuid), dev);
LOCK_MOUNT_R(cm);
if (cm->cm_state != CM_ABORTED) {
for (i = 0; i < cm->cm_nextents; i++) {
LOCK_EXTENT(cm->cm_pextents[i]);
BC_teardown_extent(cm->cm_pextents[i]);
UNLOCK_EXTENT(cm->cm_pextents[i]);
wakeup(cm->cm_pextents[i]);
}
if (! LOCK_MOUNT_R_TO_W(cm)) {
LOCK_MOUNT_W(cm);
if (cm->cm_state == CM_ABORTED) {
UNLOCK_MOUNT_W(cm);
goto out;
}
}
BC_teardown_mount(cm);
UNLOCK_MOUNT_W(cm);
} else {
UNLOCK_MOUNT_R(cm);
}
}
}
out:
UNLOCK_CACHE_R();
return BC_cache->c_close(dev, flags, devtype, p);
}
static int extents_status(struct BC_cache_extent **pce, int nextents) {
int ce_idx;
int ret = CE_IODONE;
for (ce_idx = 0;
ce_idx < nextents;
ce_idx++) {
if (pce[ce_idx]->ce_flags & CE_ABORTED) {
return CE_ABORTED;
}
if (! (pce[ce_idx]->ce_flags & CE_IODONE)) {
if (pce[ce_idx]->ce_flags & CE_LOWPRI) {
ret = CE_LOWPRI;
} else {
if (ret == CE_IODONE) {
ret = 0;
}
}
}
}
return ret;
}
static void wait_for_extent(struct BC_cache_extent *ce, struct timeval starttime) {
struct timeval time;
struct timespec timeout;
microtime(&time);
timersub(&time,
&starttime,
&time);
if (time.tv_sec < BC_wait_for_readahead) {
timeout.tv_sec = BC_wait_for_readahead - time.tv_sec;
if (time.tv_usec == 0) {
timeout.tv_nsec = 0;
} else {
timeout.tv_sec--;
timeout.tv_nsec = NSEC_PER_USEC * (USEC_PER_SEC - time.tv_usec);
}
msleep(ce, &ce->ce_lock, PRIBIO, "BC_strategy", &timeout);
}
}
static int
BC_strategy(struct buf *bp)
{
struct BC_cache_extent *ce = NULL, **pce, **pcontaining_extents = NULL;
int num_extents;
uio_t uio = NULL;
int nbytes, resid, discards = 0;
struct timeval blocked_start_time, blocked_end_time;
daddr64_t blkno;
caddr_t buf_map_p, buf_extent_p;
off_t disk_offset;
kern_return_t kret;
int cm_idx = -1, ce_idx;
dev_t dev;
int32_t bufflags;
int during_cache = 0, during_io = 0, take_detailed_stats = 0, cache_hit = 0;
int is_root_disk, did_block, status;
int is_shared = 0, is_swap = 0;
int should_throttle = 0;
int is_ssd = 0;
int unfilled = 0;
vnode_t vp = NULL;
int pid = 0;
int dont_cache = 0;
assert(bp != NULL);
blkno = buf_blkno(bp);
nbytes = buf_count(bp);
bufflags = buf_flags(bp);
dev = buf_device(bp);
vp = buf_vnode(bp);
if (dev == nulldev()) {
BC_cache->c_strategy(bp);
BC_ADD_STAT(strategy_unknown, 1);
BC_ADD_STAT(strategy_unknown_bytes, nbytes);
return 0;
}
if (vp && vnode_isswap(vp)) {
is_swap = 1;
goto bypass;
}
if (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE) {
pid = proc_selfpid();
if (vp) {
is_shared = vnode_isdyldsharedcache(vp);
}
}
if (vp && (vnode_israge(vp))) {
dont_cache = 1;
}
if (!(BC_cache->c_flags & BC_FLAG_CACHEACTIVE)) {
goto bypass;
}
OSAddAtomic(1, &BC_cache->c_strategycalls);
LOCK_CACHE_R();
during_cache = 1;
cm_idx = BC_find_cache_mount(dev);
if (cm_idx != -1) {
disk_offset = CB_BLOCK_TO_BYTE(BC_cache->c_mounts + cm_idx, blkno);
}
if (BC_cache->c_take_detailed_stats) {
take_detailed_stats = 1;
BC_ADD_STAT(strategy_calls, 1);
if (bufflags & B_THROTTLED_IO) {
BC_ADD_STAT(strategy_throttled, 1);
}
#ifdef BC_DEBUG
#endif
}
if (cm_idx == -1) {
if (take_detailed_stats)
BC_ADD_STAT(strategy_noncached_mount, 1);
goto bypass;
}
LOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
if (BC_cache->c_mounts[cm_idx].cm_disk &&
(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_HAS_THREAD) &&
!(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_ISSUE_LOWPRI)) {
during_io = 1;
if (take_detailed_stats) {
BC_ADD_STAT(strategy_duringio, 1);
}
}
if (BC_cache->c_mounts[cm_idx].cm_state != CM_READY) {
UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
cm_idx = -1;
if (take_detailed_stats)
BC_ADD_STAT(strategy_noncached_mount, 1);
goto bypass;
}
if ( !(bufflags & B_READ)) {
UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
if (take_detailed_stats)
BC_ADD_STAT(strategy_nonread, 1);
goto bypass;
}
if ((nbytes % BC_cache->c_mounts[cm_idx].cm_blocksize) != 0) {
debug("IO with non-blocksize multiple bytes (%d %% %lld = %lld)", nbytes, BC_cache->c_mounts[cm_idx].cm_blocksize, (nbytes % BC_cache->c_mounts[cm_idx].cm_blocksize));
UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
if (take_detailed_stats)
BC_ADD_STAT(strategy_nonblocksize, 1);
goto bypass;
}
if (take_detailed_stats) {
BC_ADD_STAT(extent_lookups, 1);
BC_ADD_STAT(requested_bytes, nbytes);
if (cm_idx < STAT_MOUNTMAX) {
BC_ADD_STAT(requested_bytes_m[cm_idx], nbytes);
}
}
num_extents = 0;
pce = BC_find_extent(BC_cache->c_mounts + cm_idx, disk_offset, nbytes, 1, &num_extents);
if (pce == NULL) {
#ifdef BC_EXTRA_DEBUG
if (take_detailed_stats && !dont_cache) {
char procname[128];
proc_selfname(procname, sizeof(procname));
const char* filename = vp ? vnode_getname(vp) : NULL;
if (num_extents > 0) {
xdebug("Missed IO from app %s for file %s (disk offset 0x%llx) (intersected, though)", procname, filename?:"unknown", blkno);
} else {
xdebug("Missed IO from app %s for file %s (disk offset 0x%llx)", procname, filename?:"unknown", blkno);
}
if (filename) {
vnode_putname(filename);
}
}
#endif
UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
goto bypass;
}
assert(num_extents > 0);
cache_hit = 1;
if (take_detailed_stats) {
BC_ADD_STAT(extent_hits, 1);
if (during_io)
BC_ADD_STAT(hit_duringio, 1);
if (num_extents > 1)
BC_ADD_STAT(hit_multiple, 1);
}
pcontaining_extents = BC_MALLOC(num_extents * sizeof(*pcontaining_extents), M_TEMP, M_WAITOK | M_ZERO);
if (pcontaining_extents == NULL) {
UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
if (take_detailed_stats)
BC_ADD_STAT(hit_failure, 1);
goto bypass;
}
memcpy(pcontaining_extents, pce, num_extents * sizeof(*pcontaining_extents));
is_ssd = (BC_cache->c_mounts[cm_idx].cm_disk &&
(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_IS_SSD));
UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
status = extents_status(pcontaining_extents, num_extents);
if (status & CE_ABORTED) {
if (take_detailed_stats)
BC_ADD_STAT(hit_aborted, 1);
goto bypass;
}
#ifndef EMULATE_ONLY
if (! (status & CE_IODONE)) {
if (is_ssd) {
unfilled = 1;
goto bypass;
}
if (status & CE_LOWPRI) {
if (take_detailed_stats)
BC_ADD_STAT(strategy_unfilled_lowpri, 1);
cache_hit = 0;
goto bypass;
}
microtime(&blocked_start_time);
did_block = 0;
for (ce_idx = 0;
ce_idx < num_extents;
ce_idx++) {
ce = pcontaining_extents[ce_idx];
LOCK_EXTENT(ce);
if (! (ce->ce_flags & (CE_ABORTED | CE_IODONE))) {
did_block = 1;
UNLOCK_CACHE_R();
wait_for_extent(ce, blocked_start_time);
UNLOCK_EXTENT(ce);
LOCK_CACHE_R();
LOCK_EXTENT(ce);
if (! (ce->ce_flags & (CE_ABORTED | CE_IODONE))) {
if (take_detailed_stats)
BC_ADD_STAT(strategy_timedout, 1);
#ifdef BC_DEBUG
microtime(&blocked_end_time);
timersub(&blocked_end_time,
&blocked_start_time,
&blocked_end_time);
char procname[128];
proc_selfname(procname, sizeof(procname));
const char* filename = vp ? vnode_getname(vp) : NULL;
debug("Strategy routine timed out app %s waiting on file %s (disk offset 0x%llx) after %lu.%03d seconds", procname, filename?:"unknown", disk_offset, blocked_end_time.tv_sec, blocked_end_time.tv_usec / 1000);
if (filename) {
vnode_putname(filename);
}
#endif
goto bypass;
}
UNLOCK_EXTENT(ce);
ce = NULL;
status = extents_status(pcontaining_extents, num_extents);
if (status & CE_ABORTED) {
if (take_detailed_stats)
BC_ADD_STAT(hit_aborted, 1);
goto bypass;
} else if (status & CE_IODONE) {
break;
}
} else {
UNLOCK_EXTENT(ce);
ce = NULL;
}
}
if (take_detailed_stats && did_block) {
BC_ADD_STAT(strategy_blocked, 1);
microtime(&blocked_end_time);
timersub(&blocked_end_time,
&blocked_start_time,
&blocked_end_time);
int ms_blocked = (blocked_end_time.tv_sec * 1000) + (blocked_end_time.tv_usec / (1000));
#ifdef BC_DEBUG
char procname[128];
proc_selfname(procname, sizeof(procname));
const char* filename = vp ? vnode_getname(vp) : NULL;
debug("Strategy routine blocked app %s waiting on file %s (disk offset 0x%llx) for %lu.%03d seconds", procname, filename?:"unknown", disk_offset, blocked_end_time.tv_sec, blocked_end_time.tv_usec / 1000);
if (filename) {
vnode_putname(filename);
}
#endif
BC_ADD_STAT(strategy_time_blocked, ms_blocked);
if (BC_cache->c_stats.ss_strategy_time_longest_blocked < ms_blocked) {
BC_cache->c_stats.ss_strategy_time_longest_blocked = ms_blocked;
}
}
}
if (buf_map(bp, &buf_map_p)) {
if (take_detailed_stats)
BC_ADD_STAT(hit_failure, 1);
goto bypass;
}
buf_extent_p = buf_map_p;
kret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid);
if (kret != KERN_SUCCESS) {
debug("vnode_getwithvid failed - %d", kret);
if (take_detailed_stats)
BC_ADD_STAT(hit_failure, 1);
buf_unmap(bp);
goto bypass;
}
for (ce_idx = 0;
ce_idx < num_extents;
ce_idx++) {
u_int64_t nbytes_thisextent;
u_int64_t diskoffset_thisextent;
off_t cacheoffset_thisextent;
ce = pcontaining_extents[ce_idx];
LOCK_EXTENT(ce);
if (ce->ce_flags & CE_ABORTED) {
if (take_detailed_stats)
BC_ADD_STAT(hit_aborted, 1);
vnode_put(BC_cache->c_vp);
buf_unmap(bp);
goto bypass;
}
assert(ce->ce_flags & CE_IODONE);
diskoffset_thisextent = MAX(ce->ce_diskoffset, disk_offset);
nbytes_thisextent = MIN(disk_offset + nbytes, ce->ce_diskoffset + ce->ce_length) - diskoffset_thisextent;
cacheoffset_thisextent = ce->ce_cacheoffset + (diskoffset_thisextent - ce->ce_diskoffset);
assert(diskoffset_thisextent < ce->ce_diskoffset + ce->ce_length);
assert(diskoffset_thisextent >= ce->ce_diskoffset);
assert(nbytes_thisextent <= (ce->ce_diskoffset + ce->ce_length) - diskoffset_thisextent);
assert(cacheoffset_thisextent < ce->ce_cacheoffset + ce->ce_length);
assert(cacheoffset_thisextent >= ce->ce_cacheoffset);
assert(nbytes_thisextent <= (ce->ce_cacheoffset + ce->ce_length) - cacheoffset_thisextent);
assert(diskoffset_thisextent < disk_offset + nbytes);
assert(diskoffset_thisextent >= disk_offset);
assert(nbytes_thisextent <= (disk_offset + nbytes) - diskoffset_thisextent);
assert(buf_extent_p - buf_map_p == diskoffset_thisextent - disk_offset);
assert(buf_map_p + nbytes >= buf_extent_p + nbytes_thisextent);
assert(ce_idx > 0 || disk_offset == diskoffset_thisextent);
assert(ce_idx < (num_extents - 1) || disk_offset + nbytes == diskoffset_thisextent + nbytes_thisextent);
if (!BC_blocks_present(ce,
CB_BYTE_TO_BLOCK(BC_cache->c_mounts + cm_idx, diskoffset_thisextent - ce->ce_diskoffset),
CB_BYTE_TO_BLOCK(BC_cache->c_mounts + cm_idx, nbytes_thisextent))) {
if (take_detailed_stats)
BC_ADD_STAT(hit_blkmissing, 1);
vnode_put(BC_cache->c_vp);
buf_unmap(bp);
goto bypass;
}
resid = nbytes_thisextent;
uio = uio_create(1, cacheoffset_thisextent, UIO_SYSSPACE, UIO_READ);
if (uio == NULL) {
debug("couldn't allocate uio");
if (take_detailed_stats)
BC_ADD_STAT(hit_failure, 1);
vnode_put(BC_cache->c_vp);
buf_unmap(bp);
goto bypass;
}
kret = uio_addiov(uio, CAST_USER_ADDR_T(buf_extent_p), nbytes_thisextent);
if (kret != KERN_SUCCESS) {
debug("couldn't add iov to uio - %d", kret);
if (take_detailed_stats)
BC_ADD_STAT(hit_failure, 1);
vnode_put(BC_cache->c_vp);
buf_unmap(bp);
goto bypass;
}
kret = cluster_copy_ubc_data(BC_cache->c_vp, uio, &resid, 1);
if (kret != KERN_SUCCESS) {
debug("couldn't copy ubc data - %d", kret);
if (take_detailed_stats)
BC_ADD_STAT(hit_failure, 1);
vnode_put(BC_cache->c_vp);
buf_unmap(bp);
goto bypass;
}
if (resid != 0) {
if (take_detailed_stats)
BC_ADD_STAT(hit_stolen, 1);
vnode_put(BC_cache->c_vp);
buf_unmap(bp);
goto bypass;
}
buf_extent_p += nbytes_thisextent;
discards += BC_discard_bytes(ce, disk_offset, nbytes_thisextent);
UNLOCK_EXTENT(ce);
ce = NULL;
uio_free(uio);
uio = NULL;
}
vnode_put(BC_cache->c_vp);
buf_setresid(bp, 0);
buf_unmap(bp);
buf_biodone(bp);
#else //ifndef EMULATE_ONLY
(void) kret;
(void) p;
(void) resid;
for (ce_idx = 0;
ce_idx < num_extents;
ce_idx++) {
ce = pcontaining_extents[ce_idx];
LOCK_EXTENT(ce);
discards += BC_discard_bytes(ce, disk_offset, nbytes);
UNLOCK_EXTENT(ce);
ce = NULL;
}
BC_cache->c_strategy(bp);
#endif //ifndef EMULATE_ONLY else
if (take_detailed_stats) {
BC_ADD_STAT(hit_bytes, discards);
if (cm_idx < STAT_MOUNTMAX) {
BC_ADD_STAT(hit_bytes_m[cm_idx], discards);
}
if (dont_cache) {
BC_ADD_STAT(strategy_hit_nocache, 1);
BC_ADD_STAT(hit_nocache_bytes, discards);
}
if (is_shared) {
BC_ADD_STAT(hit_shared_bytes, discards);
}
} else {
BC_ADD_STAT(hit_bytes_afterhistory, discards);
}
BC_cache->c_num_ios_since_last_hit = 0;
OSAddAtomic(-1, &BC_cache->c_strategycalls);
UNLOCK_CACHE_R();
_FREE_ZERO(pcontaining_extents, M_TEMP);
if (! dont_cache) {
BC_add_history(blkno, nbytes, pid, 1, 0, 0, is_shared, dev);
}
#define IO_SATISFIED_BY_CACHE ((int)0xcafefeed)
return IO_SATISFIED_BY_CACHE;
bypass:
if (ce != NULL)
UNLOCK_EXTENT(ce);
if (uio != NULL)
uio_free(uio);
_FREE_ZERO(pcontaining_extents, M_TEMP);
BC_cache->c_strategy(bp);
bp = NULL;
if (during_cache) {
if (cm_idx != -1) {
LOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
if (BC_cache->c_mounts[cm_idx].cm_state == CM_READY) {
discards += BC_handle_discards(BC_cache->c_mounts + cm_idx, disk_offset, nbytes);
}
if (BC_cache->c_readahead_throttles_cutthroughs &&
!is_swap &&
BC_cache->c_mounts[cm_idx].cm_disk &&
(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_HAS_THREAD) &&
!(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_ISSUE_LOWPRI) &&
!(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_IS_SSD)) {
should_throttle = 1;
}
UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
}
if (take_detailed_stats) {
BC_ADD_STAT(strategy_bypassed, 1);
if (during_io) {
BC_ADD_STAT(strategy_bypass_duringio, 1);
}
if (dont_cache) {
BC_ADD_STAT(strategy_bypass_nocache, 1);
BC_ADD_STAT(bypass_nocache_bytes, nbytes);
if (during_io) {
BC_ADD_STAT(strategy_bypass_duringio_nocache, 1);
}
}
if (cache_hit) {
BC_ADD_STAT(error_discards, discards);
} else if (dont_cache) {
BC_ADD_STAT(bypass_nocache_discards, discards);
} else if (bufflags & B_READ) {
BC_ADD_STAT(read_discards, discards);
} else {
BC_ADD_STAT(write_discards, discards);
}
} else {
BC_ADD_STAT(lost_bytes_afterhistory, discards);
}
}
if (during_cache) {
OSAddAtomic(-1, &BC_cache->c_strategycalls);
UNLOCK_CACHE_R();
}
if (! is_swap) {
if (! dont_cache) {
is_root_disk = BC_add_history(blkno, nbytes, pid, 0, ((bufflags & B_READ) ? 0 : 1), 0, is_shared, dev);
if (take_detailed_stats && during_io && is_root_disk) {
if (cache_hit) {
if (unfilled) {
BC_ADD_STAT(strategy_bypass_duringio_unfilled, 1);
} else {
BC_ADD_STAT(strategy_bypass_duringio_rootdisk_failure, 1);
}
} else if (bufflags & B_READ) {
BC_ADD_STAT(strategy_bypass_duringio_rootdisk_read, 1);
} else {
BC_ADD_STAT(strategy_bypass_duringio_rootdisk_nonread, 1);
}
}
}
}
if ((bufflags & B_READ) &&
(during_cache) &&
!(bufflags & B_THROTTLED_IO)) {
struct timeval current_time;
if (BC_cache->c_stats.ss_history_num_recordings < 2) {
microtime(¤t_time);
timersub(¤t_time,
&BC_cache->c_loadtime,
¤t_time);
}
if (BC_cache->c_stats.ss_history_num_recordings >= 2 || current_time.tv_sec > BC_chit_prelogin_timeout_seconds) {
OSAddAtomic(1, &BC_cache->c_num_ios_since_last_hit);
if (BC_cache->c_num_ios_since_last_hit >= BC_chit_max_num_IOs_since_last_hit) {
debug("hit rate below threshold (0 hits in the last %u lookups)",
BC_cache->c_num_ios_since_last_hit);
if (BC_terminate_cache()) {
message("could not terminate cache on bad hitrate");
}
}
}
}
if (is_swap && (! (BC_cache->c_flags & BC_FLAG_SHUTDOWN))) {
debug("Detected %s swap file. Terminating readahead", (bufflags & B_READ) ? "read from" : "write to");
BC_set_flag(BC_FLAG_SHUTDOWN);
}
if (should_throttle && !(bufflags & B_THROTTLED_IO)) {
BC_ADD_STAT(strategy_forced_throttled, 1);
#define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
return IO_SHOULD_BE_THROTTLED;
}
return 0;
}
static int
BC_handle_discards(struct BC_cache_mount *cm, u_int64_t offset, u_int64_t length)
{
struct BC_cache_extent **pce, **p;
int count, total_discards;
total_discards = 0;
if ((pce = BC_find_extent(cm, offset, length, 0, NULL)) == NULL)
return 0;
LOCK_EXTENT(*pce);
count = BC_discard_bytes((*pce), offset, length);
UNLOCK_EXTENT(*pce);
total_discards += count;
p = pce - 1;
while (p >= cm->cm_pextents &&
BC_check_intersection((*p), offset, length)) {
LOCK_EXTENT(*p);
count = BC_discard_bytes((*p), offset, length);
UNLOCK_EXTENT(*p);
if (count == 0)
break;
total_discards += count;
p--;
}
p = pce + 1;
while (p < (cm->cm_pextents + cm->cm_nextents) &&
BC_check_intersection((*p), offset, length)) {
LOCK_EXTENT(*p);
count = BC_discard_bytes((*p), offset, length);
UNLOCK_EXTENT(*p);
if (count == 0)
break;
total_discards += count;
p++;
}
return total_discards;
}
static int
BC_terminate_readahead(void)
{
int error;
struct timespec timeout;
timeout.tv_sec = 10;
timeout.tv_nsec = 0;
BC_set_flag(BC_FLAG_SHUTDOWN);
LOCK_READERS();
while (BC_cache->c_num_reader_threads > 0) {
debug("terminating active readahead");
error = msleep(&BC_cache->c_num_reader_threads, &BC_cache->c_reader_threads_lock, PRIBIO, "BC_terminate_readahead", &timeout);
if (error == EWOULDBLOCK) {
UNLOCK_READERS();
message("timed out waiting for I/O to stop");
if (BC_cache->c_num_reader_threads == 0) {
debug("but I/O has stopped!");
}
#ifdef BC_DEBUG
Debugger("I/O Kit wedged on us");
#endif
return(EBUSY);
}
}
UNLOCK_READERS();
return(0);
}
static int
BC_terminate_cache(void)
{
int retry, cm_idx, j, ce_idx, cel_idx;
BC_terminate_readahead();
if (BC_cache->c_num_reader_threads > 0) {
debug("cannot terminate cache while readahead is in progress");
return(EBUSY);
}
LOCK_CACHE_R();
LOCK_HANDLERS();
if (!BC_clear_flag(BC_FLAG_CACHEACTIVE)) {
debug("cannot terminate cache when not active");
UNLOCK_HANDLERS();
UNLOCK_CACHE_R();
return(ENXIO);
}
debug("terminating cache...");
BC_check_handlers();
UNLOCK_HANDLERS();
for (cel_idx = 0;
cel_idx < BC_cache->c_nextentlists;
cel_idx++) {
for (ce_idx = 0;
ce_idx < BC_cache->c_nextents[cel_idx];
ce_idx++) {
struct BC_cache_extent *ce = BC_cache->c_extentlists[cel_idx] + ce_idx;
LOCK_EXTENT(ce);
if (ce->ce_blockmap != NULL && BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize != 0) {
for (j = 0; j < howmany(ce->ce_length, BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize); j++) {
if (CB_BLOCK_PRESENT(ce, j))
BC_ADD_STAT(spurious_bytes, BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize);
}
}
BC_teardown_extent(ce);
UNLOCK_EXTENT(ce);
wakeup(ce);
}
}
for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
struct BC_cache_mount *cm = BC_cache->c_mounts + cm_idx;
LOCK_MOUNT_W(cm);
BC_teardown_mount(cm);
UNLOCK_MOUNT_W(cm);
}
retry = 0;
while (BC_cache->c_strategycalls > 0) {
tsleep(BC_cache, PRIBIO, "BC_terminate_cache", 10);
if (retry++ > 50) {
message("could not terminate cache, timed out with %d caller%s in BC_strategy",
(int) BC_cache->c_strategycalls,
BC_cache->c_strategycalls == 1 ? "" : "s");
UNLOCK_CACHE_R();
return(EBUSY);
}
}
if (! LOCK_CACHE_R_TO_W()) {
message("Unable to upgrade cache lock to free resources");
return ENXIO;
}
for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
struct BC_cache_mount *cm = BC_cache->c_mounts + cm_idx;
if (cm->cm_disk != NULL) {
for (j = cm_idx + 1; j < BC_cache->c_nmounts; j++) {
if (BC_cache->c_mounts[j].cm_disk == cm->cm_disk) {
BC_cache->c_mounts[j].cm_disk = NULL;
}
}
BC_teardown_disk(cm->cm_disk);
lck_mtx_destroy(&cm->cm_disk->cd_lock, BC_cache->c_lckgrp);
_FREE_ZERO(cm->cm_disk, M_TEMP);
}
}
BC_free_pagebuffer();
BC_cache->c_cachesize = 0;
for (cel_idx = 0;
cel_idx < BC_cache->c_nextentlists;
cel_idx++) {
for (ce_idx = 0;
ce_idx < BC_cache->c_nextents[cel_idx];
ce_idx++) {
lck_mtx_destroy(&BC_cache->c_extentlists[cel_idx][ce_idx].ce_lock, BC_cache->c_lckgrp);
}
_FREE_ZERO(BC_cache->c_extentlists[cel_idx], M_TEMP);
}
_FREE_ZERO(BC_cache->c_extentlists, M_TEMP);
_FREE_ZERO(BC_cache->c_nextents, M_TEMP);
BC_cache->c_nextentlists = 0;
for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
lck_rw_destroy(&BC_cache->c_mounts[cm_idx].cm_lock, BC_cache->c_lckgrp);
}
_FREE_ZERO(BC_cache->c_mounts, M_TEMP);
BC_cache->c_nmounts = 0;
UNLOCK_CACHE_W();
struct timeval endtime;
microtime(&endtime);
timersub(&endtime,
&BC_cache->c_cache_starttime,
&endtime);
BC_cache->c_stats.ss_cache_time += (u_int) endtime.tv_sec * 1000 + (u_int) endtime.tv_usec / 1000;
return(0);
}
static int
BC_terminate_history(void)
{
struct BC_history_mount_device *hmd;
LOCK_HANDLERS();
if (!BC_clear_flag(BC_FLAG_HISTORYACTIVE)) {
debug("cannot terminate history when not active");
UNLOCK_HANDLERS();
return(ENXIO);
}
debug("terminating history collection...");
BC_check_handlers();
UNLOCK_HANDLERS();
untimeout(BC_timeout_history, NULL);
if (BC_cache->c_take_detailed_stats) {
BC_cache->c_stats.ss_history_mount_no_uuid = 0;
for (hmd = BC_cache->c_history_mounts; hmd != NULL; hmd = hmd->hmd_next)
if (uuid_is_null(hmd->hmd_mount.hm_uuid))
BC_ADD_STAT(history_mount_no_uuid, 1);
}
if (BC_cache->c_take_detailed_stats) {
struct timeval endtime;
microtime(&endtime);
timersub(&endtime,
&BC_cache->c_history_starttime,
&endtime);
BC_ADD_STAT(history_time, (u_int) endtime.tv_sec * 1000 + (u_int) endtime.tv_usec / 1000);
}
BC_cache->c_take_detailed_stats = 0;
return(0);
}
static void
BC_check_handlers(void)
{
if (BC_cache->c_devsw == NULL ||
BC_cache->c_strategy == NULL ||
BC_cache->c_close == NULL) {
debug("Cannot check device handlers: cache not setup properly");
return;
}
if ((BC_cache->c_flags & BC_FLAG_CACHEACTIVE) ||
(BC_cache->c_flags & BC_FLAG_HISTORYACTIVE)) {
if (BC_cache->c_devsw->d_strategy != (strategy_fcn_t*) BC_strategy ||
BC_cache->c_devsw->d_close != BC_close) {
debug("connecting handlers...");
BC_cache->c_devsw->d_strategy = (strategy_fcn_t*) BC_strategy;
BC_cache->c_devsw->d_close = BC_close;
}
} else {
if (BC_cache->c_devsw->d_strategy != BC_cache->c_strategy ||
BC_cache->c_devsw->d_close != BC_cache->c_close) {
debug("disconnecting handlers...");
BC_cache->c_devsw->d_strategy = BC_cache->c_strategy;
BC_cache->c_devsw->d_close = BC_cache->c_close;
}
}
}
static int
BC_setup_extent(struct BC_cache_extent *ce, const struct BC_playlist_entry *pe, int batch_adjustment, int cache_mount_idx)
{
lck_mtx_init(&ce->ce_lock, BC_cache->c_lckgrp,
LCK_ATTR_NULL);
ce->ce_diskoffset = pe->pe_offset;
ce->ce_mount_idx = cache_mount_idx;
ce->ce_length = pe->pe_length;
#ifdef IGNORE_BATCH
ce->ce_batch = 0;
#else
ce->ce_batch = pe->pe_batch;
#endif
ce->ce_batch += batch_adjustment;
if (ce->ce_batch > BC_MAXBATCHES) {
ce->ce_batch = BC_MAXBATCHES;
}
ce->ce_cacheoffset = 0;
ce->ce_blockmap = NULL;
ce->ce_flags = 0;
if (pe->pe_flags & BC_PE_LOWPRIORITY) {
ce->ce_flags |= CE_LOWPRI;
}
if (pe->pe_flags & BC_PE_SHARED) {
ce->ce_flags |= CE_SHARED;
}
if (ce->ce_batch >= BC_cache->c_batch_count) {
BC_cache->c_batch_count = ce->ce_batch + 1;
}
return 0;
}
#define BLOCK_ROUNDDOWN(cm, x) (((x) / (cm)->cm_blocksize) * (cm)->cm_blocksize)
#define BLOCK_ROUNDUP(cm, x) ((((x) + ((cm)->cm_blocksize - 1)) / (cm)->cm_blocksize) * (cm)->cm_blocksize)
static int
BC_fill_in_extent(struct BC_cache_extent *ce)
{
int numblocks, roundsize, i;
u_int64_t end;
if (ce->ce_flags & CE_ABORTED ||
ce->ce_length == 0) {
return 1;
}
struct BC_cache_mount *cm = BC_cache->c_mounts + ce->ce_mount_idx;
int blocksize = cm->cm_blocksize;
if (0 == blocksize) {
return 1;
}
roundsize = roundup(ce->ce_length, PAGE_SIZE);
end = ce->ce_diskoffset + roundsize;
ce->ce_diskoffset = BLOCK_ROUNDDOWN(cm, ce->ce_diskoffset);
ce->ce_length = BLOCK_ROUNDUP(cm, end) - ce->ce_diskoffset;
if (roundup(ce->ce_length, PAGE_SIZE) > roundsize) {
debug("Clipped extent %llu by a page", ce->ce_diskoffset);
BC_ADD_STAT(extents_clipped, 1);
ce->ce_length = roundsize;
}
numblocks = howmany(ce->ce_length, blocksize);
ce->ce_blockmap = BC_MALLOC(howmany(numblocks, (CB_MAPFIELDBITS / CB_MAPFIELDBYTES)),
M_TEMP, M_WAITOK | M_ZERO);
if (!ce->ce_blockmap) {
message("can't allocate extent blockmap for %d blocks of %d bytes", numblocks, howmany(numblocks, (CB_MAPFIELDBITS / CB_MAPFIELDBYTES)));
return 1;
}
for (i = 0; i < howmany(ce->ce_length, blocksize); i++) {
CB_MARK_BLOCK_PRESENT((ce), i);
}
return 0;
}
static void
BC_teardown_extent(struct BC_cache_extent *ce)
{
ce->ce_flags |= CE_ABORTED;
_FREE_ZERO(ce->ce_blockmap, M_TEMP);
}
static int
BC_setup_disk(struct BC_cache_disk *cd, u_int64_t disk_id, int is_ssd)
{
static int next_disk_num;
lck_mtx_init(&cd->cd_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
cd->cd_disk_id = disk_id;
cd->cd_disk_num = next_disk_num++;
cd->cd_flags = 0;
cd->cd_nmounts = 0;
cd->cd_batch = 0;
if (is_ssd) {
cd->cd_flags |= CD_IS_SSD;
}
debug("Setup disk 0x%llx as disk num %d%s", disk_id, cd->cd_disk_num, (cd->cd_flags & CD_IS_SSD) ? " (ssd)" : "");
return (0);
}
static void
BC_teardown_disk(struct BC_cache_disk *cd)
{
}
static void
BC_mount_available(struct BC_cache_mount *cm)
{
#ifdef EMULATE_ONLY
int i;
for (i = 0; i < cm->cm_nextents; i++)
cm->cm_pextents[i]->ce_flags |= CE_IODONE;
#else
int i, error;
thread_t rthread;
if (cm->cm_state != CM_READY) {
return;
}
struct BC_cache_disk *cd = cm->cm_disk;
LOCK_DISK(cd);
cd->cd_nmounts++;
LOCK_READERS();
if (!(BC_cache->c_flags & BC_FLAG_SHUTDOWN)) {
if (cd->cd_flags & CD_HAS_THREAD) {
if (cd->cd_flags & CD_ISSUE_LOWPRI) {
debug("Interrupting low-priority thread for new mount %s", uuid_string(cm->cm_uuid));
cd->cd_flags &= (~CD_ISSUE_LOWPRI);
}
UNLOCK_READERS();
UNLOCK_DISK(cd);
return;
}
cd->cd_flags &= (~CD_ISSUE_LOWPRI);
debug("Kicking off reader thread for disk %d for mount %s", cd->cd_disk_num, uuid_string(cm->cm_uuid));
if ((error = kernel_thread_start(BC_reader_thread, cd, &rthread)) == KERN_SUCCESS) {
thread_deallocate(rthread);
BC_cache->c_num_reader_threads++;
BC_ADD_STAT(readahead_threads, 1);
cd->cd_flags |= CD_HAS_THREAD;
UNLOCK_READERS();
UNLOCK_DISK(cd);
return;
}
message("Unable to start reader thread for disk %d: %d", cd->cd_disk_num, error);
}
UNLOCK_READERS();
UNLOCK_DISK(cd);
for (i = 0; i < cm->cm_nextents; i++) {
LOCK_EXTENT(cm->cm_pextents[i]);
BC_teardown_extent(cm->cm_pextents[i]);
UNLOCK_EXTENT(cm->cm_pextents[i]);
wakeup(cm->cm_pextents[i]);
}
#endif
}
static int
BC_setup_mount(struct BC_cache_mount *cm, struct BC_playlist_mount* pm)
{
int error = 0;
lck_rw_init(&cm->cm_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
uuid_copy(cm->cm_uuid, pm->pm_uuid);
cm->cm_dev = nulldev();
cm->cm_bp = NULL;
cm->cm_devsize = 0;
cm->cm_maxread = 0;
cm->cm_disk = NULL;
cm->cm_blocksize = 0;
cm->cm_nextents = 0;
if (pm->pm_nentries == 0) {
message("Playlist incuded mount with 0 entries");
error = EINVAL;
goto out;
}
cm->cm_pextents = BC_MALLOC(pm->pm_nentries * sizeof(*cm->cm_pextents), M_TEMP, M_WAITOK | M_ZERO);
if (cm->cm_pextents == NULL) {
message("can't allocate mount's extent array (%d entries)", pm->pm_nentries);
error = ENOMEM;
goto out;
}
cm->cm_state = CM_SETUP;
out:
if (error) {
cm->cm_state = CM_ABORTED;
_FREE_ZERO(cm->cm_pextents, M_TEMP);
lck_rw_destroy(&cm->cm_lock, BC_cache->c_lckgrp);
}
return error;
}
static int
BC_fill_in_mount(struct BC_cache_mount *cm, mount_t mount, vfs_context_t context)
{
u_int64_t blkcount;
u_int32_t blksize;
int error, mount_idx, i, is_ssd, max_byte_count;
u_int64_t disk_id;
struct BC_cache_disk *cd;
vnode_t devvp = NULLVP;
error = 0;
if (CM_SETUP != cm->cm_state) {
return EINVAL;
}
cm->cm_throttle_mask = vfs_throttle_mask(mount);
disk_id = cm->cm_throttle_mask;
devvp = vfs_devvp(mount);
if (devvp == NULLVP) {
message("mount %s does not have a vnode", uuid_string(cm->cm_uuid));
error = EINVAL;
goto out;
}
#ifdef ROOT_DISK_ONLY
if (devvp == rootvp) {
BC_cache->c_root_disk_id = disk_id;
assert(BC_cache->c_root_disk_id != 0);
} else if (0 == BC_cache->c_root_disk_id) {
error = EAGAIN;
goto out;
} else if (BC_cache->c_root_disk_id != disk_id) {
debug("mount %s (disk 0x%llx) is not on the root disk (disk 0x%llx)", uuid_string(cm->cm_uuid), disk_id, BC_cache->c_root_disk_id);
error = ENODEV;
goto out;
}
#endif
for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) {
if (BC_cache->c_mounts + mount_idx == cm) continue;
cd = BC_cache->c_mounts[mount_idx].cm_disk;
if (cd && cd->cd_disk_id == disk_id) {
cm->cm_disk = cd;
break;
}
}
if (cm->cm_disk == NULL) {
cd = BC_MALLOC(sizeof(*cd), M_TEMP, M_WAITOK | M_ZERO);
if (cd == NULL) {
message("can't allocate memory for cache disk");
error = ENOMEM;
goto out;
}
if (VNOP_IOCTL(devvp,
DKIOCISSOLIDSTATE,
(caddr_t)&is_ssd,
0,
context))
{
message("can't determine if disk is a solid state disk for mount %s", uuid_string(cm->cm_uuid));
is_ssd = 0;
}
if ((error = BC_setup_disk(cd, disk_id, is_ssd)) != 0) {
_FREE_ZERO(cd, M_TEMP);
message("cache disk setup failed: %d", error);
goto out;
}
cm->cm_disk = cd;
}
#ifdef DONT_CACHE_SSDS
if (cm->cm_disk->cd_flags & CD_IS_SSD) {
debug("mount %s is on an SSD (disk 0x%llx)", uuid_string(cm->cm_uuid), cm->cm_disk->cd_disk_id);
error = ENODEV;
goto out;
}
#endif
if (VNOP_IOCTL(devvp,
DKIOCGETBLOCKCOUNT,
(caddr_t)&blkcount,
0,
context)
|| blkcount == 0)
{
message("can't determine device size, not checking");
blkcount = 0;
}
if (VNOP_IOCTL(devvp,
DKIOCGETBLOCKSIZE,
(caddr_t)&blksize,
0,
context)
|| blksize == 0)
{
message("can't determine device block size for mount %s, defaulting to 512 bytes", uuid_string(cm->cm_uuid));
blksize = 512;
}
if (PAGE_SIZE % blksize != 0) {
message("PAGE_SIZE (%d) is not a multiple of block size (%d) for mount %s", PAGE_SIZE, blksize, uuid_string(cm->cm_uuid));
error = EINVAL;
goto out;
}
cm->cm_blocksize = blksize;
cm->cm_devsize = blksize * blkcount;
VNOP_IOCTL(devvp,
DKIOCGETMAXSEGMENTBYTECOUNTREAD,
(caddr_t)&cm->cm_maxread,
0,
context);
if (0 == VNOP_IOCTL(devvp,
DKIOCGETMAXBYTECOUNTREAD,
(caddr_t)&max_byte_count,
0,
context))
{
if (cm->cm_maxread > max_byte_count && max_byte_count > 0) {
cm->cm_maxread = max_byte_count;
}
}
if (cm->cm_maxread > ubc_upl_maxbufsize()) {
cm->cm_maxread = ubc_upl_maxbufsize();
}
if (cm->cm_maxread < MAXPHYS)
{
debug("can't determine device read size for mount %s; using default", uuid_string(cm->cm_uuid));
cm->cm_maxread = MAXPHYS;
}
if (cm->cm_maxread % cm->cm_blocksize != 0) {
debug("Mount max IO size (%llu) not a multiple of block size (%llu)", cm->cm_maxread, cm->cm_blocksize);
cm->cm_maxread -= cm->cm_maxread % cm->cm_blocksize;
}
cm->cm_bp = buf_alloc(devvp);
if (cm->cm_bp == NULL) {
message("can't allocate buf");
error = ENOMEM;
goto out;
}
cm->cm_dev = vnode_specrdev(devvp);
if (cm->cm_dev == nulldev()) {
message("mount %s does not have a device", uuid_string(cm->cm_uuid));
error = EINVAL;
goto out;
}
for (i = 0; i < cm->cm_nextents; i++) {
LOCK_EXTENT(cm->cm_pextents[i]);
if (0 != BC_fill_in_extent(cm->cm_pextents[i])) {
BC_teardown_extent(cm->cm_pextents[i]);
UNLOCK_EXTENT(cm->cm_pextents[i]);
wakeup(cm->cm_pextents[i]);
} else {
UNLOCK_EXTENT(cm->cm_pextents[i]);
}
}
cm->cm_state = CM_READY;
debug("Found new cache mount %s disk %d, %d extents", uuid_string(cm->cm_uuid), cm->cm_disk->cd_disk_num, cm->cm_nextents);
out:
if (error && error != EAGAIN) {
for (i = 0; i < cm->cm_nextents; i++) {
LOCK_EXTENT(cm->cm_pextents[i]);
BC_teardown_extent(cm->cm_pextents[i]);
UNLOCK_EXTENT(cm->cm_pextents[i]);
wakeup(cm->cm_pextents[i]);
}
BC_teardown_mount(cm);
}
if (devvp != NULLVP) {
vnode_put(devvp);
}
return error;
}
static void
BC_teardown_mount(struct BC_cache_mount *cm)
{
if (cm->cm_bp) {
buf_free(cm->cm_bp);
cm->cm_bp = NULL;
}
cm->cm_nextents = 0;
_FREE_ZERO(cm->cm_pextents, M_TEMP);
cm->cm_state = CM_ABORTED;
}
static int fill_in_bc_cache_mounts(mount_t mount, void* arg)
{
int mount_idx, error, i;
struct vfs_attr attr;
vfs_context_t context;
int* go_again = (int*) arg;
VFSATTR_INIT(&attr);
VFSATTR_WANTED(&attr, f_uuid);
context = vfs_context_create(NULL);
error = vfs_getattr(mount, &attr, context);
if ((0 != error) || (! VFSATTR_IS_SUPPORTED(&attr, f_uuid))) {
#ifdef BC_DEBUG
char name[MFSNAMELEN];
vfs_name(mount, name);
if (strncmp("devfs", name, sizeof("devfs")) != 0) {
debug("Found mount %s for IO without a UUID: error %d", name, error);
}
#endif
vfs_context_rele(context);
return VFS_RETURNED;
} else {
for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) {
struct BC_cache_mount *cm = BC_cache->c_mounts + mount_idx;
int match = 0;
if (CM_SETUP == cm->cm_state) {
if (0 == uuid_compare(cm->cm_uuid, attr.f_uuid)) {
match = 1;
}
if ((!match) && uuid_is_null(cm->cm_uuid)) {
vnode_t devvp = vfs_devvp(mount);
if (vnode_specrdev(devvp) == rootdev) {
uuid_copy(cm->cm_uuid, attr.f_uuid);
match = 1;
debug("Found root mount %s", uuid_string(cm->cm_uuid));
}
vnode_put(devvp);
}
if (match) {
if (BC_fill_in_mount(cm, mount, context) == EAGAIN) {
*go_again = 1;
}
vfs_context_rele(context);
for (i = 0; i < BC_cache->c_nmounts; i++) {
if (CM_SETUP == BC_cache->c_mounts[i].cm_state) {
return VFS_RETURNED;
}
}
return VFS_RETURNED_DONE;
}
}
}
}
vfs_context_rele(context);
for (i = 0; i < BC_cache->c_nmounts; i++) {
if (CM_SETUP == BC_cache->c_mounts[i].cm_state) {
return VFS_RETURNED;
}
}
debug("No new mounts filled in fill_in_bc_cache_mounts");
return VFS_RETURNED_DONE;
}
#ifndef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET
static int
compare_cache_extents(const void *vfirst, const void *vsecond)
{
const struct BC_cache_extent *first, *second;
first = (const struct BC_cache_extent *)vfirst;
second = (const struct BC_cache_extent *)vsecond;
int mount_comparison = first->ce_mount_idx - second->ce_mount_idx;
if (mount_comparison != 0)
return((mount_comparison < 0) ? -1 : 1);
if (first->ce_diskoffset == second->ce_diskoffset)
return(0);
return((first->ce_diskoffset < second->ce_diskoffset) ? -1 : 1);
}
#endif
static int
BC_copyin_playlist(size_t mounts_size, user_addr_t mounts_buf, size_t entries_size, user_addr_t entries_buf)
{
int nplaylist_mounts, nplaylist_entries;
struct BC_playlist_mount *playlist_mounts = NULL, *pm;
struct BC_playlist_entry *playlist_entries = NULL, *pe;
struct BC_cache_mount *cm, *old_cm;
struct BC_cache_extent *ce;
int ncache_mounts = 0, ncache_extents = 0, nnew_mounts = 0;
struct BC_cache_mount *cache_mounts = NULL;
struct BC_cache_extent *cache_extents = NULL;
int *pmidx_to_cmidx = NULL;
int *next_old_extent_idx = NULL;
int *cache_nextents = NULL;
struct BC_cache_extent **cache_extents_list = NULL;
int old_batch_count;
u_int64_t old_cache_size;
int error, pm_idx, cm_idx, pe_idx, ce_idx, cel_idx, pm_idx2, max_entries;
off_t p;
u_int64_t size;
int actual, remaining_entries;
int clipped_extents = 0, enveloped_extents = 0;
playlist_mounts = NULL;
playlist_entries = NULL;
#ifdef BC_DEBUG
struct timeval start_time;
microtime(&start_time);
#endif
old_batch_count = BC_cache->c_batch_count;
old_cache_size = BC_cache->c_cachesize;
nplaylist_mounts = (int) mounts_size / sizeof(*playlist_mounts);
nplaylist_entries = (int) entries_size / sizeof(*playlist_entries);
if (BC_cache->c_stats.ss_total_extents > 0) {
debug("adding %u extents to existing cache of %u extents", nplaylist_entries, BC_cache->c_stats.ss_total_extents);
} else {
debug("setting up first cache of %d extents", nplaylist_entries);
}
for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
debug("Mount %s has %d extents", uuid_string(BC_cache->c_mounts[cm_idx].cm_uuid), BC_cache->c_mounts[cm_idx].cm_nextents);
}
if (BC_preloaded_playlist) {
playlist_mounts = CAST_DOWN(struct BC_playlist_mount *, mounts_buf);
} else {
playlist_mounts = BC_MALLOC(mounts_size, M_TEMP, M_WAITOK);
if (playlist_mounts == NULL) {
message("can't allocate unpack mount buffer of %ld bytes", mounts_size);
error = ENOMEM;
goto out;
}
if ((error = copyin(mounts_buf, playlist_mounts, mounts_size)) != 0) {
message("copyin %ld bytes from 0x%llx to %p failed", mounts_size, mounts_buf, playlist_mounts);
goto out;
}
}
pmidx_to_cmidx = BC_MALLOC(nplaylist_mounts * sizeof(*pmidx_to_cmidx), M_TEMP, M_WAITOK);
if (pmidx_to_cmidx == NULL) {
message("can't allocate playlist index to cache index reference array for %d mounts", nplaylist_mounts);
error = ENOMEM;
goto out;
}
if (BC_cache->c_nmounts > 0) {
next_old_extent_idx = BC_MALLOC(BC_cache->c_nmounts * sizeof(*next_old_extent_idx), M_TEMP, M_WAITOK | M_ZERO);
if (next_old_extent_idx == NULL) {
message("can't allocate index array for %d mounts", BC_cache->c_nmounts);
error = ENOMEM;
goto out;
}
}
nnew_mounts = nplaylist_mounts;
for (pm_idx = 0; pm_idx < nplaylist_mounts; pm_idx++) {
pmidx_to_cmidx[pm_idx] = -1;
for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
if (0 == uuid_compare(BC_cache->c_mounts[cm_idx].cm_uuid, playlist_mounts[pm_idx].pm_uuid)) {
pmidx_to_cmidx[pm_idx] = cm_idx;
nnew_mounts--;
break;
}
}
for (pm_idx2 = pm_idx + 1; pm_idx2 < nplaylist_mounts; pm_idx2++) {
if (0 == uuid_compare(playlist_mounts[pm_idx2].pm_uuid, playlist_mounts[pm_idx].pm_uuid)) {
message("Playlist had duplicate mount %s", uuid_string(playlist_mounts[pm_idx2].pm_uuid));
error = EINVAL;
goto out;
}
}
}
cache_mounts = BC_MALLOC((BC_cache->c_nmounts + nnew_mounts) * sizeof(*cache_mounts), M_TEMP, M_WAITOK | M_ZERO);
if (cache_mounts == NULL) {
message("can't allocate memory for %d mounts", (BC_cache->c_nmounts + nnew_mounts));
error = ENOMEM;
goto out;
}
memcpy(cache_mounts, BC_cache->c_mounts, (BC_cache->c_nmounts * sizeof(*BC_cache->c_mounts)));
ncache_mounts = BC_cache->c_nmounts;
for (pm_idx = 0; pm_idx < nplaylist_mounts; pm_idx++) {
if (pmidx_to_cmidx[pm_idx] != -1) {
if (playlist_mounts[pm_idx].pm_nentries == 0) {
message("Playlist incuded mount with 0 entries");
error = EINVAL;
goto out;
}
cm_idx = pmidx_to_cmidx[pm_idx];
assert(cm_idx < BC_cache->c_nmounts);
cache_mounts[cm_idx].cm_pextents = BC_MALLOC((BC_cache->c_mounts[cm_idx].cm_nextents + playlist_mounts[pm_idx].pm_nentries) * sizeof(*cache_mounts[cm_idx].cm_pextents), M_TEMP, M_WAITOK | M_ZERO);
if (cache_mounts[cm_idx].cm_pextents == NULL) {
message("can't allocate mount's extent array (%d entries)", (BC_cache->c_mounts[cm_idx].cm_nextents + playlist_mounts[pm_idx].pm_nentries));
error = ENOMEM;
goto out;
}
cache_mounts[cm_idx].cm_nextents = 0;
} else {
if ((error = BC_setup_mount(cache_mounts + ncache_mounts, playlist_mounts + pm_idx)) != 0) {
goto out;
}
pmidx_to_cmidx[pm_idx] = ncache_mounts;
ncache_mounts++;
}
}
cache_nextents = BC_MALLOC((BC_cache->c_nextentlists + 1) * sizeof(*BC_cache->c_nextents), M_TEMP, M_WAITOK | M_ZERO);
if (cache_nextents == NULL) {
message("can't allocate memory for %d extent list sizes", nplaylist_entries);
error = ENOMEM;
goto out;
}
memcpy(cache_nextents, BC_cache->c_nextents, BC_cache->c_nextentlists * sizeof(*BC_cache->c_nextents));
cache_extents_list = BC_MALLOC((BC_cache->c_nextentlists + 1) * sizeof(*BC_cache->c_extentlists), M_TEMP, M_WAITOK | M_ZERO);
if (cache_extents_list == NULL) {
message("can't allocate memory for %d extent lists", nplaylist_entries);
error = ENOMEM;
goto out;
}
memcpy(cache_extents_list, BC_cache->c_extentlists, BC_cache->c_nextentlists * sizeof(*BC_cache->c_extentlists));
cache_extents = BC_MALLOC(nplaylist_entries * sizeof(*cache_extents), M_TEMP, M_WAITOK | M_ZERO);
if (cache_extents == NULL) {
message("can't allocate memory for %d extents", nplaylist_entries);
error = ENOMEM;
goto out;
}
size = 0;
if (BC_preloaded_playlist) {
playlist_entries = CAST_DOWN(struct BC_playlist_entry *, entries_buf);
} else {
playlist_entries = BC_MALLOC(BC_PLC_CHUNK * sizeof(*playlist_entries),
M_TEMP, M_WAITOK);
if (playlist_entries == NULL) {
message("can't allocate unpack buffer for %d entries", BC_PLC_CHUNK);
error = ENOMEM;
goto out;
}
}
remaining_entries = nplaylist_entries;
while (remaining_entries > 0) {
if (BC_preloaded_playlist) {
actual = remaining_entries;
} else {
actual = min(remaining_entries, BC_PLC_CHUNK);
if ((error = copyin(entries_buf, playlist_entries,
actual * sizeof(struct BC_playlist_entry))) != 0) {
message("copyin from 0x%llx to %p failed", entries_buf, playlist_entries);
goto out;
}
}
for (pe_idx = 0; pe_idx < actual; pe_idx++) {
pe = playlist_entries + pe_idx;
if (pe->pe_length == 0) {
debug("Bad Playlist: entry %d has 0 length", (nplaylist_entries - remaining_entries) + pe_idx);
error = EINVAL;
goto out;
}
pm_idx = pe->pe_mount_idx;
if (pm_idx >= nplaylist_mounts || pm_idx < 0) {
message("Bad playlist: entry %d referenced non-existent mount index %d", (nplaylist_entries - remaining_entries) + pe_idx, pm_idx);
error = EINVAL;
goto out;
}
pm = playlist_mounts + pm_idx;
cm_idx = pmidx_to_cmidx[pm_idx];
cm = cache_mounts + cm_idx;
ce = cache_extents + ncache_extents;
max_entries = pm->pm_nentries + ((cm_idx < BC_cache->c_nmounts) ? BC_cache->c_mounts[cm_idx].cm_nextents : 0);
if (cm->cm_nextents >= max_entries) {
message("Bad playlist: more entries existed than the mount %s claimed (%d)", uuid_string(pm->pm_uuid), pm->pm_nentries);
error = EINVAL;
goto out;
}
if ((error = BC_setup_extent(ce, pe, old_batch_count, cm_idx)) != 0) {
goto out;
}
#ifdef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET
if (cm_idx < BC_cache->c_nmounts) {
old_cm = BC_cache->c_mounts + cm_idx;
while (next_old_extent_idx[cm_idx] < old_cm->cm_nextents &&
old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset <= ce->ce_diskoffset) {
cm->cm_pextents[cm->cm_nextents] = old_cm->cm_pextents[next_old_extent_idx[cm_idx]];
cm->cm_nextents++;
next_old_extent_idx[cm_idx]++;
}
if (next_old_extent_idx[cm_idx] < old_cm->cm_nextents) {
if (ce->ce_diskoffset + ce->ce_length > (old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset + old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_length)) {
debug("!!! Entry %d (0x%llx:%lld) is getting clipped too much because of previous entry for mount %s (0x%llx:%lld)", pe_idx, ce->ce_diskoffset, ce->ce_length, uuid_string(cm->cm_uuid), old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset, old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_length);
}
u_int64_t max_offset = old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset;
if (max_offset < ce->ce_diskoffset + ce->ce_length) {
if (max_offset <= ce->ce_diskoffset) {
ce->ce_length = 0;
} else {
ce->ce_length = (max_offset - ce->ce_diskoffset);
}
}
}
}
if (cm->cm_nextents > 0) {
u_int64_t min_offset = cm->cm_pextents[cm->cm_nextents - 1]->ce_diskoffset + cm->cm_pextents[cm->cm_nextents - 1]->ce_length;
if (min_offset > ce->ce_diskoffset) {
if (cm->cm_pextents[cm->cm_nextents - 1] >= cache_extents &&
cm->cm_pextents[cm->cm_nextents - 1] < cache_extents + ncache_extents) {
message("Bad playlist: entry %d (0x%llx:%lld) overlapped with previous entry for mount %s (0x%llx:%lld)", pe_idx, ce->ce_diskoffset, ce->ce_length, uuid_string(cm->cm_uuid), cm->cm_pextents[cm->cm_nextents - 1]->ce_diskoffset, cm->cm_pextents[cm->cm_nextents - 1]->ce_length);
error = EINVAL;
goto out;
}
if (min_offset >= ce->ce_diskoffset + ce->ce_length) {
ce->ce_length = 0;
} else {
ce->ce_length -= (min_offset - ce->ce_diskoffset);
ce->ce_diskoffset = min_offset;
}
}
}
if (ce->ce_length != pe->pe_length) {
clipped_extents++;
}
if (ce->ce_length == 0) {
enveloped_extents++;
BC_teardown_extent(ce);
lck_mtx_destroy(&ce->ce_lock, BC_cache->c_lckgrp);
continue;
}
size += roundup(ce->ce_length, PAGE_SIZE);
#endif
cm->cm_pextents[cm->cm_nextents] = ce;
cm->cm_nextents++;
ncache_extents++;
}
remaining_entries -= actual;
entries_buf += actual * sizeof(struct BC_playlist_entry);
}
for (pm_idx = 0; pm_idx < nplaylist_mounts; pm_idx++) {
cm_idx = pmidx_to_cmidx[pm_idx];
cm = cache_mounts + cm_idx;
pm = playlist_mounts + pm_idx;
if (cm->cm_nextents == 0) {
message("Bad playlist: No entries existed for mount %s (claimed %d)", uuid_string(pm->pm_uuid), pm->pm_nentries);
error = EINVAL;
goto out;
}
if (cm_idx < BC_cache->c_nmounts) {
if (next_old_extent_idx[cm_idx] < BC_cache->c_mounts[cm_idx].cm_nextents) {
memcpy(cm->cm_pextents + cm->cm_nextents, BC_cache->c_mounts[cm_idx].cm_pextents + next_old_extent_idx[cm_idx], sizeof(*cm->cm_pextents) * (BC_cache->c_mounts[cm_idx].cm_nextents - next_old_extent_idx[cm_idx]));
cm->cm_nextents += BC_cache->c_mounts[cm_idx].cm_nextents - next_old_extent_idx[cm_idx];
}
}
}
#ifndef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET
for (cm_idx = 0; cm_idx < ncache_mounts; cm_idx++) {
cm = cache_mounts + cm_idx;
if (cm_idx >= BC_cache->c_nmounts ||
cm->cm_pextents != BC_cache->c_mounts[cm_idx].cm_pextents) {
qsort(cm->cm_pextents,
cm->cm_nextents,
sizeof(*cm->cm_pextents),
compare_cache_extents);
for (ce_idx = 0; ce_idx < cm->cm_nextents; ce_idx++) {
if (cm->cm_pextents[ce_idx] >= cache_extents &&
cm->cm_pextents[ce_idx] < cache_extents + ncache_extents) {
u_int64_t old_length = ce->ce_length;
if (ce_idx > 0) {
u_int64_t min_offset = cm->cm_pextents[ce_idx - 1]->ce_diskoffset + cm->cm_pextents[ce_idx - 1]->ce_length;
if (min_offset > ce->ce_diskoffset) {
if (min_offset >= ce->ce_diskoffset + ce->ce_length) {
ce->ce_length = 0;
} else {
ce->ce_length -= (min_offset - ce->ce_diskoffset);
ce->ce_diskoffset = min_offset;
}
}
}
if (ce_idx < cm_nextents - 1) {
u_int64_t max_offset = cm->cm_pextents[ce_idx + 1]->ce_diskoffset;
if (max_offset < ce->ce_diskoffset + ce->ce_length) {
if (cm->cm_pextents[ce_idx + 1] >= cache_extents &&
cm->cm_pextents[ce_idx + 1] < cache_extents + ncache_extents) {
message("Bad playlist: entry %d (0x%llx:%lld) overlapped with previous entry for mount %s (0x%llx:%lld)", pe_idx, ce->ce_diskoffset, ce->ce_length, uuid_string(cm->cm_uuid), cm->cm_pextents[ce_idx + 1]->ce_diskoffset, cm->cm_pextents[ce_idx + 1]->ce_length);
error = EINVAL;
goto out;
}
if (max_offset <= ce->ce_diskoffset) {
ce->ce_length = 0;
} else {
ce->ce_length = (max_offset - ce->ce_diskoffset);
}
}
}
if (ce->ce_length != old_length) {
clipped_extents++;
if (ce->ce_length == 0) {
if ((ce_idx + 1) < cm->cm_nextents) {
memmove(cm->cm_pextents + ce_idx, cm->cm_pextents + (ce_idx + 1), cm->cm_nextents - (ce_idx + 1));
}
cm->cm_nextents--;
ce_idx--;
enveloped_extents++;
}
}
}
}
}
}
for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) {
size += roundup(cache_extents->ce_length, PAGE_SIZE);
}
#endif
if (clipped_extents > 0) {
debug("%d extents were clipped, %d of which were completely enveloped", clipped_extents, enveloped_extents);
}
if (ncache_extents == 0) {
debug("No extents added, not updating cache");
error = EALREADY;
goto out;
}
if (! BC_preloaded_playlist) {
_FREE_ZERO(playlist_entries, M_TEMP);
_FREE_ZERO(playlist_mounts, M_TEMP);
}
if (ncache_extents < nplaylist_entries) {
debug("%d extents added out of %d", ncache_extents, nplaylist_entries);
}
for (cm_idx = 0; cm_idx < ncache_mounts; cm_idx++) {
debug("Mount %s now has %d extents", uuid_string(cache_mounts[cm_idx].cm_uuid), cache_mounts[cm_idx].cm_nextents);
}
size = roundup(size, PAGE_SIZE * CB_MAPFIELDBITS);
BC_cache->c_cachesize += size;
if (BC_cache->c_cachesize > BC_MAX_SIZE) {
message("cache size (%lu bytes) too large for physical memory (%llu bytes), capping at %llu bytes",
(unsigned long)BC_cache->c_cachesize, max_mem, BC_MAX_SIZE);
BC_cache->c_cachesize = BC_MAX_SIZE;
}
if (BC_alloc_pagebuffer() != 0) {
message("can't allocate %lld bytes for cache buffer", BC_cache->c_cachesize);
error = ENOMEM;
goto out;
}
p = 0;
if (BC_cache->c_nextentlists > 0) {
for (cel_idx = BC_cache->c_nextentlists - 1; cel_idx >= 0; cel_idx--) {
if (BC_cache->c_nextents[cel_idx] > 0) {
struct BC_cache_extent *last_ce = BC_cache->c_extentlists[cel_idx] + BC_cache->c_nextents[cel_idx] - 1;
p = last_ce->ce_cacheoffset + roundup(last_ce->ce_length, PAGE_SIZE);
break;
}
}
}
for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) {
cache_extents[ce_idx].ce_cacheoffset = p;
p += roundup(cache_extents[ce_idx].ce_length, PAGE_SIZE);
if (p >= BC_cache->c_cachesize || cache_extents[ce_idx].ce_length == 0) {
BC_teardown_extent(cache_extents + ce_idx);
}
}
if (BC_cache->c_nmounts > 0) {
for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
if (BC_cache->c_mounts[cm_idx].cm_pextents != cache_mounts[cm_idx].cm_pextents) {
_FREE_ZERO(BC_cache->c_mounts[cm_idx].cm_pextents, M_TEMP);
}
}
_FREE_ZERO(BC_cache->c_mounts, M_TEMP)
}
BC_cache->c_mounts = cache_mounts;
BC_cache->c_nmounts = ncache_mounts;
if (BC_cache->c_nextentlists > 0) {
_FREE_ZERO(BC_cache->c_extentlists, M_TEMP)
_FREE_ZERO(BC_cache->c_nextents, M_TEMP)
}
cache_nextents[BC_cache->c_nextentlists] = ncache_extents;
cache_extents_list[BC_cache->c_nextentlists] = cache_extents;
BC_cache->c_nextentlists++;
BC_cache->c_nextents = cache_nextents;
BC_cache->c_extentlists = cache_extents_list;
BC_cache->c_stats.ss_total_mounts = ncache_mounts;
BC_cache->c_stats.ss_total_extents += ncache_extents;
for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) {
ce = cache_extents + ce_idx;
if (CM_SETUP != BC_cache->c_mounts[ce->ce_mount_idx].cm_state) {
LOCK_EXTENT(ce);
if (CM_READY == BC_cache->c_mounts[ce->ce_mount_idx].cm_state) {
if (0 == BC_fill_in_extent(ce)) {
UNLOCK_EXTENT(ce);
continue;
}
}
BC_teardown_extent(ce);
UNLOCK_EXTENT(ce);
}
}
for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
if (CM_SETUP == BC_cache->c_mounts[cm_idx].cm_state) {
int go_again = 0;
vfs_iterate(0, fill_in_bc_cache_mounts, &go_again);
if (go_again) {
debug("Missing root mount during last iteration, going again");
vfs_iterate(0, fill_in_bc_cache_mounts, &go_again);
}
break;
}
}
error = 0;
out:
if (! BC_preloaded_playlist) {
_FREE_ZERO(playlist_entries, M_TEMP);
_FREE_ZERO(playlist_mounts, M_TEMP);
}
_FREE_ZERO(pmidx_to_cmidx, M_TEMP);
_FREE_ZERO(next_old_extent_idx, M_TEMP);
if (error != 0) {
if (error != EALREADY) {
debug("cache setup failed, aborting");
}
if (BC_cache->c_extentlists == NULL) {
BC_cache->c_stats.ss_total_extents = 0;
BC_cache->c_stats.ss_total_mounts = 0;
if (BC_cache->c_vp != NULL) {
BC_free_pagebuffer();
}
}
BC_cache->c_cachesize = old_cache_size;
BC_cache->c_batch_count = old_batch_count;
if (cache_extents) {
for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) {
BC_teardown_extent(cache_extents + ce_idx);
lck_mtx_destroy(&cache_extents[ce_idx].ce_lock, BC_cache->c_lckgrp);
}
_FREE_ZERO(cache_extents, M_TEMP);
}
_FREE_ZERO(cache_nextents, M_TEMP);
_FREE_ZERO(cache_extents_list, M_TEMP);
if (cache_mounts) {
for (cm_idx = 0; cm_idx < ncache_mounts; cm_idx++) {
if (cm_idx >= BC_cache->c_nmounts) {
BC_teardown_mount(cache_mounts + cm_idx);
lck_rw_destroy(&cache_mounts[cm_idx].cm_lock, BC_cache->c_lckgrp);
} else if (BC_cache->c_mounts[cm_idx].cm_pextents != cache_mounts[cm_idx].cm_pextents) {
_FREE_ZERO(cache_mounts[cm_idx].cm_pextents, M_TEMP);
}
}
_FREE_ZERO(cache_mounts, M_TEMP);
}
}
#ifdef BC_DEBUG
struct timeval end_time;
microtime(&end_time);
timersub(&end_time, &start_time, &end_time);
debug("BC_copyin_playlist took %ldus", end_time.tv_sec * 1000000 + end_time.tv_usec);
#endif
return(error);
}
static int
BC_init_cache(size_t mounts_size, user_addr_t mounts, size_t entries_size, user_addr_t entries, int record_history)
{
int error = 0, copyin_error, mount_idx;
struct timeval current_time;
if ((mounts_size == 0 || mounts == 0 || entries_size == 0 || entries == 0) &&
(mounts_size != 0 || mounts != 0 || entries_size != 0 || entries != 0)) {
debug("Invalid cache parameters: %ld, %lld, %ld, %lld", mounts_size, mounts, entries_size, entries);
return(EINVAL);
}
if (! (BC_cache->c_flags & BC_FLAG_SETUP)) {
debug("Cache not yet setup");
return(ENXIO);
}
microtime(¤t_time);
if (record_history) {
LOCK_HANDLERS();
if (BC_set_flag(BC_FLAG_HISTORYACTIVE)) {
debug("history recording already started, only one playlist will be returned");
BC_ADD_STAT(history_num_recordings, 1);
UNLOCK_HANDLERS();
} else {
debug("starting history recording");
timeout(BC_timeout_history, NULL, BC_history_timeout);
if (BC_cache->c_stats.ss_history_num_recordings < 2)
BC_cache->c_take_detailed_stats = 1;
BC_cache->c_history_starttime = current_time;
BC_cache->c_history_size = 0;
BC_cache->c_history_num_clusters = 0;
BC_ADD_STAT(history_num_recordings, 1);
BC_check_handlers();
UNLOCK_HANDLERS();
BC_update_mounts();
#ifndef NO_HISTORY
# ifdef STATIC_HISTORY
if ((BC_cache->c_history = BC_MALLOC(BC_HISTORY_ALLOC, M_TEMP, M_WAITOK)) == NULL) {
message("can't allocate %d bytes for static history buffer", BC_HISTORY_ALLOC);
BC_clear_flag(BC_FLAG_HISTORYACTIVE);
error = ENOMEM;
goto out;
}
bzero(BC_cache->c_history, BC_HISTORY_ALLOC);
# endif
#endif
}
}
if (entries_size > 0) {
LOCK_CACHE_W();
if (BC_cache->c_flags & BC_FLAG_SHUTDOWN) {
debug("cache already shutdown");
error = ENXIO;
UNLOCK_CACHE_W();
goto out;
}
BC_cache->c_take_detailed_stats = 1;
copyin_error = BC_copyin_playlist(mounts_size, mounts, entries_size, entries);
if (copyin_error != 0 && copyin_error != EALREADY) {
message("can't load playlist: %d", copyin_error);
UNLOCK_CACHE_W();
error = copyin_error;
goto out;
}
BC_cache->c_readahead_throttles_cutthroughs = 1;
LOCK_HANDLERS();
if (BC_set_flag(BC_FLAG_CACHEACTIVE)) {
UNLOCK_HANDLERS();
} else {
#ifdef READ_HISTORY_BUFFER
if (NULL == BC_cache->c_rhistory) {
if ((BC_cache->c_rhistory = BC_MALLOC(READ_HISTORY_BUFFER * sizeof(struct BC_read_history_entry),
M_TEMP, M_WAITOK)) == NULL) {
message("can't allocate %d bytes for read history buffer",
READ_HISTORY_BUFFER * sizeof(struct BC_read_history_entry));
}
}
if (BC_cache->c_rhistory) {
bzero(BC_cache->c_rhistory, READ_HISTORY_BUFFER * sizeof(struct BC_read_history_entry));
}
#endif
BC_cache->c_cache_starttime = current_time;
BC_check_handlers();
UNLOCK_HANDLERS();
}
LOCK_CACHE_W_TO_R();
if (copyin_error != EALREADY) {
for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) {
if (BC_cache->c_mounts[mount_idx].cm_state == CM_READY) {
LOCK_MOUNT_R(BC_cache->c_mounts + mount_idx);
BC_mount_available(BC_cache->c_mounts + mount_idx);
UNLOCK_MOUNT_R(BC_cache->c_mounts + mount_idx);
}
}
}
UNLOCK_CACHE_R();
}
if (BC_cache->c_stats.ss_preload_time == 0) {
timersub(¤t_time,
&BC_cache->c_loadtime,
¤t_time);
BC_cache->c_stats.ss_preload_time = (u_int) current_time.tv_sec * 1000 + (u_int) current_time.tv_usec / 1000;
}
out:
return(error);
}
static void
BC_timeout_history(void *junk)
{
debug("history recording timed out");
BC_terminate_history();
}
static int check_for_new_mount_itr(mount_t mount, void* arg) {
struct vfs_attr attr;
vfs_context_t context;
int error, mount_idx;
int* go_again = (int*) arg;
struct BC_history_mount_device *hmd = NULL;
vnode_t devvp = NULLVP;
LOCK_HISTORY_R();
if (! (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE)) {
UNLOCK_HISTORY_R();
return VFS_RETURNED_DONE;
}
#ifdef BC_DEBUG
char name[MFSNAMELEN];
vfs_name(mount, name);
#endif
devvp = vfs_devvp(mount);
if (devvp == NULLVP) {
UNLOCK_HISTORY_R();
goto out;
}
dev_t mount_dev = vnode_specrdev(devvp);
hmd = BC_get_history_mount_device(mount_dev, NULL);
if (hmd == NULL || (! uuid_is_null(hmd->hmd_mount.hm_uuid))) {
UNLOCK_HISTORY_R();
goto out;
}
hmd->hmd_blocksize = vfs_devblocksize(mount);
VFSATTR_INIT(&attr);
VFSATTR_WANTED(&attr, f_uuid);
context = vfs_context_create(NULL);
error = vfs_getattr(mount, &attr, context);
if ((0 != error) || (! VFSATTR_IS_SUPPORTED(&attr, f_uuid))) {
vfs_context_rele(context);
#ifdef BC_DEBUG
if (strncmp("devfs", name, sizeof("devfs")) != 0) {
debug("Found mount %s without a UUID: error %d", name, error);
}
#endif
UNLOCK_HISTORY_R();
goto out;
}
hmd->hmd_disk_id = vfs_throttle_mask(mount);
if (VNOP_IOCTL(devvp,
DKIOCISSOLIDSTATE,
(caddr_t)&hmd->hmd_is_ssd,
0,
context))
{
debug("can't determine if physical disk for history mount %s is an ssd", uuid_string(hmd->hmd_mount.hm_uuid));
hmd->hmd_is_ssd = 0;
}
debug("Found new historic mount after %d IOs: %s, dev 0x%x, disk 0x%llx, blk %d%s",
hmd->hmd_mount.hm_nentries,
uuid_string(attr.f_uuid),
hmd->hmd_dev,
hmd->hmd_disk_id,
hmd->hmd_blocksize,
hmd->hmd_is_ssd ? ", ssd" : "");
if (hmd->hmd_blocksize == 0) {
hmd->hmd_blocksize = 512;
}
hmd->hmd_mount.hm_nentries = 0;
uuid_copy(hmd->hmd_mount.hm_uuid, attr.f_uuid);
UNLOCK_HISTORY_R();
if (BC_cache->c_flags & BC_FLAG_CACHEACTIVE) {
LOCK_CACHE_R();
for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) {
struct BC_cache_mount *cm = BC_cache->c_mounts + mount_idx;
if (CM_SETUP == cm->cm_state && 0 == uuid_compare(cm->cm_uuid, attr.f_uuid)) {
LOCK_MOUNT_W(cm);
if ((error = BC_fill_in_mount(cm, mount, context)) == 0) {
LOCK_MOUNT_W_TO_R(cm);
BC_mount_available(cm);
UNLOCK_MOUNT_R(cm);
} else {
if (error == EAGAIN) {
*go_again = 1;
}
UNLOCK_MOUNT_W(cm);
}
break;
}
}
UNLOCK_CACHE_R();
}
vfs_context_rele(context);
out:
if (devvp != NULLVP) {
vnode_put(devvp);
}
return VFS_RETURNED;
}
static void BC_update_mounts(void) {
if (! (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE)) {
return;
}
int go_again = 0;
vfs_iterate(0, check_for_new_mount_itr, &go_again);
if (go_again) {
debug("Missing root mount during last iteration, going again");
vfs_iterate(0, check_for_new_mount_itr, &go_again);
}
}
static struct BC_history_mount_device * BC_get_history_mount_device(dev_t dev, int* pindex) {
struct BC_history_mount_device *tmphmd = NULL, **phmd, *ret = NULL;
int mount_idx = 0;
for (phmd = &BC_cache->c_history_mounts;
(*phmd) == NULL || (*phmd)->hmd_dev != dev;
phmd = &(*phmd)->hmd_next) {
if (*phmd == NULL) {
if (tmphmd == NULL) {
tmphmd = kalloc(sizeof(struct BC_history_mount_device));
if (tmphmd == NULL) {
message("could not allocate %lu bytes for history mount device",
sizeof(struct BC_history_mount_device));
BC_set_flag(BC_FLAG_HTRUNCATED);
goto out;
}
tmphmd->hmd_next = NULL;
tmphmd->hmd_mount.hm_nentries = 0;
uuid_clear(tmphmd->hmd_mount.hm_uuid);
tmphmd->hmd_disk_id = 0;
tmphmd->hmd_blocksize = 0;
tmphmd->hmd_dev = dev;
}
if (OSCompareAndSwapPtr(NULL, tmphmd, phmd)) {
tmphmd = NULL;
if (BC_cache->c_take_detailed_stats) {
BC_ADD_STAT(history_mounts, 1);
}
break;
}
if ((*phmd)->hmd_dev == dev) break;
}
mount_idx++;
}
if (pindex)
*pindex = mount_idx;
ret = *phmd;
out:
if (tmphmd)
kfree(tmphmd, sizeof(struct BC_history_mount_device));
return ret;
}
static int
BC_add_history(daddr64_t blkno, u_int64_t length, int pid, int hit, int write, int tag, int shared, dev_t dev)
{
u_int64_t offset;
struct BC_history_entry_cluster *hec, *tmphec = NULL;
struct BC_history_mount_device *hmd = NULL;
struct BC_history_entry *he;
int mount_idx, entry_idx;
int is_root_disk = 0;
if (! (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE))
return 0;
LOCK_HISTORY_R();
if (! (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE))
goto out;
if (BC_cache->c_take_detailed_stats) {
if (! tag) {
if (! write) {
BC_ADD_STAT(history_reads, 1);
} else {
BC_ADD_STAT(history_writes, 1);
}
}
}
if (!tag && BC_cache->c_flags & BC_FLAG_HTRUNCATED)
goto out;
if (!tag && (BC_cache->c_history_size + length) > BC_MAX_SIZE) {
debug("History recording too large, capping at %lluMB", BC_MAX_SIZE / (1024 * 1024));
BC_set_flag(BC_FLAG_HTRUNCATED);
goto out;
}
#ifdef NO_HISTORY
BC_set_flag(BC_FLAG_HTRUNCATED);
debug("history disabled, truncating");
goto out;
#endif
mount_idx = 0;
offset = 0;
if (! tag) {
hmd = BC_get_history_mount_device(dev, &mount_idx);
if (hmd == NULL) goto out;
if (uuid_is_null(hmd->hmd_mount.hm_uuid)) {
OSIncrementAtomic(&hmd->hmd_mount.hm_nentries);
if (BC_cache->c_take_detailed_stats) {
BC_ADD_STAT(history_unknown, 1);
BC_ADD_STAT(history_unknown_bytes, length);
}
goto out;
}
offset = HB_BLOCK_TO_BYTE(hmd, blkno);
if (hmd->hmd_disk_id != BC_cache->c_root_disk_id) {
#ifdef ROOT_DISK_ONLY
goto out;
#endif
} else {
is_root_disk = 1;
}
#ifdef DONT_CACHE_SSDS
if (hmd->hmd_is_ssd) {
goto out;
}
#endif
}
while ((hec = BC_cache->c_history) == NULL ||
(entry_idx = OSAddAtomic(1, &hec->hec_nentries)) >= BC_HISTORY_ENTRIES) {
if (hec)
OSAddAtomic(-1, &hec->hec_nentries);
if (BC_cache->c_history_num_clusters + 1 >= BC_HISTORY_MAXCLUSTERS) {
message("too many history clusters (%d, limit %ld)",
BC_cache->c_history_num_clusters,
(long)BC_HISTORY_MAXCLUSTERS);
BC_set_flag(BC_FLAG_HTRUNCATED);
goto out;
}
if (tmphec == NULL) {
tmphec = kalloc(BC_HISTORY_ALLOC);
if (tmphec == NULL) {
message("could not allocate %d bytes for history cluster",
BC_HISTORY_ALLOC);
BC_set_flag(BC_FLAG_HTRUNCATED);
goto out;
}
tmphec->hec_nentries = 1;
tmphec->hec_link = hec;
}
if (OSCompareAndSwapPtr(hec, tmphec, &BC_cache->c_history)) {
hec = tmphec;
entry_idx = 0;
tmphec = NULL;
OSAddAtomic(1, &BC_cache->c_history_num_clusters);
break;
}
}
if (tmphec != NULL)
kfree(tmphec, BC_HISTORY_ALLOC);
he = &(hec->hec_data[entry_idx]);
assert(he >= &hec->hec_data[0]);
assert(he < &hec->hec_data[BC_HISTORY_ENTRIES]);
he->he_offset = offset;
he->he_length = length;
he->he_pid = pid;
he->he_mount_idx = mount_idx;
he->he_flags = 0x0;
if (hit) he->he_flags |= BC_HE_HIT;
if (write) he->he_flags |= BC_HE_WRITE;
if (tag) he->he_flags |= BC_HE_TAG;
if (shared) he->he_flags |= BC_HE_SHARED;
if (hmd)
OSIncrementAtomic(&hmd->hmd_mount.hm_nentries);
if (!write) {
OSAddAtomic64(length, &BC_cache->c_history_size);
if (BC_cache->c_take_detailed_stats) {
BC_ADD_STAT(history_bytes, length);
BC_ADD_STAT(history_entries, 1);
}
}
out:
UNLOCK_HISTORY_R();
return is_root_disk;
}
static int
BC_size_history_mounts(void)
{
struct BC_history_mount_device *hmd;
int nmounts;
nmounts = 0;
for (hmd = BC_cache->c_history_mounts; hmd != NULL; hmd = hmd->hmd_next)
nmounts ++;
return(nmounts * sizeof(struct BC_history_mount));
}
static int
BC_size_history_entries(void)
{
struct BC_history_entry_cluster *hec;
int nentries, cluster_entries;
nentries = 0;
for (hec = BC_cache->c_history; hec != NULL; hec = hec->hec_link) {
cluster_entries = hec->hec_nentries;
if (cluster_entries > BC_HISTORY_ENTRIES) {
cluster_entries = BC_HISTORY_ENTRIES;
}
nentries += cluster_entries;
}
return(nentries * sizeof(struct BC_history_entry));
}
static int
BC_copyout_history_mounts(user_addr_t uptr)
{
struct BC_history_mount_device *hmd;
int error;
assert(uptr != 0);
for (hmd = BC_cache->c_history_mounts;
hmd != NULL;
hmd = hmd->hmd_next) {
if ((error = copyout(&(hmd->hmd_mount), uptr,
sizeof(struct BC_history_mount))) != 0)
return(error);
uptr += sizeof(struct BC_history_mount);
}
return(0);
}
static int
BC_copyout_history_entries(user_addr_t uptr)
{
struct BC_history_entry_cluster *hec, *ohec;
int error, cluster_entries;
assert(uptr != 0);
ohec = NULL;
for (;;) {
hec = BC_cache->c_history;
while (hec->hec_link != ohec)
hec = hec->hec_link;
ohec = hec;
cluster_entries = hec->hec_nentries;
if (cluster_entries > BC_HISTORY_ENTRIES) {
cluster_entries = BC_HISTORY_ENTRIES;
}
if ((error = copyout(hec->hec_data, uptr,
cluster_entries * sizeof(struct BC_history_entry))) != 0)
return(error);
uptr += cluster_entries * sizeof(struct BC_history_entry);
if (hec == BC_cache->c_history)
break;
}
return(0);
}
static void
BC_discard_history(void)
{
struct BC_history_mount_device *hmd;
struct BC_history_entry_cluster *hec;
while (BC_cache->c_history_mounts != NULL) {
hmd = BC_cache->c_history_mounts;
BC_cache->c_history_mounts = hmd->hmd_next;
kfree(hmd, sizeof(struct BC_history_mount_device));
}
while (BC_cache->c_history != NULL) {
hec = BC_cache->c_history;
BC_cache->c_history = hec->hec_link;
kfree(hec, BC_HISTORY_ALLOC);
}
}
static int
BC_setup(void)
{
unsigned int boot_arg;
if ((max_mem / (1024 * 1024)) < BC_minimum_mem_size) {
debug("insufficient physical memory (%dMB < %dMB required)",
(int) (max_mem / (1024 * 1024)), BC_minimum_mem_size);
return(ENOMEM);
}
if (netboot_root() &&
(!PE_parse_boot_argn("BootCacheOverride", &boot_arg, sizeof(boot_arg)) ||
(boot_arg != 1))) {
return(ENXIO);
}
debug("Major of rootdev is %d", major(rootdev));
BC_cache->c_devsw = &bdevsw[major(rootdev)];
assert(BC_cache->c_devsw != NULL);
if ((BC_cache->c_devsw->d_strategy == (strategy_fcn_t*)BC_strategy) ||
(BC_cache->c_devsw->d_close == BC_close)) {
debug("cache was not terminated properly previously");
return(ENXIO);
}
BC_cache->c_strategy = BC_cache->c_devsw->d_strategy;
BC_cache->c_close = BC_cache->c_devsw->d_close;
if (BC_cache->c_strategy == NULL ||
BC_cache->c_close == NULL) {
return(ENXIO);
}
#ifdef ROOT_DISK_ONLY
#ifdef DONT_CACHE_SSDS
vfs_context_t context;
context = vfs_context_create(NULL);
int is_ssd;
if (!rootvp ||
VNOP_IOCTL(rootvp,
DKIOCISSOLIDSTATE,
(caddr_t)&is_ssd,
0,
context))
{
message("can't determine if disk is a solid state disk for root mount");
is_ssd = 0;
}
vfs_context_rele(context);
if (is_ssd) {
return(ENODEV);
}
#endif
#endif
BC_set_flag(BC_FLAG_SETUP);
return(0);
}
static void
BC_auto_start(void)
{
int error;
user_addr_t pm, pe;
if ((error = BC_setup()) != 0) {
debug("BootCache setup failed: %d", error);
return;
}
pm = CAST_USER_ADDR_T((uintptr_t)BC_preloaded_playlist + sizeof(struct BC_playlist_header));
pe = pm + (BC_preloaded_playlist->ph_nmounts * sizeof(struct BC_playlist_mount));
error = BC_init_cache(BC_preloaded_playlist->ph_nmounts * sizeof(struct BC_playlist_mount), pm,
BC_preloaded_playlist->ph_nentries * sizeof(struct BC_playlist_entry), pe,
0 );
if (error != 0)
debug("BootCache autostart failed: %d", error);
}
static void
BC_unmount_hook(void)
{
debug("device unmounted");
BC_terminate_cache();
BC_terminate_history();
}
static int
BC_sysctl SYSCTL_HANDLER_ARGS
{
struct BC_command bc;
int error;
if ((error = SYSCTL_IN(req, &bc, sizeof(bc))) != 0) {
debug("couldn't get command");
return(error);
}
if (bc.bc_magic != BC_MAGIC) {
debug("bad command magic");
return(EINVAL);
}
switch (bc.bc_opcode) {
case BC_OP_START:
debug("BC_OP_START(%d mounts, %d extents)", (int)(bc.bc_data1_size / sizeof(struct BC_playlist_mount)), (int)(bc.bc_data2_size / sizeof(struct BC_playlist_entry)));
if (BC_preloaded_playlist) {
error = EINVAL;
break;
}
error = BC_init_cache(bc.bc_data1_size, (user_addr_t)bc.bc_data1, bc.bc_data2_size, (user_addr_t)bc.bc_data2, 1 );
if (error != 0) {
message("cache init failed");
}
break;
case BC_OP_STOP:
debug("BC_OP_STOP");
BC_terminate_history();
LOCK_HISTORY_W();
bc.bc_data1_size = BC_size_history_mounts();
bc.bc_data2_size = BC_size_history_entries();
if ((error = SYSCTL_OUT(req, &bc, sizeof(bc))) != 0)
debug("could not return history size");
UNLOCK_HISTORY_W();
break;
case BC_OP_MOUNT:
BC_update_mounts();
break;
case BC_OP_HISTORY:
debug("BC_OP_HISTORY");
LOCK_HISTORY_W();
if (bc.bc_data1 != 0 && bc.bc_data2 != 0) {
if (bc.bc_data1_size < BC_size_history_mounts() ||
bc.bc_data2_size < BC_size_history_entries()) {
debug("supplied history buffer too small");
error = EINVAL;
UNLOCK_HISTORY_W();
break;
}
if ((error = BC_copyout_history_mounts(bc.bc_data1)) != 0) {
debug("could not copy out history mounts: %d", error);
}
if ((error = BC_copyout_history_entries(bc.bc_data2)) != 0) {
debug("could not copy out history entries: %d", error);
}
}
BC_discard_history();
UNLOCK_HISTORY_W();
break;
case BC_OP_JETTISON:
debug("BC_OP_JETTISON");
BC_terminate_cache();
break;
case BC_OP_STATS:
debug("BC_OP_STATS");
if (bc.bc_data1_size != sizeof(BC_cache->c_stats)) {
debug("stats structure wrong size");
error = ENOMEM;
} else {
BC_cache->c_stats.ss_cache_flags = BC_cache->c_flags;
if ((error = copyout(&BC_cache->c_stats, bc.bc_data1, bc.bc_data1_size)) != 0)
debug("could not copy out statistics");
}
break;
case BC_OP_TAG:
debug("BC_OP_TAG");
KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, DBG_BC_TAG) |
DBG_FUNC_NONE, proc_selfppid(), dbg_tag_count,
0, 0, 0);
BC_add_history(0, 0, 0, 0, 0, 1, 0, 0);
dbg_tag_count++;
break;
case BC_OP_TEST:
debug("BC_OP_TEST");
if (! (BC_cache->c_flags & BC_FLAG_SETUP)) {
error = ENODEV;
}
break;
default:
error = EINVAL;
}
return(error);
}
static int
BC_alloc_pagebuffer()
{
kern_return_t kret;
int ret;
if (BC_cache->c_vp == NULL) {
kret = vnode_open(BC_BOOT_BACKINGFILE, (O_CREAT | O_NOFOLLOW | FREAD),
0400, 0, &BC_cache->c_vp, vfs_context_current());
if (kret != KERN_SUCCESS) {
debug("vnode_open failed - %d", kret);
return -1;
}
BC_cache->c_vid = vnode_vid(BC_cache->c_vp);
} else {
ret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid);
if (ret != 0) {
debug("vnode_getwithvid failed - %d", ret);
return -1;
}
}
assert(BC_cache->c_cachesize != 0);
kret = ubc_setsize(BC_cache->c_vp, BC_cache->c_cachesize);
if (kret != 1) {
debug("ubc_setsize failed - %d", kret);
(void) vnode_put(BC_cache->c_vp);
return -1;
}
(void) vnode_put(BC_cache->c_vp);
return(0);
}
static void
BC_free_pagebuffer(void)
{
kern_return_t kret;
if (BC_cache->c_vp != NULL) {
kret = vnode_getwithref(BC_cache->c_vp);
if (kret != KERN_SUCCESS) {
debug("vnode_getwithref failed - %d", kret);
return;
}
kret = ubc_range_op(BC_cache->c_vp, 0, BC_cache->c_cachesize,
UPL_ROP_DUMP, NULL);
if (kret != KERN_SUCCESS) {
debug("ubc_range_op failed - %d", kret);
}
#if 0
kret = vnode_delete(BC_BOOT_BACKINGFILE, vfs_context_current());
if (kret) {
debug("vnode_delete failed - %d", error);
}
#endif
kret = vnode_close(BC_cache->c_vp, 0, vfs_context_current());
if (kret != KERN_SUCCESS) {
debug("vnode_close failed - %d", kret);
}
BC_cache->c_vp = NULL;
}
}
static void
BC_free_page(struct BC_cache_extent *ce, int page)
{
off_t vpoffset;
kern_return_t kret;
vpoffset = (page * PAGE_SIZE) + ce->ce_cacheoffset;
kret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid);
if (kret != KERN_SUCCESS) {
debug("vnode_getwithvid failed - %d", kret);
return;
}
kret = ubc_range_op(BC_cache->c_vp, vpoffset, vpoffset + PAGE_SIZE,
UPL_ROP_DUMP, NULL);
if (kret != KERN_SUCCESS) {
debug("ubc_range_op failed - %d", kret);
}
(void) vnode_put(BC_cache->c_vp);
}
void
BC_hook(void)
{
int error;
if ((error = BC_setup()) != 0) {
debug("BootCache setup failed: %d", error);
}
}
void
BC_load(void)
{
struct mach_header *mh;
long xsize;
int has64bitness, error;
size_t sysctlsize = sizeof(int);
char *plsection = "playlist";
#ifdef BC_DEBUG
microtime(&debug_starttime);
#endif
sysctl_register_oid(&sysctl__kern_BootCache);
bzero(BC_cache, sizeof(*BC_cache));
BC_cache->c_lckgrp = lck_grp_alloc_init("BootCache", LCK_GRP_ATTR_NULL);
lck_rw_init(&BC_cache->c_history_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
lck_rw_init(&BC_cache->c_cache_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
lck_mtx_init(&BC_cache->c_handlers_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
lck_mtx_init(&BC_cache->c_reader_threads_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
#ifdef BC_DEBUG
lck_mtx_init(&debug_printlock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
#endif
if (kmod_info.info_version != KMOD_INFO_VERSION) {
message("incompatible kmod_info versions");
return;
}
mh = (struct mach_header *)kmod_info.address;
error = sysctlbyname("hw.cpu64bit_capable", &has64bitness, &sysctlsize,
NULL, 0);
if (error == 0 && has64bitness == 0) {
plsection = "playlist32";
}
BC_preloaded_playlist = getsectdatafromheader(mh, "BootCache",
plsection, &BC_preloaded_playlist_size);
debug("preload section %s: %d @ %p",
plsection, BC_preloaded_playlist_size, BC_preloaded_playlist);
if (BC_preloaded_playlist != NULL) {
if (BC_preloaded_playlist_size < sizeof(struct BC_playlist_header)) {
message("preloaded playlist too small");
return;
}
if (BC_preloaded_playlist->ph_magic != PH_MAGIC) {
message("preloaded playlist has invalid magic (%x, expected %x)",
BC_preloaded_playlist->ph_magic, PH_MAGIC);
return;
}
xsize = sizeof(struct BC_playlist_header) +
(BC_preloaded_playlist->ph_nmounts * sizeof(struct BC_playlist_mount)) +
(BC_preloaded_playlist->ph_nentries * sizeof(struct BC_playlist_entry));
if (xsize > BC_preloaded_playlist_size) {
message("preloaded playlist too small (%d, expected at least %ld)",
BC_preloaded_playlist_size, xsize);
return;
}
BC_wait_for_readahead = BC_READAHEAD_WAIT_CDROM;
mountroot_post_hook = BC_auto_start;
unmountroot_pre_hook = BC_unmount_hook;
debug("waiting for root mount...");
} else {
BC_preloaded_playlist = NULL;
mountroot_post_hook = BC_hook;
debug("waiting for playlist...");
}
microtime(&BC_cache->c_loadtime);
}
int
BC_unload(void)
{
int error;
debug("preparing to unload...");
unmountroot_pre_hook = NULL;
BC_terminate_history();
if ((error = BC_terminate_cache()) != 0) {
if (error != ENXIO)
goto out;
}
LOCK_HISTORY_W();
BC_discard_history();
UNLOCK_HISTORY_W();
lck_rw_destroy(&BC_cache->c_history_lock, BC_cache->c_lckgrp);
lck_rw_destroy(&BC_cache->c_cache_lock, BC_cache->c_lckgrp);
lck_mtx_destroy(&BC_cache->c_handlers_lock, BC_cache->c_lckgrp);
lck_mtx_destroy(&BC_cache->c_reader_threads_lock, BC_cache->c_lckgrp);
lck_grp_free(BC_cache->c_lckgrp);
sysctl_unregister_oid(&sysctl__kern_BootCache);
error = 0;
out:
return(error);
}