This patch is generated from the reiserfs_raw branch of HEAD in squid, Wed Sep 29 01:34:07 2004 GMT. See http://devel.squid-cache.org/ Index: squid/acconfig.h diff -u squid/acconfig.h:1.4 squid/acconfig.h:1.4.6.1 --- squid/acconfig.h:1.4 Sat Dec 16 20:57:39 2000 +++ squid/acconfig.h Sun Dec 17 05:55:39 2000 @@ -48,6 +48,9 @@ /* Define to use async disk I/O operations */ #undef USE_ASYNC_IO +/* Define to include support for "tree" (and "asynctree") cache_dir types */ +#undef USE_STORETREE + /* Defines how many threads to use for async I/O */ #undef ASYNC_IO_THREADS Index: squid/configure.in diff -u squid/configure.in:1.9 squid/configure.in:1.7.6.2 --- squid/configure.in:1.9 Thu Jan 4 14:44:02 2001 +++ squid/configure.in Fri Jan 5 12:49:02 2001 @@ -324,6 +324,13 @@ AC_SUBST(STORE_OBJS) STORE_LIBS="`echo $STORE_OBJS|sed -e's%fs/%%g'`" AC_SUBST(STORE_LIBS) +for module in $STORE_MODULES; do + case $module in + butterfly) + AC_DEFINE(USE_STORETREE, 1) + ;; + esac +done dnl --enable-heap-replacement compability option AC_ARG_ENABLE(heap-replacement, @@ -865,6 +872,7 @@ time.h \ unistd.h \ varargs.h \ + utime.h \ ) AC_C_CONST Index: squid/doc/debug-sections.txt diff -u squid/doc/debug-sections.txt:1.2 squid/doc/debug-sections.txt:1.2.22.1 --- squid/doc/debug-sections.txt:1.2 Sat Oct 21 08:16:08 2000 +++ squid/doc/debug-sections.txt Sun Dec 17 05:55:39 2000 @@ -86,3 +86,6 @@ section 79 HTTP Meter Header section 80 WCCP section 81 Store Removal/Replacement policy +section 86 Store Directory, Balanced tree (Reiserfs) version +section 88 Storage Manager TREE Interface I/O Routines, async +section 89 Storage Manager TREE Interface I/O Routines Index: squid/include/util.h diff -u squid/include/util.h:1.4 squid/include/util.h:1.4.16.1 --- squid/include/util.h:1.4 Sat Nov 4 15:23:06 2000 +++ squid/include/util.h Sun Dec 17 05:55:39 2000 @@ -65,6 +65,7 @@ extern char *xstrndup(const char *, size_t); extern const char *xstrerror(void); extern const char *xbstrerror(int); +extern int tvMsec(struct timeval); extern int tvSubMsec(struct timeval, struct timeval); extern int tvSubUsec(struct timeval, struct timeval); extern double tvSubDsec(struct timeval, struct timeval); Index: squid/lib/util.c diff -u squid/lib/util.c:1.4 squid/lib/util.c:1.4.10.1 --- squid/lib/util.c:1.4 Tue Nov 28 03:35:34 2000 +++ squid/lib/util.c Sun Dec 17 05:55:39 2000 @@ -662,6 +662,12 @@ } int +tvMsec(struct timeval t) +{ + return t.tv_sec * 1000 + t.tv_usec / 1000; +} + +int tvSubMsec(struct timeval t1, struct timeval t2) { return (t2.tv_sec - t1.tv_sec) * 1000 + Index: squid/src/defines.h diff -u squid/src/defines.h:1.5 squid/src/defines.h:1.3.16.1.2.2 --- squid/src/defines.h:1.5 Fri Jan 5 12:46:59 2001 +++ squid/src/defines.h Fri Jan 5 12:49:03 2001 @@ -146,7 +146,7 @@ #define LOG_ENABLE 1 #define LOG_DISABLE 0 -#define SM_PAGE_SIZE 4096 +#define SM_PAGE_SIZE (16*1024) /* Was 4096 --sizif */ #define EBIT_SET(flag, bit) ((void)((flag) |= ((1L<<(bit))))) #define EBIT_CLR(flag, bit) ((void)((flag) &= ~((1L<<(bit))))) @@ -270,7 +270,7 @@ */ #define PEER_TCP_MAGIC_COUNT 10 -#define CLIENT_SOCK_SZ 4096 +#define CLIENT_SOCK_SZ (16*1024) /* Was: 4096 --sizif */ #define URI_WHITESPACE_STRIP 0 #define URI_WHITESPACE_ALLOW 1 Index: squid/src/enums.h diff -u squid/src/enums.h:1.7 squid/src/enums.h:1.6.4.1.2.1 --- squid/src/enums.h:1.7 Fri Jan 5 12:46:59 2001 +++ squid/src/enums.h Sun Dec 17 05:55:39 2000 @@ -483,7 +483,8 @@ ENTRY_VALIDATED, ENTRY_BAD_LENGTH, ENTRY_ABORTED, - ENTRY_DONT_LOG /* hack for gross 'Pump' entries */ + ENTRY_DONT_LOG, /* 
hack for gross 'Pump' entries */ + ENTRY_TENTATIVE, /* storetree: file might not exist */ }; typedef enum { Index: squid/src/globals.h diff -u squid/src/globals.h:1.5 squid/src/globals.h:1.5.8.1 --- squid/src/globals.h:1.5 Tue Dec 12 15:21:19 2000 +++ squid/src/globals.h Sun Dec 17 05:55:39 2000 @@ -145,6 +145,9 @@ extern request_flags null_request_flags; extern int store_open_disk_fd; /* 0 */ extern const char *SwapDirType[]; +#ifdef USE_STORETREE +extern int n_storetree_dirs; +#endif /* USE_STORETREE */ extern storefs_entry_t *storefs_list; /* NULL */ extern storerepl_entry_t *storerepl_list; /* NULL */ extern int store_swap_low; /* 0 */ Index: squid/src/protos.h diff -u squid/src/protos.h:1.12 squid/src/protos.h:1.6.4.1.2.3 --- squid/src/protos.h:1.12 Fri Jan 5 12:46:59 2001 +++ squid/src/protos.h Fri Jan 5 13:10:43 2001 @@ -817,6 +817,7 @@ * store.c */ extern StoreEntry *new_StoreEntry(int, const char *, const char *); +extern FREE destroy_StoreEntry; extern StoreEntry *storeGet(const cache_key *); extern StoreEntry *storeGetPublic(const char *uri, const method_t method); extern StoreEntry *storeCreateEntry(const char *, const char *, request_flags, method_t); @@ -927,7 +928,6 @@ extern OBJH storeDirStats; extern char *storeDirSwapLogFile(int, const char *); extern char *storeSwapDir(int); -extern char *storeSwapFullPath(int, char *); extern char *storeSwapSubSubDir(int, char *); extern const char *storeSwapPath(int); extern int storeDirWriteCleanLogs(int reopen); @@ -949,8 +949,15 @@ int storeDirGetBlkSize(const char *path, int *blksize); /* - * store_swapmeta.c + * store_dir_drr.c */ +#if USE_STORETREE +extern StoreEntry *storeTreeGet(const char *url, const cache_key *key); +extern void storeTreeRestoreMetadata(StoreEntry *e, tlv *t); +#endif + +/* + * store_swapmeta.c */ extern char *storeSwapMetaPack(tlv * tlv_list, int *length); extern tlv *storeSwapMetaBuild(StoreEntry * e); extern tlv *storeSwapMetaUnpack(const char *buf, int *hdrlen); Index: squid/src/squid.h diff -u squid/src/squid.h:1.5 squid/src/squid.h:1.3.16.1.2.2 --- squid/src/squid.h:1.5 Fri Jan 5 12:46:59 2001 +++ squid/src/squid.h Fri Jan 5 12:49:03 2001 @@ -137,6 +137,9 @@ #if HAVE_TIME_H #include <time.h> #endif +#if HAVE_UTIME_H +#include <utime.h> +#endif #if HAVE_SYS_PARAM_H #include <sys/param.h> #endif Index: squid/src/stat.c diff -u squid/src/stat.c:1.5 squid/src/stat.c:1.4.14.1.2.1 --- squid/src/stat.c:1.5 Fri Jan 5 12:46:59 2001 +++ squid/src/stat.c Sun Dec 17 05:55:39 2000 @@ -236,6 +236,8 @@ strcat(buf, "BAD_LENGTH,"); if (EBIT_TEST(flags, ENTRY_ABORTED)) strcat(buf, "ABORTED,"); + if (EBIT_TEST(flags, ENTRY_TENTATIVE)) + strcat(buf, "TENTATIVE,"); if ((t = strrchr(buf, ','))) *t = '\0'; return buf; @@ -812,6 +814,9 @@ storeAppendPrintf(sentry, "syscalls.disk.writes = %f/sec\n", XAVG(syscalls.disk.writes)); storeAppendPrintf(sentry, "syscalls.disk.seeks = %f/sec\n", XAVG(syscalls.disk.seeks)); storeAppendPrintf(sentry, "syscalls.disk.unlinks = %f/sec\n", XAVG(syscalls.disk.unlinks)); + storeAppendPrintf(sentry, "syscalls.disk.truncates = %f/sec\n", XAVG(syscalls.disk.truncates)); + storeAppendPrintf(sentry, "syscalls.disk.utimes = %f/sec\n", XAVG(syscalls.disk.utimes)); + storeAppendPrintf(sentry, "syscalls.disk.stats = %f/sec\n", XAVG(syscalls.disk.stats)); storeAppendPrintf(sentry, "syscalls.sock.accepts = %f/sec\n", XAVG(syscalls.sock.accepts)); storeAppendPrintf(sentry, "syscalls.sock.sockets = %f/sec\n", XAVG(syscalls.sock.sockets)); storeAppendPrintf(sentry, "syscalls.sock.connects = %f/sec\n", XAVG(syscalls.sock.connects)); Index: 
squid/src/store.c diff -u squid/src/store.c:1.8 squid/src/store.c:1.6.4.1.2.2 --- squid/src/store.c:1.8 Wed Jan 3 23:23:59 2001 +++ squid/src/store.c Fri Jan 5 12:49:03 2001 @@ -65,6 +65,8 @@ "SWAPOUT_DONE" }; +int n_storetree_dirs = 0; + typedef struct lock_ctrl_t { SIH *callback; void *callback_data; @@ -81,7 +83,6 @@ static void storeHashDelete(StoreEntry *); static MemObject *new_MemObject(const char *, const char *); static void destroy_MemObject(StoreEntry *); -static FREE destroy_StoreEntry; static void storePurgeMem(StoreEntry *); static void storeEntryReferenced(StoreEntry *); static void storeEntryDereferenced(StoreEntry *); @@ -169,7 +170,7 @@ memFree(mem, MEM_MEMOBJECT); } -static void +void destroy_StoreEntry(void *data) { StoreEntry *e = data; @@ -215,6 +216,18 @@ storeKeyText(e->hash.key)); storeSetMemStatus(e, NOT_IN_MEMORY); destroy_MemObject(e); +#if USE_STORETREE + /* For STORETREE type dirs, we free StoreEntry as well. STORETREE + is able to find object on disk based on the URL, so we can + reconstruct StoreEntry from disk object when (and whether) + request for it comes */ + if (e->swap_status == SWAPOUT_DONE) { + SwapDir *sd = INDEXSD(e->swap_dirn); + if (sd->halfrelease) + sd->halfrelease (e); + return; + } +#endif /* USE_STORETREE */ if (e->swap_status != SWAPOUT_DONE) storeRelease(e); } @@ -293,6 +306,13 @@ return (int) e->lock_count; if (e->store_status == STORE_PENDING) EBIT_SET(e->flags, RELEASE_REQUEST); +#define DIRTYHACK +#ifdef DIRTYHACK + if (storePendingNClients(e) != 0) { + debug(20, 3) ("storeUnlockObject: storePendingNClients != 0!! Leak!!\n"); + return (int) ++e->lock_count; + } +#endif assert(storePendingNClients(e) == 0); if (EBIT_TEST(e->flags, RELEASE_REQUEST)) storeRelease(e); @@ -322,7 +342,18 @@ StoreEntry * storeGetPublic(const char *uri, const method_t method) { - return storeGet(storeKeyPublic(uri, method)); + const cache_key *key = storeKeyPublic(uri, method); + StoreEntry *e; + + if ((e = storeGet(key))) { + return e; + } +#if USE_STORETREE + if (n_storetree_dirs && (e = storeTreeGet (uri, key))) { + return e; + } +#endif + return NULL; } static int @@ -803,6 +834,15 @@ } } storeLog(STORE_LOG_RELEASE, e); +#ifdef USE_STORETREE + if (EBIT_TEST (e->flags, ENTRY_TENTATIVE)) { + /* Created in storeTreeGet but never tried to swap in for some + reason (no-cache request is one case where this happens). + We don't know whether there is a file on disk, much likely + there isn't. Don't waste efforts unlinking. */ + e->swap_filen = -1; + } +#endif if (e->swap_filen > -1) { storeUnlink(e); if (e->swap_status == SWAPOUT_DONE) @@ -909,9 +949,22 @@ { int i; /* Calculate size of hash table (maximum currently 64k buckets). */ - i = Config.Swap.maxSize / Config.Store.avgObjectSize; - debug(20, 1) ("Swap maxSize %d KB, estimated %d objects\n", - Config.Swap.maxSize, i); +#ifdef USE_STORETREE + if (n_storetree_dirs == Config.cacheSwap.n_configured) { + /* With storetree, on-disk objects are not in hash table, so + hash table should be only large enough to contain in-memory + objects. 
*/ + i = Config.memMaxSize / Config.Store.avgObjectSize; + debug(20, 1) ("cache_mem %d KB, estimated %d hash table objects\n", + Config.memMaxSize, i); + } + else +#endif + { + i = Config.Swap.maxSize / Config.Store.avgObjectSize; + debug(20, 1) ("Swap maxSize %d KB, estimated %d objects\n", + Config.Swap.maxSize, i); + } i /= Config.Store.objectsPerBucket; debug(20, 1) ("Target number of buckets: %d\n", i); /* ideally the full scan period should be configurable, for the @@ -1312,6 +1365,16 @@ e->swap_file_number = -1; } else { assert(-1 == e->swap_file_number); +#if USE_STORETREE + /* If filn chosen by storeDirSelectSwapDir belongs to a + STORETREE directory, redirect the object to proper + STORETREE directory, chosen based on URL */ + if (storeTreeDirIs (&Config.cacheSwap.swapDirs[filn >> SWAP_DIR_SHIFT])) { + int dirn = storeTreeSelectSwapDir(e->key); + e->swap_file_number = dirn << SWAP_DIR_SHIFT; + return; + } +#endif /* USE_STORETREE */ storeDirMapBitSet(e->swap_file_number = filn); storeDirLRUAdd(e); } Index: squid/src/store_client.c diff -u squid/src/store_client.c:1.5 squid/src/store_client.c:1.4.14.1.2.2 --- squid/src/store_client.c:1.5 Fri Jan 5 12:46:59 2001 +++ squid/src/store_client.c Fri Dec 22 04:40:49 2000 @@ -1,4 +1,3 @@ - /* * $Id$ * @@ -157,7 +156,16 @@ { STCB *callback = sc->callback; char *buf = sc->copy_buf; +#if BUG_20000721a + /* sc->callback may get reset by storeSwapInFileClosed. + Don't fail in this case */ assert(sc->callback); +#else + if (! callback) { + debug(20, 2) ("storeClientCallback: sc->callback == 0\n"); + return; + } +#endif sc->callback = NULL; sc->copy_buf = NULL; if (cbdataValid(sc->callback_data)) @@ -380,6 +388,7 @@ tlv *tlv_list; tlv *t; int swap_object_ok = 1; + assert(sc->entry->hash.key); /* --sizif */ assert(sc->flags.disk_io_pending); sc->flags.disk_io_pending = 0; assert(sc->callback != NULL); @@ -432,6 +441,14 @@ } break; case STORE_META_STD: +#if USE_STORETREE + /* If we have just recreated StoreEntry, restore metadata + like when rebuilding store from cache dir */ + if (EBIT_TEST (e->flags, ENTRY_TENTATIVE)) { + storeTreeRestoreMetadata(e, t); + EBIT_CLR (e->flags, ENTRY_TENTATIVE); + } +#endif break; default: debug(20, 1) ("WARNING: got unused STORE_META type %d\n", t->type); Index: squid/src/store_dir.c diff -u squid/src/store_dir.c:1.8 squid/src/store_dir.c:1.4.18.2 --- squid/src/store_dir.c:1.8 Thu Jan 4 22:38:08 2001 +++ squid/src/store_dir.c Fri Jan 5 12:49:03 2001 @@ -132,7 +132,7 @@ * XXX This function does NOT account for the read_only flag! */ static int -storeDirSelectSwapDirRoundRobin(const StoreEntry * unused) +storeDirSelectSwapDirRoundRobin(const StoreEntry * e) { static int dirn = 0; int i; @@ -148,8 +148,17 @@ sd = &Config.cacheSwap.swapDirs[dirn]; if (sd->cur_size > sd->max_size) continue; - return dirn; + break; } +#ifdef USE_STORETREE + /* Give storedir a chance to redirect this object to another dir. + STORETREE dirs use that as all storetree dirs you configure + act together like hash buckets. They use a hash based on URL + to select which dir the object should go to -- this allows to + have only one open() in storeTreeGet(). 
*/ + if (sd->reselectdir) + dirn = sd->reselectdir(e)->index; +#endif return dirn; } Index: squid/src/store_swapin.c diff -u squid/src/store_swapin.c:1.4 squid/src/store_swapin.c:1.4.18.1 --- squid/src/store_swapin.c:1.4 Fri Nov 3 00:39:20 2000 +++ squid/src/store_swapin.c Sun Dec 17 05:55:39 2000 @@ -63,7 +63,21 @@ e->swap_filen); sc->swapin_sio = storeOpen(e, storeSwapInFileNotify, storeSwapInFileClosed, sc); +#if BUG_20000721 + /* Locking here is too late. At least with diskd and multiple + cache_dirs, storeSwapInFileClosed can be called before we get + here, and cbdataUnlock gets called before cbdataLock, hence an + assertion failure. --sizif */ cbdataLock(sc->swapin_sio); +#else + if (sc->flags.swapin_dead) { + sc->flags.swapin_dead = 0; + sc->swapin_sio = NULL; + } + else { + cbdataLock(sc->swapin_sio); + } +#endif } static void @@ -73,8 +87,21 @@ STCB *callback; debug(20, 3) ("storeSwapInFileClosed: sio=%p, errflag=%d\n", sio, errflag); +#if BUG_20000721 cbdataUnlock(sio); sc->swapin_sio = NULL; +#else + if (sc->swapin_sio) { + cbdataUnlock(sio); + sc->swapin_sio = NULL; + } + else { + /* tough luck. We've been called before storeOpen finished + and sc->swapin_sio is assigned and locked. sio is + cbdataReallyFreed already. */ + sc->flags.swapin_dead = 1; + } +#endif if ((callback = sc->callback)) { assert(errflag <= 0); sc->callback = NULL; Index: squid/src/store_swapout.c diff -u squid/src/store_swapout.c:1.5 squid/src/store_swapout.c:1.4.14.1.2.1 --- squid/src/store_swapout.c:1.5 Fri Jan 5 12:46:59 2001 +++ squid/src/store_swapout.c Sun Dec 17 05:55:39 2000 @@ -234,8 +234,9 @@ assert(swap_buf_len > 0); debug(20, 3) ("storeSwapOut: swapping out %d bytes from %d\n", swap_buf_len, (int) mem->swapout.queue_offset); + storeWrite(mem->swapout.sio, mem->swapout.memnode->data, swap_buf_len, + mem->swap_hdr_sz + mem->swapout.queue_offset, NULL); mem->swapout.queue_offset += swap_buf_len; - storeWrite(mem->swapout.sio, mem->swapout.memnode->data, swap_buf_len, -1, NULL); /* the storeWrite() call might generate an error */ if (e->swap_status != SWAPOUT_WRITING) break; Index: squid/src/structs.h diff -u squid/src/structs.h:1.15 squid/src/structs.h:1.8.8.3 --- squid/src/structs.h:1.15 Thu Jan 4 22:38:08 2001 +++ squid/src/structs.h Fri Jan 5 12:49:03 2001 @@ -1247,6 +1247,7 @@ unsigned int disk_io_pending:1; unsigned int store_copying:1; unsigned int copy_event_pending:1; + unsigned int swapin_dead:1; } flags; #if DELAY_POOLS delay_id delay_id; @@ -1340,11 +1341,14 @@ ping_status_t ping_status:3; store_status_t store_status:3; swap_status_t swap_status:3; + unsigned id_valid:1; + unsigned long long id; }; struct _SwapDir { char *type; int cur_size; + int high_size; int low_size; int max_size; char *path; @@ -1357,6 +1361,7 @@ struct { unsigned int selected:1; unsigned int read_only:1; + unsigned int offline:1; } flags; STINIT *init; /* Initialise the fs */ STNEWFS *newfs; /* Create a new fs */ @@ -1371,6 +1376,9 @@ STUNREFOBJ *unrefobj; /* Unreference this object */ STCALLBACK *callback; /* Handle pending callbacks */ STSYNC *sync; /* Sync the directory */ + /* The following are for storetree */ + STHALFRELEASE *halfrelease; /* Release StoreEntry but keep disk object */ + STRESELECTDIR *reselectdir; /* Swap dir redirection function */ struct { STOBJCREATE *create; STOBJOPEN *open; @@ -1639,6 +1647,9 @@ int writes; int seeks; int unlinks; + int truncates; + int utimes; + int stats; } disk; struct { int accepts; Index: squid/src/typedefs.h diff -u squid/src/typedefs.h:1.6 
squid/src/typedefs.h:1.4.10.1.2.2 --- squid/src/typedefs.h:1.6 Fri Jan 5 12:46:59 2001 +++ squid/src/typedefs.h Fri Jan 5 12:49:03 2001 @@ -248,6 +248,8 @@ typedef void STDONE(void); typedef int STCALLBACK(SwapDir *); typedef void STSYNC(SwapDir *); +typedef void STHALFRELEASE(StoreEntry *); +typedef SwapDir *STRESELECTDIR(const StoreEntry *); typedef storeIOState *STOBJCREATE(SwapDir *, StoreEntry *, STFNCB *, STIOCB *, void *); typedef storeIOState *STOBJOPEN(SwapDir *, StoreEntry *, STFNCB *, STIOCB *, void *); Index: squid/src/fs/butterfly/Makefile.in diff -u /dev/null squid/src/fs/butterfly/Makefile.in:1.1.2.1 --- /dev/null Tue Sep 28 18:39:07 2004 +++ squid/src/fs/butterfly/Makefile.in Sun Dec 17 05:55:40 2000 @@ -0,0 +1,76 @@ +# +# Makefile for the BUTTERFLY storage driver for the Squid Object Cache server +# +# $Id$ +# + +FS = butterfly + +prefix = @prefix@ +exec_prefix = @exec_prefix@ +exec_suffix = @exec_suffix@ +cgi_suffix = @cgi_suffix@ +top_srcdir = @top_srcdir@ +bindir = @bindir@ +libexecdir = @libexecdir@ +sysconfdir = @sysconfdir@ +localstatedir = @localstatedir@ +srcdir = @srcdir@ +VPATH = @srcdir@ + +CC = @CC@ +MAKEDEPEND = @MAKEDEPEND@ +AR_R = @AR_R@ +RANLIB = @RANLIB@ +AC_CFLAGS = @CFLAGS@ +SHELL = /bin/sh +LDFLAGS = @LDFLAGS@ +INSTALL = @INSTALL@ +INSTALL_BIN = @INSTALL_PROGRAM@ +MV = @MV@ +RM = @RM@ + +INCLUDE = -I../../../include -I$(top_srcdir)/include -I$(top_srcdir)/src/ +# sorry. Adjust to where your linux+reiserfs_raw+KAIO is. +INCLUDE += -I../../../..//linux-reiserfs-raw/include +CFLAGS = $(AC_CFLAGS) $(INCLUDE) $(DEFINES) + +OUT = ../$(FS).a + +OBJS = \ + store_dir_bf.o \ + store_io_bf.o \ + aiolib.o \ + +all: $(OUT) + +$(OUT): $(OBJS) + @rm -f ../stamp + $(AR_R) $(OUT) $(OBJS) + $(RANLIB) $(OUT) + +$(OBJS): $(top_srcdir)/include/version.h ../../../include/autoconf.h + +.c.o: + @rm -f ../stamp + $(CC) -DSQUID_PREFIX=\"$(prefix)\" $(CFLAGS) -c $< + +clean: + -rm -rf *.o *pure_* core ../$(FS).a + +distclean: clean + -rm -f Makefile + -rm -f Makefile.bak + -rm -f tags + +tags: + ctags *.[ch] $(top_srcdir)/src/*.[ch] $(top_srcdir)/include/*.h $(top_srcdir)/lib/*.[ch] +TAGS: + etags *.h $(top_srcdir)/src/*.h $(top_srcdir)/include/*.h $(top_srcdir)/lib/*.h \ + *.c $(top_srcdir)/src/*.c $(top_srcdir)/lib/*.c + +depend: + $(MAKEDEPEND) $(INCLUDE) -fMakefile *.c + +install: + @true Index: squid/src/fs/butterfly/README.aiolib diff -u /dev/null squid/src/fs/butterfly/README.aiolib:1.1.2.1 --- /dev/null Tue Sep 28 18:39:07 2004 +++ squid/src/fs/butterfly/README.aiolib Sun Dec 17 05:55:40 2000 @@ -0,0 +1,134 @@ +/* + * + * libdba: Facilities which accelerate database performance. + * Included are POSIX Asynchronous I/O, + * Post-wait synchronization, + * and a fast gettimeofday(). + * + * Copyright 1999, Silicon Graphics, Inc. + * + * Written October 1999 by Rajagopal Ananthanarayanan (ananth) + * at Silicon Graphics, Inc. + */ + +Asynchronous I/O Facility +------------------------- + +Overview +-------- + +The asynchronous I/O (AIO) facility implements interfaces defined by the +POSIX standard. The AIO facility implemented in the SGI Linux Environment +1.1 differs from glibc implementation. The glibc version realizes +asynchrony by employing slave threads to process the I/O requests. This +implies that only as many I/O requests as the number of slaves are truly +asynchronous at the device, since the slaves use the blocking system calls +to service the I/O: when a slave thread is processing an I/O request, +it is blocked in that I/O system call. 
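The calling pattern is the same for either implementation; a minimal sketch using the portable glibc <aio.h> (link with -lrt; this illustrates the generic POSIX interface, not KAIO's headers):

    #include <aio.h>
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int
    main(void)
    {
        static char buf[4096];
        struct aiocb cb;
        int fd = open("/etc/hosts", O_RDONLY);

        if (fd < 0)
            return 1;
        memset(&cb, 0, sizeof(cb));
        cb.aio_fildes = fd;
        cb.aio_buf = buf;
        cb.aio_nbytes = sizeof(buf);
        cb.aio_offset = 0;
        if (aio_read(&cb) < 0)          /* queue the read; returns immediately */
            return 1;
        while (aio_error(&cb) == EINPROGRESS)
            usleep(1000);               /* poll; aio_suspend() would block instead */
        printf("read %ld bytes\n", (long) aio_return(&cb));
        close(fd);
        return 0;
    }
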
+ +The SGI AIO version is mostly implemented in the Linux kernel. To +distinguish it from the glibc version, we will use the term KAIO. In KAIO, +when possible, I/O requests are implemented as split-phase I/O requests. +With split-phase I/O, the initiating request (such as an aio_read) truly +queues the I/O at the device as the first phase of the I/O request; a +second phase of the I/O request, performed as part of the I/O completion, +propagates results of the request. The results may include the contents +of the I/O buffer on a read, the number of bytes read or written, and +any error status. Thus, with split-phase I/O as much asynchrony as the +device can support is actually achieved. However, not all file systems or +devices readily support split-phase I/O. Currently, KAIO only supports +split-phase I/O for file systems that employ generic_file_read() as +their read routine (several file systems in the current Linux kernel use +generic_file_read, including popular ext2), and for all Character Disk +Devices (Raw) which is also provided as part of SGI Linux Environment 1.1. +For requests which are not split-phase, KAIO employs slave threads, +similar to the glibc implementation. + + +Using KAIO +---------- + +KAIO implements POSIX Asynchronous I/O interfaces, although formal +compliance testing and branding have not been attempted yet. All interfaces, +such as aio_read(), aio_write(), aio_suspend(), aio_cancel(), aio_error(), +aio_return(), and lio_listio() are supported. Further, all interfaces +can use 64-bit offsets allowing greater than 2GB offsets, as discussed below. + +To use KAIO as opposed to glibc AIO, one has to include <linux/aio.h>, +since glibc already defines a <aio.h>. Further, libdba.so has to be +linked into the executable; the sources for libdba can be found in +/usr/src/linux/lib/libdba. Finally, libdba.so needs to be installed +under /lib on the machine where the executable is to run. KAIO and +glibc AIO cannot be intermixed; including <aio.h> and <linux/aio.h> +would cause compilation errors, since both define, for instance, aiocb, +the AIO control block. + +The kernel support for KAIO is enabled using the CONFIG_AIO option, +as part of "General Setup". Further, CONFIG_AIO_MAX allows the maximum +number of outstanding I/Os to be configurable, although the current +default of 4096 should be sufficient for most environments. + +KAIO is designed to work with POSIX Threads (pthreads), or any programs +that employ clone threads, as long as the clones are created with the +CLONE_VM option. + +Finally, the number of slave threads can be changed using the environment +variable AIO_NTHREADS. By default 4 threads (AIO_DEFAULT_THREADS in +<linux/aio.h>) are created. All slaves have the prefix "kaiod-XXXXX" as +their name in top(1) or pstree(1), where "XXXXX" is the pid of the +parent which created the slaves; see BUGS about ps(1). + + +Enabling 64-bit offsets +----------------------- + +User code can turn on 64-bit offsets in aiocb by: + + #define _FILE_OFFSET_BITS 64 + #include + #include +which will cause all aio_*() interfaces to operate with 64-bit offsets. +More precisely, aio_offset will be a 64-bit quantity (loff_t). + +Further, if _LARGEFILE64_SOURCE is defined in the above, it will enable +LFS style explicit aio_*64() and lio_listio64() interfaces; these will +accept aiocb64_t as their argument. If only _LARGEFILE64_SOURCE is +defined, then aio_*() and lio_listio() will use 32-bit offsets. 
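For example, a read at an offset beyond 2GB could go through the explicit LFS-style interface; a sketch only, assuming the KAIO <linux/aio.h> header and the aio_*64() entry points described above:

    #define _LARGEFILE64_SOURCE
    #include <string.h>
    #include <linux/aio.h>      /* KAIO header, as discussed above */

    static int
    read_at_large_offset(int fd, char *buf, size_t len, loff_t off)
    {
        aiocb64_t cb;           /* 64-bit AIO control block */

        memset(&cb, 0, sizeof(cb));
        cb.aio_fildes = fd;
        cb.aio_buf = buf;
        cb.aio_nbytes = len;
        cb.aio_offset = off;    /* loff_t: may exceed 2GB */
        return aio_read64(&cb); /* explicit LFS-style interface */
    }
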
+ +Note that if you include other system header files before including it +as above, those headers may include , which will turn the +later inclusion of into a null inclusion. This will lead +to the incorrect AIO interfaces. To avoid this, one possibility is +to define either or both of _FILE_OFFSET_BITS and _LARGEFILE64_SOURCE +before including any system header file. + +As an alternate way of turning on 64-bit capability, AIO_FILE_OFFSET64 +enables 64-bit offsets, and AIO_LARGEFILE64 enables LFS interfaces. +While this scheme does not involve and its associated +include-order problem, these definitions are understood only by +KAIO code. + +The kernel always uses 64-bit offset values. If user code does not +enable 64-bit offsets, the library (libdba.so) will suitably zero +out the padding. + +Known Bugs +---------- + +ps(1) shows the slave threads with the name of the parent, +although top(1) and pstree(1) show the name correctly. + +Resources +--------- + +(1) Where to find AIO man pages? + + http://www.opengroup.org/public/pubs/online/7908799/xsh/aio.h.html + +Note: aio_fsync() is not required by POSIX, and currently +not implemented in KAIO. + +(2) Where to find LFS specifications? + + http://ftp.sas.com/standards/large.file/ + Index: squid/src/fs/butterfly/aiolib.c diff -u /dev/null squid/src/fs/butterfly/aiolib.c:1.1.2.1 --- /dev/null Tue Sep 28 18:39:07 2004 +++ squid/src/fs/butterfly/aiolib.c Sun Dec 17 05:55:40 2000 @@ -0,0 +1,294 @@ +/* + * Copyright 1999, Silicon Graphics, Inc. + * + * Written October 1999 by Rajagopal Ananthanarayanan (ananth) + * at Silicon Graphics, Inc. + * + * According to http://oss.sgi.com/projects/kaio/license.html, + * KAIO code is covered by the same license as the linux kernel (GPL) + * + */ +#define AIO_LARGEFILE64 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "squid.h" + + +_syscall5(int, aio, + int, cmd, + unsigned long, arg1, + unsigned long, arg2, + unsigned long, arg3, + unsigned long, arg4); + +static int aio_cpid[AIO_MAX_THREADS]; + + +int +child(void *arg) +{ + /* child */ + fprintf (stderr, "entering child %d...\n", getpid()); + sigblock(-1); /* block all signals */ + aio(AIOCMD_SLAVE, 0, 0, 0, 0); + fprintf (stderr, "child %d exiting: %d\n", getpid(), errno); + exit(-1); +} + + +static void +aio_init(int nslaves) +{ + int i; + char *cstack; + + if (nslaves == 0) { + char *p; + + /* get it from the environment */ + if ((p = getenv(AIO_THREAD_ENVIRON))) + nslaves = strtol(p, 0, 0); + else + nslaves = AIO_DEFAULT_THREADS; + } + + if (nslaves <= 0 || nslaves > AIO_MAX_THREADS) { + fprintf(stderr, "aio_init: Warning! nthreads %d not in " + "valid range 1 .. 
%d (see AIO_MAX_THREADS)\n", + nslaves, AIO_MAX_THREADS); + fprintf(stderr, "aio_init: defaulting to %d threads " + "(see AIO_DEFAULT_THREADS)\n", + AIO_DEFAULT_THREADS); + nslaves = AIO_DEFAULT_THREADS; + } + + for (i = 0; i < nslaves; i++) { + cstack = (char *) xmalloc(64 * 1024); + cstack += (32 * 1024); + cstack = (char *)((unsigned long)cstack & 0xfffff000); + + /* + aio_cpid[i] = + aio(AIOCMD_SLAVE, 0, 0, 0, 0); + */ + aio_cpid[i] = clone(child, cstack, + SIGCHLD|CLONE_VM|CLONE_FS|CLONE_FILES, 0); + if (aio_cpid[i] < 0) { + perror("clone"); + exit(-1); + } + } +} + +int +aio_read64(aiocb64_t *aiocb) +{ + int ret; + + ret = aio(AIOCMD_READ, (unsigned long)aiocb, 0, 0, 0); + if (ret < 0 && errno == ENOTCONN) { + aio_init(0); + ret = aio(AIOCMD_READ, (unsigned long)aiocb, 0, 0, 0); + } + return ret; +} + +int +aio_write64(aiocb64_t *aiocb) +{ + int ret; + + ret = aio(AIOCMD_WRITE, (unsigned long)aiocb, 0, 0, 0); + if (ret < 0 && errno == ENOTCONN) { + aio_init(0); + ret = aio(AIOCMD_WRITE, (unsigned long)aiocb, 0, 0, 0); + } + return ret; +} + + +int +aio_cancel64(int fildes, aiocb64_t *aiocb) +{ + int ret; + + ret = aio(AIOCMD_CANCEL, (unsigned long)fildes, + (unsigned long)aiocb, 0, 0); + if (ret < 0 && errno == ENOTCONN) { + aio_init(0); + ret = aio(AIOCMD_CANCEL, (unsigned long)fildes, + (unsigned long)aiocb, 0, 0); + } + return ret; +} + +int +aio_suspend64(const aiocb64_t *const list[], int nent, const struct timespec *timeout) +{ + int ret; + + ret = aio(AIOCMD_SUSPEND, (unsigned long)list, (unsigned long)nent, + (unsigned long)timeout, 0); + if (ret < 0 && errno == ENOTCONN) { + aio_init(0); + ret = aio(AIOCMD_SUSPEND, (unsigned long)list, + (unsigned long)nent, + (unsigned long)timeout, 0); + } + return ret; +} + +ssize_t +aio_return64(aiocb64_t *aiocb) +{ + return(aiocb->aio_reserved[3]); +} + +int +lio_listio64(int mode, aiocb64_t *const list[], int nent, sigevent_t *sig) +{ + int ret; + + ret = aio(AIOCMD_LIST_IO, + (unsigned long)mode, (unsigned long)list, + (unsigned long)nent, (unsigned long)sig); + if (ret < 0 && errno == ENOTCONN) { + aio_init(0); + ret = aio(AIOCMD_LIST_IO, + (unsigned long)mode, (unsigned long)list, + (unsigned long)nent, (unsigned long)sig); + } + return ret; +} + +int +aio_ioctl64(aiocb64_t *aiocb) +{ + int ret; + + ret = aio(AIOCMD_IOCTL, (unsigned long)aiocb, 0, 0, 0); + if (ret < 0 && errno == ENOTCONN) { + aio_init(0); + ret = aio(AIOCMD_IOCTL, (unsigned long)aiocb, 0, 0, 0); + } + return ret; +} + +int +aio_close64(aiocb64_t *aiocb) +{ + int ret; + + ret = aio(AIOCMD_CLOSE, (unsigned long)aiocb, 0, 0, 0); + if (ret < 0 && errno == ENOTCONN) { + aio_init(0); + ret = aio(AIOCMD_CLOSE, (unsigned long)aiocb, 0, 0, 0); + } + return ret; +} + + +#ifndef __USE_FILE_OFFSET64 +#define PAD_AIOCB(aiocb) *(unsigned int *)&((aiocb)->__aio_pad) = 0; +#else +#define PAD_AIOCB(aiocb) +#endif + +int +aio_read(aiocb_t *aiocb) +{ + PAD_AIOCB(aiocb); + return(aio_read64((aiocb64_t *)aiocb)); +} + +int +aio_write(aiocb_t *aiocb) +{ + PAD_AIOCB(aiocb); + return(aio_write64((aiocb64_t *)aiocb)); +} + + +int +aio_cancel(int fildes, aiocb_t *aiocb) +{ + if (aiocb) + PAD_AIOCB(aiocb); + return(aio_cancel64(fildes, (aiocb64_t *)aiocb)); +} + + +typedef const aiocb64_t * const *CLISTC; +typedef aiocb64_t * const *LISTC; + +int +aio_suspend(const aiocb_t *const list[], int nent, const struct timespec *timeout) +{ +#ifndef __USE_FILE_OFFSET64 + int i; + + for (i = 0; i < nent; i++) + if (list[i]) + PAD_AIOCB(list[i]); +#endif + return(aio_suspend64((CLISTC)list, nent, 
timeout)); +} + +ssize_t +aio_return(aiocb_t *aiocb) +{ + return aio_return64((aiocb64_t *)aiocb); +} + +int +lio_listio(int mode, aiocb_t *const list[], int nent, sigevent_t *sig) +{ +#ifndef __USE_FILE_OFFSET64 + int i; + + for (i = 0; i < nent; i++) + if (list[i]) + PAD_AIOCB(list[i]); +#endif + return(lio_listio64(mode, (LISTC)list, nent, sig)); +} + +int +aio_ioctl(aiocb_t *aiocb) +{ + if (aiocb) + PAD_AIOCB(aiocb); + return(aio_ioctl64((aiocb64_t *)aiocb)); +} + +int +aio_close(aiocb_t *aiocb) +{ + if (aiocb) + PAD_AIOCB(aiocb); + return(aio_close64((aiocb64_t *)aiocb)); +} + +void +aio_pcinvalidate(int fd) +{ + if (aio(AIOCMD_PCINVALIDATE, fd, 0, 0, 0)) { + if (errno == ENOTCONN) { + aio_init(0); + if (aio(AIOCMD_PCINVALIDATE, fd, 0, 0, 0) == 0) + return; + } + perror("AIOCMD_PCINVALIDATE"); + exit(-1); + } +} Index: squid/src/fs/butterfly/store_bf.h diff -u /dev/null squid/src/fs/butterfly/store_bf.h:1.1.2.2 --- /dev/null Tue Sep 28 18:39:07 2004 +++ squid/src/fs/butterfly/store_bf.h Fri Dec 29 01:10:46 2000 @@ -0,0 +1,143 @@ +/* + * store_bf.h + * + * Internal declarations for the Butterfly cache_dir routines + * + * $Id$ + */ + +#ifndef __STORE_BF_H__ +#define __STORE_BF_H__ + +#include +#include + + +typedef struct reiserfs_raw_ioctl_arg reiserfs_raw_ioctl_arg; +typedef struct aiocb aiocb; + +typedef struct _bf_stats bf_stats_t; +typedef struct _bfinfo_t bfinfo_t; +typedef struct _bfop_t bfop_t; +typedef struct _bfstate_t bfstate_t; + + +/* When a read/write/close request comes for an sio which has no fd + available yet, we cannot fire up an aio_* operation for it + immediately. Instead, we queue the requests and process them after + open finishes. The following structure is the element of the + queues. */ + +enum bf_type { + BFNONE, BFOPEN, BFCREAT, BFREAD, BFWRITE, + BFCLOSE, BFUNLINK, BFSETATTR, + BFMAX, }; +#define BFOP_NAMES {"none","open","create","read","write", \ + "close","unlink","setattr"} + +struct _bfop_t { + bfop_t *next; /* efficient dlist -- don't need xcalloc */ + bfop_t *prev; + bfop_t *busychain; + enum bf_type type; + storeIOState *sio; /* our owner */ + aiocb aio; + union { + STRCB *read; + FREE *write; /* free_func */ + } callback; + void *callback_data; /* only for callback.read */ + off_t rlen; /* real read/write length, syscall return */ + int starttime; /* start timestamp, ms */ + unsigned int started:1; /* 0 until launched */ + unsigned int finished:1; /* 0 until landed */ +}; + +struct _bfinfo_t { + int dir; /* fd of sd->path for reiserfs_raw ioctls */ + link_list *kicklater; /* list of bfstates that got EAGAIN */ + int refuse_swapout; /* storeBfCreate returns NULL if more + than this many ops away */ +}; + +struct _bfstate_t { + int fd; /* file fd, -1 if not opened yet */ + reiserfs_raw_ioctl_arg ra; /* single ra shared by all parallel ops */ + bfop_t ops; /* op used for BF{OPEN,CREAT,UNLINK,SETATTR} + and as queue head for BF{READ,WRITE} */ + int errflag; /* error that caused abortion of the sio. + Normally zero and no abort. 
*/ + unsigned int close_request:1; /* close when q is empty */ + unsigned int close_started:1; /* BF_CLOSE launched already */ + unsigned int aborting:1; /* close ASAP (flush not run the q) */ + unsigned int unlink_attempted:1; /* already tried to unlink the file */ +}; + +struct _bf_stats { + unsigned eagain; + unsigned away; + unsigned max_away; + unsigned abs_max_away; + unsigned swapout_refused; + struct bf_stats_counters { + unsigned ops; + unsigned success; + unsigned fail; + unsigned max_dura; /* maximal op duration */ + float avg_dura; /* average op duration */ + float avg_dura100; /* averege op duration in last 100 ops */ + } counters[BFMAX]; +}; + +/* FIFO lists of launched ops. We keep different ops in different + lists because op delays and urgency are different */ +struct bf_busyops { + bfop_t *head; /* FIFO list of launched BF of certain type */ + bfop_t **tail; /* tail of the FIFO list */ +}; + +extern MemPool *bf_state_pool; +extern MemPool *bf_op_pool; +extern bf_stats_t bf_stats; +extern struct bf_busyops bf_busyops[BFMAX]; +extern char *op_names; +extern FILE *bf_opstats; /* bulky binary op stats file */ +extern char *bf_opstats_fn; + + +/* from store_io_bf.c */ +extern int storeBfKickQueue (bfstate_t *bfstate); +extern void storeBfOpCompletion (bfop_t *op); + +/* from store_dir_bf.c */ +extern void storeBfDirTakeOffline(SwapDir *sd); +extern void storeBfDirUnrefObj(SwapDir * SD, StoreEntry * e); +extern void storeBfDirReplAdd(SwapDir * SD, StoreEntry * e); +extern void storeBfDirReplRemove(StoreEntry * e); +extern void storeBfDirFillRa(StoreEntry *e, reiserfs_raw_ioctl_arg *ra); + +/* + * Store IO stuff + */ +extern STOBJCREATE storeBfCreate; +extern STOBJOPEN storeBfOpen; +extern STOBJCLOSE storeBfClose; +extern STOBJREAD storeBfRead; +extern STOBJWRITE storeBfWrite; +extern STOBJUNLINK storeBfUnlink; + + +/* Get microsecond timestamp for profiling. Almost verbatim from + linux/mm/filemap.c (KAIO) */ + +static inline unsigned long +kaio_time(void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000000 + tv.tv_usec); +} + + +#endif /* __STORE_BF_H__ */ Index: squid/src/fs/butterfly/store_dir_bf.c diff -u /dev/null squid/src/fs/butterfly/store_dir_bf.c:1.1.2.3 --- /dev/null Tue Sep 28 18:39:07 2004 +++ squid/src/fs/butterfly/store_dir_bf.c Fri Dec 29 01:10:46 2000 @@ -0,0 +1,1064 @@ + +/* + * $Id$ + * + * DEBUG: section 86 Store Directory, the "Butterfly" version + * AUTHOR: Yury Shevchuk + * + * SQUID Internet Object Cache http://squid.nlanr.net/Squid/ + * ---------------------------------------------------------- + * + * Squid is the result of efforts by numerous individuals from the + * Internet community. Development is led by Duane Wessels of the + * National Laboratory for Applied Network Research and funded by the + * National Science Foundation. Squid is Copyrighted (C) 1998 by + * the Regents of the University of California. Please see the + * COPYRIGHT file for full details. Squid incorporates software + * developed and/or copyrighted by other sources. Please see the + * CREDITS file for full details. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA. + * + */ + +#include "squid.h" +#include + +#if HAVE_STATVFS +#if HAVE_SYS_STATVFS_H +#include <sys/statvfs.h> +#endif +#endif + +#include "store_bf.h" + +bf_stats_t bf_stats; + +typedef struct _RebuildState RebuildState; +struct _RebuildState { + SwapDir *sd; + struct _store_rebuild_data counts; + int clean; +}; + +int n_bf_dirs = 0; +int n_bf_dirs_online = 0; +static int *bf_dir_index = NULL; +static int *bf_dir_online_index = NULL; +MemPool *bf_state_pool = NULL; +MemPool *bf_op_pool = NULL; +static int bf_initialised = 0; +static int bf_nthreads = 0; +struct bf_busyops bf_busyops[BFMAX]; +char *bfop_names[] = BFOP_NAMES; +FILE *bf_opstats; /* bulky binary op stats file */ +char *bf_opstats_fn; + +static int storeBfDirCreateDirectory(const char *path, + int should_exist); +static void storeBfDirReindex(); +static int storeBfDirVerifyDirectory(const char *path); +static int storeBfDirVerifyCacheDir(SwapDir * sd); +static void storeBfDirOpenSwapLog(SwapDir * sd); +static void storeBfDirCloseSwapLog(SwapDir * sd); +static void storeBfDirInit(SwapDir * sd); +static void storeBfStats(StoreEntry * sentry); +static void storeBfDirSync(SwapDir * SD); +static int storeBfDirCallback(SwapDir * SD); +static void storeBfRelease(StoreEntry *e); +static SwapDir *storeBfReselectDir (const StoreEntry *e); +static void storeBfDirRebuild(SwapDir * sd); +static int storeBfDirWriteCleanStart(SwapDir * sd); +static const StoreEntry *storeBfDirCleanLogNextEntry(SwapDir * sd); +static void storeBfDirWriteCleanDone(SwapDir * sd); +static void storeBfDirSwapLog(const SwapDir * sd, const StoreEntry * e, int op); +static void storeBfDirNewfs(SwapDir * sd); +static int storeBfDirIs(SwapDir * sd); +static void storeBfDirMaintain(SwapDir * SD); +static int storeBfSelectSwapDir(const StoreEntry *); +static void storeBfDirRefObj(SwapDir * SD, StoreEntry * e); +static void storeBfDirStats(SwapDir * SD, StoreEntry * sentry); +static void storeBfDirReconfigure(SwapDir * sd, int index, char *path); +static void storeBfDirDump(StoreEntry * entry, const char *name, SwapDir * s); +static void storeBfDirFree(SwapDir * s); +static int storeBfCleanupDoubleCheck(SwapDir * sd, StoreEntry * e); +static void storeBfDirParse(SwapDir * sd, int index, char *path); + + +int +sanitycheck() +{ + int t; + unsigned true_away = 0; + + for (t = BFNONE+1; t < BFMAX; t++) { + bfop_t *op = bf_busyops[t].head; + bfop_t **true_tail = &bf_busyops[t].head; + while (op) { + true_away++; + true_tail = &op->busychain; + op = op->busychain; + } + if (bf_busyops[t].tail != true_tail) + debug(86, 1) ("sanitycheck: bf_busyops[t].tail != true_tail!\n"); + } + if (bf_stats.away != true_away) + debug(86, 1) ("sanitycheck: bf_stats.away(%d) != true_away(%d)\n", + bf_stats.away, true_away); + return 0; +} + +static int +storeBfDirCreateDirectory(const char *path, int should_exist) +{ + /* our directories are root dirs of reiserfs_raw fs's. Never + create anything. */ + if (storeBfDirVerifyDirectory (path) != 0) + fatalf("Swap directory %s needs manual intervention\n", path); + return 0; +} + + +/* Switch @sd to offline state (after disk failure is detected). 
+ Offline state means don't use this directory temporarily, but + be ready to turn it back on-line, with minimal hit ratio loss. */ + +void +storeBfDirTakeOffline(SwapDir *sd) +{ + if (! sd->flags.offline) { + sd->flags.offline = 1; + n_storetree_dirs--; + assert(n_storetree_dirs >= 0); + storeBfDirReindex (); + } +} + +/* We maintain two arrays of SwapDir indices, subsets of + Config.cacheSwap.swapDirs: bf_dir_index contains all butterfly + dirs, and bf_dir_online_index contains all butterfly dirs currently + on-line. These are used by storeBfSelectSwapDir. This function + builds these arrays; it should be called every time the dirs are + reconfigured or online status changed */ + +static void +storeBfDirReindex () +{ + int dirn; + + safe_free (bf_dir_index); + safe_free (bf_dir_online_index); + + n_bf_dirs = 0; + for (dirn = 0; dirn < Config.cacheSwap.n_configured; dirn++) { + SwapDir *sd = INDEXSD(dirn); + if (storeBfDirIs(sd)) + n_bf_dirs++; + } + + if (n_bf_dirs == 0) + return; + + bf_dir_index = xcalloc(n_bf_dirs, sizeof(*bf_dir_index)); + bf_dir_online_index = xcalloc(n_bf_dirs, sizeof(*bf_dir_online_index)); + + n_bf_dirs = 0; + n_bf_dirs_online = 0; + + for (dirn = 0; dirn < Config.cacheSwap.n_configured; dirn++) { + SwapDir *sd = INDEXSD(dirn); + if (storeBfDirIs(sd)) { + bf_dir_index[n_bf_dirs++] = dirn; + if (! sd->flags.offline) + bf_dir_online_index[n_bf_dirs_online++] = dirn; + } + } +} + +static int +storeBfDirVerifyDirectory(const char *path) +{ + struct stat sb; + reiserfs_raw_ioctl_arg ra; + int fd, fd2; + + if (stat(path, &sb) < 0) { + debug(20, 0) ("%s: %s\n", path, xstrerror()); + return -1; + } + if (S_ISDIR(sb.st_mode) == 0) { + debug(20, 0) ("%s is not a directory\n", path); + return -1; + } + fd = open (path, O_RDONLY); + if (fd < 0) { + debug(20, 0) ("%s is unreadable for uid %u, gid %u\n", + path, getuid(), getgid()); + return -1; + } + ra.id1 = ra.id2 = 7; + fd2 = ioctl (fd, REISERFS_IOC_RAWOPEN, &ra); + if (fd2 == -1 && errno != ENOENT) { + debug(20, 0) ("%s is not a functional REISERFS-RAW directory\n", path); + close(fd); + return -2; + } + close(fd2); + close(fd); + return 0; +} + +/* + * This function is called by storeBfDirInit(). If this returns < 0, + * then Squid exits, complains about swap directories not + * existing, and instructs the admin to run 'squid -z' + */ +static int +storeBfDirVerifyCacheDir(SwapDir * sd) +{ + const char *path = sd->path; + int result = storeBfDirVerifyDirectory(path); + + if (result == -2) { + /* The directory does exist, but it looks like the drive that + the dir is on is offline. Don't panic, just set the flag + to avoid using this directory. */ + sd->flags.offline = 1; + result = 0; + } + return result; +} + +static void +storeBfDirOpenSwapLog(SwapDir * sd) +{ + bfinfo_t *bfi = sd->fsdata; + static const char *errmsg = + "\tFailed to verify one of the swap directories, Check cache.log\n" + "\tfor details.\n"; + + sd->flags.offline = 0; + if (storeBfDirVerifyCacheDir(sd) < 0) + fatal(errmsg); + if (! sd->flags.offline) { + bfi->dir = open (sd->path, O_RDONLY); + if (bfi->dir < 0) { + debug(50, 0) ("storeBfInit: open(%s): %s\n", sd->path, xstrerror()); + sd->flags.offline = 1; + } + else { + char note[FD_DESC_SZ]; + snprintf (note, FD_DESC_SZ, "Butterfly dir fd %s", sd->path); + fd_open(bfi->dir, FD_UNKNOWN, note); + n_storetree_dirs++; + } + } + storeBfDirReindex (); +} + +static void +storeBfDirCloseSwapLog(SwapDir * sd) +{ + bfinfo_t *bfi = sd->fsdata; + fde *F = &fd_table[bfi->dir]; + + /* + * Careful. 
If there was disk failure and the admin used + * umount_-f, our fd is closed behind our back, and then possibly + * reused for other purposes. We shouldn't close the fd if this + * is the case. + */ + if (memcmp (F->desc, "Butterfly dir fd", 16) == 0) { + fd_close (bfi->dir); + close (bfi->dir); + } + bfi->dir = -1; + + if (! sd->flags.offline) + n_storetree_dirs--; + assert(n_storetree_dirs >= 0); + + storeBfDirReindex (); +} + +static void +storeBfDirInit(SwapDir * sd) +{ + if (!getenv(AIO_THREAD_ENVIRON)) { + static char aio_nthreads[30]; + snprintf(aio_nthreads, sizeof(aio_nthreads), "%d", bf_nthreads); + setenv(AIO_THREAD_ENVIRON, aio_nthreads, 0); + } + /* We are doing polled I/O, don't wait too long in comm_poll */ + comm_quick_poll_required(); + storeBfDirOpenSwapLog(sd); + storeBfDirRebuild(sd); +} + + +static void +storeBfStats(StoreEntry * sentry) +{ + int t; + + if (bf_stats.abs_max_away < bf_stats.max_away) + bf_stats.abs_max_away = bf_stats.max_away; + storeAppendPrintf(sentry, "AIO ops in flight: now %u max %u absmax %u\n", + bf_stats.away, + bf_stats.max_away, + bf_stats.abs_max_away); + bf_stats.max_away = 0; + + storeAppendPrintf(sentry, "EAGAIN errors: %u\n", bf_stats.eagain); + storeAppendPrintf(sentry, "Swapout refused because of disk overload: %u\n", + bf_stats.swapout_refused); + storeAppendPrintf(sentry, "\n OPS SUCCESS FAIL" + " DURATION, ms: MAX AVG AVG-for-last-100-ops\n"); + + for (t = BFNONE+1; t < BFMAX; t++) { + struct bf_stats_counters *c = &bf_stats.counters[t]; + storeAppendPrintf(sentry, "%7s %7u %7u %7u %5u %5.0f %5.0f\n", + bfop_names[t], + c->ops, c->success, c->fail, + c->max_dura, c->avg_dura, c->avg_dura100); + } + + sanitycheck(); +} + +/* + * storeBfDirSync + * + * Sync any pending data. We just sit around and read the queue + * until the data has finished writing. + */ +static void +storeBfDirSync(SwapDir * sd) +{ + /* later. */ +#if 0 + bfinfo_t *bfi = sd->fsdata; + while (bfi->away > 0) { + debug(86, 1) ("storeBfDirSync: %d messages away\n", bfi->away); + storeBfDirCallback(SD); + } +#endif +} + +/* Write down op duration profiling data */ + +static void +storeBfWriteOpStats (bfop_t *op) +{ + if (bf_opstats_fn && ! bf_opstats) { + bf_opstats = fopen (bf_opstats_fn, "a"); + if (! bf_opstats) { + debug(50, 1) ("WARNING: %s: %s\n", bf_opstats_fn, xstrerror()); + bf_opstats_fn = 0; + } + } + fwrite (&op->type, sizeof (op->type), 1, bf_opstats); + fwrite (&bf_stats.away, sizeof (bf_stats.away), 1, bf_opstats); + fwrite (&op->aio.aio_times, sizeof (op->aio.aio_times), 1, bf_opstats); +} + + +/* + * storeBfDirCallback + * + * This is called regularly from the main select() loop to let us serve + * aio completion requests. + */ +static int +storeBfDirCallback(SwapDir * sd) +{ + bfinfo_t *bfi = sd->fsdata; + storeIOState *sio; + static struct timeval last_call; + int count_completions = 0; + int count_kicks = 0; + int i; + + /* Queue lookup order. First, serve ops that require no further + processing (their processing doesn't result in growth of the + queue). BFOPEN and BFCREAT trigger bursts of new messages, so + they come last. 
BFOPEN before BFCREAT as open is "interactive" + while creat is "batch" (swap-out delay does not matter) */ + static int op_order[] = {BFCLOSE, BFUNLINK, BFREAD, BFWRITE, + BFOPEN, BFCREAT, BFNONE}; + int scan_depth[] = {3, 3, 30, 3, bf_nthreads, 1}; + + /* No point in scanning queues more often than once in 3ms */ + if ((unsigned long)tvSubUsec(last_call, current_time) < 3000) + return 0; + + last_call = current_time; + + if (bf_stats.away > 0) + debug(86,3) ("storeBfDirCallback: away %u\n", bf_stats.away); + + for (i = 0; op_order[i] != BFNONE; i++) { + bfop_t *op; + int t = op_order[i]; + int depth = scan_depth[i]; /* go through this many non-ready ops */ + bfop_t **opp = &bf_busyops[t].head; + + while ((op = *opp)) { + if (aio_error(&op->aio) == EINPROGRESS) { + debug (86,3) ("storeBfDirCallback: %p incomplete, depth %d\n", op, depth); + opp = &op->busychain; + if (--depth <= 0) + break; + } + else { + *opp = op->busychain; + if (! *opp) + bf_busyops[t].tail = opp; + bf_stats.away--; + debug (86,3) ("storeBfDirCallback: %p complete\n", op); + if (bf_opstats_fn) { + op->aio.aio_times[AIOTIME_FINISH] = kaio_time (); + storeBfWriteOpStats (op); + } + storeBfOpCompletion (op); + count_completions++; + } + } + } + + while ((sio = linklistShift (&bfi->kicklater))) { + if (cbdataValid (sio)) { + bfstate_t *bfstate = sio->fsstate; + if (storeBfKickQueue (bfstate) < 0) { + /* EAGAIN again. Further kick attempts will likely + fail too, so stop for now (busy wait processing the + same sio isn't the best we can do). */ + cbdataUnlock (sio); + break; + } + count_kicks++; /* unsuccessful uncounted */ + } + cbdataUnlock (sio); + } + + if (count_completions + count_kicks > 0) + debug (86,3) ("storeBfDirCallback: done %u completions, %u kicks\n", + count_completions, count_kicks); + + return count_completions + count_kicks; +} + + +/* Remove StoreEntry from memory while keeping the disk object. Save + important e->lastref and e->expires info in the file's stat data, + so the expiration process (storeTreeDirClean) can make decisions based + on them without reading the file. + + For reiserfs_raw, we avoid setting attributes now. They have been + set on object creation, and the only change we may need now is to + update the lastref attribute which is currently set to the time of file + creation or last open. (Hm, we must update lastref for hot objects + periodically somewhere, or they might get GCed on the disk.) */ + +static void +storeBfRelease(StoreEntry *e) +{ + assert (e->lock_count == 0); + + if (EBIT_TEST (e->flags, ENTRY_TENTATIVE)) { + /* If we are asked to release a tentative entry, this means it + wasn't successful (no file found for this entry). It is no + longer tentative, it is simply a dummy StoreEntry. 
*/ + debug(86, 3) ("storeBfRelease: rm unsuccesful tentative entry %p\n", e); + goto kill_it; + } + +#define M(BITNUM) (1L<<(BITNUM)) + if ((e->flags &~ M(ENTRY_VALIDATED)) != (M(ENTRY_CACHABLE)|M(ENTRY_DISPATCHED))) { + /* Uncommon, leave in memory */ + debug(86, 3) ("storeBfRelease: leaving in memory entry with uncommon flags %s\n", + storeEntryFlags(e)); + return; + } + + kill_it: + storeSetMemStatus(e, NOT_IN_MEMORY); + destroy_StoreEntry(e); +} + + +/* This is called by store dir selection procedure to let us redirect + the object to one of our fellow bf dirs -- + storeBfSelectSwapDir will say which one using a hash based on + URL */ + +static SwapDir * +storeBfReselectDir (const StoreEntry *e) +{ + return INDEXSD(storeBfSelectSwapDir(e)); +} + + +static void +storeRebuildDummy(void *data) +{ + RebuildState *rb = data; + + /* + * Everything already done in storeBfDirRebuild. But calling + * storeRebuildComplete from there is banned. + */ + store_dirs_rebuilding--; + storeRebuildComplete(&rb->counts); + cbdataFree(rb); +} + +CBDATA_TYPE(RebuildState); + +static void +storeBfDirRebuild(SwapDir * sd) +{ + RebuildState *rb = xcalloc(1, sizeof(*rb)); + EVH *func = storeRebuildDummy; + + CBDATA_INIT_TYPE(RebuildState); + rb = CBDATA_ALLOC(RebuildState, NULL); + rb->sd = sd; + store_dirs_rebuilding++; + eventAdd("storeRebuild", func, rb, 0.0, 0); +} + + +/* + * Begin the process to write clean cache state. For BF this + * is no-op. + */ +static int +storeBfDirWriteCleanStart(SwapDir * sd) +{ + sd->log.clean.write = NULL; + sd->log.clean.state = NULL; + return 0; /* -1 to indicate error */ +} + +/* + * Get the next entry that is a candidate for clean log writing + */ +static const StoreEntry * +storeBfDirCleanLogNextEntry(SwapDir * sd) +{ + return NULL; +} + +static void +storeBfDirWriteCleanDone(SwapDir * sd) +{ + /* sync (wait while diskd finishes) and write down age classes. later */ +} + +static void +storeBfDirSwapLog(const SwapDir * sd, const StoreEntry * e, int op) +{ +} + +static void +storeBfDirNewfs(SwapDir * sd) +{ + debug(47, 3) ("Creating swap space in %s\n", sd->path); + storeBfDirCreateDirectory(sd->path, 0); + /* storeBfDirCreateSwapSubDirs(sd); */ +} + + +/* We use 64bit version of r5 hash to make reiserfs_raw file id. R5 + it chosen for its property of more or less preserving original data + order, which will hopefully be good for seek times. + + The code has been copied from linux/fs/reiserfs/hashes.c, the only + change is s/u32/unsigned long long/g. + + One more change (following the hint from Nikita Danilov + ): the values from original r5 are always + a multiple of 11, which means narrowing the hash space. Fix that + by doing multiplication by 11 *before* adding a char, not after. */ + +static unsigned long long +r5l_hash (const char *msg, int len) +{ + unsigned long long a = 0; + while (*msg) { + a *= 11; + a += *msg << 4; + a += *msg >> 4; + msg++; + } + return a; +} + +static inline unsigned long long +make_id (const char *key, const char *msg, int len) +{ +#if 1 + return r5l_hash (msg, len); +#else + return *(unsigned long long *)(&key[8]); +#endif +} + + + +/* This is important function called from storeGetPublic, + disk-oriented equivalent of storeGet. 
*/ + +StoreEntry * +storeTreeGet (const char *url, const cache_key *key) +{ + StoreEntry *e; + + if (n_storetree_dirs == 0) + return NULL; + if (strncasecmp (url, "cache_object://", sizeof "cache_object://"-1) == 0) + return NULL; + + debug(86, 5) ("StoreTreeGet for %s %s Callers %p %p %p %p\n", url, + storeKeyText (key), + __builtin_return_address(0), + __builtin_return_address(1), + __builtin_return_address(2), + __builtin_return_address(3) + ); + + /* We don't know if the requested object is in the store, and we + don't want to waste time on stat(2) to find out. So we create + a tentative StoreEntry with the requested URL as if it were in the + store, and let upper layers try to open it. They will try to + swap in and if the object is not on the disk, they'll get an open + error and handle the error -- for example, in clientCacheHit + they jump to clientCacheMiss on swapin error. */ + + e = new_StoreEntry (STORE_ENTRY_WITHOUT_MEMOBJ, NULL, NULL); + e->store_status = STORE_OK; + e->mem_status = NOT_IN_MEMORY; + e->swap_status = SWAPOUT_DONE; + e->swap_filen = 73; /* for luck */ + /* would have to read in the object to restore the flags. To + avoid that, we'll teach storeTreeRelease not to purge + StoreEntries with non-common flags (ENTRY_NEGCACHED, for one) */ + e->flags = 0; + EBIT_SET(e->flags, ENTRY_CACHABLE); + EBIT_SET(e->flags, ENTRY_DISPATCHED); + EBIT_SET(e->flags, ENTRY_VALIDATED); + EBIT_SET(e->flags, ENTRY_TENTATIVE); + e->ping_status = PING_NONE; + storeHashInsert (e, key); + + e->id = make_id (key, url, strlen (url)); + e->id_valid = 1; + e->swap_dirn = storeBfSelectSwapDir (e); + + return e; +} + + +/* Restore StoreEntry fields from metadata just read from disk object. + Called from storeClientReadHeader */ +void +storeTreeRestoreMetadata(StoreEntry *e, tlv *t) +{ + SwapDir *sd = INDEXSD(e->swap_dirn); + size_t save_sz = e->swap_file_sz; + + assert(t->type == STORE_META_STD); + assert(t->length == STORE_HDR_METASIZE); + assert(storeBfDirIs (sd)); + xmemcpy(&e->timestamp, t->value, STORE_HDR_METASIZE); + + /* swap_file_sz in the metadata is most likely 0, it is not known + at the point when swap-out starts. Replace with known good + value that we got from RAWOPEN */ + e->swap_file_sz = save_sz; + + /* this will probably be set later, but just in case */ + e->lastref = squid_curtime; +} + + +static int +storeBfDirIs(SwapDir * sd) +{ + return (strcmp(sd->type, "butterfly") == 0); +} + +static void +storeBfDirMaintain(SwapDir * SD) +{ + /* reiserfs-raw has object removal implemented in the kernel. */ + return; +} + +/* + * storeBfDirCheckObj + * + * This routine is called by storeDirSelectSwapDir to see if the given + * object is able to be stored on this filesystem. (Round-robin dir + * selection alg doesn't use this at all). + * + */ +static int +storeBfDirCheckObj(SwapDir * SD, const StoreEntry * e) +{ + if (storeBfSelectSwapDir(e) != SD->index) + return -1; + return 0; +} + + +/* Select the cachedir where to put/where to look + for the given entry. The current implementation is + quick&dirty: assumes all bf dirs are of equal size. + The right way is to weight based on dir sizes, TODO later. + Look at CARP as Henrik suggested? */ + +static int +storeBfSelectSwapDir(const StoreEntry * _e) +{ + StoreEntry *e = (StoreEntry *) _e; + int dirn; + + if (! 
e->id_valid) { + assert (e->mem_obj); + assert (e->mem_obj->url); + e->id = make_id (e->hash.key, e->mem_obj->url, + strlen (e->mem_obj->url)); + e->id_valid = 1; + } + + /* Select one of the butterfly dirs based on a hash value extracted + from the MD5 key of the object */ + + assert (n_bf_dirs > 0); + dirn = bf_dir_index[*(unsigned *)e->hash.key % n_bf_dirs]; + + if (INDEXSD(dirn)->flags.offline) { + /* The normal selection has chosen a dir currently offline. + Reselect the dir from the remaining (online) dirs. This + way, the objects that would normally go to the disk + currently offline are distributed evenly among the + remaining dirs. */ + if (n_bf_dirs_online <= 0) + fatal("All dirs offline, can't handle this yet"); + dirn = bf_dir_online_index[*(unsigned *)e->hash.key % n_bf_dirs_online]; + } + + assert (! INDEXSD(dirn)->flags.offline); + + return dirn; +} + + +/* + * storeBfDirRefObj + * + * This routine is called whenever an object is referenced, so we can + * maintain replacement information within the storage fs. + * + * TODO put reiserfs_raw setattr here! + */ +static void +storeBfDirRefObj(SwapDir *sd, StoreEntry * e) +{ + debug(1, 3) ("storeBfDirRefObj: referencing %p\n", e); +#if 0 + /* TODO: call storeBfSettattr here */ + if (sd->repl->Referenced) + sd->repl->Referenced(sd->repl, e, &e->repl); +#endif +} + +/* + * storeBfDirUnrefObj + * This routine is called whenever the last reference to an object is + * removed, to maintain replacement information within the storage fs. + */ +void +storeBfDirUnrefObj(SwapDir * SD, StoreEntry * e) +{ + debug(1, 3) ("storeBfDirUnrefObj: unreferencing %p\n", e); +#if 0 + if (sd->repl->Dereferenced) + sd->repl->Dereferenced(sd->repl, e, &e->repl); +#endif +} + + +/* + * Add and remove the given StoreEntry from the replacement policy in + * use. + */ + +void +storeBfDirReplAdd(SwapDir * sd, StoreEntry * e) +{ +#if 0 + debug(20, 4) ("storeBfDirReplAdd: added node %p to dir %d\n", e, sd->index); + sd->repl->Add(sd->repl, e, &e->repl); +#endif +} + + +void +storeBfDirReplRemove(StoreEntry * e) +{ +#if 0 + SwapDir *SD; + if (e->swap_dirn < 0) + return; + SD = INDEXSD(e->swap_dirn); + debug(20, 4) ("storeBfDirReplRemove: remove node %p from dir %d\n", e, + SD->index); + SD->repl->Remove(SD->repl, e, &e->repl); +#endif +} + + + +static void +storeBfDirStats(SwapDir *sd, StoreEntry * sentry) +{ + bfinfo_t *bfi; +#if HAVE_STATVFS + struct statvfs sfs; +#endif + bfi = sd->fsdata; +#if HAVE_STATVFS +#define fsbtoblk(num, fsbs, bs) \ + (((fsbs) != 0 && (fsbs) < (bs)) ? \ + (num) / ((bs) / (fsbs)) : (num) * ((fsbs) / (bs))) + if (!statvfs(sd->path, &sfs)) { + storeAppendPrintf(sentry, "Filesystem Space in use: %d/%d KB (%d%%)\n", + fsbtoblk((sfs.f_blocks - sfs.f_bfree), sfs.f_frsize, 1024), + fsbtoblk(sfs.f_blocks, sfs.f_frsize, 1024), + percent(sfs.f_blocks - sfs.f_bfree, sfs.f_blocks)); + } +#endif + + storeAppendPrintf(sentry, "Flags:"); + if (sd->flags.selected) + storeAppendPrintf(sentry, " SELECTED"); + if (sd->flags.read_only) + storeAppendPrintf(sentry, " READ-ONLY"); + if (sd->flags.offline) + storeAppendPrintf(sentry, " OFF-LINE"); + storeAppendPrintf(sentry, "\n"); +} + +/* + * storeBfDirReconfigure + * + * This routine is called when the given swapdir needs reconfiguring + */ +static void +storeBfDirReconfigure(SwapDir * sd, int index, char *path) +{ + char *token; + unsigned int read_only = 0; + char *opstats_fn = NULL; + bfinfo_t *bfi = sd->fsdata; + + /* TODO -- remove RO or make it work. See selectSwapDir. 
+
+    while ((token = strtok(NULL, w_space))) {
+        /* TODO -- remove RO or make it work.  See selectSwapDir. */
+        if (!strcasecmp(token, "read-only"))
+            read_only = 1;
+        else if (!strncasecmp(token, "opstats=", 8))
+            opstats_fn = xstrdup (token+8);
+        else if (!strncasecmp(token, "refuse_swapout=", 15))
+            bfi->refuse_swapout = atoi(token+15);
+    }
+
+    /* just reconfigure it */
+    if (sd->flags.read_only != read_only)
+        debug(3, 1) ("Cache dir '%s' now %s\n",
+            path, read_only ? "Read-Only" : "Read-Write");
+    sd->flags.read_only = read_only;
+
+    if (bf_opstats) {
+        fclose (bf_opstats);
+        bf_opstats = NULL;
+    }
+    if (bf_opstats_fn)
+        xfree (bf_opstats_fn);
+    bf_opstats_fn = opstats_fn;
+
+    return;
+}
+
+static void
+storeBfDirDump(StoreEntry * entry, const char *name, SwapDir * sd)
+{
+    storeAppendPrintf(entry, "%s %s %s\n",
+        name,
+        "butterfly",
+        sd->path);
+}
+
+/*
+ * Only "free" the filesystem specific stuff here
+ */
+static void
+storeBfDirFree(SwapDir * sd)
+{
+    bfinfo_t *bfi = sd->fsdata;
+    xfree(bfi);
+    sd->fsdata = NULL;          /* Will aid debugging... */
+}
+
+void
+storeBfDirFillRa(StoreEntry *e, reiserfs_raw_ioctl_arg *ra)
+{
+    if (! e->id_valid) {
+        assert (e->mem_obj);
+        assert (e->mem_obj->url);
+        e->id = make_id (e->hash.key, e->mem_obj->url, strlen (e->mem_obj->url));
+        e->id_valid = 1;
+    }
+    ra->id1 = (unsigned) e->id;
+    ra->id2 = (unsigned) (e->id >> 32);
+    ra->lastref = e->lastref;
+    ra->expires = ~0;           /* end of Epoch -- don't let the kernel
+                                   remove it for now. */
+    ra->user1 = 0;              /* put headersz here, later */
+}
+
+/*
+ * storeBfCleanupDoubleCheck
+ *
+ * This is called by storeCleanup() if -S was given on the command line.
+ */
+static int
+storeBfCleanupDoubleCheck(SwapDir * sd, StoreEntry * e)
+{
+    /* very good dir, trust me! */
+    return 0;
+}
+
+/*
+ * storeBfDirParse
+ *
+ * Called when a *new* fs is being set up.
+ */
+static void
+storeBfDirParse(SwapDir * sd, int index, char *path)
+{
+    char *token;
+    unsigned int read_only = 0;
+    unsigned int nthreads = 8;
+    bfinfo_t *bfi;
+
+    sd->fsdata = bfi = xcalloc(1, sizeof(*bfi));
+    bfi->refuse_swapout = 1500; /* default */
+
+    while ((token = strtok(NULL, w_space))) {
+        /* TODO -- remove RO or make it work.  See selectSwapDir. */
+        if (!strcasecmp(token, "read-only"))
+            read_only = 1;
+        else if (!strncasecmp(token, "threads=", 8))
+            nthreads = atoi(token+8);
+        else if (!strncasecmp(token, "opstats=", 8)) {
+            if (bf_opstats_fn)
+                debug(86, 1) ("WARNING: %s: already have an opstats file (%s), ignoring\n",
+                    token, bf_opstats_fn);
+            else
+                bf_opstats_fn = xstrdup (token+8);
+        }
+        else if (!strncasecmp(token, "refuse_swapout=", 15))
+            bfi->refuse_swapout = atoi(token+15);
+    }
+
+    sd->index = index;
+    sd->path = xstrdup(path);
+    /* keep the cache_swap >= cache_mem check happy.  Use a constant
+       for now, do it with statvfs later.
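+       (sd->max_size is in KB, as elsewhere in Squid, so the constant
+       below really does advertise about 1 TB.)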
+     */
+    sd->max_size = 1000<<20;    /* 1 TB */
+    sd->low_size = sd->max_size;
+    sd->high_size = sd->max_size;
+    sd->flags.read_only = read_only;
+    sd->init = storeBfDirInit;
+    sd->newfs = storeBfDirNewfs;
+    sd->dump = storeBfDirDump;
+    sd->freefs = storeBfDirFree;
+    sd->dblcheck = storeBfCleanupDoubleCheck;
+    sd->statfs = storeBfDirStats;
+    sd->maintainfs = storeBfDirMaintain;
+    sd->checkobj = storeBfDirCheckObj;
+    sd->refobj = storeBfDirRefObj;
+    sd->unrefobj = storeBfDirUnrefObj;
+    sd->callback = storeBfDirCallback;
+    sd->sync = storeBfDirSync;
+    sd->halfrelease = storeBfRelease;
+    sd->reselectdir = storeBfReselectDir;
+    sd->obj.create = storeBfCreate;
+    sd->obj.open = storeBfOpen;
+    sd->obj.close = storeBfClose;
+    sd->obj.read = storeBfRead;
+    sd->obj.write = storeBfWrite;
+    sd->obj.unlink = storeBfUnlink;
+    sd->log.open = storeBfDirOpenSwapLog;
+    sd->log.close = storeBfDirCloseSwapLog;
+    sd->log.write = storeBfDirSwapLog;
+    sd->log.clean.start = storeBfDirWriteCleanStart;
+    sd->log.clean.nextentry = storeBfDirCleanLogNextEntry;
+    sd->log.clean.done = storeBfDirWriteCleanDone;
+    bf_nthreads += nthreads;
+
+}
+
+/*
+ * Initial setup / end destruction
+ */
+void
+storeBfDirDone(void)
+{
+    memPoolDestroy(bf_state_pool);
+    memPoolDestroy(bf_op_pool);
+    if (bf_opstats)
+        fclose (bf_opstats);
+    if (bf_opstats_fn)
+        xfree(bf_opstats_fn);
+    bf_initialised = 0;
+}
+
+/* Setup routine.  This must be named by its long official name, not
+   just _bf */
+
+void
+storeFsSetup_butterfly (storefs_entry_t * storefs)
+{
+    int t;
+
+    assert(!bf_initialised);
+    storefs->parsefunc = storeBfDirParse;
+    storefs->reconfigurefunc = storeBfDirReconfigure;
+    storefs->donefunc = storeBfDirDone;
+    bf_state_pool = memPoolCreate("BF IO State data", sizeof(bfstate_t));
+    bf_op_pool = memPoolCreate("BF IO operation data", sizeof(bfop_t));
+    memset(&bf_stats, '\0', sizeof(bf_stats));
+    cachemgrRegister("butterfly", "Butterfly Stats", storeBfStats, 0, 1);
+    debug(81, 1) ("Butterfly started\n");
+    for (t = 0; t < BFMAX; t++) {
+        bf_busyops[t].head = NULL;
+        bf_busyops[t].tail = &bf_busyops[t].head;
+    }
+    bf_initialised = 1;
+}
Index: squid/src/fs/butterfly/store_io_bf.c
diff -u /dev/null squid/src/fs/butterfly/store_io_bf.c:1.1.2.3
--- /dev/null	Tue Sep 28 18:39:07 2004
+++ squid/src/fs/butterfly/store_io_bf.c	Fri Dec 29 01:10:46 2000
@@ -0,0 +1,959 @@
+
+/* $Id$
+ *
+ * DEBUG: section 81    Butterfly cache_dir I/O functions.
+ * Written by Yury Shevchuk
+ *
+ * SQUID Internet Object Cache  http://squid.nlanr.net/Squid/
+ * ----------------------------------------------------------
+ *
+ * Squid is the result of efforts by numerous individuals from the
+ * Internet community.  Development is led by Duane Wessels of the
+ * National Laboratory for Applied Network Research and funded by the
+ * National Science Foundation.  Squid is Copyrighted (C) 1998 by
+ * the Regents of the University of California.  Please see the
+ * COPYRIGHT file for full details.  Squid incorporates software
+ * developed and/or copyrighted by other sources.  Please see the
+ * CREDITS file for full details.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
+ *
+ */
+
+/*
+ * Operation outline
+ * ~~~~~~~~~~~~~~~~~
+ *
+ * There are two groups of operations.
+ *
+ * 1) Stateless:
+ *
+ *      storeBfUnlink/Setattr
+ *
+ * These are simple one-shot operations.  They don't have a sio
+ * associated with them.  storeBfUnlink/Setattr launches the required
+ * aio operation, and storeBfOpCompletion cleans up the allocated
+ * control data upon completion.
+ *
+ * 2) Stateful.  The sequence goes like this:
+ *
+ *      storeBfOpen/Creat
+ *      storeBfRead/Write, storeBfRead/Write, ...
+ *      storeBfClose
+ *
+ * storeBfOpen/Create create the sio and the associated bfstate,
+ * launch the open/creat aio, and return (although the open is not
+ * completed yet).  The upper layers can immediately call
+ * storeBfRead/Write, which we cannot launch right away as we don't
+ * have an fd until the Open/Creat completes.  So we queue the
+ * request; there's a single queue for that in struct bfstate.  The
+ * upper layers continue to supply requests using storeBfRead/Write.
+ *
+ * The queue is run by storeBfKickQueue, which is called from
+ * storeBfOpCompletion (and also from storeBfRead/Write upon queueing
+ * another request).
+ *
+ * When storeBfClose is called, issuing the actual close request is
+ * deferred until the queue is empty.
+ *
+ */
+
+#include "config.h"
+#include "squid.h"
+
+#include "store_bf.h"
+
+
+static int storeBfUnlink2(SwapDir *sd, StoreEntry *e, storeIOState *sio);
+static int storeBfLaunch (SwapDir *sd, int (*aiofunc)(aiocb *), bfop_t *op);
+static void storeBfTryLaunchClose (storeIOState *sio);
+static void storeBfCloseLastResort (storeIOState *sio);
+static void storeBfIOCallback(storeIOState * sio, int errflag);
+static void storeBfIOAbort (storeIOState *sio, int op_type, int errflag);
+static void storeBfOpEnqueue (bfstate_t *bfstate, bfop_t *op);
+static void storeBfSweep (bfstate_t *bfstate);
+static void storeBfUpdateOffset (bfstate_t *bfstate);
+static void storeBfOrderKick (bfstate_t *bfstate);
+static char *storeBfOpDump (bfop_t *op);
+static char *storeBfStateDump (bfstate_t *bfstate);
+static void storeBfIOFreeEntry(void *sio);
+
+
+storeIOState *
+storeBfOpen(SwapDir * sd, StoreEntry * e, STFNCB * file_callback,
+            STIOCB * callback, void *callback_data)
+{
+    storeIOState *sio;
+    bfstate_t *bfstate;
+    bfinfo_t *bfi = sd->fsdata;
+    bfop_t *op;
+
+    debug(81, 3) ("storeBfOpen: key %s\n", storeKeyText(e->hash.key));
+
+    sio = CBDATA_ALLOC(storeIOState, storeBfIOFreeEntry);
+    sio->fsstate = bfstate = memPoolAlloc(bf_state_pool);
+    sio->swap_dirn = sd->index;
+    sio->mode = O_RDONLY;
+    sio->callback = callback;
+    sio->callback_data = callback_data;
+    sio->e = e;
+    cbdataLock(callback_data);
+
+    storeBfDirFillRa(e, &bfstate->ra);
+    bfstate->fd = -1;           /* not opened */
+    bfstate->errflag = 0;
+    bfstate->ops.next = bfstate->ops.prev = &bfstate->ops;
+    op = &bfstate->ops;
+    op->type = BFOPEN;
+    op->sio = sio;
+    op->aio.aio_fildes = bfi->dir;
+    op->aio.aio_buf = &bfstate->ra;
+    op->aio.aio_nbytes = REISERFS_IOC_RAWOPEN;
+    op->aio.aio_sigevent.sigev_notify = SIGEV_NONE;
+    op->aio.aio_sigevent.sigev_signo = 0;
+    op->aio.aio_sigevent.sigev_value.sival_ptr = op;
+
+    if (storeBfLaunch (sd, aio_ioctl, op) != 0) {
+        cbdataUnlock(callback_data);
+        cbdataFree(sio);
+        return NULL;
+    }
+    return sio;
+}
+
+
+storeIOState *
+storeBfCreate(SwapDir * sd, StoreEntry * e, STFNCB * file_callback,
+              STIOCB * callback, void *callback_data)
+{
+    storeIOState *sio;
+    bfstate_t *bfstate;
+    bfinfo_t *bfi = sd->fsdata;
+    bfop_t *op;
+
+    debug(81, 3) ("storeBfCreate: key %s flags %s\n",
+        storeKeyText(e->hash.key), storeEntryFlags (e));
+
+    /* Avoid disk subsystem overload.  May lead to losing hits, so
+       don't set the limit too low. */
+    if (bf_stats.away > bfi->refuse_swapout) {
+        bf_stats.swapout_refused++;
+        return NULL;
+    }
+
+    sio = CBDATA_ALLOC(storeIOState, storeBfIOFreeEntry);
+    sio->fsstate = bfstate = memPoolAlloc(bf_state_pool);
+    sio->swap_dirn = sd->index;
+    sio->mode = O_WRONLY | O_CREAT | O_TRUNC;   /* if that meant something :) */
+    sio->callback = callback;
+    sio->callback_data = callback_data;
+    sio->e = e;
+    cbdataLock(callback_data);
+
+    storeBfDirFillRa(e, &bfstate->ra);
+    bfstate->fd = -1;           /* not opened */
+    bfstate->errflag = 0;
+    bfstate->ops.next = bfstate->ops.prev = &bfstate->ops;
+    op = &bfstate->ops;
+    op->type = BFCREAT;
+    op->sio = sio;
+    op->aio.aio_fildes = bfi->dir;
+    op->aio.aio_buf = &bfstate->ra;
+    op->aio.aio_nbytes = REISERFS_IOC_RAWCREAT;
+    op->aio.aio_sigevent.sigev_notify = SIGEV_NONE;
+    op->aio.aio_sigevent.sigev_signo = 0;
+    op->aio.aio_sigevent.sigev_value.sival_ptr = op;
+
+    /* Do not launch anything now; we will launch the creat and the
+       subsequent writes when the close request arrives */
+
+    /* storeBfDirReplAdd(sd, e); */
+    return sio;
+}
+
+
+void
+storeBfClose(SwapDir * sd, storeIOState * sio)
+{
+    bfstate_t *bfstate = sio->fsstate;
+
+    debug(81, 3) ("storeBfClose: dir %d, key %s\n", sd->index,
+        storeKeyText (sio->e->hash.key));
+
+    bfstate->close_request = 1;
+
+    /* Creat and subsequent writes are launched all at once when we
+       have them all queued.  Now.
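+       At this point the queue holds the BFCREAT head op followed by
+       the queued BFWRITEs; launching the creat below starts the
+       chain, and storeBfOpCompletion kicks the queue to issue the
+       writes once the fd is known.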
+     */
+    if (bfstate->ops.type == BFCREAT) {
+        bfop_t *op = &bfstate->ops;
+        if (storeBfLaunch (sd, aio_ioctl, op) != 0) {
+            storeBfIOAbort (op->sio, op->type, errno);
+            storeBfKickQueue (bfstate);
+        }
+        return;
+    }
+
+    storeBfTryLaunchClose (sio);
+}
+
+
+void
+storeBfRead(SwapDir * sd, storeIOState * sio, char *buf, size_t size,
+            off_t offset, STRCB * callback, void *callback_data)
+{
+    bfstate_t *bfstate = sio->fsstate;
+    bfop_t *op;
+
+    op = memPoolAlloc(bf_op_pool);
+    op->type = BFREAD;
+    op->sio = sio;
+    op->callback.read = callback;
+    op->callback_data = callback_data;
+    op->aio.aio_buf = buf;
+    op->aio.aio_nbytes = size;
+    op->aio.aio_offset = offset;
+    op->aio.aio_sigevent.sigev_notify = SIGEV_NONE;
+    op->aio.aio_sigevent.sigev_signo = 0;
+    op->aio.aio_sigevent.sigev_value.sival_ptr = op;
+
+    /* Now that we have remembered it, we don't want it to be freed */
+    cbdataLock(callback_data);
+
+    debug(81, 3) ("storeBfRead: %s", storeBfOpDump(op));
+
+    storeBfOpEnqueue (bfstate, op);
+    storeBfKickQueue (bfstate);
+}
+
+
+void
+storeBfWrite(SwapDir * sd, storeIOState * sio, char *buf, size_t size,
+             off_t offset, FREE * free_func)
+{
+    bfstate_t *bfstate = sio->fsstate;
+    bfop_t *op;
+
+    op = memPoolAlloc(bf_op_pool);
+    op->type = BFWRITE;
+    op->sio = sio;
+    op->callback.write = free_func;
+    op->callback_data = NULL;
+    op->aio.aio_buf = buf;
+    op->aio.aio_nbytes = size;
+    op->aio.aio_offset = offset;
+    op->aio.aio_sigevent.sigev_notify = SIGEV_NONE;
+    op->aio.aio_sigevent.sigev_signo = 0;
+    op->aio.aio_sigevent.sigev_value.sival_ptr = op;
+
+    debug(81, 3) ("storeBfWrite: %s", storeBfOpDump(op));
+
+    storeBfOpEnqueue (bfstate, op);
+    /* storeBfKickQueue (bfstate); tentative: launch bfwrite on close only */
+
+    debug(81, 8) ("storeBfWrite:%d %s", __LINE__, storeBfStateDump (bfstate));
+}
+
+
+void
+storeBfUnlink(SwapDir *sd, StoreEntry *e)
+{
+    debug(81, 3) ("storeBfUnlink: dir %d, key %s\n",
+        sd->index, storeKeyText(e->hash.key));
+    /* storeBfDirReplRemove(e); */
+    storeBfUnlink2 (sd, e, NULL);
+}
+
+static int
+storeBfUnlink2(SwapDir *sd, StoreEntry *e, storeIOState *sio)
+{
+    reiserfs_raw_ioctl_arg *ra;
+    bfinfo_t *bfi = sd->fsdata;
+    bfop_t *op;
+
+    /* For Butterfly, unlink is a rare operation, as cache replacement
+       is carried out in the kernel.  So we can afford an xcalloc here
+       to avoid complicating our data */
+
+    ra = xcalloc (1, sizeof (reiserfs_raw_ioctl_arg));
+    storeBfDirFillRa(e, ra);
+
+    op = memPoolAlloc(bf_op_pool);
+    op->next = op->prev = op;
+    op->type = BFUNLINK;
+    op->sio = sio;
+    op->aio.aio_fildes = bfi->dir;
+    op->aio.aio_buf = ra;
+    op->aio.aio_nbytes = REISERFS_IOC_RAWUNLINK;
+    op->aio.aio_sigevent.sigev_notify = SIGEV_NONE;
+    op->aio.aio_sigevent.sigev_signo = 0;
+    op->aio.aio_sigevent.sigev_value.sival_ptr = op;
+
+    if (storeBfLaunch (sd, aio_ioctl, op) != 0) {
+        if (errno == EAGAIN) {
+            int x = ioctl (bfi->dir, REISERFS_IOC_RAWUNLINK, ra);
+            debug(81, 1) ("storeBfUnlink: resorted to sync unlink, x=%d\n", x);
+        }
+        xfree (ra);
+        memPoolFree(bf_op_pool, op);
+        return -1;
+    }
+
+    /*
+     * Note: we leave op&ra non-freed.  Will get op back in
+     * sigev_value upon completion and free then.
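+     * (The completion path hands the op back through
+     * aio_sigevent.sigev_value.sival_ptr; the BFUNLINK case of
+     * storeBfOpCompletion then frees both the op and the ra.)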
+     */
+
+    return 0;
+}
+
+
+void
+storeBfSetattr(SwapDir *sd, StoreEntry * e)
+{
+    reiserfs_raw_ioctl_arg *ra;
+    bfinfo_t *bfi = sd->fsdata;
+    bfop_t *op;
+
+    debug(81, 3) ("storeBfSetattr: dir %d, key %s\n",
+        sd->index, storeKeyText(e->hash.key));
+
+    ra = xcalloc (1, sizeof (reiserfs_raw_ioctl_arg));
+    storeBfDirFillRa(e, ra);
+
+    op = memPoolAlloc(bf_op_pool);
+    op->next = op->prev = op;
+    op->type = BFSETATTR;
+    op->sio = NULL;
+    op->aio.aio_fildes = bfi->dir;
+    op->aio.aio_buf = ra;
+    op->aio.aio_nbytes = REISERFS_IOC_RAWSETATTR;
+    op->aio.aio_sigevent.sigev_notify = SIGEV_NONE;
+    op->aio.aio_sigevent.sigev_signo = 0;
+    op->aio.aio_sigevent.sigev_value.sival_ptr = op;
+
+    if (storeBfLaunch (sd, aio_ioctl, op) != 0) {
+        /* OK, maybe next time... */
+        xfree (ra);
+        memPoolFree(bf_op_pool, op);
+        return;
+    }
+
+    /*
+     * Note: we leave op&ra non-freed.  Will get op back in
+     * sigev_value upon completion and free then.
+     */
+}
+
+
+/* Wrapper for aio_*; provides debugging convenience and simplifies
+   accounting */
+
+static int
+storeBfLaunch (SwapDir * sd, int (*aiofunc)(aiocb *), bfop_t *op)
+{
+    int x;
+    struct bf_stats_counters *c = &bf_stats.counters[op->type];
+
+    /* sd may be NULL when we are called from storeBfKickQueue */
+    debug(81, 2) ("storeBfLaunch: dir %d %s", sd ? sd->index : -1,
+        storeBfOpDump(op));
+
+    /* Right after going offline, the upper layers can continue making
+       requests to the drive.  Prevent the ops from being actually
+       launched, to decrease the amount of syslog I/O error complaints */
+
+    if (sd && sd->flags.offline)
+        return -1;
+
+    if (bf_opstats)
+        op->aio.aio_times[AIOTIME_REQUEST] = kaio_time ();
+
+    x = aiofunc (&op->aio);
+    c->ops++;
+    if (x < 0) {
+        c->fail++;
+        if (errno == EAGAIN)
+            bf_stats.eagain++;
+        debug(50, 1) ("storeBfLaunch: type %d: %s\n", op->type, xstrerror());
+    }
+    else {
+        op->started = 1;
+        op->starttime = tvMsec (current_time);
+
+        op->busychain = NULL;
+        *bf_busyops[op->type].tail = op;
+        bf_busyops[op->type].tail = &op->busychain;
+
+        if (++bf_stats.away > bf_stats.max_away)
+            bf_stats.max_away = bf_stats.away;
+    }
+
+    return x;
+}
+
+
+/* Start all queued i/o operations that are ready to be launched.  It
+   is OK, and even good, to start multiple reads or writes in
+   parallel.  Returns -1 if launch attempts hit EAGAIN, 0 otherwise */
+
+int
+storeBfKickQueue (bfstate_t *bfstate)
+{
+    storeIOState *sio = bfstate->ops.sio;
+    bfop_t *op;
+    int x;
+
+    /*
+     * Remove completed and aborted ops off the head of the queue.
+     */
+    storeBfSweep (bfstate);
+
+    /*
+     * If close is requested and the q is empty, launch the close.
+     */
+    if (bfstate->close_request && bfstate->ops.next == &bfstate->ops) {
+        storeBfTryLaunchClose (sio);
+        return 0;
+    }
+
+    /*
+     * If no fd, we cannot start read/write yet; we're waiting for
+     * OPEN/CREAT to finish.  storeBfOpCompletion will call
+     * storeBfKickQueue again, and then...
+     */
+    if (bfstate->fd < 0)
+        return 0;
+
+    /*
+     * If we are shutting down after an error, don't start new
+     * operations.
+     */
+    if (bfstate->aborting)
+        /*
+         * Now we either have flying ops in the q, or the close is
+         * launched.  So OpCompletion is guaranteed; just return and
+         * wait for it.
+         */
+        return 0;
+
+    /*
+     * At last, our main duty.  Launch everything launchable.
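+     * Multiple reads or writes may end up in flight at once; that is
+     * fine, because storeBfUpdateOffset advances sio->offset only
+     * over the contiguous finished prefix of the queue.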
+     */
+    for (op = bfstate->ops.next; op != &bfstate->ops; op = op->next) {
+        if (op->started)
+            continue;
+
+        op->aio.aio_fildes = bfstate->fd;
+
+        switch (op->type) {
+        case BFREAD:
+            x = storeBfLaunch (NULL, aio_read, op);
+            break;
+        case BFWRITE:
+            x = storeBfLaunch (NULL, aio_write, op);
+            break;
+        default:
+            abort ();
+        }
+
+        if (x < 0) {
+            if (errno == EAGAIN) {
+                /* Looks like too many ops are launched already.  Wait
+                   for some to land before continuing.  The nasty
+                   thing is that if no ops are currently in flight,
+                   nobody will call us again (except the user -- but
+                   that's unreliable).  Therefore, ask for help from
+                   the upper layers (which has its price, follow me...) */
+                storeBfOrderKick (bfstate);
+                return -1;
+            }
+            else {
+                /* Real error.  This is the end of the sio.  Raise the
+                   abort flag and recurse -- as the op failed here,
+                   not in OpCompletion, we must make the funeral
+                   arrangements ourselves */
+                storeBfIOAbort (op->sio, op->type, errno);
+                storeBfKickQueue (bfstate);
+                return 0;
+            }
+        }
+
+#if 0
+        /* tentative: "don't overdo the asynchronous operations", as
+           Henrik says.  Write is already asynchronous by nature, so
+           if it does not return fast then launching more writes will
+           only clutter the queue without any speed gain */
+        if (op->type == BFWRITE)
+            break;
+#endif
+    }
+
+    return 0;
+}
+
+
+/* Handle an async I/O operation completion event.  'op' is the
+   descriptor of the completed operation. */
+
+void
+storeBfOpCompletion (bfop_t *op)
+{
+    storeIOState *sio = op->sio;
+    bfstate_t *bfstate = sio? sio->fsstate: NULL;
+    struct bf_stats_counters *c;
+    int x = aio_error (&op->aio);
+    int dura = tvMsec (current_time) - op->starttime;
+    int rlen;
+
+    op->finished = 1;
+
+    debug(81, 2) ("storeBfOpCompletion: %d ms, x=%d %s",
+        dura, x, storeBfOpDump(op));
+
+    /* assert (! sio || cbdataValid(sio)), because we never call
+       storeBfIOCallback before the file is really closed. */
+
+    if (op->type == BFNONE || op->type >= BFMAX) {
+        debug(81, 1) ("storeBfOpCompletion: bad op type (%d)\n", op->type);
+        /* this "cannot happen", so the error handling is sloppy */
+        return;
+    }
+
+    /*
+     * Update statistics counters
+     */
+    c = &bf_stats.counters[op->type];
+    (x == 0) ? c->success++ : c->fail++;
+    if (c->max_dura < dura)
+        c->max_dura = dura;
+    c->avg_dura = (c->avg_dura * (c->ops-1) + dura) / c->ops;
+    c->avg_dura100 = (c->avg_dura100 * 99 + dura) / 100;
+
+    /*
+     * Do operation-specific processing
+     */
+    switch (op->type) {
+    case BFCREAT:
+        if (x == EEXIST) {
+            /* id collision (not too likely) or REFRESH_MISS.  Handle
+               by removing the old file and retrying */
+            debug(81, 2) ("storeBfCompletion: BFCREAT: dir%d, probably id collision (x=%d)!\n",
+                sio->swap_dirn, x);
+            if (bfstate->unlink_attempted)
+                debug(81, 1) ("storeBfCompletion: BFCREAT: dir%d, lingering unlink %s\n",
+                    sio->swap_dirn, storeKeyText(sio->e->hash.key));
+            else {
+                int x = storeBfUnlink2 (INDEXSD (sio->swap_dirn), sio->e, sio);
+                if (x >= 0) {
+                    bfstate->unlink_attempted++;
+                    return;
+                }
+            }
+            /* unlink failed; continue processing the error (this will
+               result in SWAPOUT_FAIL) */
+        }
+      create_fail:
+        /* fall-through */
+
+    case BFOPEN:
+        if (x != 0)
+            storeBfIOAbort (sio, op->type, x);
+        else {
+            bfstate->fd = aio_return (&op->aio);
+            sio->e->swap_file_sz = bfstate->ra.size;
+            store_open_disk_fd++;
+            debug(81, 4) ("storeBfOpCompletion: sio %p fd %d opened\n", sio, bfstate->fd);
+        }
+        break;
+
+    case BFREAD:
+        if (x != 0)
+            storeBfIOAbort (sio, op->type, x);
+        rlen = (x == 0)? aio_return (&op->aio): -1;
+        /* aio_return() may be consulted only once per op; reuse rlen */
+        op->rlen = (x == 0)? rlen: 0;
+        storeBfUpdateOffset (bfstate);
+        if (! bfstate->aborting && cbdataValid(op->callback_data))
+            op->callback.read (op->callback_data, op->aio.aio_buf, rlen);
+        cbdataUnlock (op->callback_data);
+        break;
+
+    case BFWRITE:
+        if (x != 0)
+            storeBfIOAbort (sio, op->type, x);
+        op->rlen = (x == 0)? aio_return (&op->aio): 0;
+        storeBfUpdateOffset (bfstate);
+        if (op->callback.write)
+            op->callback.write (op->aio.aio_buf);       /* free_func */
+        break;
+
+    case BFCLOSE:
+        debug(81, 4) ("storeBfOpCompletion: fd %d closed (x=%d)\n", bfstate->fd, x);
+        if (x != 0)
+            storeBfCloseLastResort (sio);
+        else {
+            bfstate->fd = -1;
+            store_open_disk_fd--;
+        }
+        storeBfIOCallback (sio, bfstate->errflag);
+        /* That's the showdown.  sio is freed by now */
+        return;
+
+    case BFUNLINK:
+        xfree (op->aio.aio_buf);        /* ra */
+        memPoolFree(bf_op_pool, op);
+        if (sio) {
+            /* This unlink was launched to handle an EEXIST error.
+               Now retry the CREATE */
+            op = &bfstate->ops;
+            assert (op->type == BFCREAT);
+            if (x) {
+                x = EEXIST;
+                goto create_fail;
+            }
+            op->started = op->finished = 0;
+            if (storeBfLaunch (NULL, aio_ioctl, op) != 0) {
+                x = EEXIST;
+                goto create_fail;
+            }
+        }
+        return;
+
+    case BFSETATTR:
+        xfree (op->aio.aio_buf);        /* ra */
+        memPoolFree(bf_op_pool, op);
+        return;
+
+    default:
+        abort ();               /* cannot happen. */
+    }
+
+    /*
+     * Ok, op done.  Go ask for more...
+     */
+    storeBfKickQueue (bfstate);
+
+    /*
+     * Don't use op below this point.  storeBfKickQueue calls
+     * storeBfSweep, which potentially destroys the bfop it points to.
+     */
+
+    return;
+}
+
+
+
+/* === STATIC =========================================================== */
+
+
+
+/* Launch the async file close operation.  We are one step away from
+   storeBfIOCallback now.  The queue must be empty before this is
+   called.  Note that the call to this possibly results in sio
+   invalidation, so after calling this, return immediately; don't use
+   any pointers like op or bfstate that you might have. */
+
+static void
+storeBfTryLaunchClose (storeIOState *sio)
+{
+    bfstate_t *bfstate = sio->fsstate;
+    bfop_t *op = &bfstate->ops;
+
+    if (bfstate->close_started)
+        return;
+    if (bfstate->ops.next != &bfstate->ops)
+        return;                 /* we'll be called again when the q empties */
+
+    debug(81, 3) ("storeBfTryLaunchClose: %s", storeBfStateDump (bfstate));
+
+    if (bfstate->fd < 0) {
+        /* not opened -- no need to close.  Bye right now */
+        storeBfIOCallback (sio, bfstate->errflag);
+        return;
+    }
+
+#if 1
+    /* Looks like close(2) never takes longer than 10 us (!), so doing
+       it through KAIO is pure overhead */
+    goto synchronous_close;
+#endif
+
+    op->type = BFCLOSE;
+    op->sio = sio;
+    op->aio.aio_fildes = bfstate->fd;
+    op->aio.aio_sigevent.sigev_notify = SIGEV_NONE;
+    op->aio.aio_sigevent.sigev_signo = 0;
+    op->aio.aio_sigevent.sigev_value.sival_ptr = op;
+
+    debug(81, 2) ("storeBfTryLaunchClose: fd=%d %s", bfstate->fd, storeBfOpDump(op));
+    debug(81, 2) ("storeBfTryLaunchClose: callers: %p %p %p %p\n",
+        __builtin_return_address(0),
+        __builtin_return_address(1),
+        __builtin_return_address(2),
+        __builtin_return_address(3));
+
+    bfstate->close_started = 1;
+    if (storeBfLaunch (NULL, aio_close, op) < 0) {
+      synchronous_close:
+        storeBfCloseLastResort (sio);
+        storeBfIOCallback (sio, bfstate->errflag);
+    }
+}
+
+
+/* Called if aio_close failed, to attempt a synchronous file close.
+   This will never be called ;-)  'errflag' is the error returned by
+   aio_close.
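+   (In fact, with the #if 1 above forcing the synchronous_close path,
+   this is currently the normal way files get closed, not just the
+   fallback.)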
+   */
+
+static void
+storeBfCloseLastResort (storeIOState *sio)
+{
+    bfstate_t *bfstate = sio->fsstate;
+
+    /* debug(81, 1) ("Resorting to sync close()...\n"); */
+
+    if (close (bfstate->fd) != 0)
+        debug(50, 1) ("close: %s\n", xstrerror());
+    else {
+        bfstate->fd = -1;
+        store_open_disk_fd--;
+    }
+}
+
+
+/* Call the STIOCB aka "file closed" callback.  This is the final
+   accord in the life of every sio */
+
+static void
+storeBfIOCallback (storeIOState *sio, int errflag)
+{
+    bfstate_t *bfstate = sio->fsstate;
+    int valid = cbdataValid(sio->callback_data);
+
+    debug(81, 3) ("storeBfIOCallback: errflag=%d\n", errflag);
+    assert (bfstate->ops.next == &bfstate->ops);
+    cbdataUnlock(sio->callback_data);
+    if (valid) {
+        errno = errflag;
+        sio->callback(sio->callback_data, errflag? DISK_ERROR: DISK_OK, sio);
+    }
+    cbdataFree(sio);
+}
+
+
+/* Change the sio state to "aborting" -- that is, arrange to shut down
+   all I/O active on the sio, close the file, and eventually call
+   storeBfIOCallback.  This is called if any operation (except close)
+   fails.  It takes no actions, just sets the appropriate flags;
+   storeBfKickQueue will notice and shut down instead of continuing. */
+
+static void
+storeBfIOAbort (storeIOState *sio, int type, int errflag)
+{
+    bfstate_t *bfstate = sio->fsstate;
+
+    debug(50, 3) ("storeBfIOAbort: sio %p, aio_{%d}, errflag=%d\n",
+        sio, type, errflag);
+
+    if (errflag && !bfstate->errflag)
+        bfstate->errflag = errflag;
+    bfstate->close_request = 1;
+    bfstate->aborting = 1;
+
+    /* Don't sweep here -- that would invalidate ops while our callers
+       are often interested in them, despite the aborting.
+       storeBfKickQueue will do it. */
+
+    if (bfstate->ops.next != &bfstate->ops) {
+        /* TODO: aio_cancel all ops currently in flight? */
+    }
+
+    /* If we got EIO, switch the dir offline.  To get it back online,
+       send squid a SIGHUP; this will reload the configuration and
+       revalidate the store dirs. */
+
+    if (bfstate->errflag == EIO) {
+        SwapDir *sd = INDEXSD (sio->swap_dirn);
+        if (! sd->flags.offline) {
+            debug(47, 1) ("storeBfIOAbort: got EIO, taking %s offline\n", sd->path);
+            storeBfDirTakeOffline (sd);
+        }
+    }
+}
+
+
+/* Add op to the tail of the op queue */
+
+static void
+storeBfOpEnqueue (bfstate_t *bfstate, bfop_t *op)
+{
+    if (bfstate->close_started) {
+        /* Too late, we are closing.  Drop the op; linking it in after
+           the free below would be a use-after-free. */
+        if (op->callback_data)
+            cbdataUnlock (op->callback_data);
+        memPoolFree (bf_op_pool, op);
+        return;
+    }
+    op->prev = bfstate->ops.prev;
+    bfstate->ops.prev = op;
+    op->next = &bfstate->ops;
+    op->prev->next = op;
+}
+
+
+/* Remove finished ops from the start of the list.  Finished ops
+   interspersed with unfinished ones are left alone; they will be
+   needed for future storeBfUpdateOffset()s.  If aborting, kill
+   unstarted ops too. */
+
+static void
+storeBfSweep (bfstate_t *bfstate)
+{
+    bfop_t *op;
+    bfop_t *next;
+
+    for (op = bfstate->ops.next; op != &bfstate->ops; op = next) {
+        next = op->next;
+        if (op->finished || (bfstate->aborting && ! op->started)) {
+            op->prev->next = op->next;
+            op->next->prev = op->prev;
+            if (! op->finished && op->callback_data)
+                cbdataUnlock (op->callback_data);
+            memPoolFree (bf_op_pool, op);
+            continue;
+        }
+        break;
+    }
+}
+
+
+/* Calculate the file offset for storeOffset by examining 'finished'
+   ops in the queue.  We rely on the fact that the queue is ordered by
+   offset, and that there is at least one element in the queue for
+   sure (the one that storeBfOpCompletion processes when it calls us).
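+   For example: with finished ops covering bytes 0..16383 and
+   16384..32767 and an unfinished one at 32768, sio->offset advances
+   to 32768 and the scan stops at the unfinished op.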
*/ + +static void +storeBfUpdateOffset (bfstate_t *bfstate) +{ + storeIOState *sio = bfstate->ops.sio; + bfop_t *op; + + for (op = bfstate->ops.next; op != &bfstate->ops; op = op->next) { + if (op->finished) { + off_t offset = op->aio.aio_offset + op->rlen; + if (sio->offset < offset) + sio->offset = offset; + continue; + } + break; + } + + debug(81, 5) ("storeBfUpdateOffset: sio %p, offset %lu\n", + sio, sio->offset); +} + + +/* Arrange for our queue being kicked externally at a later time. + Called when we lose initiative because of EAGAIN. */ + +static void +storeBfOrderKick (bfstate_t *bfstate) +{ + storeIOState *sio = bfstate->ops.sio; + bfinfo_t *bfi = INDEXSD(sio->swap_dirn)->fsdata; + bfop_t *op; + + /* Avoid the external help if possible. Check if there are ops in + flight; if yes, their completion will do the kick and no + external help is needed. */ + + for (op = bfstate->ops.next; op != &bfstate->ops; op = op->next) { + if (op->started && ! op->finished) + /* found! */ + return; + } + + /* No way. Go begging. We have to use usual callback calling + conventions, because user can kick us before the ordered kick + is activated, and when it at last activated it might find sio + freed. */ + + cbdataLock (sio); + linklistPush(&(bfi->kicklater), sio); +} + +/* Debugging aid */ + +static char * +storeBfOpDump (bfop_t *op) +{ + static char buf[1000]; + static char *optypestr[] = { + "BFNONE", "BFOPEN", "BFCREAT", "BFREAD", "BFWRITE", + "BFCLOSE", "BFUNLINK", "BFSETATTR", + }; + char *typestr = op->type >= BFMAX? "bad!": optypestr[op->type]; + int n = 0; + + n += snprintf (buf+n, sizeof (buf)-n, + "op %p: %9s sio %p %s %s", + op, typestr, op->sio, + op->started? "started": "", + op->finished? "finished": ""); + if (op->type == BFREAD || op->type == BFWRITE) + n += snprintf (buf+n, sizeof (buf)-n, + " %u@%lu", + op->aio.aio_nbytes, + op->aio.aio_offset); + if (op->type == BFWRITE) + n += snprintf (buf+n, sizeof (buf)-n, + " %02x %02x %02x %02x %02x", + ((unsigned char *)op->aio.aio_buf)[0], + ((unsigned char *)op->aio.aio_buf)[1], + ((unsigned char *)op->aio.aio_buf)[2], + ((unsigned char *)op->aio.aio_buf)[3], + ((unsigned char *)op->aio.aio_buf)[4]); + n += snprintf (buf+n, sizeof (buf)-n, "\n"); + return buf; +} + + +static char * +storeBfStateDump (bfstate_t *bfstate) +{ + bfop_t *op; + static char buf[10000]; + char *p = buf; + + p += snprintf (p, &buf[10000] - p, "bfstate %p\n", bfstate); + p += snprintf (p, &buf[10000] - p, " fd %d errflag %d %s %s %s\n", + bfstate->fd, + bfstate->errflag, + (bfstate->close_request? "CR":""), + (bfstate->close_started? "CS":""), + (bfstate->aborting? "ABORTING":"")); + op = &bfstate->ops; + do { + p += snprintf (p, &buf[10000] - p, storeBfOpDump (op)); + op = op->next; + } while (op != &bfstate->ops); + + return buf; +} + +static void +storeBfIOFreeEntry(void *_sio) +{ + storeIOState * sio = _sio; + bfstate_t *bfstate = sio->fsstate; + + /* queue must be freed before cbdataFree()ing the sio */ + assert (bfstate->ops.next == &bfstate->ops); + + memPoolFree(bf_state_pool, sio->fsstate); +}