This patch is generated from the reiserfs_raw branch of HEAD in squid
Wed Sep 29 01:34:07 2004 GMT
See http://devel.squid-cache.org/

Index: squid/acconfig.h
diff -u squid/acconfig.h:1.4 squid/acconfig.h:1.4.6.1
--- squid/acconfig.h:1.4	Sat Dec 16 20:57:39 2000
+++ squid/acconfig.h	Sun Dec 17 05:55:39 2000
@@ -48,6 +48,9 @@
 /* Define to use async disk I/O operations */
 #undef USE_ASYNC_IO
 
+/* Define to include support for "tree" (and "asynctree") cache_dir types */
+#undef USE_STORETREE
+
 /* Defines how many threads to use for async I/O */
 #undef ASYNC_IO_THREADS
 
Index: squid/configure.in
diff -u squid/configure.in:1.9 squid/configure.in:1.7.6.2
--- squid/configure.in:1.9	Thu Jan  4 14:44:02 2001
+++ squid/configure.in	Fri Jan  5 12:49:02 2001
@@ -324,6 +324,13 @@
 AC_SUBST(STORE_OBJS)
 STORE_LIBS="`echo $STORE_OBJS|sed -e's%fs/%%g'`"
 AC_SUBST(STORE_LIBS)
+for module in $STORE_MODULES; do
+    case $module in
+    butterfly)
+      AC_DEFINE(USE_STORETREE, 1)
+      ;;
+    esac
+done
 
 dnl --enable-heap-replacement compability option
 AC_ARG_ENABLE(heap-replacement,
@@ -865,6 +872,7 @@
 	time.h \
 	unistd.h \
 	varargs.h \
+	utime.h \
 )
 
 AC_C_CONST
Index: squid/doc/debug-sections.txt
diff -u squid/doc/debug-sections.txt:1.2 squid/doc/debug-sections.txt:1.2.22.1
--- squid/doc/debug-sections.txt:1.2	Sat Oct 21 08:16:08 2000
+++ squid/doc/debug-sections.txt	Sun Dec 17 05:55:39 2000
@@ -86,3 +86,6 @@
 section 79    HTTP Meter Header
 section 80    WCCP
 section 81    Store Removal/Replacement policy
+section 86    Store Directory, Balanced tree (Reiserfs) version
+section 88    Storage Manager TREE Interface I/O Routines, async
+section 89    Storage Manager TREE Interface I/O Routines
Index: squid/include/util.h
diff -u squid/include/util.h:1.4 squid/include/util.h:1.4.16.1
--- squid/include/util.h:1.4	Sat Nov  4 15:23:06 2000
+++ squid/include/util.h	Sun Dec 17 05:55:39 2000
@@ -65,6 +65,7 @@
 extern char *xstrndup(const char *, size_t);
 extern const char *xstrerror(void);
 extern const char *xbstrerror(int);
+extern int tvMsec(struct timeval);
 extern int tvSubMsec(struct timeval, struct timeval);
 extern int tvSubUsec(struct timeval, struct timeval);
 extern double tvSubDsec(struct timeval, struct timeval);
Index: squid/lib/util.c
diff -u squid/lib/util.c:1.4 squid/lib/util.c:1.4.10.1
--- squid/lib/util.c:1.4	Tue Nov 28 03:35:34 2000
+++ squid/lib/util.c	Sun Dec 17 05:55:39 2000
@@ -662,6 +662,12 @@
 }
 
 int
+tvMsec(struct timeval t)
+{
+    return t.tv_sec * 1000 + t.tv_usec / 1000;
+}
+
+int
 tvSubMsec(struct timeval t1, struct timeval t2)
 {
     return (t2.tv_sec - t1.tv_sec) * 1000 +
Index: squid/src/defines.h
diff -u squid/src/defines.h:1.5 squid/src/defines.h:1.3.16.1.2.2
--- squid/src/defines.h:1.5	Fri Jan  5 12:46:59 2001
+++ squid/src/defines.h	Fri Jan  5 12:49:03 2001
@@ -146,7 +146,7 @@
 #define LOG_ENABLE  1
 #define LOG_DISABLE 0
 
-#define SM_PAGE_SIZE 4096
+#define SM_PAGE_SIZE (16*1024)	/* Was 4096 --sizif */
 
 #define EBIT_SET(flag, bit) 	((void)((flag) |= ((1L<<(bit)))))
 #define EBIT_CLR(flag, bit) 	((void)((flag) &= ~((1L<<(bit)))))
@@ -270,7 +270,7 @@
  */
 #define PEER_TCP_MAGIC_COUNT 10
 
-#define CLIENT_SOCK_SZ 4096
+#define CLIENT_SOCK_SZ (16*1024)	/* Was: 4096  --sizif */
 
 #define URI_WHITESPACE_STRIP 0
 #define URI_WHITESPACE_ALLOW 1
Index: squid/src/enums.h
diff -u squid/src/enums.h:1.7 squid/src/enums.h:1.6.4.1.2.1
--- squid/src/enums.h:1.7	Fri Jan  5 12:46:59 2001
+++ squid/src/enums.h	Sun Dec 17 05:55:39 2000
@@ -483,7 +483,8 @@
     ENTRY_VALIDATED,
     ENTRY_BAD_LENGTH,
     ENTRY_ABORTED,
-    ENTRY_DONT_LOG		/* hack for gross 'Pump' entries */
+    ENTRY_DONT_LOG,		/* hack for gross 'Pump' entries */
+    ENTRY_TENTATIVE,		/* storetree: file might not exist */
 };
 
 typedef enum {
Index: squid/src/globals.h
diff -u squid/src/globals.h:1.5 squid/src/globals.h:1.5.8.1
--- squid/src/globals.h:1.5	Tue Dec 12 15:21:19 2000
+++ squid/src/globals.h	Sun Dec 17 05:55:39 2000
@@ -145,6 +145,9 @@
 extern request_flags null_request_flags;
 extern int store_open_disk_fd;	/* 0 */
 extern const char *SwapDirType[];
+#ifdef USE_STORETREE
+extern int n_storetree_dirs;
+#endif /* USE_STORETREE */
 extern storefs_entry_t *storefs_list;	/* NULL */
 extern storerepl_entry_t *storerepl_list;	/* NULL */
 extern int store_swap_low;	/* 0 */
Index: squid/src/protos.h
diff -u squid/src/protos.h:1.12 squid/src/protos.h:1.6.4.1.2.3
--- squid/src/protos.h:1.12	Fri Jan  5 12:46:59 2001
+++ squid/src/protos.h	Fri Jan  5 13:10:43 2001
@@ -817,6 +817,7 @@
  * store.c
  */
 extern StoreEntry *new_StoreEntry(int, const char *, const char *);
+extern FREE destroy_StoreEntry;
 extern StoreEntry *storeGet(const cache_key *);
 extern StoreEntry *storeGetPublic(const char *uri, const method_t method);
 extern StoreEntry *storeCreateEntry(const char *, const char *, request_flags, method_t);
@@ -927,7 +928,6 @@
 extern OBJH storeDirStats;
 extern char *storeDirSwapLogFile(int, const char *);
 extern char *storeSwapDir(int);
-extern char *storeSwapFullPath(int, char *);
 extern char *storeSwapSubSubDir(int, char *);
 extern const char *storeSwapPath(int);
 extern int storeDirWriteCleanLogs(int reopen);
@@ -949,8 +949,15 @@
 int storeDirGetBlkSize(const char *path, int *blksize);
 
 /*
- * store_swapmeta.c
+ * store_dir_drr.c
  */
+#if USE_STORETREE
+extern StoreEntry *storeTreeGet(const char *url, const cache_key *key);
+extern void storeTreeRestoreMetadata(StoreEntry *e, tlv *t);
+#endif
+
+/*
+ * store_swapmeta.c */
 extern char *storeSwapMetaPack(tlv * tlv_list, int *length);
 extern tlv *storeSwapMetaBuild(StoreEntry * e);
 extern tlv *storeSwapMetaUnpack(const char *buf, int *hdrlen);
Index: squid/src/squid.h
diff -u squid/src/squid.h:1.5 squid/src/squid.h:1.3.16.1.2.2
--- squid/src/squid.h:1.5	Fri Jan  5 12:46:59 2001
+++ squid/src/squid.h	Fri Jan  5 12:49:03 2001
@@ -137,6 +137,9 @@
 #if HAVE_TIME_H
 #include <time.h>
 #endif
+#if HAVE_UTIME_H
+#include <utime.h>
+#endif
 #if HAVE_SYS_PARAM_H
 #include <sys/param.h>
 #endif
Index: squid/src/stat.c
diff -u squid/src/stat.c:1.5 squid/src/stat.c:1.4.14.1.2.1
--- squid/src/stat.c:1.5	Fri Jan  5 12:46:59 2001
+++ squid/src/stat.c	Sun Dec 17 05:55:39 2000
@@ -236,6 +236,8 @@
 	strcat(buf, "BAD_LENGTH,");
     if (EBIT_TEST(flags, ENTRY_ABORTED))
 	strcat(buf, "ABORTED,");
+    if (EBIT_TEST(flags, ENTRY_TENTATIVE))
+	strcat(buf, "TENTATIVE,");
     if ((t = strrchr(buf, ',')))
 	*t = '\0';
     return buf;
@@ -812,6 +814,9 @@
     storeAppendPrintf(sentry, "syscalls.disk.writes = %f/sec\n", XAVG(syscalls.disk.writes));
     storeAppendPrintf(sentry, "syscalls.disk.seeks = %f/sec\n", XAVG(syscalls.disk.seeks));
     storeAppendPrintf(sentry, "syscalls.disk.unlinks = %f/sec\n", XAVG(syscalls.disk.unlinks));
+    storeAppendPrintf(sentry, "syscalls.disk.truncates = %f/sec\n", XAVG(syscalls.disk.truncates));
+    storeAppendPrintf(sentry, "syscalls.disk.utimes = %f/sec\n", XAVG(syscalls.disk.utimes));
+    storeAppendPrintf(sentry, "syscalls.disk.stats = %f/sec\n", XAVG(syscalls.disk.stats));
     storeAppendPrintf(sentry, "syscalls.sock.accepts = %f/sec\n", XAVG(syscalls.sock.accepts));
     storeAppendPrintf(sentry, "syscalls.sock.sockets = %f/sec\n", XAVG(syscalls.sock.sockets));
     storeAppendPrintf(sentry, "syscalls.sock.connects = %f/sec\n", XAVG(syscalls.sock.connects));
Index: squid/src/store.c
diff -u squid/src/store.c:1.8 squid/src/store.c:1.6.4.1.2.2
--- squid/src/store.c:1.8	Wed Jan  3 23:23:59 2001
+++ squid/src/store.c	Fri Jan  5 12:49:03 2001
@@ -65,6 +65,8 @@
     "SWAPOUT_DONE"
 };
 
+int n_storetree_dirs = 0;
+
 typedef struct lock_ctrl_t {
     SIH *callback;
     void *callback_data;
@@ -81,7 +83,6 @@
 static void storeHashDelete(StoreEntry *);
 static MemObject *new_MemObject(const char *, const char *);
 static void destroy_MemObject(StoreEntry *);
-static FREE destroy_StoreEntry;
 static void storePurgeMem(StoreEntry *);
 static void storeEntryReferenced(StoreEntry *);
 static void storeEntryDereferenced(StoreEntry *);
@@ -169,7 +170,7 @@
     memFree(mem, MEM_MEMOBJECT);
 }
 
-static void
+void
 destroy_StoreEntry(void *data)
 {
     StoreEntry *e = data;
@@ -215,6 +216,18 @@
 	storeKeyText(e->hash.key));
     storeSetMemStatus(e, NOT_IN_MEMORY);
     destroy_MemObject(e);
+#if USE_STORETREE
+    /* For STORETREE type dirs, we free StoreEntry as well.  STORETREE
+       is able to find object on disk based on the URL, so we can
+       reconstruct StoreEntry from disk object when (and whether)
+       request for it comes */
+    if (e->swap_status == SWAPOUT_DONE) {
+	SwapDir *sd = INDEXSD(e->swap_dirn);
+	if (sd->halfrelease)
+	    sd->halfrelease (e);
+	return;
+    }
+#endif /* USE_STORETREE */
     if (e->swap_status != SWAPOUT_DONE)
 	storeRelease(e);
 }
@@ -293,6 +306,13 @@
 	return (int) e->lock_count;
     if (e->store_status == STORE_PENDING)
 	EBIT_SET(e->flags, RELEASE_REQUEST);
+#define DIRTYHACK
+#ifdef DIRTYHACK
+    if (storePendingNClients(e) != 0) {
+        debug(20, 3) ("storeUnlockObject: storePendingNClients != 0!!  Leak!!\n");
+	return (int) ++e->lock_count;
+    }
+#endif
     assert(storePendingNClients(e) == 0);
     if (EBIT_TEST(e->flags, RELEASE_REQUEST))
 	storeRelease(e);
@@ -322,7 +342,18 @@
 StoreEntry *
 storeGetPublic(const char *uri, const method_t method)
 {
-    return storeGet(storeKeyPublic(uri, method));
+    const cache_key *key = storeKeyPublic(uri, method);
+    StoreEntry *e;
+
+    if ((e = storeGet(key))) {
+	return e;
+    }
+#if USE_STORETREE
+    if (n_storetree_dirs && (e = storeTreeGet (uri, key))) {
+	return e;
+    }
+#endif
+    return NULL;
 }
 
 static int
@@ -803,6 +834,15 @@
 	}
     }
     storeLog(STORE_LOG_RELEASE, e);
+#ifdef USE_STORETREE
+    if (EBIT_TEST (e->flags, ENTRY_TENTATIVE)) {
+	/* Created in storeTreeGet but never tried to swap in for some
+	   reason (no-cache request is one case where this happens).
+	   We don't know whether there is a file on disk, much likely
+	   there isn't.  Don't waste efforts unlinking. */
+        e->swap_filen = -1;
+    }
+#endif
     if (e->swap_filen > -1) {
 	storeUnlink(e);
 	if (e->swap_status == SWAPOUT_DONE)
@@ -909,9 +949,22 @@
 {
     int i;
     /* Calculate size of hash table (maximum currently 64k buckets).  */
-    i = Config.Swap.maxSize / Config.Store.avgObjectSize;
-    debug(20, 1) ("Swap maxSize %d KB, estimated %d objects\n",
-	Config.Swap.maxSize, i);
+#ifdef USE_STORETREE
+    if (n_storetree_dirs == Config.cacheSwap.n_configured) {
+	/* With storetree, on-disk objects are not in hash table, so
+	   hash table should be only large enough to contain in-memory
+	   objects. */
+	i = Config.memMaxSize / Config.Store.avgObjectSize;
+	debug(20, 1) ("cache_mem %d KB, estimated %d hash table objects\n",
+		      Config.memMaxSize, i);
+    }
+    else
+#endif
+    {
+	i = Config.Swap.maxSize / Config.Store.avgObjectSize;
+	debug(20, 1) ("Swap maxSize %d KB, estimated %d objects\n",
+		      Config.Swap.maxSize, i);
+    }
     i /= Config.Store.objectsPerBucket;
     debug(20, 1) ("Target number of buckets: %d\n", i);
     /* ideally the full scan period should be configurable, for the
@@ -1312,6 +1365,16 @@
 	e->swap_file_number = -1;
     } else {
 	assert(-1 == e->swap_file_number);
+#if USE_STORETREE
+	/* If filn chosen by storeDirSelectSwapDir belongs to a
+	   STORETREE directory, redirect the object to proper
+	   STORETREE directory, chosen based on URL */
+	if (storeTreeDirIs (&Config.cacheSwap.swapDirs[filn >> SWAP_DIR_SHIFT])) {
+	    int dirn = storeTreeSelectSwapDir(e->key);
+	    e->swap_file_number = dirn << SWAP_DIR_SHIFT;
+	    return;
+	}
+#endif /* USE_STORETREE */
 	storeDirMapBitSet(e->swap_file_number = filn);
 	storeDirLRUAdd(e);
     }
Index: squid/src/store_client.c
diff -u squid/src/store_client.c:1.5 squid/src/store_client.c:1.4.14.1.2.2
--- squid/src/store_client.c:1.5	Fri Jan  5 12:46:59 2001
+++ squid/src/store_client.c	Fri Dec 22 04:40:49 2000
@@ -1,4 +1,3 @@
-
 /*
  * $Id$
  *
@@ -157,7 +156,16 @@
 {
     STCB *callback = sc->callback;
     char *buf = sc->copy_buf;
+#if BUG_20000721a
+    /* sc->callback may get reset by storeSwapInFileClosed.
+       Don't fail in this case */
     assert(sc->callback);
+#else
+    if (! callback) {
+        debug(20, 2) ("storeClientCallback: sc->callback == 0\n");
+	return;
+    }
+#endif
     sc->callback = NULL;
     sc->copy_buf = NULL;
     if (cbdataValid(sc->callback_data))
@@ -380,6 +388,7 @@
     tlv *tlv_list;
     tlv *t;
     int swap_object_ok = 1;
+    assert(sc->entry->hash.key);	/* --sizif */
     assert(sc->flags.disk_io_pending);
     sc->flags.disk_io_pending = 0;
     assert(sc->callback != NULL);
@@ -432,6 +441,14 @@
 	    }
 	    break;
 	case STORE_META_STD:
+#if USE_STORETREE
+	    /* If we have just recreated StoreEntry, restore metadata
+               like when rebuilding store from cache dir */
+	    if (EBIT_TEST (e->flags, ENTRY_TENTATIVE)) {
+		storeTreeRestoreMetadata(e, t);
+		EBIT_CLR (e->flags, ENTRY_TENTATIVE);
+	    }
+#endif
 	    break;
 	default:
 	    debug(20, 1) ("WARNING: got unused STORE_META type %d\n", t->type);
Index: squid/src/store_dir.c
diff -u squid/src/store_dir.c:1.8 squid/src/store_dir.c:1.4.18.2
--- squid/src/store_dir.c:1.8	Thu Jan  4 22:38:08 2001
+++ squid/src/store_dir.c	Fri Jan  5 12:49:03 2001
@@ -132,7 +132,7 @@
  * XXX This function does NOT account for the read_only flag!
  */
 static int
-storeDirSelectSwapDirRoundRobin(const StoreEntry * unused)
+storeDirSelectSwapDirRoundRobin(const StoreEntry * e)
 {
     static int dirn = 0;
     int i;
@@ -148,8 +148,17 @@
 	sd = &Config.cacheSwap.swapDirs[dirn];
 	if (sd->cur_size > sd->max_size)
 	    continue;
-	return dirn;
+	break;
     }
+#ifdef USE_STORETREE
+    /* Give storedir a chance to redirect this object to another dir.
+       STORETREE dirs use that as all storetree dirs you configure
+       act together like hash buckets.  They use a hash based on URL
+       to select which dir the object should go to -- this allows to
+       have only one open() in storeTreeGet(). */
+    if (sd->reselectdir)
+	dirn = sd->reselectdir(e)->index;
+#endif
     return dirn;
 }
 
Index: squid/src/store_swapin.c
diff -u squid/src/store_swapin.c:1.4 squid/src/store_swapin.c:1.4.18.1
--- squid/src/store_swapin.c:1.4	Fri Nov  3 00:39:20 2000
+++ squid/src/store_swapin.c	Sun Dec 17 05:55:39 2000
@@ -63,7 +63,21 @@
 	e->swap_filen);
     sc->swapin_sio = storeOpen(e, storeSwapInFileNotify, storeSwapInFileClosed,
 	sc);
+#if BUG_20000721
+    /* Locking here is too late.  At least with diskd and multiple
+       cache_dirs, storeSwapInFileClosed can be called before we get
+       here, and cbdataUnlock gets called before cbdataLock, hence an
+       assertion failure.   --sizif */
     cbdataLock(sc->swapin_sio);
+#else
+    if (sc->flags.swapin_dead) {
+        sc->flags.swapin_dead = 0;
+        sc->swapin_sio = NULL;
+    }
+    else {
+        cbdataLock(sc->swapin_sio);
+    }
+#endif
 }
 
 static void
@@ -73,8 +87,21 @@
     STCB *callback;
     debug(20, 3) ("storeSwapInFileClosed: sio=%p, errflag=%d\n",
 	sio, errflag);
+#if BUG_20000721
     cbdataUnlock(sio);
     sc->swapin_sio = NULL;
+#else
+    if (sc->swapin_sio) {
+	cbdataUnlock(sio);
+	sc->swapin_sio = NULL;
+    }
+    else {
+        /* tough luck.  We've been called before storeOpen finished
+           and sc->swapin_sio is assigned and locked.  sio is
+           cbdataReallyFreed already. */
+        sc->flags.swapin_dead = 1;
+    }
+#endif
     if ((callback = sc->callback)) {
 	assert(errflag <= 0);
 	sc->callback = NULL;
Index: squid/src/store_swapout.c
diff -u squid/src/store_swapout.c:1.5 squid/src/store_swapout.c:1.4.14.1.2.1
--- squid/src/store_swapout.c:1.5	Fri Jan  5 12:46:59 2001
+++ squid/src/store_swapout.c	Sun Dec 17 05:55:39 2000
@@ -234,8 +234,9 @@
 	assert(swap_buf_len > 0);
 	debug(20, 3) ("storeSwapOut: swapping out %d bytes from %d\n",
 	    swap_buf_len, (int) mem->swapout.queue_offset);
+	storeWrite(mem->swapout.sio, mem->swapout.memnode->data, swap_buf_len,
+		   mem->swap_hdr_sz + mem->swapout.queue_offset, NULL);
 	mem->swapout.queue_offset += swap_buf_len;
-	storeWrite(mem->swapout.sio, mem->swapout.memnode->data, swap_buf_len, -1, NULL);
 	/* the storeWrite() call might generate an error */
 	if (e->swap_status != SWAPOUT_WRITING)
 	    break;
Index: squid/src/structs.h
diff -u squid/src/structs.h:1.15 squid/src/structs.h:1.8.8.3
--- squid/src/structs.h:1.15	Thu Jan  4 22:38:08 2001
+++ squid/src/structs.h	Fri Jan  5 12:49:03 2001
@@ -1247,6 +1247,7 @@
 	unsigned int disk_io_pending:1;
 	unsigned int store_copying:1;
 	unsigned int copy_event_pending:1;
+	unsigned int swapin_dead:1;
     } flags;
 #if DELAY_POOLS
     delay_id delay_id;
@@ -1340,11 +1341,14 @@
     ping_status_t ping_status:3;
     store_status_t store_status:3;
     swap_status_t swap_status:3;
+    unsigned id_valid:1;
+    unsigned long long id;
 };
 
 struct _SwapDir {
     char *type;
     int cur_size;
+    int high_size;
     int low_size;
     int max_size;
     char *path;
@@ -1357,6 +1361,7 @@
     struct {
 	unsigned int selected:1;
 	unsigned int read_only:1;
+	unsigned int offline:1;
     } flags;
     STINIT *init;		/* Initialise the fs */
     STNEWFS *newfs;		/* Create a new fs */
@@ -1371,6 +1376,9 @@
     STUNREFOBJ *unrefobj;	/* Unreference this object */
     STCALLBACK *callback;	/* Handle pending callbacks */
     STSYNC *sync;		/* Sync the directory */
+    /* The following are for storetree */
+    STHALFRELEASE *halfrelease;	/* Release StoreEntry but keep disk object */
+    STRESELECTDIR *reselectdir; /* Swap dir redirection function */
     struct {
 	STOBJCREATE *create;
 	STOBJOPEN *open;
@@ -1639,6 +1647,9 @@
 	    int writes;
 	    int seeks;
 	    int unlinks;
+	    int truncates;
+	    int utimes;
+	    int stats;
 	} disk;
 	struct {
 	    int accepts;
Index: squid/src/typedefs.h
diff -u squid/src/typedefs.h:1.6 squid/src/typedefs.h:1.4.10.1.2.2
--- squid/src/typedefs.h:1.6	Fri Jan  5 12:46:59 2001
+++ squid/src/typedefs.h	Fri Jan  5 12:49:03 2001
@@ -248,6 +248,8 @@
 typedef void STDONE(void);
 typedef int STCALLBACK(SwapDir *);
 typedef void STSYNC(SwapDir *);
+typedef void STHALFRELEASE(StoreEntry *);
+typedef SwapDir *STRESELECTDIR(const StoreEntry *);
 
 typedef storeIOState *STOBJCREATE(SwapDir *, StoreEntry *, STFNCB *, STIOCB *, void *);
 typedef storeIOState *STOBJOPEN(SwapDir *, StoreEntry *, STFNCB *, STIOCB *, void *);
Index: squid/src/fs/butterfly/Makefile.in
diff -u /dev/null squid/src/fs/butterfly/Makefile.in:1.1.2.1
--- /dev/null	Tue Sep 28 18:39:07 2004
+++ squid/src/fs/butterfly/Makefile.in	Sun Dec 17 05:55:40 2000
@@ -0,0 +1,76 @@
+#
+#  Makefile for the BUTTERFLY storage driver for the Squid Object Cache server
+#
+#  $Id$
+#
+
+FS		= butterfly
+
+prefix		= @prefix@
+exec_prefix	= @exec_prefix@
+exec_suffix	= @exec_suffix@
+cgi_suffix	= @cgi_suffix@
+top_srcdir	= @top_srcdir@
+bindir		= @bindir@
+libexecdir      = @libexecdir@
+sysconfdir	= @sysconfdir@
+localstatedir   = @localstatedir@
+srcdir		= @srcdir@
+VPATH		= @srcdir@
+
+CC		= @CC@
+MAKEDEPEND	= @MAKEDEPEND@
+AR_R		= @AR_R@
+RANLIB		= @RANLIB@
+AC_CFLAGS	= @CFLAGS@
+SHELL		= /bin/sh
+LDFLAGS		= @LDFLAGS@
+INSTALL         = @INSTALL@
+INSTALL_BIN     = @INSTALL_PROGRAM@
+MV		= @MV@
+RM		= @RM@
+
+INCLUDE		= -I../../../include -I$(top_srcdir)/include -I$(top_srcdir)/src/
+# sorry.  Adjust to where your linux+reiserfs_raw+KAIO is.
+INCLUDE		+= -I../../../..//linux-reiserfs-raw/include
+CFLAGS 		= $(AC_CFLAGS) $(INCLUDE) $(DEFINES)
+
+OUT		= ../$(FS).a
+
+OBJS	 	= \
+		store_dir_bf.o \
+		store_io_bf.o \
+		aiolib.o \
+
+all: $(OUT)
+
+$(OUT): $(OBJS)
+	@rm -f ../stamp
+	$(AR_R) $(OUT) $(OBJS)
+	$(RANLIB) $(OUT)
+
+$(OBJS): $(top_srcdir)/include/version.h ../../../include/autoconf.h
+
+.c.o:
+	@rm -f ../stamp
+	$(CC) -DSQUID_PREFIX=\"$(prefix)\" $(CFLAGS) -c $<
+
+clean: 
+	-rm -rf *.o *pure_* core ../$(FS).a
+
+distclean:	clean
+	-rm -f Makefile
+	-rm -f Makefile.bak
+	-rm -f tags
+
+tags:
+	ctags *.[ch] $(top_srcdir)/src/*.[ch] $(top_srcdir)/include/*.h $(top_srcdir)/lib/*.[ch]
+TAGS:
+	etags *.h $(top_srcdir)/src/*.h $(top_srcdir)/include/*.h $(top_srcdir)/lib/*.h \
+	      *.c $(top_srcdir)/src/*.c $(top_srcdir)/lib/*.c
+
+depend:
+	$(MAKEDEPEND) $(INCLUDE) -fMakefile *.c
+
+install:
+	@true
Index: squid/src/fs/butterfly/README.aiolib
diff -u /dev/null squid/src/fs/butterfly/README.aiolib:1.1.2.1
--- /dev/null	Tue Sep 28 18:39:07 2004
+++ squid/src/fs/butterfly/README.aiolib	Sun Dec 17 05:55:40 2000
@@ -0,0 +1,134 @@
+/*
+ *
+ * libdba: Facilities which accelerate database performance.
+ * 	   Included are POSIX Asynchronous I/O,
+ *			Post-wait synchronization,
+ *			and a fast gettimeofday().
+ *
+ * Copyright 1999, Silicon Graphics, Inc.
+ *
+ * Written October 1999 by Rajagopal Ananthanarayanan (ananth)
+ *  			at Silicon Graphics, Inc.
+ */
+
+Asynchronous I/O Facility
+-------------------------
+
+Overview
+--------
+
+The asynchronous I/O (AIO) facility implements interfaces defined by the
+POSIX standard. The AIO facility implemented in the SGI Linux Environment
+1.1 differs from glibc implementation.  The  glibc version realizes
+asynchrony by employing slave threads to process the I/O requests. This
+implies that only as many I/O requests as the number of slaves are truly
+asynchronous at the device, since the slaves use the blocking system calls
+to service the I/O: when a slave thread is processing an I/O request,
+it is blocked in that I/O system call.
+
+The SGI AIO version is mostly implemented in the Linux kernel. To
+distinguish with the glibc version, we will use the term KAIO. In KAIO,
+when possible, I/O requests are implemented as split-phase I/O requests.
+With split-phase I/O, the initiating request (such as an aio_read) truly
+queues the I/O at the device as the first phase of the I/O request; a
+second phase of the I/O request, performed as part of the I/O completion,
+propagates results of the request.  The results may include the contents
+of the I/O buffer on a read, the number of bytes read or written, and
+any error status.  Thus, with split-phase I/O as much asynchrony as the
+device can support is actually achieved. However, not all file systems or
+devices readily support split-phase I/O. Currently, KAIO only supports
+split-phase I/O for file systems that employ generic_file_read() as
+their read routine (several file systems in current Linux kernel use
+generic_file_read, including popular ext2), and for all Character Disk
+Devices (Raw) which is also provided as part of SGI Linux Environment 1.1.
+For requests which are not split-phase, KAIO employs slave threads,
+similar to the glibc implementation.
+
+
+Using KAIO
+----------
+
+KAIO implements POSIX Asynchronous I/O interfaces, although formal
+compliance testing and branding have not been attempted yet.  All interfaces,
+such as aio_read(), aio_write(), aio_suspend(), aio_cancel(), aio_error(),
+aio_return(), and lio_listio() are supported. Further, all interfaces
+can use 64-bit offsets allowing greater than 2GB offsets, as discussed below.
+
+To use KAIO as opposed to glibc AIO, one has to include <linux/aio.h>,
+since glibc already defines a <aio.h>. Further, libdba.so has to be
+linked into the executable; the sources for libdba can be found in
+/usr/src/linux/lib/libdba. Finally, libdba.so needs to be installed
+under /lib on the machine where the executable is to run.  KAIO and
+glibc AIO cannot be intermixed; including <aio.h> and <linux/aio.h>
+would cause compilation errors, since both define, for instance, aiocb,
+the AIO control block.
+
+The kernel support for KAIO is enabled using the CONFIG_AIO option,
+as part of "General Setup". Further, CONFIG_AIO_MAX allows the maximum
+number of outstanding I/Os to be configurable, although the current
+default of 4096 should be sufficient for most environments.
+
+KAIO is designed to work with POSIX Threads (pthreads), or any programs
+that employ clone threads, as long as the clones are created with the
+CLONE_VM option.
+
+Finally, the number of slave threads can be changed using the environment
+variable AIO_NTHREADS. By default 4 threads (AIO_DEFAULT_THREADS in
+<linux/aio.h>) are created. All slaves have the prefix "kaiod-XXXXX" as
+their name in top(1) or pstree(1), where "XXXXX" is the pid of the
+parent which created the slaves; see BUGS about ps(1).
+
+
+Enabling 64-bit offsets
+-----------------------
+
+User code can turn on 64-bit offsets in aiocb by:
+
+      #define _FILE_OFFSET_BITS 64
+      #include <features.h>
+      #include <linux/aio.h>
+which will turn all aio_*() interfaces to operate with 64-bit offsets.
+More precisely, aio_offset will be a 64-bit quantity (loff_t).
+
+Further, if _LARGEFILE64_SOURCE is defined in the above, it will enable
+LFS style explicit aio_*64() and lio_listio64() interfaces; these will
+accept aiocb64_t as their argument. If only _LARGEFILE64_SOURCE is
+defined, then aio_*() and lio_listio() will use 32-bit offsets.
+
+Note that if you include other system header files before including it
+as above, those headers may include <features.h>, which will turn the
+later inclusion of <features.h> into a null inclusion.  This will lead
+to the incorrect AIO interfaces.  To avoid this, one possibility is
+to define either or both of _FILE_OFFSET_BITS and _LARGEFILE64_SOURCE
+before including any system header file.
+
+As an alternate way of turning on 64-bit capability, AIO_FILE_OFFSET64
+enables 64-bit offsets, and AIO_LARGEFILE64 enables LFS interfaces.
+While this scheme does not involve <features.h> and its associated
+include-order problem, these definitions are understood only by
+KAIO code.
+
+The kernel always uses 64-bit offset values. If user code does not
+enable 64-bit offsets, the library (libdba.so) will suitably zero
+out the padding.
+
+Known Bugs
+----------
+
+ps(1) shows the slave threads with the name of the parent,
+although top(1) and pstree(1) show the name correctly.
+
+Resources
+---------
+
+(1) Where to find AIO man pages?
+
+	http://www.opengroup.org/public/pubs/online/7908799/xsh/aio.h.html
+
+Note: aio_fsync() is not required by POSIX, and currently
+not implemented in KAIO.
+	
+(2) Where to find LFS specifications?
+
+	http://ftp.sas.com/standards/large.file/
+
Index: squid/src/fs/butterfly/aiolib.c
diff -u /dev/null squid/src/fs/butterfly/aiolib.c:1.1.2.1
--- /dev/null	Tue Sep 28 18:39:07 2004
+++ squid/src/fs/butterfly/aiolib.c	Sun Dec 17 05:55:40 2000
@@ -0,0 +1,294 @@
+/*
+ * Copyright 1999, Silicon Graphics, Inc.
+ *
+ * Written October 1999 by Rajagopal Ananthanarayanan (ananth)
+ *  			at Silicon Graphics, Inc.
+ *
+ * According to http://oss.sgi.com/projects/kaio/license.html,
+ * KAIO code is covered by the same license as the linux kernel (GPL) 
+ *
+ */
+#define AIO_LARGEFILE64
+#include <linux/aio.h>
+#include <signal.h>
+#include <time.h>
+#include <sys/types.h>
+#include <bits/siginfo.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <asm/unistd.h>
+#include <sched.h>
+
+#include "squid.h"
+
+
+_syscall5(int, aio,
+		int, cmd,
+		unsigned long, arg1,
+		unsigned long, arg2,
+		unsigned long, arg3,
+		unsigned long, arg4);
+
+static int aio_cpid[AIO_MAX_THREADS];
+
+
+int
+child(void *arg)
+{
+	/* child */
+	fprintf (stderr, "entering child %d...\n", getpid());
+	sigblock(-1); /* block all signals */
+	aio(AIOCMD_SLAVE, 0, 0, 0, 0);
+	fprintf (stderr, "child %d exiting: %d\n", getpid(), errno);
+	exit(-1);
+}
+
+
+static void
+aio_init(int nslaves)
+{
+	int i;
+	char *cstack;
+
+	if (nslaves == 0) {
+		char *p;
+
+		/* get it from the environment */
+		if ((p = getenv(AIO_THREAD_ENVIRON)))
+			nslaves = strtol(p, 0, 0);
+		else
+			nslaves = AIO_DEFAULT_THREADS;
+	}
+
+	if (nslaves <= 0 || nslaves > AIO_MAX_THREADS) {
+		fprintf(stderr, "aio_init: Warning! nthreads %d not in "
+				"valid range 1 .. %d (see AIO_MAX_THREADS)\n",
+				nslaves, AIO_MAX_THREADS); 
+		fprintf(stderr, "aio_init: defaulting to %d threads "
+				"(see AIO_DEFAULT_THREADS)\n",
+				AIO_DEFAULT_THREADS);
+		nslaves = AIO_DEFAULT_THREADS;
+	}
+
+	for (i = 0; i < nslaves; i++) {
+		cstack = (char *) xmalloc(64 * 1024);
+		cstack += (32 * 1024);
+		cstack = (char *)((unsigned long)cstack & 0xfffff000);
+
+		/*
+		aio_cpid[i] =
+			aio(AIOCMD_SLAVE, 0, 0, 0, 0);
+		*/
+		aio_cpid[i] = clone(child, cstack,
+			SIGCHLD|CLONE_VM|CLONE_FS|CLONE_FILES, 0);
+		if (aio_cpid[i] < 0) {
+			perror("clone");
+			exit(-1);
+		}
+	}
+}
+
+int
+aio_read64(aiocb64_t *aiocb)
+{
+	int ret;
+
+	ret = aio(AIOCMD_READ, (unsigned long)aiocb, 0, 0, 0);
+	if (ret < 0 && errno == ENOTCONN) {
+		aio_init(0);
+		ret = aio(AIOCMD_READ, (unsigned long)aiocb, 0, 0, 0);
+	}
+	return ret;
+}
+
+int
+aio_write64(aiocb64_t *aiocb)
+{
+	int ret;
+
+	ret = aio(AIOCMD_WRITE, (unsigned long)aiocb, 0, 0, 0);
+	if (ret < 0 && errno == ENOTCONN) {
+		aio_init(0);
+		ret = aio(AIOCMD_WRITE, (unsigned long)aiocb, 0, 0, 0);
+	}
+	return ret;
+}
+
+
+int
+aio_cancel64(int fildes, aiocb64_t *aiocb)
+{
+	int ret;
+
+	ret = aio(AIOCMD_CANCEL, (unsigned long)fildes,
+			(unsigned long)aiocb, 0, 0);
+	if (ret < 0 && errno == ENOTCONN) {
+		aio_init(0);
+		ret = aio(AIOCMD_CANCEL, (unsigned long)fildes,
+				(unsigned long)aiocb, 0, 0);
+	}
+	return ret;
+}
+
+int
+aio_suspend64(const aiocb64_t *const list[], int nent, const struct timespec *timeout)
+{
+	int ret;
+
+	ret = aio(AIOCMD_SUSPEND, (unsigned long)list, (unsigned long)nent,
+			(unsigned long)timeout, 0);
+	if (ret < 0 && errno == ENOTCONN) {
+		aio_init(0);
+		ret = aio(AIOCMD_SUSPEND, (unsigned long)list,
+				(unsigned long)nent,
+				(unsigned long)timeout, 0);
+	}
+	return ret;
+}
+
+ssize_t
+aio_return64(aiocb64_t *aiocb)
+{
+	return(aiocb->aio_reserved[3]);
+}
+
+int
+lio_listio64(int mode, aiocb64_t *const list[], int nent, sigevent_t *sig)
+{
+	int ret;
+
+	ret = aio(AIOCMD_LIST_IO,
+			(unsigned long)mode, (unsigned long)list,
+			(unsigned long)nent, (unsigned long)sig);
+	if (ret < 0 && errno == ENOTCONN) {
+		aio_init(0);
+		ret = aio(AIOCMD_LIST_IO,
+				(unsigned long)mode, (unsigned long)list,
+				(unsigned long)nent, (unsigned long)sig);
+	}
+	return ret;
+}
+
+int
+aio_ioctl64(aiocb64_t *aiocb)
+{
+	int ret;
+
+	ret = aio(AIOCMD_IOCTL, (unsigned long)aiocb, 0, 0, 0);
+	if (ret < 0 && errno == ENOTCONN) {
+		aio_init(0);
+		ret = aio(AIOCMD_IOCTL, (unsigned long)aiocb, 0, 0, 0);
+	}
+	return ret;
+}
+
+int
+aio_close64(aiocb64_t *aiocb)
+{
+	int ret;
+
+	ret = aio(AIOCMD_CLOSE, (unsigned long)aiocb, 0, 0, 0);
+	if (ret < 0 && errno == ENOTCONN) {
+		aio_init(0);
+		ret = aio(AIOCMD_CLOSE, (unsigned long)aiocb, 0, 0, 0);
+	}
+	return ret;
+}
+
+
+#ifndef __USE_FILE_OFFSET64
+#define PAD_AIOCB(aiocb)	*(unsigned int *)&((aiocb)->__aio_pad) = 0;
+#else
+#define PAD_AIOCB(aiocb)
+#endif
+
+int
+aio_read(aiocb_t *aiocb)
+{
+	PAD_AIOCB(aiocb);
+	return(aio_read64((aiocb64_t *)aiocb));
+}
+
+int
+aio_write(aiocb_t *aiocb)
+{
+	PAD_AIOCB(aiocb);
+	return(aio_write64((aiocb64_t *)aiocb));
+}
+
+
+int
+aio_cancel(int fildes, aiocb_t *aiocb)
+{
+	if (aiocb)
+		PAD_AIOCB(aiocb);
+	return(aio_cancel64(fildes, (aiocb64_t *)aiocb));
+}
+
+
+typedef const aiocb64_t * const *CLISTC;
+typedef       aiocb64_t * const	*LISTC;
+
+int
+aio_suspend(const aiocb_t *const list[], int nent, const struct timespec *timeout)
+{
+#ifndef __USE_FILE_OFFSET64
+	int i;
+
+	for (i = 0; i < nent; i++) 
+		if (list[i])
+			PAD_AIOCB(list[i]);
+#endif
+	return(aio_suspend64((CLISTC)list, nent, timeout));
+}
+
+ssize_t
+aio_return(aiocb_t *aiocb)
+{
+	return aio_return64((aiocb64_t *)aiocb);
+}
+
+int
+lio_listio(int mode, aiocb_t *const list[], int nent, sigevent_t *sig)
+{
+#ifndef __USE_FILE_OFFSET64
+	int i;
+
+	for (i = 0; i < nent; i++) 
+		if (list[i])
+			PAD_AIOCB(list[i]);
+#endif
+	return(lio_listio64(mode, (LISTC)list, nent, sig));
+}
+
+int
+aio_ioctl(aiocb_t *aiocb)
+{
+	if (aiocb)
+		PAD_AIOCB(aiocb);
+	return(aio_ioctl64((aiocb64_t *)aiocb));
+}
+
+int
+aio_close(aiocb_t *aiocb)
+{
+	if (aiocb)
+		PAD_AIOCB(aiocb);
+	return(aio_close64((aiocb64_t *)aiocb));
+}
+
+void
+aio_pcinvalidate(int fd)
+{
+	if (aio(AIOCMD_PCINVALIDATE, fd, 0, 0, 0)) {
+		if (errno == ENOTCONN) {
+			aio_init(0);
+			if (aio(AIOCMD_PCINVALIDATE, fd, 0, 0, 0) == 0)
+				return;
+		}
+		perror("AIOCMD_PCINVALIDATE");
+		exit(-1);
+	}
+}
Index: squid/src/fs/butterfly/store_bf.h
diff -u /dev/null squid/src/fs/butterfly/store_bf.h:1.1.2.2
--- /dev/null	Tue Sep 28 18:39:07 2004
+++ squid/src/fs/butterfly/store_bf.h	Fri Dec 29 01:10:46 2000
@@ -0,0 +1,143 @@
+/*
+ * store_bf.h
+ *
+ * Internal declarations for the Butterfly cache_dir routines
+ *
+ * $Id$
+ */
+
+#ifndef __STORE_BF_H__
+#define __STORE_BF_H__
+
+#include <linux/reiserfs_fs.h>
+#include <linux/aio.h>
+
+
+typedef struct reiserfs_raw_ioctl_arg reiserfs_raw_ioctl_arg;
+typedef struct aiocb aiocb;
+
+typedef struct _bf_stats bf_stats_t;
+typedef struct _bfinfo_t bfinfo_t;
+typedef struct _bfop_t bfop_t;
+typedef struct _bfstate_t bfstate_t;
+
+
+/* When a read/write/close request comes for an sio which doesn't have
+   an fd available yet, we cannot fire up an aio_* operation for it
+   immediately.  Instead, we queue the requests and process them after
+   the open finishes.  The following structure is the element of those
+   queues. */
+
+/* Operation types.  BFOP_NAMES is indexed by these values (stats
+   output) and must be kept in sync with the enum. */
+enum bf_type {
+	BFNONE, BFOPEN, BFCREAT, BFREAD, BFWRITE,
+	BFCLOSE, BFUNLINK, BFSETATTR,
+	BFMAX,
+};
+#define BFOP_NAMES {"none","open","create","read","write", \
+                    "close","unlink","setattr"}
+
+/* One queued or in-flight aio operation. */
+struct _bfop_t {
+    bfop_t *next;		/* efficient dlist -- don't need xcalloc */
+    bfop_t *prev;
+    bfop_t *busychain;		/* FIFO link within bf_busyops[type] */
+    enum bf_type type;
+    storeIOState *sio;		/* our owner */
+    aiocb aio;
+    union {
+	STRCB *read;
+	FREE *write;		/* free_func */
+    } callback;
+    void *callback_data;	/* only for callback.read */
+    off_t rlen;			/* actual read/write length, syscall return */
+    int starttime;		/* start timestamp, ms */
+    unsigned int started:1;	/* 0 until launched */
+    unsigned int finished:1;	/* 0 until landed */
+};
+
+/* Per-cache_dir private data, hung off SwapDir.fsdata. */
+struct _bfinfo_t {
+    int dir;			/* fd of sd->path for reiserfs_raw ioctls */
+    link_list *kicklater;	/* list of bfstates that got EAGAIN */
+    int refuse_swapout;		/* storeBfCreate returns NULL if more
+				   than this many ops are in flight */
+};
+
+/* Per-sio state (storeIOState.fsstate). */
+struct _bfstate_t {
+    int fd;			/* file fd, -1 if not opened yet */
+    reiserfs_raw_ioctl_arg ra;	/* single ra shared by all parallel ops */
+    bfop_t ops;			/* op used for BF{OPEN,CREAT,UNLINK,SETATTR}
+				   and as queue head for BF{READ,WRITE} */
+    int errflag;		/* error that caused abortion of the sio.
+				   Normally zero and no abort.  */
+    unsigned int close_request:1; /* close when q is empty */
+    unsigned int close_started:1; /* BF_CLOSE launched already */
+    unsigned int aborting:1;	  /* close ASAP (flush, don't run the q) */
+    unsigned int unlink_attempted:1; /* already tried to unlink the file */
+};
+
+/* Global counters reported via the cachemgr "butterfly" page. */
+struct _bf_stats {
+    unsigned eagain;
+    unsigned away;
+    unsigned max_away;
+    unsigned abs_max_away;
+    unsigned swapout_refused;
+    struct bf_stats_counters {
+	unsigned ops;
+	unsigned success;
+	unsigned fail;
+	unsigned max_dura;	/* maximal op duration */
+	float    avg_dura;	/* average op duration */
+	float    avg_dura100;	/* average op duration in last 100 ops */
+    } counters[BFMAX];
+};
+
+/* FIFO lists of launched ops.  We keep different ops in different
+   lists because op delays and urgency are different */
+struct bf_busyops {
+    bfop_t *head;		/* FIFO list of launched BF of certain type */
+    bfop_t **tail;	 	/* points at the last element's busychain
+				   link, or at head when the list is empty */
+};
+
+extern MemPool *bf_state_pool;
+extern MemPool *bf_op_pool;
+extern bf_stats_t bf_stats;
+extern struct bf_busyops bf_busyops[BFMAX];
+extern char *bfop_names[];	/* op-name table; definition is in store_dir_bf.c */
+extern FILE *bf_opstats;		/* bulky binary op stats file */
+extern char *bf_opstats_fn;
+
+
+/* from store_io_bf.c */
+extern int   storeBfKickQueue (bfstate_t *bfstate);
+extern void  storeBfOpCompletion (bfop_t *op);
+
+/* from store_dir_bf.c */
+extern void  storeBfDirTakeOffline(SwapDir *sd);
+extern void  storeBfDirUnrefObj(SwapDir * SD, StoreEntry * e);
+extern void  storeBfDirReplAdd(SwapDir * SD, StoreEntry * e);
+extern void  storeBfDirReplRemove(StoreEntry * e);
+extern void  storeBfDirFillRa(StoreEntry *e, reiserfs_raw_ioctl_arg *ra);
+
+/*
+ * Store IO stuff
+ */
+extern STOBJCREATE storeBfCreate;
+extern STOBJOPEN storeBfOpen;
+extern STOBJCLOSE storeBfClose;
+extern STOBJREAD storeBfRead;
+extern STOBJWRITE storeBfWrite;
+extern STOBJUNLINK storeBfUnlink;
+
+
+/* Get microsecond timestamp for profiling.  Almost verbatim from
+   linux/mm/filemap.c (KAIO).
+   NOTE(review): tv_sec * 1000000 overflows a 32-bit unsigned long, so
+   the value wraps; acceptable only if callers compute short
+   differences -- confirm. */
+
+static inline unsigned long
+kaio_time(void)
+{
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return (tv.tv_sec * 1000000 + tv.tv_usec);
+}
+
+
+#endif /* __STORE_BF_H__ */
Index: squid/src/fs/butterfly/store_dir_bf.c
diff -u /dev/null squid/src/fs/butterfly/store_dir_bf.c:1.1.2.3
--- /dev/null	Tue Sep 28 18:39:07 2004
+++ squid/src/fs/butterfly/store_dir_bf.c	Fri Dec 29 01:10:46 2000
@@ -0,0 +1,1064 @@
+
+/*
+ * $Id$
+ *
+ * DEBUG: section 86    Store Directory, the "Butterfly" version
+ * AUTHOR: Yury Shevchuk <sizif@botik.ru>
+ *
+ * SQUID Internet Object Cache  http://squid.nlanr.net/Squid/
+ * ----------------------------------------------------------
+ *
+ *  Squid is the result of efforts by numerous individuals from the
+ *  Internet community.  Development is led by Duane Wessels of the
+ *  National Laboratory for Applied Network Research and funded by the
+ *  National Science Foundation.  Squid is Copyrighted (C) 1998 by
+ *  the Regents of the University of California.  Please see the
+ *  COPYRIGHT file for full details.  Squid incorporates software
+ *  developed and/or copyrighted by other sources.  Please see the
+ *  CREDITS file for full details.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
+ *
+ */
+
+#include "squid.h"
+#include <signal.h>
+
+#if HAVE_STATVFS
+#if HAVE_SYS_STATVFS_H
+#include <sys/statvfs.h>
+#endif
+#endif
+
+#include "store_bf.h"
+
+bf_stats_t bf_stats;
+
+typedef struct _RebuildState RebuildState;
+struct _RebuildState {
+    SwapDir *sd;
+    struct _store_rebuild_data counts;
+    int clean;
+};
+
+int n_bf_dirs = 0;
+int n_bf_dirs_online = 0;
+static int *bf_dir_index = NULL;
+static int *bf_dir_online_index = NULL;
+MemPool *bf_state_pool = NULL;
+MemPool *bf_op_pool = NULL;
+static int bf_initialised = 0;
+static int bf_nthreads = 0;
+struct bf_busyops bf_busyops[BFMAX];
+char *bfop_names[] = BFOP_NAMES;
+FILE *bf_opstats;		/* bulky binary op stats file */
+char *bf_opstats_fn;
+
+static int  storeBfDirCreateDirectory(const char *path,
+					     int should_exist);
+static void storeBfDirReindex();
+static int  storeBfDirVerifyDirectory(const char *path);
+static int  storeBfDirVerifyCacheDir(SwapDir * sd);
+static void storeBfDirOpenSwapLog(SwapDir * sd);
+static void storeBfDirCloseSwapLog(SwapDir * sd);
+static void storeBfDirInit(SwapDir * sd);
+static void storeBfStats(StoreEntry * sentry);
+static void storeBfDirSync(SwapDir * SD);
+static int  storeBfDirCallback(SwapDir * SD);
+static void storeBfRelease(StoreEntry *e);
+static SwapDir *storeBfReselectDir (const StoreEntry *e);
+static void storeBfDirRebuild(SwapDir * sd);
+static int storeBfDirWriteCleanStart(SwapDir * sd);
+static const StoreEntry *storeBfDirCleanLogNextEntry(SwapDir * sd);
+static void storeBfDirWriteCleanDone(SwapDir * sd);
+static void storeBfDirSwapLog(const SwapDir * sd, const StoreEntry * e, int op);
+static void storeBfDirNewfs(SwapDir * sd);
+static int  storeBfDirIs(SwapDir * sd);
+static void storeBfDirMaintain(SwapDir * SD);
+static int  storeBfSelectSwapDir(const StoreEntry *);
+static void storeBfDirRefObj(SwapDir * SD, StoreEntry * e);
+static void storeBfDirStats(SwapDir * SD, StoreEntry * sentry);
+static void storeBfDirReconfigure(SwapDir * sd, int index, char *path);
+static void storeBfDirDump(StoreEntry * entry, const char *name, SwapDir * s);
+static void storeBfDirFree(SwapDir * s);
+static int  storeBfCleanupDoubleCheck(SwapDir * sd, StoreEntry * e);
+static void storeBfDirParse(SwapDir * sd, int index, char *path);
+
+
+/* Debugging aid: verify that each bf_busyops tail pointer and the
+   bf_stats.away counter agree with the actual busychain contents.
+   Logs a complaint on mismatch; always returns 0. */
+int
+sanitycheck()
+{
+    int t;
+    unsigned true_away = 0;
+
+    for (t = BFNONE+1; t < BFMAX; t++) {
+	bfop_t *op = bf_busyops[t].head;
+	bfop_t **true_tail = &bf_busyops[t].head;
+	while (op) {
+	    true_away++;
+	    true_tail = &op->busychain;
+	    op = op->busychain;
+	}
+	/* original logged unconditionally; only complain on mismatch */
+	if (bf_busyops[t].tail != true_tail)
+	    debug(86, 1) ("sanitycheck: bf_busyops[%d].tail != true_tail!\n", t);
+    }
+    if (bf_stats.away != true_away)
+	debug(86, 1) ("sanitycheck: bf_stats.away(%d) != true_away(%d)\n",
+		      bf_stats.away, true_away);
+    return 0;
+}
+
+static int
+storeBfDirCreateDirectory(const char *path, int should_exist)
+{
+    /* our directories are root dirs of reiserfs_raw fs's.  Never
+       create anything.  (should_exist is deliberately ignored.) */
+    if (storeBfDirVerifyDirectory (path) != 0)
+	fatalf("Swap directory %s needs manual intervention\n", path);
+    return 0;
+}
+
+
+/* Switch @sd to offline state (after disk failure is detected).
+   Offline state means don't use this directory temporarily, but
+   be ready to turn it back on-line, with minimal hit ratio loss. */
+
+void
+storeBfDirTakeOffline(SwapDir *sd)
+{
+    if (! sd->flags.offline) {
+	sd->flags.offline = 1;
+	/* one fewer dir participates in storeTreeGet lookups */
+	n_storetree_dirs--;
+	assert(n_storetree_dirs >= 0);
+	storeBfDirReindex ();
+    }
+}
+
+/* We maintain two arrays of SwapDir indices, subsets of
+   Config.cacheSwap.swapDirs: bf_dir_index contains all butterfly
+   dirs, and bf_dir_online_index contains all butterfly dirs currently
+   on-line.  These are used by storeBfSelectSwapDir.  This function
+   builds these arrays; it should be called every time the dirs are
+   reconfigured or online status changed */
+
+static void
+storeBfDirReindex ()
+{
+    int dirn;
+
+    safe_free (bf_dir_index);
+    safe_free (bf_dir_online_index);
+
+    /* first pass: count butterfly dirs to size the arrays */
+    n_bf_dirs = 0;
+    for (dirn = 0; dirn < Config.cacheSwap.n_configured; dirn++) {
+	SwapDir *sd = INDEXSD(dirn);
+	if (storeBfDirIs(sd))
+	    n_bf_dirs++;
+    }
+
+    if (n_bf_dirs == 0)
+	return;
+
+    bf_dir_index = xcalloc(n_bf_dirs, sizeof(*bf_dir_index));
+    bf_dir_online_index = xcalloc(n_bf_dirs, sizeof(*bf_dir_online_index));
+
+    /* second pass: fill both index arrays */
+    n_bf_dirs = 0;
+    n_bf_dirs_online = 0;
+
+    for (dirn = 0; dirn < Config.cacheSwap.n_configured; dirn++) {
+	SwapDir *sd = INDEXSD(dirn);
+	if (storeBfDirIs(sd)) {
+	    bf_dir_index[n_bf_dirs++] = dirn;
+	    if (! sd->flags.offline)
+		bf_dir_online_index[n_bf_dirs_online++] = dirn;
+	}
+    }
+}
+
+/* Check that path is a directory on a functional reiserfs_raw fs.
+   Returns 0 if usable, -1 if not a readable directory, -2 if the raw
+   ioctl is not working (e.g. drive offline). */
+static int
+storeBfDirVerifyDirectory(const char *path)
+{
+    struct stat sb;
+    reiserfs_raw_ioctl_arg ra;
+    int fd, fd2;
+
+    if (stat(path, &sb) < 0) {
+	debug(20, 0) ("%s: %s\n", path, xstrerror());
+	return -1;
+    }
+    if (S_ISDIR(sb.st_mode) == 0) {
+	debug(20, 0) ("%s is not a directory\n", path);
+	return -1;
+    }
+    fd = open (path, O_RDONLY);
+    if (fd < 0) {
+	debug(20, 0) ("%s is unreadable for uid %u, gid %u\n",
+		      path, getuid(), getgid());
+	return -1;
+    }
+    /* probe with an arbitrary id; ENOENT (no such object) still proves
+       the raw interface works */
+    ra.id1 = ra.id2 = 7;
+    fd2 = ioctl (fd, REISERFS_IOC_RAWOPEN, &ra);
+    if (fd2 == -1 && errno != ENOENT) {
+	debug(20, 0) ("%s is not a functional REISERFS-RAW directory\n", path);
+	close(fd);
+	return -2;
+    }
+    if (fd2 >= 0)		/* don't close(-1) on the ENOENT path */
+	close(fd2);
+    close(fd);
+    return 0;
+}
+
+/*
+ * This function is called by storeBfDirInit().  If this returns < 0,
+ * then Squid exits, complains about swap directories not
+ * existing, and instructs the admin to run 'squid -z'
+ */
+static int
+storeBfDirVerifyCacheDir(SwapDir * sd)
+{
+    const char *path = sd->path;
+    int result = storeBfDirVerifyDirectory(path);
+
+    if (result == -2) {
+	/* The directory does exist, but it looks like the drive that
+	   the dir is on is offline.  Don't panic, just set the flag
+	   to avoid using this directory. */
+	sd->flags.offline = 1;
+	result = 0;
+    }
+    return result;
+}
+
+/* "Swap log" hook: BF has no log; instead this verifies the dir and
+   opens the directory fd used for reiserfs_raw ioctls. */
+static void
+storeBfDirOpenSwapLog(SwapDir * sd)
+{
+    bfinfo_t *bfi = sd->fsdata;
+    static const char *errmsg =
+      "\tFailed to verify one of the swap directories, Check cache.log\n"
+      "\tfor details.\n";
+
+    sd->flags.offline = 0;
+    if (storeBfDirVerifyCacheDir(sd) < 0)
+	fatal(errmsg);
+    if (! sd->flags.offline) {
+	bfi->dir = open (sd->path, O_RDONLY);
+	if (bfi->dir < 0) {
+	    debug(50, 0) ("storeBfInit: open(%s): %s\n", sd->path, xstrerror());
+	    sd->flags.offline = 1;
+	}
+	else {
+	    char note[FD_DESC_SZ];
+	    /* the "Butterfly dir fd" prefix is checked on close */
+	    snprintf (note, FD_DESC_SZ, "Butterfly dir fd %s", sd->path);
+	    fd_open(bfi->dir, FD_UNKNOWN, note);
+	    n_storetree_dirs++;
+	}
+    }
+    storeBfDirReindex ();
+}
+
+static void
+storeBfDirCloseSwapLog(SwapDir * sd)
+{
+    bfinfo_t *bfi = sd->fsdata;
+    fde *F = &fd_table[bfi->dir];
+
+    /*
+     * Careful.  If there was disk failure and the admin used
+     * umount_-f, our fd is closed behind our back, and then possibly
+     * reused for other purposes.  We shouldn't close the fd if this
+     * is the case.
+     */
+    /* 16 == strlen("Butterfly dir fd"), the prefix written at open */
+    if (memcmp (F->desc, "Butterfly dir fd", 16) == 0) {
+	fd_close (bfi->dir);
+	close (bfi->dir);
+    }
+    bfi->dir = -1;
+
+    if (! sd->flags.offline)
+	n_storetree_dirs--;
+    assert(n_storetree_dirs >= 0);
+
+    storeBfDirReindex ();
+}
+
+/* SwapDir init hook: export the accumulated thread count to the AIO
+   library (unless overridden from the environment), then open the dir
+   and kick off the (trivial) rebuild. */
+static void
+storeBfDirInit(SwapDir * sd)
+{
+    if (!getenv(AIO_THREAD_ENVIRON)) {
+	static char aio_nthreads[30];
+	snprintf(aio_nthreads, sizeof(aio_nthreads), "%d", bf_nthreads);
+	setenv(AIO_THREAD_ENVIRON, aio_nthreads, 0);
+    }
+    /* We are doing polled I/O, don't wait too long in comm_poll */
+    comm_quick_poll_required();
+    storeBfDirOpenSwapLog(sd);
+    storeBfDirRebuild(sd);
+}
+
+
+/* cachemgr "butterfly" page.  Note: max_away is reset on every report,
+   so it shows the maximum since the previous report. */
+static void
+storeBfStats(StoreEntry * sentry)
+{
+    int t;
+
+    if (bf_stats.abs_max_away < bf_stats.max_away)
+	bf_stats.abs_max_away = bf_stats.max_away;
+    storeAppendPrintf(sentry, "AIO ops in flight: now %u max %u absmax %u\n",
+		      bf_stats.away,
+		      bf_stats.max_away,
+		      bf_stats.abs_max_away);
+    bf_stats.max_away = 0;
+
+    storeAppendPrintf(sentry, "EAGAIN errors: %u\n", bf_stats.eagain);
+    storeAppendPrintf(sentry, "Swapout refused because of disk overload: %u\n",
+		      bf_stats.swapout_refused);
+    storeAppendPrintf(sentry, "\n            OPS SUCCESS    FAIL"
+		      "   DURATION, ms: MAX   AVG  AVG-for-last-100-ops\n");
+                                       
+    for (t = BFNONE+1; t < BFMAX; t++) {
+	struct bf_stats_counters *c = &bf_stats.counters[t];
+	storeAppendPrintf(sentry, "%7s %7u %7u %7u                %5u %5.0f %5.0f\n",
+			  bfop_names[t],
+			  c->ops, c->success, c->fail,
+			  c->max_dura, c->avg_dura, c->avg_dura100);
+    }
+
+    sanitycheck();
+}
+
+/*
+ * storeBfDirSync
+ *
+ * Sync any pending data. We just sit around and read the queue
+ * until the data has finished writing.
+ */
+static void
+storeBfDirSync(SwapDir * sd)
+{
+  /* later.  NOTE(review): the disabled draft below references
+     bfi->away and SD, neither of which exists here ('away' is global
+     in bf_stats, parameter is 'sd') -- fix before enabling. */
+#if 0
+    bfinfo_t *bfi = sd->fsdata;
+    while (bfi->away > 0) {
+	debug(86, 1) ("storeBfDirSync: %d messages away\n", bfi->away);
+	storeBfDirCallback(SD);
+    }
+#endif
+}
+
+/* Write down op duration profiling data */
+
+/* Append one op's profiling record (type, in-flight count, aio
+   timestamps) to the binary opstats file, opening it lazily. */
+static void
+storeBfWriteOpStats (bfop_t *op)
+{
+    if (bf_opstats_fn && ! bf_opstats) {
+	bf_opstats = fopen (bf_opstats_fn, "a");
+	if (! bf_opstats) {
+	    debug(50, 1) ("WARNING: %s: %s\n", bf_opstats_fn, xstrerror());
+	    bf_opstats_fn = 0;
+	}
+    }
+    if (! bf_opstats)		/* fopen failed or stats disabled */
+	return;
+    fwrite (&op->type, sizeof (op->type), 1, bf_opstats);
+    fwrite (&bf_stats.away, sizeof (bf_stats.away), 1, bf_opstats);
+    fwrite (&op->aio.aio_times, sizeof (op->aio.aio_times), 1, bf_opstats);
+}
+
+
+/*
+ * storeBfDirCallback
+ *
+ * This is called regularly from the main select() loop to let us serve
+ * aio completion requests.
+ */
+static int
+storeBfDirCallback(SwapDir * sd)
+{
+    bfinfo_t *bfi = sd->fsdata;
+    storeIOState *sio;
+    static struct timeval last_call;
+    int count_completions = 0;
+    int count_kicks = 0;
+    int i;
+
+    /* Queue lookup order.  First, serve ops that require no further
+       processing (their processing doesn't result in growth of the
+       queue).  BFOPEN and BFCREAT trigger bursts of new messages, so
+       they come last.  BFOPEN before BFCREAT as open is "interactive"
+       while creat is "batch" (swap-out delay does not matter) */
+    static int op_order[] = {BFCLOSE, BFUNLINK, BFREAD, BFWRITE,
+			     BFOPEN, BFCREAT, BFNONE};
+    /* parallel to op_order: 6 used entries, BFNONE terminates */
+    int scan_depth[] = {3, 3, 30, 3, bf_nthreads, 1};
+
+    /* No point in scanning queues more often than once in 3ms */
+    if ((unsigned long)tvSubUsec(last_call, current_time) < 3000)
+	return 0;
+
+    last_call = current_time;
+    
+    if (bf_stats.away > 0)
+	debug(86,3) ("storeBfDirCallback: away %u\n", bf_stats.away);
+
+    for (i = 0; op_order[i] != BFNONE; i++) {
+	bfop_t *op;
+	int t = op_order[i];
+	int depth = scan_depth[i]; /* go through this many non-ready ops */
+	bfop_t **opp = &bf_busyops[t].head;
+
+	while ((op = *opp)) {
+	    if (aio_error(&op->aio) == EINPROGRESS) {
+		debug (86,3) ("storeBfDirCallback: %p incomplete, depth %d\n", op, depth);
+		opp = &op->busychain;
+		if (--depth <= 0)
+		    break;
+	    }
+	    else {
+		/* splice the completed op out, fixing tail if it was last */
+		*opp = op->busychain;
+		if (! *opp)
+		    bf_busyops[t].tail = opp;
+		bf_stats.away--;
+		debug (86,3) ("storeBfDirCallback: %p complete\n", op);
+		if (bf_opstats_fn) {
+		    op->aio.aio_times[AIOTIME_FINISH] = kaio_time ();
+		    storeBfWriteOpStats (op);
+		}
+		storeBfOpCompletion (op);
+		count_completions++;
+	    }
+	}
+    }
+
+    /* retry sios whose queue kick previously hit EAGAIN */
+    while ((sio = linklistShift (&bfi->kicklater))) {
+	if (cbdataValid (sio)) {
+	    bfstate_t *bfstate = sio->fsstate;
+	    if (storeBfKickQueue (bfstate) < 0) {
+		/* EAGAIN again.  Further kick attempts will likely
+		   fail too, so stop for now (busy wait processing the
+		   same sio isn't the best we can do). */
+		cbdataUnlock (sio);
+		break;
+	    }
+	    count_kicks++;	/* unsuccessful uncounted */
+	}
+	cbdataUnlock (sio);
+    }
+
+    if (count_completions + count_kicks > 0)
+	debug (86,3) ("storeBfDirCallback: done %u completions, %u kicks\n",
+		      count_completions, count_kicks);
+
+    return count_completions + count_kicks;
+}
+
+
+/* Remove StoreEntry from memory while keeping the disk object.  Save
+   important e->lastref and e->expires info in the file's stat data,
+   so the expiration process (storeTreeDirClean) can make decisions based
+   on them without reading the file.
+
+   For reiserfs_raw, we avoid setting attributes now.  They have been
+   set on object creation, and the only change we may need now is to
+   update the lastref attribute, which is currently set to the time of file
+   creation or last open.  (Hm, we must update lastref for hot objects
+   periodically somewhere, or they might get GCed on the disk.)  */
+
+static void
+storeBfRelease(StoreEntry *e)
+{
+    assert (e->lock_count == 0);
+
+    if (EBIT_TEST (e->flags, ENTRY_TENTATIVE)) {
+	/* If we are asked to release a tentative entry, this means it
+	   wasn't successful (no file found for this entry).  It is no
+	   longer tentative, it is simply a dummy StoreEntry. */
+	debug(86, 3) ("storeBfRelease: rm unsuccesful tentative entry %p\n", e);
+	goto kill_it;
+    }
+
+/* bit mask for a single flag.  NOTE(review): never #undef'ed, so M
+   leaks into the rest of the translation unit. */
+#define M(BITNUM) (1L<<(BITNUM))
+    if ((e->flags &~ M(ENTRY_VALIDATED)) != (M(ENTRY_CACHABLE)|M(ENTRY_DISPATCHED))) {
+	/* Uncommon, leave in memory */
+	debug(86, 3) ("storeBfRelease: leaving in memory entry with uncommon flags %s\n",
+		      storeEntryFlags(e));
+	return;
+    }
+
+ kill_it:
+    storeSetMemStatus(e, NOT_IN_MEMORY);
+    destroy_StoreEntry(e);
+}
+
+
+/* This is called by store dir selection procedure to let us redirect
+   the object to one of our fellow bf dirs --
+   storeBfSelectSwapDir will say which one using a hash based on
+   URL */
+
+/* Redirect the entry to the butterfly dir chosen by the URL hash. */
+static SwapDir *
+storeBfReselectDir (const StoreEntry *e)
+{
+    return INDEXSD(storeBfSelectSwapDir(e));
+}
+
+
+/* Event handler that immediately reports the rebuild finished. */
+static void
+storeRebuildDummy(void *data)
+{
+    RebuildState *rb = data;
+
+    /*
+     * Everything already done in storeBfDirRebuild.  But calling
+     * storeRebuildComplete from there is banned.
+     */
+    store_dirs_rebuilding--;
+    storeRebuildComplete(&rb->counts);
+    cbdataFree(rb);
+}
+
+CBDATA_TYPE(RebuildState);
+
+/* BF needs no index rebuild; schedule a dummy event so the generic
+   rebuild machinery still sees a completion. */
+static void
+storeBfDirRebuild(SwapDir * sd)
+{
+    /* was: rb = xcalloc(...) immediately overwritten by CBDATA_ALLOC,
+       leaking the xcalloc'd block */
+    RebuildState *rb;
+    EVH *func = storeRebuildDummy;
+
+    CBDATA_INIT_TYPE(RebuildState);
+    rb = CBDATA_ALLOC(RebuildState, NULL);
+    rb->sd = sd;
+    store_dirs_rebuilding++;
+    eventAdd("storeRebuild", func, rb, 0.0, 0);
+}
+
+
+/*
+ * Begin the process to write clean cache state.  For BF this
+ * is no-op.
+ */
+static int
+storeBfDirWriteCleanStart(SwapDir * sd)
+{
+    /* no clean log to write for BF */
+    sd->log.clean.write = NULL;
+    sd->log.clean.state = NULL;
+    return 0;  /* -1 to indicate error */
+}
+
+/*
+ * Get the next entry that is a candidate for clean log writing
+ */
+static const StoreEntry *
+storeBfDirCleanLogNextEntry(SwapDir * sd)
+{
+    return NULL;		/* no clean log: nothing to enumerate */
+}
+
+static void
+storeBfDirWriteCleanDone(SwapDir * sd)
+{
+    /* sync (wait while diskd finishes) and write down age classes.  later */
+}
+
+/* BF keeps no swap log; individual log writes are a no-op. */
+static void
+storeBfDirSwapLog(const SwapDir * sd, const StoreEntry * e, int op)
+{
+}    
+
+/* "squid -z" hook: only verifies the directory (creation is manual). */
+static void
+storeBfDirNewfs(SwapDir * sd)
+{
+    debug(47, 3) ("Creating swap space in %s\n", sd->path);
+    storeBfDirCreateDirectory(sd->path, 0);
+    /* storeBfDirCreateSwapSubDirs(sd); */
+}
+
+
+/* We use a 64-bit version of the r5 hash to make the reiserfs_raw file
+   id.  R5 is chosen for its property of more or less preserving the
+   original data order, which will hopefully be good for seek times.
+
+   The code has been copied from linux/fs/reiserfs/hashes.c, the only
+   change is s/u32/unsigned long long/g.
+
+   One more change (following the hint from Nikita Danilov
+   <nikitadanilov@yahoo.com>): the values from original r5 are always
+   a multiple of 11, which means narrowing the hash space.  Fix that
+   by doing multiplication by 11 *before* adding a char, not after. */
+
+static unsigned long long
+r5l_hash (const char *msg, int len)
+{
+    unsigned long long a = 0;
+    /* NOTE(review): len is ignored, the loop stops at NUL; and *msg is
+       plain (possibly signed) char, so high-bit bytes shift
+       implementation-defined values.  Changing either would alter
+       existing on-disk ids -- confirm before touching. */
+    while (*msg) { 
+	a *= 11;
+	a += *msg << 4;
+	a += *msg >> 4;
+	msg++;
+    } 
+    return a;
+}
+
+/* Derive the 64-bit reiserfs_raw object id for a URL: currently the
+   r5l hash of the URL; the disabled alternative reuses 8 bytes of the
+   MD5 key. */
+static inline unsigned long long
+make_id (const char *key, const char *msg, int len)
+{
+#if 1
+    return r5l_hash (msg, len);
+#else
+    return *(unsigned long long *)(&key[8]);
+#endif
+}
+
+
+
+/* Disk-oriented equivalent of storeGet, called from storeGetPublic.
+   Returns a tentative StoreEntry for the URL without touching the
+   disk; NULL when no butterfly dirs are on-line or for cachemgr
+   (cache_object://) URLs. */
+
+StoreEntry *
+storeTreeGet (const char *url, const cache_key *key)
+{
+    StoreEntry *e;
+
+    if (n_storetree_dirs == 0)
+	return NULL;
+    if (strncasecmp (url, "cache_object://", sizeof "cache_object://"-1) == 0)
+	return NULL;
+
+    debug(86, 5) ("StoreTreeGet for %s %s  Callers %p %p %p %p\n", url,
+		  storeKeyText (key),
+		  __builtin_return_address(0),
+		  __builtin_return_address(1),
+		  __builtin_return_address(2),
+		  __builtin_return_address(3)
+		  );
+
+    /* We don't know if the requested object is in the store, and we
+       don't want to waste time on stat(2) to find out.  So we create a
+       tentative StoreEntry with the requested URL as if it were in the
+       store, and let upper layers try to open it.  They will try to
+       swap in, and if the object is not on the disk, they'll get an
+       open error and handle it -- for example, in clientCacheHit
+       they jump to clientCacheMiss on swapin error. */
+
+    e = new_StoreEntry (STORE_ENTRY_WITHOUT_MEMOBJ, NULL, NULL);
+    e->store_status = STORE_OK;
+    e->mem_status = NOT_IN_MEMORY;
+    e->swap_status = SWAPOUT_DONE;
+    e->swap_filen = 73;		/* for luck */
+    /* would have to read in the object to restore the flags.  To
+       avoid that, we'll teach storeTreeRelease not to purge
+       StoreEntries with non-common flags (ENTRY_NEGCACHED, for one) */
+    e->flags = 0;
+    EBIT_SET(e->flags, ENTRY_CACHABLE);
+    EBIT_SET(e->flags, ENTRY_DISPATCHED);
+    EBIT_SET(e->flags, ENTRY_VALIDATED);
+    EBIT_SET(e->flags, ENTRY_TENTATIVE);
+    e->ping_status = PING_NONE;
+    storeHashInsert (e, key);
+
+    e->id = make_id (key, url, strlen (url));
+    e->id_valid = 1;
+    e->swap_dirn = storeBfSelectSwapDir (e);
+
+    return e;
+}
+
+
+/* Restore StoreEntry fields from metadata just read from disk object.
+   Called from storeClientReadHeader */
+void
+storeTreeRestoreMetadata(StoreEntry *e, tlv *t)
+{
+    SwapDir *sd = INDEXSD(e->swap_dirn);
+    size_t save_sz = e->swap_file_sz;
+
+    assert(t->type == STORE_META_STD);
+    assert(t->length == STORE_HDR_METASIZE);
+    assert(storeBfDirIs (sd));
+    /* overwrite the STD metadata fields starting at e->timestamp */
+    xmemcpy(&e->timestamp, t->value, STORE_HDR_METASIZE);
+
+    /* swap_file_sz in the metadata is most likely 0, it is not known
+       at the point when swap-out starts.  Replace with known good
+       value that we got from RAWOPEN */
+    e->swap_file_sz = save_sz;
+
+    /* this will probably be set later, but just in case */
+    e->lastref = squid_curtime;
+}
+
+
+/* True iff sd is a butterfly cache_dir. */
+static int
+storeBfDirIs(SwapDir * sd)
+{
+    return (strcmp(sd->type, "butterfly") == 0);
+}
+
+static void
+storeBfDirMaintain(SwapDir * SD)
+{
+    /* reiserfs-raw has object removal implemented in the kernel. */
+    return;
+}
+
+/*
+ * storeBfDirCheckObj
+ *
+ * This routine is called by storeDirSelectSwapDir to see if the given
+ * object is able to be stored on this filesystem.  (Round-robin dir
+ * selection alg doesn't use this at all).
+ *
+ */
+static int
+storeBfDirCheckObj(SwapDir * SD, const StoreEntry * e)
+{
+    /* accept only if the URL hash maps the entry to this very dir */
+    if (storeBfSelectSwapDir(e) != SD->index)
+        return -1;
+    return 0;
+}
+
+
+/* Select the cachedir where to put/where to look
+   for the given entry.  The current implementation is
+   quick&dirty: assumes all bf dirs are of equal size.
+   The right way is to weight based on dir sizes, TODO later.
+   Look at CARP as Henrik suggested?  */
+
+static int
+storeBfSelectSwapDir(const StoreEntry * _e)
+{
+    /* cast away const: we cache the computed id in the entry */
+    StoreEntry *e = (StoreEntry *) _e;
+    int dirn;
+
+    if (! e->id_valid) {
+	assert (e->mem_obj);
+	assert (e->mem_obj->url);
+	e->id = make_id (e->hash.key, e->mem_obj->url,
+			 strlen (e->mem_obj->url));
+	e->id_valid = 1;
+    }
+
+    /* Select one of butterfly dirs based on a hash value extracted
+       from MD5 key of the object */
+
+    assert (n_bf_dirs > 0);
+    dirn = bf_dir_index[*(unsigned *)e->hash.key % n_bf_dirs];
+
+    if (INDEXSD(dirn)->flags.offline) {
+	/* The normal selection have chosen a dir currently offline.
+	   Reselect the dir from the remaining (online) dirs.  This
+	   way, the objects that would normally go to the disk
+	   currently offline are distributed evenly among the
+	   remaining dirs. */
+	if (n_bf_dirs_online <= 0)
+	    fatal("All dirs offline, can't handle this yet");
+	dirn = bf_dir_online_index[*(unsigned *)e->hash.key % n_bf_dirs_online];
+    }
+
+    assert (! INDEXSD(dirn)->flags.offline);
+
+    return dirn;
+}
+
+
+/*
+ * storeBfDirRefObj
+ *
+ * This routine is called whenever an object is referenced, so we can
+ * maintain replacement information within the storage fs.
+ *
+ * TODO put reiserfs_raw setattr here!
+ */
+/* Reference hook; the replacement-policy call is still disabled.
+   (Original debug format had "%p %d/%d" with only one argument.) */
+static void
+storeBfDirRefObj(SwapDir *sd, StoreEntry * e)
+{
+    debug(1, 3) ("storeBfDirRefObj: referencing %p\n", e);
+#if 0
+    /* TODO: call storeBfSettattr here */
+    if (sd->repl->Referenced)
+	sd->repl->Referenced(sd->repl, e, &e->repl);
+#endif
+}
+
+/*
+ * storeBfDirUnrefObj
+ * This routine is called whenever the last reference to an object is
+ * removed, to maintain replacement information within the storage fs.
+ */
+void
+storeBfDirUnrefObj(SwapDir * SD, StoreEntry * e)
+{
+    debug(1, 3) ("storeBfDirUnrefObj: unreferencing %p\n", e);
+#if 0
+    /* NOTE(review): disabled draft uses 'sd' but parameter is 'SD' */
+    if (sd->repl->Dereferenced)
+	sd->repl->Dereferenced(sd->repl, e, &e->repl);
+#endif
+}
+
+
+/*
+ * Add and remove the given StoreEntry from the replacement policy in
+ * use.
+ */
+
+void
+storeBfDirReplAdd(SwapDir * sd, StoreEntry * e)
+{
+    /* replacement policy integration disabled for BF */
+#if 0
+    debug(20, 4) ("storeBfDirReplAdd: added node %p to dir %d\n", e, sd->index);
+    sd->repl->Add(sd->repl, e, &e->repl);
+#endif
+}
+
+
+void
+storeBfDirReplRemove(StoreEntry * e)
+{
+    /* replacement policy integration disabled for BF */
+#if 0
+    SwapDir *SD;
+    if (e->swap_dirn < 0)
+	return;
+    SD = INDEXSD(e->swap_dirn);
+    debug(20, 4) ("storeBfDirReplRemove: remove node %p from dir %d\n", e,
+	SD->index);
+    SD->repl->Remove(SD->repl, e, &e->repl);
+#endif
+}
+
+
+
+/* Per-dir cachemgr stats: filesystem usage (via statvfs) and flags. */
+static void
+storeBfDirStats(SwapDir *sd, StoreEntry * sentry)
+{
+    bfinfo_t *bfi;
+#if HAVE_STATVFS
+    struct statvfs sfs;
+#endif
+    bfi = sd->fsdata;
+#if HAVE_STATVFS
+/* convert num fs blocks of size fsbs to blocks of size bs */
+#define fsbtoblk(num, fsbs, bs) \
+    (((fsbs) != 0 && (fsbs) < (bs)) ? \
+            (num) / ((bs) / (fsbs)) : (num) * ((fsbs) / (bs)))
+    if (!statvfs(sd->path, &sfs)) {
+	storeAppendPrintf(sentry, "Filesystem Space in use: %d/%d KB (%d%%)\n",
+	    fsbtoblk((sfs.f_blocks - sfs.f_bfree), sfs.f_frsize, 1024),
+	    fsbtoblk(sfs.f_blocks, sfs.f_frsize, 1024),
+	    percent(sfs.f_blocks - sfs.f_bfree, sfs.f_blocks));
+    }
+#endif
+
+    storeAppendPrintf(sentry, "Flags:");
+    if (sd->flags.selected)
+	storeAppendPrintf(sentry, " SELECTED");
+    if (sd->flags.read_only)
+	storeAppendPrintf(sentry, " READ-ONLY");
+    if (sd->flags.offline)
+	storeAppendPrintf(sentry, " OFF-LINE");
+    storeAppendPrintf(sentry, "\n");
+}
+
+/*
+ * storeBfDirReconfigure
+ *
+ * This routine is called when the given swapdir needs reconfiguring 
+ */
+static void
+storeBfDirReconfigure(SwapDir * sd, int index, char *path)
+{
+    char *token;
+    unsigned int read_only = 0;
+    char *opstats_fn = NULL;
+    bfinfo_t *bfi = sd->fsdata;
+
+    /* TODO -- remove RO or make it work.  See selectSwapDir. */
+
+    /* strtok(NULL, ...) continues the config-line tokenization begun
+       by our caller */
+    while ((token = strtok(NULL, w_space))) {
+	/* TODO -- remove RO or make it work.  See selectSwapDir. */
+	if (!strcasecmp(token, "read-only"))
+	    read_only = 1;
+	else if (!strncasecmp(token, "opstats=", 8))
+	    opstats_fn = xstrdup (token+8);
+	else if (!strncasecmp(token, "refuse_swapout=", 15))
+	    bfi->refuse_swapout = atoi(token+15);
+    }
+
+    /* just reconfigure it */
+    if (sd->flags.read_only != read_only)
+	debug(3, 1) ("Cache dir '%s' now %s\n",
+	    path, read_only ? "Read-Only" : "Read-Write");
+    sd->flags.read_only = read_only;
+
+    /* swap in the (possibly new) opstats file; reopened lazily */
+    if (bf_opstats) {
+	fclose (bf_opstats);
+	bf_opstats = NULL;
+    }
+    if (bf_opstats_fn)
+	xfree (bf_opstats_fn);
+    bf_opstats_fn = opstats_fn;
+
+    return;
+}
+
+/* Emit this dir's cache_dir config line (for "squid -k reconfigure"
+   style dumps). */
+static void
+storeBfDirDump(StoreEntry * entry, const char *name, SwapDir * sd)
+{
+    storeAppendPrintf(entry, "%s %s %s\n",
+		      name,
+		      "butterfly",
+		      sd->path);
+}
+
+/*
+ * Only "free" the filesystem specific stuff here
+ */
+static void
+storeBfDirFree(SwapDir * sd)
+{
+    bfinfo_t *bfi = sd->fsdata;
+    xfree(bfi);
+    sd->fsdata = NULL;		/* Will aid debugging... */
+}
+
+/* Fill a reiserfs_raw ioctl argument from the entry: split the 64-bit
+   object id into id1/id2 and copy the attributes the kernel keeps. */
+void
+storeBfDirFillRa(StoreEntry *e, reiserfs_raw_ioctl_arg *ra)
+{
+    if (! e->id_valid) {
+	assert (e->mem_obj);
+	assert (e->mem_obj->url);
+	e->id = make_id (e->hash.key, e->mem_obj->url, strlen (e->mem_obj->url));
+	e->id_valid = 1;
+    }
+    ra->id1 = (unsigned) e->id;
+    ra->id2 = (unsigned) (e->id >> 32);
+    ra->lastref = e->lastref;
+    ra->expires = ~0;		/* end of Epoch -- don't let kernel
+                                   remove it for now. */
+    ra->user1 = 0;		/* put headersz here, later */
+}
+
+/*
+ * storeBfCleanupDoubleCheck
+ *
+ * This is called by storeCleanup() if -S was given on the command line.
+ */
+static int
+storeBfCleanupDoubleCheck(SwapDir * sd, StoreEntry * e)
+{
+    /* very good dir, trust me!  (no per-entry disk check for BF) */
+    return 0;
+}
+
+/*
+ * storeBfDirParse
+ *
+ * Called when a *new* fs is being setup.
+ */
+static void
+storeBfDirParse(SwapDir * sd, int index, char *path)
+{
+    char *token;
+    unsigned int read_only = 0;
+    unsigned int nthreads = 8;	/* default AIO threads per dir */
+    bfinfo_t *bfi;
+
+    sd->fsdata = bfi = xcalloc(1, sizeof(*bfi));
+    bfi->refuse_swapout = 1500;	/* default */
+
+    /* strtok(NULL, ...) continues the config-line tokenization begun
+       by our caller */
+    while ((token = strtok(NULL, w_space))) {
+	/* TODO -- remove RO or make it work.  See selectSwapDir. */
+	if (!strcasecmp(token, "read-only"))
+	    read_only = 1;
+	else if (!strncasecmp(token, "threads=", 8))
+	    nthreads = atoi(token+8);
+	else if (!strncasecmp(token, "opstats=", 8)) {
+	    if (bf_opstats_fn)
+		debug(86, 1) ("WARNING: %s: already have an opstats file (%s), ignoring\n",
+			      token, bf_opstats_fn);
+	    else
+		bf_opstats_fn = xstrdup (token+8);
+	}
+	else if (!strncasecmp(token, "refuse_swapout=", 15))
+	    bfi->refuse_swapout = atoi(token+15);
+    }
+
+    sd->index = index;
+    sd->path = xstrdup(path);
+    /* keep cache_swap >= cache_mem check happy.  Use a constant for
+       now, do it with statvfs later. */
+    sd->max_size = 1000<<20;	/* 1 TB */
+    sd->low_size = sd->max_size;
+    sd->high_size = sd->max_size;
+    sd->flags.read_only = read_only;
+    sd->init = storeBfDirInit;
+    sd->newfs = storeBfDirNewfs;
+    sd->dump = storeBfDirDump;
+    sd->freefs = storeBfDirFree;
+    sd->dblcheck = storeBfCleanupDoubleCheck;
+    sd->statfs = storeBfDirStats;
+    sd->maintainfs = storeBfDirMaintain;
+    sd->checkobj = storeBfDirCheckObj;
+    sd->refobj = storeBfDirRefObj;
+    sd->unrefobj = storeBfDirUnrefObj;
+    sd->callback = storeBfDirCallback;
+    sd->sync = storeBfDirSync;
+    sd->halfrelease = storeBfRelease;
+    sd->reselectdir = storeBfReselectDir;
+    sd->obj.create = storeBfCreate;
+    sd->obj.open = storeBfOpen;
+    sd->obj.close = storeBfClose;
+    sd->obj.read = storeBfRead;
+    sd->obj.write = storeBfWrite;
+    sd->obj.unlink = storeBfUnlink;
+    sd->log.open = storeBfDirOpenSwapLog;
+    sd->log.close = storeBfDirCloseSwapLog;
+    sd->log.write = storeBfDirSwapLog;
+    sd->log.clean.start = storeBfDirWriteCleanStart;
+    sd->log.clean.nextentry = storeBfDirCleanLogNextEntry;
+    sd->log.clean.done = storeBfDirWriteCleanDone;
+    /* accumulate threads over all BF dirs; exported in storeBfDirInit */
+    bf_nthreads += nthreads;
+
+}
+
+/*
+ * Initial setup / end destruction
+ */
+void
+storeBfDirDone(void)
+{
+    memPoolDestroy(bf_state_pool);
+    memPoolDestroy(bf_op_pool);
+    if (bf_opstats)
+	fclose (bf_opstats);
+    if (bf_opstats_fn)
+	xfree(bf_opstats_fn);
+    bf_initialised = 0;
+}
+
+/* Setup routine.  This must be named by its long official name, not
+   just _bf */
+
+void
+storeFsSetup_butterfly (storefs_entry_t * storefs)
+{
+    int t;
+
+    assert(!bf_initialised);
+    storefs->parsefunc = storeBfDirParse;
+    storefs->reconfigurefunc = storeBfDirReconfigure;
+    storefs->donefunc = storeBfDirDone;
+    bf_state_pool = memPoolCreate("BF IO State data", sizeof(bfstate_t));
+    bf_op_pool = memPoolCreate("BF IO operation data", sizeof(bfop_t));
+    memset(&bf_stats, '\0', sizeof(bf_stats));
+    cachemgrRegister("butterfly", "Butterfly Stats", storeBfStats, 0, 1);
+    /* NOTE(review): section 81 here vs 86 elsewhere in this file --
+       confirm which is intended */
+    debug(81, 1) ("Butterfly started\n");
+    for (t = 0; t < BFMAX; t++) {
+	bf_busyops[t].head = NULL;
+	bf_busyops[t].tail = &bf_busyops[t].head;
+    }
+    bf_initialised = 1;
+}
Index: squid/src/fs/butterfly/store_io_bf.c
diff -u /dev/null squid/src/fs/butterfly/store_io_bf.c:1.1.2.3
--- /dev/null	Tue Sep 28 18:39:07 2004
+++ squid/src/fs/butterfly/store_io_bf.c	Fri Dec 29 01:10:46 2000
@@ -0,0 +1,959 @@
+
+/* $Id$
+ *
+ * DEBUG: section 81   Butterfly cache_dir I/O functions.
+ * Written by Yury Shevchuk <sizif@botik.ru>
+ *
+ * SQUID Internet Object Cache  http://squid.nlanr.net/Squid/
+ * ----------------------------------------------------------
+ *
+ *  Squid is the result of efforts by numerous individuals from the
+ *  Internet community.  Development is led by Duane Wessels of the
+ *  National Laboratory for Applied Network Research and funded by the
+ *  National Science Foundation.  Squid is Copyrighted (C) 1998 by
+ *  the Regents of the University of California.  Please see the
+ *  COPYRIGHT file for full details.  Squid incorporates software
+ *  developed and/or copyrighted by other sources.  Please see the
+ *  CREDITS file for full details.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
+ *
+ */
+
+/*
+ * Operation outline
+ * ~~~~~~~~~~~~~~~~~
+ * 
+ * There are two groups of operations.
+ * 
+ * 1) Stateless:
+ * 
+ *    storeBfUnlink/Setattr
+ * 
+ * These are simple one-shot operations.  They don't have sio associated
+ * with them.  storeBfUnlink/Setattr launches the required aio operation,
+ * and storeBfOpCompletion cleanups allocated control data upon completion.
+ * 
+ * 2) Stateful.  The sequence goes like this:
+ * 
+ *    storeBfOpen/Creat
+ *    storeBfRead/Write, storeBfRead/Write, ...
+ *    storeBfClose
+ * 
+ * storeBfOpen/Create create sio and associated bfstate, launch
+ * open/creat aio, and return (although the open is not completed yet).
+ * The upper layers can immediately call storeBfRead/Write which we
+ * cannot launch right away as we don't have fd until Open/Creat
+ * completes.  So we queue the request; there's a single queue for that
+ * in struct bfstate.  The upper layers continue to supply requests using
+ * storeBfRead/Write.
+ * 
+ * The queue is run by storeBfKickQueue which is called from
+ * storeBfOpCompletion (and also from storeBfRead/Write upon queueing
+ * another request).
+ * 
+ * When storeBfClose is called, issuing the actual close request is
+ * deferred until the queue is empty.
+ * 
+ */
+
+#include "config.h"
+#include "squid.h"
+
+#include "store_bf.h"
+
+
+static int   storeBfUnlink2(SwapDir *sd, StoreEntry *e, storeIOState *sio);
+static int   storeBfLaunch (SwapDir *sd, int (*aiofunc)(aiocb *), bfop_t *op);
+static void  storeBfTryLaunchClose (storeIOState *sio);
+static void  storeBfCloseLastResort (storeIOState *sio);
+static void  storeBfIOCallback(storeIOState * sio, int errflag);
+static void  storeBfIOAbort (storeIOState *sio, int op_type, int errflag);
+static void  storeBfOpEnqueue (bfstate_t *bfstate, bfop_t *op);
+static void  storeBfSweep (bfstate_t *bfstate);
+static void  storeBfUpdateOffset (bfstate_t *bfstate);
+static void  storeBfOrderKick (bfstate_t *bfstate);
+static char *storeBfOpDump (bfop_t *op);
+static char *storeBfStateDump (bfstate_t *bfstate);
+static void  storeBfIOFreeEntry(void *sio);
+
+
+/* Open an existing cache object for reading.  Allocates the sio and
+   its bfstate, then launches an async RAWOPEN ioctl; reads may be
+   queued immediately, before the open completes (see file header).
+   Returns NULL if the open could not be launched.
+   NOTE(review): file_callback is not used here — presumably invoked by
+   upper layers or unsupported in this store; confirm against callers. */
+storeIOState *
+storeBfOpen(SwapDir * sd, StoreEntry * e, STFNCB * file_callback,
+	    STIOCB * callback, void *callback_data)
+{
+    storeIOState *sio;
+    bfstate_t *bfstate;
+    bfinfo_t *bfi = sd->fsdata;
+    bfop_t *op;
+
+    debug(81, 3) ("storeBfOpen: key %s\n", storeKeyText(e->hash.key));
+
+    sio = CBDATA_ALLOC(storeIOState, storeBfIOFreeEntry);
+    sio->fsstate = bfstate = memPoolAlloc(bf_state_pool);
+    sio->swap_dirn = sd->index;
+    sio->mode = O_RDONLY;
+    sio->callback = callback;
+    sio->callback_data = callback_data;
+    sio->e = e;
+    cbdataLock(callback_data);
+
+    storeBfDirFillRa(e, &bfstate->ra);
+    bfstate->fd = -1;		/* not opened */
+    bfstate->errflag = 0;
+    /* empty circular op queue; the head element doubles as the OPEN op */
+    bfstate->ops.next = bfstate->ops.prev = &bfstate->ops;
+    op = &bfstate->ops;
+    op->type = BFOPEN;
+    op->sio = sio;
+    op->aio.aio_fildes = bfi->dir;
+    op->aio.aio_buf = &bfstate->ra;
+    op->aio.aio_nbytes = REISERFS_IOC_RAWOPEN;
+    op->aio.aio_sigevent.sigev_notify = SIGEV_NONE;
+    op->aio.aio_sigevent.sigev_signo = 0;
+    op->aio.aio_sigevent.sigev_value.sival_ptr = op;
+    
+    if (storeBfLaunch (sd, aio_ioctl, op) != 0) {
+	/* launch failed: undo the lock and free the sio (which also
+	   releases bfstate via storeBfIOFreeEntry) */
+	cbdataUnlock(callback_data);
+	cbdataFree(sio);
+	return NULL;
+    }
+    return sio;
+}
+
+
+/* Create a new cache object for writing.  Builds the sio and a queued
+   BFCREAT op but deliberately does NOT launch it: the creat and all
+   subsequent writes are launched together when storeBfClose arrives.
+   Returns NULL when the disk subsystem is considered overloaded. */
+storeIOState *
+storeBfCreate(SwapDir * sd, StoreEntry * e, STFNCB * file_callback,
+    STIOCB * callback, void *callback_data)
+{
+    storeIOState *sio;
+    bfstate_t *bfstate;
+    bfinfo_t *bfi = sd->fsdata;
+    bfop_t *op;
+
+    debug(81, 3) ("storeBfCreate: key %s flags %s\n",
+		  storeKeyText(e->hash.key), storeEntryFlags (e));
+
+    /* Avoid disk subsystem overload.  May lead to losing hits, so
+       don't set the limit too low. */
+    if (bf_stats.away > bfi->refuse_swapout) {
+	bf_stats.swapout_refused++;
+	return NULL;
+    }
+
+    sio = CBDATA_ALLOC(storeIOState, storeBfIOFreeEntry);
+    sio->fsstate = bfstate = memPoolAlloc(bf_state_pool);
+    sio->swap_dirn = sd->index;
+    sio->mode = O_WRONLY | O_CREAT | O_TRUNC; /* if that meant something :) */
+    sio->callback = callback;
+    sio->callback_data = callback_data;
+    sio->e = e;
+    cbdataLock(callback_data);
+
+    storeBfDirFillRa(e, &bfstate->ra);
+    bfstate->fd = -1;		/* not opened */
+    bfstate->errflag = 0;
+    /* empty circular op queue; the head element doubles as the CREAT op */
+    bfstate->ops.next = bfstate->ops.prev = &bfstate->ops;
+    op = &bfstate->ops;
+    op->type = BFCREAT;
+    op->sio = sio;
+    op->aio.aio_fildes = bfi->dir;
+    op->aio.aio_buf = &bfstate->ra;
+    op->aio.aio_nbytes = REISERFS_IOC_RAWCREAT;
+    op->aio.aio_sigevent.sigev_notify = SIGEV_NONE;
+    op->aio.aio_sigevent.sigev_signo = 0;
+    op->aio.aio_sigevent.sigev_value.sival_ptr = op;
+
+    /* Do not launch anything now; will launch creat and subsequent
+       writes when close request arrives */
+
+    /* storeBfDirReplAdd(sd, e); */
+    return sio;
+}
+
+
+/* Request that the object be closed once all queued I/O has drained.
+   For a pending BFCREAT this is the moment the creat (and hence the
+   queued writes) actually get launched; otherwise we just try to
+   launch the close now (it defers itself while the queue is busy). */
+void
+storeBfClose(SwapDir * sd, storeIOState * sio)
+{
+    bfstate_t *bfstate = sio->fsstate;
+
+    debug(81, 3) ("storeBfClose: dir %d, key %s\n", sd->index,
+		  storeKeyText (sio->e->hash.key));
+
+    bfstate->close_request = 1;
+
+    /* Creat and subsequent writes are launched all at once when we
+       have them all queued.  Now. */
+    if (bfstate->ops.type == BFCREAT) {
+	bfop_t *op = &bfstate->ops;
+	if (storeBfLaunch (sd, aio_ioctl, op) != 0) {
+	    /* could not even launch the creat: abort the sio and let
+	       the kick logic run the shutdown sequence */
+	    storeBfIOAbort (op->sio, op->type, errno);
+	    storeBfKickQueue (bfstate);
+	}
+	return;
+    }
+    
+    storeBfTryLaunchClose (sio);
+}
+
+
+/* Queue an async read of 'size' bytes at 'offset'.  The callback is
+   invoked from storeBfOpCompletion with the bytes actually read.  The
+   read may be queued before the underlying open has completed. */
+void
+storeBfRead(SwapDir * sd, storeIOState * sio, char *buf, size_t size,
+	    off_t offset, STRCB * callback, void *callback_data)
+{
+    bfstate_t *bfstate = sio->fsstate;
+    bfop_t *op;
+
+    op = memPoolAlloc(bf_op_pool);
+    op->type = BFREAD;
+    op->sio = sio;
+    op->callback.read = callback;
+    op->callback_data = callback_data;
+    op->aio.aio_buf = buf;
+    op->aio.aio_nbytes = size;
+    op->aio.aio_offset = offset;
+    op->aio.aio_sigevent.sigev_notify = SIGEV_NONE;
+    op->aio.aio_sigevent.sigev_signo = 0;
+    /* completion handler recovers the op through sival_ptr */
+    op->aio.aio_sigevent.sigev_value.sival_ptr = op;
+
+    /* Now that we have remembered it, we don't want it to be freed */
+    cbdataLock(callback_data);
+
+    debug(81, 3) ("storeBfRead: %s", storeBfOpDump(op));
+
+    storeBfOpEnqueue (bfstate, op);
+    storeBfKickQueue (bfstate);
+}
+
+
+/* Queue an async write of 'size' bytes at 'offset'.  free_func (may be
+   NULL) releases 'buf' after the write completes.  Writes are only
+   queued here; they are launched in a batch when close arrives (see
+   storeBfClose / storeBfCreate). */
+void
+storeBfWrite(SwapDir * sd, storeIOState * sio, char *buf, size_t size,
+	     off_t offset, FREE * free_func)
+{
+    bfstate_t *bfstate = sio->fsstate;
+    bfop_t *op;
+
+    op = memPoolAlloc(bf_op_pool);
+    op->type = BFWRITE;
+    op->sio = sio;
+    op->callback.write = free_func;
+    op->callback_data = NULL;
+    op->aio.aio_buf = buf;
+    op->aio.aio_nbytes = size;
+    op->aio.aio_offset = offset;
+    op->aio.aio_sigevent.sigev_notify = SIGEV_NONE;
+    op->aio.aio_sigevent.sigev_signo = 0;
+    op->aio.aio_sigevent.sigev_value.sival_ptr = op;
+
+    debug(81, 3) ("storeBfWrite: %s", storeBfOpDump(op));
+
+    storeBfOpEnqueue (bfstate, op);
+    /*storeBfKickQueue (bfstate);  tentative: launch bfwrite on close only */
+
+    debug(81, 8) ("storeBfWrite:%d %s", __LINE__, storeBfStateDump (bfstate));
+}
+
+
+/* Remove a cache object.  Stateless operation (no sio): delegates to
+   storeBfUnlink2 with sio == NULL; errors are handled there. */
+void
+storeBfUnlink(SwapDir *sd, StoreEntry *e)
+{
+    debug(81, 3) ("storeBfUnlink: dir %d, key %s\n",
+		  sd->index, storeKeyText(e->hash.key));
+    /* storeBfDirReplRemove(e); */
+    storeBfUnlink2 (sd, e, NULL);
+}
+
+/* Launch an async RAWUNLINK ioctl for entry 'e'.  'sio' is NULL for a
+   plain unlink; non-NULL when the unlink was triggered to resolve an
+   EEXIST during create (storeBfOpCompletion retries the creat then).
+   Returns 0 when the op was launched, -1 on failure (after attempting
+   a synchronous unlink if the failure was EAGAIN). */
+static int
+storeBfUnlink2(SwapDir *sd, StoreEntry *e, storeIOState *sio)
+{
+    reiserfs_raw_ioctl_arg *ra;
+    bfinfo_t *bfi = sd->fsdata;
+    bfop_t *op;
+
+    /* For Butterfly unlink is rare operation as cache replacement is
+       carried out in the kernel.  So we can afford to xcalloc to
+       avoid complicating our data */
+
+    ra = xcalloc (1, sizeof (reiserfs_raw_ioctl_arg));
+    storeBfDirFillRa(e, ra);
+
+    op = memPoolAlloc(bf_op_pool);
+    op->next = op->prev = op;	/* standalone op, not on any sio queue */
+    op->type = BFUNLINK;
+    op->sio = sio;
+    op->aio.aio_fildes = bfi->dir;
+    op->aio.aio_buf = ra;
+    op->aio.aio_nbytes = REISERFS_IOC_RAWUNLINK;
+    op->aio.aio_sigevent.sigev_notify = SIGEV_NONE;
+    op->aio.aio_sigevent.sigev_signo = 0;
+    op->aio.aio_sigevent.sigev_value.sival_ptr = op;
+
+    if (storeBfLaunch (sd, aio_ioctl, op) != 0) {
+	if (errno == EAGAIN) {
+	    /* AIO queue full: fall back to a blocking unlink so the
+	       object does not linger on disk */
+	    int x = ioctl (bfi->dir, REISERFS_IOC_RAWUNLINK, ra);
+	    debug(81, 1) ("storeBfUnlink: resorted to sync unlink, =%d\n", x);
+	}
+	xfree (ra);
+	memPoolFree(bf_op_pool, op);
+	return -1;
+    }
+
+    /*
+     * Note: we leave op&ra non-freed.  Will get op back in
+     * sigev_value upon completion and free then.
+     */
+
+    return 0;
+}
+
+
+/* Launch an async RAWSETATTR ioctl to update the on-disk attributes of
+   entry 'e'.  Best-effort and stateless: on launch failure the update
+   is simply skipped ("maybe next time"). */
+void
+storeBfSetattr(SwapDir *sd, StoreEntry * e)
+{
+    reiserfs_raw_ioctl_arg *ra;
+    bfinfo_t *bfi = sd->fsdata;
+    bfop_t *op;
+
+    debug(81, 3) ("storeBfSetattr: dir %d, key %s\n",
+		  sd->index, storeKeyText(e->hash.key));
+
+    ra = xcalloc (1, sizeof (reiserfs_raw_ioctl_arg));
+    storeBfDirFillRa(e, ra);
+
+    op = memPoolAlloc(bf_op_pool);
+    op->next = op->prev = op;	/* standalone op, not on any sio queue */
+    op->type = BFSETATTR;
+    op->sio = NULL;
+    op->aio.aio_fildes = bfi->dir;
+    op->aio.aio_buf = ra;
+    op->aio.aio_nbytes = REISERFS_IOC_RAWSETATTR;
+    op->aio.aio_sigevent.sigev_notify = SIGEV_NONE;
+    op->aio.aio_sigevent.sigev_signo = 0;
+    op->aio.aio_sigevent.sigev_value.sival_ptr = op;
+
+    if (storeBfLaunch (sd, aio_ioctl, op) != 0) {
+	/* OK, maybe next time... */
+	xfree (ra);
+	memPoolFree(bf_op_pool, op);
+	return;
+    }
+
+    /*
+     * Note: we leave op&ra non-freed.  Will get op back in
+     * sigev_value upon completion and free then.
+     */
+}
+
+
+/* Wrapper for aio_*, provides debugging convenience and simplifies
+   accounting.  'sd' may be NULL (read/write launched after open).
+   Returns the aiofunc result: 0 on successful launch, negative on
+   failure (errno set by the aio call); also returns -1 when the dir is
+   offline — NOTE(review): in that case errno is not set here, callers
+   checking errno==EAGAIN see a stale value; confirm intent. */
+static int
+storeBfLaunch (SwapDir * sd, int (*aiofunc)(aiocb *), bfop_t *op)
+{
+    int x;
+    struct bf_stats_counters *c = &bf_stats.counters[op->type];
+
+    debug(81, 2) ("storeBfLaunch: dir %d %s", sd->index, storeBfOpDump(op));
+
+    /* Right after going offline, upper layers can continue making
+       requests to the drive.  Prevent the ops from being actually
+       launched, decrease the amount of syslog I/O error complaints */
+
+    if (sd && sd->flags.offline)
+	return -1;
+
+    if (bf_opstats)
+	op->aio.aio_times[AIOTIME_REQUEST] = kaio_time ();
+
+    x = aiofunc (&op->aio);
+    c->ops++;
+    if (x < 0) {
+	c->fail++;
+	if (errno == EAGAIN) bf_stats.eagain++;
+	debug(50, 1) ("storeBfLaunch: type %d: %s\n", op->type, xstrerror());
+    }
+    else {
+	op->started = 1;
+	op->starttime = tvMsec (current_time);
+
+	/* append to the per-type busy list (tail pointer chasing) */
+	op->busychain = NULL;
+	*bf_busyops[op->type].tail = op;
+	bf_busyops[op->type].tail = &op->busychain;
+
+	if (++bf_stats.away > bf_stats.max_away)
+	    bf_stats.max_away = bf_stats.away;
+    }
+
+    return x;
+}
+
+
+/* Start all queued i/o operations that are ready to be launched.  It
+   is OK and even good to start multiple reads or writes in parallel.
+   Returns -1 if launch attempts hit EAGAIN, 0 otherwise */
+
+int
+storeBfKickQueue (bfstate_t *bfstate)
+{
+    storeIOState *sio = bfstate->ops.sio;
+    bfop_t *op;
+    int x;
+
+    /*
+     * Remove completed and aborted ops off the head of the queue.
+     */
+    storeBfSweep (bfstate);
+
+    /*
+     * If close requested and q is empty, launch the close 
+     */
+    if (bfstate->close_request && bfstate->ops.next == &bfstate->ops) {
+	storeBfTryLaunchClose (sio);
+	return 0;
+    }
+
+    /*
+     * If no fd, we cannot start read/write yet, we're waiting for
+     * OPEN/CREAT to finish.  storeBfOpCompletion will call
+     * storeBfKickQueue again, and then...
+     */
+    if (bfstate->fd < 0)
+	return 0;
+
+    /*
+     * If we are shutting down after error, don't start new
+     * operations.
+     */
+    if (bfstate->aborting)
+	/*
+	 * Now we either have flying ops in the q, or close launched.
+	 * So OpCompletion is guaranteed, just return and wait for it.
+	 */
+	return 0;
+
+    /*
+     * At last, our main duty.  Launch everything launchable.
+     */
+    for (op = bfstate->ops.next; op != &bfstate->ops; op = op->next) {
+	if (op->started)
+	    continue;
+
+	/* the fd was unknown when the op was queued; fill it in now */
+	op->aio.aio_fildes = bfstate->fd;
+
+	switch (op->type) {
+	case BFREAD:
+	    x = storeBfLaunch (NULL, aio_read, op);
+	    break;
+	case BFWRITE:
+	    x = storeBfLaunch (NULL, aio_write, op);
+	    break;
+	default:
+	    abort ();
+	}
+
+	if (x < 0) {
+	    if (errno == EAGAIN) {
+		/* Looks like too many ops are launched already.  Wait
+                   for some to land before continuing.  The nasty
+                   thing is that if no ops are currently in flight,
+                   nobody will call us again (except the user -- but
+                   that's unreliable).  Therefore, ask for help from
+                   upper layers (which have its price, follow me...) */
+		storeBfOrderKick (bfstate);
+		return -1;
+	    }
+	    else {
+		/* Real error.  This is the end of the sio.  Raise the
+                   abort flag and recurse -- as the op failed here,
+                   not in OpCompletion, we must do funeral
+                   arrangements ourselves  */
+		storeBfIOAbort (op->sio, op->type, errno);
+		storeBfKickQueue (bfstate);
+		return 0;
+	    }
+	}
+
+#if 0
+	/* tentative: "don't overdo the asyncronous operations", as
+	   Henrik says.  Write is already asynchronous by nature, so
+	   if it does not return fast then launching more writes will
+	   only clutter the queue without any speed gain */
+	if (op->type == BFWRITE)
+	    break;
+#endif
+    }
+
+    return 0;
+}
+
+
+/* Handle async I/O operation completion event.  'op' is the descriptor
+   of the completed operation.  Updates latency statistics, performs
+   per-op-type postprocessing (delivering read data, freeing write
+   buffers, retrying creat after EEXIST, finishing close), then kicks
+   the sio's request queue for more work. */
+
+void
+storeBfOpCompletion (bfop_t *op)
+{
+    storeIOState *sio = op->sio;
+    bfstate_t *bfstate = sio? sio->fsstate: NULL;
+    struct bf_stats_counters *c;
+    int x = aio_error (&op->aio);
+    int dura = tvMsec (current_time) - op->starttime;
+    int rlen;
+
+    op->finished = 1;
+
+    debug(81, 2) ("storeBfOpCompletion: %d ms, x=%d %s",
+		  dura, x, storeBfOpDump(op));
+
+    /* assert (! sio || cbdataValid(sio)), because we never call
+       storeBfIOCallback before the file is really closed. */
+
+    if (op->type == BFNONE || op->type >= BFMAX) {
+	debug(81, 1) ("storeBfOpCompletion: bad op type (%d)\n", op->type);
+	/* this "cannot happen" so error handling is sloppy */
+	return;
+    }
+
+    /*
+     * Update statistics counters
+     */
+    c = &bf_stats.counters[op->type];
+    (x == 0) ? c->success++ : c->fail++;
+    if (c->max_dura < dura)
+	c->max_dura = dura;
+    c->avg_dura = (c->avg_dura * (c->ops-1) + dura) / c->ops;
+    c->avg_dura100 = (c->avg_dura100 * 99 + dura) / 100;
+
+    /*
+     * Do operation-specific processing
+     */
+    switch (op->type) {
+    case BFCREAT:
+	if (x == EEXIST) {
+	    /* id collision (not too likely) or REFRESH_MISS.  Handle
+	       by removing the old file and retrying */
+	    /* BUGFIX: the format string had one %d but two arguments;
+	       print the error code explicitly */
+	    debug(81, 2) ("storeBfCompletion: BFCREAT: dir%d, probably id collision (x=%d)!\n",
+			  sio->swap_dirn, x);
+	    if (bfstate->unlink_attempted)
+		debug(81, 1) ("storeBfCompletion: BFCREAT: dir%d, lingering unlink %s\n",
+			      sio->swap_dirn, storeKeyText(sio->e->hash.key));
+	    else {
+		int x = storeBfUnlink2 (INDEXSD (sio->swap_dirn), sio->e, sio);
+		if (x >= 0) {
+		    bfstate->unlink_attempted++;
+		    return;	/* creat is retried when the unlink lands */
+		}
+	    }
+	    /* unlink failed, continue processing the error (this will
+               result in SWAPOUT_FAIL) */
+	}
+    create_fail:
+	/* fall-through */
+
+    case BFOPEN:
+	if (x != 0)
+	    storeBfIOAbort (sio, op->type, x);
+	else {
+	    bfstate->fd = aio_return (&op->aio);
+	    sio->e->swap_file_sz = bfstate->ra.size;
+	    store_open_disk_fd++;
+	    debug(81, 4) ("storeBfOpCompletion: sio %p fd %d opened\n", sio, bfstate->fd);
+	}
+	break;
+
+    case BFREAD:
+	if (x != 0)
+	    storeBfIOAbort (sio, op->type, x);
+	/* BUGFIX: aio_return() may be called only once per completed
+	   request (POSIX); fetch the result once and reuse it */
+	rlen = (x == 0)? aio_return (&op->aio): -1;
+	op->rlen = (x == 0)? rlen: 0;
+	storeBfUpdateOffset (bfstate);
+	if (! bfstate->aborting && cbdataValid(op->callback_data))
+	    op->callback.read (op->callback_data, op->aio.aio_buf, rlen);
+	cbdataUnlock (op->callback_data);
+	break;
+
+    case BFWRITE:
+	if (x != 0)
+	    storeBfIOAbort (sio, op->type, x);
+	op->rlen = (x == 0)? aio_return (&op->aio): 0;
+	storeBfUpdateOffset (bfstate);
+	if (op->callback.write)
+	    op->callback.write (op->aio.aio_buf); /* free_func */
+	break;
+
+    case BFCLOSE:
+	debug(81, 4) ("storeBfOpCompletion: fd %d closed (x=%d)\n", bfstate->fd, x);
+	if (x != 0)
+	    storeBfCloseLastResort (sio);
+	else {
+	    bfstate->fd = -1;
+	    store_open_disk_fd--;
+	}
+	storeBfIOCallback (sio, bfstate->errflag);
+	/* That's the showdown.  sio is freed by now */
+	return;
+
+    case BFUNLINK:
+	xfree (op->aio.aio_buf); /* ra */
+	memPoolFree(bf_op_pool, op);
+	if (sio) {
+	    /* This unlink was launched to handle EEXIST error.
+	       Now retry CREATE */
+	    op = &bfstate->ops;
+	    assert (op->type == BFCREAT);
+	    if (x) {
+		x = EEXIST;
+		goto create_fail;
+	    }
+	    op->started = op->finished = 0;
+	    if (storeBfLaunch (NULL, aio_ioctl, op) != 0) {
+		x = EEXIST;
+		goto create_fail;
+	    }
+	}
+	return;
+
+    case BFSETATTR:
+	xfree (op->aio.aio_buf); /* ra */
+	memPoolFree(bf_op_pool, op);
+	return;
+
+    default:
+	abort ();		/* cannot happen. */
+    }
+
+    /*
+     * Ok, op done.  Go ask for more...
+     */
+    storeBfKickQueue (bfstate);
+
+    /*
+     * Don't use op below this point.  storeBfKickQueue calls
+     * storeBfSweep that potentially destroys the bfop pointed by it.
+     */
+
+    return;
+}
+
+
+
+/*  === STATIC =========================================================== */
+
+
+
+/* Launch async file close operation.  We are one step away from
+   storeBfIOCallback now.  The queue must be empty before this is
+   called.  Note that the call to this possibly results in sio
+   invalidation, so after calling this, return immediately; don't use
+   any pointers like op or bfstate that you might have. */
+
+static void
+storeBfTryLaunchClose (storeIOState *sio)
+{
+    bfstate_t *bfstate = sio->fsstate;
+    bfop_t *op = &bfstate->ops;
+
+    if (bfstate->close_started)
+	return;
+    if (bfstate->ops.next != &bfstate->ops)
+	return;			/* we'll be called again when q empties */
+
+    debug(81, 3) ("storeBfTryLaunchClose: %s", storeBfStateDump (bfstate));
+
+    if (bfstate->fd < 0) {
+	/* not opened -- no need to close.  Bye right now */
+	storeBfIOCallback (sio, bfstate->errflag);
+	return;
+    }
+
+#if 1
+    /* Looks like close(2) is never longer than 10 us (!) so doing it
+       through KAIO is pure overhead.  NOTE: this goto makes the async
+       close path below dead code; flip the #if to re-enable it. */
+    goto synchronous_close;
+#endif
+
+    op->type = BFCLOSE;
+    op->sio = sio;
+    op->aio.aio_fildes = bfstate->fd;
+    op->aio.aio_sigevent.sigev_notify = SIGEV_NONE;
+    op->aio.aio_sigevent.sigev_signo = 0;
+    op->aio.aio_sigevent.sigev_value.sival_ptr = op;
+    
+    debug(81, 2) ("storeBfTryLaunchClose: fd=%d %s", bfstate->fd, storeBfOpDump(op));
+    debug(81, 2) ("storeBfTryLaunchClose: callers: %p %p %p %p\n",
+		  __builtin_return_address(0),
+		  __builtin_return_address(1),
+		  __builtin_return_address(2),
+		  __builtin_return_address(3));
+
+    bfstate->close_started = 1;
+    if (storeBfLaunch (NULL, aio_close, op) < 0) {
+    synchronous_close:
+	storeBfCloseLastResort (sio);
+	storeBfIOCallback (sio, bfstate->errflag);
+    }
+}
+
+
+/* Called if aio_close failed, to attempt synchronous file close.  This
+   will never be called ;-) 'errflag' is the error returned by
+   aio_close.  On success the fd is marked closed and the open-fd
+   counter decremented; on failure the error is only logged. */
+
+static void
+storeBfCloseLastResort (storeIOState *sio)
+{
+    bfstate_t *bfstate = sio->fsstate;
+
+    /* debug(81, 1) ("Resorting to sync close()...\n"); */
+
+    if (close (bfstate->fd) != 0) 
+	debug(50, 1) ("close: %s\n", xstrerror());
+    else {
+	bfstate->fd = -1;
+	store_open_disk_fd--;
+    }
+}
+
+
+/* Call the STIOCB aka "file closed" callback.  This is the final
+   accord in the life of every sio: the op queue must already be empty,
+   and the sio is cbdataFree()d on the way out. */
+
+static void
+storeBfIOCallback (storeIOState *sio, int errflag)
+{
+    bfstate_t *bfstate = sio->fsstate;
+    int valid = cbdataValid(sio->callback_data);
+
+    debug(81, 3) ("storeBfIOCallback: errflag=%d\n", errflag);
+    assert (bfstate->ops.next == &bfstate->ops);
+    cbdataUnlock(sio->callback_data);
+    if (valid) {
+	/* expose the saved error to the callback via errno as well */
+	errno = errflag;
+	sio->callback(sio->callback_data, errflag? DISK_ERROR: DISK_OK, sio);
+    }
+    cbdataFree(sio);
+}
+
+
+/* Change the sio state to "aborting" -- that is, arrange to shut down
+   all I/O active on the sio, close the file and eventually call
+   storeBfIOCallback.  This is called if any operation (except close)
+   fails.  It does no actions, just sets appropriate flags.
+   storeBfKickQueue will notice and shutdown instead of continuing. */
+
+static void
+storeBfIOAbort (storeIOState *sio, int type, int errflag)
+{
+    bfstate_t *bfstate = sio->fsstate;
+
+    debug(50, 3) ("storeBfIOAbort: sio %p, aio_{%d}, errflag=%d\n",
+		  sio, type, errflag);
+
+    /* remember only the first error; later ones would overwrite it */
+    if (errflag && !bfstate->errflag)
+	bfstate->errflag = errflag;
+    bfstate->close_request = 1;
+    bfstate->aborting = 1;
+
+    /* Don't sweep here -- this will invalidate ops while our callers
+       are often interesed in them, despite the aborting.
+       storeBfKickQueue will do that. */
+
+    if (bfstate->ops.next != &bfstate->ops) {
+	/* TODO: aio_cancel all ops currently in flight? */
+    }
+
+    /* If got EIO, switch the dir offline.  To get it back online,
+       send squid a SIGHUP; this will reload configuration and
+       revalidate store dirs. */
+
+    if (bfstate->errflag == EIO) {
+	SwapDir *sd = INDEXSD (sio->swap_dirn);
+	if (! sd->flags.offline) {
+	    debug(47, 1) ("storeBfIOAbort: got EIO, taking %s offline\n", sd->path);
+	    storeBfDirTakeOffline (sd);
+	}
+    }
+}
+
+
+/* Add op to the tail of the op queue.  If the close has already been
+   launched it is too late to accept new work: release the op (and the
+   callback-data lock taken by storeBfRead) and drop the request. */
+
+static void
+storeBfOpEnqueue (bfstate_t *bfstate, bfop_t *op)
+{
+    if (bfstate->close_started) {
+	/* Too late, we are closing. */
+	if (op->callback_data)
+	    cbdataUnlock (op->callback_data);
+	memPoolFree (bf_op_pool, op);
+	/* BUGFIX: must not fall through and link the freed op into the
+	   queue (use-after-free) */
+	return;
+    }
+    op->prev = bfstate->ops.prev;
+    bfstate->ops.prev = op;
+    op->next = &bfstate->ops;
+    op->prev->next = op;
+}
+
+
+/* Remove finished ops from the start of the list.  Finished ops
+   interspersed with unfinished ones are left alone, they will be
+   needed for future storeBfUpdateOffset()s.  If aborting, kill
+   unstarted ops too. */
+
+static void
+storeBfSweep (bfstate_t *bfstate)
+{
+    bfop_t *op;
+    bfop_t *next;
+
+    for (op = bfstate->ops.next; op != &bfstate->ops; op = next) {
+	next = op->next;
+	if (op->finished || (bfstate->aborting && ! op->started)) {
+	    /* unlink from the circular queue and free */
+	    op->prev->next = op->next;
+	    op->next->prev = op->prev;
+	    /* an unstarted op never delivered its callback; drop the
+	       lock taken at enqueue time */
+	    if (! op->finished && op->callback_data)
+		cbdataUnlock (op->callback_data);
+	    memPoolFree (bf_op_pool, op);
+	    continue;
+	}
+	break;			/* first unfinished started op: stop */
+    }
+}
+
+
+/* Calculate file offset for storeOffset by examining 'finished' ops
+   in the queue.  We rely on the fact that queue is ordered by offset,
+   and there is for sure at least one element in the queue (the one
+   that storeBfOpCompletion processes when it calls us).  sio->offset
+   only ever advances (monotonic max). */
+
+static void
+storeBfUpdateOffset (bfstate_t *bfstate)
+{
+    storeIOState *sio = bfstate->ops.sio;
+    bfop_t *op;
+
+    for (op = bfstate->ops.next; op != &bfstate->ops; op = op->next) {
+	if (op->finished) {
+	    off_t offset = op->aio.aio_offset + op->rlen;
+	    if (sio->offset < offset)
+		sio->offset = offset;
+	    continue;
+	}
+	break;			/* stop at the first unfinished op */
+    }
+
+    /* BUGFIX: sio->offset is off_t; cast to match the %lu conversion */
+    debug(81, 5) ("storeBfUpdateOffset: sio %p, offset %lu\n",
+		  sio, (unsigned long) sio->offset);
+}
+
+
+/* Arrange for our queue being kicked externally at a later time.
+   Called when we lose initiative because of EAGAIN.  If any op is
+   still in flight its completion will re-kick us, so external help is
+   only requested when nothing is flying. */
+
+static void
+storeBfOrderKick (bfstate_t *bfstate)
+{
+    storeIOState *sio = bfstate->ops.sio;
+    bfinfo_t *bfi = INDEXSD(sio->swap_dirn)->fsdata;
+    bfop_t *op;
+
+    /* Avoid the external help if possible.  Check if there are ops in
+       flight; if yes, their completion will do the kick and no
+       external help is needed. */
+
+    for (op = bfstate->ops.next; op != &bfstate->ops; op = op->next) {
+	if (op->started && ! op->finished)
+	    /* found! */
+	    return;
+    }
+
+    /* No way.  Go begging.  We have to use usual callback calling
+       conventions, because user can kick us before the ordered kick
+       is activated, and when it at last activated it might find sio
+       freed.  */
+
+    cbdataLock (sio);		/* unlocked by whoever services kicklater */
+    linklistPush(&(bfi->kicklater), sio);
+}
+
+/* Debugging aid: render 'op' into a static buffer (overwritten on
+   every call; not reentrant).  The returned string ends with '\n'. */
+
+static char *
+storeBfOpDump (bfop_t *op)
+{
+    static char buf[1000];
+    static char *optypestr[] = {
+	"BFNONE", "BFOPEN", "BFCREAT", "BFREAD", "BFWRITE",
+	"BFCLOSE", "BFUNLINK", "BFSETATTR",
+    };
+    char *typestr = op->type >= BFMAX? "bad!": optypestr[op->type];
+    int n = 0;
+
+    n += snprintf (buf+n, sizeof (buf)-n,
+		   "op %p: %9s sio %p %s %s",
+		   op, typestr, op->sio,
+		   op->started? "started": "",
+		   op->finished? "finished": "");
+    /* BUGFIX: aio_nbytes is size_t and aio_offset is off_t; the old
+       %u/%lu conversions mismatched the argument types (UB).  Cast
+       explicitly to the printed types. */
+    if (op->type == BFREAD || op->type == BFWRITE)
+	n += snprintf (buf+n, sizeof (buf)-n,
+		       " %lu@%ld",
+		       (unsigned long) op->aio.aio_nbytes,
+		       (long) op->aio.aio_offset);
+    if (op->type == BFWRITE)
+	n += snprintf (buf+n, sizeof (buf)-n,
+		       " %02x %02x %02x %02x %02x",
+		       ((unsigned char *)op->aio.aio_buf)[0],
+		       ((unsigned char *)op->aio.aio_buf)[1],
+		       ((unsigned char *)op->aio.aio_buf)[2],
+		       ((unsigned char *)op->aio.aio_buf)[3],
+		       ((unsigned char *)op->aio.aio_buf)[4]);
+    n += snprintf (buf+n, sizeof (buf)-n, "\n");
+    return buf;
+}
+
+
+/* Debugging aid: render the whole bfstate, including every op on its
+   queue, into a static buffer (overwritten on every call). */
+static char *
+storeBfStateDump (bfstate_t *bfstate)
+{
+    bfop_t *op;
+    static char buf[10000];
+    char *p = buf;
+
+    p += snprintf (p, &buf[10000] - p, "bfstate %p\n", bfstate);
+    p += snprintf (p, &buf[10000] - p, " fd %d errflag %d %s %s %s\n",
+		  bfstate->fd,
+		  bfstate->errflag,
+		  (bfstate->close_request? "CR":""),
+		  (bfstate->close_started? "CS":""),
+		  (bfstate->aborting? "ABORTING":""));
+    op = &bfstate->ops;
+    do {
+	/* BUGFIX: never pass a data-derived string as the format
+	   argument (format-string bug); print it through "%s" */
+	p += snprintf (p, &buf[10000] - p, "%s", storeBfOpDump (op));
+	op = op->next;
+    } while (op != &bfstate->ops);
+
+    return buf;
+}
+
+/* cbdata destructor for the sio: releases the attached bfstate back to
+   its pool.  The op queue must already be empty at this point. */
+static void
+storeBfIOFreeEntry(void *_sio)
+{
+    storeIOState * sio = _sio;
+    bfstate_t *bfstate = sio->fsstate;
+
+    /* queue must be freed before cbdataFree()ing the sio */
+    assert (bfstate->ops.next == &bfstate->ops);
+
+    memPoolFree(bf_state_pool, sio->fsstate);
+}