Squid-2.2.STABLE3: Improved TCP dead peer detection & failover A major adjustment of how Squid detects a failing TCP peer and how connections are retried when a connect attempt fails. Index: squid/src/cache_cf.c diff -u squid/src/cache_cf.c:1.1.1.29 squid/src/cache_cf.c:1.1.1.29.8.1 --- squid/src/cache_cf.c:1.1.1.29 Tue Apr 20 17:26:02 1999 +++ squid/src/cache_cf.c Wed May 19 23:59:51 1999 @@ -1000,6 +1000,7 @@ p->weight = 1; p->icp.version = ICP_VERSION_CURRENT; p->tcp_up = PEER_TCP_MAGIC_COUNT; + p->test_fd = -1; #if USE_CARP if (p->carp.load_factor) { /* calculate this peers hash for use in CARP */ Index: squid/src/forward.c diff -u squid/src/forward.c:1.1.1.12 squid/src/forward.c:1.1.1.12.2.1 --- squid/src/forward.c:1.1.1.12 Wed May 19 23:27:14 1999 +++ squid/src/forward.c Wed May 19 23:59:52 1999 @@ -137,13 +137,20 @@ fwdState->n_tries, (int) (squid_curtime - fwdState->start)); if (fwdState->servers->next) { - /* cycle */ + /* use next, or cycle if origin server isn't last */ FwdServer *fs = fwdState->servers; - FwdServer **T; + FwdServer **T, *T2 = NULL; fwdState->servers = fs->next; - for (T = &fwdState->servers; *T; T = &(*T)->next); - *T = fs; - fs->next = NULL; + for (T = &fwdState->servers; *T; T2=*T, T = &(*T)->next); + if (T2 && T2->peer) { + /* cycle */ + *T = fs; + fs->next = NULL; + } else { + /* Use next. 
The last "direct" entry is retried multiple times */ + fwdState->servers = fs->next; + fwdServerFree(fs); + } } /* use eventAdd to break potential call sequence loops */ eventAdd("fwdConnectStart", fwdConnectStart, fwdState, 0.0, 1); @@ -186,12 +193,14 @@ err->request = requestLink(request); fwdFail(fwdState, err); if (fs->peer) - peerCheckConnectStart(fs->peer); + peerConnectFailed(fs->peer); comm_close(server_fd); } else { debug(17, 3) ("fwdConnectDone: FD %d: '%s'\n", server_fd, storeUrl(fwdState->entry)); fd_note(server_fd, storeUrl(fwdState->entry)); fd_table[server_fd].uses++; + if (fs->peer) + peerConnectSucceded(fs->peer); fwdDispatch(fwdState); } current = NULL; Index: squid/src/neighbors.c diff -u squid/src/neighbors.c:1.1.1.31.10.1 squid/src/neighbors.c:1.1.1.31.10.2 --- squid/src/neighbors.c:1.1.1.31.10.1 Wed May 19 23:44:04 1999 +++ squid/src/neighbors.c Wed May 19 23:59:52 1999 @@ -48,9 +48,9 @@ static void neighborCountIgnored(peer *); static void peerRefreshDNS(void *); static IPH peerDNSConfigure; -static EVH peerCheckConnect; -static IPH peerCheckConnect2; -static CNCB peerCheckConnectDone; +static void peerProbeConnect(peer *); +static IPH peerProbeConnect2; +static CNCB peerProbeConnectDone; static void peerCountMcastPeersDone(void *data); static void peerCountMcastPeersStart(void *data); static void peerCountMcastPeersSchedule(peer * p, time_t when); @@ -859,8 +859,10 @@ int neighborUp(const peer * p) { - if (!p->tcp_up) + if (!p->tcp_up) { + peerProbeConnect((peer *)p); return 0; + } if (p->options.no_query) return 1; if (p->stats.probe_start != 0 && @@ -960,62 +962,82 @@ eventAddIsh("peerRefreshDNS", peerRefreshDNS, NULL, 3600.0, 1); } +void +peerConnectFailed(peer *p) +{ + p->stats.last_connect_failure = squid_curtime; + if (!p->tcp_up) { + debug(15, 2) ("TCP connection to %s/%d dead\n", p->host, p->http_port); + return; + } + debug(15, 1) ("TCP connection to %s/%d failed\n", p->host, p->http_port); + p->tcp_up--; + if (!p->tcp_up) { + 
debug(15, 1) ("Detected DEAD %s: %s/%d/%d\n", + neighborTypeStr(p), + p->host, p->http_port, p->icp.port); + p->stats.logged_state = PEER_DEAD; + } +} + +void +peerConnectSucceded(peer *p) +{ + if (!p->tcp_up) { + debug(15, 2) ("TCP connection to %s/%d succeded\n", p->host, p->http_port); + debug(15, 1) ("Detected REVIVED %s: %s/%d/%d\n", + neighborTypeStr(p), + p->host, p->http_port, p->icp.port); + p->stats.logged_state = PEER_ALIVE; + } + p->tcp_up = PEER_TCP_MAGIC_COUNT; +} + /* - * peerCheckConnect will NOT be called by eventRun if the peer/data - * pointer becomes invalid. + * peerProbeConnect will be called on dead peers by neighborUp */ static void -peerCheckConnect(void *data) +peerProbeConnect(peer *p) { - peer *p = data; int fd; + if (p->test_fd != -1) + return; /* probe already running */ + if (squid_curtime - p->stats.last_connect_probe < Config.Timeout.connect) + return; /* don't probe to often */ fd = comm_open(SOCK_STREAM, 0, Config.Addrs.tcp_outgoing, 0, COMM_NONBLOCKING, p->host); if (fd < 0) return; p->test_fd = fd; - ipcache_nbgethostbyname(p->host, peerCheckConnect2, p); + p->stats.last_connect_probe = squid_curtime; + ipcache_nbgethostbyname(p->host, peerProbeConnect2, p); } static void -peerCheckConnect2(const ipcache_addrs * ianotused, void *data) +peerProbeConnect2(const ipcache_addrs * ianotused, void *data) { peer *p = data; commConnectStart(p->test_fd, p->host, p->http_port, - peerCheckConnectDone, + peerProbeConnectDone, p); } static void -peerCheckConnectDone(int fd, int status, void *data) +peerProbeConnectDone(int fd, int status, void *data) { peer *p = data; if (status == COMM_OK) { - p->tcp_up = PEER_TCP_MAGIC_COUNT; - debug(15, 1) ("TCP connection to %s/%d succeeded\n", - p->host, p->http_port); + peerConnectSucceded(p); } else { - eventAdd("peerCheckConnect", peerCheckConnect, p, 60.0, 1); + peerConnectFailed(p); } comm_close(fd); + p->test_fd = -1; return; } -void -peerCheckConnectStart(peer * p) -{ - if (!p->tcp_up) - return; - 
debug(15, 1) ("TCP connection to %s/%d failed\n", p->host, p->http_port); - p->tcp_up--; - if (p->tcp_up != (PEER_TCP_MAGIC_COUNT - 1)) - return; - p->last_fail_time = squid_curtime; - eventAdd("peerCheckConnect", peerCheckConnect, p, 30.0, 1); -} - static void peerCountMcastPeersSchedule(peer * p, time_t when) { @@ -1218,9 +1240,9 @@ #if USE_HTCP } #endif - if (e->last_fail_time) { + if (e->stats.last_connect_failure) { storeAppendPrintf(sentry, "Last failed connect() at: %s\n", - mkhttpdlogtime(&(e->last_fail_time))); + mkhttpdlogtime(&(e->stats.last_connect_failure))); } if (e->peer_domain != NULL) { storeAppendPrintf(sentry, "DOMAIN LIST: "); Index: squid/src/peer_select.c diff -u squid/src/peer_select.c:1.1.1.28 squid/src/peer_select.c:1.1.1.28.2.1 --- squid/src/peer_select.c:1.1.1.28 Wed May 19 23:27:18 1999 +++ squid/src/peer_select.c Wed May 19 23:59:52 1999 @@ -281,8 +281,8 @@ if (Config.onoff.prefer_direct) peerGetSomeDirect(ps); peerGetSomeParent(ps); - if (!Config.onoff.prefer_direct) - peerGetSomeDirect(ps); + /* Have direct as a last resort if possible.. 
*/ + peerGetSomeDirect(ps); peerSelectCallback(ps); } Index: squid/src/protos.h diff -u squid/src/protos.h:1.1.1.40 squid/src/protos.h:1.1.1.40.2.1 --- squid/src/protos.h:1.1.1.40 Wed May 19 23:27:19 1999 +++ squid/src/protos.h Wed May 19 23:59:52 1999 @@ -613,7 +613,8 @@ extern CBDUNL peerDestroy; extern char *neighborTypeStr(const peer * e); extern peer_t neighborType(const peer *, const request_t *); -extern void peerCheckConnectStart(peer *); +extern void peerConnectFailed(peer *); +extern void peerConnectSucceded(peer *); extern void dump_peer_options(StoreEntry *, peer *); extern int peerHTTPOkay(const peer *, request_t *); extern peer *whichPeer(const struct sockaddr_in *from); Index: squid/src/structs.h diff -u squid/src/structs.h:1.1.1.37.10.3 squid/src/structs.h:1.1.1.37.10.4 --- squid/src/structs.h:1.1.1.37.10.3 Wed May 19 23:44:06 1999 +++ squid/src/structs.h Wed May 19 23:59:53 1999 @@ -997,6 +997,8 @@ time_t probe_start; time_t last_query; time_t last_reply; + time_t last_connect_failure; + time_t last_connect_probe; int logged_state; /* so we can print dead/revived msgs */ } stats; struct { @@ -1047,7 +1049,6 @@ PeerDigest *digest; #endif int tcp_up; /* 0 if a connect() fails */ - time_t last_fail_time; struct in_addr addresses[10]; int n_addresses; int rr_count;