This patch is generated from the prefetching branch of HEAD in squid3
Mon Jan 23 03:19:07 2006 GMT
See http://devel.squid-cache.org/

Index: squid3/configure.in
diff -u squid3/configure.in:1.87 squid3/configure.in:1.62.2.5
--- squid3/configure.in:1.87	Wed Dec 28 19:13:07 2005
+++ squid3/configure.in	Sat Jan 21 12:37:51 2006
@@ -711,6 +711,20 @@
   fi
 ])
 
+AM_CONDITIONAL(USE_HTMLPREFETCH, false)
+AC_ARG_ENABLE(html-analysis,
+              AC_HELP_STRING([--enable-html-analysis],[Enable HTML content analysis and prefetching. Requires libxml2.]),
+              ac_cv_use_htmlprefetch=$enableval, ac_cv_use_htmlprefetch=no)
+AC_CACHE_CHECK(whether to enable HTML prefetching, ac_cv_use_htmlprefetch,
+               ac_cv_use_htmlprefetch=no)
+if test "$ac_cv_use_htmlprefetch" = "yes" ; then
+  AC_DEFINE(USE_HTMLPREFETCH,1,[Compile the HTML analysis support])
+  AM_CONDITIONAL(USE_HTMLPREFETCH, true)
+  XTRA_LIBS="$XTRA_LIBS -lxml2"
+else
+  AC_DEFINE(USE_HTMLPREFETCH,0,[Compile the HTML analysis support])
+fi
+
 AM_CONDITIONAL(USE_ESI, false)
 AC_ARG_ENABLE(esi,
               AC_HELP_STRING([--enable-esi],[Enable ESI for accelerators. Requires libexpat. Enabling ESI will cause squid to follow the Edge Acceleration Specification (www.esi.org). This causes squid to IGNORE client Cache-Control headers. DO NOT use this in a squid configured as a web proxy, ONLY use it in a squid configured for webserver acceleration.]),
Index: squid3/doc/debug-sections.txt
diff -u squid3/doc/debug-sections.txt:1.6 squid3/doc/debug-sections.txt:1.6.6.1
--- squid3/doc/debug-sections.txt:1.6	Tue Jul 22 19:12:50 2003
+++ squid3/doc/debug-sections.txt	Wed Jan 12 13:58:27 2005
@@ -96,3 +96,4 @@
 section 90    Store Client
 section 91    Http Surrogate-Control Header
 section 92    Store File System
+section 93    HTML content analysis and fetcher
Index: squid3/src/Debug.h
diff -u squid3/src/Debug.h:1.8 squid3/src/Debug.h:1.7.6.3
--- squid3/src/Debug.h:1.8	Mon Dec 19 19:12:57 2005
+++ squid3/src/Debug.h	Sat Jan 21 12:37:55 2006
@@ -36,6 +36,8 @@
 #ifndef SQUID_DEBUG
 #define SQUID_DEBUG
 
+#include "defines.h"
+
 #include <iostream>
 #undef assert
 #include <sstream>
Index: squid3/src/ESIExpatParser.h
diff -u squid3/src/ESIExpatParser.h:1.4 squid3/src/ESIExpatParser.h:1.2.8.2
--- squid3/src/ESIExpatParser.h:1.4	Sun Jul  3 19:14:11 2005
+++ squid3/src/ESIExpatParser.h	Thu Oct 13 20:16:26 2005
@@ -36,6 +36,11 @@
 #include "ESIParser.h"
 #include "expat.h"
 
+#ifdef XMLCALL
+#define EXPAT_XMLCALL XMLCALL
+#undef XMLCALL
+#endif
+
 class ESIExpatParser : public ESIParser
 {
 
Index: squid3/src/ESIInclude.cc
diff -u squid3/src/ESIInclude.cc:1.8 squid3/src/ESIInclude.cc:1.5.2.4
--- squid3/src/ESIInclude.cc:1.8	Sat Nov  5 15:03:11 2005
+++ squid3/src/ESIInclude.cc	Sat Jan 21 12:37:58 2006
@@ -331,7 +331,7 @@
 
     debug (86,5)("ESIIncludeStart: Starting subrequest with url '%s'\n", tempUrl);
 
-    if (clientBeginRequest(METHOD_GET, tempUrl, esiBufferRecipient, esiBufferDetach, stream.getRaw(), &tempheaders, stream->localbuffer->buf, HTTP_REQBUF_SZ)) {
+    if (clientBeginRequest(METHOD_GET, tempUrl, esiBufferRecipient, esiBufferDetach, stream.getRaw(), &tempheaders, stream->localbuffer->buf, HTTP_REQBUF_SZ,  (ClientHttpRequest::flags_type){1,0,0,0,0}, no_addr)) {
         debug (86,0) ("starting new ESI subrequest failed\n");
     }
 
Index: squid3/src/ESILibxml2Parser.h
diff -u squid3/src/ESILibxml2Parser.h:1.3 squid3/src/ESILibxml2Parser.h:1.1.4.2
--- squid3/src/ESILibxml2Parser.h:1.3	Sun Jul  3 19:14:11 2005
+++ squid3/src/ESILibxml2Parser.h	Thu Oct 13 20:16:26 2005
@@ -51,6 +51,11 @@
 #include <libxml/HTMLparser.h>
 #include <libxml/HTMLtree.h>
 
+#ifdef XMLCALL
+#define LIBXML2_XMLCALL XMLCALL
+#undef XMLCALL
+#endif
+
 #ifdef OLD_FREE
 #define free OLD_FREE
 #endif
Index: squid3/src/HTMLAnalysisStream.cc
diff -u /dev/null squid3/src/HTMLAnalysisStream.cc:1.1.2.7
--- /dev/null		Thu Jan  1 01:00:00 1970
+++ squid3/src/HTMLAnalysisStream.cc	Mon May 23 18:22:11 2005
@@ -0,0 +1,405 @@
+
+/*
+ * $Id$
+ *
+ * DEBUG: section 93    HTML parsing and fetching
+ * AUTHOR: Nick Lewycky
+ *
+ * SQUID Web Proxy Cache          http://www.squid-cache.org/
+ * ----------------------------------------------------------
+ *
+ *  Squid is the result of efforts by numerous individuals from
+ *  the Internet community; see the CONTRIBUTORS file for full
+ *  details.   Many organizations have provided support for Squid's
+ *  development; see the SPONSORS file for full details.  Squid is
+ *  Copyrighted (C) 2001 by the Regents of the University of
+ *  California; see the COPYRIGHT file for full details.  Squid
+ *  incorporates software developed and/or copyrighted by other
+ *  sources; see the CREDITS file for full details.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *  
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
+ *
+ *  Copyright (c) 2004, Nick Lewycky <nicholas@mxc.ca>
+ *
+ */
+
+#include <stdexcept>
+#include <string>
+
+#include <libxml/SAX2.h>
+
+#include "squid.h"
+#include "Debug.h"
+#include "HttpReply.h"
+#include "HttpRequest.h"
+#include "HTMLAnalysisStream.h"
+#include "http.h"
+#include "PrefetchStream.h"
+#include "protos.h"
+
+using namespace std;
+
+CBDATA_CLASS_INIT(HTMLAnalysisStream);
+
+/* NOTE: this does get called after detach. */
+void
+htmlBufferData(clientStreamNode *node, ClientHttpRequest *req,
+               HttpReply *reply, StoreIOBuffer buffer)
+{
+    assert(node->data.getRaw());
+    debugs(93, 4, "(AS) bufferData");
+
+    HTMLAnalysisStream::Pointer self =
+        dynamic_cast<HTMLAnalysisStream *>(node->data.getRaw());
+    if (!self.getRaw())
+    {
+        /* detach time. but if it's not my node->data, whose is it? */
+        debugs(93, 4, "(AS) this makes no sense at all!");
+        return;
+    }
+
+    debugs(93, 4, "(AS) length: " << buffer.length <<
+           ", offset: " << buffer.offset);
+    self->parse(buffer.data, buffer.length, 1);
+    node->readBuffer.offset += buffer.length;
+
+    //clientStreamCallback(node, req, reply, node->readBuffer); // detaches
+    clientStreamCallback(node, req, reply, buffer);
+}
+
+/* Read callback of the analysis node: hands the read request on through
+ * clientStreamRead() using the node's own readBuffer; no-op without req. */
+void
+htmlStreamRead(clientStreamNode *node, ClientHttpRequest *req)
+{
+    if (!req)
+        return;
+
+    debugs(93, 4, "(AS) before read");
+    clientStreamRead(node, req, node->readBuffer);
+    debugs(93, 4, "(AS) after read");
+}
+
+void
+htmlStreamDetach(clientStreamNode *node, ClientHttpRequest *req)
+{
+    debugs(93, 4, "(AS) detach");
+    clientStreamDetach(node, req);
+}
+
+clientStream_status_t
+htmlStreamStatus(clientStreamNode *node, ClientHttpRequest *req)
+{
+    debugs(93, 4, "(AS) status");
+    return clientStreamStatus(node, req);
+}
+
+HTMLAnalysisStream::HTMLAnalysisStream(const HttpRequest *req)
+        :
+        request(req),
+        parser(htmlCreatePushParserCtxt(&handler, static_cast<void *>(this),
+                                        NULL, 0, NULL, XML_CHAR_ENCODING_NONE))
+{
+    if (!parser)
+        throw runtime_error("Unable to create parser.");
+
+    xmlSubstituteEntitiesDefault(1);
+
+    relative_url = urlCanonicalClean(request);
+
+    debugs(93, 5, "(AS) depth: " << request->recursion_depth);
+
+    debugs(93, 4, "(AS) analyzing " << relative_url);
+}
+
+HTMLAnalysisStream::~HTMLAnalysisStream()
+{
+    htmlFreeParserCtxt(parser);
+}
+
+inline void
+HTMLAnalysisStream::prefetch(const string &url, bool allow_recursion)
+{
+    PrefetchStream::prefetch(url, relative_url, request, allow_recursion);
+}
+
+void
+HTMLAnalysisStream::parse(const char *document, size_t len, bool partial)
+{
+    debugs(93, 4, "(AS) chunk to parse, " << len << " long " << (partial?"partial":""));
+    debugs(93, 4, "(AS) chunk is: " << string(document, len));
+    htmlParseChunk(parser, document, len, partial ? 0 : 1);
+}
+
+/* libxml2 SAX startElement callback. userData is the HTMLAnalysisStream
+ * passed to the parser; xattr is a NULL-terminated name/value list whose
+ * values may be NULL. Prefetches URLs named by img, frame, iframe, link
+ * and meta elements, and records a base href for relative resolution. */
+void
+HTMLAnalysisStream::start_element_handler(void *userData,
+        const xmlChar * xname, const xmlChar ** xattr)
+{
+    const char  *name = reinterpret_cast<const char  *>(xname);
+    const char **attr = reinterpret_cast<const char **>(xattr);
+
+    if (!name || !attr)
+        return;
+
+    HTMLAnalysisStream *self = static_cast<HTMLAnalysisStream *>(userData);
+
+    // Example:
+    // <img src="foo.png">
+    if (strcasecmp(name, "img") == 0) {
+        while (*attr) {
+            if (strcasecmp(*attr, "src") == 0 && *(attr + 1)) {
+                self->prefetch(*(attr + 1), false);
+                return;
+            }
+
+            attr += 2;
+        }
+
+        return;
+    }
+
+    // Example:
+    // <frame src="foo.html">
+    // <iframe src="foo.html">
+    if (strcasecmp(name, "frame") == 0 || strcasecmp(name, "iframe") == 0) {
+        while (*attr) {
+            if (strcasecmp(*attr, "src") == 0 && *(attr + 1)) {
+                self->prefetch(*(attr + 1), true);
+                return;
+            }
+
+            attr += 2;
+        }
+
+        return;
+    }
+
+    // Example:
+    // <link rel="stylesheet" href="foo.css">
+    // <link href="foo.html" rel="prefetch">
+    bool is_fetchable = false;
+    string href;
+    if (strcasecmp(name, "link") == 0) {
+        while (*attr) {
+            if (strcasecmp(*attr, "href") == 0 && *(attr + 1)) {
+                if (is_fetchable) {
+                    self->prefetch(*(attr + 1), true);
+                    return;
+                } else
+                    href = *(attr + 1);
+            } else if (strcasecmp(*attr, "rel") == 0 && *(attr + 1)) {
+                if (strcasecmp(*(attr + 1), "stylesheet") != 0 &&
+                        strcasecmp(*(attr + 1), "prefetch") != 0)
+                    return;
+
+                if (!href.empty()) {
+                    self->prefetch(href, true);
+                    return;
+                } else
+                    is_fetchable = true;
+            }
+
+            attr += 2;
+        }
+
+        return;
+    }
+
+    // Example:
+    // <meta http-equiv="link" content="<foo.html>; rel=prefetch">
+    bool is_httpequiv = false;
+
+    if (strcasecmp(name, "meta") == 0) {
+        while (*attr) {
+            if (strcasecmp(*attr, "content") == 0 && *(attr + 1)) {
+                // TODO: replace this with some incantation of httpHeaderParse?
+
+                /* From RFC 2068:
+                 *
+                 * Link           = "Link" ":" #("<" URI ">" *( ";" link-param )
+                 *
+                 * link-param     = ( ( "rel" "=" relationship )
+                 *                    | ( "rev" "=" relationship )
+                 *                    | ( "title" "=" quoted-string )
+                 *                    | ( "anchor" "=" <"> URI <"> )
+                 *                    | ( link-extension ) )
+                 *
+                 * link-extension = token [ "=" ( token | quoted-string ) ]
+                 *
+                 * relationship   = sgml-name
+                 *                | ( <"> sgml-name *( SP sgml-name) <"> )
+                 *
+                 * sgml-name      = ALPHA *( ALPHA | DIGIT | "." | "-" )
+                 */
+
+                const char *p1 = *(attr + 1), *p2;
+
+                while (*p1 && *p1 == ' ')
+                    p1++;
+
+                if (*p1 == '<')
+                    p1++;
+                else
+                    return;     // malformed (no leading '<')
+
+                p2 = p1;
+
+                while (*p2 && *p2 != '>' && strncasecmp(p2, "&gt;", 4) != 0)
+                    p2++;
+
+                if (!*p2)
+                    return;     // malformed (end of string before '>')
+
+                href = string(p1, p2);
+                // skip over closing '>' (one char) or "&gt;" (four chars)
+                p2 += (*p2 == '>') ? 1 : 4;
+
+                while (*p2 == ';' && !is_fetchable) {
+                    p2++;       // skip over separating ';'
+
+                    while (*p2 && *p2 == ' ')
+                        p2++;
+
+                    if (strncasecmp(p2, "rel", 3) == 0) {
+                        p2 += 3;        // skip over 'rel'
+
+                        while (*p2 == ' ')
+                            p2++;
+
+                        if (*p2 != '=')
+                            return;     // malformed (no '=' in link-param)
+
+                        p2++;   // skip over '='
+
+                        while (*p2 == ' ')
+                            p2++;
+
+                        bool is_quoted = *p2 == '"';
+
+                        if (is_quoted)
+                            p2++;
+
+                        if (strncasecmp(p2, "prefetch", 8) == 0) {
+                            p2 += 8;
+
+                            if (is_quoted && *p2 != '"')
+                                return;
+                            else {
+                                if (is_httpequiv) {
+                                    self->prefetch(href, true);
+                                    return;
+                                } else
+                                    is_fetchable = true;
+                            }
+                        } else
+                            return;
+                    } else {
+                        // Some other link-param: skip its name and optional
+                        // value, never stepping past the terminating NUL.
+                        while (*p2 && *p2 != '=' && *p2 != ';')
+                            p2++;
+
+                        if (*p2 == '=')
+                            p2++;
+
+                        if (*p2 == '"') {
+                            p2++;       // skip over opening '"'
+
+                            while (*p2 && *p2 != '"')
+                                p2++;
+
+                            if (*p2)
+                                p2++;   // skip over closing '"'
+                        }
+
+                        while (*p2 && *p2 != ';')
+                            p2++;       // advance to the next ';', if any
+                    }
+                }
+
+                if (!is_fetchable)
+                    return;     // no [further] link-param, hence no rel=prefetch
+
+            } else if (strcasecmp(*attr, "http-equiv") == 0 && *(attr + 1) &&
+                       strcasecmp(*(attr + 1), "link") == 0) {
+                if (!href.empty()) {
+                    self->prefetch(href, true);
+                    return;
+                } else
+                    is_httpequiv = true;
+            }
+
+            attr += 2;
+        }
+
+        return;
+    }
+
+    // Example:
+    // <base href="http://www.example.com/foo/">
+    if (strcasecmp(name, "base") == 0) {
+        while (*attr) {
+            if (strcasecmp(*attr, "href") == 0 && *(attr + 1)) {
+                self->relative_url = *(attr + 1);
+                return;
+            }
+
+            attr += 2;
+        }
+
+        return;
+    }
+}
+
+htmlSAXHandler
+HTMLAnalysisStream::handler = {
+    NULL,                       // internal subset
+    NULL,                       // is standalone
+    NULL,                       // has internal subset
+    NULL,                       // has external subset
+    NULL,                       // resolve entity
+    NULL,                       // get entity
+    NULL,                       // entity declaration
+    NULL,                       // notation declaration
+    NULL,                       // attribute declaration
+    NULL,                       // element declaration
+    NULL,                       // unparsed entity
+    NULL,                       // set document locator
+    NULL,                       // start document
+    NULL,                       // end document
+    HTMLAnalysisStream::start_element_handler,  // start element
+    NULL,                       // end element
+    NULL,                       // reference
+    NULL,                       // characters
+    NULL,                       // ignorable whitespace
+    NULL,                       // processing instruction
+    NULL,                       // comment
+    NULL,                       // warning
+    NULL,                       // error
+    NULL,                       // fatal error
+    NULL,                       // get parameter entity
+    NULL,                       // cdata block
+    NULL,                       // external subset
+    XML_SAX2_MAGIC,             // initialized (SAX2 block marker)
+    NULL,                       // private
+    NULL,                       // start element namespace
+    NULL,                       // end element namespace
+    NULL                        // xml structured error
+};
+
Index: squid3/src/HTMLAnalysisStream.h
diff -u /dev/null squid3/src/HTMLAnalysisStream.h:1.1.2.3
--- /dev/null		Thu Jan  1 01:00:00 1970
+++ squid3/src/HTMLAnalysisStream.h	Thu May  5 11:24:14 2005
@@ -0,0 +1,64 @@
+#ifndef SQUID_HTMLANALYSISSTREAM_H
+#define SQUID_HTMLANALYSISSTREAM_H
+
+#include <set>
+#include <string>
+
+#include <libxml/HTMLparser.h>
+#include <libxml/SAX2.h>
+
+#include "defines.h"
+#include "clientStream.h"
+#include "client_side_request.h"
+#include "Store.h"
+#include "StoreClient.h"
+
+class HttpRequest;
+
+void htmlBufferData(clientStreamNode *, ClientHttpRequest *,
+                    HttpReply *, StoreIOBuffer);
+void
+htmlStreamRead(clientStreamNode *node, ClientHttpRequest *req);
+void
+htmlStreamDetach(clientStreamNode *node, ClientHttpRequest *req);
+clientStream_status_t
+htmlStreamStatus(clientStreamNode *node, ClientHttpRequest *req);
+
+class HTMLAnalysisStream : public RefCountable
+{
+    friend void
+    htmlBufferData(clientStreamNode *, ClientHttpRequest *,
+                        HttpReply *, StoreIOBuffer);
+    friend void
+    htmlStreamRead(clientStreamNode *node, ClientHttpRequest *req);
+    friend void
+    htmlStreamDetach(clientStreamNode *node, ClientHttpRequest *req);
+    friend clientStream_status_t
+    htmlStreamStatus(clientStreamNode *node, ClientHttpRequest *req);
+
+
+public:
+    typedef RefCount<HTMLAnalysisStream> Pointer;
+
+    explicit HTMLAnalysisStream(const HttpRequest *);
+    ~HTMLAnalysisStream();
+
+private:
+    void prefetch(const std::string &, bool);
+
+    void parse(const char *document, size_t len, bool partial);
+
+    static void start_element_handler(void *, const xmlChar *,
+	const xmlChar **);
+
+    const HttpRequest *request;
+
+    htmlParserCtxtPtr parser;
+    std::string relative_url;
+
+    static htmlSAXHandler handler;
+
+    CBDATA_CLASS2(HTMLAnalysisStream);
+};
+
+#endif // header guard
Index: squid3/src/HttpRequest.h
diff -u squid3/src/HttpRequest.h:1.17 squid3/src/HttpRequest.h:1.10.4.3
--- squid3/src/HttpRequest.h:1.17	Thu Jan 19 19:13:56 2006
+++ squid3/src/HttpRequest.h	Sat Jan 21 12:37:58 2006
@@ -79,6 +79,10 @@
     time_t ims;
     int imslen;
     int max_forwards;
+#if USE_HTMLPREFETCH
+    int recursion_depth;
+#endif
+
     /* these in_addr's could probably be sockaddr_in's */
 
     struct IN_ADDR client_addr;
Index: squid3/src/Makefile.am
diff -u squid3/src/Makefile.am:1.76 squid3/src/Makefile.am:1.57.2.9
--- squid3/src/Makefile.am:1.76	Fri Jan 13 19:14:05 2006
+++ squid3/src/Makefile.am	Sat Jan 21 12:37:58 2006
@@ -97,6 +97,17 @@
   ESI_SOURCE = 
 endif
 
+HTMLPREFETCH_ALL_SOURCE = \
+	HTMLAnalysisStream.cc \
+	HTMLAnalysisStream.h \
+	PrefetchStream.cc \
+	PrefetchStream.h
+if USE_HTMLPREFETCH
+  HTMLPREFETCH_SOURCE = $(HTMLPREFETCH_ALL_SOURCE)
+else
+  HTMLPREFETCH_SOURCE =
+endif
+
 if USE_ICAP_CLIENT
   ICAP_LIBS = ICAP/libicap.a
   SUBDIRS += ICAP
@@ -168,14 +179,19 @@
 ARP_ACL_SOURCE =
 endif
 
+if USE_HTMLPREFETCH
+HTMLPREFETCH_INC = -I/usr/include/libxml2
+else
+HTMLPREFETCH_INC =
+endif
+
 AM_CFLAGS = @SQUID_CFLAGS@
 AM_CXXFLAGS = @SQUID_CXXFLAGS@
 
 EXTRA_LIBRARIES = libAIO.a libBlocking.a libDiskDaemon.a libDiskThreads.a
 noinst_LIBRARIES = @DISK_LIBS@
 
-INCLUDES        = -I. -I$(srcdir) -I$(top_builddir)/include -I$(top_srcdir)/include -I$(top_srcdir)/lib/libTrie/include
-INCLUDES	+= @SQUID_CPPUNIT_INC@
+INCLUDES        = -I. -I$(srcdir) -I$(top_builddir)/include -I$(top_srcdir)/include -I$(top_srcdir)/lib/libTrie/include @SQUID_CPPUNIT_INC@ $(HTMLPREFETCH_INC)
 
 EXTRA_PROGRAMS = \
 	DiskIO/DiskDaemon/diskd \
@@ -259,6 +275,7 @@
 	htcp.h \
 	$(IDENT_ALL_SOURCE) \
 	$(ESI_ALL_SOURCE) \
+	$(HTMLPREFETCH_ALL_SOURCE) \
 	ProfStats.cc \
 	LeakFinder.cc \
 	LeakFinder.h \
@@ -438,6 +455,7 @@
 	HttpRequest.cc \
 	HttpRequest.h \
 	HttpVersion.h \
+	$(HTMLPREFETCH_SOURCE) \
 	icmp.cc \
 	ICP.h \
 	icp_v2.cc \
@@ -1005,7 +1023,6 @@
 	@AUTH_LINKOBJS@ \
 	@AUTH_OBJS@ \
 	@SQUID_CPPUNIT_LA@
-	
 
 STORE_TEST_SOURCES=\
 	$(TESTSOURCES) \
Index: squid3/src/PrefetchStream.cc
diff -u /dev/null squid3/src/PrefetchStream.cc:1.1.2.11
--- /dev/null		Thu Jan  1 01:00:00 1970
+++ squid3/src/PrefetchStream.cc	Sat Jan 21 12:37:58 2006
@@ -0,0 +1,249 @@
+
+/*
+ * $Id$
+ *
+ * DEBUG: section 93    HTML parsing and fetching
+ * AUTHOR: Nick Lewycky
+ *
+ * SQUID Web Proxy Cache          http://www.squid-cache.org/
+ * ----------------------------------------------------------
+ *
+ *  Squid is the result of efforts by numerous individuals from
+ *  the Internet community; see the CONTRIBUTORS file for full
+ *  details.   Many organizations have provided support for Squid's
+ *  development; see the SPONSORS file for full details.  Squid is
+ *  Copyrighted (C) 2001 by the Regents of the University of
+ *  California; see the COPYRIGHT file for full details.  Squid
+ *  incorporates software developed and/or copyrighted by other
+ *  sources; see the CREDITS file for full details.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *  
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
+ *
+ *  Copyright (c) 2004, Nick Lewycky <nicholas@mxc.ca>
+ *
+ */
+
+#include <limits>
+#include <set>
+
+#include "squid.h"
+#include "client_side_request.h"
+#include "Debug.h"
+#include "HttpReply.h"
+#include "HttpRequest.h"
+#include "PrefetchStream.h"
+
+using namespace std;
+
+CBDATA_CLASS_INIT(PrefetchStream);
+
+/* Resolve href against relative_url and, unless the result is not an
+ * absolute URL, is already pending, or is already in the store, issue a
+ * GET via clientBeginRequest() with a filtered copy of request's headers. */
+void
+PrefetchStream::prefetch(const string &href, const string &relative_url,
+                         const HttpRequest *request, bool recurse)
+{
+    static int no_recursion = numeric_limits<int>::max();
+
+    char *url = urlResolveRelative(href.c_str(), relative_url.c_str());
+
+    if (!url) {
+        debugs(93, 4, "(PS) Unable to resolve " << href <<
+               " against relative " << relative_url);
+        return;
+    }
+
+    {
+        char *f, *s, *n, *q, *w, *p;
+        urlParseRFC1808(url, &f, &s, &n, &q, &w, &p);
+        xfree(f);
+        xfree(q);
+        xfree(w);
+
+        const bool is_absolute = strlen(s) && strlen(n) && p[0] == '/';
+        xfree(s);
+        xfree(n);
+        xfree(p);
+
+        if (!is_absolute) {
+            xfree(url);
+            debugs(93, 4, "(PS) Refusing to fetch " << href <<
+                   " against relative " << relative_url);
+            return;
+        }
+    }
+
+    // Coalesce multiple request for identical URLs. There's a
+    // race between when the object is requested and when it's entered
+    // into the cache. Prefetching will tend to aggravate that problem.
+    // TODO: remove this on landing of the collapsed_forwarding branch.
+    if (pending.find(url) != pending.end()) {
+      debugs(93, 4, "(PS) Request of " << url << " coalesced.");
+      xfree(url);
+      return;
+    }
+
+    // Refuse re-download negatively cached entries ... by refusing to
+    // fetch any entry found in the cache.
+    if (storeGetPublic(url, METHOD_GET)) {
+        debugs(93, 4, "(PS) No need to prefetch " << url);
+        xfree(url);
+        return;
+    }
+    pending.insert(url);
+
+    debugs(93, 4, "(PS) prefetching " << url);
+
+    HttpHeader tempheaders(hoRequest);
+    tempheaders.update(&request->header, NULL);
+    tempheaders.removeConnectionHeaderEntries();
+    httpHeaderDelById(&tempheaders, HDR_ACCEPT_RANGES);
+    httpHeaderDelById(&tempheaders, HDR_AGE);
+    httpHeaderDelById(&tempheaders, HDR_ETAG);
+    httpHeaderDelById(&tempheaders, HDR_CONTENT_LENGTH);
+    httpHeaderDelById(&tempheaders, HDR_CONTENT_MD5);
+    httpHeaderDelById(&tempheaders, HDR_CONTENT_RANGE);
+    httpHeaderDelById(&tempheaders, HDR_IF_MATCH);
+    httpHeaderDelById(&tempheaders, HDR_IF_MODIFIED_SINCE);
+    httpHeaderDelById(&tempheaders, HDR_IF_NONE_MATCH);
+    httpHeaderDelById(&tempheaders, HDR_IF_RANGE);
+    httpHeaderDelById(&tempheaders, HDR_RANGE);
+    httpHeaderPutStr(&tempheaders, HDR_CONNECTION, "close");
+    httpHeaderPutStr(&tempheaders, HDR_REFERER, urlCanonicalClean(request));
+    // login information should be inherited through urlResolveRelative.
+
+    PrefetchStream::Pointer stream = new PrefetchStream;
+    assert(stream.getRaw());
+    stream->pending_element = pending.insert(url).first;
+
+    // TODO: tag it "prefetch" for the delay pools.
+
+    // XXX: performs client authentication against "no_addr" IP
+    // instead of the IP causing the prefetch.
+    if (clientBeginRequest(METHOD_GET, url, PrefetchStream::BufferData,
+                           PrefetchStream::Detach, stream.getRaw(), &tempheaders,
+                           stream->discard_buffer, /*256*/HTTP_REQBUF_SZ,
+                           (ClientHttpRequest::flags_type){0,0,0,0,0}, no_addr,
+                           recurse?request->recursion_depth+1:no_recursion)) {
+        debugs(93, 4, "(PS) Failed to prefetch " << url);
+        // fall through: headers and url must be released on failure too
+    }
+    httpHeaderClean(&tempheaders);
+
+    xfree(url);
+}
+
+void
+PrefetchStream::BufferData(clientStreamNode *node, ClientHttpRequest *req,
+                           HttpReply *reply, StoreIOBuffer buffer)
+{
+    assert(cbdataReferenceValid(node));
+    dynamic_cast<PrefetchStream *>(node->data.getRaw())->bufferData(node, req, reply, buffer);
+}
+
+void
+PrefetchStream::Detach(clientStreamNode *node, ClientHttpRequest *req)
+{
+    assert(cbdataReferenceValid(node));
+    dynamic_cast<PrefetchStream *>(node->data.getRaw())->detach(node, req);
+}
+
+/* based on esiBufferRecipient */
+void
+PrefetchStream::bufferData(clientStreamNode *node, ClientHttpRequest *req,
+                           HttpReply *reply, StoreIOBuffer buffer)
+{
+    debugs(93, 4, "(PS) bufferData");
+
+    assert(!req->getConn().getRaw());
+
+    if (req->out.size)
+        assert(!reply);
+    else {
+        if (reply) {
+            if (reply->sline.status != HTTP_OK) {
+                debugs(93, 4, "(PS) Aborting on non-\"HTTP OK\"");
+                delete reply;
+                reply = NULL;
+                httpRequestFree(req);
+                return;
+            }
+#if HEADERS_LOG
+            headersLog(0, 0, req->request->method, reply);
+#endif
+
+            delete reply;
+            reply = NULL;
+        }
+    }
+
+    if (buffer.data && buffer.length)
+    {
+        debugs(93, 4, "(PS) advancing");
+        req->out.size += buffer.length;
+    }
+
+    if (!reply && !buffer.data && !buffer.length) {
+        debugs(93, 4, "(PS) EOF / Read error / aborted entry");
+        httpRequestFree(req);
+        return;
+    }
+
+    if (clientHttpRequestStatus(-1, req)) {
+      debugs(93, 4, "(PS) XXX some sort of weird overflow condition?");
+      node->data = NULL;
+      return;
+    }
+
+    switch (clientStreamStatus(node, req)) {
+        case STREAM_UNPLANNED_COMPLETE:
+            debugs(93, 4, "(PS) stream_unplanned_complete");
+            httpRequestFree(req);
+            return;
+        case STREAM_COMPLETE:
+            debugs(93, 4, "(PS) stream_complete");
+            httpRequestFree(req);
+            return;
+        case STREAM_FAILED:
+            debugs(93, 4, "(PS) stream_failed");
+            httpRequestFree(req);
+            return;
+        case STREAM_NONE:
+            debugs(93, 4, "(PS) stream_none");
+            debugs(93, 4, "(PS) before read");
+            buffer.offset += buffer.length;
+            clientStreamRead(node, req, buffer);
+            debugs(93, 4, "(PS) after read");
+            return;
+        default:
+            debugs(93, 4, "(PS) default");
+            return;
+    }
+}
+
+void
+PrefetchStream::detach(clientStreamNode *node, ClientHttpRequest *req)
+{
+    debugs(93, 4, "(PS) detach");
+    pending.erase(pending_element);
+    clientStreamDetach(node, req);
+}
+
+set<string>
+PrefetchStream::pending;
+
+char
+PrefetchStream::discard_buffer[HTTP_REQBUF_SZ];
Index: squid3/src/PrefetchStream.h
diff -u /dev/null squid3/src/PrefetchStream.h:1.1.2.2
--- /dev/null		Thu Jan  1 01:00:00 1970
+++ squid3/src/PrefetchStream.h	Thu May  5 11:24:15 2005
@@ -0,0 +1,41 @@
+#ifndef SQUID_PREFETCHSTREAM_H
+#define SQUID_PREFETCHSTREAM_H
+
+#include <set>
+#include <string>
+
+#include "defines.h"
+#include "clientStream.h"
+#include "Store.h"
+#include "StoreClient.h"
+
+class HttpReply;
+class HttpRequest;
+
+class PrefetchStream : public RefCountable
+{
+public:
+    typedef RefCount<PrefetchStream> Pointer;
+
+    static void BufferData(clientStreamNode *, ClientHttpRequest *,
+                           HttpReply *, StoreIOBuffer);
+    static void Detach(clientStreamNode *, ClientHttpRequest *);
+
+    static char discard_buffer[HTTP_REQBUF_SZ];
+    std::set<std::string>::iterator pending_element;
+
+    static void prefetch(const std::string &, const std::string &,
+                         const HttpRequest *, bool);
+
+    // used for coalescing identical requests
+    static std::set<std::string> pending;
+
+private:
+    void bufferData(clientStreamNode *, ClientHttpRequest *,
+                    HttpReply *, StoreIOBuffer);
+    void detach(clientStreamNode *, ClientHttpRequest *);
+
+    CBDATA_CLASS2(PrefetchStream);
+};
+
+#endif // header guard
Index: squid3/src/cf.data.pre
diff -u squid3/src/cf.data.pre:1.93 squid3/src/cf.data.pre:1.65.2.5
--- squid3/src/cf.data.pre:1.93	Tue Dec 13 19:13:12 2005
+++ squid3/src/cf.data.pre	Sat Jan 21 12:37:58 2006
@@ -4029,6 +4029,16 @@
 	the same value since they both use port 2048.
 DOC_END
 
+NAME: html_analysis
+IFDEF: USE_HTMLPREFETCH
+COMMENT: on|off
+TYPE: onoff
+LOC: Config.onoff.htmlPrefetch
+DEFAULT: on
+DOC_START
+        Analyze HTML documents received from the server and fetch all
+        embedded documents into the cache. The default is on.
+DOC_END
 
 COMMENT_START
  DELAY POOL PARAMETERS (all require DELAY_POOLS compilation option)
Index: squid3/src/cf_gen_defines
diff -u squid3/src/cf_gen_defines:1.3 squid3/src/cf_gen_defines:1.2.2.2
--- squid3/src/cf_gen_defines:1.3	Sat Apr 30 19:13:22 2005
+++ squid3/src/cf_gen_defines	Thu Oct 13 20:16:27 2005
@@ -11,6 +11,7 @@
 	define["USE_DNSSERVERS"]="--disable-internal-dns"
 	define["!USE_DNSSERVERS"]="--enable-internal-dns"
 	define["USE_HTCP"]="--enable-htcp"
+        define["USE_HTMLPREFETCH"]="--enable-html-analysis"
 	define["USE_ICMP"]="--enable-icmp"
 	define["USE_IDENT"]="--enable-ident-lookups"
 	define["USE_REFERER_LOG"]="--enable-referer-log"
Index: squid3/src/client_side_reply.cc
diff -u squid3/src/client_side_reply.cc:1.70 squid3/src/client_side_reply.cc:1.57.2.7
--- squid3/src/client_side_reply.cc:1.70	Thu Jan 19 19:13:56 2006
+++ squid3/src/client_side_reply.cc	Sat Jan 21 12:37:59 2006
@@ -56,6 +56,9 @@
 #include "DelayPools.h"
 #endif
 #include "client_side.h"
+#if USE_HTMLPREFETCH
+#include "HTMLAnalysisStream.h"
+#endif
 
 CBDATA_CLASS_INIT(clientReplyContext);
 
@@ -1939,6 +1942,28 @@
     ("clientReplyContext::sendMoreData: Appending %d bytes after %d bytes of headers\n",
      (int) body_size, rep->hdr_sz);
 
+#if USE_HTMLPREFETCH
+
+    if (Config.onoff.htmlPrefetch) {
+        const HttpReply *reply = http->storeEntry()->getReply();
+        const String content_type = reply->content_type;
+
+        if (content_type.buf() &&
+            (content_type.caseCmp("text/html") == 0 ||
+             content_type.caseCmp("application/xhtml+xml") == 0) &&
+            !httpHeaderHas(&reply->header, HDR_CONTENT_ENCODING) &&
+            !logTypeIsATcpHit(http->logType) &&
+            reply->sline.status == HTTP_OK &&
+            !http->request->range && http->request->recursion_depth < 5) {
+            HTMLAnalysisStream *htmlas = new HTMLAnalysisStream(http->request);
+            clientStreamInsertHead(&http->client_stream, htmlStreamRead,
+                                   htmlBufferData, htmlStreamDetach,
+                                   htmlStreamStatus, htmlas);
+        }
+    }
+
+#endif
+
 #if ESI
 
     if (http->flags.accel && rep->sline.status != HTTP_FORBIDDEN &&
Index: squid3/src/client_side_request.cc
diff -u squid3/src/client_side_request.cc:1.45 squid3/src/client_side_request.cc:1.33.2.6
--- squid3/src/client_side_request.cc:1.45	Wed Jan 11 19:13:42 2006
+++ squid3/src/client_side_request.cc	Sat Jan 21 12:37:59 2006
@@ -269,7 +269,7 @@
 int				/* returns nonzero on failure */
 clientBeginRequest(method_t method, char const *url, CSCB * streamcallback,
                    CSD * streamdetach, ClientStreamData streamdata, HttpHeader const *header,
-                   char *tailbuf, size_t taillen)
+                   char *tailbuf, size_t taillen, ClientHttpRequest::flags_type flags, in_addr addr, int prefetch_recursion_depth)
 {
     size_t url_sz;
     HttpVersion http_ver (1, 0);
@@ -289,9 +289,7 @@
     /* make it visible in the 'current acctive requests list' */
     dlinkAdd(http, &http->active, &ClientActiveRequests);
     /* Set flags */
-    /* internal requests only makes sense in an
-     * accelerator today. TODO: accept flags ? */
-    http->flags.accel = 1;
+    http->flags = flags;
     /* allow size for url rewriting */
     url_sz = strlen(url) + Config.appendDomainLen + 5;
     http->uri = (char *)xcalloc(url_sz, 1);
@@ -303,7 +301,7 @@
     }
 
     /*
-     * now update the headers in request with our supplied headers. urLParse
+     * now update the headers in request with our supplied headers. urlParse
      * should return a blank header set, but we use Update to be sure of
      * correctness.
      */
@@ -332,7 +330,7 @@
     /* Internally created requests cannot have bodies today */
     request->content_length = 0;
 
-    request->client_addr = no_addr;
+    request->client_addr = addr;
 
     request->my_addr = no_addr;	/* undefined for internal requests */
 
@@ -340,6 +338,10 @@
 
     request->http_ver = http_ver;
 
+#if USE_HTMLPREFETCH
+    request->recursion_depth = prefetch_recursion_depth;
+#endif
+
     http->request = requestLink(request);
 
     /* optional - skip the access check ? */
Index: squid3/src/client_side_request.h
diff -u squid3/src/client_side_request.h:1.20 squid3/src/client_side_request.h:1.17.6.5
--- squid3/src/client_side_request.h:1.20	Mon Nov 21 19:13:08 2005
+++ squid3/src/client_side_request.h	Sat Jan 21 12:37:59 2006
@@ -48,7 +48,6 @@
 #endif
 
 /* client_side_request.c - client side request related routines (pure logic) */
-extern int clientBeginRequest(method_t, char const *, CSCB *, CSD *, ClientStreamData, HttpHeader const *, char *, size_t);
 
 class MemObject;
 
@@ -106,7 +105,7 @@
     struct timeval start;
     AccessLogEntry al;
 
-    struct
+    struct flags_type
     {
 
 unsigned int accel:
@@ -165,6 +164,8 @@
 #endif
 };
 
+extern int clientBeginRequest(method_t, char const *, CSCB *, CSD *, ClientStreamData, HttpHeader const *, char *, size_t, ClientHttpRequest::flags_type, in_addr = no_addr, int prefetch_recursion_depth = 0);
+
 /* client http based routines */
 SQUIDCEXTERN char *clientConstructTraceEcho(ClientHttpRequest *);
 SQUIDCEXTERN ACLChecklist *clientAclChecklistCreate(const acl_access * acl,ClientHttpRequest * http);
Index: squid3/src/http.cc
diff -u squid3/src/http.cc:1.71 squid3/src/http.cc:1.43.2.9
--- squid3/src/http.cc:1.71	Thu Jan 19 19:13:56 2006
+++ squid3/src/http.cc	Sat Jan 21 12:37:59 2006
@@ -35,7 +35,7 @@
 
 /*
  * Anonymizing patch by lutz@as-node.jena.thur.de
- * have a look into http-anon.c to get more informations.
+ * have a look into http-anon.c to get more information.
  */
 
 #include "squid.h"
@@ -58,6 +58,10 @@
 extern ICAPConfig TheICAPConfig;
 #endif
 
+#include <memory>
+
+using namespace std;
+
 CBDATA_CLASS_INIT(HttpStateData);
 
 static const char *const crlf = "\r\n";
@@ -422,6 +426,8 @@
     }
 
 #endif
+
+    // TODO: prefetch from HDR_LINK if applicable.
 }
 
 int
@@ -1084,6 +1090,8 @@
         while (len > 0 && isspace(*buf))
             xmemmove(buf, buf + 1, len--);
 
+        debug(11, 5) ("httpReadReply: FD %d: stripped whitespace, now len=%d.\n", fd, (int)len);
+
         if (len == 0) {
             /* Continue to read... */
             /* Timeout NOT increased. This whitespace was from previous reply */
Index: squid3/src/http.h
diff -u squid3/src/http.h:1.16 squid3/src/http.h:1.10.2.6
--- squid3/src/http.h:1.16	Wed Jan  4 19:13:23 2006
+++ squid3/src/http.h	Sat Jan 21 12:37:59 2006
@@ -34,6 +34,7 @@
 #ifndef SQUID_HTTP_H
 #define SQUID_HTTP_H
 
+#include "MemBuf.h"
 #include "StoreIOBuffer.h"
 #include "comm.h"
 #include "forward.h"
Index: squid3/src/protos.h
diff -u squid3/src/protos.h:1.63 squid3/src/protos.h:1.48.2.5
--- squid3/src/protos.h:1.63	Tue Jan  3 19:13:23 2006
+++ squid3/src/protos.h	Sat Jan 21 12:37:59 2006
@@ -818,6 +818,10 @@
 SQUIDCEXTERN char *urlCanonicalClean(const HttpRequest *);
 SQUIDCEXTERN char *urlHostname(const char *url);
 SQUIDCEXTERN void urlExtMethodConfigure(void);
+SQUIDCEXTERN void urlParseRFC1808(const char *url,
+    char **fragment, char **scheme,
+    char **net_loc, char **query, char **params, char **path);
+SQUIDCEXTERN char *urlResolveRelative(const char *embedded, const char *base);
 
 SQUIDCEXTERN void useragentOpenLog(void);
 SQUIDCEXTERN void useragentRotateLog(void);
Index: squid3/src/squid.h
diff -u squid3/src/squid.h:1.21 squid3/src/squid.h:1.18.2.4
--- squid3/src/squid.h:1.21	Mon Nov 21 19:13:08 2005
+++ squid3/src/squid.h	Sat Jan 21 12:37:59 2006
@@ -37,6 +37,9 @@
 
 #include "config.h"
 
+#ifdef assert
+#undef assert
+#endif
 #if PURIFY
 #define assert(EX) ((void)0)
 #elif defined(NODEBUG)
Index: squid3/src/structs.h
diff -u squid3/src/structs.h:1.88 squid3/src/structs.h:1.65.2.4
--- squid3/src/structs.h:1.88	Tue Jan  3 19:13:24 2006
+++ squid3/src/structs.h	Sat Jan 21 12:37:59 2006
@@ -579,6 +579,12 @@
         int emailErrData;
         int httpd_suppress_version_string;
         int global_internal_static;
+#if USE_HTMLPREFETCH
+
+        int htmlPrefetch;
+
+#endif
+
     }
 
     onoff;
Index: squid3/src/url.cc
diff -u squid3/src/url.cc:1.12 squid3/src/url.cc:1.9.6.8
--- squid3/src/url.cc:1.12	Thu Jan 19 19:13:56 2006
+++ squid3/src/url.cc	Sat Jan 21 12:37:59 2006
@@ -869,3 +869,392 @@
         w = w->next;
     }
 }
+
+/*
+ * Splits url into the components named in RFC 1808 section 2.4.
+ *
+ * Each output argument receives a newly allocated string that the caller
+ * must release with xfree(); a component that is absent from url is
+ * returned as the empty string, never as NULL. url itself is not changed.
+ *
+ * fragment, query and params hold the text after the first '#', '?' and
+ * ';' respectively; scheme excludes its ':' and net_loc its leading "//";
+ * path keeps a leading '/'. When "//" is not followed by another '/', the
+ * whole remainder becomes net_loc and query, params and path stay empty,
+ * as section 2.4.3 requires.
+ */
+void
+urlParseRFC1808(const char *url,
+                char **fragment, char **scheme, char **net_loc,
+                char **query, char **params, char **path)
+{
+    // Each step below cuts its component off this private copy.
+    char *url_ = xstrdup(url);
+
+    /* 2.4.1. Parsing the Fragment Identifier
+     *
+     * If the parse string contains a crosshatch "#" character, then
+     * the substring after the first (left-most) crosshatch "#" and up
+     * to the end of the parse string is the <fragment> identifier. If
+     * the crosshatch is the last character, or no crosshatch is
+     * present, then the fragment identifier is empty. The matched
+     * substring, including the crosshatch character, is removed from
+     * the parse string before continuing.
+     *
+     * Note that the fragment identifier is not considered part of the
+     * URL. However, since it is often attached to the URL, parsers
+     * must be able to recognize and set aside fragment identifiers as
+     * part of the process.
+     */
+    char *p = strchr(url_, '#');
+
+    if (p) {
+        *p = 0;
+        *fragment = xstrdup(p + 1);
+    } else
+        *fragment = xstrdup("");
+
+    /* 2.4.2. Parsing the Scheme
+     *
+     * If the parse string contains a colon ":" after the first
+     * character and before any characters not allowed as part of a
+     * scheme name (i.e., any not an alphanumeric, plus "+", period
+     * ".", or hyphen "-"), the <scheme> of the URL is the substring
+     * of characters up to but not including the first colon. These
+     * characters and the colon are then removed from the parse string
+     * before continuing.
+     */
+    p = url_;
+
+    // The <ctype.h> classifiers are only defined for unsigned char values,
+    // and URLs taken from documents may contain bytes above 0x7f.
+    while (*p && (isalnum(static_cast<unsigned char>(*p)) ||
+                  *p == '+' || *p == '.' || *p == '-'))
+        p++;
+
+    // "After the first character": a leading colon does not introduce an
+    // (empty) scheme, it stays part of the path.
+    if (p != url_ && *p == ':') {
+        *p = 0;
+        p++;
+        *scheme = xstrdup(url_);
+    } else {
+        *scheme = xstrdup("");
+        p = url_;
+    }
+
+    /* 2.4.3. Parsing the Network Location/Login
+     *
+     * If the parse string begins with a double-slash "//", then the
+     * substring of characters after the double-slash and up to, but
+     * not including, the next slash "/" character is the network
+     * location/login (<net_loc>) of the URL. If no trailing slash "/"
+     * is present, the entire remaining parse string is assigned to
+     * <net_loc>. The double- slash and <net_loc> are removed from the
+     * parse string before continuing.
+     */
+    char *begin = p;
+
+    if (begin[0] == '/' && begin[1] == '/') {	// depends on short circuit
+        begin += 2;
+        p = strchr(begin, '/');
+        size_t len = p ? p - begin : strlen(begin);
+        *net_loc = static_cast<char *>(xcalloc(len + 1, sizeof(char)));
+        memcpy(*net_loc, begin, len);
+    } else
+        *net_loc = xstrdup("");
+
+    /* 2.4.4. Parsing the Query Information
+     *
+     * If the parse string contains a question mark "?" character,
+     * then the substring after the first (left-most) question mark
+     * "?" and up to the end of the parse string is the <query>
+     * information. If the question mark is the last character, or no
+     * question mark is present, then the query information is
+     * empty. The matched substring, including the question mark
+     * character, is removed from the parse string before continuing.
+     */
+    // NULL when <net_loc> swallowed the rest of the parse string.
+    begin = p;
+    p = begin ? strchr(begin, '?') : NULL;
+
+    if (p) {
+        *p = 0;
+        *query = xstrdup(p + 1);
+    } else
+        *query = xstrdup("");
+
+    /* 2.4.5.  Parsing the Parameters
+     *
+     * If the parse string contains a semicolon ";" character, then
+     * the substring after the first (left-most) semicolon ";" and up
+     * to the end of the parse string is the parameters (<params>).
+     * If the semicolon is the last character, or no semicolon is
+     * present, then <params> is empty.  The matched substring,
+     * including the semicolon character, is removed from the parse
+     * string before continuing.
+     */
+    p = begin ? strchr(begin, ';') : NULL;
+
+    if (p) {
+        *p = 0;
+        *params = xstrdup(p + 1);
+    } else
+        *params = xstrdup("");
+
+    /* 2.4.6.  Parsing the Path
+     *
+     * After the above steps, all that is left of the parse string is
+     * the URL <path> and the slash "/" that may precede it.  Even
+     * though the initial slash is not part of the URL path, the
+     * parser must remember whether or not it was present so that
+     * later processes can differentiate between relative and absolute
+     * paths.  Often this is done by simply storing the preceding
+     * slash along with the path.
+     */
+    *path = xstrdup(begin ? begin : "");
+
+    xfree(url_);
+}
+
+/*
+ * Collapses the "." and ".." segments of path in place (RFC 1808 section 4,
+ * step 6 a-d). A ".." removes the segment in front of it; when an absolute
+ * path has nothing left to remove the ".." is dropped, so the result never
+ * climbs above the root, while a relative path keeps such a leading "..".
+ * A trailing "." or ".." leaves the preceding slash: "/a/b/.." is "/a/".
+ */
+static void
+urlCollapseDotSegments(char *path)
+{
+    const char *in = path;      // next segment to examine
+    char *out = path;           // end of the collapsed prefix, never ahead of in
+
+    if (*in == '/') {
+        in++;
+        out++;
+    }
+
+    const bool absolute = (out != path);
+    char *const first = out;    // where the first segment starts
+
+    while (*in) {
+        const char *slash = strchr(in, '/');
+        const size_t len = slash ? slash - in : strlen(in);
+        bool keep = true;
+
+        if (len == 1 && in[0] == '.') {
+            keep = false;
+        } else if (len == 2 && in[0] == '.' && in[1] == '.') {
+            if (out == first) {
+                // Clamp at the root; a relative path keeps its leading "..".
+                keep = !absolute;
+            } else {
+                // out[-1] is the slash that closes the previous segment.
+                char *prev = out - 1;
+
+                while (prev != first && prev[-1] != '/')
+                    prev--;
+
+                // A ".." that was kept earlier cannot be removed either.
+                if (!(out - prev == 3 && prev[0] == '.' && prev[1] == '.')) {
+                    out = prev;
+                    keep = false;
+                }
+            }
+        }
+
+        if (keep) {
+            memmove(out, in, len);
+            out += len;
+
+            if (slash)
+                *out++ = '/';
+        }
+
+        in += len;
+
+        if (slash)
+            in++;
+    }
+
+    *out = 0;
+}
+
+/*
+ * Resolves embedded against base following RFC 1808 section 4. Returns the
+ * absolute URL as a newly allocated string; the caller must xfree() it.
+ *
+ * This isn't *exactly* RFC 1808, but it's very close. Specifically, it
+ * won't inherit param/query/fragment from the base into the embedded URL,
+ * and it drops ".." segments that would climb above the root of the base
+ * path instead of emitting them. Neither of these deviations is dangerous.
+ */
+char *
+urlResolveRelative(const char *embedded, const char *base)
+{
+    char *base_scheme, *base_host, *base_path, *base_params,
+    *base_query, *base_fragment;
+    char *embd_scheme, *embd_host, *embd_path, *embd_params,
+    *embd_query, *embd_fragment;
+
+    /* Step 1: The base URL is established according to the rules of
+     * Section 3.  If the base URL is the empty string (unknown), the
+     * embedded URL is interpreted as an absolute URL and we are done.
+     */
+
+    if (strlen(base) == 0)
+        return xstrdup(embedded);
+
+    /* Step 2: Both the base and embedded URLs are parsed into their
+     * component parts as described in Section 2.4.
+     *
+     * a) If the embedded URL is entirely empty, it inherits the
+     * entire base URL (i.e., is set equal to the base URL) and we are
+     * done.
+     */
+    if (strlen(embedded) == 0)
+        return xstrdup(base);
+
+    /* b) If the embedded URL starts with a scheme name, it is
+     * interpreted as an absolute URL and we are done.
+     */
+    urlParseRFC1808(embedded, &embd_fragment, &embd_scheme, &embd_host,
+                    &embd_query, &embd_params, &embd_path);
+
+    if (strlen(embd_scheme) != 0) {
+        xfree(embd_scheme);
+        xfree(embd_host);
+        xfree(embd_path);
+        xfree(embd_params);
+        xfree(embd_query);
+        xfree(embd_fragment);
+
+        return xstrdup(embedded);
+    }
+
+    /* c) Otherwise, the embedded URL inherits the scheme of the base
+     * URL.
+     */
+    urlParseRFC1808(base, &base_fragment, &base_scheme, &base_host,
+                    &base_query, &base_params, &base_path);
+
+    xfree(embd_scheme);
+    embd_scheme = xstrdup(base_scheme);
+
+    // A base URL without a path (eg., "http://host") is resolved against
+    // the root. The parser hands back an allocated empty string in that
+    // case, so release it before replacing it.
+    if (!*base_path) {
+        xfree(base_path);
+        base_path = xstrdup("/");
+    }
+
+    /* Step 3: If the embedded URL's <net_loc> is non-empty, we skip
+     * to Step 7.  Otherwise, the embedded URL inherits the <net_loc>
+     * (if any) of the base URL.
+     */
+    if (strlen(embd_host) == 0) {
+        xfree(embd_host);
+        embd_host = xstrdup(base_host);
+
+        /* Step 4: If the embedded URL path is preceded by a slash
+         * "/", the path is not relative and we skip to Step 7.
+         */
+
+        if (embd_path[0] != '/') {
+            /* Step 5: If the embedded URL path is empty (and not
+             * preceded by a slash), then the embedded URL inherits
+             * the base URL path.
+             */
+            // The RFC goes on to inherit <params> and <query>,
+            // however that sounds bogus to me, and Mozilla agrees, so
+            // we don't do that.
+
+            if (strlen(embd_path) == 0) {
+                xfree(embd_path);
+                embd_path = xstrdup(base_path);
+            } else {
+                /* Step 6: The last segment of the base URL's path
+                 * (anything following the rightmost slash "/", or the
+                 * entire path if no slash is present) is removed and
+                 * the embedded URL's path is appended in its place.
+                 * The "." and ".." segments of the new path are then
+                 * collapsed (steps a to d).
+                 */
+                const char *last_slash = strrchr(base_path, '/');
+                const size_t dir_len =
+                    last_slash ? last_slash - base_path + 1 : 0;
+                char *new_path =
+                    static_cast<char *>(
+                        xcalloc(dir_len + strlen(embd_path) + 1,
+                                sizeof(char)));
+
+                memcpy(new_path, base_path, dir_len);
+                strcpy(new_path + dir_len, embd_path);
+
+                urlCollapseDotSegments(new_path);
+
+                xfree(embd_path);
+                embd_path = new_path;
+            }
+        }
+    }
+
+    /* Step 7: The resulting URL components, including any inherited
+     * from the base URL, are recombined to give the absolute form of
+     * the embedded URL.
+     */
+    char *new_url =
+        static_cast<char *>(
+            xcalloc(strlen(embd_scheme) + strlen(embd_host) +
+                    strlen(embd_path) + strlen(embd_params) +
+                    strlen(embd_query) + strlen(embd_fragment) + 8,
+                    sizeof(char)));
+
+    if (strlen(embd_scheme)) {
+        strcat(new_url, embd_scheme);
+        strcat(new_url, ":");
+    }
+
+    if (strlen(embd_host)) {
+        strcat(new_url, "//");
+        strcat(new_url, embd_host);
+
+        if (strlen(embd_path) && embd_path[0] != '/')
+            strcat(new_url, "/");
+    }
+
+    strcat(new_url, embd_path);
+
+    if (strlen(embd_params)) {
+        strcat(new_url, ";");
+        strcat(new_url, embd_params);
+    }
+
+    if (strlen(embd_query)) {
+        strcat(new_url, "?");
+        strcat(new_url, embd_query);
+    }
+
+    if (strlen(embd_fragment)) {
+        strcat(new_url, "#");
+        strcat(new_url, embd_fragment);
+    }
+
+    xfree(base_scheme);
+    xfree(base_host);
+    xfree(base_path);
+    xfree(base_params);
+    xfree(base_query);
+    xfree(base_fragment);
+
+    xfree(embd_scheme);
+    xfree(embd_host);
+    xfree(embd_path);
+    xfree(embd_params);
+    xfree(embd_query);
+    xfree(embd_fragment);
+
+    return new_url;
+}
