--------------------- PatchSet 1188 Date: 2005/01/13 03:53:04 Author: nlewycky Branch: prefetching Tag: (none) Log: Refactor the stream responsible for fetching into a separate file. Members: src/HTMLPrefetchParser.cc:1.2->1.2.2.1 src/HTMLPrefetchParser.h:1.2->1.2.2.1 src/Makefile.am:1.57.2.1->1.57.2.2 src/PrefetchStream.cc:1.1->1.1.2.1 src/PrefetchStream.h:1.1->1.1.2.1 --- /dev/null Wed Feb 14 13:33:00 2007 +++ squid3/src/HTMLPrefetchParser.cc Wed Feb 14 13:34:09 2007 @@ -0,0 +1,347 @@ + +/* + * $Id: HTMLPrefetchParser.cc,v 1.2.2.1 2005/01/13 03:53:04 nlewycky Exp $ + * + * DEBUG: section 93 HTML parsing and fetching + * AUTHOR: Nick Lewycky + * + * SQUID Web Proxy Cache http://www.squid-cache.org/ + * ---------------------------------------------------------- + * + * Squid is the result of efforts by numerous individuals from + * the Internet community; see the CONTRIBUTORS file for full + * details. Many organizations have provided support for Squid's + * development; see the SPONSORS file for full details. Squid is + * Copyrighted (C) 2001 by the Regents of the University of + * California; see the COPYRIGHT file for full details. Squid + * incorporates software developed and/or copyrighted by other + * sources; see the CREDITS file for full details. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA. 
+ * + * Copyright (c) 2004, Nick Lewycky + * + */ + +#include +#include + +#include + +#include "squid.h" +#include "Debug.h" +#include "HttpReply.h" +#include "HttpRequest.h" +#include "HTMLPrefetchParser.h" +#include "http.h" +#include "PrefetchStream.h" +#include "protos.h" + +using namespace std; + +HTMLPrefetchParser::HTMLPrefetchParser(const HttpStateData * state) + : + stateData(state), + parser(NULL) +{} + +HTMLPrefetchParser::~HTMLPrefetchParser() +{ + if (parser) + htmlFreeParserCtxt(parser); +} + +void +HTMLPrefetchParser::init() +{ + if (parser) + return; + + parser = htmlCreatePushParserCtxt(&handler, static_cast(this), + NULL, 0, NULL, XML_CHAR_ENCODING_NONE); + + if (!parser) + throw runtime_error("Unable to create parser."); + + xmlSubstituteEntitiesDefault(1); + + relative_url = urlCanonical(stateData->orig_request); + + debugs(93, 1, "analyzing " << relative_url); +} + +inline void +HTMLPrefetchParser::prefetch(const string &url) +{ + PrefetchStream::prefetch(url, relative_url, stateData->orig_request); +} + +void +HTMLPrefetchParser::real_parse(const char *document, size_t len, bool partial) +{ + debugs(93, 1, "parse to chunk, " << len << " long " << (partial?"partial":"")); + debugs(93, 1, "chunk is: " << string(document, len)); + htmlParseChunk(parser, document, len, partial ? 1 : 0); +} + +/* This code is ugly for two reasons. One is the need to inter-operate + * with a C library. The other is a need for high-performance leading + * to lots of short circuiting. 
+ */ +void +HTMLPrefetchParser::start_element_handler(void *userData, + const xmlChar * xname, const xmlChar ** xattr) +{ + const char *name = reinterpret_cast(xname); + const char **attr = reinterpret_cast(xattr); + + if (!name || !attr) + return; + + HTMLPrefetchParser *self = static_cast(userData); + + // Example: + // + if (strcasecmp(name, "img") == 0) { + while (*attr) { + if (strcasecmp(name, "img") == 0 && + strcasecmp(*attr, "src") == 0) { + self->prefetch(*(attr + 1)); + return; + } + + attr += 2; + } + + return; + } + + // Example: + // + // + bool is_fetchable = false; + string href; + if (strcasecmp(name, "link") == 0) { + while (*attr) { + if (strcasecmp(*attr, "href") == 0) { + if (is_fetchable) { + self->prefetch(*(attr + 1)); + return; + } else + href = *(attr + 1); + } else if (strcasecmp(*attr, "rel") == 0) { + if (strcasecmp(*(attr + 1), "stylesheet") != 0 && + strcasecmp(*(attr + 1), "prefetch") != 0) + return; + + if (!href.empty()) { + self->prefetch(*(attr + 1)); + return; + } else + is_fetchable = true; + } + + attr += 2; + } + + return; + } + + // Example: + // + bool is_httpequiv = false; + + if (strcasecmp(name, "meta") == 0) { + while (*attr) { + if (strcasecmp(*attr, "content") == 0) { + // TODO: replace this with some incantation of httpHeaderParse? + + /* From RFC 2068: + * + * Link = "Link" ":" #("<" URI ">" *( ";" link-param ) + * + * link-param = ( ( "rel" "=" relationship ) + * | ( "rev" "=" relationship ) + * | ( "title" "=" quoted-string ) + * | ( "anchor" "=" <"> URI <"> ) + * | ( link-extension ) ) + * + * link-extension = token [ "=" ( token | quoted-string ) ] + * + * relationship = sgml-name + * | ( <"> sgml-name *( SP sgml-name) <"> ) + * + * sgml-name = ALPHA *( ALPHA | DIGIT | "." | "-" ) + */ + + const char *p1 = *(attr + 1), *p2; + + if (!p1) + return; // eg. 
+ + while (*p1 && *p1 == ' ') + p1++; + + if (*p1 == '<') + p1++; + else + return; // malformed (no leading '<') + + p2 = p1; + + while (*p2 && *p2 != '>' && strncasecmp(p2, ">", 4) != 0) + p2++; + + if (!*p2) + return; // malformed (end of string before '>') + + href = string(p1, p2); + + p2++; // skip over closing '>' + + while (*p2 == ';' && !is_fetchable) { + if (!*p2 == ';') + return; + + p2++; // skip over separating ';' + + while (*p2 && *p2 == ' ') + p2++; + + if (strncasecmp(p2, "rel", 3) == 0) { + p2 += 3; // skip over 'rel' + + while (*p2 == ' ') + p2++; + + if (*p2 != '=') + return; // malformed (no '=' in link-param) + + p2++; // skip over '=' + + while (*p2 == ' ') + p2++; + + bool is_quoted = *p2 == '"'; + + if (is_quoted) + p2++; + + if (strncasecmp(p2, "prefetch", 8) == 0) { + p2 += 8; + + if (is_quoted && *p2 != '"') + return; + else { + if (is_httpequiv) { + self->prefetch(href); + return; + } else + is_fetchable = true; + } + } else + return; + } else { + while (*p2 && *p2 != '=') + p2++; + + p2++; + + bool is_quoted = *p2 == '"'; + + if (is_quoted) { + while (*p2 && *p2 != '"') + p2++; + + while (*p2 == ' ') + p2++; + + p2++; + } else + while (*p2 && *p2 != ' ') + p2++; + } + } + + if (!is_fetchable) + return; // no [further] link-param, hence no rel=prefetch + + } else if (strcasecmp(*attr, "http-equiv") == 0 && + strcasecmp(*(attr + 1), "link") == 0) { + if (!href.empty()) { + self->prefetch(href); + return; + } else + is_httpequiv = true; + } + + attr += 2; + } + + return; + } + + // Example: + // + if (strcasecmp(name, "base") == 0) { + while (*attr) { + if (strcasecmp(*attr, "href") == 0) { + self->relative_url = *(attr + 1); + return; + } + + attr += 2; + } + + return; + } +} + +htmlSAXHandler +HTMLPrefetchParser::handler = { + NULL, // internal subset + NULL, // is standalone + NULL, // has internal subset + NULL, // has external subset + NULL, // resolve entity + NULL, // get entity + NULL, // entity declaration + NULL, // notation 
declaration + NULL, // attribute declaration + NULL, // element declaraion + NULL, // unparsed entity + NULL, // set document locator + NULL, // start document + NULL, // end document + HTMLPrefetchParser::start_element_handler, // start element + NULL, // end element + NULL, // reference + NULL, // characters + NULL, // ignorable whitespace + NULL, // processing instruction + NULL, // comment + NULL, // warning + NULL, // error + NULL, // fatal error + NULL, // get parameter entity + NULL, // cdata block + NULL, // external subset + XML_SAX2_MAGIC, + NULL, // private + NULL, // start element namespace + NULL, // end element namespace + NULL // xml structured error +}; + --- /dev/null Wed Feb 14 13:33:00 2007 +++ squid3/src/HTMLPrefetchParser.h Wed Feb 14 13:34:09 2007 @@ -0,0 +1,48 @@ +#ifndef SQUID_HTMLPREFETCHPARSER_H +#define SQUID_HTMLPREFETCHPARSER_H + +#include +#include + +#include +#include + +#include "defines.h" +#include "clientStream.h" +#include "Store.h" +#include "StoreClient.h" + +class HttpStateData; + +class HTMLPrefetchParser +{ +public: + explicit HTMLPrefetchParser(const HttpStateData *); + ~HTMLPrefetchParser(); + + void init(); + + inline void parse(const char *document, size_t len, bool partial) + { + if (!parser) + return; + real_parse(document, len, partial); + } + +private: + void prefetch(const std::string &); + + void real_parse(const char *document, size_t len, bool partial); + + static void start_element_handler(void *, const xmlChar *, + const xmlChar **); + + const HttpStateData *stateData; + + htmlParserCtxtPtr parser; + std::string relative_url; + + static htmlSAXHandler handler; +}; + +#endif // header guard Index: squid3/src/Makefile.am =================================================================== RCS file: /cvsroot/squid-sf//squid3/src/Makefile.am,v retrieving revision 1.57.2.1 retrieving revision 1.57.2.2 diff -u -r1.57.2.1 -r1.57.2.2 --- squid3/src/Makefile.am 12 Jan 2005 21:58:27 -0000 1.57.2.1 +++ 
squid3/src/Makefile.am 13 Jan 2005 03:53:04 -0000 1.57.2.2 @@ -1,7 +1,7 @@ # # Makefile for the Squid Object Cache server # -# $Id: Makefile.am,v 1.57.2.1 2005/01/12 21:58:27 nlewycky Exp $ +# $Id: Makefile.am,v 1.57.2.2 2005/01/13 03:53:04 nlewycky Exp $ # # Uncomment and customize the following to suit your needs: # @@ -94,7 +94,9 @@ HTMLPREFETCH_ALL_SOURCE = \ HTMLPrefetchParser.cc \ - HTMLPrefetchParser.h + HTMLPrefetchParser.h \ + PrefetchStream.cc \ + PrefetchStream.h if USE_HTMLPREFETCH HTMLPREFETCH_SOURCE = $(HTMLPREFETCH_ALL_SOURCE) else @@ -983,7 +985,6 @@ @AUTH_LINKOBJS@ \ @AUTH_OBJS@ \ @SQUID_CPPUNIT_LA@ - STORE_TEST_SOURCES=\ $(TESTSOURCES) \ --- /dev/null Wed Feb 14 13:33:00 2007 +++ squid3/src/PrefetchStream.cc Wed Feb 14 13:34:09 2007 @@ -0,0 +1,216 @@ + +/* + * $Id: PrefetchStream.cc,v 1.1.2.1 2005/01/13 03:53:05 nlewycky Exp $ + * + * DEBUG: section 93 HTML parsing and fetching + * AUTHOR: Nick Lewycky + * + * SQUID Web Proxy Cache http://www.squid-cache.org/ + * ---------------------------------------------------------- + * + * Squid is the result of efforts by numerous individuals from + * the Internet community; see the CONTRIBUTORS file for full + * details. Many organizations have provided support for Squid's + * development; see the SPONSORS file for full details. Squid is + * Copyrighted (C) 2001 by the Regents of the University of + * California; see the COPYRIGHT file for full details. Squid + * incorporates software developed and/or copyrighted by other + * sources; see the CREDITS file for full details. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA. + * + * Copyright (c) 2004, Nick Lewycky + * + */ + +#include + +#include "squid.h" +#include "client_side_request.h" +#include "Debug.h" +#include "HttpReply.h" +#include "HttpRequest.h" +#include "PrefetchStream.h" + +using namespace std; + +CBDATA_CLASS_INIT(PrefetchStream); + +void +PrefetchStream::prefetch(const string &href, const string &relative_url, + const HttpRequest *request) +{ + char *url = urlResolveRelative(href.c_str(), relative_url.c_str()); + + if (!url) { + debugs(93, 1, "Unable to resolve " << href << + " against relative " << relative_url); + return; + } + + { + char *f, *s, *n, *q, *w, *p; + urlParseRFC1808(url, &f, &s, &n, &q, &w, &p); + xfree(f); + xfree(q); + xfree(w); + + if (!strlen(s) || !strlen(n) || p[0] != '/') { + // not absolute. + xfree(s); + xfree(n); + xfree(p); + xfree(url); + debugs(93, 1, "Refusing to fetch " << href << + " against relative " << relative_url); + return; + } + + xfree(s); + xfree(n); + xfree(p); + } + + // Coalesce multiple request for identical URLs. There's a + // race between when the object is requested and when it's entered + // into the cache. Prefetching will tend to aggravate that problem. + // TODO: remove this on landing of the collapsed_forwarding branch. + if (pending.find(url) != pending.end()) { + debugs(93, 1, "Request of " << url << " coalesced."); + return; + } + pending.insert(url); + + // Refuse re-download negatively cached entries ... by refusing to + // fetch any entry found in the cache. 
+ if (storeGetPublic(url, METHOD_GET) != NULL) { + debugs(93, 1, "No need to prefetch " << url); + return; + } + + debugs(93, 1, "prefetching " << url); + + HttpHeader tempheaders(hoRequest); + tempheaders.update(&request->header, NULL); + tempheaders.removeConnectionHeaderEntries(); + httpHeaderDelById(&tempheaders, HDR_ACCEPT_RANGES); + httpHeaderDelById(&tempheaders, HDR_ETAG); + httpHeaderDelById(&tempheaders, HDR_CONTENT_LENGTH); + httpHeaderDelById(&tempheaders, HDR_CONTENT_MD5); + httpHeaderPutStr(&tempheaders, HDR_REFERER, urlCanonicalClean(request)); + // login information should be inherited through urlResolveRelative. + + PrefetchStream::Pointer stream = new PrefetchStream; + assert(stream.getRaw()); + stream->pending_element = pending.insert(url).first; + + // TODO: tag it "prefetch" for the delay pools. + + // XXX: performs client authentication against "no_addr" IP + // instead of the IP causing the prefetch. + if (clientBeginRequest(METHOD_GET, url, PrefetchStream::BufferData, + PrefetchStream::Detach, stream.getRaw(), &tempheaders, + stream->discard_buffer, /*256*/HTTP_REQBUF_SZ, false)) { + debugs(93, 1, "Failed to prefetch " << url); + return; + } + httpHeaderClean(&tempheaders); + + xfree(url); +} + +void +PrefetchStream::BufferData(clientStreamNode *node, ClientHttpRequest *req, + HttpReply *reply, StoreIOBuffer buffer) +{ + assert(cbdataReferenceValid(node)); + dynamic_cast(node->data.getRaw())->bufferData(node, req, reply, buffer); +} + +void +PrefetchStream::Detach(clientStreamNode *node, ClientHttpRequest *req) +{ + assert(cbdataReferenceValid(node)); + dynamic_cast(node->data.getRaw())->detach(node, req); +} + +void +PrefetchStream::bufferData(clientStreamNode *node, ClientHttpRequest *req, + HttpReply *reply, StoreIOBuffer buffer) +{ + debugs(93, 1, "bufferData"); + //debugs(93, 1, "reply " << reply << " body " << string(buffer.data, buffer.length)); + + assert(req); + + if (req->out.offset && reply) { + if (reply->sline.status != HTTP_OK) 
{ + debugs(93, 1, "Aborting on non-\"HTTP OK\""); + httpReplyDestroy(reply); + httpRequestFree(req); + return; + } + httpReplyDestroy(reply); + reply = NULL; + } + + if (buffer.data && buffer.length) + req->out.offset += buffer.length; + + if (!reply && !buffer.data && !buffer.length) { + debugs(93, 1, "EOF / Read error / aborted entry"); + httpRequestFree(req); + return; + } + + switch (clientStreamStatus(node, req)) { + case STREAM_UNPLANNED_COMPLETE: + debugs(93, 1, "stream_unplanned_complete"); + detach(node, req); + return; + case STREAM_COMPLETE: + debugs(93, 1, "stream_complete"); + detach(node, req); + return; + case STREAM_FAILED: + debugs(93, 1, "stream_failed"); + detach(node, req); + return; + case STREAM_NONE: + debugs(93, 1, "stream_none"); + break; + default: + debugs(93, 1, "default"); + return; + } + + debugs(93, 1, "read"); + clientStreamRead(node, req, buffer); + debugs(93, 1, "after read"); +} + +void +PrefetchStream::detach(clientStreamNode *node, ClientHttpRequest *req) +{ + debugs(93, 1, "detach"); + pending.erase(pending_element); + clientStreamDetach(node, req); +} + +set +PrefetchStream::pending; + +char +PrefetchStream::discard_buffer[HTTP_REQBUF_SZ]; --- /dev/null Wed Feb 14 13:33:00 2007 +++ squid3/src/PrefetchStream.h Wed Feb 14 13:34:09 2007 @@ -0,0 +1,41 @@ +#ifndef SQUID_PREFETCHSTREAM_H +#define SQUID_PREFETCHSTREAM_H + +#include +#include + +#include "defines.h" +#include "clientStream.h" +#include "Store.h" +#include "StoreClient.h" + +class HttpReply; +class HttpRequest; + +class PrefetchStream : public RefCountable +{ +public: + typedef RefCount Pointer; + + static void BufferData(clientStreamNode *, ClientHttpRequest *, + HttpReply *, StoreIOBuffer); + static void Detach(clientStreamNode *, ClientHttpRequest *); + + static char discard_buffer[HTTP_REQBUF_SZ]; + std::set::iterator pending_element; + + static void prefetch(const std::string &, const std::string &, + const HttpRequest *); + + // used for coalescing identical 
requests + static std::set<std::string> pending; + +private: + void bufferData(clientStreamNode *, ClientHttpRequest *, + HttpReply *, StoreIOBuffer); + void detach(clientStreamNode *, ClientHttpRequest *); + + CBDATA_CLASS2(PrefetchStream); +}; + +#endif // header guard