--------------------- PatchSet 1201 Date: 2005/01/26 19:51:33 Author: nlewycky Branch: prefetching Tag: (none) Log: Moved analyzer out of HttpStateData into a clientStream in clientReplyContext. Don't try to analyze encoded content. Members: src/HTMLAnalysisStream.cc:1.1->1.1.2.1 src/HTMLAnalysisStream.h:1.1->1.1.2.1 src/HTMLPrefetchParser.cc:1.2.2.1->1.2.2.2(DEAD) src/HTMLPrefetchParser.h:1.2.2.1->1.2.2.2(DEAD) src/Makefile.am:1.57.2.2->1.57.2.3 src/PrefetchStream.cc:1.1.2.1->1.1.2.2 src/client_side_reply.cc:1.57->1.57.2.1 src/http.cc:1.43.2.1->1.43.2.2 src/http.h:1.10.2.1->1.10.2.2 --- /dev/null Wed Feb 14 13:33:00 2007 +++ squid3/src/HTMLAnalysisStream.cc Wed Feb 14 13:34:10 2007 @@ -0,0 +1,383 @@ + +/* + * $Id: HTMLAnalysisStream.cc,v 1.1.2.1 2005/01/26 19:51:33 nlewycky Exp $ + * + * DEBUG: section 93 HTML parsing and fetching + * AUTHOR: Nick Lewycky + * + * SQUID Web Proxy Cache http://www.squid-cache.org/ + * ---------------------------------------------------------- + * + * Squid is the result of efforts by numerous individuals from + * the Internet community; see the CONTRIBUTORS file for full + * details. Many organizations have provided support for Squid's + * development; see the SPONSORS file for full details. Squid is + * Copyrighted (C) 2001 by the Regents of the University of + * California; see the COPYRIGHT file for full details. Squid + * incorporates software developed and/or copyrighted by other + * sources; see the CREDITS file for full details. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
+ *
+ * Copyright (c) 2004, Nick Lewycky
+ *
+ */
+/*
+ * NOTE(review): the three bare #include directives below carry no header
+ * name in this copy of the patch; restore them from the original revision.
+ */
+#include
+#include
+
+#include
+
+#include "squid.h"
+#include "Debug.h"
+#include "HttpReply.h"
+#include "HttpRequest.h"
+#include "HTMLAnalysisStream.h"
+#include "http.h"
+#include "PrefetchStream.h"
+#include "protos.h"
+
+using namespace std;
+
+CBDATA_CLASS_INIT(HTMLAnalysisStream);
+
+/*
+ * Client-stream callback: passes (node, req, reply, buffer) on through
+ * clientStreamCallback() first, then hands body bytes to the analyzer
+ * stored in node->data.
+ *
+ * The bytes parsed are taken from node->next()->readBuffer, not from the
+ * 'buffer' argument. When 'reply' is non-NULL, reply->hdr_sz is added to
+ * the data pointer and subtracted from the length. The parse() call always
+ * passes 1 as its third ('partial') argument.
+ *
+ * NOTE: this does get called after detach. In that case the cast of
+ * node->data yields NULL and the function returns without parsing.
+ *
+ * NOTE(review): the dynamic_cast below has no target type in this copy of
+ * the patch (presumably HTMLAnalysisStream * -- confirm against the
+ * original revision).
+ * NOTE(review): nothing here guards against readBuffer.length being
+ * smaller than reply->hdr_sz; confirm the subtraction cannot wrap.
+ */
+void
+htmlBufferData(clientStreamNode *node, ClientHttpRequest *req,
+               HttpReply *reply, StoreIOBuffer buffer)
+{
+    assert(node->data.getRaw());
+    debugs(93, 1, "bufferData");
+    clientStreamCallback(node, req, reply, buffer); // forward before analyzing
+    HTMLAnalysisStream::Pointer self =
+        dynamic_cast(node->data.getRaw());
+    if (!self.getRaw())
+    {
+        /* detach time. but if it's not my node->data, whose is it? */
+        return;
+    }
+
+    //self->parse(buffer.data, buffer.length, 1 /*?*/);
+    self->parse(node->next()->readBuffer.data + (reply?reply->hdr_sz:0),
+                node->next()->readBuffer.length - (reply?reply->hdr_sz:0),
+                1);
+}
+/*
+ * Client-stream read hook: logs before and after, and forwards the read
+ * request via clientStreamRead() using node->next()->readBuffer as the
+ * buffer.
+ *
+ * NOTE(review): 'self' is assigned but never used in this function; its
+ * only other mention is the commented-out self->buffer argument.
+ * NOTE(review): the dynamic_cast has no target type in this copy of the
+ * patch -- confirm against the original revision.
+ */
+void
+htmlStreamRead(clientStreamNode *node, ClientHttpRequest *req)
+{
+    HTMLAnalysisStream::Pointer self =
+        dynamic_cast(node->data.getRaw());
+
+    debugs(93, 1, "before read");
+    clientStreamRead(node, req, /*self->buffer*/ node->next()->readBuffer);
+    debugs(93, 1, "after read");
+}
+/*
+ * Client-stream detach hook: logs "detach" and delegates to
+ * clientStreamDetach() with the same arguments.
+ */
+void
+htmlStreamDetach(clientStreamNode *node, ClientHttpRequest *req)
+{
+    debugs(93, 1, "detach");
+    clientStreamDetach(node, req);
+}
+/*
+ * Client-stream status hook: logs "status" and returns whatever
+ * clientStreamStatus() reports for the same arguments.
+ */
+clientStream_status_t
+htmlStreamStatus(clientStreamNode *node, ClientHttpRequest *req)
+{
+    debugs(93, 1, "status");
+    return clientStreamStatus(node, req);
+}
+/*
+ * Stores 'req' in 'request' and creates a libxml2 HTML push-parser context
+ * with &handler as the SAX handler and this object as the user data.
+ * Throws runtime_error("Unable to create parser.") when the context
+ * pointer comes back NULL. Afterwards calls xmlSubstituteEntitiesDefault(1)
+ * and records urlCanonicalClean(request) in relative_url.
+ *
+ * NOTE(review): the static_cast on 'this' has no target type in this copy
+ * of the patch (presumably void * -- confirm against the original
+ * revision).
+ */
+HTMLAnalysisStream::HTMLAnalysisStream(const HttpRequest *req)
+    :
+    request(req),
+    parser(htmlCreatePushParserCtxt(&handler, static_cast(this),
+                                    NULL, 0, NULL, XML_CHAR_ENCODING_NONE))
+{
+    if (!parser)
+        throw runtime_error("Unable to create parser.");
+
+    xmlSubstituteEntitiesDefault(1);
+
+    relative_url = urlCanonicalClean(request);
+
+    debugs(93, 1, "analyzing " << relative_url);
+}
+/*
+ * Destructor; the body is empty.
+ *
+ * NOTE(review): the parser context created in the constructor is not
+ * released here. Confirm it is freed elsewhere (libxml2 provides
+ * htmlFreeParserCtxt for this); otherwise it leaks per instance.
+ */
+HTMLAnalysisStream::~HTMLAnalysisStream()
+{
+}
+/*
+ * Forwards 'url' to PrefetchStream::prefetch() together with this
+ * object's relative_url and request.
+ */
+inline void
+HTMLAnalysisStream::prefetch(const string &url)
+{
+    PrefetchStream::prefetch(url, relative_url, request);
+}
+/*
+ * Feeds 'len' bytes starting at 'document' to the push parser via
+ * htmlParseChunk(). Two level-1 debug lines are written first: the chunk
+ * length with the partial flag, then the chunk contents, which are copied
+ * into a temporary string for that purpose.
+ *
+ * NOTE(review): 'partial' is passed (as 1 or 0) in the argument position
+ * that libxml2 documents as 'terminate' (last chunk) for htmlParseChunk.
+ * Confirm that marking a partial chunk as the terminating one is intended.
+ */
+void
+HTMLAnalysisStream::parse(const char *document, size_t len, bool partial)
+{
+    debugs(93, 1, "chunk to parse, " << len << " long " << (partial?"partial":""));
+    debugs(93, 1, "chunk is: " << string(document, len));
+    htmlParseChunk(parser, document, len, partial ? 1 : 0);
+}
+
+void
+HTMLAnalysisStream::start_element_handler(void *userData,
+        const xmlChar * xname, const xmlChar ** xattr)
+{
+    const char *name = reinterpret_cast(xname);
+    const char **attr = reinterpret_cast(xattr);
+
+    if (!name || !attr)
+        return;
+
+    HTMLAnalysisStream *self = static_cast(userData);
+
+    // Example:
+    //
+    //
+    //