--------------------- PatchSet 4261 Date: 2007/04/15 13:32:04 Author: amosjeffries Branch: squid3-ipv6 Tag: (none) Log: Ported and updated new URL parser from squid 2.6 (by Adrian) - Added IPv6 URI advancements. Fixes bug where sscanf would always mangle port detection on IPv6 format. Members: src/url.cc:1.9.8.8->1.9.8.9 Index: squid3/src/url.cc =================================================================== RCS file: /cvsroot/squid-sf//squid3/src/url.cc,v retrieving revision 1.9.8.8 retrieving revision 1.9.8.9 diff -u -r1.9.8.8 -r1.9.8.9 --- squid3/src/url.cc 7 Apr 2007 11:33:34 -0000 1.9.8.8 +++ squid3/src/url.cc 15 Apr 2007 13:32:04 -0000 1.9.8.9 @@ -1,6 +1,6 @@ /* - * $Id: url.cc,v 1.9.8.8 2007/04/07 11:33:34 amosjeffries Exp $ + * $Id: url.cc,v 1.9.8.9 2007/04/15 13:32:04 amosjeffries Exp $ * * DEBUG: section 23 URL Parsing * AUTHOR: Duane Wessels @@ -42,16 +42,16 @@ "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "0123456789-._" -#ifdef INET6 - ":" /* FIXME : Maybe ] and [ also */ +#ifdef USE_IPV6 + "\[:\]" #endif ; static const char valid_hostname_chars[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "0123456789-." -#ifdef INET6 - ":" /* FIXME : Maybe ] and [ also */ +#ifdef USE_IPV6 + "\[:\]" #endif ; @@ -219,6 +219,14 @@ * looked for. * The url is non const so that if its too long we can NULL-terminate it in place. */ + +/* + * This routine parses a URL. Its assumed that the URL is complete - + * ie, the end of the string is the end of the URL. Don't pass a partial + * URL here as this routine doesn't have any way of knowing whether + * its partial or not (ie, it handles the case of no trailing slash as + * being "end of host with implied path of /". + */ HttpRequest * urlParse(method_t method, char *url, HttpRequest *request) { @@ -231,6 +239,9 @@ int port; protocol_t protocol = PROTO_NONE; int l; + int i; + const char *src; + char *dst; proto[0] = host[0] = urlpath[0] = login[0] = '\0'; if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) { @@ -239,45 +250,105 @@ debug(23, 1) ("urlParse: URL too large (%d bytes)\n", l); return NULL; } - if (method == METHOD_CONNECT) { port = CONNECT_PORT; -#ifdef INET6 - if (sscanf(url, "[%[^]]]:%d", host, &port) < 1) - /* the next "if" is intended nested when INET6 */ -#endif - if (sscanf(url, "%[^:]:%d", host, &port) < 1) - return NULL; + if (sscanf(url, "[%[^:]]:%d", host, &port) < 1) + if (sscanf(url, "%[^:]:%d", host, &port) < 1) + return NULL; + } else if (!strncmp(url, "urn:", 4)) { return urnParse(method, url); } else { + /* Parse the URL: */ + src = url; + i = 0; + /* Find first : - everything before is protocol */ + for (i = 0, dst = proto; i < l && *src != ':'; i++, src++, dst++) { + *dst = *src; + } + if (i >= l) + return NULL; + *dst = '\0'; + + /* Then its :// */ + /* (XXX yah, I'm not checking we've got enough data left before checking the array..) */ + if (*src != ':' || *(src + 1) != '/' || *(src + 2) != '/') + return NULL; + i += 3; + src += 3; - if (sscanf(url, "%[^:]://%[^/]%[^\r\n]", proto, host, urlpath) < 2) - return NULL; + /* Then everything until first /; thats host (and port; which we'll look for here later) */ + /* bug 1881: If we don't get a "/" then we imply it was there */ + for (dst = host; i < l && *src != '/' && src != '\0'; i++, src++, dst++) { + *dst = *src; + } - protocol = urlParseProtocol(proto); + /* + * We can't check for "i >= l" here because we could be at the end of the line + * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've + * been -given- a valid URL and the path is just '/'. + */ + if (i > l) + return NULL; + *dst = '\0'; + /* Then everything from / (inclusive) until \r\n or \0 - thats urlpath */ + for (dst = urlpath; i < l && *src != '\r' && *src != '\n' && *src != '\0'; i++, src++, dst++) { + *dst = *src; + } + + /* We -could- be at the end of the buffer here */ + if (i > l) + return NULL; + /* If the URL path is empty we set it to be "/" */ + if (dst == urlpath) { + *(dst++) = '/'; + } + *dst = '\0'; + + protocol = urlParseProtocol(proto); port = urlDefaultPort(protocol); -#ifdef INET6 - if (sscanf(host, "[%[^]]]:%d", host, &port) < 2) - /* the next "sscanf" is intended nested when INET6 */ -#endif - sscanf(host, "%[^:]:%d", host, &port); /* FIXME. Urgent ! */ - /* 2003:800:45 */ /* Turns on 2003 800 */ - /* TODO AYJ: a check for a non-hex character after the port specifier fixes the bug. */ - /* BUT need to check the RFC to see if any of teh hex chars is allowed there. */ - /* I have a sneaking suspicion it has to be NULL or / in valid URI's */ - - /* Is there any login informaiton? */ - if ((t = strrchr(host, '@'))) { - strcpy((char *) login, (char *) host); - t = strrchr(login, '@'); - *t = 0; - strcpy((char *) host, t + 1); - } - + /* Is there any login information? (we should eventually parse it above) */ + if ((t = strrchr(host, '@'))) { + strcpy((char *) login, (char *) host); + t = strrchr(login, '@'); + *t = 0; + strcpy((char *) host, t + 1); + } + + /* Is there any host information? (we should eventually parse it above) */ + if(*host == '[') { + /* strip any IPA brackets. valid under IPv6 and IPv4 (although rare to non-existent) */ + src = host; src++; + l = strlen(host); + i = 1; + for (dst = host; i < l && *src != ']' && *src != '\0'; i++, src++, dst++) { + *dst = *src; + } + + /* we moved in-place, so truncate the actual hostname found */ + *(dst++) = '\0'; + + /* skip ahead to either start of port, or original EOS */ + while(*dst != '\0' && *dst != ':') dst++; + t = dst; + } else { + t = strrchr(host, ':'); + + if(t != strchr(host,':') ) { + /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */ + /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */ + /* therefore we MUST accept the case where they are not bracketed at all. */ + t = NULL; + } + } + + if (t && *t == ':') { + *t = '\0'; t++; + port = atoi(t); + } } for (t = host; *t; t++) @@ -286,14 +357,11 @@ if (stringHasWhitespace(host)) { if (URI_WHITESPACE_STRIP == Config.uri_whitespace) { t = q = host; - while (*t) { if (!xisspace(*t)) *q++ = *t; - t++; } - *q = '\0'; } } @@ -303,19 +371,18 @@ return NULL; } -#if DONT_DO_THIS_IT_BREAKS_SEMANTIC_TRANSPARENCY + if (Config.appendDomain && !strchr(host, '.')) + strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1); + /* remove trailing dots from hostnames */ while ((l = strlen(host)) > 0 && host[--l] == '.') host[l] = '\0'; - /* remove duplicate dots */ - while ((t = strstr(host, ".."))) - xmemmove(t, t + 1, strlen(t)); - -#endif - - if (Config.appendDomain && !strchr(host, '.')) - strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1); + /* reject duplicate or leading dots */ + if (strstr(host, "..") || *host == '.') { + debug(23, 1) ("urlParse: Illegal hostname '%s'\n", host); + return NULL; + } if (port < 1 || port > 65535) { debug(23, 3) ("urlParse: Invalid port '%d'\n", port); @@ -329,40 +396,30 @@ debug(23, 0) ("urlParse: Deny access to port %d\n", port); return NULL; } - #endif + if (stringHasWhitespace(urlpath)) { debug(23, 2) ("urlParse: URI has whitespace: {%s}\n", url); - switch (Config.uri_whitespace) { - case URI_WHITESPACE_DENY: return NULL; - case URI_WHITESPACE_ALLOW: break; - case URI_WHITESPACE_ENCODE: t = rfc1738_escape_unescaped(urlpath); xstrncpy(urlpath, t, MAX_URL); break; - case URI_WHITESPACE_CHOP: *(urlpath + strcspn(urlpath, w_space)) = '\0'; break; - case URI_WHITESPACE_STRIP: - default: t = q = urlpath; - while (*t) { if (!xisspace(*t)) *q++ = *t; - t++; } - *q = '\0'; } }