---------------------
PatchSet 4261 
Date: 2007/04/15 13:32:04
Author: amosjeffries
Branch: squid3-ipv6
Tag: (none) 
Log:
Ported and updated new URL parser from squid 2.6 (by Adrian)
 - Added IPv6 URI advancements.
Fixes bug where sscanf would always mangle port detection on IPv6 format.

Members: 
	src/url.cc:1.9.8.8->1.9.8.9 

Index: squid3/src/url.cc
===================================================================
RCS file: /cvsroot/squid-sf//squid3/src/url.cc,v
retrieving revision 1.9.8.8
retrieving revision 1.9.8.9
diff -u -r1.9.8.8 -r1.9.8.9
--- squid3/src/url.cc	7 Apr 2007 11:33:34 -0000	1.9.8.8
+++ squid3/src/url.cc	15 Apr 2007 13:32:04 -0000	1.9.8.9
@@ -1,6 +1,6 @@
 
 /*
- * $Id: url.cc,v 1.9.8.8 2007/04/07 11:33:34 amosjeffries Exp $
+ * $Id: url.cc,v 1.9.8.9 2007/04/15 13:32:04 amosjeffries Exp $
  *
  * DEBUG: section 23    URL Parsing
  * AUTHOR: Duane Wessels
@@ -42,16 +42,16 @@
     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
     "abcdefghijklmnopqrstuvwxyz"
     "0123456789-._"
-#ifdef INET6
-    ":" /* FIXME : Maybe ] and [ also */
+#ifdef USE_IPV6
+    "\[:\]"
 #endif
     ;
 static const char valid_hostname_chars[] =
     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
     "abcdefghijklmnopqrstuvwxyz"
     "0123456789-."
-#ifdef INET6
-    ":" /* FIXME : Maybe ] and [ also */
+#ifdef USE_IPV6
+    "\[:\]"
 #endif
     ;
 
@@ -219,6 +219,14 @@
  * looked for.
  * The url is non const so that if its too long we can NULL-terminate it in place.
  */
+
+/*
+ * This routine parses a URL. It's assumed that the URL is complete -
+ * i.e., the end of the string is the end of the URL. Don't pass a partial
+ * URL here as this routine doesn't have any way of knowing whether
+ * it's partial or not (i.e., it handles the case of no trailing slash as
+ * being "end of host with implied path of /").
+ */
 HttpRequest *
 urlParse(method_t method, char *url, HttpRequest *request)
 {
@@ -231,6 +239,9 @@
     int port;
     protocol_t protocol = PROTO_NONE;
     int l;
+    int i;
+    const char *src;
+    char *dst;
     proto[0] = host[0] = urlpath[0] = login[0] = '\0';
 
     if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
@@ -239,45 +250,105 @@
         debug(23, 1) ("urlParse: URL too large (%d bytes)\n", l);
         return NULL;
     }
-
     if (method == METHOD_CONNECT) {
         port = CONNECT_PORT;
 
-#ifdef INET6
-        if (sscanf(url, "[%[^]]]:%d", host, &port) < 1)
-	    /* the next "if" is intended nested when INET6 */
-#endif
-	    if (sscanf(url, "%[^:]:%d", host, &port) < 1)
-		return NULL;
+        if (sscanf(url, "[%[^:]]:%d", host, &port) < 1)
+            if (sscanf(url, "%[^:]:%d", host, &port) < 1)
+                return NULL;
+
     } else if (!strncmp(url, "urn:", 4)) {
         return urnParse(method, url);
     } else {
+        /* Parse the URL: */
+        src = url;
+        i = 0;
+        /* Find first : - everything before is protocol */
+        for (i = 0, dst = proto; i < l && *src != ':'; i++, src++, dst++) {
+            *dst = *src;
+        }
+        if (i >= l)
+           return NULL;
+        *dst = '\0';
+
+        /* Then it's :// */
+        /* (XXX yah, I'm not checking we've got enough data left before checking the array..) */
+        if (*src != ':' || *(src + 1) != '/' || *(src + 2) != '/')
+            return NULL;
+        i += 3;
+        src += 3;
 
-	if (sscanf(url, "%[^:]://%[^/]%[^\r\n]", proto, host, urlpath) < 2)
-	    return NULL;
+        /* Then everything until first /; that's host (and port; which we'll look for here later) */
+        /* bug 1881: If we don't get a "/" then we imply it was there */
+        for (dst = host; i < l && *src != '/' && src != '\0'; i++, src++, dst++) {
+            *dst = *src;
+        }
 
-        protocol = urlParseProtocol(proto);
+        /* 
+         * We can't check for "i >= l" here because we could be at the end of the line
+         * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
+         * been -given- a valid URL and the path is just '/'.
+         */
+        if (i > l)
+            return NULL;
+        *dst = '\0';
 
+        /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
+        for (dst = urlpath; i < l && *src != '\r' && *src != '\n' && *src != '\0'; i++, src++, dst++) {
+            *dst = *src;
+        }
+
+        /* We -could- be at the end of the buffer here */
+        if (i > l)
+            return NULL;
+        /* If the URL path is empty we set it to be "/" */
+        if (dst == urlpath) {
+            *(dst++) = '/';
+        }
+        *dst = '\0';
+
+        protocol = urlParseProtocol(proto);
         port = urlDefaultPort(protocol);
 
-#ifdef INET6
-        if (sscanf(host, "[%[^]]]:%d", host, &port) < 2)
-	    /* the next "sscanf" is intended nested when INET6 */
-#endif
-	    sscanf(host, "%[^:]:%d", host, &port);  /* FIXME. Urgent ! */
-	/* 2003:800:45 */ /* Turns on 2003 800 */
-	/* TODO AYJ: a check for a non-hex character after the port specifier fixes the bug. */
-	/*		BUT need to check the RFC to see if any of teh hex chars is allowed there. */
-	/*		I have a sneaking suspicion it has to be NULL or / in valid URI's */
-
-	    /* Is there any login informaiton? */
-	if ((t = strrchr(host, '@'))) {
-	    strcpy((char *) login, (char *) host);
-	    t = strrchr(login, '@');
-	    *t = 0;
-	    strcpy((char *) host, t + 1);
-	}
-	
+        /* Is there any login information? (we should eventually parse it above) */
+        if ((t = strrchr(host, '@'))) {
+            strcpy((char *) login, (char *) host);
+            t = strrchr(login, '@');
+            *t = 0;
+            strcpy((char *) host, t + 1);
+        }
+
+        /* Is there any host information? (we should eventually parse it above) */
+        if(*host == '[') {
+            /* strip any IPA brackets. valid under IPv6 and IPv4 (although rare to non-existent) */
+            src = host; src++;
+            l = strlen(host);
+            i = 1;
+            for (dst = host; i < l && *src != ']' && *src != '\0'; i++, src++, dst++) {
+                *dst = *src;
+            }
+
+            /* we moved in-place, so truncate the actual hostname found */
+            *(dst++) = '\0';
+
+            /* skip ahead to either start of port, or original EOS */
+            while(*dst != '\0' && *dst != ':') dst++;
+            t = dst;
+        } else {
+            t = strrchr(host, ':');
+
+            if(t != strchr(host,':') ) {
+                /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when it's not. */
+                /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
+                /* therefore we MUST accept the case where they are not bracketed at all. */
+                t = NULL;
+            }
+        }
+
+        if (t && *t == ':') {
+            *t = '\0'; t++;
+            port = atoi(t);
+        }
     }
 
     for (t = host; *t; t++)
@@ -286,14 +357,11 @@
     if (stringHasWhitespace(host)) {
         if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
             t = q = host;
-
             while (*t) {
                 if (!xisspace(*t))
                     *q++ = *t;
-
                 t++;
             }
-
             *q = '\0';
         }
     }
@@ -303,19 +371,18 @@
         return NULL;
     }
 
-#if DONT_DO_THIS_IT_BREAKS_SEMANTIC_TRANSPARENCY
+    if (Config.appendDomain && !strchr(host, '.'))
+        strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1);
+
     /* remove trailing dots from hostnames */
     while ((l = strlen(host)) > 0 && host[--l] == '.')
         host[l] = '\0';
 
-    /* remove duplicate dots */
-    while ((t = strstr(host, "..")))
-        xmemmove(t, t + 1, strlen(t));
-
-#endif
-
-    if (Config.appendDomain && !strchr(host, '.'))
-        strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1);
+    /* reject duplicate or leading dots */
+    if (strstr(host, "..") || *host == '.') {
+        debug(23, 1) ("urlParse: Illegal hostname '%s'\n", host);
+        return NULL;
+    }
 
     if (port < 1 || port > 65535) {
         debug(23, 3) ("urlParse: Invalid port '%d'\n", port);
@@ -329,40 +396,30 @@
         debug(23, 0) ("urlParse: Deny access to port %d\n", port);
         return NULL;
     }
-
 #endif
+
     if (stringHasWhitespace(urlpath)) {
         debug(23, 2) ("urlParse: URI has whitespace: {%s}\n", url);
-
         switch (Config.uri_whitespace) {
-
         case URI_WHITESPACE_DENY:
             return NULL;
-
         case URI_WHITESPACE_ALLOW:
             break;
-
         case URI_WHITESPACE_ENCODE:
             t = rfc1738_escape_unescaped(urlpath);
             xstrncpy(urlpath, t, MAX_URL);
             break;
-
         case URI_WHITESPACE_CHOP:
             *(urlpath + strcspn(urlpath, w_space)) = '\0';
             break;
-
         case URI_WHITESPACE_STRIP:
-
         default:
             t = q = urlpath;
-
             while (*t) {
                 if (!xisspace(*t))
                     *q++ = *t;
-
                 t++;
             }
-
             *q = '\0';
         }
     }