urlmon: Implemented canonicalization function for paths in opaque URIs.
author Thomas Mullaly <thomas.mullaly@gmail.com>
Sun, 25 Jul 2010 20:00:50 +0000 (16:00 -0400)
committer Alexandre Julliard <julliard@winehq.org>
Tue, 3 Aug 2010 09:02:29 +0000 (11:02 +0200)
dlls/urlmon/tests/uri.c
dlls/urlmon/uri.c

index 65fd4a33a6af7e8c4c19159a70a248af19eda2b5..5bf8ca28d49cbe7c5a10d23bfb32813642385441 100644 (file)
@@ -2659,6 +2659,208 @@ static const uri_properties uri_tests[] = {
             {URL_SCHEME_WILDCARD,S_OK,FALSE},
             {URLZONE_INVALID,E_NOTIMPL,FALSE}
         }
+    },
+    /* Forbidden characters are encoded for opaque known scheme types. */
+    {   "mailto:\"acco<|>unt@example.com\"", 0, S_OK, FALSE,
+        Uri_HAS_ABSOLUTE_URI|Uri_HAS_DISPLAY_URI|Uri_HAS_EXTENSION|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|
+        Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|Uri_HAS_HOST_TYPE|Uri_HAS_SCHEME,
+        TRUE,
+        {
+            {"mailto:%22acco%3C%7C%3Eunt@example.com%22",S_OK,TRUE},
+            {"",S_FALSE,FALSE},
+            {"mailto:%22acco%3C%7C%3Eunt@example.com%22",S_OK,TRUE},
+            {"",S_FALSE,FALSE},
+            {".com%22",S_OK,TRUE},
+            {"",S_FALSE,TRUE},
+            {"",S_FALSE,FALSE},
+            {"",S_FALSE,FALSE},
+            {"%22acco%3C%7C%3Eunt@example.com%22",S_OK,TRUE},
+            {"%22acco%3C%7C%3Eunt@example.com%22",S_OK,TRUE},
+            {"",S_FALSE,TRUE},
+            {"mailto:\"acco<|>unt@example.com\"",S_OK,FALSE},
+            {"mailto",S_OK,FALSE},
+            {"",S_FALSE,FALSE},
+            {"",S_FALSE,FALSE}
+        },
+        {
+            {Uri_HOST_UNKNOWN,S_OK,FALSE},
+            {0,S_FALSE,FALSE},
+            {URL_SCHEME_MAILTO,S_OK,FALSE},
+            {URLZONE_INVALID,E_NOTIMPL,FALSE}
+        }
+    },
+    {   "news:test.tes<|>t.com", 0, S_OK, FALSE,
+        Uri_HAS_ABSOLUTE_URI|Uri_HAS_DISPLAY_URI|Uri_HAS_EXTENSION|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|
+        Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|Uri_HAS_HOST_TYPE|Uri_HAS_SCHEME,
+        TRUE,
+        {
+            {"news:test.tes%3C%7C%3Et.com",S_OK,TRUE},
+            {"",S_FALSE,FALSE},
+            {"news:test.tes%3C%7C%3Et.com",S_OK,TRUE},
+            {"",S_FALSE,FALSE},
+            {".com",S_OK,TRUE},
+            {"",S_FALSE,TRUE},
+            {"",S_FALSE,FALSE},
+            {"",S_FALSE,FALSE},
+            {"test.tes%3C%7C%3Et.com",S_OK,TRUE},
+            {"test.tes%3C%7C%3Et.com",S_OK,TRUE},
+            {"",S_FALSE,TRUE},
+            {"news:test.tes<|>t.com",S_OK,FALSE},
+            {"news",S_OK,FALSE},
+            {"",S_FALSE,FALSE},
+            {"",S_FALSE,FALSE}
+        },
+        {
+            {Uri_HOST_UNKNOWN,S_OK,FALSE},
+            {0,S_FALSE,FALSE},
+            {URL_SCHEME_NEWS,S_OK,FALSE},
+            {URLZONE_INVALID,E_NOTIMPL,FALSE}
+        }
+    },
+    /* Don't encode forbidden characters. */
+    {   "news:test.tes<|>t.com", Uri_CREATE_NO_ENCODE_FORBIDDEN_CHARACTERS, S_OK, FALSE,
+        Uri_HAS_ABSOLUTE_URI|Uri_HAS_DISPLAY_URI|Uri_HAS_EXTENSION|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|
+        Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|Uri_HAS_HOST_TYPE|Uri_HAS_SCHEME,
+        TRUE,
+        {
+            {"news:test.tes<|>t.com",S_OK,TRUE},
+            {"",S_FALSE,FALSE},
+            {"news:test.tes<|>t.com",S_OK,TRUE},
+            {"",S_FALSE,FALSE},
+            {".com",S_OK,TRUE},
+            {"",S_FALSE,TRUE},
+            {"",S_FALSE,FALSE},
+            {"",S_FALSE,FALSE},
+            {"test.tes<|>t.com",S_OK,TRUE},
+            {"test.tes<|>t.com",S_OK,TRUE},
+            {"",S_FALSE,TRUE},
+            {"news:test.tes<|>t.com",S_OK,FALSE},
+            {"news",S_OK,FALSE},
+            {"",S_FALSE,FALSE},
+            {"",S_FALSE,FALSE}
+        },
+        {
+            {Uri_HOST_UNKNOWN,S_OK,FALSE},
+            {0,S_FALSE,FALSE},
+            {URL_SCHEME_NEWS,S_OK,FALSE},
+            {URLZONE_INVALID,E_NOTIMPL,FALSE}
+        }
+    },
+    /* Forbidden characters aren't encoded for unknown, opaque URIs. */
+    {   "urn:test.tes<|>t.com", 0, S_OK, FALSE,
+        Uri_HAS_ABSOLUTE_URI|Uri_HAS_DISPLAY_URI|Uri_HAS_EXTENSION|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|
+        Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|Uri_HAS_HOST_TYPE|Uri_HAS_SCHEME,
+        TRUE,
+        {
+            {"urn:test.tes<|>t.com",S_OK,TRUE},
+            {"",S_FALSE,FALSE},
+            {"urn:test.tes<|>t.com",S_OK,TRUE},
+            {"",S_FALSE,FALSE},
+            {".com",S_OK,TRUE},
+            {"",S_FALSE,TRUE},
+            {"",S_FALSE,FALSE},
+            {"",S_FALSE,FALSE},
+            {"test.tes<|>t.com",S_OK,TRUE},
+            {"test.tes<|>t.com",S_OK,TRUE},
+            {"",S_FALSE,TRUE},
+            {"urn:test.tes<|>t.com",S_OK,FALSE},
+            {"urn",S_OK,FALSE},
+            {"",S_FALSE,FALSE},
+            {"",S_FALSE,FALSE}
+        },
+        {
+            {Uri_HOST_UNKNOWN,S_OK,FALSE},
+            {0,S_FALSE,FALSE},
+            {URL_SCHEME_UNKNOWN,S_OK,FALSE},
+            {URLZONE_INVALID,E_NOTIMPL,FALSE}
+        }
+    },
+    /* Percent encoded unreserved characters are decoded for known opaque URIs. */
+    {   "news:test.%74%65%73%74.com", 0, S_OK, FALSE,
+        Uri_HAS_ABSOLUTE_URI|Uri_HAS_DISPLAY_URI|Uri_HAS_EXTENSION|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|
+        Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|Uri_HAS_HOST_TYPE|Uri_HAS_SCHEME,
+        TRUE,
+        {
+            {"news:test.test.com",S_OK,TRUE},
+            {"",S_FALSE,FALSE},
+            {"news:test.test.com",S_OK,TRUE},
+            {"",S_FALSE,FALSE},
+            {".com",S_OK,TRUE},
+            {"",S_FALSE,TRUE},
+            {"",S_FALSE,FALSE},
+            {"",S_FALSE,FALSE},
+            {"test.test.com",S_OK,TRUE},
+            {"test.test.com",S_OK,TRUE},
+            {"",S_FALSE,TRUE},
+            {"news:test.%74%65%73%74.com",S_OK,FALSE},
+            {"news",S_OK,FALSE},
+            {"",S_FALSE,FALSE},
+            {"",S_FALSE,FALSE}
+        },
+        {
+            {Uri_HOST_UNKNOWN,S_OK,FALSE},
+            {0,S_FALSE,FALSE},
+            {URL_SCHEME_NEWS,S_OK,FALSE},
+            {URLZONE_INVALID,E_NOTIMPL,FALSE}
+        }
+    },
+    /* Percent encoded unreserved characters are still decoded for known scheme types, even with Uri_CREATE_NO_CANONICALIZE. */
+    {   "news:test.%74%65%73%74.com", Uri_CREATE_NO_CANONICALIZE, S_OK, FALSE,
+        Uri_HAS_ABSOLUTE_URI|Uri_HAS_DISPLAY_URI|Uri_HAS_EXTENSION|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|
+        Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|Uri_HAS_HOST_TYPE|Uri_HAS_SCHEME,
+        TRUE,
+        {
+            {"news:test.test.com",S_OK,TRUE},
+            {"",S_FALSE,FALSE},
+            {"news:test.test.com",S_OK,TRUE},
+            {"",S_FALSE,FALSE},
+            {".com",S_OK,TRUE},
+            {"",S_FALSE,TRUE},
+            {"",S_FALSE,FALSE},
+            {"",S_FALSE,FALSE},
+            {"test.test.com",S_OK,TRUE},
+            {"test.test.com",S_OK,TRUE},
+            {"",S_FALSE,TRUE},
+            {"news:test.%74%65%73%74.com",S_OK,FALSE},
+            {"news",S_OK,FALSE},
+            {"",S_FALSE,FALSE},
+            {"",S_FALSE,FALSE}
+        },
+        {
+            {Uri_HOST_UNKNOWN,S_OK,FALSE},
+            {0,S_FALSE,FALSE},
+            {URL_SCHEME_NEWS,S_OK,FALSE},
+            {URLZONE_INVALID,E_NOTIMPL,FALSE}
+        }
+    },
+    /* Percent encoded characters aren't decoded for unknown scheme types. */
+    {   "urn:test.%74%65%73%74.com", 0, S_OK, FALSE,
+        Uri_HAS_ABSOLUTE_URI|Uri_HAS_DISPLAY_URI|Uri_HAS_EXTENSION|Uri_HAS_PATH|Uri_HAS_PATH_AND_QUERY|
+        Uri_HAS_RAW_URI|Uri_HAS_SCHEME_NAME|Uri_HAS_HOST_TYPE|Uri_HAS_SCHEME,
+        TRUE,
+        {
+            {"urn:test.%74%65%73%74.com",S_OK,TRUE},
+            {"",S_FALSE,FALSE},
+            {"urn:test.%74%65%73%74.com",S_OK,TRUE},
+            {"",S_FALSE,FALSE},
+            {".com",S_OK,TRUE},
+            {"",S_FALSE,TRUE},
+            {"",S_FALSE,FALSE},
+            {"",S_FALSE,FALSE},
+            {"test.%74%65%73%74.com",S_OK,TRUE},
+            {"test.%74%65%73%74.com",S_OK,TRUE},
+            {"",S_FALSE,TRUE},
+            {"urn:test.%74%65%73%74.com",S_OK,FALSE},
+            {"urn",S_OK,FALSE},
+            {"",S_FALSE,FALSE},
+            {"",S_FALSE,FALSE}
+        },
+        {
+            {Uri_HOST_UNKNOWN,S_OK,FALSE},
+            {0,S_FALSE,FALSE},
+            {URL_SCHEME_UNKNOWN,S_OK,FALSE},
+            {URLZONE_INVALID,E_NOTIMPL,FALSE}
+        }
     }
 };
 
index 3e4e6fead20c751e6267ae17c351ca15e23ee650..f5ca037c45070be25e3ffcf10251c19ab8486ebb 100644 (file)
@@ -1675,6 +1675,8 @@ static BOOL parse_path_opaque(const WCHAR **ptr, parse_data *data, DWORD flags)
  *  (per MSDN documentation).
  */
 static BOOL parse_hierpart(const WCHAR **ptr, parse_data *data, DWORD flags) {
+    const WCHAR *start = *ptr;
+
     /* Checks if the authority information needs to be parsed.
      *
      * Relative URI's aren't hierarchical URI's, but, they could trick
@@ -1703,7 +1705,11 @@ static BOOL parse_hierpart(const WCHAR **ptr, parse_data *data, DWORD flags) {
                 return FALSE;
 
             return parse_path_hierarchical(ptr, data, flags);
-        }
+        } else
+            /* Reset ptr to its starting position so opaque path parsing
+             * begins at the correct location.
+             */
+            *ptr = start;
     }
 
     /* If it reaches here, then the URI will be treated as an opaque
@@ -2456,6 +2462,78 @@ static BOOL canonicalize_path_hierarchical(const parse_data *data, Uri *uri,
     return TRUE;
 }
 
+/* Canonicalizes the path of an opaque URI into uri->canon_uri, recording
+ * path_start/path_len. With computeOnly set, only the lengths are computed
+ * and nothing is written to canon_uri. Always returns TRUE.
+ *
+ * For known scheme types (scheme_type != URL_SCHEME_UNKNOWN):
+ *  1)  Characters that are neither reserved nor unreserved are percent
+ *      encoded unless Uri_CREATE_NO_ENCODE_FORBIDDEN_CHARACTERS is set.
+ *  2)  Percent encoded, unreserved characters are decoded to their
+ *      actual values.
+ *  3)  NOTE(review): converting '\\' to '/' (except for mailto) is the
+ *      stated intent, but no branch below does it -- confirm.
+ */
+static BOOL canonicalize_path_opaque(const parse_data *data, Uri *uri, DWORD flags, BOOL computeOnly) {
+    const WCHAR *ptr;
+    const BOOL known_scheme = data->scheme_type != URL_SCHEME_UNKNOWN;
+
+    if(!data->path) { /* No path was parsed: record it as absent. */
+        uri->path_start = -1;
+        uri->path_len = 0;
+        return TRUE;
+    }
+
+    uri->path_start = uri->canon_len; /* The path begins at the current end of the canonical URI. */
+
+    /* Windows doesn't allow a "//" to appear after the scheme
+     * of a URI, if it's an opaque URI.
+     */
+    if(data->scheme && *(data->path) == '/' && *(data->path+1) == '/') {
+        /* So it inserts a "/." before the "//" if it exists. */
+        if(!computeOnly) {
+            uri->canon_uri[uri->canon_len] = '/';
+            uri->canon_uri[uri->canon_len+1] = '.';
+        }
+
+        uri->canon_len += 2; /* The length is counted even when computeOnly is set. */
+    }
+
+    for(ptr = data->path; ptr < data->path+data->path_len; ++ptr) {
+        if(*ptr == '%' && known_scheme) {
+            WCHAR val = decode_pct_val(ptr);
+
+            if(is_unreserved(val)) {
+                if(!computeOnly)
+                    uri->canon_uri[uri->canon_len] = val;
+                ++uri->canon_len;
+
+                ptr += 2; /* Skip the two characters after '%' (presumably the hex digits decode_pct_val consumed). */
+                continue;
+            } else { /* Keep the '%' as is; the characters after it are handled by later iterations. */
+                if(!computeOnly)
+                    uri->canon_uri[uri->canon_len] = *ptr;
+                ++uri->canon_len;
+            }
+        } else if(known_scheme && !is_unreserved(*ptr) && !is_reserved(*ptr) &&
+                  !(flags & Uri_CREATE_NO_ENCODE_FORBIDDEN_CHARACTERS)) {
+            if(!computeOnly)
+                pct_encode_val(*ptr, uri->canon_uri+uri->canon_len);
+            uri->canon_len += 3; /* Three characters are reserved for the encoded form (presumably "%XX"). */
+        } else { /* Everything else is copied through unchanged. */
+            if(!computeOnly)
+                uri->canon_uri[uri->canon_len] = *ptr;
+            ++uri->canon_len;
+        }
+    }
+
+    uri->path_len = uri->canon_len - uri->path_start; /* Includes any "/." inserted above. */
+
+    TRACE("(%p %p %x %d): Canonicalized opaque URI path %s len=%d\n", data, uri, flags, computeOnly,
+        debugstr_wn(uri->canon_uri+uri->path_start, uri->path_len), uri->path_len);
+    return TRUE;
+}
+
 /* Determines how the URI represented by the parse_data should be canonicalized.
  *
  * Essentially, if the parse_data represents an hierarchical URI then it calls
@@ -2493,6 +2571,9 @@ static BOOL canonicalize_hierpart(const parse_data *data, Uri *uri, DWORD flags,
         uri->authority_start = -1;
         uri->authority_len = 0;
         uri->domain_offset = -1;
+
+        if(!canonicalize_path_opaque(data, uri, flags, computeOnly))
+            return FALSE;
     }
 
     return TRUE;