2016-08-11 19:09:39 +00:00
|
|
|
/*
|
2018-04-30 21:17:59 +00:00
|
|
|
* Copyright (C) 2016-2018 Apple Inc. All rights reserved.
|
2016-08-11 19:09:39 +00:00
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
|
|
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
|
|
|
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
|
|
|
|
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
|
|
|
* THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
2021-06-15 16:59:15 +00:00
|
|
|
#include <unicode/uidna.h>
|
2017-02-14 07:43:28 +00:00
|
|
|
#include <wtf/Expected.h>
|
2016-08-11 19:09:39 +00:00
|
|
|
#include <wtf/Forward.h>
|
2018-12-01 03:28:36 +00:00
|
|
|
#include <wtf/URL.h>
|
2016-08-11 19:09:39 +00:00
|
|
|
|
Support IDN2008 with UTS #46 instead of IDN2003
https://bugs.webkit.org/show_bug.cgi?id=144194
Reviewed by Darin Adler.
Source/WebCore:
Use uidna_nameToASCII instead of the deprecated uidna_IDNToASCII.
It uses IDN2008 instead of IDN2003, and it uses UTS #46 when used with a UIDNA opened with uidna_openUTS46.
This follows https://url.spec.whatwg.org/#concept-domain-to-ascii except we do not use Transitional_Processing
to prevent homograph attacks on German domain names with "ß" and "ss" in them. These are now treated as separate domains.
Firefox also doesn't use Transitional_Processing. Chrome and the current specification use Transitional_Processing,
but https://github.com/whatwg/url/issues/110 might change the spec.
In addition, http://unicode.org/reports/tr46/ says:
"implementations are encouraged to apply the Bidi and ContextJ validity criteria"
Bidi checks prevent domain names with bidirectional text, such as Latin and Hebrew characters in the same domain. Chrome and Firefox do this.
ContextJ checks prevent code points such as U+200D, which is a zero-width joiner which users would not see when looking at the domain name.
Firefox currently enables ContextJ checks and it is suggested by UTS #46, so we'll do it.
ContextO checks, which we do not use and neither does any other browser nor the spec, would fail if a domain contains code points such as U+30FB,
which looks somewhat like a dot. We can investigate enabling these checks later.
Covered by new API tests and rebased LayoutTests.
The new API tests verify that we do not use transitional processing, that we do apply the Bidi and ContextJ checks, but not ContextO checks.
* platform/URLParser.cpp:
(WebCore::URLParser::domainToASCII):
(WebCore::URLParser::internationalDomainNameTranscoder):
* platform/URLParser.h:
* platform/mac/WebCoreNSURLExtras.mm:
(WebCore::mapHostNameWithRange):
Tools:
* TestWebKitAPI/Tests/WebCore/URLParser.cpp:
(TestWebKitAPI::TEST_F):
Add some tests from http://unicode.org/faq/idn.html verifying that we follow UTS46's deviations from IDN2008.
Add some tests based on https://tools.ietf.org/html/rfc5893 verifying that we check for bidirectional text.
Add a test based on https://tools.ietf.org/html/rfc5892 verifying that we do not do ContextO check.
Add a test for U+321D and U+321E which have particularly interesting punycode encodings. We match Firefox here now.
Also add a test from http://www.unicode.org/reports/tr46/#IDNAComparison verifying we are not using IDN2003.
We should consider importing all of http://www.unicode.org/Public/idna/9.0.0/IdnaTest.txt as URL domain tests.
LayoutTests:
* fast/encoding/idn-security.html:
Move some characters with changed IDN encodings to inside the check for old ICU.
* fast/url/idna2003-expected.txt:
* fast/url/idna2008-expected.txt:
Update expected results. We are now more compliant with IDN2008.
Canonical link: https://commits.webkit.org/182613@main
git-svn-id: https://svn.webkit.org/repository/webkit/trunk@208902 268f45cc-cd09-0410-ab3c-d52691b4dbfc
2016-11-18 22:47:24 +00:00
|
|
|
struct UIDNA;
|
|
|
|
|
2018-12-01 03:28:36 +00:00
|
|
|
namespace WTF {
|
2016-08-28 05:55:17 +00:00
|
|
|
|
2016-09-15 18:12:09 +00:00
|
|
|
template<typename CharacterType> class CodePointIterator;
|
|
|
|
|
2016-08-11 19:09:39 +00:00
|
|
|
class URLParser {
|
2019-08-12 20:57:15 +00:00
|
|
|
WTF_MAKE_FAST_ALLOCATED;
|
2016-08-11 19:09:39 +00:00
|
|
|
public:
|
2021-06-15 16:59:15 +00:00
|
|
|
// Bitmask built by OR-ing together the UIDNA_ERROR_* flags listed below
// (empty label, label/domain name too long, leading/trailing hyphen, hyphen in positions 3-4).
// NOTE(review): presumably the set of uidna_nameToASCII errors the parser tolerates
// rather than treating as failures — confirm against the use in URLParser.cpp.
constexpr static int allowedNameToASCIIErrors =
|
|
|
|
UIDNA_ERROR_EMPTY_LABEL
|
|
|
|
| UIDNA_ERROR_LABEL_TOO_LONG
|
|
|
|
| UIDNA_ERROR_DOMAIN_NAME_TOO_LONG
|
|
|
|
| UIDNA_ERROR_LEADING_HYPHEN
|
|
|
|
| UIDNA_ERROR_TRAILING_HYPHEN
|
|
|
|
| UIDNA_ERROR_HYPHEN_3_4;
|
|
|
|
|
|
|
|
// Needs to be big enough to hold an IDN-encoded name.
|
|
|
|
// For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
|
|
|
|
constexpr static size_t hostnameBufferLength = 2048;
|
|
|
|
|
2021-07-08 22:59:59 +00:00
|
|
|
#define URLTextEncodingSentinelAllowingC0AtEndOfHash reinterpret_cast<const URLTextEncoding*>(-1)
|
|
|
|
|
2018-12-01 03:28:36 +00:00
|
|
|
WTF_EXPORT_PRIVATE static bool allValuesEqual(const URL&, const URL&);
|
|
|
|
WTF_EXPORT_PRIVATE static bool internalValuesConsistent(const URL&);
|
2016-09-14 01:34:27 +00:00
|
|
|
|
2018-12-01 03:28:36 +00:00
|
|
|
using URLEncodedForm = Vector<WTF::KeyValuePair<String, String>>;
|
|
|
|
WTF_EXPORT_PRIVATE static URLEncodedForm parseURLEncodedForm(StringView);
|
|
|
|
WTF_EXPORT_PRIVATE static String serialize(const URLEncodedForm&);
|
|
|
|
|
2021-07-08 22:59:59 +00:00
|
|
|
WTF_EXPORT_PRIVATE static bool isSpecialScheme(StringView);
|
2021-06-09 20:46:24 +00:00
|
|
|
WTF_EXPORT_PRIVATE static std::optional<String> maybeCanonicalizeScheme(StringView scheme);
|
2016-09-14 01:34:27 +00:00
|
|
|
|
Support IDN2008 with UTS #46 instead of IDN2003
https://bugs.webkit.org/show_bug.cgi?id=144194
Reviewed by Darin Adler.
Source/WebCore:
Use uidna_nameToASCII instead of the deprecated uidna_IDNToASCII.
It uses IDN2008 instead of IDN2003, and it uses UTS #46 when used with a UIDNA opened with uidna_openUTS46.
This follows https://url.spec.whatwg.org/#concept-domain-to-ascii except we do not use Transitional_Processing
to prevent homograph attacks on German domain names with "ß" and "ss" in them. These are now treated as separate domains.
Firefox also doesn't use Transitional_Processing. Chrome and the current specification use Transitional_Processing,
but https://github.com/whatwg/url/issues/110 might change the spec.
In addition, http://unicode.org/reports/tr46/ says:
"implementations are encouraged to apply the Bidi and ContextJ validity criteria"
Bidi checks prevent domain names with bidirectional text, such as Latin and Hebrew characters in the same domain. Chrome and Firefox do this.
ContextJ checks prevent code points such as U+200D, which is a zero-width joiner which users would not see when looking at the domain name.
Firefox currently enables ContextJ checks and it is suggested by UTS #46, so we'll do it.
ContextO checks, which we do not use and neither does any other browser nor the spec, would fail if a domain contains code points such as U+30FB,
which looks somewhat like a dot. We can investigate enabling these checks later.
Covered by new API tests and rebased LayoutTests.
The new API tests verify that we do not use transitional processing, that we do apply the Bidi and ContextJ checks, but not ContextO checks.
* platform/URLParser.cpp:
(WebCore::URLParser::domainToASCII):
(WebCore::URLParser::internationalDomainNameTranscoder):
* platform/URLParser.h:
* platform/mac/WebCoreNSURLExtras.mm:
(WebCore::mapHostNameWithRange):
Tools:
* TestWebKitAPI/Tests/WebCore/URLParser.cpp:
(TestWebKitAPI::TEST_F):
Add some tests from http://unicode.org/faq/idn.html verifying that we follow UTS46's deviations from IDN2008.
Add some tests based on https://tools.ietf.org/html/rfc5893 verifying that we check for bidirectional text.
Add a test based on https://tools.ietf.org/html/rfc5892 verifying that we do not do ContextO check.
Add a test for U+321D and U+321E which have particularly interesting punycode encodings. We match Firefox here now.
Also add a test from http://www.unicode.org/reports/tr46/#IDNAComparison verifying we are not using IDN2003.
We should consider importing all of http://www.unicode.org/Public/idna/9.0.0/IdnaTest.txt as URL domain tests.
LayoutTests:
* fast/encoding/idn-security.html:
Move some characters with changed IDN encodings to inside the check for old ICU.
* fast/url/idna2003-expected.txt:
* fast/url/idna2008-expected.txt:
Update expected results. We are now more compliant with IDN2008.
Canonical link: https://commits.webkit.org/182613@main
git-svn-id: https://svn.webkit.org/repository/webkit/trunk@208902 268f45cc-cd09-0410-ab3c-d52691b4dbfc
2016-11-18 22:47:24 +00:00
|
|
|
static const UIDNA& internationalDomainNameTranscoder();
|
2017-11-14 19:15:23 +00:00
|
|
|
static bool isInUserInfoEncodeSet(UChar);
|
Support IDN2008 with UTS #46 instead of IDN2003
https://bugs.webkit.org/show_bug.cgi?id=144194
Reviewed by Darin Adler.
Source/WebCore:
Use uidna_nameToASCII instead of the deprecated uidna_IDNToASCII.
It uses IDN2008 instead of IDN2003, and it uses UTS #46 when used with a UIDNA opened with uidna_openUTS46.
This follows https://url.spec.whatwg.org/#concept-domain-to-ascii except we do not use Transitional_Processing
to prevent homograph attacks on German domain names with "ß" and "ss" in them. These are now treated as separate domains.
Firefox also doesn't use Transitional_Processing. Chrome and the current specification use Transitional_Processing,
but https://github.com/whatwg/url/issues/110 might change the spec.
In addition, http://unicode.org/reports/tr46/ says:
"implementations are encouraged to apply the Bidi and ContextJ validity criteria"
Bidi checks prevent domain names with bidirectional text, such as Latin and Hebrew characters in the same domain. Chrome and Firefox do this.
ContextJ checks prevent code points such as U+200D, which is a zero-width joiner which users would not see when looking at the domain name.
Firefox currently enables ContextJ checks and it is suggested by UTS #46, so we'll do it.
ContextO checks, which we do not use and neither does any other browser nor the spec, would fail if a domain contains code points such as U+30FB,
which looks somewhat like a dot. We can investigate enabling these checks later.
Covered by new API tests and rebased LayoutTests.
The new API tests verify that we do not use transitional processing, that we do apply the Bidi and ContextJ checks, but not ContextO checks.
* platform/URLParser.cpp:
(WebCore::URLParser::domainToASCII):
(WebCore::URLParser::internationalDomainNameTranscoder):
* platform/URLParser.h:
* platform/mac/WebCoreNSURLExtras.mm:
(WebCore::mapHostNameWithRange):
Tools:
* TestWebKitAPI/Tests/WebCore/URLParser.cpp:
(TestWebKitAPI::TEST_F):
Add some tests from http://unicode.org/faq/idn.html verifying that we follow UTS46's deviations from IDN2008.
Add some tests based on https://tools.ietf.org/html/rfc5893 verifying that we check for bidirectional text.
Add a test based on https://tools.ietf.org/html/rfc5892 verifying that we do not do ContextO check.
Add a test for U+321D and U+321E which have particularly interesting punycode encodings. We match Firefox here now.
Also add a test from http://www.unicode.org/reports/tr46/#IDNAComparison verifying we are not using IDN2003.
We should consider importing all of http://www.unicode.org/Public/idna/9.0.0/IdnaTest.txt as URL domain tests.
LayoutTests:
* fast/encoding/idn-security.html:
Move some characters with changed IDN encodings to inside the check for old ICU.
* fast/url/idna2003-expected.txt:
* fast/url/idna2008-expected.txt:
Update expected results. We are now more compliant with IDN2008.
Canonical link: https://commits.webkit.org/182613@main
git-svn-id: https://svn.webkit.org/repository/webkit/trunk@208902 268f45cc-cd09-0410-ab3c-d52691b4dbfc
2016-11-18 22:47:24 +00:00
|
|
|
|
2021-05-30 16:11:40 +00:00
|
|
|
static std::optional<uint16_t> defaultPortForProtocol(StringView);
|
2018-12-01 04:28:18 +00:00
|
|
|
|
2016-08-17 00:41:30 +00:00
|
|
|
private:
|
2018-12-01 03:28:36 +00:00
|
|
|
URLParser(const String&, const URL& = { }, const URLTextEncoding* = nullptr);
|
|
|
|
// Hands back, by value, the URL stored in m_url.
URL result()
{
    return m_url;
}
|
|
|
|
|
|
|
|
friend class URL;
|
2016-11-04 23:59:03 +00:00
|
|
|
|
2016-08-17 00:41:30 +00:00
|
|
|
URL m_url;
|
2018-05-04 00:40:18 +00:00
|
|
|
Vector<LChar> m_asciiBuffer;
|
2016-09-09 01:32:18 +00:00
|
|
|
bool m_urlIsSpecial { false };
|
2017-02-24 16:45:32 +00:00
|
|
|
bool m_urlIsFile { false };
|
2016-09-14 19:34:53 +00:00
|
|
|
bool m_hostHasPercentOrNonASCII { false };
|
2021-07-08 22:59:59 +00:00
|
|
|
bool m_didSeeSyntaxViolation { false };
|
2016-09-23 20:58:03 +00:00
|
|
|
String m_inputString;
|
2016-09-23 23:40:46 +00:00
|
|
|
const void* m_inputBegin { nullptr };
|
|
|
|
|
2018-04-30 21:17:59 +00:00
|
|
|
static constexpr size_t defaultInlineBufferSize = 2048;
|
|
|
|
using LCharBuffer = Vector<LChar, defaultInlineBufferSize>;
|
2016-09-15 18:12:09 +00:00
|
|
|
|
URLParser should use TextEncoding through an abstract class
https://bugs.webkit.org/show_bug.cgi?id=190027
Reviewed by Andy Estes.
Source/WebCore:
URLParser uses TextEncoding for one call to encode, which is only used for encoding the query of URLs in documents with non-UTF encodings.
There are 3 call sites that specify the TextEncoding to use from the Document, and even those call sites use a UTF encoding most of the time.
All other URL parsing is done using a well-optimized path which assumes UTF-8 encoding and uses macros from ICU headers, not a TextEncoding.
Moving the logic in this way breaks URL and URLParser's dependency on TextEncoding, which makes it possible to use in a lower-level project
without also moving TextEncoding, TextCodec, TextCodecICU, ThreadGlobalData, and the rest of WebCore and JavaScriptCore.
There is no observable change in behavior. There is now one virtual function call in a code path in URLParser that is not performance-sensitive,
and TextEncodings now have a vtable, which uses a few more bytes of memory total for WebKit.
* css/parser/CSSParserContext.h:
(WebCore::CSSParserContext::completeURL const):
* css/parser/CSSParserIdioms.cpp:
(WebCore::completeURL):
* dom/Document.cpp:
(WebCore::Document::completeURL const):
* html/HTMLBaseElement.cpp:
(WebCore::HTMLBaseElement::href const):
Move the call to encodingForFormSubmission from the URL constructor to the 3 call sites that specify the encoding from the Document.
* loader/FormSubmission.cpp:
(WebCore::FormSubmission::create):
* loader/TextResourceDecoder.cpp:
(WebCore::TextResourceDecoder::encodingForURLParsing):
* loader/TextResourceDecoder.h:
* platform/URL.cpp:
(WebCore::URL::URL):
* platform/URL.h:
(WebCore::URLTextEncoding::~URLTextEncoding):
* platform/URLParser.cpp:
(WebCore::URLParser::encodeNonUTF8Query):
(WebCore::URLParser::copyURLPartsUntil):
(WebCore::URLParser::URLParser):
(WebCore::URLParser::parse):
(WebCore::URLParser::encodeQuery): Deleted.
A pointer replaces the boolean isUTF8Encoding and the TextEncoding& which had a default value of UTF8Encoding.
Now the pointer being null means that we use UTF8, and the pointer being non-null means we use that encoding.
* platform/URLParser.h:
(WebCore::URLParser::URLParser):
* platform/text/TextEncoding.cpp:
(WebCore::UTF7Encoding):
(WebCore::TextEncoding::encodingForFormSubmissionOrURLParsing const):
(WebCore::ASCIIEncoding):
(WebCore::Latin1Encoding):
(WebCore::UTF16BigEndianEncoding):
(WebCore::UTF16LittleEndianEncoding):
(WebCore::UTF8Encoding):
(WebCore::WindowsLatin1Encoding):
(WebCore::TextEncoding::encodingForFormSubmission const): Deleted.
Use NeverDestroyed because TextEncoding now has a virtual destructor.
* platform/text/TextEncoding.h:
Rename encodingForFormSubmission to encodingForFormSubmissionOrURLParsing to make it more clear that we are intentionally using it for both.
Tools:
* TestWebKitAPI/Tests/WebCore/URLParser.cpp:
(TestWebKitAPI::checkURL):
(TestWebKitAPI::TEST_F):
Canonical link: https://commits.webkit.org/205005@main
git-svn-id: https://svn.webkit.org/repository/webkit/trunk@236565 268f45cc-cd09-0410-ab3c-d52691b4dbfc
2018-09-27 20:05:52 +00:00
|
|
|
template<typename CharacterType> void parse(const CharacterType*, const unsigned length, const URL&, const URLTextEncoding*);
|
2016-09-23 19:57:57 +00:00
|
|
|
template<typename CharacterType> void parseAuthority(CodePointIterator<CharacterType>);
|
2016-10-13 21:01:58 +00:00
|
|
|
template<typename CharacterType> bool parseHostAndPort(CodePointIterator<CharacterType>);
|
2016-09-23 19:57:57 +00:00
|
|
|
template<typename CharacterType> bool parsePort(CodePointIterator<CharacterType>&);
|
2016-09-23 20:58:03 +00:00
|
|
|
|
|
|
|
void failure();
|
2016-09-29 18:18:04 +00:00
|
|
|
enum class ReportSyntaxViolation { No, Yes };
|
|
|
|
template<typename CharacterType, ReportSyntaxViolation reportSyntaxViolation = ReportSyntaxViolation::Yes>
|
|
|
|
void advance(CodePointIterator<CharacterType>& iterator) { advance<CharacterType, reportSyntaxViolation>(iterator, iterator); }
|
|
|
|
template<typename CharacterType, ReportSyntaxViolation = ReportSyntaxViolation::Yes>
|
|
|
|
void advance(CodePointIterator<CharacterType>&, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition);
|
2016-10-03 22:55:46 +00:00
|
|
|
template<typename CharacterType> bool takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>);
|
Implement URLParser::syntaxViolation
https://bugs.webkit.org/show_bug.cgi?id=162593
Reviewed by Geoffrey Garen.
Source/WebCore:
Most of the time when parsing URLs, we just look at the URL, find offsets of the host, path, query, etc.,
and the String can be used untouched. When this happens, we do not want to allocate and copy the String.
We want to just add a reference to an existing String.
Sometimes we need to canonicalize the String because there has been a syntaxViolation,
defined as any String that is different than its canonicalized URL String. In such cases we need to
allocate a new String and fill it with the canonicalized URL String. When a syntaxViolation happens for the
first time, we know that everything in the input String up to that point is equal to what it would have been
if we had canonicalized the beginning of the URL, copy it into a buffer, and continue parsing in a mode where
instead of just looking at the input URL String, we canonicalize each code point into the buffer.
Changes to behavior involve additional spec compliance with tabs and newlines in different places in URLs,
as well as additional spec compliance when parsing empty and null URLs relative to other URLs.
Both are covered by new API tests. Existing behavior covered by existing API tests.
This is about a 15% speed improvement on my URL parsing benchmark.
* platform/URL.cpp:
(WebCore::assertProtocolIsGood):
(WebCore::URL::protocolIs):
(WebCore::protocolIs):
* platform/URL.h:
* platform/URLParser.cpp:
(WebCore::isTabOrNewline):
(WebCore::URLParser::incrementIteratorSkippingTabsAndNewlines):
(WebCore::URLParser::isWindowsDriveLetter):
(WebCore::URLParser::appendToASCIIBuffer):
(WebCore::URLParser::checkWindowsDriveLetter):
(WebCore::URLParser::shouldCopyFileURL):
(WebCore::URLParser::utf8PercentEncode):
(WebCore::URLParser::utf8QueryEncode):
(WebCore::URLParser::copyURLPartsUntil):
(WebCore::URLParser::syntaxViolation):
(WebCore::URLParser::fragmentSyntaxViolation):
(WebCore::URLParser::parsedDataView):
(WebCore::URLParser::currentPosition):
(WebCore::URLParser::URLParser):
(WebCore::URLParser::parse):
(WebCore::URLParser::parseAuthority):
(WebCore::URLParser::parseIPv4Number):
(WebCore::URLParser::parseIPv4Host):
(WebCore::URLParser::parseIPv6Host):
(WebCore::URLParser::parsePort):
(WebCore::URLParser::parseHostAndPort):
(WebCore::serializeURLEncodedForm):
(WebCore::URLParser::allValuesEqual):
(WebCore::URLParser::internalValuesConsistent):
(WebCore::URLParser::incrementIteratorSkippingTabAndNewLine): Deleted.
(WebCore::URLParser::syntaxError): Deleted.
(WebCore::parseIPv4Number): Deleted.
* platform/URLParser.h:
(WebCore::URLParser::incrementIteratorSkippingTabsAndNewlines):
Tools:
* TestWebKitAPI/Tests/WebCore/URLParser.cpp:
(TestWebKitAPI::TEST_F):
Canonical link: https://commits.webkit.org/180569@main
git-svn-id: https://svn.webkit.org/repository/webkit/trunk@206457 268f45cc-cd09-0410-ab3c-d52691b4dbfc
2016-09-27 20:07:15 +00:00
|
|
|
template<typename CharacterType> void syntaxViolation(const CodePointIterator<CharacterType>&);
|
2016-09-29 18:18:04 +00:00
|
|
|
template<typename CharacterType> bool isPercentEncodedDot(CodePointIterator<CharacterType>);
|
2016-09-23 20:58:03 +00:00
|
|
|
template<typename CharacterType> bool isWindowsDriveLetter(CodePointIterator<CharacterType>);
|
2016-09-29 18:18:04 +00:00
|
|
|
template<typename CharacterType> bool isSingleDotPathSegment(CodePointIterator<CharacterType>);
|
|
|
|
template<typename CharacterType> bool isDoubleDotPathSegment(CodePointIterator<CharacterType>);
|
2016-09-23 20:58:03 +00:00
|
|
|
template<typename CharacterType> bool shouldCopyFileURL(CodePointIterator<CharacterType>);
|
2016-09-29 18:18:04 +00:00
|
|
|
template<typename CharacterType> bool checkLocalhostCodePoint(CodePointIterator<CharacterType>&, UChar32);
|
|
|
|
template<typename CharacterType> bool isAtLocalhost(CodePointIterator<CharacterType>);
|
|
|
|
bool isLocalhost(StringView);
|
|
|
|
template<typename CharacterType> void consumeSingleDotPathSegment(CodePointIterator<CharacterType>&);
|
|
|
|
template<typename CharacterType> void consumeDoubleDotPathSegment(CodePointIterator<CharacterType>&);
|
2016-09-28 00:31:44 +00:00
|
|
|
template<typename CharacterType> void appendWindowsDriveLetter(CodePointIterator<CharacterType>&);
|
2016-09-23 23:40:46 +00:00
|
|
|
template<typename CharacterType> size_t currentPosition(const CodePointIterator<CharacterType>&);
|
|
|
|
template<typename UnsignedIntegerType> void appendNumberToASCIIBuffer(UnsignedIntegerType);
|
Implement URLParser::syntaxViolation
https://bugs.webkit.org/show_bug.cgi?id=162593
Reviewed by Geoffrey Garen.
Source/WebCore:
Most of the time when parsing URLs, we just look at the URL, find offsets of the host, path, query, etc.,
and the String can be used untouched. When this happens, we do not want to allocate and copy the String.
We want to just add a reference to an existing String.
Sometimes we need to canonicalize the String because there has been a syntaxViolation,
defined as any String that is different than its canonicalized URL String. In such cases we need to
allocate a new String and fill it with the canonicalized URL String. When a syntaxViolation happens for the
first time, we know that everything in the input String up to that point is equal to what it would have been
if we had canonicalized the beginning of the URL, copy it into a buffer, and continue parsing in a mode where
instead of just looking at the input URL String, we canonicalize each code point into the buffer.
Changes to behavior involve additional spec compliance with tabs and newlines in different places in URLs,
as well as additional spec compliance when parsing empty and null URLs relative to other URLs.
Both are covered by new API tests. Existing behavior covered by existing API tests.
This is about a 15% speed improvement on my URL parsing benchmark.
* platform/URL.cpp:
(WebCore::assertProtocolIsGood):
(WebCore::URL::protocolIs):
(WebCore::protocolIs):
* platform/URL.h:
* platform/URLParser.cpp:
(WebCore::isTabOrNewline):
(WebCore::URLParser::incrementIteratorSkippingTabsAndNewlines):
(WebCore::URLParser::isWindowsDriveLetter):
(WebCore::URLParser::appendToASCIIBuffer):
(WebCore::URLParser::checkWindowsDriveLetter):
(WebCore::URLParser::shouldCopyFileURL):
(WebCore::URLParser::utf8PercentEncode):
(WebCore::URLParser::utf8QueryEncode):
(WebCore::URLParser::copyURLPartsUntil):
(WebCore::URLParser::syntaxViolation):
(WebCore::URLParser::fragmentSyntaxViolation):
(WebCore::URLParser::parsedDataView):
(WebCore::URLParser::currentPosition):
(WebCore::URLParser::URLParser):
(WebCore::URLParser::parse):
(WebCore::URLParser::parseAuthority):
(WebCore::URLParser::parseIPv4Number):
(WebCore::URLParser::parseIPv4Host):
(WebCore::URLParser::parseIPv6Host):
(WebCore::URLParser::parsePort):
(WebCore::URLParser::parseHostAndPort):
(WebCore::serializeURLEncodedForm):
(WebCore::URLParser::allValuesEqual):
(WebCore::URLParser::internalValuesConsistent):
(WebCore::URLParser::incrementIteratorSkippingTabAndNewLine): Deleted.
(WebCore::URLParser::syntaxError): Deleted.
(WebCore::parseIPv4Number): Deleted.
* platform/URLParser.h:
(WebCore::URLParser::incrementIteratorSkippingTabsAndNewlines):
Tools:
* TestWebKitAPI/Tests/WebCore/URLParser.cpp:
(TestWebKitAPI::TEST_F):
Canonical link: https://commits.webkit.org/180569@main
git-svn-id: https://svn.webkit.org/repository/webkit/trunk@206457 268f45cc-cd09-0410-ab3c-d52691b4dbfc
2016-09-27 20:07:15 +00:00
|
|
|
template<bool(*isInCodeSet)(UChar32), typename CharacterType> void utf8PercentEncode(const CodePointIterator<CharacterType>&);
|
|
|
|
template<typename CharacterType> void utf8QueryEncode(const CodePointIterator<CharacterType>&);
|
2021-05-30 16:11:40 +00:00
|
|
|
template<typename CharacterType> std::optional<LCharBuffer> domainToASCII(StringImpl&, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition);
|
2018-04-30 21:17:59 +00:00
|
|
|
template<typename CharacterType> LCharBuffer percentDecode(const LChar*, size_t, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition);
|
|
|
|
static LCharBuffer percentDecode(const LChar*, size_t);
|
2021-05-30 16:11:40 +00:00
|
|
|
static std::optional<String> formURLDecode(StringView input);
|
2018-04-30 21:17:59 +00:00
|
|
|
static bool hasForbiddenHostCodePoint(const LCharBuffer&);
|
2016-09-23 22:32:28 +00:00
|
|
|
void percentEncodeByte(uint8_t);
|
|
|
|
void appendToASCIIBuffer(UChar32);
|
|
|
|
void appendToASCIIBuffer(const char*, size_t);
|
|
|
|
// LChar overload: reinterprets the pointer as const char* and forwards to the
// (const char*, size_t) overload with the same length.
void appendToASCIIBuffer(const LChar* characters, size_t size)
{
    appendToASCIIBuffer(reinterpret_cast<const char*>(characters), size);
}
|
URLParser should use TextEncoding through an abstract class
https://bugs.webkit.org/show_bug.cgi?id=190027
Reviewed by Andy Estes.
Source/WebCore:
URLParser uses TextEncoding for one call to encode, which is only used for encoding the query of URLs in documents with non-UTF encodings.
There are 3 call sites that specify the TextEncoding to use from the Document, and even those call sites use a UTF encoding most of the time.
All other URL parsing is done using a well-optimized path which assumes UTF-8 encoding and uses macros from ICU headers, not a TextEncoding.
Moving the logic in this way breaks URL and URLParser's dependency on TextEncoding, which makes it possible to use in a lower-level project
without also moving TextEncoding, TextCodec, TextCodecICU, ThreadGlobalData, and the rest of WebCore and JavaScriptCore.
There is no observable change in behavior. There is now one virtual function call in a code path in URLParser that is not performance-sensitive,
and TextEncodings now have a vtable, which uses a few more bytes of memory total for WebKit.
* css/parser/CSSParserContext.h:
(WebCore::CSSParserContext::completeURL const):
* css/parser/CSSParserIdioms.cpp:
(WebCore::completeURL):
* dom/Document.cpp:
(WebCore::Document::completeURL const):
* html/HTMLBaseElement.cpp:
(WebCore::HTMLBaseElement::href const):
Move the call to encodingForFormSubmission from the URL constructor to the 3 call sites that specify the encoding from the Document.
* loader/FormSubmission.cpp:
(WebCore::FormSubmission::create):
* loader/TextResourceDecoder.cpp:
(WebCore::TextResourceDecoder::encodingForURLParsing):
* loader/TextResourceDecoder.h:
* platform/URL.cpp:
(WebCore::URL::URL):
* platform/URL.h:
(WebCore::URLTextEncoding::~URLTextEncoding):
* platform/URLParser.cpp:
(WebCore::URLParser::encodeNonUTF8Query):
(WebCore::URLParser::copyURLPartsUntil):
(WebCore::URLParser::URLParser):
(WebCore::URLParser::parse):
(WebCore::URLParser::encodeQuery): Deleted.
A pointer replaces the boolean isUTF8Encoding and the TextEncoding& which had a default value of UTF8Encoding.
Now the pointer being null means that we use UTF8, and the pointer being non-null means we use that encoding.
* platform/URLParser.h:
(WebCore::URLParser::URLParser):
* platform/text/TextEncoding.cpp:
(WebCore::UTF7Encoding):
(WebCore::TextEncoding::encodingForFormSubmissionOrURLParsing const):
(WebCore::ASCIIEncoding):
(WebCore::Latin1Encoding):
(WebCore::UTF16BigEndianEncoding):
(WebCore::UTF16LittleEndianEncoding):
(WebCore::UTF8Encoding):
(WebCore::WindowsLatin1Encoding):
(WebCore::TextEncoding::encodingForFormSubmission const): Deleted.
Use NeverDestroyed because TextEncoding now has a virtual destructor.
* platform/text/TextEncoding.h:
Rename encodingForFormSubmission to encodingForFormSubmissionOrURLParsing to make it more clear that we are intentionally using it for both.
Tools:
* TestWebKitAPI/Tests/WebCore/URLParser.cpp:
(TestWebKitAPI::checkURL):
(TestWebKitAPI::TEST_F):
Canonical link: https://commits.webkit.org/205005@main
git-svn-id: https://svn.webkit.org/repository/webkit/trunk@236565 268f45cc-cd09-0410-ab3c-d52691b4dbfc
2018-09-27 20:05:52 +00:00
|
|
|
template<typename CharacterType> void encodeNonUTF8Query(const Vector<UChar>& source, const URLTextEncoding&, CodePointIterator<CharacterType>);
|
2016-10-03 17:28:04 +00:00
|
|
|
void copyASCIIStringUntil(const String&, size_t length);
|
2017-03-06 22:55:33 +00:00
|
|
|
// Presumably copies a Windows drive letter from the given base URL, returning
// whether one was copied — TODO confirm the meaning of the return value.
bool copyBaseWindowsDriveLetter(const URL&);
|
Implement URLParser::syntaxViolation
https://bugs.webkit.org/show_bug.cgi?id=162593
Reviewed by Geoffrey Garen.
Source/WebCore:
Most of the time when parsing URLs, we just look at the URL, find offsets of the host, path, query, etc.,
and the String can be used untouched. When this happens, we do not want to allocate and copy the String.
We want to just add a reference to an existing String.
Sometimes we need to canonicalize the String because there has been a syntaxViolation,
defined as any String that is different than its canonicalized URL String. In such cases we need to
allocate a new String and fill it with the canonicalized URL String. When a syntaxViolation happens for the
first time, we know that everything in the input String up to that point is equal to what it would have been
if we had canonicalized the beginning of the URL, so we copy it into a buffer and continue parsing in a mode
where, instead of just looking at the input URL String, we canonicalize each code point into the buffer.
Changes to behavior involve additional spec compliance with tabs and newlines in different places in URLs,
as well as additional spec compliance when parsing empty and null URLs relative to other URLs.
Both are covered by new API tests. Existing behavior covered by existing API tests.
This is about a 15% speed improvement on my URL parsing benchmark.
* platform/URL.cpp:
(WebCore::assertProtocolIsGood):
(WebCore::URL::protocolIs):
(WebCore::protocolIs):
* platform/URL.h:
* platform/URLParser.cpp:
(WebCore::isTabOrNewline):
(WebCore::URLParser::incrementIteratorSkippingTabsAndNewlines):
(WebCore::URLParser::isWindowsDriveLetter):
(WebCore::URLParser::appendToASCIIBuffer):
(WebCore::URLParser::checkWindowsDriveLetter):
(WebCore::URLParser::shouldCopyFileURL):
(WebCore::URLParser::utf8PercentEncode):
(WebCore::URLParser::utf8QueryEncode):
(WebCore::URLParser::copyURLPartsUntil):
(WebCore::URLParser::syntaxViolation):
(WebCore::URLParser::fragmentSyntaxViolation):
(WebCore::URLParser::parsedDataView):
(WebCore::URLParser::currentPosition):
(WebCore::URLParser::URLParser):
(WebCore::URLParser::parse):
(WebCore::URLParser::parseAuthority):
(WebCore::URLParser::parseIPv4Number):
(WebCore::URLParser::parseIPv4Host):
(WebCore::URLParser::parseIPv6Host):
(WebCore::URLParser::parsePort):
(WebCore::URLParser::parseHostAndPort):
(WebCore::serializeURLEncodedForm):
(WebCore::URLParser::allValuesEqual):
(WebCore::URLParser::internalValuesConsistent):
(WebCore::URLParser::incrementIteratorSkippingTabAndNewLine): Deleted.
(WebCore::URLParser::syntaxError): Deleted.
(WebCore::parseIPv4Number): Deleted.
* platform/URLParser.h:
(WebCore::URLParser::incrementIteratorSkippingTabsAndNewlines):
Tools:
* TestWebKitAPI/Tests/WebCore/URLParser.cpp:
(TestWebKitAPI::TEST_F):
Canonical link: https://commits.webkit.org/180569@main
git-svn-id: https://svn.webkit.org/repository/webkit/trunk@206457 268f45cc-cd09-0410-ab3c-d52691b4dbfc
2016-09-27 20:07:15 +00:00
|
|
|
// Returns a StringView covering |length| code units starting at |start|.
// NOTE(review): presumably a view into the data parsed so far — confirm which
// buffer backs the view and how long it stays valid.
StringView parsedDataView(size_t start, size_t length);
|
2016-11-16 22:53:48 +00:00
|
|
|
// Single-position overload: returns one UChar for |position|.
// NOTE(review): presumably indexes the same data as the (start, length) overload.
UChar parsedDataView(size_t position);
|
2021-06-15 16:59:15 +00:00
|
|
|
// Presumably reports whether any label of the host read from the iterator begins
// with the "xn--" prefix — confirm case handling and label splitting in the definition.
template<typename CharacterType> bool subdomainStartsWithXNDashDash(CodePointIterator<CharacterType>);
|
|
|
|
// StringImpl overload of the check above; presumably applies the same "xn--"
// label-prefix test to an already-materialized string — TODO confirm.
bool subdomainStartsWithXNDashDash(StringImpl&);
|
2016-08-28 05:55:17 +00:00
|
|
|
|
Non-special URLs are not idempotent
https://bugs.webkit.org/show_bug.cgi?id=215762
Reviewed by Tim Horton.
LayoutTests/imported/w3c:
* web-platform-tests/url/a-element-expected.txt:
* web-platform-tests/url/a-element-xhtml-expected.txt:
* web-platform-tests/url/url-constructor-expected.txt:
* web-platform-tests/url/url-setters-expected.txt:
Source/WTF:
https://github.com/whatwg/url/pull/505 added an interesting edge case to the URL serialization:
"If url’s host is null, url’s path’s size is greater than 1, and url’s path[0] is the empty string, then append U+002F (/) followed by U+002E (.) to output."
The problem was that URLs like "a:/a/..//a" would be parsed into "a://a" with a pathname of "//a" and an empty host. If "a://a" was then reparsed, it would again have an href of "a://a"
but its host would be "a" and it would have an empty path. There is consensus that URL parsing should be idempotent, so we need to do something different here.
According to https://github.com/whatwg/url/issues/415#issuecomment-419197290 this follows what Edge did (and then subsequently abandoned when they switched to Chromium)
to make URL parsing idempotent by adding "/." before the path in the edge case of a URL with a non-special scheme (not http, https, wss, etc.) and a null host and a non-empty path that
has an empty first segment. All the members of the URL remain unchanged except the full serialization (href). This is not important in practice, but important in theory.
Our URL parser tries very hard to use the exact same WTF::String object given as input if it can. However, this step is better implemented as a post-processing step that will almost never happen
because otherwise we would have to parse the entire path twice to find out if we need to add "./" or if the "./" that may have already been there needs to stay. This is illustrated with the test URL
"t:/.//p/../../../..//x" which does need the "./".
In the common case, this adds one well-predicted branch to URL parsing, so I expect performance to be unaffected. Since this is such a rare edge case of URLs, I expect no compatibility problems.
* wtf/URL.cpp:
(WTF::URL::pathStart const):
* wtf/URL.h:
(WTF::URL::pathStart const): Deleted.
* wtf/URLParser.cpp:
(WTF::URLParser::copyURLPartsUntil):
(WTF::URLParser::URLParser):
(WTF::URLParser::needsNonSpecialDotSlash const):
(WTF::URLParser::addNonSpecialDotSlash):
* wtf/URLParser.h:
Tools:
* TestWebKitAPI/Tests/WTF/URLParser.cpp:
(TestWebKitAPI::TEST_F):
Canonical link: https://commits.webkit.org/229956@main
git-svn-id: https://svn.webkit.org/repository/webkit/trunk@267837 268f45cc-cd09-0410-ab3c-d52691b4dbfc
2020-10-01 17:05:41 +00:00
|
|
|
// Const predicate for the serialization edge case that keeps URL parsing idempotent:
// a URL with a non-special scheme, a null host, and a non-empty path whose first
// segment is empty (e.g. "a:/a/..//a"), which needs "/." placed before the path.
// NOTE(review): the exact conditions tested live in the definition.
bool needsNonSpecialDotSlash() const;
|
|
|
|
// Post-processing step for the edge case detected by needsNonSpecialDotSlash():
// adds "/." before the path in the full serialization so that reparsing the
// result yields the same URL. Expected to run very rarely.
void addNonSpecialDotSlash();
|
|
|
|
|
2016-09-23 23:40:46 +00:00
|
|
|
// An IPv4 address held in a single 32-bit unsigned integer.
using IPv4Address = uint32_t;
|
|
|
|
// Presumably writes the textual form of the given IPv4 address to the parser's
// output — confirm the destination and format in the definition.
void serializeIPv4(IPv4Address);
|
2017-02-14 07:43:28 +00:00
|
|
|
// Forward declaration; enumerators are defined elsewhere. Used as the error type
// in the Expected returned by parseIPv4Host().
enum class IPv4ParsingError;
|
|
|
|
// Forward declaration; enumerators are defined elsewhere. Used as the error type
// in the Expected returned by parseIPv4Piece().
enum class IPv4PieceParsingError;
|
|
|
|
// Parses an IPv4 host. Returns the address on success or an IPv4ParsingError otherwise.
// The two iterators may have different character types. NOTE(review): the first
// (const reference) iterator is presumably only used when reporting a syntax
// violation, and the second is the input actually parsed — confirm.
template<typename CharacterTypeForSyntaxViolation, typename CharacterType> Expected<IPv4Address, IPv4ParsingError> parseIPv4Host(const CodePointIterator<CharacterTypeForSyntaxViolation>&, CodePointIterator<CharacterType>);
|
|
|
|
// Parses one numeric piece of an IPv4 address. Returns its value on success or an
// IPv4PieceParsingError otherwise. The iterator is taken by non-const reference, so
// the caller's iterator can be advanced; |syntaxViolation| is an output flag.
// NOTE(review): presumably set when the piece is valid but non-canonical — confirm.
template<typename CharacterType> Expected<uint32_t, URLParser::IPv4PieceParsingError> parseIPv4Piece(CodePointIterator<CharacterType>&, bool& syntaxViolation);
|
2016-09-23 23:40:46 +00:00
|
|
|
// An IPv6 address held as eight 16-bit pieces.
using IPv6Address = std::array<uint16_t, 8>;
|
2021-05-30 16:11:40 +00:00
|
|
|
// Parses an IPv6 host from the iterator. Returns the address, or std::nullopt
// (presumably when the input is not a valid IPv6 address — confirm).
template<typename CharacterType> std::optional<IPv6Address> parseIPv6Host(CodePointIterator<CharacterType>);
|
|
|
|
// Parses one numeric piece of an IPv4 address embedded in an IPv6 address.
// The iterator is taken by non-const reference, so the caller's iterator can be
// advanced. Returns std::nullopt presumably on invalid input — confirm.
template<typename CharacterType> std::optional<uint32_t> parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>&);
|
|
|
|
// Parses a complete IPv4 address embedded in an IPv6 address. The iterator is taken
// by value, so the caller's iterator is not modified. Returns std::nullopt
// presumably on invalid input — confirm.
template<typename CharacterType> std::optional<IPv4Address> parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType>);
|
2016-09-23 23:40:46 +00:00
|
|
|
// Presumably writes the textual form of a single 16-bit IPv6 piece to the
// parser's output — confirm the destination and format in the definition.
void serializeIPv6Piece(uint16_t piece);
|
2016-09-29 18:18:04 +00:00
|
|
|
// Presumably writes the textual form of the given IPv6 address to the parser's
// output — confirm zero-compression and bracket handling in the definition.
void serializeIPv6(IPv6Address);
|
2016-09-23 23:40:46 +00:00
|
|
|
|
2016-08-28 05:55:17 +00:00
|
|
|
// Forward declaration; enumerators are defined elsewhere. Identifies a component
// of a URL for copyURLPartsUntil() and urlLengthUntilPart().
enum class URLPart;
|
URLParser should use TextEncoding through an abstract class
https://bugs.webkit.org/show_bug.cgi?id=190027
Reviewed by Andy Estes.
Source/WebCore:
URLParser uses TextEncoding for one call to encode, which is only used for encoding the query of URLs in documents with non-UTF encodings.
There are 3 call sites that specify the TextEncoding to use from the Document, and even those call sites use a UTF encoding most of the time.
All other URL parsing is done using a well-optimized path which assumes UTF-8 encoding and uses macros from ICU headers, not a TextEncoding.
Moving the logic in this way breaks URL and URLParser's dependency on TextEncoding, which makes it possible to use in a lower-level project
without also moving TextEncoding, TextCodec, TextCodecICU, ThreadGlobalData, and the rest of WebCore and JavaScriptCore.
There is no observable change in behavior. There is now one virtual function call in a code path in URLParser that is not performance-sensitive,
and TextEncodings now have a vtable, which uses a few more bytes of memory total for WebKit.
* css/parser/CSSParserContext.h:
(WebCore::CSSParserContext::completeURL const):
* css/parser/CSSParserIdioms.cpp:
(WebCore::completeURL):
* dom/Document.cpp:
(WebCore::Document::completeURL const):
* html/HTMLBaseElement.cpp:
(WebCore::HTMLBaseElement::href const):
Move the call to encodingForFormSubmission from the URL constructor to the 3 call sites that specify the encoding from the Document.
* loader/FormSubmission.cpp:
(WebCore::FormSubmission::create):
* loader/TextResourceDecoder.cpp:
(WebCore::TextResourceDecoder::encodingForURLParsing):
* loader/TextResourceDecoder.h:
* platform/URL.cpp:
(WebCore::URL::URL):
* platform/URL.h:
(WebCore::URLTextEncoding::~URLTextEncoding):
* platform/URLParser.cpp:
(WebCore::URLParser::encodeNonUTF8Query):
(WebCore::URLParser::copyURLPartsUntil):
(WebCore::URLParser::URLParser):
(WebCore::URLParser::parse):
(WebCore::URLParser::encodeQuery): Deleted.
A pointer replaces the boolean isUTF8Encoding and the TextEncoding& which had a default value of UTF8Encoding.
Now the pointer being null means that we use UTF8, and the pointer being non-null means we use that encoding.
* platform/URLParser.h:
(WebCore::URLParser::URLParser):
* platform/text/TextEncoding.cpp:
(WebCore::UTF7Encoding):
(WebCore::TextEncoding::encodingForFormSubmissionOrURLParsing const):
(WebCore::ASCIIEncoding):
(WebCore::Latin1Encoding):
(WebCore::UTF16BigEndianEncoding):
(WebCore::UTF16LittleEndianEncoding):
(WebCore::UTF8Encoding):
(WebCore::WindowsLatin1Encoding):
(WebCore::TextEncoding::encodingForFormSubmission const): Deleted.
Use NeverDestroyed because TextEncoding now has a virtual destructor.
* platform/text/TextEncoding.h:
Rename encodingForFormSubmission to encodingForFormSubmissionOrURLParsing to make it more clear that we are intentionally using it for both.
Tools:
* TestWebKitAPI/Tests/WebCore/URLParser.cpp:
(TestWebKitAPI::checkURL):
(TestWebKitAPI::TEST_F):
Canonical link: https://commits.webkit.org/205005@main
git-svn-id: https://svn.webkit.org/repository/webkit/trunk@236565 268f45cc-cd09-0410-ab3c-d52691b4dbfc
2018-09-27 20:05:52 +00:00
|
|
|
// Copies components of |base| up to the given URLPart.
// NOTE(review): whether the named part itself is included is not visible here.
// The encoding pointer follows the parser's convention: null means UTF-8, non-null
// means use that encoding. It is passed by reference, so this function is able to
// reassign the caller's pointer — confirm when it does.
template<typename CharacterType> void copyURLPartsUntil(const URL& base, URLPart, const CodePointIterator<CharacterType>&, const URLTextEncoding*&);
|
2016-08-28 05:55:17 +00:00
|
|
|
// Static helper (uses no parser state). Presumably returns the length of the given
// URL's string up to the given URLPart — confirm inclusivity in the definition.
static size_t urlLengthUntilPart(const URL&, URLPart);
|
2016-09-01 20:33:31 +00:00
|
|
|
// Presumably removes the last segment of the path built so far (as when handling
// a ".." segment) — confirm against the definition.
void popPath();
|
2017-02-24 16:45:32 +00:00
|
|
|
// Presumably decides whether popPath() may remove a segment.
// NOTE(review): the meaning of the unsigned argument is not visible here; it is
// likely a position in the output — confirm.
bool shouldPopPath(unsigned);
|
2016-08-11 19:09:39 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
}
|