193 lines
8.4 KiB
C++
193 lines
8.4 KiB
C++
/*
|
|
* Copyright (C) 2011 Daniel Bates (dbates@intudata.com). All Rights Reserved.
|
|
* Copyright (c) 2012 Google, inc. All Rights Reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* 3. Neither the name of Google Inc. nor the names of its
|
|
* contributors may be used to endorse or promote products derived from
|
|
* this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "TextEncoding.h"
|
|
#include <wtf/ASCIICType.h>
|
|
#include <wtf/Assertions.h>
|
|
#include <wtf/text/StringBuilder.h>
|
|
|
|
namespace WebCore {
|
|
|
|
// See <http://en.wikipedia.org/wiki/Percent-encoding#Non-standard_implementations>.
|
|
struct Unicode16BitEscapeSequence {
|
|
enum { sequenceSize = 6 }; // e.g. %u26C4
|
|
static size_t findInString(StringView string, size_t startPosition) { return string.find(StringView("%u"), startPosition); }
|
|
static size_t findEndOfRun(StringView string, size_t startPosition, size_t endPosition)
|
|
{
|
|
size_t runEnd = startPosition;
|
|
while (endPosition - runEnd >= sequenceSize && string[runEnd] == '%' && string[runEnd + 1] == 'u'
|
|
&& isASCIIHexDigit(string[runEnd + 2]) && isASCIIHexDigit(string[runEnd + 3])
|
|
&& isASCIIHexDigit(string[runEnd + 4]) && isASCIIHexDigit(string[runEnd + 5])) {
|
|
runEnd += sequenceSize;
|
|
}
|
|
return runEnd;
|
|
}
|
|
static String decodeRun(StringView run, const TextEncoding&)
|
|
{
|
|
// Each %u-escape sequence represents a UTF-16 code unit.
|
|
// See <http://www.w3.org/International/iri-edit/draft-duerst-iri.html#anchor29>.
|
|
// For 16-bit escape sequences, we know that findEndOfRun() has given us a contiguous run of sequences
|
|
// without any intervening characters, so decode the run without additional checks.
|
|
auto numberOfSequences = run.length() / sequenceSize;
|
|
StringBuilder builder;
|
|
builder.reserveCapacity(numberOfSequences);
|
|
while (numberOfSequences--) {
|
|
UChar codeUnit = (toASCIIHexValue(run[2]) << 12) | (toASCIIHexValue(run[3]) << 8) | (toASCIIHexValue(run[4]) << 4) | toASCIIHexValue(run[5]);
|
|
builder.append(codeUnit);
|
|
run = run.substring(sequenceSize);
|
|
}
|
|
return builder.toString();
|
|
}
|
|
};
|
|
|
|
struct URLEscapeSequence {
|
|
enum { sequenceSize = 3 }; // e.g. %41
|
|
static size_t findInString(StringView string, size_t startPosition) { return string.find('%', startPosition); }
|
|
static size_t findEndOfRun(StringView string, size_t startPosition, size_t endPosition)
|
|
{
|
|
// Make the simplifying assumption that supported encodings may have up to two unescaped characters
|
|
// in the range 0x40 - 0x7F as the trailing bytes of their sequences which need to be passed into the
|
|
// decoder as part of the run. In other words, we end the run at the first value outside of the
|
|
// 0x40 - 0x7F range, after two values in this range, or at a %-sign that does not introduce a valid
|
|
// escape sequence.
|
|
size_t runEnd = startPosition;
|
|
int numberOfTrailingCharacters = 0;
|
|
while (runEnd < endPosition) {
|
|
if (string[runEnd] == '%') {
|
|
if (endPosition - runEnd >= sequenceSize && isASCIIHexDigit(string[runEnd + 1]) && isASCIIHexDigit(string[runEnd + 2])) {
|
|
runEnd += sequenceSize;
|
|
numberOfTrailingCharacters = 0;
|
|
} else
|
|
break;
|
|
} else if (string[runEnd] >= 0x40 && string[runEnd] <= 0x7F && numberOfTrailingCharacters < 2) {
|
|
runEnd += 1;
|
|
numberOfTrailingCharacters += 1;
|
|
} else
|
|
break;
|
|
}
|
|
return runEnd;
|
|
}
|
|
|
|
static Vector<char, 512> decodeRun(StringView run)
|
|
{
|
|
// For URL escape sequences, we know that findEndOfRun() has given us a run where every %-sign introduces
|
|
// a valid escape sequence, but there may be characters between the sequences.
|
|
Vector<char, 512> buffer;
|
|
buffer.grow(run.length()); // Unescaping hex sequences only makes the length smaller.
|
|
char* p = buffer.data();
|
|
while (!run.isEmpty()) {
|
|
if (run[0] == '%') {
|
|
*p++ = (toASCIIHexValue(run[1]) << 4) | toASCIIHexValue(run[2]);
|
|
run = run.substring(sequenceSize);
|
|
} else {
|
|
*p++ = run[0];
|
|
run = run.substring(1);
|
|
}
|
|
}
|
|
ASSERT(buffer.size() >= static_cast<size_t>(p - buffer.data())); // Prove buffer not overrun.
|
|
buffer.shrink(p - buffer.data());
|
|
return buffer;
|
|
}
|
|
|
|
static String decodeRun(StringView run, const TextEncoding& encoding)
|
|
{
|
|
auto buffer = decodeRun(run);
|
|
if (!encoding.isValid())
|
|
return UTF8Encoding().decode(buffer.data(), buffer.size());
|
|
return encoding.decode(buffer.data(), buffer.size());
|
|
}
|
|
};
|
|
|
|
template<typename EscapeSequence>
|
|
String decodeEscapeSequences(StringView string, const TextEncoding& encoding)
|
|
{
|
|
StringBuilder result;
|
|
size_t length = string.length();
|
|
size_t decodedPosition = 0;
|
|
size_t searchPosition = 0;
|
|
size_t encodedRunPosition;
|
|
while ((encodedRunPosition = EscapeSequence::findInString(string, searchPosition)) != notFound) {
|
|
size_t encodedRunEnd = EscapeSequence::findEndOfRun(string, encodedRunPosition, length);
|
|
searchPosition = encodedRunEnd;
|
|
if (encodedRunEnd == encodedRunPosition) {
|
|
++searchPosition;
|
|
continue;
|
|
}
|
|
|
|
String decoded = EscapeSequence::decodeRun(string.substring(encodedRunPosition, encodedRunEnd - encodedRunPosition), encoding);
|
|
if (decoded.isEmpty())
|
|
continue;
|
|
|
|
result.append(string.substring(decodedPosition, encodedRunPosition - decodedPosition));
|
|
result.append(decoded);
|
|
decodedPosition = encodedRunEnd;
|
|
}
|
|
result.append(string.substring(decodedPosition, length - decodedPosition));
|
|
return result.toString();
|
|
}
|
|
|
|
// Converts |string| to a byte vector: text outside percent-escape runs is encoded
// with |encoding| (using UnencodableHandling::URLEncodedEntities), while the bytes
// spelled out by "%XX" runs are appended as-is. |encoding| is asserted to be valid.
inline Vector<uint8_t> decodeURLEscapeSequencesAsData(StringView string, const TextEncoding& encoding)
{
    ASSERT(encoding.isValid());

    Vector<uint8_t> bytes;
    size_t copiedUpTo = 0; // Everything before this index is already in |bytes|.
    size_t scanFrom = 0; // Where the next search for '%' starts.

    for (;;) {
        const size_t runStart = URLEscapeSequence::findInString(string, scanFrom);
        const bool foundRun = runStart != notFound;
        size_t runEnd = 0;
        if (foundRun) {
            runEnd = URLEscapeSequence::findEndOfRun(string, runStart, string.length());
            if (runEnd == runStart) {
                // This '%' does not begin a valid sequence; step over it and keep looking.
                scanFrom = runStart + 1;
                continue;
            }
            scanFrom = runEnd;
        }

        // Strings are encoded as requested.
        // NOTE(review): when no run was found, runStart is notFound and the requested length
        // overshoots the string; this relies on StringView::substring() clamping to the
        // remaining text -- confirm against StringView.
        bytes.appendVector(encoding.encode(string.substring(copiedUpTo, runStart - copiedUpTo), UnencodableHandling::URLEncodedEntities));

        if (!foundRun)
            return bytes;

        // Bytes go through as-is.
        auto rawBytes = URLEscapeSequence::decodeRun(string.substring(runStart, runEnd - runStart));
        ASSERT(!rawBytes.isEmpty());
        bytes.appendVector(rawBytes);

        copiedUpTo = runEnd;
    }
}
|
|
|
|
} // namespace WebCore
|
|
|