/*
 * Copyright (C) 2007-2021 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#pragma once

#include <wtf/Assertions.h>
#include <wtf/text/LChar.h>

// The behavior of many of the functions in the <ctype.h> header is dependent
// on the current locale. But in the WebKit project, all uses of those functions
// are in code processing something that's not locale-specific. These equivalents
// for some of the <ctype.h> functions are named more explicitly, not dependent
// on the C library locale, and we should also optimize them as needed.

// All functions return false or leave the character unchanged if passed a character
// that is outside the range 0-7F. So they can be used on Unicode strings or
// characters if the intent is to do processing only if the character is ASCII.

namespace WTF { template constexpr bool isASCII(CharacterType); template constexpr bool isASCIIAlpha(CharacterType); template constexpr bool isASCIIAlphanumeric(CharacterType); template constexpr bool isASCIIBinaryDigit(CharacterType); template constexpr bool isASCIIDigit(CharacterType); template constexpr bool isASCIIDigitOrPunctuation(CharacterType); template constexpr bool isASCIIHexDigit(CharacterType); template constexpr bool isASCIILower(CharacterType); template constexpr bool isASCIIOctalDigit(CharacterType); template constexpr bool isASCIIPrintable(CharacterType); template constexpr bool isASCIISpace(CharacterType); template constexpr bool isASCIIUpper(CharacterType); template CharacterType toASCIILower(CharacterType); template CharacterType toASCIIUpper(CharacterType); template uint8_t toASCIIHexValue(CharacterType); template uint8_t toASCIIHexValue(CharacterType firstCharacter, CharacterType secondCharacter); constexpr char lowerNibbleToASCIIHexDigit(uint8_t); constexpr char upperNibbleToASCIIHexDigit(uint8_t); constexpr char lowerNibbleToLowercaseASCIIHexDigit(uint8_t); constexpr char upperNibbleToLowercaseASCIIHexDigit(uint8_t); template constexpr bool isASCIIAlphaCaselessEqual(CharacterType, char expectedASCIILowercaseLetter); // The toASCIILowerUnchecked function can be used for comparing any input character // to a lowercase English character. The isASCIIAlphaCaselessEqual function should // be used for regular comparison of ASCII alpha characters, but switch statements // in the CSS tokenizer, for example, instead make direct use toASCIILowerUnchecked. 
template constexpr CharacterType toASCIILowerUnchecked(CharacterType); extern WTF_EXPORT_PRIVATE const unsigned char asciiCaseFoldTable[256]; template constexpr bool isASCII(CharacterType character) { return !(character & ~0x7F); } template constexpr bool isASCIILower(CharacterType character) { return character >= 'a' && character <= 'z'; } template constexpr CharacterType toASCIILowerUnchecked(CharacterType character) { // This function can be used for comparing any input character // to a lowercase English character. The isASCIIAlphaCaselessEqual // below should be used for regular comparison of ASCII alpha // characters, but switch statements in CSS tokenizer instead make // direct use of this function. return character | 0x20; } template constexpr bool isASCIIAlpha(CharacterType character) { return isASCIILower(toASCIILowerUnchecked(character)); } template constexpr bool isASCIIDigit(CharacterType character) { return character >= '0' && character <= '9'; } template constexpr bool isASCIIAlphanumeric(CharacterType character) { return isASCIIDigit(character) || isASCIIAlpha(character); } template constexpr bool isASCIIHexDigit(CharacterType character) { return isASCIIDigit(character) || (toASCIILowerUnchecked(character) >= 'a' && toASCIILowerUnchecked(character) <= 'f'); } template constexpr bool isASCIIBinaryDigit(CharacterType character) { return character == '0' || character == '1'; } template constexpr bool isASCIIOctalDigit(CharacterType character) { return character >= '0' && character <= '7'; } template constexpr bool isASCIIPrintable(CharacterType character) { return character >= ' ' && character <= '~'; } /* Statistics from a run of Apple's page load test for callers of isASCIISpace: character count --------- ----- non-spaces 689383 20 space 294720 0A \n 89059 09 \t 28320 0D \r 0 0C \f 0 0B \v 0 Because of those, we first check to quickly return false for non-control characters, then check for space itself to quickly return true for that case, then do 
the rest. */ template constexpr bool isASCIISpace(CharacterType character) { return character <= ' ' && (character == ' ' || (character <= 0xD && character >= 0x9)); } template constexpr bool isASCIIUpper(CharacterType character) { return character >= 'A' && character <= 'Z'; } template inline CharacterType toASCIILower(CharacterType character) { return character | (isASCIIUpper(character) << 5); } template<> inline char toASCIILower(char character) { return static_cast(asciiCaseFoldTable[static_cast(character)]); } template<> inline LChar toASCIILower(LChar character) { return asciiCaseFoldTable[character]; } template inline CharacterType toASCIIUpper(CharacterType character) { return character & ~(isASCIILower(character) << 5); } template inline uint8_t toASCIIHexValue(CharacterType character) { ASSERT(isASCIIHexDigit(character)); return character < 'A' ? character - '0' : (character - 'A' + 10) & 0xF; } template inline uint8_t toASCIIHexValue(CharacterType firstCharacter, CharacterType secondCharacter) { return toASCIIHexValue(firstCharacter) << 4 | toASCIIHexValue(secondCharacter); } constexpr char lowerNibbleToASCIIHexDigit(uint8_t value) { uint8_t nibble = value & 0xF; return nibble + (nibble < 10 ? '0' : 'A' - 10); } constexpr char upperNibbleToASCIIHexDigit(uint8_t value) { uint8_t nibble = value >> 4; return nibble + (nibble < 10 ? '0' : 'A' - 10); } constexpr char lowerNibbleToLowercaseASCIIHexDigit(uint8_t value) { uint8_t nibble = value & 0xF; return nibble + (nibble < 10 ? '0' : 'a' - 10); } constexpr char upperNibbleToLowercaseASCIIHexDigit(uint8_t value) { uint8_t nibble = value >> 4; return nibble + (nibble < 10 ? 
'0' : 'a' - 10); } template constexpr bool isASCIIAlphaCaselessEqual(CharacterType inputCharacter, char expectedASCIILowercaseLetter) { // Name of this argument says this must be a lowercase letter, but it can actually be: // - a lowercase letter // - a numeric digit // - a space // - punctuation in the range 0x21-0x3F, including "-", "/", and "+" // It cannot be: // - an uppercase letter // - a non-ASCII character // - other punctuation, such as underscore and backslash // - a control character such as "\n" // FIXME: Would be nice to make both the function name and expectedASCIILowercaseLetter argument name clearer. ASSERT(toASCIILowerUnchecked(expectedASCIILowercaseLetter) == expectedASCIILowercaseLetter); return LIKELY(toASCIILowerUnchecked(inputCharacter) == expectedASCIILowercaseLetter); } template constexpr bool isASCIIDigitOrPunctuation(CharacterType character) { return (character >= '!' && character <= '@') || (character >= '[' && character <= '`') || (character >= '{' && character <= '~'); } } using WTF::isASCII; using WTF::isASCIIAlpha; using WTF::isASCIIAlphaCaselessEqual; using WTF::isASCIIAlphanumeric; using WTF::isASCIIBinaryDigit; using WTF::isASCIIDigit; using WTF::isASCIIDigitOrPunctuation; using WTF::isASCIIHexDigit; using WTF::isASCIILower; using WTF::isASCIIOctalDigit; using WTF::isASCIIPrintable; using WTF::isASCIISpace; using WTF::isASCIIUpper; using WTF::lowerNibbleToASCIIHexDigit; using WTF::lowerNibbleToLowercaseASCIIHexDigit; using WTF::toASCIIHexValue; using WTF::toASCIILower; using WTF::toASCIILowerUnchecked; using WTF::toASCIIUpper; using WTF::upperNibbleToASCIIHexDigit; using WTF::upperNibbleToLowercaseASCIIHexDigit;