356 lines
11 KiB
C++
356 lines
11 KiB
C++
/*
|
|
* Copyright (C) 2008, 2015 Apple Inc. All Rights Reserved.
|
|
* Copyright (C) 2010 Google, Inc. All Rights Reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "HTMLParserOptions.h"
|
|
#include "HTMLToken.h"
|
|
#include "InputStreamPreprocessor.h"
|
|
|
|
namespace WebCore {
|
|
|
|
class SegmentedString;
|
|
|
|
class HTMLTokenizer {
|
|
public:
|
|
explicit HTMLTokenizer(const HTMLParserOptions& = HTMLParserOptions());
|
|
|
|
// If we can't parse a whole token, this returns null.
|
|
class TokenPtr;
|
|
TokenPtr nextToken(SegmentedString&);
|
|
|
|
// Used by HTMLSourceTracker.
|
|
void setTokenAttributeBaseOffset(unsigned);
|
|
|
|
// Returns a copy of any characters buffered internally by the tokenizer.
|
|
// The tokenizer buffers characters when searching for the </script> token that terminates a script element.
|
|
String bufferedCharacters() const;
|
|
size_t numberOfBufferedCharacters() const;
|
|
|
|
// Updates the tokenizer's state according to the given tag name. This is an approximation of how the tree
|
|
// builder would update the tokenizer's state. This method is useful for approximating HTML tokenization.
|
|
// To get exactly the correct tokenization, you need the real tree builder.
|
|
//
|
|
// The main failures in the approximation are as follows:
|
|
//
|
|
// * The first set of character tokens emitted for a <pre> element might contain an extra leading newline.
|
|
// * The replacement of U+0000 with U+FFFD will not be sensitive to the tree builder's insertion mode.
|
|
// * CDATA sections in foreign content will be tokenized as bogus comments instead of as character tokens.
|
|
//
|
|
// This approximation is also the algorithm called for when parsing an HTML fragment.
|
|
// https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
|
|
void updateStateFor(const AtomString& tagName);
|
|
|
|
void setForceNullCharacterReplacement(bool);
|
|
|
|
bool shouldAllowCDATA() const;
|
|
void setShouldAllowCDATA(bool);
|
|
|
|
bool isInDataState() const;
|
|
|
|
void setDataState();
|
|
void setPLAINTEXTState();
|
|
void setRAWTEXTState();
|
|
void setRCDATAState();
|
|
void setScriptDataState();
|
|
|
|
bool neverSkipNullCharacters() const;
|
|
|
|
private:
|
|
enum State {
|
|
DataState,
|
|
CharacterReferenceInDataState,
|
|
RCDATAState,
|
|
CharacterReferenceInRCDATAState,
|
|
RAWTEXTState,
|
|
ScriptDataState,
|
|
PLAINTEXTState,
|
|
TagOpenState,
|
|
EndTagOpenState,
|
|
TagNameState,
|
|
RCDATALessThanSignState,
|
|
RCDATAEndTagOpenState,
|
|
RCDATAEndTagNameState,
|
|
RAWTEXTLessThanSignState,
|
|
RAWTEXTEndTagOpenState,
|
|
RAWTEXTEndTagNameState,
|
|
ScriptDataLessThanSignState,
|
|
ScriptDataEndTagOpenState,
|
|
ScriptDataEndTagNameState,
|
|
ScriptDataEscapeStartState,
|
|
ScriptDataEscapeStartDashState,
|
|
ScriptDataEscapedState,
|
|
ScriptDataEscapedDashState,
|
|
ScriptDataEscapedDashDashState,
|
|
ScriptDataEscapedLessThanSignState,
|
|
ScriptDataEscapedEndTagOpenState,
|
|
ScriptDataEscapedEndTagNameState,
|
|
ScriptDataDoubleEscapeStartState,
|
|
ScriptDataDoubleEscapedState,
|
|
ScriptDataDoubleEscapedDashState,
|
|
ScriptDataDoubleEscapedDashDashState,
|
|
ScriptDataDoubleEscapedLessThanSignState,
|
|
ScriptDataDoubleEscapeEndState,
|
|
BeforeAttributeNameState,
|
|
AttributeNameState,
|
|
AfterAttributeNameState,
|
|
BeforeAttributeValueState,
|
|
AttributeValueDoubleQuotedState,
|
|
AttributeValueSingleQuotedState,
|
|
AttributeValueUnquotedState,
|
|
CharacterReferenceInAttributeValueState,
|
|
AfterAttributeValueQuotedState,
|
|
SelfClosingStartTagState,
|
|
BogusCommentState,
|
|
ContinueBogusCommentState, // Not in the HTML spec, used internally to track whether we started the bogus comment token.
|
|
MarkupDeclarationOpenState,
|
|
CommentStartState,
|
|
CommentStartDashState,
|
|
CommentState,
|
|
CommentEndDashState,
|
|
CommentEndState,
|
|
CommentEndBangState,
|
|
DOCTYPEState,
|
|
BeforeDOCTYPENameState,
|
|
DOCTYPENameState,
|
|
AfterDOCTYPENameState,
|
|
AfterDOCTYPEPublicKeywordState,
|
|
BeforeDOCTYPEPublicIdentifierState,
|
|
DOCTYPEPublicIdentifierDoubleQuotedState,
|
|
DOCTYPEPublicIdentifierSingleQuotedState,
|
|
AfterDOCTYPEPublicIdentifierState,
|
|
BetweenDOCTYPEPublicAndSystemIdentifiersState,
|
|
AfterDOCTYPESystemKeywordState,
|
|
BeforeDOCTYPESystemIdentifierState,
|
|
DOCTYPESystemIdentifierDoubleQuotedState,
|
|
DOCTYPESystemIdentifierSingleQuotedState,
|
|
AfterDOCTYPESystemIdentifierState,
|
|
BogusDOCTYPEState,
|
|
CDATASectionState,
|
|
// These CDATA states are not in the HTML5 spec, but we use them internally.
|
|
CDATASectionRightSquareBracketState,
|
|
CDATASectionDoubleRightSquareBracketState,
|
|
};
|
|
|
|
bool processToken(SegmentedString&);
|
|
bool processEntity(SegmentedString&);
|
|
|
|
void parseError();
|
|
|
|
void bufferASCIICharacter(UChar);
|
|
void bufferCharacter(UChar);
|
|
|
|
bool emitAndResumeInDataState(SegmentedString&);
|
|
bool emitAndReconsumeInDataState();
|
|
bool emitEndOfFile(SegmentedString&);
|
|
|
|
// Return true if we wil emit a character token before dealing with the buffered end tag.
|
|
void flushBufferedEndTag();
|
|
bool commitToPartialEndTag(SegmentedString&, UChar, State);
|
|
bool commitToCompleteEndTag(SegmentedString&);
|
|
|
|
void appendToTemporaryBuffer(UChar);
|
|
bool temporaryBufferIs(const char*);
|
|
|
|
// Sometimes we speculatively consume input characters and we don't know whether they represent
|
|
// end tags or RCDATA, etc. These functions help manage these state.
|
|
bool inEndTagBufferingState() const;
|
|
void appendToPossibleEndTag(UChar);
|
|
void saveEndTagNameIfNeeded();
|
|
bool isAppropriateEndTag() const;
|
|
|
|
bool haveBufferedCharacterToken() const;
|
|
|
|
static bool isNullCharacterSkippingState(State);
|
|
|
|
State m_state { DataState };
|
|
bool m_forceNullCharacterReplacement { false };
|
|
bool m_shouldAllowCDATA { false };
|
|
|
|
mutable HTMLToken m_token;
|
|
|
|
// https://html.spec.whatwg.org/#additional-allowed-character
|
|
UChar m_additionalAllowedCharacter { 0 };
|
|
|
|
// https://html.spec.whatwg.org/#preprocessing-the-input-stream
|
|
InputStreamPreprocessor<HTMLTokenizer> m_preprocessor;
|
|
|
|
Vector<UChar, 32> m_appropriateEndTagName;
|
|
|
|
// https://html.spec.whatwg.org/#temporary-buffer
|
|
Vector<LChar, 32> m_temporaryBuffer;
|
|
|
|
// We occasionally want to emit both a character token and an end tag
|
|
// token (e.g., when lexing script). We buffer the name of the end tag
|
|
// token here so we remember it next time we re-enter the tokenizer.
|
|
Vector<LChar, 32> m_bufferedEndTagName;
|
|
|
|
const HTMLParserOptions m_options;
|
|
};
|
|
|
|
class HTMLTokenizer::TokenPtr {
|
|
public:
|
|
TokenPtr();
|
|
~TokenPtr();
|
|
|
|
TokenPtr(TokenPtr&&);
|
|
TokenPtr& operator=(TokenPtr&&) = delete;
|
|
|
|
void clear();
|
|
|
|
operator bool() const;
|
|
|
|
HTMLToken& operator*() const;
|
|
HTMLToken* operator->() const;
|
|
|
|
private:
|
|
friend class HTMLTokenizer;
|
|
explicit TokenPtr(HTMLToken*);
|
|
|
|
HTMLToken* m_token { nullptr };
|
|
};
|
|
|
|
inline HTMLTokenizer::TokenPtr::TokenPtr()
|
|
{
|
|
}
|
|
|
|
inline HTMLTokenizer::TokenPtr::TokenPtr(HTMLToken* token)
|
|
: m_token(token)
|
|
{
|
|
}
|
|
|
|
inline HTMLTokenizer::TokenPtr::~TokenPtr()
|
|
{
|
|
if (m_token)
|
|
m_token->clear();
|
|
}
|
|
|
|
inline HTMLTokenizer::TokenPtr::TokenPtr(TokenPtr&& other)
|
|
: m_token(other.m_token)
|
|
{
|
|
other.m_token = nullptr;
|
|
}
|
|
|
|
inline void HTMLTokenizer::TokenPtr::clear()
|
|
{
|
|
if (m_token) {
|
|
m_token->clear();
|
|
m_token = nullptr;
|
|
}
|
|
}
|
|
|
|
inline HTMLTokenizer::TokenPtr::operator bool() const
|
|
{
|
|
return m_token;
|
|
}
|
|
|
|
inline HTMLToken& HTMLTokenizer::TokenPtr::operator*() const
|
|
{
|
|
ASSERT(m_token);
|
|
return *m_token;
|
|
}
|
|
|
|
inline HTMLToken* HTMLTokenizer::TokenPtr::operator->() const
|
|
{
|
|
ASSERT(m_token);
|
|
return m_token;
|
|
}
|
|
|
|
inline HTMLTokenizer::TokenPtr HTMLTokenizer::nextToken(SegmentedString& source)
|
|
{
|
|
return TokenPtr(processToken(source) ? &m_token : nullptr);
|
|
}
|
|
|
|
inline void HTMLTokenizer::setTokenAttributeBaseOffset(unsigned offset)
|
|
{
|
|
m_token.setAttributeBaseOffset(offset);
|
|
}
|
|
|
|
inline size_t HTMLTokenizer::numberOfBufferedCharacters() const
|
|
{
|
|
// Notice that we add 2 to the length of the m_temporaryBuffer to
|
|
// account for the "</" characters, which are effecitvely buffered in
|
|
// the tokenizer's state machine.
|
|
return m_temporaryBuffer.size() ? m_temporaryBuffer.size() + 2 : 0;
|
|
}
|
|
|
|
inline void HTMLTokenizer::setForceNullCharacterReplacement(bool value)
|
|
{
|
|
m_forceNullCharacterReplacement = value;
|
|
}
|
|
|
|
inline bool HTMLTokenizer::shouldAllowCDATA() const
|
|
{
|
|
return m_shouldAllowCDATA;
|
|
}
|
|
|
|
inline void HTMLTokenizer::setShouldAllowCDATA(bool value)
|
|
{
|
|
m_shouldAllowCDATA = value;
|
|
}
|
|
|
|
inline bool HTMLTokenizer::isInDataState() const
|
|
{
|
|
return m_state == DataState;
|
|
}
|
|
|
|
inline void HTMLTokenizer::setDataState()
|
|
{
|
|
m_state = DataState;
|
|
}
|
|
|
|
inline void HTMLTokenizer::setPLAINTEXTState()
|
|
{
|
|
m_state = PLAINTEXTState;
|
|
}
|
|
|
|
inline void HTMLTokenizer::setRAWTEXTState()
|
|
{
|
|
m_state = RAWTEXTState;
|
|
}
|
|
|
|
inline void HTMLTokenizer::setRCDATAState()
|
|
{
|
|
m_state = RCDATAState;
|
|
}
|
|
|
|
inline void HTMLTokenizer::setScriptDataState()
|
|
{
|
|
m_state = ScriptDataState;
|
|
}
|
|
|
|
inline bool HTMLTokenizer::isNullCharacterSkippingState(State state)
|
|
{
|
|
return state == DataState || state == RCDATAState || state == RAWTEXTState;
|
|
}
|
|
|
|
inline bool HTMLTokenizer::neverSkipNullCharacters() const
|
|
{
|
|
return m_forceNullCharacterReplacement;
|
|
}
|
|
|
|
} // namespace WebCore
|