|
|
from __future__ import absolute_import, division, unicode_literals |
|
|
|
|
|
from pip._vendor.six import unichr as chr |
|
|
|
|
|
from collections import deque, OrderedDict |
|
|
from sys import version_info |
|
|
|
|
|
from .constants import spaceCharacters |
|
|
from .constants import entities |
|
|
from .constants import asciiLetters, asciiUpper2Lower |
|
|
from .constants import digits, hexDigits, EOF |
|
|
from .constants import tokenTypes, tagTokenTypes |
|
|
from .constants import replacementCharacters |
|
|
|
|
|
from ._inputstream import HTMLInputStream |
|
|
|
|
|
from ._trie import Trie |
|
|
|
|
|
# Shared trie over all named character references; used by consumeEntity()
# for longest-prefix matching (handles e.g. "&noti" -> "&not" + "i").
entitiesTrie = Trie(entities)
|
|
|
|
|
# Attributes must preserve insertion order. Plain dicts guarantee that from
# CPython 3.7 onward; earlier interpreters need collections.OrderedDict.
attributeMap = dict if version_info >= (3, 7) else OrderedDict
|
|
|
|
|
|
|
|
class HTMLTokenizer(object): |
|
|
""" This class takes care of tokenizing HTML. |
|
|
|
|
|
* self.currentToken |
|
|
Holds the token that is currently being processed. |
|
|
|
|
|
* self.state |
|
|
Holds a reference to the method to be invoked... XXX |
|
|
|
|
|
* self.stream |
|
|
Points to HTMLInputStream object. |
|
|
""" |
|
|
|
|
|
    def __init__(self, stream, parser=None, **kwargs):
        # stream: anything HTMLInputStream accepts; **kwargs are forwarded to
        # HTMLInputStream unchanged.  parser: optional owning parser, kept only
        # as a back-reference.
        self.stream = HTMLInputStream(stream, **kwargs)
        self.parser = parser

        # Set up the initial tokenizer state.
        self.escapeFlag = False
        self.lastFourChars = []
        # self.state is always a bound state method; it returns True to keep
        # tokenizing and False at end of file.
        self.state = self.dataState
        self.escape = False

        # The token currently being built (a dict), if any.
        self.currentToken = None
        super(HTMLTokenizer, self).__init__()
|
|
|
|
|
    def __iter__(self):
        """ This is where the magic happens.

        We do our usual processing through the states and when we have a token
        to return we yield the token, which pauses processing until the next
        token is requested.
        """
        self.tokenQueue = deque([])
        # Run one state transition per loop iteration; a state method returns
        # False only when EOF has been consumed.
        while self.state():
            # Surface decoding/stream errors first so they precede the tokens
            # they relate to.
            while self.stream.errors:
                yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
            while self.tokenQueue:
                yield self.tokenQueue.popleft()
|
|
|
|
|
    def consumeNumberEntity(self, isHex):
        """Consume a numeric character reference and return its character.

        This function returns either U+FFFD or the character based on the
        decimal or hexadecimal representation. It also discards ";" if present.
        If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked.

        isHex -- True when the reference was introduced by "&#x"/"&#X".
        """
        allowed = digits
        radix = 10
        if isHex:
            allowed = hexDigits
            radix = 16

        charStack = []

        # Consume all the characters that are in range, stopping at the first
        # non-digit or EOF. The stopping character is kept in `c` so the
        # semicolon check below can inspect it.
        c = self.stream.char()
        while c in allowed and c is not EOF:
            charStack.append(c)
            c = self.stream.char()

        # Convert the consumed digits to an int. The caller guarantees at
        # least one digit was available, so charStack is never empty here.
        charAsInt = int("".join(charStack), radix)

        # Certain code points are replaced per the HTML spec (e.g. Windows-1252
        # mappings for the C1 range); all are still parse errors.
        if charAsInt in replacementCharacters:
            char = replacementCharacters[charAsInt]
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        elif ((0xD800 <= charAsInt <= 0xDFFF) or
              (charAsInt > 0x10FFFF)):
            # Surrogates and out-of-range code points become U+FFFD.
            char = "\uFFFD"
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        else:
            # Control characters and "noncharacters" are passed through
            # unchanged but still reported as parse errors.
            if ((0x0001 <= charAsInt <= 0x0008) or
                (0x000E <= charAsInt <= 0x001F) or
                (0x007F <= charAsInt <= 0x009F) or
                (0xFDD0 <= charAsInt <= 0xFDEF) or
                charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
                                        0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                        0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
                                        0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
                                        0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
                                        0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
                                        0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                        0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
                                        0xFFFFF, 0x10FFFE, 0x10FFFF])):
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data":
                                        "illegal-codepoint-for-numeric-entity",
                                        "datavars": {"charAsInt": charAsInt}})
            try:
                # chr() raises ValueError for astral code points on narrow
                # (UCS-2) Python builds; fall back to a surrogate pair.
                char = chr(charAsInt)
            except ValueError:
                v = charAsInt - 0x10000
                char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))

        # Discard the ";" if present; otherwise put the stopping character back
        # on the stream and report the missing semicolon.
        if c != ";":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "numeric-entity-without-semicolon"})
            self.stream.unget(c)

        return char
|
|
|
|
|
    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        """Consume a character reference ("&...") from the stream.

        allowedChar -- an extra character (e.g. the attribute-value quote)
        after which "&" is treated as a literal ampersand.
        fromAttribute -- when True the decoded text is appended to the current
        attribute's value instead of being emitted as a Characters token.
        """
        # Default output when no valid reference follows the "&".
        output = "&"

        charStack = [self.stream.char()]
        if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
                (allowedChar is not None and allowedChar == charStack[0])):
            # Not a character reference at all: emit the bare "&".
            self.stream.unget(charStack[0])

        elif charStack[0] == "#":
            # Numeric reference: read an optional hex prefix ("x"/"X").
            hex = False
            charStack.append(self.stream.char())
            if charStack[-1] in ("x", "X"):
                hex = True
                charStack.append(self.stream.char())

            # charStack[-1] should be the first digit.
            if (hex and charStack[-1] in hexDigits) \
                    or (not hex and charStack[-1] in digits):
                # At least one digit found; hand off to consumeNumberEntity,
                # which re-reads the digit we just ungot.
                self.stream.unget(charStack[-1])
                output = self.consumeNumberEntity(hex)
            else:
                # No digits: everything consumed so far is literal text.
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "expected-numeric-entity"})
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)

        else:
            # Named reference. Keep reading characters while some entity name
            # still has the consumed text as a prefix; the loop always leaves
            # exactly one unmatched character at the end of charStack.
            while (charStack[-1] is not EOF):
                if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
                    break
                charStack.append(self.stream.char())

            # Find the longest entity matching the consumed text, so e.g.
            # "&noti" resolves to "&not" followed by literal "i".
            try:
                entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
                entityLength = len(entityName)
            except KeyError:
                entityName = None

            if entityName is not None:
                if entityName[-1] != ";":
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "named-entity-without-semicolon"})
                if (entityName[-1] != ";" and fromAttribute and
                    (charStack[entityLength] in asciiLetters or
                     charStack[entityLength] in digits or
                     charStack[entityLength] == "=")):
                    # Inside an attribute, a semicolon-less entity followed by
                    # an alphanumeric or "=" is treated as literal text.
                    self.stream.unget(charStack.pop())
                    output = "&" + "".join(charStack)
                else:
                    output = entities[entityName]
                    # Push back the unmatched trailing character, then append
                    # any consumed characters beyond the entity name.
                    self.stream.unget(charStack.pop())
                    output += "".join(charStack[entityLength:])
            else:
                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                        "expected-named-entity"})
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)

        if fromAttribute:
            # Append to the value of the attribute currently being built.
            self.currentToken["data"][-1][1] += output
        else:
            if output in spaceCharacters:
                tokenType = "SpaceCharacters"
            else:
                tokenType = "Characters"
            self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
|
|
|
|
|
def processEntityInAttribute(self, allowedChar): |
|
|
"""This method replaces the need for "entityInAttributeValueState". |
|
|
""" |
|
|
self.consumeEntity(allowedChar=allowedChar, fromAttribute=True) |
|
|
|
|
|
    def emitCurrentToken(self):
        """This method is a generic handler for emitting the tags. It also sets
        the state to "data" because that's what's needed after a token has been
        emitted.
        """
        token = self.currentToken
        # Add token to the queue to be yielded.
        if (token["type"] in tagTokenTypes):
            token["name"] = token["name"].translate(asciiUpper2Lower)
            if token["type"] == tokenTypes["StartTag"]:
                # raw is a list of (name, value) pairs; turn it into a mapping.
                raw = token["data"]
                data = attributeMap(raw)
                if len(raw) > len(data):
                    # We had some duplicated attributes; building the map kept
                    # the LAST value for each name, so re-insert in reverse
                    # order to make the FIRST occurrence win instead.
                    data.update(raw[::-1])
                token["data"] = data

            if token["type"] == tokenTypes["EndTag"]:
                if token["data"]:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                            "data": "attributes-in-end-tag"})
                if token["selfClosing"]:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                            "data": "self-closing-flag-on-end-tag"})
        self.tokenQueue.append(token)
        self.state = self.dataState
|
|
|
|
|
|
|
|
    def dataState(self):
        data = self.stream.char()
        if data == "&":
            self.state = self.entityDataState
        elif data == "<":
            self.state = self.tagOpenState
        elif data == "\u0000":
            # U+0000 is a parse error but is passed through untouched in the
            # data state (unlike RCDATA/RAWTEXT, where it becomes U+FFFD).
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\u0000"})
        elif data is EOF:
            # Tokenization ends.
            return False
        elif data in spaceCharacters:
            # Directly after emitting a token you switch back to the "data
            # state". At that point spaceCharacters are important so they are
            # emitted separately.
            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
                                    data + self.stream.charsUntil(spaceCharacters, True)})
        else:
            # Bulk-read ordinary characters up to the next special one.
            chars = self.stream.charsUntil(("&", "<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True
|
|
|
|
|
def entityDataState(self): |
|
|
self.consumeEntity() |
|
|
self.state = self.dataState |
|
|
return True |
|
|
|
|
|
    def rcdataState(self):
        data = self.stream.char()
        if data == "&":
            self.state = self.characterReferenceInRcdata
        elif data == "<":
            self.state = self.rcdataLessThanSignState
        elif data == EOF:
            # Tokenization ends.
            return False
        elif data == "\u0000":
            # In RCDATA, U+0000 is replaced with U+FFFD (unlike the data state).
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data in spaceCharacters:
            # Directly after emitting a token you switch back to the "data
            # state". At that point spaceCharacters are important so they are
            # emitted separately.
            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
                                    data + self.stream.charsUntil(spaceCharacters, True)})
        else:
            # Bulk-read ordinary characters up to the next special one.
            chars = self.stream.charsUntil(("&", "<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True
|
|
|
|
|
def characterReferenceInRcdata(self): |
|
|
self.consumeEntity() |
|
|
self.state = self.rcdataState |
|
|
return True |
|
|
|
|
|
def rawtextState(self): |
|
|
data = self.stream.char() |
|
|
if data == "<": |
|
|
self.state = self.rawtextLessThanSignState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], |
|
|
"data": "\uFFFD"}) |
|
|
elif data == EOF: |
|
|
|
|
|
return False |
|
|
else: |
|
|
chars = self.stream.charsUntil(("<", "\u0000")) |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": |
|
|
data + chars}) |
|
|
return True |
|
|
|
|
|
def scriptDataState(self): |
|
|
data = self.stream.char() |
|
|
if data == "<": |
|
|
self.state = self.scriptDataLessThanSignState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], |
|
|
"data": "\uFFFD"}) |
|
|
elif data == EOF: |
|
|
|
|
|
return False |
|
|
else: |
|
|
chars = self.stream.charsUntil(("<", "\u0000")) |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": |
|
|
data + chars}) |
|
|
return True |
|
|
|
|
|
def plaintextState(self): |
|
|
data = self.stream.char() |
|
|
if data == EOF: |
|
|
|
|
|
return False |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], |
|
|
"data": "\uFFFD"}) |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": |
|
|
data + self.stream.charsUntil("\u0000")}) |
|
|
return True |
|
|
|
|
|
    def tagOpenState(self):
        data = self.stream.char()
        if data == "!":
            # "<!" begins a markup declaration (comment, doctype, CDATA).
            self.state = self.markupDeclarationOpenState
        elif data == "/":
            self.state = self.closeTagOpenState
        elif data in asciiLetters:
            # Begin a new start-tag token; the name grows in tagNameState.
            self.currentToken = {"type": tokenTypes["StartTag"],
                                 "name": data, "data": [],
                                 "selfClosing": False,
                                 "selfClosingAcknowledged": False}
            self.state = self.tagNameState
        elif data == ">":
            # "<>" is emitted as literal characters with a parse error.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name-but-got-right-bracket"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
            self.state = self.dataState
        elif data == "?":
            # "<?" (e.g. an XML PI) becomes a bogus comment.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name-but-got-question-mark"})
            self.stream.unget(data)
            self.state = self.bogusCommentState
        else:
            # Anything else: "<" was literal text; reprocess the character.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.dataState
        return True
|
|
|
|
|
    def closeTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            # Begin a new end-tag token; the name grows in tagNameState.
            self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
                                 "data": [], "selfClosing": False}
            self.state = self.tagNameState
        elif data == ">":
            # "</>" is dropped entirely (only the parse error is emitted).
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-closing-tag-but-got-right-bracket"})
            self.state = self.dataState
        elif data is EOF:
            # "</" at EOF is emitted as literal characters.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-closing-tag-but-got-eof"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.state = self.dataState
        else:
            # Any other character turns the whole construct into a bogus
            # comment; reprocess the character there.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-closing-tag-but-got-char",
                                    "datavars": {"data": data}})
            self.stream.unget(data)
            self.state = self.bogusCommentState
        return True
|
|
|
|
|
    def tagNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data is EOF:
            # The partially-built token is dropped.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-tag-name"})
            self.state = self.dataState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] += "\uFFFD"
        else:
            # Append one character at a time; tag names are short, so
            # charsUntil-style bulk reads are not worth it here.
            self.currentToken["name"] += data
        return True
|
|
|
|
|
def rcdataLessThanSignState(self): |
|
|
data = self.stream.char() |
|
|
if data == "/": |
|
|
self.temporaryBuffer = "" |
|
|
self.state = self.rcdataEndTagOpenState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) |
|
|
self.stream.unget(data) |
|
|
self.state = self.rcdataState |
|
|
return True |
|
|
|
|
|
def rcdataEndTagOpenState(self): |
|
|
data = self.stream.char() |
|
|
if data in asciiLetters: |
|
|
self.temporaryBuffer += data |
|
|
self.state = self.rcdataEndTagNameState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) |
|
|
self.stream.unget(data) |
|
|
self.state = self.rcdataState |
|
|
return True |
|
|
|
|
|
def rcdataEndTagNameState(self): |
|
|
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() |
|
|
data = self.stream.char() |
|
|
if data in spaceCharacters and appropriate: |
|
|
self.currentToken = {"type": tokenTypes["EndTag"], |
|
|
"name": self.temporaryBuffer, |
|
|
"data": [], "selfClosing": False} |
|
|
self.state = self.beforeAttributeNameState |
|
|
elif data == "/" and appropriate: |
|
|
self.currentToken = {"type": tokenTypes["EndTag"], |
|
|
"name": self.temporaryBuffer, |
|
|
"data": [], "selfClosing": False} |
|
|
self.state = self.selfClosingStartTagState |
|
|
elif data == ">" and appropriate: |
|
|
self.currentToken = {"type": tokenTypes["EndTag"], |
|
|
"name": self.temporaryBuffer, |
|
|
"data": [], "selfClosing": False} |
|
|
self.emitCurrentToken() |
|
|
self.state = self.dataState |
|
|
elif data in asciiLetters: |
|
|
self.temporaryBuffer += data |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], |
|
|
"data": "</" + self.temporaryBuffer}) |
|
|
self.stream.unget(data) |
|
|
self.state = self.rcdataState |
|
|
return True |
|
|
|
|
|
def rawtextLessThanSignState(self): |
|
|
data = self.stream.char() |
|
|
if data == "/": |
|
|
self.temporaryBuffer = "" |
|
|
self.state = self.rawtextEndTagOpenState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) |
|
|
self.stream.unget(data) |
|
|
self.state = self.rawtextState |
|
|
return True |
|
|
|
|
|
def rawtextEndTagOpenState(self): |
|
|
data = self.stream.char() |
|
|
if data in asciiLetters: |
|
|
self.temporaryBuffer += data |
|
|
self.state = self.rawtextEndTagNameState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) |
|
|
self.stream.unget(data) |
|
|
self.state = self.rawtextState |
|
|
return True |
|
|
|
|
|
def rawtextEndTagNameState(self): |
|
|
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() |
|
|
data = self.stream.char() |
|
|
if data in spaceCharacters and appropriate: |
|
|
self.currentToken = {"type": tokenTypes["EndTag"], |
|
|
"name": self.temporaryBuffer, |
|
|
"data": [], "selfClosing": False} |
|
|
self.state = self.beforeAttributeNameState |
|
|
elif data == "/" and appropriate: |
|
|
self.currentToken = {"type": tokenTypes["EndTag"], |
|
|
"name": self.temporaryBuffer, |
|
|
"data": [], "selfClosing": False} |
|
|
self.state = self.selfClosingStartTagState |
|
|
elif data == ">" and appropriate: |
|
|
self.currentToken = {"type": tokenTypes["EndTag"], |
|
|
"name": self.temporaryBuffer, |
|
|
"data": [], "selfClosing": False} |
|
|
self.emitCurrentToken() |
|
|
self.state = self.dataState |
|
|
elif data in asciiLetters: |
|
|
self.temporaryBuffer += data |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], |
|
|
"data": "</" + self.temporaryBuffer}) |
|
|
self.stream.unget(data) |
|
|
self.state = self.rawtextState |
|
|
return True |
|
|
|
|
|
def scriptDataLessThanSignState(self): |
|
|
data = self.stream.char() |
|
|
if data == "/": |
|
|
self.temporaryBuffer = "" |
|
|
self.state = self.scriptDataEndTagOpenState |
|
|
elif data == "!": |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"}) |
|
|
self.state = self.scriptDataEscapeStartState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) |
|
|
self.stream.unget(data) |
|
|
self.state = self.scriptDataState |
|
|
return True |
|
|
|
|
|
def scriptDataEndTagOpenState(self): |
|
|
data = self.stream.char() |
|
|
if data in asciiLetters: |
|
|
self.temporaryBuffer += data |
|
|
self.state = self.scriptDataEndTagNameState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) |
|
|
self.stream.unget(data) |
|
|
self.state = self.scriptDataState |
|
|
return True |
|
|
|
|
|
def scriptDataEndTagNameState(self): |
|
|
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() |
|
|
data = self.stream.char() |
|
|
if data in spaceCharacters and appropriate: |
|
|
self.currentToken = {"type": tokenTypes["EndTag"], |
|
|
"name": self.temporaryBuffer, |
|
|
"data": [], "selfClosing": False} |
|
|
self.state = self.beforeAttributeNameState |
|
|
elif data == "/" and appropriate: |
|
|
self.currentToken = {"type": tokenTypes["EndTag"], |
|
|
"name": self.temporaryBuffer, |
|
|
"data": [], "selfClosing": False} |
|
|
self.state = self.selfClosingStartTagState |
|
|
elif data == ">" and appropriate: |
|
|
self.currentToken = {"type": tokenTypes["EndTag"], |
|
|
"name": self.temporaryBuffer, |
|
|
"data": [], "selfClosing": False} |
|
|
self.emitCurrentToken() |
|
|
self.state = self.dataState |
|
|
elif data in asciiLetters: |
|
|
self.temporaryBuffer += data |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], |
|
|
"data": "</" + self.temporaryBuffer}) |
|
|
self.stream.unget(data) |
|
|
self.state = self.scriptDataState |
|
|
return True |
|
|
|
|
|
def scriptDataEscapeStartState(self): |
|
|
data = self.stream.char() |
|
|
if data == "-": |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) |
|
|
self.state = self.scriptDataEscapeStartDashState |
|
|
else: |
|
|
self.stream.unget(data) |
|
|
self.state = self.scriptDataState |
|
|
return True |
|
|
|
|
|
def scriptDataEscapeStartDashState(self): |
|
|
data = self.stream.char() |
|
|
if data == "-": |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) |
|
|
self.state = self.scriptDataEscapedDashDashState |
|
|
else: |
|
|
self.stream.unget(data) |
|
|
self.state = self.scriptDataState |
|
|
return True |
|
|
|
|
|
def scriptDataEscapedState(self): |
|
|
data = self.stream.char() |
|
|
if data == "-": |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) |
|
|
self.state = self.scriptDataEscapedDashState |
|
|
elif data == "<": |
|
|
self.state = self.scriptDataEscapedLessThanSignState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], |
|
|
"data": "\uFFFD"}) |
|
|
elif data == EOF: |
|
|
self.state = self.dataState |
|
|
else: |
|
|
chars = self.stream.charsUntil(("<", "-", "\u0000")) |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": |
|
|
data + chars}) |
|
|
return True |
|
|
|
|
|
def scriptDataEscapedDashState(self): |
|
|
data = self.stream.char() |
|
|
if data == "-": |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) |
|
|
self.state = self.scriptDataEscapedDashDashState |
|
|
elif data == "<": |
|
|
self.state = self.scriptDataEscapedLessThanSignState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], |
|
|
"data": "\uFFFD"}) |
|
|
self.state = self.scriptDataEscapedState |
|
|
elif data == EOF: |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) |
|
|
self.state = self.scriptDataEscapedState |
|
|
return True |
|
|
|
|
|
def scriptDataEscapedDashDashState(self): |
|
|
data = self.stream.char() |
|
|
if data == "-": |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) |
|
|
elif data == "<": |
|
|
self.state = self.scriptDataEscapedLessThanSignState |
|
|
elif data == ">": |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) |
|
|
self.state = self.scriptDataState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], |
|
|
"data": "\uFFFD"}) |
|
|
self.state = self.scriptDataEscapedState |
|
|
elif data == EOF: |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) |
|
|
self.state = self.scriptDataEscapedState |
|
|
return True |
|
|
|
|
|
def scriptDataEscapedLessThanSignState(self): |
|
|
data = self.stream.char() |
|
|
if data == "/": |
|
|
self.temporaryBuffer = "" |
|
|
self.state = self.scriptDataEscapedEndTagOpenState |
|
|
elif data in asciiLetters: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data}) |
|
|
self.temporaryBuffer = data |
|
|
self.state = self.scriptDataDoubleEscapeStartState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) |
|
|
self.stream.unget(data) |
|
|
self.state = self.scriptDataEscapedState |
|
|
return True |
|
|
|
|
|
def scriptDataEscapedEndTagOpenState(self): |
|
|
data = self.stream.char() |
|
|
if data in asciiLetters: |
|
|
self.temporaryBuffer = data |
|
|
self.state = self.scriptDataEscapedEndTagNameState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) |
|
|
self.stream.unget(data) |
|
|
self.state = self.scriptDataEscapedState |
|
|
return True |
|
|
|
|
|
def scriptDataEscapedEndTagNameState(self): |
|
|
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() |
|
|
data = self.stream.char() |
|
|
if data in spaceCharacters and appropriate: |
|
|
self.currentToken = {"type": tokenTypes["EndTag"], |
|
|
"name": self.temporaryBuffer, |
|
|
"data": [], "selfClosing": False} |
|
|
self.state = self.beforeAttributeNameState |
|
|
elif data == "/" and appropriate: |
|
|
self.currentToken = {"type": tokenTypes["EndTag"], |
|
|
"name": self.temporaryBuffer, |
|
|
"data": [], "selfClosing": False} |
|
|
self.state = self.selfClosingStartTagState |
|
|
elif data == ">" and appropriate: |
|
|
self.currentToken = {"type": tokenTypes["EndTag"], |
|
|
"name": self.temporaryBuffer, |
|
|
"data": [], "selfClosing": False} |
|
|
self.emitCurrentToken() |
|
|
self.state = self.dataState |
|
|
elif data in asciiLetters: |
|
|
self.temporaryBuffer += data |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], |
|
|
"data": "</" + self.temporaryBuffer}) |
|
|
self.stream.unget(data) |
|
|
self.state = self.scriptDataEscapedState |
|
|
return True |
|
|
|
|
|
def scriptDataDoubleEscapeStartState(self): |
|
|
data = self.stream.char() |
|
|
if data in (spaceCharacters | frozenset(("/", ">"))): |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) |
|
|
if self.temporaryBuffer.lower() == "script": |
|
|
self.state = self.scriptDataDoubleEscapedState |
|
|
else: |
|
|
self.state = self.scriptDataEscapedState |
|
|
elif data in asciiLetters: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) |
|
|
self.temporaryBuffer += data |
|
|
else: |
|
|
self.stream.unget(data) |
|
|
self.state = self.scriptDataEscapedState |
|
|
return True |
|
|
|
|
|
def scriptDataDoubleEscapedState(self): |
|
|
data = self.stream.char() |
|
|
if data == "-": |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) |
|
|
self.state = self.scriptDataDoubleEscapedDashState |
|
|
elif data == "<": |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) |
|
|
self.state = self.scriptDataDoubleEscapedLessThanSignState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], |
|
|
"data": "\uFFFD"}) |
|
|
elif data == EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-script-in-script"}) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) |
|
|
return True |
|
|
|
|
|
def scriptDataDoubleEscapedDashState(self): |
|
|
data = self.stream.char() |
|
|
if data == "-": |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) |
|
|
self.state = self.scriptDataDoubleEscapedDashDashState |
|
|
elif data == "<": |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) |
|
|
self.state = self.scriptDataDoubleEscapedLessThanSignState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], |
|
|
"data": "\uFFFD"}) |
|
|
self.state = self.scriptDataDoubleEscapedState |
|
|
elif data == EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-script-in-script"}) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) |
|
|
self.state = self.scriptDataDoubleEscapedState |
|
|
return True |
|
|
|
|
|
def scriptDataDoubleEscapedDashDashState(self): |
|
|
data = self.stream.char() |
|
|
if data == "-": |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) |
|
|
elif data == "<": |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) |
|
|
self.state = self.scriptDataDoubleEscapedLessThanSignState |
|
|
elif data == ">": |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) |
|
|
self.state = self.scriptDataState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], |
|
|
"data": "\uFFFD"}) |
|
|
self.state = self.scriptDataDoubleEscapedState |
|
|
elif data == EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-script-in-script"}) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) |
|
|
self.state = self.scriptDataDoubleEscapedState |
|
|
return True |
|
|
|
|
|
def scriptDataDoubleEscapedLessThanSignState(self): |
|
|
data = self.stream.char() |
|
|
if data == "/": |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"}) |
|
|
self.temporaryBuffer = "" |
|
|
self.state = self.scriptDataDoubleEscapeEndState |
|
|
else: |
|
|
self.stream.unget(data) |
|
|
self.state = self.scriptDataDoubleEscapedState |
|
|
return True |
|
|
|
|
|
def scriptDataDoubleEscapeEndState(self): |
|
|
data = self.stream.char() |
|
|
if data in (spaceCharacters | frozenset(("/", ">"))): |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) |
|
|
if self.temporaryBuffer.lower() == "script": |
|
|
self.state = self.scriptDataEscapedState |
|
|
else: |
|
|
self.state = self.scriptDataDoubleEscapedState |
|
|
elif data in asciiLetters: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) |
|
|
self.temporaryBuffer += data |
|
|
else: |
|
|
self.stream.unget(data) |
|
|
self.state = self.scriptDataDoubleEscapedState |
|
|
return True |
|
|
|
|
|
    def beforeAttributeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            # Skip any further whitespace in one bulk read.
            self.stream.charsUntil(spaceCharacters, True)
        elif data in asciiLetters:
            # Start a new [name, value] attribute pair.
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data in ("'", '"', "=", "<"):
            # Invalid name starters are reported but still begin an attribute.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"].append(["\uFFFD", ""])
            self.state = self.attributeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-name-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        return True
|
|
|
|
|
    def attributeNameState(self):
        """Accumulate the current attribute's name.

        On leaving this state the completed name is ASCII-lowercased and
        compared against the attributes already collected, so that a
        duplicate-attribute parse error is reported as soon as the name
        ends (the duplicate itself is only dropped when the token is
        emitted).
        """
        data = self.stream.char()
        leavingThisState = True  # most branches terminate the name
        emitToken = False  # ">" must emit only after the checks below run
        if data == "=":
            self.state = self.beforeAttributeValueState
        elif data in asciiLetters:
            # Fast path: consume a whole run of letters in one call.
            self.currentToken["data"][-1][0] += data +\
                self.stream.charsUntil(asciiLetters, True)
            leavingThisState = False
        elif data == ">":
            # XXX If we emit the token here the attributes are converted to
            # a dict without being checked and the code below would fail
            # because "data" is no longer a list; defer via emitToken so the
            # lowercasing/duplicate check still sees the list form.
            emitToken = True
        elif data in spaceCharacters:
            self.state = self.afterAttributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            # NUL is replaced with U+FFFD in the name.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][0] += "\uFFFD"
            leavingThisState = False
        elif data in ("'", '"', "<"):
            # Disallowed characters are reported but kept in the name.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"][-1][0] += data
            leavingThisState = False
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-attribute-name"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][0] += data
            leavingThisState = False

        if leavingThisState:
            # Attributes are not dropped at this stage. That happens when
            # the start tag token is emitted, so values can still be safely
            # appended; but the duplicate-attribute parse error must be
            # reported in time, i.e. as soon as the name is complete.
            self.currentToken["data"][-1][0] = (
                self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
            for name, _ in self.currentToken["data"][:-1]:
                if self.currentToken["data"][-1][0] == name:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "duplicate-attribute"})
                    break
            # Emit only after the lowercasing and duplicate check above.
            if emitToken:
                self.emitCurrentToken()
        return True
|
|
|
|
|
def afterAttributeNameState(self): |
|
|
data = self.stream.char() |
|
|
if data in spaceCharacters: |
|
|
self.stream.charsUntil(spaceCharacters, True) |
|
|
elif data == "=": |
|
|
self.state = self.beforeAttributeValueState |
|
|
elif data == ">": |
|
|
self.emitCurrentToken() |
|
|
elif data in asciiLetters: |
|
|
self.currentToken["data"].append([data, ""]) |
|
|
self.state = self.attributeNameState |
|
|
elif data == "/": |
|
|
self.state = self.selfClosingStartTagState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.currentToken["data"].append(["\uFFFD", ""]) |
|
|
self.state = self.attributeNameState |
|
|
elif data in ("'", '"', "<"): |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"invalid-character-after-attribute-name"}) |
|
|
self.currentToken["data"].append([data, ""]) |
|
|
self.state = self.attributeNameState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"expected-end-of-tag-but-got-eof"}) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.currentToken["data"].append([data, ""]) |
|
|
self.state = self.attributeNameState |
|
|
return True |
|
|
|
|
|
def beforeAttributeValueState(self): |
|
|
data = self.stream.char() |
|
|
if data in spaceCharacters: |
|
|
self.stream.charsUntil(spaceCharacters, True) |
|
|
elif data == "\"": |
|
|
self.state = self.attributeValueDoubleQuotedState |
|
|
elif data == "&": |
|
|
self.state = self.attributeValueUnQuotedState |
|
|
self.stream.unget(data) |
|
|
elif data == "'": |
|
|
self.state = self.attributeValueSingleQuotedState |
|
|
elif data == ">": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"expected-attribute-value-but-got-right-bracket"}) |
|
|
self.emitCurrentToken() |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.currentToken["data"][-1][1] += "\uFFFD" |
|
|
self.state = self.attributeValueUnQuotedState |
|
|
elif data in ("=", "<", "`"): |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"equals-in-unquoted-attribute-value"}) |
|
|
self.currentToken["data"][-1][1] += data |
|
|
self.state = self.attributeValueUnQuotedState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"expected-attribute-value-but-got-eof"}) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.currentToken["data"][-1][1] += data |
|
|
self.state = self.attributeValueUnQuotedState |
|
|
return True |
|
|
|
|
|
def attributeValueDoubleQuotedState(self): |
|
|
data = self.stream.char() |
|
|
if data == "\"": |
|
|
self.state = self.afterAttributeValueState |
|
|
elif data == "&": |
|
|
self.processEntityInAttribute('"') |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.currentToken["data"][-1][1] += "\uFFFD" |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-attribute-value-double-quote"}) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.currentToken["data"][-1][1] += data +\ |
|
|
self.stream.charsUntil(("\"", "&", "\u0000")) |
|
|
return True |
|
|
|
|
|
def attributeValueSingleQuotedState(self): |
|
|
data = self.stream.char() |
|
|
if data == "'": |
|
|
self.state = self.afterAttributeValueState |
|
|
elif data == "&": |
|
|
self.processEntityInAttribute("'") |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.currentToken["data"][-1][1] += "\uFFFD" |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-attribute-value-single-quote"}) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.currentToken["data"][-1][1] += data +\ |
|
|
self.stream.charsUntil(("'", "&", "\u0000")) |
|
|
return True |
|
|
|
|
|
def attributeValueUnQuotedState(self): |
|
|
data = self.stream.char() |
|
|
if data in spaceCharacters: |
|
|
self.state = self.beforeAttributeNameState |
|
|
elif data == "&": |
|
|
self.processEntityInAttribute(">") |
|
|
elif data == ">": |
|
|
self.emitCurrentToken() |
|
|
elif data in ('"', "'", "=", "<", "`"): |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-character-in-unquoted-attribute-value"}) |
|
|
self.currentToken["data"][-1][1] += data |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.currentToken["data"][-1][1] += "\uFFFD" |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-attribute-value-no-quotes"}) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.currentToken["data"][-1][1] += data + self.stream.charsUntil( |
|
|
frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters) |
|
|
return True |
|
|
|
|
|
def afterAttributeValueState(self): |
|
|
data = self.stream.char() |
|
|
if data in spaceCharacters: |
|
|
self.state = self.beforeAttributeNameState |
|
|
elif data == ">": |
|
|
self.emitCurrentToken() |
|
|
elif data == "/": |
|
|
self.state = self.selfClosingStartTagState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-EOF-after-attribute-value"}) |
|
|
self.stream.unget(data) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-character-after-attribute-value"}) |
|
|
self.stream.unget(data) |
|
|
self.state = self.beforeAttributeNameState |
|
|
return True |
|
|
|
|
|
def selfClosingStartTagState(self): |
|
|
data = self.stream.char() |
|
|
if data == ">": |
|
|
self.currentToken["selfClosing"] = True |
|
|
self.emitCurrentToken() |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": |
|
|
"unexpected-EOF-after-solidus-in-tag"}) |
|
|
self.stream.unget(data) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-character-after-solidus-in-tag"}) |
|
|
self.stream.unget(data) |
|
|
self.state = self.beforeAttributeNameState |
|
|
return True |
|
|
|
|
|
def bogusCommentState(self): |
|
|
|
|
|
|
|
|
|
|
|
data = self.stream.charsUntil(">") |
|
|
data = data.replace("\u0000", "\uFFFD") |
|
|
self.tokenQueue.append( |
|
|
{"type": tokenTypes["Comment"], "data": data}) |
|
|
|
|
|
|
|
|
|
|
|
self.stream.char() |
|
|
self.state = self.dataState |
|
|
return True |
|
|
|
|
|
    def markupDeclarationOpenState(self):
        """Dispatch "<!" to a comment ("--"), DOCTYPE, or CDATA section.

        Characters are read speculatively onto charStack; if no known
        construct matches, every consumed character is pushed back onto
        the stream and the input is reprocessed as a bogus comment.
        """
        charStack = [self.stream.char()]
        if charStack[-1] == "-":
            # "<!-" — a second "-" opens a comment.
            charStack.append(self.stream.char())
            if charStack[-1] == "-":
                self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
                self.state = self.commentStartState
                return True
        elif charStack[-1] in ('d', 'D'):
            # Try to match the case-insensitive keyword "DOCTYPE".
            matched = True
            for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
                             ('y', 'Y'), ('p', 'P'), ('e', 'E')):
                charStack.append(self.stream.char())
                if charStack[-1] not in expected:
                    matched = False
                    break
            if matched:
                self.currentToken = {"type": tokenTypes["Doctype"],
                                     "name": "",
                                     "publicId": None, "systemId": None,
                                     "correct": True}
                self.state = self.doctypeState
                return True
        elif (charStack[-1] == "[" and
              self.parser is not None and
              self.parser.tree.openElements and
              self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
            # "<![CDATA[" is only valid in foreign (non-HTML) content,
            # hence the namespace check on the current open element.
            matched = True
            for expected in ["C", "D", "A", "T", "A", "["]:
                charStack.append(self.stream.char())
                if charStack[-1] != expected:
                    matched = False
                    break
            if matched:
                self.state = self.cdataSectionState
                return True

        # Nothing matched: report the error and push every consumed
        # character back (in reverse) so the bogus comment state sees them.
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-dashes-or-doctype"})

        while charStack:
            self.stream.unget(charStack.pop())
        self.state = self.bogusCommentState
        return True
|
|
|
|
|
def commentStartState(self): |
|
|
data = self.stream.char() |
|
|
if data == "-": |
|
|
self.state = self.commentStartDashState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.currentToken["data"] += "\uFFFD" |
|
|
elif data == ">": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"incorrect-comment"}) |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-comment"}) |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.currentToken["data"] += data |
|
|
self.state = self.commentState |
|
|
return True |
|
|
|
|
|
def commentStartDashState(self): |
|
|
data = self.stream.char() |
|
|
if data == "-": |
|
|
self.state = self.commentEndState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.currentToken["data"] += "-\uFFFD" |
|
|
elif data == ">": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"incorrect-comment"}) |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-comment"}) |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.currentToken["data"] += "-" + data |
|
|
self.state = self.commentState |
|
|
return True |
|
|
|
|
|
def commentState(self): |
|
|
data = self.stream.char() |
|
|
if data == "-": |
|
|
self.state = self.commentEndDashState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.currentToken["data"] += "\uFFFD" |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "eof-in-comment"}) |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.currentToken["data"] += data + \ |
|
|
self.stream.charsUntil(("-", "\u0000")) |
|
|
return True |
|
|
|
|
|
def commentEndDashState(self): |
|
|
data = self.stream.char() |
|
|
if data == "-": |
|
|
self.state = self.commentEndState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.currentToken["data"] += "-\uFFFD" |
|
|
self.state = self.commentState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-comment-end-dash"}) |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.currentToken["data"] += "-" + data |
|
|
self.state = self.commentState |
|
|
return True |
|
|
|
|
|
def commentEndState(self): |
|
|
data = self.stream.char() |
|
|
if data == ">": |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.currentToken["data"] += "--\uFFFD" |
|
|
self.state = self.commentState |
|
|
elif data == "!": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-bang-after-double-dash-in-comment"}) |
|
|
self.state = self.commentEndBangState |
|
|
elif data == "-": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-dash-after-double-dash-in-comment"}) |
|
|
self.currentToken["data"] += data |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-comment-double-dash"}) |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
|
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-char-in-comment"}) |
|
|
self.currentToken["data"] += "--" + data |
|
|
self.state = self.commentState |
|
|
return True |
|
|
|
|
|
def commentEndBangState(self): |
|
|
data = self.stream.char() |
|
|
if data == ">": |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
elif data == "-": |
|
|
self.currentToken["data"] += "--!" |
|
|
self.state = self.commentEndDashState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.currentToken["data"] += "--!\uFFFD" |
|
|
self.state = self.commentState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-comment-end-bang-state"}) |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.currentToken["data"] += "--!" + data |
|
|
self.state = self.commentState |
|
|
return True |
|
|
|
|
|
def doctypeState(self): |
|
|
data = self.stream.char() |
|
|
if data in spaceCharacters: |
|
|
self.state = self.beforeDoctypeNameState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"expected-doctype-name-but-got-eof"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"need-space-after-doctype"}) |
|
|
self.stream.unget(data) |
|
|
self.state = self.beforeDoctypeNameState |
|
|
return True |
|
|
|
|
|
def beforeDoctypeNameState(self): |
|
|
data = self.stream.char() |
|
|
if data in spaceCharacters: |
|
|
pass |
|
|
elif data == ">": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"expected-doctype-name-but-got-right-bracket"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.currentToken["name"] = "\uFFFD" |
|
|
self.state = self.doctypeNameState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"expected-doctype-name-but-got-eof"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.currentToken["name"] = data |
|
|
self.state = self.doctypeNameState |
|
|
return True |
|
|
|
|
|
def doctypeNameState(self): |
|
|
data = self.stream.char() |
|
|
if data in spaceCharacters: |
|
|
self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) |
|
|
self.state = self.afterDoctypeNameState |
|
|
elif data == ">": |
|
|
self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.currentToken["name"] += "\uFFFD" |
|
|
self.state = self.doctypeNameState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-doctype-name"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.currentToken["name"] += data |
|
|
return True |
|
|
|
|
|
    def afterDoctypeNameState(self):
        """After the DOCTYPE name: look for PUBLIC/SYSTEM, ">", or EOF.

        The PUBLIC/SYSTEM keywords are matched case-insensitively by
        reading ahead character by character; note that the local `data`
        is rebound inside those loops, so on a failed match only the last
        character read needs to be pushed back.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.currentToken["correct"] = False
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            if data in ("p", "P"):
                # Try to complete the keyword "PUBLIC" (case-insensitive).
                matched = True
                for expected in (("u", "U"), ("b", "B"), ("l", "L"),
                                 ("i", "I"), ("c", "C")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypePublicKeywordState
                    return True
            elif data in ("s", "S"):
                # Try to complete the keyword "SYSTEM" (case-insensitive).
                matched = True
                for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
                                 ("e", "E"), ("m", "M")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypeSystemKeywordState
                    return True

            # All characters read before the current `data` were letters,
            # which are garbage inside a bogus doctype anyway and can be
            # discarded; only the latest character might be ">" or EOF and
            # therefore needs to be pushed back for reprocessing.
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-space-or-right-bracket-in-doctype", "datavars":
                                    {"data": data}})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState

        return True
|
|
|
|
|
def afterDoctypePublicKeywordState(self): |
|
|
data = self.stream.char() |
|
|
if data in spaceCharacters: |
|
|
self.state = self.beforeDoctypePublicIdentifierState |
|
|
elif data in ("'", '"'): |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-char-in-doctype"}) |
|
|
self.stream.unget(data) |
|
|
self.state = self.beforeDoctypePublicIdentifierState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.stream.unget(data) |
|
|
self.state = self.beforeDoctypePublicIdentifierState |
|
|
return True |
|
|
|
|
|
def beforeDoctypePublicIdentifierState(self): |
|
|
data = self.stream.char() |
|
|
if data in spaceCharacters: |
|
|
pass |
|
|
elif data == "\"": |
|
|
self.currentToken["publicId"] = "" |
|
|
self.state = self.doctypePublicIdentifierDoubleQuotedState |
|
|
elif data == "'": |
|
|
self.currentToken["publicId"] = "" |
|
|
self.state = self.doctypePublicIdentifierSingleQuotedState |
|
|
elif data == ">": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-end-of-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-char-in-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.state = self.bogusDoctypeState |
|
|
return True |
|
|
|
|
|
def doctypePublicIdentifierDoubleQuotedState(self): |
|
|
data = self.stream.char() |
|
|
if data == "\"": |
|
|
self.state = self.afterDoctypePublicIdentifierState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.currentToken["publicId"] += "\uFFFD" |
|
|
elif data == ">": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-end-of-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.currentToken["publicId"] += data |
|
|
return True |
|
|
|
|
|
def doctypePublicIdentifierSingleQuotedState(self): |
|
|
data = self.stream.char() |
|
|
if data == "'": |
|
|
self.state = self.afterDoctypePublicIdentifierState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.currentToken["publicId"] += "\uFFFD" |
|
|
elif data == ">": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-end-of-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.currentToken["publicId"] += data |
|
|
return True |
|
|
|
|
|
def afterDoctypePublicIdentifierState(self): |
|
|
data = self.stream.char() |
|
|
if data in spaceCharacters: |
|
|
self.state = self.betweenDoctypePublicAndSystemIdentifiersState |
|
|
elif data == ">": |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
elif data == '"': |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-char-in-doctype"}) |
|
|
self.currentToken["systemId"] = "" |
|
|
self.state = self.doctypeSystemIdentifierDoubleQuotedState |
|
|
elif data == "'": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-char-in-doctype"}) |
|
|
self.currentToken["systemId"] = "" |
|
|
self.state = self.doctypeSystemIdentifierSingleQuotedState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-char-in-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.state = self.bogusDoctypeState |
|
|
return True |
|
|
|
|
|
def betweenDoctypePublicAndSystemIdentifiersState(self): |
|
|
data = self.stream.char() |
|
|
if data in spaceCharacters: |
|
|
pass |
|
|
elif data == ">": |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
elif data == '"': |
|
|
self.currentToken["systemId"] = "" |
|
|
self.state = self.doctypeSystemIdentifierDoubleQuotedState |
|
|
elif data == "'": |
|
|
self.currentToken["systemId"] = "" |
|
|
self.state = self.doctypeSystemIdentifierSingleQuotedState |
|
|
elif data == EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-char-in-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.state = self.bogusDoctypeState |
|
|
return True |
|
|
|
|
|
def afterDoctypeSystemKeywordState(self): |
|
|
data = self.stream.char() |
|
|
if data in spaceCharacters: |
|
|
self.state = self.beforeDoctypeSystemIdentifierState |
|
|
elif data in ("'", '"'): |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-char-in-doctype"}) |
|
|
self.stream.unget(data) |
|
|
self.state = self.beforeDoctypeSystemIdentifierState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.stream.unget(data) |
|
|
self.state = self.beforeDoctypeSystemIdentifierState |
|
|
return True |
|
|
|
|
|
def beforeDoctypeSystemIdentifierState(self): |
|
|
data = self.stream.char() |
|
|
if data in spaceCharacters: |
|
|
pass |
|
|
elif data == "\"": |
|
|
self.currentToken["systemId"] = "" |
|
|
self.state = self.doctypeSystemIdentifierDoubleQuotedState |
|
|
elif data == "'": |
|
|
self.currentToken["systemId"] = "" |
|
|
self.state = self.doctypeSystemIdentifierSingleQuotedState |
|
|
elif data == ">": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-char-in-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-char-in-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.state = self.bogusDoctypeState |
|
|
return True |
|
|
|
|
|
def doctypeSystemIdentifierDoubleQuotedState(self): |
|
|
data = self.stream.char() |
|
|
if data == "\"": |
|
|
self.state = self.afterDoctypeSystemIdentifierState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.currentToken["systemId"] += "\uFFFD" |
|
|
elif data == ">": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-end-of-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.currentToken["systemId"] += data |
|
|
return True |
|
|
|
|
|
def doctypeSystemIdentifierSingleQuotedState(self): |
|
|
data = self.stream.char() |
|
|
if data == "'": |
|
|
self.state = self.afterDoctypeSystemIdentifierState |
|
|
elif data == "\u0000": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
self.currentToken["systemId"] += "\uFFFD" |
|
|
elif data == ">": |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-end-of-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.currentToken["systemId"] += data |
|
|
return True |
|
|
|
|
|
def afterDoctypeSystemIdentifierState(self): |
|
|
data = self.stream.char() |
|
|
if data in spaceCharacters: |
|
|
pass |
|
|
elif data == ">": |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
elif data is EOF: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"eof-in-doctype"}) |
|
|
self.currentToken["correct"] = False |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
|
|
"unexpected-char-in-doctype"}) |
|
|
self.state = self.bogusDoctypeState |
|
|
return True |
|
|
|
|
|
def bogusDoctypeState(self): |
|
|
data = self.stream.char() |
|
|
if data == ">": |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
elif data is EOF: |
|
|
|
|
|
self.stream.unget(data) |
|
|
self.tokenQueue.append(self.currentToken) |
|
|
self.state = self.dataState |
|
|
else: |
|
|
pass |
|
|
return True |
|
|
|
|
|
def cdataSectionState(self): |
|
|
data = [] |
|
|
while True: |
|
|
data.append(self.stream.charsUntil("]")) |
|
|
data.append(self.stream.charsUntil(">")) |
|
|
char = self.stream.char() |
|
|
if char == EOF: |
|
|
break |
|
|
else: |
|
|
assert char == ">" |
|
|
if data[-1][-2:] == "]]": |
|
|
data[-1] = data[-1][:-2] |
|
|
break |
|
|
else: |
|
|
data.append(char) |
|
|
|
|
|
data = "".join(data) |
|
|
|
|
|
nullCount = data.count("\u0000") |
|
|
if nullCount > 0: |
|
|
for _ in range(nullCount): |
|
|
self.tokenQueue.append({"type": tokenTypes["ParseError"], |
|
|
"data": "invalid-codepoint"}) |
|
|
data = data.replace("\u0000", "\uFFFD") |
|
|
if data: |
|
|
self.tokenQueue.append({"type": tokenTypes["Characters"], |
|
|
"data": data}) |
|
|
self.state = self.dataState |
|
|
return True |
|
|
|