120 lines
4.2 KiB
Python
120 lines
4.2 KiB
Python
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
import re
|
|
|
|
try:
|
|
from html import unescape as html_unescape
|
|
except ImportError:
|
|
from HTMLParser import HTMLParser
|
|
|
|
html_parser = HTMLParser()
|
|
html_unescape = html_parser.unescape
|
|
|
|
from .base import Entity, Comment, Junk, Parser
|
|
|
|
|
|
class DTDEntityMixin:
    """Mixin adding DTD-specific value handling to an entity class.

    The class this is mixed into must provide ``raw_val`` and a
    ``value_position(offset)`` method further along the MRO; both are
    used below.
    """

    @property
    def val(self):
        """Return ``raw_val`` with character references unescaped.

        Named (``&amp;``), decimal (``&#38;``) and hexadecimal
        (``&#x26;`` and ``&#x0026;``) references are converted to the
        characters they stand for. The conversion is delegated entirely
        to ``html_unescape``; references it does not recognize are
        returned unchanged.

        Named references known to the standard library are listed in:

            https://github.com/python/cpython/blob/3.7/Lib/html/entities.py
        """
        return html_unescape(self.raw_val)

    def value_position(self, offset=0):
        """Return the ``(line, col)`` position of ``offset`` in the value.

        :param offset: Either a value that is forwarded unchanged to the
            parent class' ``value_position``, or a ``(line, col)`` tuple
            relative to the start of the value, where line ``1`` is the
            line the value starts on.
        :returns: A ``(line, col)`` tuple.
        """
        if not isinstance(offset, tuple):
            return super().value_position(offset)

        # DTDChecker already returns tuples of (line, col) positions
        line_pos, col_pos = offset
        line, col = super().value_position()
        if line_pos == 1:
            # Same line as the start of the value: the reported column is
            # relative to the value, so add it to the start column.
            col += col_pos
        else:
            # On a later line the reported column is used as is, and only
            # the line number is shifted by the value's start line.
            col = col_pos
            line += line_pos - 1
        return line, col
|
|
|
|
|
|
class DTDEntity(DTDEntityMixin, Entity):
    """Entity as found in DTD files.

    Combines ``Entity`` with ``DTDEntityMixin``; the mixin comes first in
    the bases, so its ``val`` and ``value_position`` take precedence.
    """
|
|
|
|
|
|
class DTDParser(Parser):
    """Parser for DTD files made of ``<!ENTITY ...>`` declarations.

    Entities are created as ``DTDEntity`` objects. On top of whatever the
    base ``Parser`` does, ``getNext`` skips a leading byte order mark and
    recognizes external parameter entities
    (``<!ENTITY % name SYSTEM "url"> %name;``).
    """

    # Body of a regex character class for the characters allowed inside an
    # XML comment; "-" (U+002D) is left out so that dashes can be matched
    # separately by the comment patterns below.
    CharMinusDash = "\x09\x0A\x0D\u0020-\u002C\u002E-\uD7FF\uE000-\uFFFD"
    XmlComment = "<!--(?:-?[%s])*?-->" % CharMinusDash

    # http://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar
    # ":" | [A-Z] | "_" | [a-z] |
    # [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
    # | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] |
    # [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
    # [#x10000-#xEFFFF]
    NameStartChar = (
        ":A-Z_a-z\xC0-\xD6\xD8-\xF6\xF8-\u02FF"
        + "\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F"
        + "\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD"
    )
    # + \U00010000-\U000EFFFF seems to be unsupported in python

    # NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
    #     [#x0300-#x036F] | [#x203F-#x2040]
    NameChar = NameStartChar + r"\-\.0-9" + "\xB7\u0300-\u036F\u203F-\u2040"
    Name = "[" + NameStartChar + "][" + NameChar + "]*"

    # <!ENTITY key "value"> or <!ENTITY key 'value'>.
    # Both quote styles require their closing quote: createEntity() trims
    # one character from each end of the "val" span, so an unterminated
    # value must not match here (it would lose its last real character).
    reKey = re.compile(
        "<!ENTITY[ \t\r\n]+(?P<key>" + Name + ")[ \t\r\n]+"
        "(?P<val>\"[^\"]*\"|'[^']*')[ \t\r\n]*>",
        re.DOTALL | re.M,
    )
    # add BOM to DTDs, details in bug 435002
    reHeader = re.compile("^\ufeff")
    # The "val" group sits inside a repetition, so it cannot hold the whole
    # comment text; Comment.val below slices the text out of ``self.all``.
    reComment = re.compile("<!--(?P<val>-?[%s])*?-->" % CharMinusDash, re.S)
    # External parameter entity declaration immediately followed by its
    # reference, optionally trailed by XML comments on the same line:
    #   <!ENTITY % key SYSTEM "url"> %key;
    rePE = re.compile(
        "<!ENTITY[ \t\r\n]+%[ \t\r\n]+(?P<key>" + Name + ")"
        "[ \t\r\n]+SYSTEM[ \t\r\n]+"
        "(?P<val>\"[^\"]*\"|'[^']*')[ \t\r\n]*>[ \t\r\n]*"
        "%" + Name + ";"
        "(?:[ \t]*(?:" + XmlComment + "[ \t\r\n]*)*\n?)?"
    )

    # The base class named here is the module-level ``Comment`` import; the
    # nested name is only bound once this class statement has completed.
    class Comment(Comment):
        """Comment whose ``val`` is the text between ``<!--`` and ``-->``."""

        @property
        def val(self):
            """Return the comment contents, computed once and then cached."""
            if self._val_cache is None:
                # Strip "<!--" and "-->" to comment contents
                self._val_cache = self.all[4:-3]
            return self._val_cache

    def getNext(self, ctx, offset):
        """Return the next item found in ``ctx.contents`` at ``offset``.

        Overloads ``Parser.getNext`` to special-case parameter entities.
        The parent implementation runs first; only if it claims junk (or
        returns ``None``) is ``rePE`` tried at the same offset::

            <!ENTITY % foo SYSTEM "url">
            %foo;

        A byte order mark at the very start of the contents is skipped
        before anything else is parsed.

        :param ctx: Parsing context; only ``ctx.contents`` is read here.
        :param offset: Index into ``ctx.contents`` to start parsing at.
        :returns: Whatever the parent ``getNext`` returned, or a
            ``DTDEntity`` for a parameter entity. For the latter the value
            span is passed on exactly as matched by ``rePE``, i.e. with
            the surrounding quotes included.
        """
        if offset == 0 and self.reHeader.match(ctx.contents):
            offset += 1
        entity = super().getNext(ctx, offset)
        if entity is None or (entity and isinstance(entity, Junk)):
            m = self.rePE.match(ctx.contents, offset)
            if m:
                entity = DTDEntity(
                    ctx, None, None, m.span(), m.span("key"), m.span("val")
                )
        return entity

    def createEntity(self, ctx, m, current_comment, white_space):
        """Build a ``DTDEntity`` from a ``reKey`` match.

        :param ctx: Parsing context, passed through to ``DTDEntity``.
        :param m: Match object providing the ``key`` and ``val`` groups.
        :param current_comment: Passed through to ``DTDEntity``.
        :param white_space: Passed through to ``DTDEntity``.
        :returns: The new ``DTDEntity``; its value span excludes the
            opening and closing quote characters.
        """
        val_start, val_end = m.span("val")
        # The "val" group includes the quotes; narrow the span to the text
        # between them.
        valspan = (val_start + 1, val_end - 1)
        return DTDEntity(
            ctx, current_comment, white_space, m.span(), m.span("key"), valspan
        )
|