# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import re

try:
    from html import unescape as html_unescape
except ImportError:
    from HTMLParser import HTMLParser

    html_parser = HTMLParser()
    html_unescape = html_parser.unescape

from .base import Entity, Comment, Junk, Parser


class DTDEntityMixin:
    """Mixin adding DTD-specific value handling to an entity class."""

    @property
    def val(self):
        """Unescape HTML entities into corresponding Unicode characters.

        Named (&amp;), decimal (&#38;), and hex (&#x26; and &#x0026;) formats
        are supported. Unknown entities are left intact.

        As of Python 3.7 the following 252 named entities are
        recognized and unescaped:

            https://github.com/python/cpython/blob/3.7/Lib/html/entities.py
        """
        return html_unescape(self.raw_val)

    def value_position(self, offset=0):
        """Return the (line, col) position for ``offset`` within the value.

        ``offset`` is either a plain offset, which is passed straight to
        the parent implementation, or a ``(line, col)`` tuple as returned
        by DTDChecker. A tuple is interpreted relative to the start of the
        value and translated using the value's own start position.
        """
        # DTDChecker already returns tuples of (line, col) positions
        if not isinstance(offset, tuple):
            return super().value_position(offset)

        line_pos, col_pos = offset
        line, col = super().value_position()
        if line_pos == 1:
            # Still on the first line of the value: the column is relative
            # to where the value starts.
            col = col + col_pos
        else:
            # On a later line the reported column is used as-is; only the
            # line number needs shifting.
            col = col_pos
            line += line_pos - 1
        return line, col


class DTDEntity(DTDEntityMixin, Entity):
    """Entity parsed from a DTD file."""


class DTDParser(Parser):
    """Parser for DTD files, built on the regex hooks of ``Parser``."""

    # http://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar
    # ":" | [A-Z] | "_" | [a-z] |
    # [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
    # | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] |
    # [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
    # [#x10000-#xEFFFF]
    CharMinusDash = "\x09\x0A\x0D\u0020-\u002C\u002E-\uD7FF\uE000-\uFFFD"
    # An XML comment: "<!--", then characters where a "-" is never
    # followed by another "-", then "-->".
    XmlComment = "<!--(?:-?[%s])*?-->" % CharMinusDash
    NameStartChar = (
        ":A-Z_a-z\xC0-\xD6\xD8-\xF6\xF8-\u02FF"
        + "\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F"
        + "\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD"
    )
    # + \U00010000-\U000EFFFF seems to be unsupported in python

    # NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
    #     [#x0300-#x036F] | [#x203F-#x2040]
    NameChar = NameStartChar + r"\-\.0-9" + "\xB7\u0300-\u036F\u203F-\u2040"
    Name = "[" + NameStartChar + "][" + NameChar + "]*"
    # Entity declaration. The "key" and "val" groups are required by
    # createEntity(); "val" includes the surrounding quotes. The closing
    # single quote is optional in the pattern.
    reKey = re.compile(
        "<!ENTITY[ \t\r\n]+(?P<key>" + Name + ")[ \t\r\n]+"
        "(?P<val>\"[^\"]*\"|'[^']*'?)[ \t\r\n]*>",
        re.DOTALL | re.M,
    )
    # add BOM to DTDs, details in bug 435002
    reHeader = re.compile("^\ufeff")
    # NOTE(review): comment delimiters and group name restored to mirror
    # XmlComment; confirm against the Parser base class expectations.
    reComment = re.compile("<!--(?P<val>-?[%s])*?-->" % CharMinusDash, re.S)
    # Parameter entity declaration followed by its reference, optionally
    # trailed by comments on the same line. "key" and "val" groups are
    # required by getNext(); "val" includes the surrounding quotes.
    rePE = re.compile(
        "<!ENTITY[ \t\r\n]+%[ \t\r\n]+(?P<key>" + Name + ")"
        "[ \t\r\n]+SYSTEM[ \t\r\n]+"
        "(?P<val>\"[^\"]*\"|'[^']*')[ \t\r\n]*>[ \t\r\n]*"
        "%" + Name + ";"
        "(?:[ \t]*(?:" + XmlComment + "[ \t\r\n]*)*\n?)?"
    )

    class Comment(Comment):
        """DTD comment whose value is the text between the delimiters."""

        @property
        def val(self):
            """Return the comment contents, cached after first access."""
            if self._val_cache is None:
                # Strip "<!--" and "-->" to comment contents
                self._val_cache = self.all[4:-3]
            return self._val_cache

    def getNext(self, ctx, offset):
        """
        Overload Parser.getNext to special-case ParsedEntities.
        Just check for a parsed entity if that method claims junk.

        <!ENTITY % foo SYSTEM "url">
        %foo;
        """
        # Skip a leading byte order mark so it is not parsed as content.
        if offset == 0 and self.reHeader.match(ctx.contents):
            offset += 1
        entity = Parser.getNext(self, ctx, offset)
        if (entity and isinstance(entity, Junk)) or entity is None:
            m = self.rePE.match(ctx.contents, offset)
            if m:
                entity = DTDEntity(
                    ctx, None, None, m.span(), m.span("key"), m.span("val")
                )
        return entity

    def createEntity(self, ctx, m, current_comment, white_space):
        """Build a DTDEntity from a ``reKey`` match.

        The "val" group includes the surrounding quotes, so the value span
        is narrowed by one character on each side.
        """
        val_start, val_end = m.span("val")
        valspan = (val_start + 1, val_end - 1)
        return DTDEntity(
            ctx, current_comment, white_space, m.span(), m.span("key"), valspan
        )