trisquel-icecat/icecat/l10n/compare-locales/compare_locales/checks/dtd.py

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from io import BytesIO
import re
from xml import sax

from compare_locales.parser import DTDParser
from .base import Checker, CSSCheckMixin


class DTDChecker(Checker, CSSCheckMixin):
    """Tests to run on DTD files.

    Uses xml.sax for the heavy lifting of xml parsing.

    The code tries to parse until it doesn't find any unresolved entities
    anymore. If it finds one, it tries to grab the key, and adds an empty
    <!ENTITY key ""> definition to the header.

    Also checks for some CSS and number heuristics in the values.
    """

    pattern = re.compile(r".*\.dtd$")
    needs_reference = True  # to cast a wider net for known entity references

    eref = re.compile("&(%s);" % DTDParser.Name)
    tmpl = b"""<!DOCTYPE elem [%s]>
<elem>%s</elem>
"""
    xmllist = {"amp", "lt", "gt", "apos", "quot"}

    def __init__(self, extra_tests, locale=None):
        super().__init__(extra_tests, locale=locale)
        self.processContent = False
        if self.extra_tests is not None and "android-dtd" in self.extra_tests:
            self.processContent = True
        self.__known_entities = None

    def known_entities(self, refValue):
        if self.__known_entities is None and self.reference is not None:
            self.__known_entities = set()
            for ent in self.reference.values():
                self.__known_entities.update(self.entities_for_value(ent.raw_val))
        return (
            self.__known_entities
            if self.__known_entities is not None
            else self.entities_for_value(refValue)
        )

    def entities_for_value(self, value):
        reflist = {m.group(1) for m in self.eref.finditer(value)}
        reflist -= self.xmllist
        return reflist

    # Setup for XML parser, with default and text-only content handler
    class TextContent(sax.handler.ContentHandler):
        textcontent = ""

        def characters(self, content):
            self.textcontent += content

    defaulthandler = sax.handler.ContentHandler()
    texthandler = TextContent()

    numPattern = r"([0-9]+|[0-9]*\.[0-9]+)"
    num = re.compile("^%s$" % numPattern)
    lengthPattern = "%s(em|px|ch|cm|in)" % numPattern
    length = re.compile("^%s$" % lengthPattern)

    def check(self, refEnt, l10nEnt):
        """Try to parse the refvalue inside a dummy element, and keep
        track of entities that we need to define to make that work.

        Return a checker that offers just those entities.
        """
        yield from super().check(refEnt, l10nEnt)
        refValue, l10nValue = refEnt.raw_val, l10nEnt.raw_val
        # find entities the refValue references,
        # reusing markup from DTDParser.
        reflist = self.known_entities(refValue)
        inContext = self.entities_for_value(refValue)
        entities = "".join('<!ENTITY %s "">' % s for s in sorted(reflist))
        parser = sax.make_parser()
        parser.setFeature(sax.handler.feature_external_ges, False)

        parser.setContentHandler(self.defaulthandler)
        try:
            parser.parse(
                BytesIO(
                    self.tmpl % (entities.encode("utf-8"), refValue.encode("utf-8"))
                )
            )
            # also catch stray %
            parser.parse(
                BytesIO(
                    self.tmpl
                    % (
                        (refEnt.all + entities).encode("utf-8"),
                        b"&%s;" % refEnt.key.encode("utf-8"),
                    )
                )
            )
        except sax.SAXParseException as e:
            e  # noqa
            yield ("warning", (0, 0), "can't parse en-US value", "xmlparse")

        # find entities the l10nValue references,
        # reusing markup from DTDParser.
        l10nlist = self.entities_for_value(l10nValue)
        missing = sorted(l10nlist - reflist)
        _entities = entities + "".join('<!ENTITY %s "">' % s for s in missing)
        if self.processContent:
            self.texthandler.textcontent = ""
            parser.setContentHandler(self.texthandler)
        try:
            parser.parse(
                BytesIO(
                    self.tmpl % (_entities.encode("utf-8"), l10nValue.encode("utf-8"))
                )
            )
            # also catch stray %
            # if this fails, we need to substract the entity definition
            parser.setContentHandler(self.defaulthandler)
            parser.parse(
                BytesIO(
                    self.tmpl
                    % (
                        (l10nEnt.all + _entities).encode("utf-8"),
                        b"&%s;" % l10nEnt.key.encode("utf-8"),
                    )
                )
            )
        except sax.SAXParseException as e:
            # xml parse error, yield error
            # sometimes, the error is reported on our fake closing
            # element, make that the end of the last line
            lnr = e.getLineNumber() - 1
            lines = l10nValue.splitlines()
            if lnr > len(lines):
                lnr = len(lines)
                col = len(lines[lnr - 1])
            else:
                col = e.getColumnNumber()
                if lnr == 1:
                    # first line starts with <elem>, substract
                    col -= len("<elem>")
                elif lnr == 0:
                    col -= len("<!DOCTYPE elem [")  # first line is DOCTYPE
            yield ("error", (lnr, col), " ".join(e.args), "xmlparse")

        warntmpl = "Referencing unknown entity `%s`"
        if reflist:
            if inContext:
                elsewhere = reflist - inContext
                warntmpl += " (%s used in context" % ", ".join(sorted(inContext))
                if elsewhere:
                    warntmpl += ", %s known)" % ", ".join(sorted(elsewhere))
                else:
                    warntmpl += ")"
            else:
                warntmpl += " (%s known)" % ", ".join(sorted(reflist))
        for key in missing:
            yield ("warning", (0, 0), warntmpl % key, "xmlparse")
        if inContext and l10nlist and l10nlist - inContext - set(missing):
            mismatch = sorted(l10nlist - inContext - set(missing))
            for key in mismatch:
                yield (
                    "warning",
                    (0, 0),
                    "Entity {} referenced, but {} used in context".format(
                        key, ", ".join(sorted(inContext))
                    ),
                    "xmlparse",
                )

        # Number check
        if self.num.match(refValue) and not self.num.match(l10nValue):
            yield ("warning", 0, "reference is a number", "number")
        # CSS checks
        # just a length, width="100em"
        if self.length.match(refValue) and not self.length.match(l10nValue):
            yield ("error", 0, "reference is a CSS length", "css")
        # Check for actual CSS style attribute values
        yield from self.maybe_style(refValue, l10nValue)

        if self.extra_tests is not None and "android-dtd" in self.extra_tests:
            yield from self.processAndroidContent(self.texthandler.textcontent)

    quoted = re.compile("(?P<q>[\"']).*(?P=q)$")

    def unicode_escape(self, str):
        """Helper method to try to decode all unicode escapes in a string.

        This code uses the standard python decode for unicode-escape, but
        that's somewhat tricky, as its input needs to be ascii. To get to
        ascii, the unicode string gets converted to ascii with
        backslashreplace, i.e., all non-ascii unicode chars get unicode
        escaped. And then we try to roll all of that back.
        Now, when that hits an error, that's from the original string, and we
        need to search for the actual error position in the original string,
        as the backslashreplace code changes string positions quite badly.
        See also the last check in TestAndroid.test_android_dtd, with a
        lengthy chinese string.
        """
        val = str.encode("ascii", "backslashreplace")
        try:
            val.decode("unicode-escape")
        except UnicodeDecodeError as e:
            args = list(e.args)
            badstring = args[1][args[2] : args[3]]
            i = len(args[1][: args[2]].decode("unicode-escape"))
            args[2] = i
            args[3] = i + len(badstring)
            raise UnicodeDecodeError(*args)

    def processAndroidContent(self, val):
        """Check for the string values that Android puts into an XML container.

        http://developer.android.com/guide/topics/resources/string-resource.html#FormattingAndStyling  # noqa

        Check for unicode escapes and unescaped quotes and apostrophes,
        if string's not quoted.
        """
        # first, try to decode unicode escapes
        try:
            self.unicode_escape(val)
        except UnicodeDecodeError as e:
            yield ("error", e.args[2], e.args[4], "android")
        # check for unescaped single or double quotes.
        # first, see if the complete string is single or double quoted,
        # that changes the rules
        m = self.quoted.match(val)
        if m:
            q = m.group("q")
            offset = 0
            val = val[1:-1]  # strip quotes
        else:
            q = "[\"']"
            offset = -1
        stray_quot = re.compile(r"[\\\\]*(%s)" % q)

        for m in stray_quot.finditer(val):
            if len(m.group(0)) % 2:
                # found an unescaped single or double quote, which message?
                if m.group(1) == '"':
                    msg = (
                        'Quotes in Android DTDs need escaping with \\" '
                        "or \\u0022, or put string in apostrophes."
                    )
                else:
                    msg = (
                        "Apostrophes in Android DTDs need escaping with "
                        "\\' or \\u0027, or use \u2019, or put string in "
                        "quotes."
                    )
                yield ("error", m.end(0) + offset, msg, "android")