# This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. import re import bisect import codecs from collections import Counter from compare_locales.keyedtuple import KeyedTuple from compare_locales.paths import File __constructors = [] # The allowed capabilities for the Parsers. They define the exact strategy # used by ContentComparer.merge. # Don't perform any merging CAN_NONE = 0 # Copy the entire reference file CAN_COPY = 1 # Remove broken entities from localization # Without CAN_MERGE, en-US is not good to use for localization. CAN_SKIP = 2 # Add missing and broken entities from the reference to localization # This effectively means that en-US is good to use for localized files. CAN_MERGE = 4 class Entry: """ Abstraction layer for a localizable entity. Currently supported are grammars of the form: 1: entity definition 2: entity key (name) 3: entity value <--- definition ----> """ def __init__(self, ctx, pre_comment, inner_white, span, key_span, val_span): self.ctx = ctx self.span = span self.key_span = key_span self.val_span = val_span self.pre_comment = pre_comment self.inner_white = inner_white def position(self, offset=0): """Get the 1-based line and column of the character with given offset into the Entity. If offset is negative, return the end of the Entity. """ if offset < 0: pos = self.span[1] else: pos = self.span[0] + offset return self.ctx.linecol(pos) def value_position(self, offset=0): """Get the 1-based line and column of the character with given offset into the value. If offset is negative, return the end of the value. """ assert self.val_span is not None if offset < 0: pos = self.val_span[1] else: pos = self.val_span[0] + offset return self.ctx.linecol(pos) def _span_start(self): start = self.span[0] if hasattr(self, "pre_comment") and self.pre_comment is not None: start = self.pre_comment.span[0] return start @property def all(self): start = self._span_start() end = self.span[1] return self.ctx.contents[start:end] @property def key(self): return self.ctx.contents[self.key_span[0] : self.key_span[1]] @property def raw_val(self): if self.val_span is None: return None return self.ctx.contents[self.val_span[0] : self.val_span[1]] @property def val(self): return self.raw_val def __repr__(self): return self.key re_br = re.compile("", re.U) re_sgml = re.compile(r"", re.U | re.M) def count_words(self): """Count the words in an English string. Replace a couple of xml markup to make that safer, too. """ value = self.re_br.sub("\n", self.val) value = self.re_sgml.sub("", value) return len(value.split()) def equals(self, other): return self.key == other.key and self.val == other.val class StickyEntry(Entry): """Subclass of Entry to use in for syntax fragments which should always be overwritten in the serializer. """ pass class Entity(Entry): @property def localized(self): """Is this entity localized. Always true for monolingual files. In bilingual files, this is a dynamic property. """ return True def unwrap(self): """Return the literal value to be used by tools.""" return self.raw_val def wrap(self, raw_val): """Create literal entity based on reference and raw value. This is used by the serialization logic. """ start = self._span_start() all = ( self.ctx.contents[start : self.val_span[0]] + raw_val + self.ctx.contents[self.val_span[1] : self.span[1]] ) return LiteralEntity(self.key, raw_val, all) class LiteralEntity(Entity): """Subclass of Entity to represent entities without context slices. It's storing string literals for key, raw_val and all instead of spans. """ def __init__(self, key, val, all): super().__init__(None, None, None, None, None, None) self._key = key self._raw_val = val self._all = all @property def key(self): return self._key @property def raw_val(self): return self._raw_val @property def all(self): return self._all class PlaceholderEntity(LiteralEntity): """Subclass of Entity to be removed in merges.""" def __init__(self, key): super().__init__(key, "", "\nplaceholder\n") class Comment(Entry): def __init__(self, ctx, span): self.ctx = ctx self.span = span self.val_span = None self._val_cache = None @property def key(self): return None @property def val(self): if self._val_cache is None: self._val_cache = self.all return self._val_cache def __repr__(self): return self.all class OffsetComment(Comment): """Helper for file formats that have a constant number of leading chars to strip from comments. Offset defaults to 1 """ comment_offset = 1 @property def val(self): if self._val_cache is None: self._val_cache = "".join( line[self.comment_offset :] for line in self.all.splitlines(True) ) return self._val_cache class Junk: """ An almost-Entity, representing junk data that we didn't parse. This way, we can signal bad content as stuff we don't understand. And the either fix that, or report real bugs in localizations. """ junkid = 0 def __init__(self, ctx, span): self.ctx = ctx self.span = span self.__class__.junkid += 1 self.key = "_junk_%d_%d-%d" % (self.__class__.junkid, span[0], span[1]) def position(self, offset=0): """Get the 1-based line and column of the character with given offset into the Entity. If offset is negative, return the end of the Entity. """ if offset < 0: pos = self.span[1] else: pos = self.span[0] + offset return self.ctx.linecol(pos) @property def all(self): return self.ctx.contents[self.span[0] : self.span[1]] @property def raw_val(self): return self.all @property def val(self): return self.all def error_message(self): params = (self.val,) + self.position() + self.position(-1) return ( 'Unparsed content "%s" from line %d column %d' " to line %d column %d" % params ) def __repr__(self): return self.key class Whitespace(Entry): """Entity-like object representing an empty file with whitespace, if allowed """ def __init__(self, ctx, span): self.ctx = ctx self.span = self.key_span = self.val_span = span def __repr__(self): return self.raw_val class BadEntity(ValueError): """Raised when the parser can't create an Entity for a found match.""" pass class Parser: capabilities = CAN_SKIP | CAN_MERGE reWhitespace = re.compile("[ \t\r\n]+", re.M) Comment = Comment # NotImplementedError would be great, but also tedious reKey = reComment = None class Context: "Fixture for content and line numbers" def __init__(self, contents): self.contents = contents # cache split lines self._lines = None def linecol(self, position): "Returns 1-based line and column numbers." if self._lines is None: nl = re.compile("\n", re.M) self._lines = [m.end() for m in nl.finditer(self.contents)] line_offset = bisect.bisect(self._lines, position) line_start = self._lines[line_offset - 1] if line_offset else 0 col_offset = position - line_start return line_offset + 1, col_offset + 1 def __init__(self): if not hasattr(self, "encoding"): self.encoding = "utf-8" self.ctx = None def readFile(self, file): """Read contents from disk, with universal_newlines""" if isinstance(file, File): file = file.fullpath # python 2 has binary input with universal newlines, # python 3 doesn't. Let's split code paths with open(file, encoding=self.encoding, errors="replace", newline=None) as f: self.readUnicode(f.read()) def readContents(self, contents): """Read contents and create parsing context. contents are in native encoding, but with normalized line endings. """ (contents, _) = codecs.getdecoder(self.encoding)(contents, "replace") self.readUnicode(contents) def readUnicode(self, contents): self.ctx = self.Context(contents) def parse(self): return KeyedTuple(self) def __iter__(self): return self.walk(only_localizable=True) def walk(self, only_localizable=False): if not self.ctx: # loading file failed, or we just didn't load anything return ctx = self.ctx contents = ctx.contents next_offset = 0 while next_offset < len(contents): entity = self.getNext(ctx, next_offset) if isinstance(entity, (Entity, Junk)): yield entity elif not only_localizable: yield entity next_offset = entity.span[1] def getNext(self, ctx, offset): """Parse the next fragment. Parse comments first, then white-space. If an entity follows, create that entity with such pre_comment and inner white-space. If not, emit comment or white-space as standlone. It's OK that this might parse whitespace more than once. Comments are associated with entities if they're not separated by blank lines. Multiple consecutive comments are joined. """ junk_offset = offset m = self.reComment.match(ctx.contents, offset) if m: current_comment = self.Comment(ctx, m.span()) if offset < 2 and "License" in current_comment.val: # Heuristic. A early comment with "License" is probably # a license header, and should be standalone. # Not glueing ourselves to offset == 0 as we might have # skipped a BOM. return current_comment offset = m.end() else: current_comment = None m = self.reWhitespace.match(ctx.contents, offset) if m: white_space = Whitespace(ctx, m.span()) offset = m.end() if current_comment is not None and white_space.raw_val.count("\n") > 1: # standalone comment # return the comment, and reparse the whitespace next time return current_comment if current_comment is None: return white_space else: white_space = None m = self.reKey.match(ctx.contents, offset) if m: try: return self.createEntity(ctx, m, current_comment, white_space) except BadEntity: # fall through to Junk, probably pass if current_comment is not None: return current_comment if white_space is not None: return white_space return self.getJunk(ctx, junk_offset, self.reKey, self.reComment) def getJunk(self, ctx, offset, *expressions): junkend = None for exp in expressions: m = exp.search(ctx.contents, offset) if m: junkend = min(junkend, m.start()) if junkend else m.start() return Junk(ctx, (offset, junkend or len(ctx.contents))) def createEntity(self, ctx, m, current_comment, white_space): return Entity( ctx, current_comment, white_space, m.span(), m.span("key"), m.span("val") ) @classmethod def findDuplicates(cls, entities): found = Counter(entity.key for entity in entities) for entity_id, cnt in found.items(): if cnt > 1: yield f"{entity_id} occurs {cnt} times"