# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""
description:
  A CLI tool to extract perftest metadata from a Translations HTML test file.

example:
  ❯ python3 toolkit/components/translations/tests/scripts/translations-perf-data.py \\
      --page_path="toolkit/components/translations/tests/browser/translations-bencher-es.html" \\
      --model_path="~/Downloads/cab5e093-7b55-47ea-a247-9747cc0109e3.spm"

note:
  The vocab model file can be downloaded from the following page:
  https://gregtatum.github.io/taskcluster-tools/src/models/
"""

import argparse
import sys
from pathlib import Path

import sentencepiece as spm
from bs4 import BeautifulSoup
from icu import BreakIterator, Locale


class CustomArgumentParser(argparse.ArgumentParser):
    """Custom argument parser to display help on errors."""

    def error(self, message):
        """Override error to display the help message."""
        print(f"\nerror: {message}\n", file=sys.stderr)
        self.print_help()
        sys.exit(2)


def parse_arguments() -> argparse.Namespace:
    """Parse the CLI arguments."""
    parser = CustomArgumentParser(
        description=__doc__,  # Use the module's docstring as the description.
        formatter_class=argparse.RawDescriptionHelpFormatter,  # Preserve the docstring's formatting in --help output.
    )
    parser.add_argument(
        "--page_path",
        required=True,
        type=Path,
        help="The HTML test file from which to extract perftest metadata.",
    )
    parser.add_argument(
        "--model_path",
        required=True,
        type=Path,
        help="The SentencePiece vocab model file for the test page's language.",
    )
    # Return the parsed arguments so the annotated argparse.Namespace type is accurate.
    return parser.parse_args()


def extract_page_language(html_path: Path) -> str:
    """Extract the lang attribute from the HTML file."""
    with html_path.open("r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
        lang = soup.find("html").get("lang")

    if not lang:
        raise ValueError(f"Language not specified in the HTML file at {html_path}.")
    return lang
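# For example (illustrative): a page beginning with <html lang="es"> yields "es".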


def extract_body_text(page_language: str, html_path: Path) -> str:
    """Extract text content from the <body> element of an HTML file,
    ignoring sub-elements with a lang attribute not matching the page language."""
    with html_path.open("r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    body = soup.find("body")
    if body is None:
        raise ValueError(f"No <body> element found in the HTML file at {html_path}.")

    # Remove all elements whose `lang` attribute does not match page_language.
    for element in body.find_all(attrs={"lang": True}):
        if element["lang"] != page_language:
            element.decompose()  # Remove the element and its children.

    return body.get_text()
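# For example (illustrative): in a page with <html lang="es">, an embedded
# <div lang="en">English caption</div> is decomposed before get_text() runs,
# so its text is excluded from the word and token counts below.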


def is_word_like(segment: str) -> bool:
    """Determine if a segment is word-like."""
    segment = segment.strip()

    if not segment:
        # A word-like segment should not be only whitespace.
        return False

    # A word-like segment should not be only punctuation.
    return any(char.isalnum() for char in segment)
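# Illustrative examples:
#   is_word_like("mundo") -> True
#   is_word_like("   ")   -> False (only whitespace)
#   is_word_like("...")   -> False (only punctuation)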


def count_words(text: str, language: str) -> int:
    """Count the words in text using ICU BreakIterator."""
    locale = Locale(language)
    break_iterator = BreakIterator.createWordInstance(locale)
    break_iterator.setText(text)

    word_count = 0
    lhs_boundary = break_iterator.first()
    rhs_boundary = break_iterator.nextBoundary()

    # Each adjacent pair of boundaries delimits one segment (a word, a run of
    # whitespace, or punctuation); count only the word-like segments.
    while rhs_boundary != BreakIterator.DONE:
        if is_word_like(text[lhs_boundary:rhs_boundary]):
            word_count += 1

        lhs_boundary = rhs_boundary
        rhs_boundary = break_iterator.nextBoundary()

    return word_count
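# For example (illustrative): the word BreakIterator segments "Hola, mundo."
# into "Hola", ",", " ", "mundo", and ".", of which only "Hola" and "mundo"
# are word-like, giving a word count of 2.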


def count_tokens(text: str, model_path: Path) -> int:
    """Count the tokens in the text using SentencePiece."""
    processor = spm.SentencePieceProcessor(model_file=str(model_path))
    # encode() returns one integer ID per subword piece, so the length of the
    # encoding is the token count.
    return len(processor.encode(text))
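# Note: SentencePiece often splits a single word into multiple subword pieces,
# so the token count is typically higher than the word count for the same text.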


def main() -> None:
    args = parse_arguments()

    args.page_path = args.page_path.expanduser()
    args.model_path = args.model_path.expanduser()

    page_language = extract_page_language(args.page_path)
    body_text = extract_body_text(page_language, args.page_path)

    token_count = count_tokens(body_text, args.model_path)
    word_count = count_words(body_text, page_language)

    # Print the metadata as JS-style key/value lines, ready to paste.
    print()
    print(f'pageLanguage: "{page_language}",')
    print(f"tokenCount: {token_count},")
    print(f"wordCount: {word_count},")

    print("\n⏩ NEXT STEPS ⏩\n")
    print(
        "This metadata should be added to the TranslationsBencher static #PAGE_DATA located in:\n"
    )
    print("browser/components/translations/tests/browser/head.js")
    print()
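# Example output (values are illustrative):
#
#   pageLanguage: "es",
#   tokenCount: 1432,
#   wordCount: 1105,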


if __name__ == "__main__":
    main()