trisquel-icecat/icecat/toolkit/components/translations/tests/scripts/translations-perf-data.py
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""
description:
A CLI tool to extract perftest metadata from a Translations HTML test file.
example:
python3 toolkit/components/translations/tests/scripts/translations-perf-data.py \\
--page_path="toolkit/components/translations/tests/browser/translations-bencher-es.html" \\
--model_path="~/Downloads/cab5e093-7b55-47ea-a247-9747cc0109e3.spm"
note:
The vocab model file can be downloaded from the following page:
https://gregtatum.github.io/taskcluster-tools/src/models/
"""
import argparse
import sys
from pathlib import Path

import sentencepiece as spm
from bs4 import BeautifulSoup
from icu import BreakIterator, Locale


class CustomArgumentParser(argparse.ArgumentParser):
    """Custom argument parser to display help on errors."""

    def error(self, message):
        """Override error to display the help message."""
        print(f"\nerror: {message}\n", file=sys.stderr)
        self.print_help()
        sys.exit(2)


def parse_arguments() -> argparse.ArgumentParser:
    """Build and return the CLI argument parser."""
    parser = CustomArgumentParser(
        description=__doc__,  # Use the module's docstring as the description.
        formatter_class=argparse.RawDescriptionHelpFormatter,  # Preserve the docstring's formatting.
    )
    parser.add_argument(
        "--page_path",
        required=True,
        type=Path,
        help="The HTML test file from which to extract perftest metadata.",
    )
    parser.add_argument(
        "--model_path",
        required=True,
        type=Path,
        help="The SentencePiece vocab model file for the test page's language.",
    )
    return parser


def extract_page_language(html_path: Path) -> str:
    """Extract the lang attribute from the HTML file."""
    with html_path.open("r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
    html_element = soup.find("html")
    # Guard against a missing <html> element as well as a missing lang attribute.
    lang = html_element.get("lang") if html_element else None
    if not lang:
        raise ValueError(f"Language not specified in the HTML file at {html_path}.")
    return lang


def extract_body_text(page_language: str, html_path: Path) -> str:
    """Extract text content from the <body> element of an HTML file,
    ignoring sub-elements whose lang attribute does not match the page language."""
    with html_path.open("r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
    body = soup.find("body")
    if body is None:
        raise ValueError(f"No <body> element found in the HTML file at {html_path}.")
    # Remove all elements whose `lang` attribute does not match page_language,
    # so that foreign-language snippets do not skew the word and token counts.
    for element in body.find_all(attrs={"lang": True}):
        if element["lang"] != page_language:
            element.decompose()  # Remove the element and its children.
    return body.get_text()


def is_word_like(segment: str) -> bool:
    """Determine if a segment is word-like."""
    segment = segment.strip()
    if not segment:
        # A word-like segment should not be only whitespace.
        return False
    # A word-like segment should not be only punctuation.
    return any(char.isalnum() for char in segment)


def count_words(text: str, language: str) -> int:
    """Count the words in the text using an ICU BreakIterator."""
    locale = Locale(language)
    break_iterator = BreakIterator.createWordInstance(locale)
    break_iterator.setText(text)
    word_count = 0
    # Walk adjacent boundary pairs; each [lhs, rhs) slice is one segment,
    # which may be a word, whitespace, or punctuation.
    lhs_boundary = break_iterator.first()
    rhs_boundary = break_iterator.nextBoundary()
    while rhs_boundary != BreakIterator.DONE:
        if is_word_like(text[lhs_boundary:rhs_boundary]):
            word_count += 1
        lhs_boundary = rhs_boundary
        rhs_boundary = break_iterator.nextBoundary()
    return word_count


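# A quick sanity check of count_words (illustrative, not part of the original
# script): ICU segments "Hello, world!" into ["Hello", ",", " ", "world", "!"],
# and is_word_like() keeps only "Hello" and "world".
#
#     count_words("Hello, world!", "en")  # -> 2

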
def count_tokens(text: str, model_path: Path) -> int:
    """Count the tokens in the text using SentencePiece."""
    processor = spm.SentencePieceProcessor(model_file=str(model_path))
    # encode() returns a list of token IDs; its length is the token count.
    return len(processor.encode(text))


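# Illustrative only: the exact subword split depends on the vocab model, but a
# Spanish model might encode "Hola mundo" as ["▁Hola", "▁mundo"], giving a
# token count of 2 for a word count of 2.

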
def main() -> None:
    parser = parse_arguments()
    args = parser.parse_args()
    args.page_path = args.page_path.expanduser()
    args.model_path = args.model_path.expanduser()

    page_language = extract_page_language(args.page_path)
    body_text = extract_body_text(page_language, args.page_path)
    token_count = count_tokens(body_text, args.model_path)
    word_count = count_words(body_text, page_language)

    print()
    print(f'pageLanguage: "{page_language}",')
    print(f"tokenCount: {token_count},")
    print(f"wordCount: {word_count},")
    print("\n⏩ NEXT STEPS ⏩\n")
    print(
        "This metadata should be added to the TranslationsBencher static #PAGE_DATA located in:\n"
    )
    print("browser/components/translations/tests/browser/head.js")
    print()


if __name__ == "__main__":
    main()
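

# Example output (illustrative values only; actual counts depend on the test
# page and the vocab model):
#
#   pageLanguage: "es",
#   tokenCount: 1234,
#   wordCount: 567,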