# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""
description:
  A CLI tool to extract perftest metadata from a Translations HTML test file.

example:
  ❯ python3 toolkit/components/translations/tests/scripts/translations-perf-data.py \\
      --page_path="toolkit/components/translations/tests/browser/translations-bencher-es.html" \\
      --model_path="~/Downloads/cab5e093-7b55-47ea-a247-9747cc0109e3.spm"

note:
  The vocab model file can be downloaded from the following page:
  https://gregtatum.github.io/taskcluster-tools/src/models/
"""

import argparse
import sys
from pathlib import Path

import sentencepiece as spm
from bs4 import BeautifulSoup
from icu import BreakIterator, Locale


class CustomArgumentParser(argparse.ArgumentParser):
    """Custom argument parser to display help on errors."""

    def error(self, message):
        """Print the error to stderr, show the full help text, and exit with status 2."""
        print(f"\nerror: {message}\n", file=sys.stderr)
        self.print_help()
        sys.exit(2)


def parse_arguments() -> argparse.ArgumentParser:
    """Build the CLI argument parser.

    Returns:
        The configured parser. The caller is responsible for invoking
        ``parse_args()`` on it (see ``main``).
    """
    parser = CustomArgumentParser(
        description=__doc__,  # Use the module's docstring as the description
        # Preserve the docstring's line breaks in the help output.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument(
        "--page_path",
        required=True,
        type=Path,
        help="The HTML test file from which to extract perftest metadata.",
    )

    parser.add_argument(
        "--model_path",
        required=True,
        type=Path,
        help="The SentencePiece vocab model file for the test page's language.",
    )

    return parser


def extract_page_language(html_path: Path) -> str:
    """Extract the lang attribute of the <html> element from the HTML file.

    Args:
        html_path: Path to the HTML file, read as UTF-8.

    Returns:
        The value of the <html> element's ``lang`` attribute.

    Raises:
        ValueError: If the file has no <html> element, or that element has
            no (or an empty) ``lang`` attribute.
    """
    with html_path.open("r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    # A document without an <html> element has no page language either;
    # report it through the same ValueError instead of an AttributeError.
    html_element = soup.find("html")
    lang = html_element.get("lang") if html_element is not None else None

    if not lang:
        raise ValueError(f"Language not specified in the HTML file at {html_path}.")

    return lang


def extract_body_text(page_language: str, html_path: Path) -> str:
    """Extract text content from the <body> element of an HTML file.

    Sub-elements whose ``lang`` attribute does not match the page language
    are removed (together with their children) before the text is collected.

    Args:
        page_language: The language tag that elements must match to be kept.
        html_path: Path to the HTML file, read as UTF-8.

    Returns:
        The text of the <body> element after filtering.

    Raises:
        ValueError: If the file has no <body> element.
    """
    with html_path.open("r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    body = soup.find("body")

    if body is None:
        raise ValueError(f"No <body> element found in the HTML file at {html_path}.")

    # Find all elements with a `lang` attribute that does not match page_language.
    for element in body.find_all(attrs={"lang": True}):
        if element["lang"] != page_language:
            # Detach the element and its children from the tree. extract() is
            # used rather than decompose() because the list being iterated may
            # still contain descendants of this element, and a decomposed
            # element can no longer be inspected.
            element.extract()

    return body.get_text()


def is_word_like(segment: str) -> bool:
    """Determine if a segment is word-like.

    Args:
        segment: A piece of text between two word-break boundaries.

    Returns:
        True if the segment contains at least one alphanumeric character.
    """
    segment = segment.strip()

    if not segment:
        # A word-like segment should not be only whitespace.
        return False

    # A word-like segment should not be only punctuation.
    return any(char.isalnum() for char in segment)


def count_words(text: str, language: str) -> int:
    """Count the words in text using ICU BreakIterator.

    Args:
        text: The text to segment.
        language: The locale identifier used to create the word iterator.

    Returns:
        The number of segments between word boundaries that are word-like.
    """
    locale = Locale(language)
    break_iterator = BreakIterator.createWordInstance(locale)
    break_iterator.setText(text)

    # ICU reports boundaries as UTF-16 code-unit offsets, whereas Python
    # strings are indexed by code point. Slice the UTF-16 encoding (two bytes
    # per code unit) so segments stay aligned when the text contains
    # characters outside the Basic Multilingual Plane (e.g. emoji).
    utf16_text = text.encode("utf-16-le", "surrogatepass")

    word_count = 0
    lhs_boundary = break_iterator.first()
    rhs_boundary = break_iterator.nextBoundary()

    while rhs_boundary != BreakIterator.DONE:
        segment = utf16_text[2 * lhs_boundary : 2 * rhs_boundary].decode(
            "utf-16-le", "surrogatepass"
        )
        if is_word_like(segment):
            word_count += 1
        lhs_boundary = rhs_boundary
        rhs_boundary = break_iterator.nextBoundary()

    return word_count


def count_tokens(text: str, model_path: Path) -> int:
    """Count the tokens in the text using SentencePiece.

    Args:
        text: The text to tokenize.
        model_path: Path to the SentencePiece model file to load.

    Returns:
        The number of tokens the model encodes the text into.
    """
    processor = spm.SentencePieceProcessor(model_file=str(model_path))
    return len(processor.encode(text))


def main() -> None:
    """Parse the CLI arguments, compute the page metadata, and print it."""
    parser = parse_arguments()
    args = parser.parse_args()

    # Allow "~" in the paths even when the shell did not expand it
    # (e.g. when the value is quoted, as in the module docstring's example).
    args.page_path = args.page_path.expanduser()
    args.model_path = args.model_path.expanduser()

    page_language = extract_page_language(args.page_path)
    body_text = extract_body_text(page_language, args.page_path)
    token_count = count_tokens(body_text, args.model_path)
    word_count = count_words(body_text, page_language)

    print()
    print(f'pageLanguage: "{page_language}",')
    print(f"tokenCount: {token_count},")
    print(f"wordCount: {word_count},")

    print("\n⏩ NEXT STEPS ⏩\n")
    print(
        "These metadata should be added to the TranslationsBencher static #PAGE_DATA located in:\n"
    )
    print("browser/components/translations/tests/browser/head.js")
    print()


if __name__ == "__main__":
    main()