#! /usr/bin/python3 # Copyright (C) 2024 Luis Guzmán # Copyright (C) 2020, 2021, 2022, 2023, 2024 grizzlyuser # Based on: https://gitlab.trisquel.org/trisquel/wrapage-helpers/-/blob/81881d89b2bf7d502dd14fcccdb471fec6f6b206/helpers/DATA/firefox/reprocess-search-config.py # Below is the notice from the original author: # # Copyright (C) 2020, 2021 Ruben Rodriguez # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA import json import sys import time import copy import argparse import pathlib import logging from collections import namedtuple from jsonschema import validate parser = argparse.ArgumentParser() parser.add_argument( 'MAIN_PATH', type=pathlib.Path, help='path to main application source code directory') parser.add_argument( 'BRANDING_PATH', type=pathlib.Path, help='path to branding source code directory') parser.add_argument( '-i', '--indent', type=int, default=2, help='indent for pretty printing of output files') parser.add_argument( '-l', '--loglevel', choices=logging._nameToLevel.keys(), default=logging.INFO, help='logging level') arguments = parser.parse_args() logging.basicConfig(level=arguments.loglevel) logger = logging.getLogger(str(pathlib.Path(__file__).name)) File = namedtuple('File', ['path', 'content']) class JsonProcessor: @classmethod def process(cls): parsed_jsons = [] for json_path in cls.JSON_PATHS: logger.info('Reading input: ' + str(json_path) + '...') with json_path.open(encoding='utf-8') as file: parsed_jsons.append(File(json_path, json.load(file))) parsed_schema = None if hasattr(cls, "SCHEMA_PATH"): logger.info('Reading schema: ' + str(json_path) + '...') with cls.SCHEMA_PATH.open() as file: parsed_schema = json.load(file) processed = cls.process_parsed(parsed_jsons, parsed_schema) with processed.path.open('w') as file: json.dump(processed.content, file, indent=arguments.indent) logger.info('Wrote: ' + str(processed.path)) class RemoteSettings(JsonProcessor): DUMPS_PATH_RELATIVE = 'services/settings/dumps' DUMPS_PATH_ABSOLUTE = arguments.MAIN_PATH / DUMPS_PATH_RELATIVE _WRAPPER_NAME = 'data' _LAST_MODIFIED_KEY_NAME = 'last_modified' @classmethod def get_collection_timestamp(cls, collection): return max((record[cls._LAST_MODIFIED_KEY_NAME] for record in collection.content), default=0) @classmethod def wrap(cls, processed): return File(processed.path, {cls._WRAPPER_NAME: processed.content, 'timestamp': cls.get_collection_timestamp(processed)}) @classmethod def unwrap(cls, parsed_jsons): return [File(json.path, json.content[cls._WRAPPER_NAME]) for json in parsed_jsons] @classmethod def should_modify_collection(cls, collection): return True @classmethod def now(cls): return int(round(time.time() * 1000)) @classmethod def process_raw(cls, unwrapped_jsons, parsed_schema): timestamps, result = [], [] for collection in unwrapped_jsons: should_modify_collection = cls.should_modify_collection(collection) for record in collection.content: if should_modify_collection: if cls.should_drop_record(record): continue clone = copy.deepcopy(record) record = cls.process_record(record) if clone != record: timestamp = cls.now() while timestamp in timestamps: timestamp += 1 timestamps.append(timestamp) record[cls._LAST_MODIFIED_KEY_NAME] = timestamp if parsed_schema is not None: validate(record, schema=parsed_schema) result.append(record) result.sort( key=lambda record: record[cls._LAST_MODIFIED_KEY_NAME], reverse=True) cls.OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True) return File(cls.OUTPUT_PATH, result) @classmethod def process_parsed(cls, parsed_jsons, parsed_schema): return cls.wrap( cls.process_raw( cls.unwrap(parsed_jsons), parsed_schema)) class EmptyRemoteSettings(RemoteSettings): @classmethod def should_drop_record(cls, search_engine): return True @classmethod def process_record(cls, record): return record class Changes(RemoteSettings): JSON_PATHS = tuple(RemoteSettings.DUMPS_PATH_ABSOLUTE.glob('*/*.json')) OUTPUT_PATH = RemoteSettings.DUMPS_PATH_ABSOLUTE / 'monitor/changes' @classmethod def wrap(cls, processed): return File( processed.path, { 'changes': processed.content, 'timestamp': cls.now()}) @classmethod def process_raw(cls, unwrapped_jsons, parsed_schema): changes = [] for collection in unwrapped_jsons: if collection.path != RemoteSettings.DUMPS_PATH_ABSOLUTE / 'main/example.json': latest_change = {} latest_change[cls._LAST_MODIFIED_KEY_NAME] = cls.get_collection_timestamp( collection) latest_change['bucket'] = collection.path.parent.name latest_change['collection'] = collection.path.stem changes.append(latest_change) cls.OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True) return File(cls.OUTPUT_PATH, changes) class SearchConfigV2(RemoteSettings): JSON_PATHS = ( RemoteSettings.DUMPS_PATH_ABSOLUTE / 'main/search-config-v2.json', ) SCHEMA_PATH = arguments.MAIN_PATH / \ 'toolkit/components/search/schema/search-config-v2-schema.json' OUTPUT_PATH = JSON_PATHS[0] _DUCKDUCKGO_SEARCH_ENGINE_IDENTIFIER = 'ddg' @classmethod def should_drop_record(cls, record): if record['recordType'] != 'engine': return False identifier = record['identifier'] excluded_identifiers = ['ecosia', 'qwant', 'trisquel', 'trisquel-packages'] return ( identifier != cls._DUCKDUCKGO_SEARCH_ENGINE_IDENTIFIER and not (identifier.startswith('wikipedia') or identifier in excluded_identifiers) ) @classmethod def process_record(cls, record): if record['recordType'] == 'defaultEngines': return cls.process_default_engines(record) elif record['recordType'] == 'engine': return cls.process_engine(record) elif record['recordType'] == 'engineOrders': return cls.process_engine_orders(record) else: return record @classmethod def process_default_engines(cls, default_engines): default_engines['globalDefault'] = cls._DUCKDUCKGO_SEARCH_ENGINE_IDENTIFIER default_engines['specificDefaults'] = [] return default_engines @classmethod def process_engine(cls, engine): engine['base'].pop('partnerCode', None) engine['base']['urls']['search'].pop('params', None) if engine['identifier'] == cls._DUCKDUCKGO_SEARCH_ENGINE_IDENTIFIER: engine['base']['name'] += ' HTML' engine['base']['urls']['search']['base'] = 'https://html.duckduckgo.com/html' allRegions_prefixes = ['ecosia', 'qwant', 'trisquel'] if any(engine['identifier'].startswith(prefix) for prefix in allRegions_prefixes) or \ engine['identifier'] == cls._DUCKDUCKGO_SEARCH_ENGINE_IDENTIFIER: engine['variants'] = [{'environment': {'allRegionsAndLocales': True}}] return engine @classmethod def process_engine_orders(cls, engine_orders): engine_orders['orders'] = [] return engine_orders class SearchConfigOverridesV2(EmptyRemoteSettings): JSON_PATHS = ( RemoteSettings.DUMPS_PATH_ABSOLUTE / 'main/search-config-overrides-v2.json', ) SCHEMA_PATH = arguments.MAIN_PATH / \ 'toolkit/components/search/schema/search-config-overrides-v2-schema.json' OUTPUT_PATH = JSON_PATHS[0] class SearchDefaultOverrideAllowlist(EmptyRemoteSettings): JSON_PATHS = ( RemoteSettings.DUMPS_PATH_ABSOLUTE / 'main/search-default-override-allowlist.json', ) SCHEMA_PATH = arguments.MAIN_PATH / \ 'toolkit/components/search/schema/search-default-override-allowlist-schema.json' OUTPUT_PATH = JSON_PATHS[0] class SearchTelemetryV2(EmptyRemoteSettings): JSON_PATHS = ( RemoteSettings.DUMPS_PATH_ABSOLUTE / 'main/search-telemetry-v2.json', ) SCHEMA_PATH = arguments.MAIN_PATH / \ 'browser/components/search/schema/search-telemetry-v2-schema.json' OUTPUT_PATH = JSON_PATHS[0] class UrlClassifierSkipUrls(EmptyRemoteSettings): JSON_PATHS = ( RemoteSettings.DUMPS_PATH_ABSOLUTE / 'main/url-classifier-skip-urls.json', ) OUTPUT_PATH = JSON_PATHS[0] class TippyTopSites(JsonProcessor): JSON_PATHS = ( arguments.MAIN_PATH / 'browser/components/newtab/data/content/tippytop/top_sites.json', arguments.BRANDING_PATH / 'tippytop/top_sites.json') @classmethod def process_parsed(cls, parsed_jsons, parsed_schema): tippy_top_sites_main = parsed_jsons[0] tippy_top_sites_branding = parsed_jsons[1] result = tippy_top_sites_branding.content + \ [site for site in tippy_top_sites_main.content if 'wikipedia.org' in site['domains']] return File(tippy_top_sites_main.path, result) class TopSites(RemoteSettings): _TOP_SITES_JSON_PATH = 'main/top-sites.json' _TOP_SITES_PATH_MAIN = RemoteSettings.DUMPS_PATH_ABSOLUTE / _TOP_SITES_JSON_PATH JSON_PATHS = ( arguments.BRANDING_PATH / RemoteSettings.DUMPS_PATH_RELATIVE / _TOP_SITES_JSON_PATH, _TOP_SITES_PATH_MAIN) OUTPUT_PATH = _TOP_SITES_PATH_MAIN @classmethod def should_modify_collection(cls, collection): return cls._TOP_SITES_PATH_MAIN == collection.path @classmethod def should_drop_record(cls, site): return site['url'] != 'https://www.wikipedia.org/' @classmethod def process_record(cls, site): site.pop('exclude_regions', None) return site # To reflect the latest timestamps, Changes class should always come after # all other RemoteSettings subclasses processors = ( SearchConfigV2, SearchConfigOverridesV2, SearchDefaultOverrideAllowlist, SearchTelemetryV2, UrlClassifierSkipUrls, TopSites, Changes, TippyTopSites) for processor in processors: processor.process()