From 56045fd3f0526560dbffd997917c7e58eb8a7ab9 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Holcroft
Date: Aug 26 2023 13:10:15 +0000
Subject: [PATCH 1/17] srpm files should be outside of results

We remove the results folder each time we process our data, while we
want to be able to keep srpms between runs.

---

diff --git a/build.py b/build.py
index e3c30f2..f2ff6d4 100755
--- a/build.py
+++ b/build.py
@@ -76,7 +76,7 @@ def main():
         srpm_regex = re.compile("^{}$".format(args.filter))
 
     packages_folder = "./results/{v}/packages/".format(v=args.results)
-    srpms_path = os.path.abspath("./results/{v}/srpms/".format(v=args.results))
+    srpms_path = os.path.abspath("./srpms/".format(v=args.results))
 
     if not os.path.exists(packages_folder):
         os.makedirs(packages_folder)

From 6570089bf920da749d6659adb0a321de1ac3bd8b Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Holcroft
Date: Aug 26 2023 13:15:54 +0000
Subject: [PATCH 2/17] merge language stats files together

Move from two json files and one log file per language to one single
json; doing that allows us to open the file once to get the language
code and the debug information.

Also, the patterns may cover the same file multiple times; let's save
time by not opening the same file more than once.

This simplifies everything, which is good.

---

diff --git a/build_language_list.py b/build_language_list.py
index 828d079..cfd73f8 100755
--- a/build_language_list.py
+++ b/build_language_list.py
@@ -2,7 +2,6 @@
 """ Parse translation files to deduct language list """
 
 import argparse
-import csv
 import glob
 import json
 import os
@@ -31,20 +30,6 @@ def main():
     parser.add_argument("--refresh", action="store_true", help="Force refresh")
 
     parser.add_argument(
-        "--describe", action="store_true", help="Describe the current list of languages"
-    )
-
-    parser.add_argument(
-        "--analyzelang", type=str, help="Produce an analyze file for a language"
-    )
-
-    parser.add_argument(
-        "--analyzealllangs",
-        action="store_true",
-        help="Produce an analyze file for all languages",
-    )
-
-    parser.add_argument(
         "-v",
         "--verbose",
         default=False,
@@ -61,38 +46,6 @@ def main():
     results_folder = "./results/{v}/".format(v=args.results)
     lang_folder = os.path.join(results_folder, "languages/")
     package_folder = os.path.join(results_folder, "packages/")
-    lang_analyze_folder = os.path.join(results_folder, "languages-analyses/")
-
-    if args.describe:
-        log.info("Describing detecting languages")
-        describe(lang_folder)
-
-    elif args.analyzealllangs:
-        log.info("Provide more data to analyze errors")
-        rmtree(lang_analyze_folder, ignore_errors=True)
-        os.mkdir(lang_analyze_folder)
-
-        langs = [
-            f
-            for f in os.listdir(lang_folder)
-            if os.path.isfile(os.path.join(lang_folder, f))
-        ]
-        for lang in sorted(langs):
-            analyze = analyze_lang(lang_folder, lang[: -len(".json")])
-
-            with open(os.path.join(lang_analyze_folder, lang), "w") as f:
-                f.write(json.dumps(analyze, indent=2))
-
-    elif args.analyzelang:
-        log.info("Provide more data to analyze errors")
-        if not os.path.exists(lang_analyze_folder):
-            os.makedirs(lang_analyze_folder)
-
-        analyze = analyze_lang(lang_folder, args.analyzelang)
-        result_file = os.path.join(lang_analyze_folder, args.analyzelang + ".json")
-
-        with open(result_file, "w") as f:
-            f.write(json.dumps(analyze, indent=2))
 
     if args.refresh and os.path.isdir(lang_folder):
         rmtree(lang_folder)
@@ -100,7 +53,7 @@
     if os.path.exists(lang_folder) is False:
         log.info("Detecting the list of languages")
         os.makedirs(lang_folder)
-        po_langs = detect_languages(package_folder, results_folder)
+        
po_langs = scan_packages(package_folder) for lang in po_langs.keys(): with open(os.path.join(lang_folder, str(lang) + ".json"), "w") as f: @@ -109,79 +62,12 @@ def main(): log.info("done") -def analyze_lang(lang_folder, analized_lang): - """ Analyze one lang """ - log = logging.getLogger("buildLanguageList.analyze_lang") - files = [] - results = dict() - with open(os.path.join(lang_folder, analized_lang + ".json"), "r") as read_file: - files = json.load(read_file)["po"] - - log.info(" Analysing language {l}, with {c} files".format(l=analized_lang, c=len(files))) - - for file in files: - metadata = dict() - try: - metadata = polib.pofile(file).metadata - except OSError: - # maybe a polib bug? to investigate before using it in TM - metadata["Language"] = "error-os" - except TypeError: - metadata["Language"] = "error-type" - except UnicodeDecodeError: - # encoding error, to investigate before using it in TM - metadata["Language"] = "error-unicode" - - if "Language" not in metadata.keys(): - metadata["Language"] = "zzz_null" - elif metadata["Language"] == "": - metadata["Language"] = "zzz_empty" - - language = results.get(metadata.get("Language"), dict()) - - count = language.get("Count", 0) - count += 1 - language["Count"] = count - - lang_files = language.get("Files", []) - lang_files.append(file) - language["Files"] = sorted(lang_files) - - plurals = language.get("Plural-Forms", []) - plurals.append(metadata.get("Plural-Forms")) - plurals = list(set(plurals)) - language["Plural-Forms"] = plurals - - teams = language.get("Language-Team", []) - teams.append(metadata.get("Language-Team")) - teams = list(set(teams)) - language["Language-Team"] = teams - - results[metadata.get("Language")] = language - - return dict(sorted(results.items(), key=lambda item: item[0])) - - -def describe(lang_folder): - """ Provide the number of files per language """ - log = logging.getLogger("buildLanguageList.describe") - langs = [ - f - for f in os.listdir(lang_folder) - if os.path.isfile(os.path.join(lang_folder, f)) - ] - - for lang in sorted(langs): - with open(os.path.join(lang_folder, lang), "r") as read_file: - files = json.load(read_file) - - log.info(" {l}:{c}".format(l=lang[:-len('.json')], c=len(files))) - - -def detect_languages(package_folder, results_folder): - """ For each po file, detect metadatas and deduct the language """ - """ Requires: a file hierarchy with po files """ - """ Returns: a dictionary of lists, key=lang code, value=file list """ +def scan_packages(package_folder: str): + """ For each po file, detect metadata and deduct the language + Requires: a file hierarchy with po files + :param package_folder: + :return: a dictionary of lists, key=lang code, value=file lis + """ log = logging.getLogger("buildLanguageList.detect_languages") langs = {} packages = [ @@ -190,25 +76,36 @@ def detect_languages(package_folder, results_folder): if os.path.isdir(os.path.join(package_folder, f)) ] - log_file = os.path.join(results_folder, "build_language_list.log") - debug_file = list() count = 0 + processed_files_count = 0 + processed_files_duplicates_count = 0 total = len(packages) for package in sorted(packages): count += 1 log.info("{c}/{t} {p}".format(c=count, t=total, p=package)) discovery_file = os.path.join(package_folder, package, "discover.json") + processed_files = list() with open(discovery_file, "r") as read_file: - alls = json.load(read_file) + discover_patterns = json.load(read_file) - to_process = [p for p in alls if p["file_format"] == "po"] + po_patterns = [p for p in 
discover_patterns if p["file_format"] == "po"] - for pattern in to_process: - mask = os.path.join(package_folder, package, pattern["filemask"]) - p = re.compile(mask.replace("*", "(.*)").replace("+", r"\+")) + for pattern in po_patterns: + filemask = os.path.join(package_folder, package, pattern["filemask"]) + p = re.compile(filemask.replace("*", "(.*)").replace("+", r"\+")) + + for po in glob.glob(filemask): + + if po in processed_files: + # there is no need to process the file it were processed already + log.debug(f"{po} were already processed") + processed_files_duplicates_count += 1 + continue + + processed_files.append(po) + processed_files_count += 1 - for po in glob.glob(mask): result = p.search(po) lang_code = result.group(1) metadata = dict() @@ -224,35 +121,38 @@ def detect_languages(package_folder, results_folder): # maybe a polib bug? to investigate before using it in TM error = "error-os" - lang, decision = choose_lang(lang_code, metadata, error) + lang, decision = choose_language_code_from_po(lang_code, metadata) - debug_file.append([ - po, - lang_code, - metadata.get("Language", ""), - error, - lang, - str(decision), - ]) + debug_file = {"file": po, + "lang_in_path": lang_code, + "metadata_lang": metadata.get("Language", ""), + "metadata_plurals": metadata.get("Plural-Forms", ""), + "metadata_language_team": metadata.get("Language-Team", ""), + "polib_error": error, + "lang_code_chosen": lang, + "lang_code_decision": str(decision) + } lang_result = langs.get(lang, dict()) + po_results = lang_result.get("po", list()) - po_results.append(po) + po_results.append(debug_file) lang_result["po"] = po_results langs[lang] = lang_result - with open(log_file, "w") as file_object: - write_file_object = csv.writer(file_object) - write_file_object.writerows(debug_file) + log.info(f"Done {processed_files_count} files were processed, we skipped {processed_files_duplicates_count} duplicates") return langs -def choose_lang(filename, metadata, error): - """ From a po file and its medata, choose the most likely language code """ - """ By priority: the Language medata """ - """ Returns: a language code """ +def choose_language_code_from_po(filename: str, metadata: dict[str]) -> tuple[str, int]: + """ From a po file and its metadata, choose the most likely language code + By priority: the Language metadata + :param filename: the po file + :param metadata: + :return: a language code + """ log = logging.getLogger("buildLanguageList.choose_lang") lang = "noresult" diff --git a/build_stats.py b/build_stats.py index 50a5bea..ce3645e 100755 --- a/build_stats.py +++ b/build_stats.py @@ -119,8 +119,9 @@ def main(): files = discoveries.get("po", []) if files: + po_files = [f["file"] for f in files] with open(stats_file, "w") as f: - json.dump(get_po_translation_level(files, stats_file), f, indent=2) + json.dump(get_po_translation_level(po_files, stats_file), f, indent=2) log.info("Storing distribution stats") diff --git a/check_dnf_files.sh b/check_dnf_files.sh index 916afe1..3da7c48 100755 --- a/check_dnf_files.sh +++ b/check_dnf_files.sh @@ -26,9 +26,6 @@ function call_sall { /src/build_language_list.py --results "$results" podman run -it --rm -v ./:/src:z -v "$WORK_DIR"/results:/src/results:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G -e DNF_CONF=/src/"$dnf_conf" fedlocstats:latest \ - /src/build_language_list.py --results "$results" --analyzealllang - - podman run -it --rm -v ./:/src:z -v "$WORK_DIR"/results:/src/results:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G -e DNF_CONF=/src/"$dnf_conf" 
fedlocstats:latest \
         /src/build_tm.py --results "$results" --compress
 
     podman run -it --rm -v ./:/src:z -v "$WORK_DIR"/results:/src/results:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G -e DNF_CONF=/src/"$dnf_conf" fedlocstats:latest \
diff --git a/runall.sh b/runall.sh
index 5951544..cd04eae 100755
--- a/runall.sh
+++ b/runall.sh
@@ -20,9 +20,6 @@ podman run -it --rm -v ./:/src:z -v ./results:/src/results:z -v ./srpms:/srpms:z
 # ~ 18 m
 ./build_language_list.py --results "$results"
 
-# ~ 18 m
-./build_language_list.py --results "$results" --analyzealllang
-
 # Creates useful translator files for every languages
 # ~ 3 h 00
 LANG=C ./build_tm.py --results "$results" --compress

From 2b86ef59b93694c6f8fc5f1d2465c2871a1bdf6d Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Holcroft
Date: Aug 26 2023 13:36:24 +0000
Subject: [PATCH 3/17] make build_tm use the new file

---

diff --git a/build_stats.py b/build_stats.py
index ce3645e..6f75f55 100755
--- a/build_stats.py
+++ b/build_stats.py
@@ -118,8 +118,8 @@ def main():
             continue
 
         files = discoveries.get("po", [])
+        files = [f["file"] for f in files]
         if files:
-            po_files = [f["file"] for f in files]
             with open(stats_file, "w") as f:
                 json.dump(get_po_translation_level(po_files, stats_file), f, indent=2)
 
diff --git a/build_tm.py b/build_tm.py
index 238c2f0..01c45f6 100755
--- a/build_tm.py
+++ b/build_tm.py
@@ -67,6 +67,7 @@ def main():
 
         with open(os.path.join(lang_path, lang), "r") as read_file:
             files = json.load(read_file)["po"]
+            files = [f["file"] for f in files]
 
         compendium_file = os.path.join(tm_folder, lang_code + ".po")
         compendium_file = os.path.join(

From 99f5a8ae36eb5afcf20ec11a5fb27c3e51fec5e0 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Holcroft
Date: Aug 28 2023 21:10:21 +0000
Subject: [PATCH 4/17] merge all stats in one file

This moves a lot of logic into build_stats and simplifies a lot of
things.

---

diff --git a/build_language_list.py b/build_language_list.py
index cfd73f8..80f46a5 100755
--- a/build_language_list.py
+++ b/build_language_list.py
@@ -10,7 +10,6 @@ import re
 
 import logging
 import utils
-from shutil import rmtree
 from weblate_language_data import aliases, languages, language_codes, countries
 
 LOCAL_ALIASES = {"ca_valencia": "ca@valencia"}
@@ -43,29 +42,18 @@ def main():
     utils.set_logging(args.verbose, args.results)
     log = logging.getLogger("buildLanguageList")
 
-    results_folder = "./results/{v}/".format(v=args.results)
-    lang_folder = os.path.join(results_folder, "languages/")
+    results_folder = f"./results/{args.results}/"
     package_folder = os.path.join(results_folder, "packages/")
 
-    if args.refresh and os.path.isdir(lang_folder):
-        rmtree(lang_folder)
-
-    if os.path.exists(lang_folder) is False:
-        log.info("Detecting the list of languages")
-        os.makedirs(lang_folder)
-        po_langs = scan_packages(package_folder)
-
-        for lang in po_langs.keys():
-            with open(os.path.join(lang_folder, str(lang) + ".json"), "w") as f:
-                f.write(json.dumps(po_langs[lang], indent=2))
+    scan_packages(package_folder, args.refresh)
 
     log.info("done")
 
 
-def scan_packages(package_folder: str):
+def 
scan_packages(package_folder: str): total = len(packages) for package in sorted(packages): count += 1 - log.info("{c}/{t} {p}".format(c=count, t=total, p=package)) + log.info(f"{count}/{total} {package}") discovery_file = os.path.join(package_folder, package, "discover.json") - processed_files = list() + languages_file = os.path.join(package_folder, package, "stats.json") + + if os.path.isfile(languages_file) is True: + if refresh is False: + log.info("Language file already exist, no need to process") + continue + + processed_files = dict() with open(discovery_file, "r") as read_file: discover_patterns = json.load(read_file) @@ -95,23 +90,22 @@ def scan_packages(package_folder: str): filemask = os.path.join(package_folder, package, pattern["filemask"]) p = re.compile(filemask.replace("*", "(.*)").replace("+", r"\+")) - for po in glob.glob(filemask): + for po_file in glob.glob(filemask): - if po in processed_files: + if po_file in processed_files.get("po", {}).keys(): # there is no need to process the file it were processed already - log.debug(f"{po} were already processed") + log.debug(f"{po_file} were already processed") processed_files_duplicates_count += 1 continue - processed_files.append(po) processed_files_count += 1 - result = p.search(po) + result = p.search(po_file) lang_code = result.group(1) metadata = dict() error = "" try: - metadata = polib.pofile(po).metadata + metadata = polib.pofile(po_file).metadata except UnicodeDecodeError: # encoding error, to investigate before using it in TM error = "error-unicode" @@ -123,8 +117,7 @@ def scan_packages(package_folder: str): lang, decision = choose_language_code_from_po(lang_code, metadata) - debug_file = {"file": po, - "lang_in_path": lang_code, + debug_file = {"lang_in_path": lang_code, "metadata_lang": metadata.get("Language", ""), "metadata_plurals": metadata.get("Plural-Forms", ""), "metadata_language_team": metadata.get("Language-Team", ""), @@ -133,13 +126,12 @@ def scan_packages(package_folder: str): "lang_code_decision": str(decision) } - lang_result = langs.get(lang, dict()) - - po_results = lang_result.get("po", list()) - po_results.append(debug_file) - lang_result["po"] = po_results + processed_po_files = processed_files.get("po", {}) + processed_po_files[po_file] = debug_file + processed_files["po"] = processed_po_files - langs[lang] = lang_result + with open(languages_file, "w") as f: + json.dump(processed_files, f, indent=2) log.info(f"Done {processed_files_count} files were processed, we skipped {processed_files_duplicates_count} duplicates") diff --git a/build_stats.py b/build_stats.py index 6f75f55..4ef1cef 100755 --- a/build_stats.py +++ b/build_stats.py @@ -2,18 +2,102 @@ """For each package, compute stats""" import argparse -import glob import json +import logging import os -import shutil import subprocess +from collections import defaultdict -import polib -import logging -import utils +from numpyencoder import NumpyEncoder +import pandas as pd from translate.tools.pocount import calcstats +import utils + + +def compute_language_statistics(languages: dict, total_distribution_source_words: int) -> dict: + """ + Target: + "packages": [ + { + "name": "blueberry", + "progress": 100, + "translated": 166, + "team": "French " + } + ], + "progress": 98.1, + "progress_d": 63.4, + "totalsourcewords_d": 11491, + "totalsourcewordssum": 7428, + "translatedsourcewordssum": 7287 + """ + log = logging.getLogger("buildStats.compute_language_statistics") + + results_languages = dict() + po_fields = ["translatedsourcewords", 
"fuzzysourcewords", "untranslatedsourcewords", "translated", "fuzzy", + "untranslated", "translatedtargetwords"] + package_fields = ["translatedsourcewords", "fuzzysourcewords", "untranslatedsourcewords", "translated", "fuzzy", + "untranslated", "translatedtargetwords", "package"] + + for code, stats in languages.items(): + results_languages[code] = {} + results_languages[code]["po"] = stats + result = {} + + df = pd.DataFrame.from_records(stats) + + for kpi in po_fields: + result[kpi] = df[kpi].sum() + + result["totalsourcewordssum"] = result["translatedsourcewords"] + result["fuzzysourcewords"] + result[ + "untranslatedsourcewords"] + result["totalsourcewords_d"] = total_distribution_source_words + result["progress"] = result["translatedsourcewords"] / result["totalsourcewordssum"] + result["progress_d"] = result["translatedsourcewords"] / result["totalsourcewords_d"] + + packages_stats = df[package_fields].groupby("package").sum() + packages_stats["totalsourcewordssum"] = packages_stats["translatedsourcewords"] + packages_stats["fuzzysourcewords"] + packages_stats["untranslatedsourcewords"] + packages_stats["progress"] = packages_stats["translatedsourcewords"] / packages_stats["totalsourcewordssum"] + packages_stats["team"] = df[["metadata_language_team", "package"]].groupby("package").first() + result["packages"] = packages_stats.to_dict(orient="index") + + results_languages[code].update(result) + + return results_languages + + +def compute_package_statistics(df): + """ + [ + { + "lang_code": "de", + "team": "Low German , German ", + "filename": [ + "po/blueberry-nds.po", + "po/blueberry-de.po" + ], + "progress": 179, + "translated": 297 + }, + """ + log = logging.getLogger("buildStats.compute_language_statistics") + results = dict() + index = "lang_code_chosen" + po_fields = ["translatedsourcewords", "fuzzysourcewords", "untranslatedsourcewords", "translated", "fuzzy", + "untranslated", "translatedtargetwords", index] + + stats = df[po_fields].groupby(index).sum() + stats["totalsourcewordssum"] = stats["translatedsourcewords"] + stats["fuzzysourcewords"] + stats["untranslatedsourcewords"] + stats["progress"] = stats["translatedsourcewords"] / stats["totalsourcewordssum"] + stats["team"] = df[["metadata_language_team", index]].groupby(index).first() + df['filename'] = df.index + stats["filename"] = df[["filename", index]].groupby(index).sum() + results["languages"] = stats.to_dict(orient="index") + + return results + def main(): """Handle params""" @@ -40,18 +124,10 @@ def main(): utils.set_logging(args.verbose, args.results) log = logging.getLogger("buildStats") - results_folder = "./results/{v}/".format(v=args.results) - packages_folder = "./results/{v}/packages/".format(v=args.results) - packages_stats_folder = "./results/{v}/packages-stats/".format(v=args.results) - languages_folder = "./results/{v}/languages/".format(v=args.results) - languages_stats_folder = "./results/{v}/languages-stats/".format(v=args.results) - - for folder in [ - packages_stats_folder, - languages_stats_folder - ]: - if args.refresh and os.path.isdir(folder): - shutil.rmtree(folder) + results_folder = f"./results/{args.results}/" + packages_folder = f"./results/{args.results}/packages/" + languages_stats_folder = f"./results/{args.results}/languages/" + os.makedirs(languages_stats_folder, exist_ok=True) log.info("Computing packages stats") packages = [ @@ -60,188 +136,124 @@ def main(): if os.path.isdir(os.path.join(packages_folder, f)) ] count = 0 - distribution_stats = dict() - - if not 
os.path.exists(packages_stats_folder): - os.makedirs(packages_stats_folder) + all_stats = list() for package in sorted(packages): count += 1 - log.info(" {c}/{t} - {p}".format(c=count, t=len(packages), p=package)) + log.info(f" {count}/{len(packages)} - {package}") + stats_file = os.path.join(packages_folder, package, "stats.json") - src_folder = os.path.join(packages_folder, package) - stats_file = os.path.join(packages_stats_folder, package + ".json") + with open(stats_file, "r") as f: + stats = json.load(f) - if os.path.isfile(stats_file) is False: - with open(os.path.join(packages_folder, package, "discover.json"), "r") as f: - discoveries = json.load(f) + stats["package"] = package - results = dict() - for discover in discoveries: - files = glob.glob(os.path.join(src_folder, discover["filemask"])) + # some packages have no detected po files + if "po" not in stats.keys(): + continue - if discover["file_format"] == "po": - results[discover["filemask"]] = get_po_translation_level( - files, stats_file - ) + for file in stats["po"].keys(): + if "translated" in stats["po"][file].keys() \ + and args.refresh is False: + log.debug(f"{file} is already processed") + continue - if len(results) == 0: - log.warning("No translation file found?") - else: - with open(stats_file, "w") as f: - json.dump(results, f, indent=2) - else: - with open(stats_file, "r") as f: - results = json.load(f) + stats["po"][file].update(get_po_translation_level(file)) - distribution_stats = extract_release_stats(distribution_stats, results) + df = pd.DataFrame.from_dict(stats["po"], orient='index') + stats["stats"] = compute_package_statistics(df) + stats["totalsourcewords"] = df[["translatedsourcewords", "fuzzysourcewords", "untranslatedsourcewords"]].sum().sum() - log.info("Computing language stats") - languages = [f for f in os.listdir(languages_folder)] - count = 0 + with open(stats_file, "w") as f: + json.dump(stats, f, indent=2, cls=NumpyEncoder) - languages_stats_folder = languages_stats_folder - if not os.path.exists(languages_stats_folder): - os.makedirs(languages_stats_folder) + all_stats.append(stats) - for language in sorted(languages): - count += 1 - lang = language[:-5] + log.info("Aggregating language stats") + languages = defaultdict(list) + total_distribution_source_words = 0 + for package in all_stats: + max_languages = defaultdict(int) + for filename, stats in package["po"].items(): + lang_code = stats["lang_code_chosen"] + stats["filename"] = filename + stats["package"] = package["package"] - log.info(" {c}/{t} - {l}".format(c=count, t=len(languages), l=lang)) - with open(os.path.join(languages_folder, language), "r") as f: - discoveries = json.load(f) + languages[lang_code].append(stats) - stats_file = os.path.join(languages_stats_folder, lang + ".json") + max_languages[lang_code] += stats["translatedsourcewords"] + max_languages[lang_code] += stats["untranslatedsourcewords"] - if os.path.isfile(stats_file): - continue + try: + del max_languages["error"] + except KeyError: + pass + total_distribution_source_words += max(max_languages.values()) - files = discoveries.get("po", []) - files = [f["file"] for f in files] - if files: - with open(stats_file, "w") as f: - json.dump(get_po_translation_level(po_files, stats_file), f, indent=2) + log.info("Storing language stats") + for lang_code, language in languages.items(): + stats_file = os.path.join(languages_stats_folder, f"{lang_code}.json") + with open(stats_file, "w") as f: + json.dump(language, f, indent=2) - log.info("Storing distribution stats") + 
log.info("Computing language stats") + languages = compute_language_statistics(languages, total_distribution_source_words) + + log.info("Storing language stats") + for lang_code, language in languages.items(): + stats_file = os.path.join(languages_stats_folder, f"{lang_code}.json") + with open(stats_file, "w") as f: + json.dump(language, f, indent=2, cls=NumpyEncoder) + + + log.info("Processing distribution stats") distribution_file = os.path.join(results_folder, "release.json") + distribution_stats = dict() with open(os.path.join(results_folder, "data.json"), "r") as f: distribution_stats["total_release_packages"] = len(json.load(f)) - total_packages_files = list() - for base, dirs, files in os.walk(packages_folder): - for file in files: - if file != "discover.json": - total_packages_files.append(os.path.join(base, file)) - - distribution_stats["total_packages_files"] = len(total_packages_files) + distribution_stats["totalsourcewords"] = total_distribution_source_words + distribution_stats["total_packages_with_stats"] = len(packages) + distribution_stats["total_packages_files"] = sum([len(package["po"]) for package in all_stats]) distribution_stats["total_packages"] = len(packages) - distribution_stats["nb_files"] = len(list(set(distribution_stats["files"]))) - - packages_with_stats = [f for f in os.listdir(packages_stats_folder) if os.path.isfile(os.path.join(packages_stats_folder, f))] - distribution_stats["total_packages_with_stats"] = len(packages_with_stats) + distribution_stats["nb_files"] = len([file for file in all_stats if file.get("could_not_process", 0) == 0]) distribution_stats["total_languages"] = len(languages) + + log.info(distribution_stats) + + log.info("Storing distribution stats") with open(distribution_file, "w") as f: json.dump(distribution_stats, f, indent=2) - log.info("Searching for bugs ;)") - used_files = list(set(distribution_stats["files"])) - if len(total_packages_files) != len(used_files): - log.debug("source:{s} used: {u}".format(s=len(total_packages_files), u=len(used_files))) - missing_files = [source for source in total_packages_files if not source in used_files] - missing_files_po = [file for file in missing_files if file.endswith(".po")] - if len(missing_files_po) > 0: - log.debug("Some po files are missing") - distribution_file = os.path.join(results_folder, "build_stats.missing_po_files.json") - with open(distribution_file, "w") as f: - json.dump(missing_files_po, f, indent=2) - log.info("done") -def get_po_translation_level(files, stats_file): +def get_po_translation_level(file: str) -> dict: """ Compute results """ log = logging.getLogger("buildStats.get_po_translation_level") - stats = dict() - - for file in files: - # remove non standard comments - # taken from: https://github.com/translate/translate/blob/master/tools/pocommentclean - command = ["sed", "-i", "/^#$/d;/^#[^\:\~,\.]/d", file] - subprocess.run(command, check=True, capture_output=True) + command = ["sed", "-i", "/^#$/d;/^#[^\:\~,\.]/d", file] + subprocess.run(command, check=True, capture_output=True) - try: - stat = calcstats(file) - except Exception as e: - log.error(" {f} triggered an {t} exception: {e}".format(f=file, t=type(e).__name__, e=e)) - continue - - keys = [ - "translatedsourcewords", - "fuzzysourcewords", - "untranslatedsourcewords", - "translated", - "fuzzy", - "untranslated", - "translatedtargetwords", - ] - results = dict() - for key in keys: - results[key] = stat.get(key, 0) - - results["team"] = get_language_team(file) - - stats[file] = results - - return stats - - 
-def get_language_team(file): - log = logging.getLogger("buildStats.get_language_team") - metadata = dict() try: - metadata = polib.pofile(file).metadata - except OSError: - # maybe a polib bug? to investigate before using it in TM - metadata["Language"] = "error-os" - except UnicodeDecodeError: - # encoding error, to investigate before using it in TM - metadata["Language"] = "error-unicode" - except TypeError: - # TypeError: '>' not supported between instances of 'str' and 'int' - metadata["Language"] = "error-valuerror" - - team = "Unknown..." - try: - team = metadata["Language-Team"] - except KeyError: - log.debug("The file {f} have no Language team? Here are the metadata: {m}".format(f=file, m=metadata)) - return team - - -def extract_release_stats(results, files_stats): - log = logging.getLogger("buildStats.extract_release_stats") - number_of_packages = results.get("nb_packages", 0) - number_of_packages += 1 - files = results.get("files", list()) - total_source_words = results.get("totalsourcewords", 0) - - for template in files_stats: - maxresult = 0 - for file in files_stats[template]: - translated = files_stats[template][file]["translatedsourcewords"] - untranslated = files_stats[template][file]["untranslatedsourcewords"] - maxresult = max(maxresult, translated + untranslated) - files.append(file) - - total_source_words += maxresult - - results = { - "nb_packages": number_of_packages, - "files": files, - "totalsourcewords": total_source_words, - } + stat = calcstats(file) + except Exception as e: + log.error(f" {file} triggered an {type(e).__name__} exception: {e}") + stat = {"could_not_process": 1} + + keys = [ + "translatedsourcewords", + "fuzzysourcewords", + "untranslatedsourcewords", + "translated", + "fuzzy", + "untranslated", + "translatedtargetwords", + "could_not_process" + ] + results = dict() + for key in keys: + results[key] = stat.get(key, 0) return results diff --git a/build_tm.py b/build_tm.py index 01c45f6..7ce3c34 100755 --- a/build_tm.py +++ b/build_tm.py @@ -39,13 +39,11 @@ def main(): utils.set_logging(args.verbose, args.results) log = logging.getLogger("buildTm") - results_folder = "./results/{v}/".format(v=args.results) + results_folder = f"./results/{args.results}/" lang_path = os.path.join(results_folder, "languages/") tm_folder = os.path.join(results_folder, "languages-tm/") - debug_folder = os.path.join(results_folder, "debug_folder/") os.makedirs(tm_folder, exist_ok=True) - os.makedirs(debug_folder, exist_ok=True) # clean destination folders if args.refresh and os.path.isdir(tm_folder): @@ -56,57 +54,49 @@ def main(): log.info("Building the translation memory for every languages") - langs = [ + languages = [ f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f)) ] - for lang in sorted(langs): - lang_code = lang[: -len(".json")] + for language in sorted(languages): + language_code = language[: -len(".json")] - log.info("Processing {l}".format(l=lang_code)) + log.info(f"Processing {language_code}") - with open(os.path.join(lang_path, lang), "r") as read_file: + with open(os.path.join(lang_path, language), "r") as read_file: files = json.load(read_file)["po"] - files = [f["file"] for f in files] + files = [f["filename"] for f in files] - compendium_file = os.path.join(tm_folder, lang_code + ".po") + compendium_file = os.path.join(tm_folder, f"{language_code}.po") compendium_file = os.path.join( os.path.dirname(os.path.abspath(__file__)), compendium_file ) - compendium_archive = compendium_file + ".gz" + compendium_archive = 
f"{compendium_file}.gz" if os.path.isfile(compendium_file) is False and os.path.isfile(compendium_archive) is False: log.info("Compendium generation") - process_compendium(files, compendium_file, debug_folder) + process_compendium(files, compendium_file, tm_folder, language_code) # remove non standard comments # taken from: https://github.com/translate/translate/blob/master/tools/pocommentclean command = ["sed", "-i", "/^#$/d;/^#[^\:\~,\.]/d", compendium_file] subprocess.run(command, check=True, capture_output=True) - tmx_file = os.path.join(tm_folder, lang_code + ".tmx") - tmx_archive = tmx_file + ".gz" + tmx_file = os.path.join(tm_folder, f"{language_code}.tmx") + tmx_archive = f"{tmx_file}.gz" if os.path.isfile(tmx_file) is False and os.path.isfile(tmx_archive) is False: log.info("TMX generation") try: - process_tmx(lang_code, compendium_file, tmx_file) + process_tmx(language_code, compendium_file, tmx_file) except Exception as e: - log.error( - " TMX generation triggered an {t} exception: {e}".format( - t=type(e).__name__, e=e - ) - ) - - terminology_file = os.path.join(tm_folder, lang_code + ".terminology.po") - terminology_archive = terminology_file + ".gz" + log.error(f" TMX generation triggered an {type(e)} exception: {e}") + + terminology_file = os.path.join(tm_folder, f"{language_code}.terminology.po") + terminology_archive = f"{terminology_file}.gz" if os.path.isfile(terminology_file) is False and os.path.isfile(terminology_archive) is False: log.info("Terminology generation") try: process_terminology(compendium_file, terminology_file) except Exception as e: - log.error( - " Terminology generation triggered an {t} exception: {e}".format( - t=type(e).__name__, e=e - ) - ) + log.error(f" Terminology generation triggered an {type(e)} exception: {e}") if args.compress: if os.path.isfile(compendium_file): @@ -121,28 +111,26 @@ def main(): log.info("All languages are processed") log.info("Detecting missing files") - for lang in sorted(langs): - check_lang(lang[: -len(".json")], tm_folder, args.compress) + for language in sorted(languages): + check_lang(language[: -len(".json")], tm_folder, args.compress) log.info("done") -def process_compendium(langfiles, dest, debug_folder): +def process_compendium(po_files: list, destination_file, debug_folder: str, language_code: str) -> None: """ Generate a compendium (a concatenation of all po files) """ log = logging.getLogger("buildTm.process_compendium") - pofiles = [ - os.path.join(os.path.dirname(os.path.abspath(__file__)), f) for f in langfiles - ] + po_files = [os.path.join(os.path.dirname(os.path.abspath(__file__)), f) for f in po_files] count = 0 with tempfile.TemporaryDirectory(prefix="l10n-tm") as tmp: - for i in pofiles: + for file in po_files: try: command = [ "msguniq", - i, + file, "--output-file", count.__str__(), "--no-location", @@ -152,7 +140,7 @@ def process_compendium(langfiles, dest, debug_folder): try: command = [ "msguniq", - i, + file, "--output-file", count.__str__(), "--to-code", @@ -161,22 +149,22 @@ def process_compendium(langfiles, dest, debug_folder): ] subprocess.run(command, check=True, cwd=tmp, capture_output=True) except subprocess.CalledProcessError as e: - debug_filename = "tm-msguniq-{lang}-{name}".format(lang=dest.split("/")[-1], name=count.__str__()) - log.error(" msguniq error with {i} a copy of this file is into {d} as {n}".format(i=i, e=e.output, - d=debug_folder, - n=debug_filename)) - shutil.copyfile(i, os.path.join(debug_folder, debug_filename)) + short_filename = os.path.relpath(file, 
os.path.dirname(os.path.abspath(__file__))) + short_filename = "_".join(short_filename.split(sep=os.path.sep)) + debug_filename = os.path.join(debug_folder, f"{language_code}-tm-msguniq-{short_filename}") + log.error(f" msguniq error, a copy of this file is into {debug_filename}") + shutil.copyfile(file, debug_filename) count += 1 all_files = [f for f in os.listdir(tmp) if os.path.isfile(os.path.join(tmp, f))] if len(all_files) == 1: - shutil.copyfile(os.path.join(tmp, all_files[0]), dest) + shutil.copyfile(os.path.join(tmp, all_files[0]), destination_file) else: - msgcat_loop(dest, tmp, debug_folder, all_files) + msgcat_loop(destination_file, tmp, debug_folder, all_files, language_code) -def msgcat(files, destination, path): +def msgcat(files: list[str], destination: str, path: str): """ Call the msgcat command on a list of po files Return stderr, if any """ command = [ @@ -196,15 +184,8 @@ def msgcat(files, destination, path): stderr = e.stderr.decode('utf8') return stderr -def store_debug_file(path, name, file, debug_folder): - """ Move the temporary move file in debug folder """ - log = logging.getLogger("buildTm.store_debug_file") - target = os.path.join(debug_folder, "{n}-{f}".format(n=name, f=file)) - log.error("The file {f} were moved into {t}".format(f=file, t=target)) - shutil.move(os.path.join(path, file), target) - -def msgcat_loop(destination, path, debug_folder, files): +def msgcat_loop(destination: str, path: str, debug_folder: str, files: list[str], language: str) -> None: """ call msgcat, and exclude any problematic files """ log = logging.getLogger("buildTm.msgcat_loop") log.debug("Starting msgcat loop") @@ -214,68 +195,72 @@ def msgcat_loop(destination, path, debug_folder, files): ids += re.findall(r"\d+:\d+: (\d+): input is not valid in", ret) if ids: file = ids[0] - log.debug("This file raised a msgcat bug: {f}".format(f=file)) - store_debug_file(path, "tm-msgcat-" + destination.split("/")[-1], file, debug_folder) + short_filename = os.path.relpath(file, os.path.dirname(os.path.abspath(__file__))) + short_filename = "_".join(short_filename.split(sep=os.path.sep)) + destination_file = f"{language}-tm-msgcat-{short_filename}" + target = os.path.join(debug_folder, f"{destination_file}") + log.error(f"msgcat error, a copy of this file is into {target}") + shutil.move(os.path.join(path, file), target) files.remove(file) else: # nothing found in stderr if os.path.isfile(destination) is False: # and destination not here : unhandled exception # TODO: maybe actually throw an exception here? 
- log.error("Error with msgcat: {e}".format(e=ret)) - return False + log.error(f"Error with msgcat: {ret}") + return # no stderr and final file is here : all good break log.debug("next try") log.debug("msgcat loop over") -def process_tmx(lang, source, dest): +def process_tmx(lang: str, source: str, destination: str) -> None: """ Generate a translation memory from a po file """ - command = ["po2tmx", "--language=" + lang, "--progress=none", source, "--output=" + dest] + command = ["po2tmx", f"--language={lang}", "--progress=none", source, f"--output={destination}"] subprocess.run(command, check=True, capture_output=True) -def process_terminology(source, dest): +def process_terminology(source: str, destination: str) -> None: """ Generate a termonology from a po file """ command = ["poterminology", "--ignore-case", "--fold-titlecase", "--inputs-needed", "1", - "--progress=none", source, "--output=" + dest] + "--progress=none", source, f"--output={destination}"] subprocess.run(command, check=True, capture_output=True) -def check_lang(lang, tm_folder, compress): +def check_lang(lang: str, tm_folder: str, to_compress: bool) -> None: """ Check if expected files were generated """ log = logging.getLogger("buildTm.check_lang") - compendium_file = os.path.join(tm_folder, lang + ".po") - tmx_file = os.path.join(tm_folder, lang + ".tmx") - terminology_file = os.path.join(tm_folder, lang + ".terminology.po") + compendium_file = os.path.join(tm_folder, f"{lang}.po") + tmx_file = os.path.join(tm_folder, f"{lang}.tmx") + terminology_file = os.path.join(tm_folder, f"{lang}.terminology.po") - if compress is True: + if to_compress is True: compendium_file += ".gz" tmx_file += ".gz" terminology_file += ".gz" if os.path.isfile(compendium_file) is False: - log.warning("{l}-compendium is missing".format(l=lang)) + log.warning(f"{lang}-compendium is missing") if os.path.isfile(tmx_file) is False: - log.warning("{l}-tmx is missing".format(l=lang)) + log.warning(f"{lang}-tmx is missing") if os.path.isfile(terminology_file) is False: - log.warning("{l}-terminology is missing".format(l=lang)) + log.warning(f"{lang}-terminology is missing") -def compress(source, archive): +def compress(source: str, destination_file: str) -> None: """ Compress files uzing gzip """ log = logging.getLogger("buildTm.compress") - log.debug("Compressing {s}".format(s=source)) + log.debug(f"Compressing {source}") with open(source, "rb") as file_in: - with gzip.open(archive, "wb") as file_out: + with gzip.open(destination_file, "wb") as file_out: file_out.writelines(file_in) os.remove(source) diff --git a/build_website.py b/build_website.py index 450a706..b8a877a 100755 --- a/build_website.py +++ b/build_website.py @@ -15,6 +15,27 @@ import logging import utils +def get_territories_for_language(language_name: str, cldr_languages: dict) -> list: + log = logging.getLogger("buildWebsite.get_territory") + territories = [] + + code = language_name.split("_", 1)[0] # ro_MD or zh_Hant_HK + code = code.split("@", 1)[0] # ca@valencia + + territories.append(cldr_languages.get(code, {}).get("_territories", [])) + territories.append(cldr_languages.get(code + "-alt-secondary", {}).get("_territories", [])) + + # if language contains a territory code, then only keep this one + if len(language_name.split("_")) > 1: + if language_name.split("_")[-1] in territories: + territories = [language_name.split("_")[-1]] + + if len(territories) == 0: + log.warning(f"The language {code} does not exist in territories data from CLDR") + + return territories + + def 
main(): """Handle params""" @@ -45,26 +66,20 @@ def main(): utils.set_logging(args.verbose, args.results) log = logging.getLogger("buildWebsite") - results_folder = "./results/{v}/".format(v=args.results) - langs_log = os.path.join(results_folder, "build_language_list.log") - langs_stats = os.path.join(results_folder, "languages-stats") - packages_stats = os.path.join(results_folder, "packages-stats") - - data_langs_folder = os.path.join(results_folder, "languages-website") - data_pkgs_folder = os.path.join(results_folder, "packages-website") + results_folder = f"./results/{args.results}/" + langs_stats = os.path.join(results_folder, "languages") + packages_stats = os.path.join(results_folder, "packages") tm_folder = os.path.join(results_folder, "languages-tm") - static_folder = "./website/content/{v}/".format(v=args.results) + static_folder = f"./website/content/{args.results}/" static_territories_folder = "./website/content/territories" static_langs_folder = os.path.join(static_folder, "language") static_pkgs_folder = os.path.join(static_folder, "package") - static_tm_folder = "./website/static/{v}/".format(v=args.results) + static_tm_folder = f"./website/static/{args.results}/" # clean destination folders for folder in [ - data_langs_folder, - data_pkgs_folder, static_langs_folder, static_pkgs_folder, static_tm_folder, @@ -77,58 +92,6 @@ def main(): log.info("Get distribution stats") distribution_stats = json.load(open(os.path.join(results_folder, "release.json"))) - log.info("Prepare json files for packages") - packages = [ - d - for d in os.listdir(packages_stats) - if os.path.isfile(os.path.join(packages_stats, d)) - ] - log_files = pd.read_csv(langs_log, header=None, skipinitialspace=True) - log_files = log_files.iloc[:, [0, 4]] - log_files.columns = ["filename", "lang_code"] - - packages_langs_results = dict() - count = 0 - total = len(packages) - for package in sorted(packages): - count += 1 - log.debug("Preparing package {c}/{t} - {p}".format(c=count, t=total, p=package)) - package_name = package[: -len(".json")] - package_statistics_file = os.path.join(data_pkgs_folder, package_name + ".json") - file_stats = os.path.join(packages_stats, package_name + ".json") - - results = consolidate_package_stats(file_stats, log_files, os.path.join(results_folder, "package", package_name)) - store_json_file(results, package_statistics_file) - for lang in results.get("stats", []): - val = packages_langs_results.get(lang["lang_code"], []) - val.append( - { - "name": package_name, - "progress": lang["progress"], - "translated": lang["translated"], - "team": lang["team"], - } - ) - packages_langs_results[lang["lang_code"]] = val - - log.info("Prepare json files for languages") - languages = [ - f - for f in os.listdir(langs_stats) - if os.path.isfile(os.path.join(langs_stats, f)) - ] - for lang in sorted(languages): - if lang.endswith(".json"): - code = lang[: -len(".json")] - package_statistics_file = os.path.join(data_langs_folder, code + ".json") - - if os.path.isfile(package_statistics_file): - continue - - results = consolidate_language_stats(os.path.join(langs_stats, lang), distribution_stats) - results["packages"] = packages_langs_results.get(code, dict()) - store_json_file(results, package_statistics_file) - log.info("Load CLDR data") with open("CLDR-raw/languageData.json", "r") as read_file: cldr_languages = json.load(read_file) @@ -145,53 +108,36 @@ def main(): log.info("Generate static content for languages") languages = [ f - for f in os.listdir(data_langs_folder) - if 
os.path.isfile(os.path.join(data_langs_folder, f)) + for f in os.listdir(langs_stats) + if os.path.isfile(os.path.join(langs_stats, f)) ] - for lang in sorted(languages): - code = lang[: -len(".json")] - package_statistics_file = os.path.join(static_langs_folder, code + ".adoc") - - if os.path.isfile(package_statistics_file): - continue + for language_file in sorted(languages): + language = language_file[: -len(".json")] + stats_file = os.path.join(langs_stats, language_file) + destination_file = os.path.join(static_langs_folder, f"{language}.adoc") - with open(os.path.join(data_langs_folder, lang), "r") as read_file: + with open(stats_file, "r") as read_file: content = json.load(read_file) - pd.DataFrame(content["packages"]).to_csv(os.path.join(static_tm_folder, f"{code}.csv"), index=False) - - cldr_code = code.split("_", 1)[0] # ro_MD or zh_Hant_HK - cldr_code = cldr_code.split("@", 1)[0] # ca@valencia - - territories = cldr_languages.get(cldr_code, {}).get("_territories", []) \ - + cldr_languages.get(cldr_code + "-alt-secondary", {}).get("_territories", []) - - # if language contains a territory code, then only keep this one - if len(code.split("_")) > 1: - if code.split("_")[-1] in territories: - territories = [code.split("_")[-1]] + pd.DataFrame(content["packages"]).to_csv(os.path.join(static_tm_folder, f"{language}.csv"), index=False) - if len(territories) == 0: - log.warning("The language {l} does not exist in territories data from CLDR".format(l=code)) - generate_static_pages_langs(args.results, code, content, package_statistics_file, territories, tm_folder, static_tm_folder) + territories = get_territories_for_language(language, cldr_languages) + generate_static_pages_langs(args.results, language, content, destination_file, territories, tm_folder, static_tm_folder) log.info("Generate static content for packages") packages = [ f - for f in os.listdir(data_pkgs_folder) - if os.path.isfile(os.path.join(data_pkgs_folder, f)) + for f in os.listdir(packages_stats) + if os.path.isdir(os.path.join(packages_stats, f)) ] for package in sorted(packages): - code = package[: -len(".json")] - package_statistics_file = os.path.join(static_pkgs_folder, code + ".adoc") + stats_file = os.path.join(packages_stats, package, "stats.json") + destination_file = os.path.join(static_pkgs_folder, f"{package}.adoc") - if os.path.isfile(package_statistics_file): - continue - - with open(os.path.join(data_pkgs_folder, package), "r") as read_file: + with open(stats_file, "r") as read_file: content = json.load(read_file) - generate_static_pages_packages(args.results, code, content, package_statistics_file) + generate_static_pages_packages(args.results, package, content, destination_file) log.info("Generating indexes") package_statistics_file = os.path.join(static_folder, "_index.adoc") @@ -222,54 +168,7 @@ def main(): log.info("done") -def consolidate_language_stats(stats_file, distribution_stats): - """ From a CSV file, return key indicators """ - log = logging.getLogger("buildWebsite.consolidate_language_stats") - results = dict() - total_words_distrib = distribution_stats.get("totalsourcewords", 0) - - fieldnames = { - "filename": "str", - "translatedsourcewords": "int", - "fuzzysourcewords": "int", - "untranslatedsourcewords": "int", - "translated": "int", - "fuzzy": "int", - "untranslated": "int", - "translatedtargetwords": "int", - "team": "str", - "totalsourcewords": "int", - } - - stats_df = pd.read_json(stats_file, orient="index") - stats_df.fillna(0, inplace=True) - stats_df.reset_index(level=0, 
inplace=True) - stats_df["totalsourcewords"] = ( - stats_df["untranslatedsourcewords"] + stats_df["translatedsourcewords"] - ) - stats_df.columns = fieldnames.keys() - - stats_df["package"] = stats_df["filename"].str.split("/", expand=True)[4] - - results["packages"] = stats_df["package"].unique().tolist() - results["progress"] = round( - stats_df["translatedsourcewords"].sum() - / stats_df["totalsourcewords"].sum() - * 100, - 1, - ) - results["progress_d"] = round( - stats_df["translatedsourcewords"].sum() / total_words_distrib * 100, 1 - ) - results["totalsourcewords_d"] = total_words_distrib - - for kpi in ["totalsourcewords", "translatedsourcewords"]: - results[kpi + "sum"] = int(stats_df[kpi].sum()) - - return results - - -def consolidate_package_stats(stats_file, log_files, package_folder): +def consolidate_package_stats(stats_file, package_folder): """ From a CSV file, return key indicators """ log = logging.getLogger("buildWebsite.consolidate_package_stats") results = dict() @@ -298,7 +197,7 @@ def consolidate_package_stats(stats_file, log_files, package_folder): # sometimes, no file were found, which means no stats can be used if len(tmp_df) == 0: - log.debug(" The template {t} for {f} is empty".format(t=template, f=stats_file)) + log.debug(f" The template {template} for {stats_file} is empty") continue tmp_df["totalsourcewords"] = ( @@ -313,22 +212,13 @@ def consolidate_package_stats(stats_file, log_files, package_folder): if len(dfs) > 1: stats_df = pd.concat(dfs) elif len(dfs) == 0: - log.debug("There is no stats for {f}".format(f=stats_file)) + log.debug(f"There is no stats for {stats_file}") return results else: stats_df = dfs[0] - stats_df_w_lang = pd.merge(stats_df, log_files, how="inner", on="filename") - stats_df_no_lang = pd.merge(stats_df, log_files, how="outer", indicator=True).loc[ - lambda x: x["_merge"] == "left_only" - ] - - stats_df_w_lang["filename"] = stats_df_w_lang["filename"].apply( - lambda s: s[len(package_folder) + 2:] - ) - temp_translated = ( - stats_df_w_lang.groupby(["lang_code"]) + stats_df.groupby(["lang_code"]) .agg( { "translatedsourcewords": ["sum"], @@ -338,10 +228,10 @@ def consolidate_package_stats(stats_file, log_files, package_folder): .droplevel(1, axis=1) ) - temp_teams = stats_df_w_lang.groupby("lang_code")["team"].apply( + temp_teams = stats_df.groupby("lang_code")["team"].apply( lambda x: ", ".join(x.drop_duplicates()) ) - temp_files = stats_df_w_lang.groupby("lang_code")["filename"].apply( + temp_files = stats_df.groupby("lang_code")["filename"].apply( lambda x: ",".join(x) ) temp_bis = pd.merge(temp_teams, temp_files, how="inner", on="lang_code") @@ -354,11 +244,7 @@ def consolidate_package_stats(stats_file, log_files, package_folder): line["translated"] = line["translatedsourcewords"] if total_source_words == 0: - log.info( - " File {f} for file has translatedsourcewords = 0 in line {l}".format( - f=stats_file, l=line - ) - ) + log.info(f" File {stats_file} for file has translatedsourcewords = 0 in line {line}") line["progress"] = 0 continue try: @@ -366,13 +252,7 @@ def consolidate_package_stats(stats_file, log_files, package_folder): (int(line["translatedsourcewords"]) / total_source_words) * 100 ) except OverflowError: - log.info( - " File {f} has Translated={t} and Source={tot}".format( - f=stats_file, - t=line["translatedsourcewords"], - tot=total_source_words, - ) - ) + log.info(f" {stats_file} have Translated={line['translatedsourcewords']} and Source={total_source_words}") line["filename"] = line["filename"].split(",") @@ 
-382,7 +262,6 @@ def consolidate_package_stats(stats_file, package_folder):
         results["stats"].append(line)
 
     results["totalsourcewords"] = total_source_words
-    results["no_languages"] = stats_df_no_lang["filename"].tolist()
 
     return results
 
@@ -413,17 +292,17 @@ def generate_static_pages_langs(results: str, code: str, content: dict, destinat
     apply_jinja_template(data, destination_file, "language.adoc")
 
 
-def generate_static_pages_packages(results, code, content, destination_file):
+def generate_static_pages_packages(release: str, package: str, statistics, destination_file):
     log = logging.getLogger("buildWebsite.generate_static_pages_packages")
-    data = content
-    data["results"] = results
-    data["package"] = code
+    data = statistics
+    data["results"] = release
+    data["package"] = package
     data["now"] = datetime.datetime.utcnow()
 
     apply_jinja_template(data, destination_file, "package.adoc")
 
 
-def generate_release_index(release, destination_file, data):
+def generate_release_index(release: str, destination_file: str, data: dict):
     log = logging.getLogger("buildWebsite.generate_release_index")
     data["release"] = release
     data["now"] = datetime.datetime.utcnow()
@@ -431,7 +310,7 @@
     apply_jinja_template(data, destination_file, "_index.release.adoc")
 
 
-def generate_language_index(release, destination_file):
+def generate_language_index(release: str, destination_file: str):
     log = logging.getLogger("buildWebsite.generate_language_index")
     data = dict()
     data["release"] = release
@@ -440,7 +319,7 @@
     apply_jinja_template(data, destination_file, "_index.language.adoc")
 
 
-def generate_package_index(distribution, destination_file):
+def generate_package_index(distribution: str, destination_file: str):
     log = logging.getLogger("buildWebsite.generate_package_index")
     data = dict()
     data["distribution"] = distribution
diff --git a/templates/_index.release.adoc b/templates/_index.release.adoc
index 74919f8..440074c 100644
--- a/templates/_index.release.adoc
+++ b/templates/_index.release.adoc
@@ -10,7 +10,7 @@ Fedora {{ release }}::
 * it represents {{ total_packages_files }} translations files (po).
 
 What we were able to process::
-* {{ total_packages_with_stats }} packages,
+* {{ total_packages }} packages,
 * {{ nb_files }} translation files containing {{ totalsourcewords }} words to translate,
 * {{ total_languages }} languages.
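
For reference, a minimal sketch (again, not part of the patches) of how a template like _index.release.adoc above gets filled. The inline template string is a cut-down stand-in for the real file, the actual rendering goes through apply_jinja_template() in build_website.py, and the figures are sample values:

    import jinja2

    template = jinja2.Template(
        "What we were able to process::\n"
        "* {{ total_packages }} packages,\n"
        "* {{ nb_files }} translation files containing "
        "{{ totalsourcewords }} words to translate,\n"
        "* {{ total_languages }} languages.\n"
    )

    # the keys mirror the release.json fields written by build_stats.py
    print(template.render(total_packages=12000, nb_files=11800,
                          totalsourcewords=3000000, total_languages=250))
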
diff --git a/templates/language.adoc b/templates/language.adoc
index 849481d..b488f61 100644
--- a/templates/language.adoc
+++ b/templates/language.adoc
@@ -23,7 +23,7 @@ Possible scripts are: {% for script in scripts -%}{{ script }} {%- endfor %}
 
 * Total translatable string in Fedora {{ results }}: {{ totalsourcewords_d }}
 * Source words to translate in started packages: {{ totalsourcewordssum }}
-* Translated words: {{ translatedsourcewordssum }}
+* Translated words: {{ translatedsourcewords }}
 
 Download:
 
@@ -34,17 +34,19 @@ Download:
 
 Packages:
 
-[cols="1a,1,1,3", options="header"]
+[cols="1a,1,1,1,3", options="header"]
 |===
 | Name
 | Translated words
+| Total source words
 | Progress
-| Language team
+| Language teams
 
 {% for package in packages -%}
-| link:{{ '{{' }}< ref "/{{ results }}/package/{{ package.name }}.adoc" >{{ '}}' }}[{{ package.name }}]
->| {{ package.translated }}
->| {{ package.progress }}
-| {{ package.team }}
+| link:{{ '{{' }}< ref "/{{ results }}/package/{{ package }}.adoc" >{{ '}}' }}[{{ package }}]
+>| {{ packages[package].translatedsourcewords }}
+>| {{ packages[package].totalsourcewordssum }}
+>| {{ packages[package].progress }}
+| {{ packages[package].team }}
 {% endfor %}
 |===
\ No newline at end of file
diff --git a/templates/package.adoc b/templates/package.adoc
index 19d546f..faabe81 100644
--- a/templates/package.adoc
+++ b/templates/package.adoc
@@ -7,31 +7,23 @@ no_languages: {{ no_languages|length }}
 
 The package {{ package }}:
 
 * represents {{ totalsourcewords }} source words to be translated,
-* is translated into {{ stats|length }} languages in Fedora {{ results }},
+* is translated into {{ stats.languages|length }} languages in Fedora {{ results }},
 * contains {{ no_languages|length }} files for which no languages could be deducted.
 
-[cols="1a,1,1,3a", options="header"] +[cols="1a,1,1,1,3a", options="header"] |=== | Language | Translated words +| Total source words | Progress | Files -{% for stat in stats|sort(attribute="lang_code") -%} -| link:{{ '{{' }}< ref "/{{ results }}/language/{{ stat.lang_code }}.adoc" >{{ '}}' }}[{{ stat.lang_code }}] ->| {{ stat.translated }} ->| {{ stat.progress }} -| {% for file in stat.filename -%}{{ file }}{{ " " }}{%- endfor %} +{% for stat in stats.languages|sort -%} +| link:{{ '{{' }}< ref "/{{ results }}/language/{{ stat }}.adoc" >{{ '}}' }}[{{ stat }}] +>| {{ stats.languages[stat].translatedsourcewords }} +>| {{ stats.languages[stat].totalsourcewordssum }} +>| {{ stats.languages[stat].progress }} +| {{ stats.languages[stat].filename }} {% endfor %} |=== - -## Errors -{% if no_languages %} -List of files for which language detection were impossible: -{% for missing in no_languages -%} -* {{ missing }} -{% endfor %} -{% else %} -None -{% endif %} diff --git a/todo.md b/todo.md index cd426a3..21aec6c 100644 --- a/todo.md +++ b/todo.md @@ -21,7 +21,7 @@ Detecting missing files # build_stats.py -roxterm triggers an error +when %package%/stats.json is empty, make sure it is counted as an existing package for which we were not able to extract anything (release stats) # global From 15fa25ab9e81ed959b10c4d8ace2ddea9cd1ad39 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Aug 28 2023 21:10:39 +0000 Subject: [PATCH 5/17] use f-strings --- diff --git a/build.py b/build.py index f2ff6d4..045a544 100755 --- a/build.py +++ b/build.py @@ -73,18 +73,18 @@ def main(): srpm_regex = None if args.filter: - srpm_regex = re.compile("^{}$".format(args.filter)) + srpm_regex = re.compile(f"^{args.filter}$") - packages_folder = "./results/{v}/packages/".format(v=args.results) - srpms_path = os.path.abspath("./srpms/".format(v=args.results)) + packages_folder = f"./results/{args.results}/packages/" + srpms_path = os.path.abspath("./srpms/") if not os.path.exists(packages_folder): os.makedirs(packages_folder) if not os.path.exists(srpms_path): os.makedirs(srpms_path) - data_file = os.path.join("./results/{v}/".format(v=args.results), "data.json") - srpm_list_file = os.path.join(srpms_path, "{v}.txt".format(v=args.results)) + data_file = os.path.join(f"./results/{args.results}/", "data.json") + srpm_list_file = os.path.join(srpms_path, f"{args.results}.txt") url_list = None if os.path.isfile(srpm_list_file): @@ -102,12 +102,12 @@ def main(): if dnf_file: dnf_fp = os.path.join("dnf", dnf_file) if os.path.isfile(dnf_fp): - dnf_args = "-c {}".format(dnf_fp) - log.info("Using dnf conf {}".format(dnf_file)) + dnf_args = f"-c {dnf_fp}" + log.info(f"Using dnf conf {dnf_file}") else: - log.warning("dnf conf {} not found".format(dnf_file)) + log.warning(f"dnf conf {dnf_file} not found") p = subprocess.Popen( - "dnf {dnf_args} download --source --skip-broken --url '*' | grep src.rpm".format(dnf_args=dnf_args), + f"dnf {dnf_args} download --source --skip-broken --url '*' | grep src.rpm", stdout=subprocess.PIPE, shell=True) From c84be7b6ab0e9024ca1de9a75caf7bc2eb626e4f Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Aug 28 2023 21:11:10 +0000 Subject: [PATCH 6/17] document staging in readme --- diff --git a/README.md b/README.md index 0975345..f060db3 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ This project aims at computing global statistics for Fedora/Linux operating syst * Motivation is described in https://fedoraproject.org/wiki/Changes/LocalizationMeasurementAndTooling * It is 
deployed in https://languages.fedoraproject.org and https://languages.stg.fedoraproject.org -* Infrastructure is hosted by https://console-openshift-console.apps.ocp.fedoraproject.org/ +* Infrastructure is hosted by https://console-openshift-console.apps.ocp.fedoraproject.org/ and https://console-openshift-console.apps.ocp.stg.fedoraproject.org * Infrastructure code is in https://pagure.io/fedora-infra/ansible/blob/main/f/roles/openshift-apps/languages # Licensing From 63ecbcc679e4f638703308f7fd0578fb8a5daf58 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Aug 29 2023 05:54:38 +0000 Subject: [PATCH 7/17] improve documentation a simple review of comments and variable names, to make the code easier to read --- diff --git a/build_language_list.py b/build_language_list.py index 80f46a5..efa7571 100755 --- a/build_language_list.py +++ b/build_language_list.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -""" Parse translation files to deduct language list """ +""" Detect language for each translation file """ import argparse import glob @@ -19,7 +19,7 @@ def main(): """Handle params""" parser = argparse.ArgumentParser( - description="Creates a list of languages form translation files" + description="Detect language for each translation file" ) parser.add_argument( @@ -101,7 +101,7 @@ def scan_packages(package_folder: str, refresh: bool): processed_files_count += 1 result = p.search(po_file) - lang_code = result.group(1) + path_lang_code = result.group(1) metadata = dict() error = "" try: @@ -115,9 +115,9 @@ def scan_packages(package_folder: str, refresh: bool): # maybe a polib bug? to investigate before using it in TM error = "error-os" - lang, decision = choose_language_code_from_po(lang_code, metadata) + lang, decision = choose_language_code_from_po(path_lang_code, metadata) - debug_file = {"lang_in_path": lang_code, + debug_file = {"lang_in_path": path_lang_code, "metadata_lang": metadata.get("Language", ""), "metadata_plurals": metadata.get("Plural-Forms", ""), "metadata_language_team": metadata.get("Language-Team", ""), @@ -139,11 +139,11 @@ def scan_packages(package_folder: str, refresh: bool): def choose_language_code_from_po(filename: str, metadata: dict[str]) -> tuple[str, int]: - """ From a po file and its metadata, choose the most likely language code - By priority: the Language metadata - :param filename: the po file - :param metadata: - :return: a language code + """ Deduct a language code from a filename and its metadata + + :param filename: po filename + :param metadata: po metadata + :return: a language code, a decision path """ log = logging.getLogger("buildLanguageList.choose_lang") diff --git a/build_stats.py b/build_stats.py index 4ef1cef..8a6f471 100755 --- a/build_stats.py +++ b/build_stats.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""For each package, compute stats""" +""" Computes stats for each package with translations and each detected language """ import argparse import json @@ -16,22 +16,21 @@ from translate.tools.pocount import calcstats import utils -def compute_language_statistics(languages: dict, total_distribution_source_words: int) -> dict: - """ - Target: - "packages": [ - { - "name": "blueberry", - "progress": 100, - "translated": 166, - "team": "French " - } - ], - "progress": 98.1, - "progress_d": 63.4, - "totalsourcewords_d": 11491, - "totalsourcewordssum": 7428, - "translatedsourcewordssum": 7287 +def compute_language_statistics(languages_stats: dict, total_release_source_words: int) -> dict: + """ For each language, produce global statistics and 
per package statistics + + global statistics target: + "totalsourcewordssum": total words on started packages + "totalsourcewords_d": total words in release + "translatedsourcewordssum": total translated words + "progress": current translation progress on started packages (in percents) + "progress_d": current translation progress on all strings in release (in percents) + + per package statistics target: + "name": package name + "progress": current translation progress (in percents) + "translated": total translated words (source words, it can vary in target language) + "team": language team info """ log = logging.getLogger("buildStats.compute_language_statistics") @@ -41,7 +40,7 @@ def compute_language_statistics(languages: dict, total_distribution_source_words package_fields = ["translatedsourcewords", "fuzzysourcewords", "untranslatedsourcewords", "translated", "fuzzy", "untranslated", "translatedtargetwords", "package"] - for code, stats in languages.items(): + for code, stats in languages_stats.items(): results_languages[code] = {} results_languages[code]["po"] = stats result = {} @@ -53,7 +52,7 @@ def compute_language_statistics(languages: dict, total_distribution_source_words result["totalsourcewordssum"] = result["translatedsourcewords"] + result["fuzzysourcewords"] + result[ "untranslatedsourcewords"] - result["totalsourcewords_d"] = total_distribution_source_words + result["totalsourcewords_d"] = total_release_source_words result["progress"] = result["translatedsourcewords"] / result["totalsourcewordssum"] result["progress_d"] = result["translatedsourcewords"] / result["totalsourcewords_d"] @@ -68,19 +67,15 @@ def compute_language_statistics(languages: dict, total_distribution_source_words return results_languages -def compute_package_statistics(df): - """ - [ - { - "lang_code": "de", - "team": "Low German , German ", - "filename": [ - "po/blueberry-nds.po", - "po/blueberry-de.po" - ], - "progress": 179, - "translated": 297 - }, +def compute_package_statistics(df: pd.DataFrame) -> dict: + """ For each package, per language statistics + + global statistics target: + "lang_code": language code + "team": language team info + "progress": current translation progress (in percents), + "translated": total translated words (source words, it can vary in target language) + "filename": list of files considered for statistics """ log = logging.getLogger("buildStats.compute_language_statistics") results = dict() @@ -103,7 +98,7 @@ def main(): """Handle params""" parser = argparse.ArgumentParser( - description="Computes stats for each package with translations" + description="Computes stats for each package with translations and each detected language" ) parser.add_argument( "--results", required=True, help="Set the results folder to use" @@ -230,7 +225,7 @@ def main(): def get_po_translation_level(file: str) -> dict: - """ Compute results """ + """ Call pocount to get translation stats for a file """ log = logging.getLogger("buildStats.get_po_translation_level") command = ["sed", "-i", "/^#$/d;/^#[^\:\~,\.]/d", file] subprocess.run(command, check=True, capture_output=True) diff --git a/build_tm.py b/build_tm.py index 7ce3c34..4be4d93 100755 --- a/build_tm.py +++ b/build_tm.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Consolidate each po files into compendium""" +""" Creates useful translator files for every language """ import argparse import gzip @@ -17,7 +17,7 @@ def main(): """Handle params""" parser = argparse.ArgumentParser( - description="Creates useful translator files for every 
languages" + description="Creates useful translator files for every language" ) parser.add_argument( @@ -52,7 +52,7 @@ def main(): if os.path.exists(tm_folder) is False: os.makedirs(tm_folder) - log.info("Building the translation memory for every languages") + log.info("Find detected languages") languages = [ f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f)) @@ -222,12 +222,12 @@ def process_tmx(lang: str, source: str, destination: str) -> None: subprocess.run(command, check=True, capture_output=True) -def process_terminology(source: str, destination: str) -> None: - """ Generate a termonology from a po file """ +def process_terminology(compendium: str, destination: str) -> None: + """ Generate a terminology from a po file """ command = ["poterminology", "--ignore-case", "--fold-titlecase", "--inputs-needed", "1", - "--progress=none", source, f"--output={destination}"] + "--progress=none", compendium, f"--output={destination}"] subprocess.run(command, check=True, capture_output=True) @@ -255,7 +255,7 @@ def check_lang(lang: str, tm_folder: str, to_compress: bool) -> None: def compress(source: str, destination_file: str) -> None: - """ Compress files uzing gzip """ + """ Compress files using gzip """ log = logging.getLogger("buildTm.compress") log.debug(f"Compressing {source}") diff --git a/build_website.py b/build_website.py index b8a877a..0a1b3e3 100755 --- a/build_website.py +++ b/build_website.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Consolidate each po files into compendium""" +""" Generate static asciidoc pages from generated statistics """ import argparse import datetime @@ -39,7 +39,7 @@ def get_territories_for_language(language_name: str, cldr_languages: dict) -> li def main(): """Handle params""" - parser = argparse.ArgumentParser(description="") + parser = argparse.ArgumentParser(description="Generate static asciidoc pages from generated statistics") parser.add_argument( "--results", @@ -267,6 +267,7 @@ def consolidate_package_stats(stats_file, package_folder): def generate_static_pages_langs(results: str, code: str, content: dict, destination_file: str, territories: list[str], tm_folder: str, static_tm_folder: str) -> None: + """ Aggregate info and call language template """ log = logging.getLogger("buildWebsite.generate_static_pages_langs") data = content data["lang_name_en"] = langtable.language_name( @@ -292,7 +293,8 @@ def generate_static_pages_langs(results: str, code: str, content: dict, destinat apply_jinja_template(data, destination_file, "language.adoc") -def generate_static_pages_packages(release: str, package: str, statistics, destination_file): +def generate_static_pages_packages(release: str, package: str, statistics: dict, destination_file: str) -> None: + """ Aggregate info and call package template """ log = logging.getLogger("buildWebsite.generate_static_pages_packages") data = statistics data["results"] = release @@ -302,7 +304,8 @@ def generate_static_pages_packages(release: str, package: str, statistics, desti apply_jinja_template(data, destination_file, "package.adoc") -def generate_release_index(release: str, destination_file: str, data: dict): +def generate_release_index(release: str, destination_file: str, data: dict) -> None: + """ Aggregate info and call release index template """ log = logging.getLogger("buildWebsite.generate_release_index") data["release"] = release data["now"] = datetime.datetime.utcnow() @@ -310,7 +313,8 @@ def generate_release_index(release: str, destination_file: str, data: dict): 
apply_jinja_template(data, destination_file, "_index.release.adoc") -def generate_language_index(release: str, destination_file: str): +def generate_language_index(release: str, destination_file: str) -> None: + """ Aggregate info and call language index template """ log = logging.getLogger("buildWebsite.generate_language_index") data = dict() data["release"] = release @@ -319,7 +323,8 @@ def generate_language_index(release: str, destination_file: str): apply_jinja_template(data, destination_file, "_index.language.adoc") -def generate_package_index(distribution: str, destination_file: str): +def generate_package_index(distribution: str, destination_file: str) -> None: + """ Aggregate info and call package index template """ log = logging.getLogger("buildWebsite.generate_package_index") data = dict() data["distribution"] = distribution @@ -328,7 +333,8 @@ def generate_package_index(distribution: str, destination_file: str): apply_jinja_template(data, destination_file, "_index.package.adoc") -def generate_territory_index(destination_file: str, name: list[str], code: str, data: dict): +def generate_territory_index(destination_file: str, name: list[str], code: str, data: dict) -> None: + """ Aggregate info and call territory index template """ log = logging.getLogger("buildWebsite.generate_package_index") data["name"] = name data["code"] = code @@ -336,12 +342,14 @@ def generate_territory_index(destination_file: str, name: list[str], code: str, apply_jinja_template(data, destination_file, "_index.territory.adoc") -def store_json_file(content, destination_file): +def store_json_file(content: dict, destination_file: str) -> None: + """ Store a json file""" with open(destination_file, "w") as f: f.write(json.dumps(content, indent=2)) -def apply_jinja_template(data: dict, destination_file: str, template_file: str): +def apply_jinja_template(data: dict, destination_file: str, template_file: str) -> None: + """ Call a jinja template with a data dictionary """ os.makedirs(os.path.dirname(os.path.abspath(destination_file)), exist_ok=True) template_loader = jinja2.FileSystemLoader(searchpath="./templates/") diff --git a/requirements.txt b/requirements.txt index e5941dc..af51268 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,5 @@ polib weblate-language-data langtable translate-toolkit + +numpyencoder \ No newline at end of file From 9d3bd6ed86a8ff44bb47069f5afd17ed149f043f Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Aug 29 2023 05:55:20 +0000 Subject: [PATCH 8/17] remove package stats in build_website we do this processing in build_stats.py now --- diff --git a/build_website.py b/build_website.py index 0a1b3e3..82906f7 100755 --- a/build_website.py +++ b/build_website.py @@ -168,104 +168,6 @@ def main(): log.info("done") -def consolidate_package_stats(stats_file, package_folder): - """ From a CSV file, return key indicators """ - log = logging.getLogger("buildWebsite.consolidate_package_stats") - results = dict() - - fieldnames = { - "filename": "str", - "translatedsourcewords": "int", - "fuzzysourcewords": "int", - "untranslatedsourcewords": "int", - "translated": "int", - "fuzzy": "int", - "untranslated": "int", - "translatedtargetwords": "int", - "team": "str", - "totalsourcewords": "int", - } - - _json = json.load(open(stats_file)) - dfs = [] - total_source_words = 0 - - for template in _json.keys(): - tmp_df = pd.DataFrame.from_dict(_json.get(template), orient="index") - tmp_df.fillna(0, inplace=True) - tmp_df.reset_index(level=0, inplace=True) - - # sometimes, no 
file were found, which means no stats can be used
-        if len(tmp_df) == 0:
-            log.debug(f" The template {template} for {stats_file} is empty")
-            continue
-
-        tmp_df["totalsourcewords"] = (
-            tmp_df["untranslatedsourcewords"] + tmp_df["translatedsourcewords"]
-        )
-        tmp_df.columns = fieldnames.keys()
-
-        total_source_words += max(tmp_df["totalsourcewords"])
-
-        dfs.append(tmp_df)
-
-    if len(dfs) > 1:
-        stats_df = pd.concat(dfs)
-    elif len(dfs) == 0:
-        log.debug(f"There is no stats for {stats_file}")
-        return results
-    else:
-        stats_df = dfs[0]
-
-    temp_translated = (
-        stats_df.groupby(["lang_code"])
-        .agg(
-            {
-                "translatedsourcewords": ["sum"],
-            }
-        )
-        .reset_index()
-        .droplevel(1, axis=1)
-    )
-
-    temp_teams = stats_df.groupby("lang_code")["team"].apply(
-        lambda x: ", ".join(x.drop_duplicates())
-    )
-    temp_files = stats_df.groupby("lang_code")["filename"].apply(
-        lambda x: ",".join(x)
-    )
-    temp_bis = pd.merge(temp_teams, temp_files, how="inner", on="lang_code")
-    temp = pd.merge(temp_translated, temp_bis, how="inner", on="lang_code").to_dict(
-        orient="records"
-    )
-
-    for line in temp:
-        line["progress"] = 0
-        line["translated"] = line["translatedsourcewords"]
-
-        if total_source_words == 0:
-            log.info(f" File {stats_file} for file has translatedsourcewords = 0 in line {line}")
-            line["progress"] = 0
-            continue
-        try:
-            line["progress"] = round(
-                (int(line["translatedsourcewords"]) / total_source_words) * 100
-            )
-        except OverflowError:
-            log.info(f" {stats_file} have Translated={line['translatedsourcewords']} and Source={total_source_words}")
-
-        line["filename"] = line["filename"].split(",")
-
-    results["stats"] = list()
-    for line in sorted(temp, key=lambda k: k["progress"], reverse=True):
-        del line["translatedsourcewords"]
-        results["stats"].append(line)
-
-    results["totalsourcewords"] = total_source_words
-
-    return results
-
-
 def generate_static_pages_langs(results: str, code: str, content: dict, destination_file: str, territories: list[str],
                                 tm_folder: str, static_tm_folder: str) -> None:
     """ Aggregate info and call language template """
     log = logging.getLogger("buildWebsite.generate_static_pages_langs")
From 91dd17ca8dc19001cfd23f742ee0b7091e8cffce Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Holcroft
Date: Aug 29 2023 05:57:30 +0000
Subject: [PATCH 9/17] support empty package stats

in rare cases, a package may contain no translation progress; we still
want the package page to exist

here, no po files were detected even though some should exist: a
transifex configuration file lists their paths, but the files are
missing

---

diff --git a/build_website.py b/build_website.py
index 82906f7..dbfa61c 100755
--- a/build_website.py
+++ b/build_website.py
@@ -203,6 +203,10 @@ def generate_static_pages_packages(release: str, package: str, statistics: dict,
     data["package"] = package
     data["now"] = datetime.datetime.utcnow()
 
+    # in some rare cases, a package may have no translation progress
+    if "stats" not in statistics.keys():
+        data["stats"] = {}
+
     apply_jinja_template(data, destination_file, "package.adoc")
 
 
@@ -257,7 +261,11 @@ def apply_jinja_template(data: dict, destination_file: str, template_file: str)
     template_loader = jinja2.FileSystemLoader(searchpath="./templates/")
     template_env = jinja2.Environment(loader=template_loader, undefined=jinja2.Undefined)
     template = template_env.get_template(template_file)
-    output_text = template.render(data)
+    try:
+        output_text = template.render(data)
+    except jinja2.exceptions.UndefinedError as e:
+        logging.error(f"Error with {destination_file}: {e}")
+        raise
 
     with open(destination_file, "w") as write_out:
         write_out.write(output_text)
From 8deb1254d4874a2f528e43ebf8f2eb7f94e55131 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Holcroft
Date: Aug 30 2023 21:24:39 +0000
Subject: [PATCH 10/17] pivot per language csv file content before statistic generation

before this change, the per-package statistics were stored as a simple
list

since we now use a dictionary structure, let's first ask Pandas to
create the DataFrame from a dictionary-like structure before creating
the CSV file.

---

diff --git a/build_website.py b/build_website.py
index dbfa61c..49a04d2 100755
--- a/build_website.py
+++ b/build_website.py
@@ -119,7 +119,9 @@ def main():
             with open(stats_file, "r") as read_file:
                 content = json.load(read_file)
 
-            pd.DataFrame(content["packages"]).to_csv(os.path.join(static_tm_folder, f"{language}.csv"), index=False)
+            pd.DataFrame\
+                .from_dict(content['packages'], orient="index")\
+                .to_csv(os.path.join(static_tm_folder, f"{language}.csv"), index_label="package")
 
             territories = get_territories_for_language(language, cldr_languages)
             generate_static_pages_langs(args.results, language, content, destination_file, territories, tm_folder, static_tm_folder)
From 3bbfe1b5f2737f640dc2ca8625b77cdf2caff78d Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Holcroft
Date: Aug 30 2023 21:53:25 +0000
Subject: [PATCH 11/17] display file names as a single string

we had one line per file; now this is a single string, so no asciidoc
parsing is needed here

also remove the useless local path from the display

---

diff --git a/build_website.py b/build_website.py
index 49a04d2..7f7022c 100755
--- a/build_website.py
+++ b/build_website.py
@@ -208,6 +208,12 @@ def generate_static_pages_packages(release: str, package: str, statistics: dict,
     # in some rare cases, a package may have no translation progress
     if "stats" not in statistics.keys():
         data["stats"] = {}
+        data["stats"]["languages"] = {}
+
+    # remove local path
+    for lang in data["stats"]["languages"].keys():
+        path = f"./results/{release}/packages/{package}/"
+        data["stats"]["languages"][lang]["filename"] = data["stats"]["languages"][lang]["filename"].replace(path, " ")
 
     apply_jinja_template(data, destination_file, "package.adoc")
diff --git a/templates/package.adoc b/templates/package.adoc
index faabe81..8fd428e 100644
--- a/templates/package.adoc
+++ b/templates/package.adoc
@@ -10,7 +10,7 @@ The package {{ package }}:
 * is translated into {{ stats.languages|length }} languages in Fedora {{ results }},
 * contains {{ no_languages|length }} files for which no languages could be deducted.
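As a side note, the from_dict pivot introduced in PATCH 10 above can be checked in isolation. A small sketch with made-up per-package statistics; the output file name and all numbers are illustrative only:

    # Sketch: a dict keyed by package name becomes one CSV row per package,
    # with the dict key exported as a "package" column.
    import pandas as pd

    packages = {
        "blueberry": {"translatedsourcewords": 166, "totalsourcewordssum": 170, "progress": 97.6},
        "anaconda": {"translatedsourcewords": 5000, "totalsourcewordssum": 9000, "progress": 55.6},
    }

    pd.DataFrame.from_dict(packages, orient="index").to_csv("fr.csv", index_label="package")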
-[cols="1a,1,1,1,3a", options="header"] +[cols="1a,1,1,1,3", options="header"] |=== | Language | Translated words @@ -24,6 +24,7 @@ The package {{ package }}: >| {{ stats.languages[stat].totalsourcewordssum }} >| {{ stats.languages[stat].progress }} | {{ stats.languages[stat].filename }} + {% endfor %} |=== From d8ebabfd28caf9b2ca6e6352e7e634af6e89ca77 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 09 2023 07:54:44 +0000 Subject: [PATCH 12/17] display progress as percents --- diff --git a/templates/language.adoc b/templates/language.adoc index b488f61..68af78e 100644 --- a/templates/language.adoc +++ b/templates/language.adoc @@ -4,7 +4,8 @@ date: {{ now }} code: {{ lang_code }} name_english: {{ lang_name_en }} name_local: {{ lang_name_local }} -progress_d: {{ progress_d }} +progress: {{ '{:.2f}'.format(progress) }} +progress_d: {{ '{:.2f}'.format(progress_d) }} release: {{ results }} {%- if territories %} territories: @@ -16,8 +17,8 @@ territories: Language progress for {{ lang_name_en }} ({{ lang_code }}) in Fedora {{ results }} is: -* {{ progress }}% when we only look on started packages for this language. -* {{ progress_d }}% when we compare to every single translatable string in Fedora {{ results }}. +* {{ '{:.2f}'.format(progress) }} when we only look on started packages for this language. +* {{ '{:.2f}'.format(progress_d) }} when we compare to every single translatable string in Fedora {{ results }}. Possible scripts are: {% for script in scripts -%}{{ script }} {%- endfor %} @@ -39,14 +40,14 @@ Packages: | Name | Translated words | Total source words -| Progress +| Progress (%) | Language teams {% for package in packages -%} | link:{{ '{{' }}< ref "/{{ results }}/package/{{ package }}.adoc" >{{ '}}' }}[{{ package }}] >| {{ packages[package].translatedsourcewords }} >| {{ packages[package].totalsourcewordssum }} ->| {{ packages[package].progress }} +>| {{ '{:.1f}'.format(packages[package].progress) }} | {{ packages[package].team }} {% endfor %} |=== \ No newline at end of file diff --git a/templates/package.adoc b/templates/package.adoc index 8fd428e..c501ebc 100644 --- a/templates/package.adoc +++ b/templates/package.adoc @@ -15,14 +15,14 @@ The package {{ package }}: | Language | Translated words | Total source words -| Progress +| Progress (%) | Files {% for stat in stats.languages|sort -%} | link:{{ '{{' }}< ref "/{{ results }}/language/{{ stat }}.adoc" >{{ '}}' }}[{{ stat }}] >| {{ stats.languages[stat].translatedsourcewords }} >| {{ stats.languages[stat].totalsourcewordssum }} ->| {{ stats.languages[stat].progress }} +>| {{ '{:.1f}'.format(stats.languages[stat].progress) }} | {{ stats.languages[stat].filename }} {% endfor %} diff --git a/website/themes/beautifulhugo/layouts/_default/list_languages.html b/website/themes/beautifulhugo/layouts/_default/list_languages.html index 4f3fec7..ec74b82 100644 --- a/website/themes/beautifulhugo/layouts/_default/list_languages.html +++ b/website/themes/beautifulhugo/layouts/_default/list_languages.html @@ -12,7 +12,7 @@ code English name Local name - Progress + Progress (%) {{ range sort .Pages "Title" "asc" }} From f6d270650a553a7a6993882e15e0bf5311fa5e46 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 09 2023 08:19:40 +0000 Subject: [PATCH 13/17] fix territories merge lists together instead of joining them --- diff --git a/build_website.py b/build_website.py index 7f7022c..ec8da3a 100755 --- a/build_website.py +++ b/build_website.py @@ -17,13 +17,12 @@ import utils def 
get_territories_for_language(language_name: str, cldr_languages: dict) -> list:
     log = logging.getLogger("buildWebsite.get_territory")
 
-    territories = []
     code = language_name.split("_", 1)[0]  # ro_MD or zh_Hant_HK
     code = code.split("@", 1)[0]  # ca@valencia
 
-    territories.append(cldr_languages.get(code, {}).get("_territories", []))
-    territories.append(cldr_languages.get(code + "-alt-secondary", {}).get("_territories", []))
+    territories = cldr_languages.get(code, {}).get("_territories", [])
+    territories = territories + cldr_languages.get(code + "-alt-secondary", {}).get("_territories", [])
 
     # if language contains a territory code, then only keep this one
     if len(language_name.split("_")) > 1:
@@ -95,6 +94,7 @@ def main():
     log.info("Load CLDR data")
     with open("CLDR-raw/languageData.json", "r") as read_file:
         cldr_languages = json.load(read_file)
+        cldr_version = cldr_languages["supplemental"]["version"]["_cldrVersion"]
         cldr_languages = cldr_languages["supplemental"]["languageData"]
 
     with open("CLDR-raw/territories.json", "r") as read_file:
@@ -155,7 +155,7 @@ def main():
         # prevent containers and alternative names to be included
         if code in cldr_territories_info.keys():
             package_statistics_file = os.path.join(static_territories_folder, code, "_index.adoc")
-            generate_territory_index(package_statistics_file, cldr_territories[code], code, cldr_territories_info.get(code, {}))
+            generate_territory_index(package_statistics_file, cldr_territories[code], code, cldr_territories_info.get(code, {}), cldr_version)
 
     log.info("Copy translation memories")
     languages = [
@@ -247,11 +247,12 @@ def generate_package_index(distribution: str, destination_file: str) -> None:
     apply_jinja_template(data, destination_file, "_index.package.adoc")
 
 
-def generate_territory_index(destination_file: str, name: list[str], code: str, data: dict) -> None:
+def generate_territory_index(destination_file: str, name: list[str], code: str, data: dict, cldr_version: str) -> None:
     """ Aggregate info and call territory index template """
     log = logging.getLogger("buildWebsite.generate_package_index")
     data["name"] = name
     data["code"] = code
+    data["cldr_version"] = cldr_version
 
     apply_jinja_template(data, destination_file, "_index.territory.adoc")
 
diff --git a/templates/_index.territory.adoc b/templates/_index.territory.adoc
index 4886457..ad6e959 100644
--- a/templates/_index.territory.adoc
+++ b/templates/_index.territory.adoc
@@ -1,7 +1,7 @@
 ---
 title: "{{ code }} {{ name }}"
 ---
-Data coming from Unicode consortium (CLDR 38):
+Data coming from Unicode consortium (CLDR {{ cldr_version }}):
 
 * Population: {{ _population }}
 * Literacy percent: {{_literacyPercent}}
From 042c2e5b34e8a4d4acd579522d390f99274bbdbd Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Holcroft
Date: Sep 09 2023 08:21:17 +0000
Subject: [PATCH 14/17] add fake territory for languages not in CLDR

CLDR does not contain every possible category; let's create a fake one
so that we can easily identify the languages that fall into this
scenario

---

diff --git a/build_website.py b/build_website.py
index ec8da3a..b2c4e02 100755
--- a/build_website.py
+++ b/build_website.py
@@ -30,6 +30,7 @@ def get_territories_for_language(language_name: str, cldr_languages: dict) -> li
         territories = [language_name.split("_")[-1]]
 
     if len(territories) == 0:
+        territories = ["not-found-in-cldr"]
         log.warning(f"The language {code} does not exist in territories data from CLDR")
 
     return territories
From f115ee6f44107ec7eb77f75ea73c8aea7fc110a0 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Holcroft
Date: Sep 09 2023 08:45:40 +0000
Subject: [PATCH 15/17] website: package: compute metrics in build_website

instead of using jinja2 and getting no results, move the processing
into python

---

diff --git a/build_website.py b/build_website.py
index b2c4e02..362d914 100755
--- a/build_website.py
+++ b/build_website.py
@@ -211,6 +211,13 @@ def generate_static_pages_packages(release: str, package: str, statistics: dict,
         data["stats"] = {}
         data["stats"]["languages"] = {}
 
+    if "error" in data["stats"]["languages"].keys():
+        data["started_languages"] = len(data["stats"]["languages"]) - 1
+        data["no_languages"] = len(data["stats"]["languages"]["error"]["filename"].split("./")) - 1
+    else:
+        data["started_languages"] = len(data["stats"]["languages"])
+        data["no_languages"] = 0
+
     # remove local path
     for lang in data["stats"]["languages"].keys():
         path = f"./results/{release}/packages/{package}/"
diff --git a/templates/package.adoc b/templates/package.adoc
index c501ebc..06d6da6 100644
--- a/templates/package.adoc
+++ b/templates/package.adoc
@@ -1,14 +1,14 @@
 ---
 title: "{{ package }}"
 date: {{ now }}
-started_languages: {{ stats|length }}
-no_languages: {{ no_languages|length }}
+started_languages: {{ started_languages }}
+no_languages: {{ no_languages }}
 ---
 
 The package {{ package }}:
 
 * represents {{ totalsourcewords }} source words to be translated,
 * is translated into {{ stats.languages|length }} languages in Fedora {{ results }},
-* contains {{ no_languages|length }} files for which no languages could be deducted.
+* contains {{ no_languages }} files for which no languages could be deducted.
 
 [cols="1a,1,1,1,3", options="header"]
 |===
diff --git a/todo.md b/todo.md
index 21aec6c..a300187 100644
--- a/todo.md
+++ b/todo.md
@@ -11,18 +11,19 @@ direct call to:
 
 # build_tm.py
 
-Detecting missing files
-- en-compendium is missing
-- error-compendium is missing
-- gl-compendium is missing
-- nb_no-compendium is missing
-- sk-compendium is missing
-- zh_hant-compendium is missing
+move error detection (check_lang) into %language%/stats.json and display errors
+move error files into %language%/stats.json and make these accessible via website
+remove terminology (someone who wants it can do it locally)
 
 # build_stats.py
 
 when %package%/stats.json is empty, make sure it is counted as an existing
 package for which we were not able to extract anything (release stats)
 
+# website
+
+list why we could not deduct error files
+allow sort on all tables
+
 # global
From 13aaa42908435f691299e5873b738eaaf8752a33 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Holcroft
Date: Sep 09 2023 09:16:19 +0000
Subject: [PATCH 16/17] website: display progress as percents

let's use human-readable percents everywhere

---

diff --git a/build_stats.py b/build_stats.py
index 8a6f471..afa5305 100755
--- a/build_stats.py
+++ b/build_stats.py
@@ -53,12 +53,21 @@ def compute_language_statistics(languages_stats: dict, total_release_source_word
         result["totalsourcewordssum"] = result["translatedsourcewords"] + result["fuzzysourcewords"] + result[
             "untranslatedsourcewords"]
         result["totalsourcewords_d"] = total_release_source_words
-        result["progress"] = result["translatedsourcewords"] / result["totalsourcewordssum"]
-        result["progress_d"] = result["translatedsourcewords"] / result["totalsourcewords_d"]
+
+        # prevent a RuntimeWarning for languages with no content
+        if result["totalsourcewordssum"] > 0:
+            result["progress"] = (result["translatedsourcewords"] / result["totalsourcewordssum"]) * 100
+        else:
+            result["progress"] = 0.0
+
+        result["progress_d"] = (result["translatedsourcewords"] / result["totalsourcewords_d"]) * 100
 
     packages_stats = df[package_fields].groupby("package").sum()
     packages_stats["totalsourcewordssum"] = packages_stats["translatedsourcewords"] + packages_stats["fuzzysourcewords"] + packages_stats["untranslatedsourcewords"]
-    packages_stats["progress"] = packages_stats["translatedsourcewords"] / packages_stats["totalsourcewordssum"]
+
+    packages_stats["progress"] = (packages_stats["translatedsourcewords"] / packages_stats["totalsourcewordssum"]) * 100
+    # prevent NaN values when a package has total source words = 0
+    packages_stats.fillna(0, inplace=True)
 
     packages_stats["team"] = df[["metadata_language_team", "package"]].groupby("package").first()
     result["packages"] = packages_stats.to_dict(orient="index")
@@ -85,7 +94,9 @@ def compute_package_statistics(df: pd.DataFrame) -> dict:
     stats = df[po_fields].groupby(index).sum()
     stats["totalsourcewordssum"] = stats["translatedsourcewords"] + stats["fuzzysourcewords"] + stats["untranslatedsourcewords"]
-    stats["progress"] = stats["translatedsourcewords"] / stats["totalsourcewordssum"]
+    stats["progress"] = (stats["translatedsourcewords"] / stats["totalsourcewordssum"]) * 100
+    # prevent NaN values when a package has total source words = 0
+    stats.fillna(0, inplace=True)
     stats["team"] = df[["metadata_language_team", index]].groupby(index).first()
     df['filename'] = df.index
     stats["filename"] = df[["filename", index]].groupby(index).sum()
diff --git a/templates/language.adoc b/templates/language.adoc
index 68af78e..55e6b33 100644
--- a/templates/language.adoc
+++ b/templates/language.adoc
@@ -17,8 +17,8 @@ territories:
 
 Language progress for {{ lang_name_en }} ({{ lang_code }}) in Fedora {{ results }} is:
 
-* {{ '{:.2f}'.format(progress) }} when we only look on started packages for this language.
-* {{ '{:.2f}'.format(progress_d) }} when we compare to every single translatable string in Fedora {{ results }}.
+* {{ '{:.2f}'.format(progress) }}% when we only look on started packages for this language.
+* {{ '{:.2f}'.format(progress_d) }}% when we compare to every single translatable string in Fedora {{ results }}.
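The fillna(0) guard added in PATCH 16 above matters because pandas turns 0/0 into NaN instead of raising an exception. A quick illustration with toy values; the package names and counts are invented:

    # Sketch: a package with zero source words yields NaN progress (0 / 0);
    # fillna(0) replaces it so the templates receive a real number.
    import pandas as pd

    stats = pd.DataFrame(
        {"translatedsourcewords": [50, 0], "totalsourcewordssum": [100, 0]},
        index=["pkg-a", "pkg-b"],
    )
    stats["progress"] = (stats["translatedsourcewords"] / stats["totalsourcewordssum"]) * 100
    stats.fillna(0, inplace=True)  # pkg-b: NaN becomes 0.0
    print(stats)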
Possible scripts are: {% for script in scripts -%}{{ script }} {%- endfor %}
From 4a8425a9894c284ac2cec8244268dd7efeca505d Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Holcroft
Date: Sep 09 2023 09:48:04 +0000
Subject: [PATCH 17/17] adapt release-wide stats

adapt the release-wide counts to the new data model

---

diff --git a/build_stats.py b/build_stats.py
index afa5305..1d840c6 100755
--- a/build_stats.py
+++ b/build_stats.py
@@ -217,14 +217,34 @@ def main():
     distribution_file = os.path.join(results_folder, "release.json")
     distribution_stats = dict()
     with open(os.path.join(results_folder, "data.json"), "r") as f:
-        distribution_stats["total_release_packages"] = len(json.load(f))
+        distribution_stats["packages_count"] = len(json.load(f))
+
+    # detected = identified with translation files
+    distribution_stats["packages_detected_count"] = len(packages)
+    distribution_stats["files_detected_count"] = sum([len(package["po"]) for package in all_stats])
+
+    # processed = what we were able to use
+    distribution_stats["packages_processed_count"] = 0
+    distribution_stats["files_processed_count"] = 0
+
+    for package in sorted(packages):
+        log.info(package)
+        stats_file = os.path.join(packages_folder, package, "stats.json")
+
+        with open(stats_file, "r") as f:
+            stats = json.load(f)
+
+        # if there are no source words, it means we were not able to process anything
+        if "totalsourcewords" in stats.keys():
+            if stats["totalsourcewords"] > 0:
+                distribution_stats["packages_processed_count"] += 1
+
+            for _, detected in stats["po"].items():
+                if detected["lang_code_chosen"] != "error":
+                    distribution_stats["files_processed_count"] += 1
 
     distribution_stats["totalsourcewords"] = total_distribution_source_words
-    distribution_stats["total_packages_with_stats"] = len(packages)
-    distribution_stats["total_packages_files"] = sum([len(package["po"]) for package in all_stats])
-    distribution_stats["total_packages"] = len(packages)
-    distribution_stats["nb_files"] = len([file for file in all_stats if file.get("could_not_process", 0) == 0])
-    distribution_stats["total_languages"] = len(languages)
+    distribution_stats["languages_processed_count"] = len(languages)
 
     log.info(distribution_stats)
diff --git a/templates/_index.release.adoc b/templates/_index.release.adoc
index 440074c..60d0de3 100644
--- a/templates/_index.release.adoc
+++ b/templates/_index.release.adoc
@@ -5,14 +5,14 @@ layout: "release"
 ---
 
 Fedora {{ release }}::
-* contains {{ total_release_packages }} packages,
-* we identified {{ total_packages }} packages with translations files,
-* it represents {{ total_packages_files }} translations files (po).
+* contains {{ packages_count }} packages,
+* we identified {{ packages_detected_count }} packages with translation files,
+* it represents {{ files_detected_count }} translation files (po).
 
 What we were able to process::
-* {{ total_packages }} packages,
-* {{ nb_files }} translation files containing {{ totalsourcewords }} words to translate,
-* {{ total_languages }} languages.
+* {{ packages_processed_count }} packages,
+* {{ files_processed_count }} translation files containing {{ totalsourcewords }} words to translate,
+* {{ languages_processed_count }} languages.
 
 Why such gaps?::
 . File reading was not possible (encoding or format issue),