From 63ecbcc679e4f638703308f7fd0578fb8a5daf58 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Aug 29 2023 05:54:38 +0000 Subject: [PATCH 1/26] improve documentation a simple review of comments and variable names, to make the code easier to read --- diff --git a/build_language_list.py b/build_language_list.py index 80f46a5..efa7571 100755 --- a/build_language_list.py +++ b/build_language_list.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -""" Parse translation files to deduct language list """ +""" Detect language for each translation file """ import argparse import glob @@ -19,7 +19,7 @@ def main(): """Handle params""" parser = argparse.ArgumentParser( - description="Creates a list of languages form translation files" + description="Detect language for each translation file" ) parser.add_argument( @@ -101,7 +101,7 @@ def scan_packages(package_folder: str, refresh: bool): processed_files_count += 1 result = p.search(po_file) - lang_code = result.group(1) + path_lang_code = result.group(1) metadata = dict() error = "" try: @@ -115,9 +115,9 @@ def scan_packages(package_folder: str, refresh: bool): # maybe a polib bug? 
to investigate before using it in TM error = "error-os" - lang, decision = choose_language_code_from_po(lang_code, metadata) + lang, decision = choose_language_code_from_po(path_lang_code, metadata) - debug_file = {"lang_in_path": lang_code, + debug_file = {"lang_in_path": path_lang_code, "metadata_lang": metadata.get("Language", ""), "metadata_plurals": metadata.get("Plural-Forms", ""), "metadata_language_team": metadata.get("Language-Team", ""), @@ -139,11 +139,11 @@ def scan_packages(package_folder: str, refresh: bool): def choose_language_code_from_po(filename: str, metadata: dict[str]) -> tuple[str, int]: - """ From a po file and its metadata, choose the most likely language code - By priority: the Language metadata - :param filename: the po file - :param metadata: - :return: a language code + """ Deduct a language code from a filename and its metadata + + :param filename: po filename + :param metadata: po metadata + :return: a language code, a decision path """ log = logging.getLogger("buildLanguageList.choose_lang") diff --git a/build_stats.py b/build_stats.py index 4ef1cef..8a6f471 100755 --- a/build_stats.py +++ b/build_stats.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""For each package, compute stats""" +""" Computes stats for each package with translations and each detected language """ import argparse import json @@ -16,22 +16,21 @@ from translate.tools.pocount import calcstats import utils -def compute_language_statistics(languages: dict, total_distribution_source_words: int) -> dict: - """ - Target: - "packages": [ - { - "name": "blueberry", - "progress": 100, - "translated": 166, - "team": "French " - } - ], - "progress": 98.1, - "progress_d": 63.4, - "totalsourcewords_d": 11491, - "totalsourcewordssum": 7428, - "translatedsourcewordssum": 7287 +def compute_language_statistics(languages_stats: dict, total_release_source_words: int) -> dict: + """ For each language, produce global statistics and per package statistics + + global statistics target: + 
"totalsourcewordssum": total words on started packages + "totalsourcewords_d": total words in release + "translatedsourcewordssum": total translated words + "progress": current translation progress on started packages (in percents) + "progress_d": current translation progress on all strings in release (in percents) + + per package statistics target: + "name": package name + "progress": current translation progress (in percents) + "translated": total translated words (source words, it can vary in target language) + "team": language team info """ log = logging.getLogger("buildStats.compute_language_statistics") @@ -41,7 +40,7 @@ def compute_language_statistics(languages: dict, total_distribution_source_words package_fields = ["translatedsourcewords", "fuzzysourcewords", "untranslatedsourcewords", "translated", "fuzzy", "untranslated", "translatedtargetwords", "package"] - for code, stats in languages.items(): + for code, stats in languages_stats.items(): results_languages[code] = {} results_languages[code]["po"] = stats result = {} @@ -53,7 +52,7 @@ def compute_language_statistics(languages: dict, total_distribution_source_words result["totalsourcewordssum"] = result["translatedsourcewords"] + result["fuzzysourcewords"] + result[ "untranslatedsourcewords"] - result["totalsourcewords_d"] = total_distribution_source_words + result["totalsourcewords_d"] = total_release_source_words result["progress"] = result["translatedsourcewords"] / result["totalsourcewordssum"] result["progress_d"] = result["translatedsourcewords"] / result["totalsourcewords_d"] @@ -68,19 +67,15 @@ def compute_language_statistics(languages: dict, total_distribution_source_words return results_languages -def compute_package_statistics(df): - """ - [ - { - "lang_code": "de", - "team": "Low German , German ", - "filename": [ - "po/blueberry-nds.po", - "po/blueberry-de.po" - ], - "progress": 179, - "translated": 297 - }, +def compute_package_statistics(df: pd.DataFrame) -> dict: + """ For each package, 
per language statistics + + global statistics target: + "lang_code": language code + "team": language team info + "progress": current translation progress (in percents), + "translated": total translated words (source words, it can vary in target language) + "filename": list of files considered for statistics """ log = logging.getLogger("buildStats.compute_language_statistics") results = dict() @@ -103,7 +98,7 @@ def main(): """Handle params""" parser = argparse.ArgumentParser( - description="Computes stats for each package with translations" + description="Computes stats for each package with translations and each detected language" ) parser.add_argument( "--results", required=True, help="Set the results folder to use" @@ -230,7 +225,7 @@ def main(): def get_po_translation_level(file: str) -> dict: - """ Compute results """ + """ Call pocount to get translation stats for a file """ log = logging.getLogger("buildStats.get_po_translation_level") command = ["sed", "-i", "/^#$/d;/^#[^\:\~,\.]/d", file] subprocess.run(command, check=True, capture_output=True) diff --git a/build_tm.py b/build_tm.py index 7ce3c34..4be4d93 100755 --- a/build_tm.py +++ b/build_tm.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Consolidate each po files into compendium""" +""" Creates useful translator files for every language """ import argparse import gzip @@ -17,7 +17,7 @@ def main(): """Handle params""" parser = argparse.ArgumentParser( - description="Creates useful translator files for every languages" + description="Creates useful translator files for every language" ) parser.add_argument( @@ -52,7 +52,7 @@ def main(): if os.path.exists(tm_folder) is False: os.makedirs(tm_folder) - log.info("Building the translation memory for every languages") + log.info("Find detected languages") languages = [ f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f)) @@ -222,12 +222,12 @@ def process_tmx(lang: str, source: str, destination: str) -> None: subprocess.run(command, 
check=True, capture_output=True) -def process_terminology(source: str, destination: str) -> None: - """ Generate a termonology from a po file """ +def process_terminology(compendium: str, destination: str) -> None: + """ Generate a terminology from a po file """ command = ["poterminology", "--ignore-case", "--fold-titlecase", "--inputs-needed", "1", - "--progress=none", source, f"--output={destination}"] + "--progress=none", compendium, f"--output={destination}"] subprocess.run(command, check=True, capture_output=True) @@ -255,7 +255,7 @@ def check_lang(lang: str, tm_folder: str, to_compress: bool) -> None: def compress(source: str, destination_file: str) -> None: - """ Compress files uzing gzip """ + """ Compress files using gzip """ log = logging.getLogger("buildTm.compress") log.debug(f"Compressing {source}") diff --git a/build_website.py b/build_website.py index b8a877a..0a1b3e3 100755 --- a/build_website.py +++ b/build_website.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Consolidate each po files into compendium""" +""" Generate static asciidoc pages from generated statistics """ import argparse import datetime @@ -39,7 +39,7 @@ def get_territories_for_language(language_name: str, cldr_languages: dict) -> li def main(): """Handle params""" - parser = argparse.ArgumentParser(description="") + parser = argparse.ArgumentParser(description="Generate static asciidoc pages from generated statistics") parser.add_argument( "--results", @@ -267,6 +267,7 @@ def consolidate_package_stats(stats_file, package_folder): def generate_static_pages_langs(results: str, code: str, content: dict, destination_file: str, territories: list[str], tm_folder: str, static_tm_folder: str) -> None: + """ Aggregate info and call language template """ log = logging.getLogger("buildWebsite.generate_static_pages_langs") data = content data["lang_name_en"] = langtable.language_name( @@ -292,7 +293,8 @@ def generate_static_pages_langs(results: str, code: str, content: dict, destinat 
apply_jinja_template(data, destination_file, "language.adoc") -def generate_static_pages_packages(release: str, package: str, statistics, destination_file): +def generate_static_pages_packages(release: str, package: str, statistics: dict, destination_file: str) -> None: + """ Aggregate info and call package template """ log = logging.getLogger("buildWebsite.generate_static_pages_packages") data = statistics data["results"] = release @@ -302,7 +304,8 @@ def generate_static_pages_packages(release: str, package: str, statistics, desti apply_jinja_template(data, destination_file, "package.adoc") -def generate_release_index(release: str, destination_file: str, data: dict): +def generate_release_index(release: str, destination_file: str, data: dict) -> None: + """ Aggregate info and call release index template """ log = logging.getLogger("buildWebsite.generate_release_index") data["release"] = release data["now"] = datetime.datetime.utcnow() @@ -310,7 +313,8 @@ def generate_release_index(release: str, destination_file: str, data: dict): apply_jinja_template(data, destination_file, "_index.release.adoc") -def generate_language_index(release: str, destination_file: str): +def generate_language_index(release: str, destination_file: str) -> None: + """ Aggregate info and call language index template """ log = logging.getLogger("buildWebsite.generate_language_index") data = dict() data["release"] = release @@ -319,7 +323,8 @@ def generate_language_index(release: str, destination_file: str): apply_jinja_template(data, destination_file, "_index.language.adoc") -def generate_package_index(distribution: str, destination_file: str): +def generate_package_index(distribution: str, destination_file: str) -> None: + """ Aggregate info and call package index template """ log = logging.getLogger("buildWebsite.generate_package_index") data = dict() data["distribution"] = distribution @@ -328,7 +333,8 @@ def generate_package_index(distribution: str, destination_file: str): 
apply_jinja_template(data, destination_file, "_index.package.adoc") -def generate_territory_index(destination_file: str, name: list[str], code: str, data: dict): +def generate_territory_index(destination_file: str, name: list[str], code: str, data: dict) -> None: + """ Aggregate info and call territory index template """ log = logging.getLogger("buildWebsite.generate_package_index") data["name"] = name data["code"] = code @@ -336,12 +342,14 @@ def generate_territory_index(destination_file: str, name: list[str], code: str, apply_jinja_template(data, destination_file, "_index.territory.adoc") -def store_json_file(content, destination_file): +def store_json_file(content: dict, destination_file: str) -> None: + """ Store a json file""" with open(destination_file, "w") as f: f.write(json.dumps(content, indent=2)) -def apply_jinja_template(data: dict, destination_file: str, template_file: str): +def apply_jinja_template(data: dict, destination_file: str, template_file: str) -> None: + """ Call a jinja template with a data dictionary """ os.makedirs(os.path.dirname(os.path.abspath(destination_file)), exist_ok=True) template_loader = jinja2.FileSystemLoader(searchpath="./templates/") diff --git a/requirements.txt b/requirements.txt index e5941dc..af51268 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,5 @@ polib weblate-language-data langtable translate-toolkit + +numpyencoder \ No newline at end of file From 9d3bd6ed86a8ff44bb47069f5afd17ed149f043f Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Aug 29 2023 05:55:20 +0000 Subject: [PATCH 2/26] remove package stats in build_website we do this processing in build_stats.py now --- diff --git a/build_website.py b/build_website.py index 0a1b3e3..82906f7 100755 --- a/build_website.py +++ b/build_website.py @@ -168,104 +168,6 @@ def main(): log.info("done") -def consolidate_package_stats(stats_file, package_folder): - """ From a CSV file, return key indicators """ - log = 
logging.getLogger("buildWebsite.consolidate_package_stats") - results = dict() - - fieldnames = { - "filename": "str", - "translatedsourcewords": "int", - "fuzzysourcewords": "int", - "untranslatedsourcewords": "int", - "translated": "int", - "fuzzy": "int", - "untranslated": "int", - "translatedtargetwords": "int", - "team": "str", - "totalsourcewords": "int", - } - - _json = json.load(open(stats_file)) - dfs = [] - total_source_words = 0 - - for template in _json.keys(): - tmp_df = pd.DataFrame.from_dict(_json.get(template), orient="index") - tmp_df.fillna(0, inplace=True) - tmp_df.reset_index(level=0, inplace=True) - - # sometimes, no file were found, which means no stats can be used - if len(tmp_df) == 0: - log.debug(f" The template {template} for {stats_file} is empty") - continue - - tmp_df["totalsourcewords"] = ( - tmp_df["untranslatedsourcewords"] + tmp_df["translatedsourcewords"] - ) - tmp_df.columns = fieldnames.keys() - - total_source_words += max(tmp_df["totalsourcewords"]) - - dfs.append(tmp_df) - - if len(dfs) > 1: - stats_df = pd.concat(dfs) - elif len(dfs) == 0: - log.debug(f"There is no stats for {stats_file}") - return results - else: - stats_df = dfs[0] - - temp_translated = ( - stats_df.groupby(["lang_code"]) - .agg( - { - "translatedsourcewords": ["sum"], - } - ) - .reset_index() - .droplevel(1, axis=1) - ) - - temp_teams = stats_df.groupby("lang_code")["team"].apply( - lambda x: ", ".join(x.drop_duplicates()) - ) - temp_files = stats_df.groupby("lang_code")["filename"].apply( - lambda x: ",".join(x) - ) - temp_bis = pd.merge(temp_teams, temp_files, how="inner", on="lang_code") - temp = pd.merge(temp_translated, temp_bis, how="inner", on="lang_code").to_dict( - orient="records" - ) - - for line in temp: - line["progress"] = 0 - line["translated"] = line["translatedsourcewords"] - - if total_source_words == 0: - log.info(f" File {stats_file} for file has translatedsourcewords = 0 in line {line}") - line["progress"] = 0 - continue - try: - 
line["progress"] = round( - (int(line["translatedsourcewords"]) / total_source_words) * 100 - ) - except OverflowError: - log.info(f" {stats_file} have Translated={line['translatedsourcewords']} and Source={total_source_words}") - - line["filename"] = line["filename"].split(",") - - results["stats"] = list() - for line in sorted(temp, key=lambda k: k["progress"], reverse=True): - del line["translatedsourcewords"] - results["stats"].append(line) - - results["totalsourcewords"] = total_source_words - - return results - - def generate_static_pages_langs(results: str, code: str, content: dict, destination_file: str, territories: list[str], tm_folder: str, static_tm_folder: str) -> None: """ Aggregate info and call language template """ log = logging.getLogger("buildWebsite.generate_static_pages_langs") From 91dd17ca8dc19001cfd23f742ee0b7091e8cffce Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Aug 29 2023 05:57:30 +0000 Subject: [PATCH 3/26] support empty package stats in rare cases, a package may not contain translation progress, we do want the package to exist anyway here, there is no detected PO file it should be po files because there is a transifex file telling their path, but it's not the case --- diff --git a/build_website.py b/build_website.py index 82906f7..dbfa61c 100755 --- a/build_website.py +++ b/build_website.py @@ -203,6 +203,10 @@ def generate_static_pages_packages(release: str, package: str, statistics: dict, data["package"] = package data["now"] = datetime.datetime.utcnow() + # in some rare cases, a package may have no translation progress + if "stats" not in statistics.keys(): + data["stats"] = {} + apply_jinja_template(data, destination_file, "package.adoc") @@ -257,7 +261,11 @@ def apply_jinja_template(data: dict, destination_file: str, template_file: str) template_loader = jinja2.FileSystemLoader(searchpath="./templates/") template_env = jinja2.Environment(loader=template_loader, undefined=jinja2.Undefined) template = 
template_env.get_template(template_file) - output_text = template.render(data) + try: + output_text = template.render(data) + except jinja2.exceptions.UndefinedError as e: + logging.error(f"Error with {destination_file}: {e}") + raise with open(destination_file, "w") as write_out: write_out.write(output_text) From 8deb1254d4874a2f528e43ebf8f2eb7f94e55131 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Aug 30 2023 21:24:39 +0000 Subject: [PATCH 4/26] pivot per language csv file content before statistic generation change, the per package statistics were stored as simple list since we are now using a dictionary structure, let's ask first Pandas to create the DataFrame from a dictionary like structure before creating the CSV file. --- diff --git a/build_website.py b/build_website.py index dbfa61c..49a04d2 100755 --- a/build_website.py +++ b/build_website.py @@ -119,7 +119,9 @@ def main(): with open(stats_file, "r") as read_file: content = json.load(read_file) - pd.DataFrame(content["packages"]).to_csv(os.path.join(static_tm_folder, f"{language}.csv"), index=False) + pd.DataFrame\ + .from_dict(content['packages'], orient="index")\ + .to_csv(os.path.join(static_tm_folder, f"{language}.csv"), index_label="package") territories = get_territories_for_language(language, cldr_languages) generate_static_pages_langs(args.results, language, content, destination_file, territories, tm_folder, static_tm_folder) From 3bbfe1b5f2737f640dc2ca8625b77cdf2caff78d Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Aug 30 2023 21:53:25 +0000 Subject: [PATCH 5/26] display file names as string we had one line per file, now this is a single string, no need of asciidoc parsing here remove useless path from display --- diff --git a/build_website.py b/build_website.py index 49a04d2..7f7022c 100755 --- a/build_website.py +++ b/build_website.py @@ -208,6 +208,12 @@ def generate_static_pages_packages(release: str, package: str, statistics: dict, # in some rare cases, a package 
may have no translation progress if "stats" not in statistics.keys(): data["stats"] = {} + data["stats"]["languages"] = {} + + # remove local path + for lang in data["stats"]["languages"].keys(): + path = f"./results/{release}/packages/{package}/" + data["stats"]["languages"][lang]["filename"] = data["stats"]["languages"][lang]["filename"].replace(path, " ") apply_jinja_template(data, destination_file, "package.adoc") diff --git a/templates/package.adoc b/templates/package.adoc index faabe81..8fd428e 100644 --- a/templates/package.adoc +++ b/templates/package.adoc @@ -10,7 +10,7 @@ The package {{ package }}: * is translated into {{ stats.languages|length }} languages in Fedora {{ results }}, * contains {{ no_languages|length }} files for which no languages could be deducted. -[cols="1a,1,1,1,3a", options="header"] +[cols="1a,1,1,1,3", options="header"] |=== | Language | Translated words @@ -24,6 +24,7 @@ The package {{ package }}: >| {{ stats.languages[stat].totalsourcewordssum }} >| {{ stats.languages[stat].progress }} | {{ stats.languages[stat].filename }} + {% endfor %} |=== From d8ebabfd28caf9b2ca6e6352e7e634af6e89ca77 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 09 2023 07:54:44 +0000 Subject: [PATCH 6/26] display progress as percents --- diff --git a/templates/language.adoc b/templates/language.adoc index b488f61..68af78e 100644 --- a/templates/language.adoc +++ b/templates/language.adoc @@ -4,7 +4,8 @@ date: {{ now }} code: {{ lang_code }} name_english: {{ lang_name_en }} name_local: {{ lang_name_local }} -progress_d: {{ progress_d }} +progress: {{ '{:.2f}'.format(progress) }} +progress_d: {{ '{:.2f}'.format(progress_d) }} release: {{ results }} {%- if territories %} territories: @@ -16,8 +17,8 @@ territories: Language progress for {{ lang_name_en }} ({{ lang_code }}) in Fedora {{ results }} is: -* {{ progress }}% when we only look on started packages for this language. 
-* {{ progress_d }}% when we compare to every single translatable string in Fedora {{ results }}. +* {{ '{:.2f}'.format(progress) }} when we only look on started packages for this language. +* {{ '{:.2f}'.format(progress_d) }} when we compare to every single translatable string in Fedora {{ results }}. Possible scripts are: {% for script in scripts -%}{{ script }} {%- endfor %} @@ -39,14 +40,14 @@ Packages: | Name | Translated words | Total source words -| Progress +| Progress (%) | Language teams {% for package in packages -%} | link:{{ '{{' }}< ref "/{{ results }}/package/{{ package }}.adoc" >{{ '}}' }}[{{ package }}] >| {{ packages[package].translatedsourcewords }} >| {{ packages[package].totalsourcewordssum }} ->| {{ packages[package].progress }} +>| {{ '{:.1f}'.format(packages[package].progress) }} | {{ packages[package].team }} {% endfor %} |=== \ No newline at end of file diff --git a/templates/package.adoc b/templates/package.adoc index 8fd428e..c501ebc 100644 --- a/templates/package.adoc +++ b/templates/package.adoc @@ -15,14 +15,14 @@ The package {{ package }}: | Language | Translated words | Total source words -| Progress +| Progress (%) | Files {% for stat in stats.languages|sort -%} | link:{{ '{{' }}< ref "/{{ results }}/language/{{ stat }}.adoc" >{{ '}}' }}[{{ stat }}] >| {{ stats.languages[stat].translatedsourcewords }} >| {{ stats.languages[stat].totalsourcewordssum }} ->| {{ stats.languages[stat].progress }} +>| {{ '{:.1f}'.format(stats.languages[stat].progress) }} | {{ stats.languages[stat].filename }} {% endfor %} diff --git a/website/themes/beautifulhugo/layouts/_default/list_languages.html b/website/themes/beautifulhugo/layouts/_default/list_languages.html index 4f3fec7..ec74b82 100644 --- a/website/themes/beautifulhugo/layouts/_default/list_languages.html +++ b/website/themes/beautifulhugo/layouts/_default/list_languages.html @@ -12,7 +12,7 @@ code English name Local name - Progress + Progress (%) {{ range sort .Pages "Title" "asc" }} From 
f6d270650a553a7a6993882e15e0bf5311fa5e46 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 09 2023 08:19:40 +0000 Subject: [PATCH 7/26] fix territories merge lists together instead of joining them --- diff --git a/build_website.py b/build_website.py index 7f7022c..ec8da3a 100755 --- a/build_website.py +++ b/build_website.py @@ -17,13 +17,12 @@ import utils def get_territories_for_language(language_name: str, cldr_languages: dict) -> list: log = logging.getLogger("buildWebsite.get_territory") - territories = [] code = language_name.split("_", 1)[0] # ro_MD or zh_Hant_HK code = code.split("@", 1)[0] # ca@valencia - territories.append(cldr_languages.get(code, {}).get("_territories", [])) - territories.append(cldr_languages.get(code + "-alt-secondary", {}).get("_territories", [])) + territories = cldr_languages.get(code, {}).get("_territories", []) + territories = territories + cldr_languages.get(code + "-alt-secondary", {}).get("_territories", []) # if language contains a territory code, then only keep this one if len(language_name.split("_")) > 1: @@ -95,6 +94,7 @@ def main(): log.info("Load CLDR data") with open("CLDR-raw/languageData.json", "r") as read_file: cldr_languages = json.load(read_file) + cldr_version = cldr_languages["supplemental"]["version"]["_cldrVersion"] cldr_languages = cldr_languages["supplemental"]["languageData"] with open("CLDR-raw/territories.json", "r") as read_file: @@ -155,7 +155,7 @@ def main(): # prevent containers and alternative names to be included if code in cldr_territories_info.keys(): package_statistics_file = os.path.join(static_territories_folder, code, "_index.adoc") - generate_territory_index(package_statistics_file, cldr_territories[code], code, cldr_territories_info.get(code, {})) + generate_territory_index(package_statistics_file, cldr_territories[code], code, cldr_territories_info.get(code, {}), cldr_version) log.info("Copy translation memories") languages = [ @@ -247,11 +247,12 @@ def 
generate_package_index(distribution: str, destination_file: str) -> None: apply_jinja_template(data, destination_file, "_index.package.adoc") -def generate_territory_index(destination_file: str, name: list[str], code: str, data: dict) -> None: +def generate_territory_index(destination_file: str, name: list[str], code: str, data: dict, cldr_version: str) -> None: """ Aggregate info and call territory index template """ log = logging.getLogger("buildWebsite.generate_package_index") data["name"] = name data["code"] = code + data["cldr_version"] = cldr_version apply_jinja_template(data, destination_file, "_index.territory.adoc") diff --git a/templates/_index.territory.adoc b/templates/_index.territory.adoc index 4886457..ad6e959 100644 --- a/templates/_index.territory.adoc +++ b/templates/_index.territory.adoc @@ -1,7 +1,7 @@ --- title: "{{ code }} {{ name }}" --- -Data coming from Unicode consortium (CLDR 38): +Data coming from Unicode consortium (CLDR {{ cldr_version }}): * Population: {{ _population }} * Literacy percent: {{_literacyPercent}} From 042c2e5b34e8a4d4acd579522d390f99274bbdbd Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 09 2023 08:21:17 +0000 Subject: [PATCH 8/26] add fake territory languages not in CLDR CLDR do not contain every possible categories, let's create a fake category so that we can easily identify the languages falling in this scenario --- diff --git a/build_website.py b/build_website.py index ec8da3a..b2c4e02 100755 --- a/build_website.py +++ b/build_website.py @@ -30,6 +30,7 @@ def get_territories_for_language(language_name: str, cldr_languages: dict) -> li territories = [language_name.split("_")[-1]] if len(territories) == 0: + territories = ["not-found-in-cldr"] log.warning(f"The language {code} does not exist in territories data from CLDR") return territories From f115ee6f44107ec7eb77f75ea73c8aea7fc110a0 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 09 2023 08:45:40 +0000 Subject: [PATCH 9/26] 
website: package: compute metrics in build_website instead of using jinja2 and have no results, move processing into python --- diff --git a/build_website.py b/build_website.py index b2c4e02..362d914 100755 --- a/build_website.py +++ b/build_website.py @@ -211,6 +211,13 @@ def generate_static_pages_packages(release: str, package: str, statistics: dict, data["stats"] = {} data["stats"]["languages"] = {} + if "error" in data["stats"]["languages"].keys(): + data["started_languages"] = len(data["stats"]["languages"]) - 1 + data["no_languages"] = len(data["stats"]["languages"]["error"]["filename"].split("./")) - 1 + else: + data["started_languages"] = len(data["stats"]["languages"]) + data["no_languages"] = 0 + # remove local path for lang in data["stats"]["languages"].keys(): path = f"./results/{release}/packages/{package}/" diff --git a/templates/package.adoc b/templates/package.adoc index c501ebc..06d6da6 100644 --- a/templates/package.adoc +++ b/templates/package.adoc @@ -1,14 +1,14 @@ --- title: "{{ package }}" date: {{ now }} -started_languages: {{ stats|length }} -no_languages: {{ no_languages|length }} +started_languages: {{ started_languages }} +no_languages: {{ no_languages }} --- The package {{ package }}: * represents {{ totalsourcewords }} source words to be translated, * is translated into {{ stats.languages|length }} languages in Fedora {{ results }}, -* contains {{ no_languages|length }} files for which no languages could be deducted. +* contains {{ no_languages }} files for which no languages could be deducted. 
[cols="1a,1,1,1,3", options="header"] |=== diff --git a/todo.md b/todo.md index 21aec6c..a300187 100644 --- a/todo.md +++ b/todo.md @@ -11,18 +11,19 @@ direct call to: # build_tm.py -Detecting missing files -- en-compendium is missing -- error-compendium is missing -- gl-compendium is missing -- nb_no-compendium is missing -- sk-compendium is missing -- zh_hant-compendium is missing +move error detection (check_lang) into %language%/stats.json and display erros +move error files into %language%/stats.json and make these accessible via website +remove terminology (someone who wants it can do it locally) # build_stats.py when %package%/stats.json is empty, make sure it is counted as an existing package for which we were not able to extract anything (release stats) +# website + +list why we could not deduct error files +allow sort on all tables + # global From 13aaa42908435f691299e5873b738eaaf8752a33 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 09 2023 09:16:19 +0000 Subject: [PATCH 10/26] website: display progress as percents let's use human percents everywhere --- diff --git a/build_stats.py b/build_stats.py index 8a6f471..afa5305 100755 --- a/build_stats.py +++ b/build_stats.py @@ -53,12 +53,21 @@ def compute_language_statistics(languages_stats: dict, total_release_source_word result["totalsourcewordssum"] = result["translatedsourcewords"] + result["fuzzysourcewords"] + result[ "untranslatedsourcewords"] result["totalsourcewords_d"] = total_release_source_words - result["progress"] = result["translatedsourcewords"] / result["totalsourcewordssum"] - result["progress_d"] = result["translatedsourcewords"] / result["totalsourcewords_d"] + + # prevent a Runtime warning for languages with no content + if result["totalsourcewordssum"] > 0: + result["progress"] = (result["translatedsourcewords"] / result["totalsourcewordssum"]) * 100 + else: + result["progress"] = 0.0 + + result["progress_d"] = (result["translatedsourcewords"] / 
result["totalsourcewords_d"]) * 100 packages_stats = df[package_fields].groupby("package").sum() packages_stats["totalsourcewordssum"] = packages_stats["translatedsourcewords"] + packages_stats["fuzzysourcewords"] + packages_stats["untranslatedsourcewords"] - packages_stats["progress"] = packages_stats["translatedsourcewords"] / packages_stats["totalsourcewordssum"] + + packages_stats["progress"] = (packages_stats["translatedsourcewords"] / packages_stats["totalsourcewordssum"]) * 100 + # prevent NaN values when a package have total source words = 0 + packages_stats.fillna(0, inplace=True) packages_stats["team"] = df[["metadata_language_team", "package"]].groupby("package").first() result["packages"] = packages_stats.to_dict(orient="index") @@ -85,7 +94,9 @@ def compute_package_statistics(df: pd.DataFrame) -> dict: stats = df[po_fields].groupby(index).sum() stats["totalsourcewordssum"] = stats["translatedsourcewords"] + stats["fuzzysourcewords"] + stats["untranslatedsourcewords"] - stats["progress"] = stats["translatedsourcewords"] / stats["totalsourcewordssum"] + stats["progress"] = (stats["translatedsourcewords"] / stats["totalsourcewordssum"]) * 100 + # prevent NaN values when a package have total source words = 0 + stats.fillna(0, inplace=True) stats["team"] = df[["metadata_language_team", index]].groupby(index).first() df['filename'] = df.index stats["filename"] = df[["filename", index]].groupby(index).sum() diff --git a/templates/language.adoc b/templates/language.adoc index 68af78e..55e6b33 100644 --- a/templates/language.adoc +++ b/templates/language.adoc @@ -17,8 +17,8 @@ territories: Language progress for {{ lang_name_en }} ({{ lang_code }}) in Fedora {{ results }} is: -* {{ '{:.2f}'.format(progress) }} when we only look on started packages for this language. -* {{ '{:.2f}'.format(progress_d) }} when we compare to every single translatable string in Fedora {{ results }}. 
+* {{ '{:.2f}'.format(progress) }}% when we only look on started packages for this language. +* {{ '{:.2f}'.format(progress_d) }}% when we compare to every single translatable string in Fedora {{ results }}. Possible scripts are: {% for script in scripts -%}{{ script }} {%- endfor %} From 4a8425a9894c284ac2cec8244268dd7efeca505d Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 09 2023 09:48:04 +0000 Subject: [PATCH 11/26] adapt release wide stats adapt release wide counts with the new data model --- diff --git a/build_stats.py b/build_stats.py index afa5305..1d840c6 100755 --- a/build_stats.py +++ b/build_stats.py @@ -217,14 +217,34 @@ def main(): distribution_file = os.path.join(results_folder, "release.json") distribution_stats = dict() with open(os.path.join(results_folder, "data.json"), "r") as f: - distribution_stats["total_release_packages"] = len(json.load(f)) + distribution_stats["packages_count"] = len(json.load(f)) + + # detected = identified with translation files + distribution_stats["packages_detected_count"] = len(packages) + distribution_stats["files_detected_count"] = sum([len(package["po"]) for package in all_stats]) + + # processed = what we were able to use + distribution_stats["packages_processed_count"] = 0 + distribution_stats["files_processed_count"] = 0 + + for package in sorted(packages): + log.info(package) + stats_file = os.path.join(packages_folder, package, "stats.json") + + with open(stats_file, "r") as f: + stats = json.load(f) + + # if there is no source words, it means we were not able to process anything + if "totalsourcewords" in stats.keys(): + if stats["totalsourcewords"] > 0: + distribution_stats["packages_processed_count"] += 1 + + for _, detected in stats["po"].items(): + if detected["lang_code_chosen"] != "error": + distribution_stats["files_processed_count"] += 1 distribution_stats["totalsourcewords"] = total_distribution_source_words - distribution_stats["total_packages_with_stats"] = len(packages) - 
distribution_stats["total_packages_files"] = sum([len(package["po"]) for package in all_stats]) - distribution_stats["total_packages"] = len(packages) - distribution_stats["nb_files"] = len([file for file in all_stats if file.get("could_not_process", 0) == 0]) - distribution_stats["total_languages"] = len(languages) + distribution_stats["languages_processed_count"] = len(languages) log.info(distribution_stats) diff --git a/templates/_index.release.adoc b/templates/_index.release.adoc index 440074c..60d0de3 100644 --- a/templates/_index.release.adoc +++ b/templates/_index.release.adoc @@ -5,14 +5,14 @@ layout: "release" --- Fedora {{ release }}:: -* contains {{ total_release_packages }} packages, -* we identified {{ total_packages }} packages with translations files, -* it represents {{ total_packages_files }} translations files (po). +* contains {{ packages_count }} packages, +* we identified {{ packages_detected_count }} packages with translations files, +* it represents {{ files_detected_count }} translations files (po). What we were able to process:: -* {{ total_packages }} packages, -* {{ nb_files }} translation files containing {{ totalsourcewords }} words to translate, -* {{ total_languages }} languages. +* {{ packages_processed_count }} packages, +* {{ files_processed_count }} translation files containing {{ totalsourcewords }} words to translate, +* {{ languages_processed_count }} languages. Why such gaps?:: . 
File reading was not possible (encoding or format issue), From 8ca12160e1d90234cc46e6aa4a96d05de7e992eb Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 09 2023 10:54:32 +0000 Subject: [PATCH 12/26] Merge #45 `merge language stats files together` --- diff --git a/README.md b/README.md index 0975345..f060db3 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ This project aims at computing global statistics for Fedora/Linux operating syst * Motivation is described in https://fedoraproject.org/wiki/Changes/LocalizationMeasurementAndTooling * It is deployed in https://languages.fedoraproject.org and https://languages.stg.fedoraproject.org -* Infrastructure is hosted by https://console-openshift-console.apps.ocp.fedoraproject.org/ +* Infrastructure is hosted by https://console-openshift-console.apps.ocp.fedoraproject.org/ and https://console-openshift-console.apps.ocp.stg.fedoraproject.org * Infrastructure code is in https://pagure.io/fedora-infra/ansible/blob/main/f/roles/openshift-apps/languages # Licensing diff --git a/build.py b/build.py index e3c30f2..045a544 100755 --- a/build.py +++ b/build.py @@ -73,18 +73,18 @@ def main(): srpm_regex = None if args.filter: - srpm_regex = re.compile("^{}$".format(args.filter)) + srpm_regex = re.compile(f"^{args.filter}$") - packages_folder = "./results/{v}/packages/".format(v=args.results) - srpms_path = os.path.abspath("./results/{v}/srpms/".format(v=args.results)) + packages_folder = f"./results/{args.results}/packages/" + srpms_path = os.path.abspath("./srpms/") if not os.path.exists(packages_folder): os.makedirs(packages_folder) if not os.path.exists(srpms_path): os.makedirs(srpms_path) - data_file = os.path.join("./results/{v}/".format(v=args.results), "data.json") - srpm_list_file = os.path.join(srpms_path, "{v}.txt".format(v=args.results)) + data_file = os.path.join(f"./results/{args.results}/", "data.json") + srpm_list_file = os.path.join(srpms_path, f"{args.results}.txt") url_list = None if 
os.path.isfile(srpm_list_file): @@ -102,12 +102,12 @@ def main(): if dnf_file: dnf_fp = os.path.join("dnf", dnf_file) if os.path.isfile(dnf_fp): - dnf_args = "-c {}".format(dnf_fp) - log.info("Using dnf conf {}".format(dnf_file)) + dnf_args = f"-c {dnf_fp}" + log.info(f"Using dnf conf {dnf_file}") else: - log.warning("dnf conf {} not found".format(dnf_file)) + log.warning(f"dnf conf {dnf_file} not found") p = subprocess.Popen( - "dnf {dnf_args} download --source --skip-broken --url '*' | grep src.rpm".format(dnf_args=dnf_args), + f"dnf {dnf_args} download --source --skip-broken --url '*' | grep src.rpm", stdout=subprocess.PIPE, shell=True) diff --git a/build_language_list.py b/build_language_list.py index 828d079..efa7571 100755 --- a/build_language_list.py +++ b/build_language_list.py @@ -1,8 +1,7 @@ #!/usr/bin/env python3 -""" Parse translation files to deduct language list """ +""" Detect language for each translation file """ import argparse -import csv import glob import json import os @@ -11,7 +10,6 @@ import re import logging import utils -from shutil import rmtree from weblate_language_data import aliases, languages, language_codes, countries LOCAL_ALIASES = {"ca_valencia": "ca@valencia"} @@ -21,7 +19,7 @@ def main(): """Handle params""" parser = argparse.ArgumentParser( - description="Creates a list of languages form translation files" + description="Detect language for each translation file" ) parser.add_argument( @@ -31,20 +29,6 @@ def main(): parser.add_argument("--refresh", action="store_true", help="Force refresh") parser.add_argument( - "--describe", action="store_true", help="Describe the current list of languages" - ) - - parser.add_argument( - "--analyzelang", type=str, help="Produce an analyze file for a language" - ) - - parser.add_argument( - "--analyzealllangs", - action="store_true", - help="Produce an analyze file for all languages", - ) - - parser.add_argument( "-v", "--verbose", default=False, @@ -58,130 +42,20 @@ def main(): 
utils.set_logging(args.verbose, args.results) log = logging.getLogger("buildLanguageList") - results_folder = "./results/{v}/".format(v=args.results) - lang_folder = os.path.join(results_folder, "languages/") + results_folder = f"./results/{args.results}/" package_folder = os.path.join(results_folder, "packages/") - lang_analyze_folder = os.path.join(results_folder, "languages-analyses/") - if args.describe: - log.info("Describing detecting languages") - describe(lang_folder) - - elif args.analyzealllangs: - log.info("Provide more data to analyze errors") - rmtree(lang_analyze_folder, ignore_errors=True) - os.mkdir(lang_analyze_folder) - - langs = [ - f - for f in os.listdir(lang_folder) - if os.path.isfile(os.path.join(lang_folder, f)) - ] - for lang in sorted(langs): - analyze = analyze_lang(lang_folder, lang[: -len(".json")]) - - with open(os.path.join(lang_analyze_folder, lang), "w") as f: - f.write(json.dumps(analyze, indent=2)) - - elif args.analyzelang: - log.info("Provide more data to analyze errors") - if not os.path.exists(lang_analyze_folder): - os.makedirs(lang_analyze_folder) - - analyze = analyze_lang(lang_folder, args.analyzelang) - result_file = os.path.join(lang_analyze_folder, args.analyzelang + ".json") - - with open(result_file, "w") as f: - f.write(json.dumps(analyze, indent=2)) - - if args.refresh and os.path.isdir(lang_folder): - rmtree(lang_folder) - - if os.path.exists(lang_folder) is False: - log.info("Detecting the list of languages") - os.makedirs(lang_folder) - po_langs = detect_languages(package_folder, results_folder) - - for lang in po_langs.keys(): - with open(os.path.join(lang_folder, str(lang) + ".json"), "w") as f: - f.write(json.dumps(po_langs[lang], indent=2)) + scan_packages(package_folder, args.refresh) log.info("done") -def analyze_lang(lang_folder, analized_lang): - """ Analyze one lang """ - log = logging.getLogger("buildLanguageList.analyze_lang") - files = [] - results = dict() - with open(os.path.join(lang_folder, 
analized_lang + ".json"), "r") as read_file: - files = json.load(read_file)["po"] - - log.info(" Analysing language {l}, with {c} files".format(l=analized_lang, c=len(files))) - - for file in files: - metadata = dict() - try: - metadata = polib.pofile(file).metadata - except OSError: - # maybe a polib bug? to investigate before using it in TM - metadata["Language"] = "error-os" - except TypeError: - metadata["Language"] = "error-type" - except UnicodeDecodeError: - # encoding error, to investigate before using it in TM - metadata["Language"] = "error-unicode" - - if "Language" not in metadata.keys(): - metadata["Language"] = "zzz_null" - elif metadata["Language"] == "": - metadata["Language"] = "zzz_empty" - - language = results.get(metadata.get("Language"), dict()) - - count = language.get("Count", 0) - count += 1 - language["Count"] = count - - lang_files = language.get("Files", []) - lang_files.append(file) - language["Files"] = sorted(lang_files) - - plurals = language.get("Plural-Forms", []) - plurals.append(metadata.get("Plural-Forms")) - plurals = list(set(plurals)) - language["Plural-Forms"] = plurals - - teams = language.get("Language-Team", []) - teams.append(metadata.get("Language-Team")) - teams = list(set(teams)) - language["Language-Team"] = teams - - results[metadata.get("Language")] = language - - return dict(sorted(results.items(), key=lambda item: item[0])) - - -def describe(lang_folder): - """ Provide the number of files per language """ - log = logging.getLogger("buildLanguageList.describe") - langs = [ - f - for f in os.listdir(lang_folder) - if os.path.isfile(os.path.join(lang_folder, f)) - ] - - for lang in sorted(langs): - with open(os.path.join(lang_folder, lang), "r") as read_file: - files = json.load(read_file) - - log.info(" {l}:{c}".format(l=lang[:-len('.json')], c=len(files))) - - -def detect_languages(package_folder, results_folder): - """ For each po file, detect metadatas and deduct the language """ - """ Requires: a file hierarchy 
with po files """ - """ Returns: a dictionary of lists, key=lang code, value=file list """ +def scan_packages(package_folder: str, refresh: bool): + """ For each po file, detect metadata and deduct the language + :param refresh: force to compute again the values + :param package_folder: where to find packages hierarchy with discover.json + :return: a dictionary of lists, key=lang code, value=file lis + """ log = logging.getLogger("buildLanguageList.detect_languages") langs = {} packages = [ @@ -190,31 +64,48 @@ def detect_languages(package_folder, results_folder): if os.path.isdir(os.path.join(package_folder, f)) ] - log_file = os.path.join(results_folder, "build_language_list.log") - debug_file = list() count = 0 + processed_files_count = 0 + processed_files_duplicates_count = 0 total = len(packages) for package in sorted(packages): count += 1 - log.info("{c}/{t} {p}".format(c=count, t=total, p=package)) + log.info(f"{count}/{total} {package}") discovery_file = os.path.join(package_folder, package, "discover.json") + languages_file = os.path.join(package_folder, package, "stats.json") + + if os.path.isfile(languages_file) is True: + if refresh is False: + log.info("Language file already exist, no need to process") + continue + + processed_files = dict() with open(discovery_file, "r") as read_file: - alls = json.load(read_file) + discover_patterns = json.load(read_file) - to_process = [p for p in alls if p["file_format"] == "po"] + po_patterns = [p for p in discover_patterns if p["file_format"] == "po"] - for pattern in to_process: - mask = os.path.join(package_folder, package, pattern["filemask"]) - p = re.compile(mask.replace("*", "(.*)").replace("+", r"\+")) + for pattern in po_patterns: + filemask = os.path.join(package_folder, package, pattern["filemask"]) + p = re.compile(filemask.replace("*", "(.*)").replace("+", r"\+")) - for po in glob.glob(mask): - result = p.search(po) - lang_code = result.group(1) + for po_file in glob.glob(filemask): + + if po_file in 
processed_files.get("po", {}).keys(): + # there is no need to process the file it were processed already + log.debug(f"{po_file} were already processed") + processed_files_duplicates_count += 1 + continue + + processed_files_count += 1 + + result = p.search(po_file) + path_lang_code = result.group(1) metadata = dict() error = "" try: - metadata = polib.pofile(po).metadata + metadata = polib.pofile(po_file).metadata except UnicodeDecodeError: # encoding error, to investigate before using it in TM error = "error-unicode" @@ -224,35 +115,36 @@ def detect_languages(package_folder, results_folder): # maybe a polib bug? to investigate before using it in TM error = "error-os" - lang, decision = choose_lang(lang_code, metadata, error) + lang, decision = choose_language_code_from_po(path_lang_code, metadata) - debug_file.append([ - po, - lang_code, - metadata.get("Language", ""), - error, - lang, - str(decision), - ]) + debug_file = {"lang_in_path": path_lang_code, + "metadata_lang": metadata.get("Language", ""), + "metadata_plurals": metadata.get("Plural-Forms", ""), + "metadata_language_team": metadata.get("Language-Team", ""), + "polib_error": error, + "lang_code_chosen": lang, + "lang_code_decision": str(decision) + } - lang_result = langs.get(lang, dict()) - po_results = lang_result.get("po", list()) - po_results.append(po) - lang_result["po"] = po_results + processed_po_files = processed_files.get("po", {}) + processed_po_files[po_file] = debug_file + processed_files["po"] = processed_po_files - langs[lang] = lang_result + with open(languages_file, "w") as f: + json.dump(processed_files, f, indent=2) - with open(log_file, "w") as file_object: - write_file_object = csv.writer(file_object) - write_file_object.writerows(debug_file) + log.info(f"Done {processed_files_count} files were processed, we skipped {processed_files_duplicates_count} duplicates") return langs -def choose_lang(filename, metadata, error): - """ From a po file and its medata, choose the most likely 
language code """ - """ By priority: the Language medata """ - """ Returns: a language code """ +def choose_language_code_from_po(filename: str, metadata: dict[str]) -> tuple[str, int]: + """ Deduct a language code from a filename and its metadata + + :param filename: po filename + :param metadata: po metadata + :return: a language code, a decision path + """ log = logging.getLogger("buildLanguageList.choose_lang") lang = "noresult" diff --git a/build_stats.py b/build_stats.py index 50a5bea..1d840c6 100755 --- a/build_stats.py +++ b/build_stats.py @@ -1,25 +1,115 @@ #!/usr/bin/env python3 -"""For each package, compute stats""" +""" Computes stats for each package with translations and each detected language """ import argparse -import glob import json +import logging import os -import shutil import subprocess +from collections import defaultdict -import polib -import logging -import utils +from numpyencoder import NumpyEncoder +import pandas as pd from translate.tools.pocount import calcstats +import utils + + +def compute_language_statistics(languages_stats: dict, total_release_source_words: int) -> dict: + """ For each language, produce global statistics and per package statistics + + global statistics target: + "totalsourcewordssum": total words on started packages + "totalsourcewords_d": total words in release + "translatedsourcewordssum": total translated words + "progress": current translation progress on started packages (in percents) + "progress_d": current translation progress on all strings in release (in percents) + + per package statistics target: + "name": package name + "progress": current translation progress (in percents) + "translated": total translated words (source words, it can vary in target language) + "team": language team info + """ + log = logging.getLogger("buildStats.compute_language_statistics") + + results_languages = dict() + po_fields = ["translatedsourcewords", "fuzzysourcewords", "untranslatedsourcewords", "translated", "fuzzy", + 
"untranslated", "translatedtargetwords"] + package_fields = ["translatedsourcewords", "fuzzysourcewords", "untranslatedsourcewords", "translated", "fuzzy", + "untranslated", "translatedtargetwords", "package"] + + for code, stats in languages_stats.items(): + results_languages[code] = {} + results_languages[code]["po"] = stats + result = {} + + df = pd.DataFrame.from_records(stats) + + for kpi in po_fields: + result[kpi] = df[kpi].sum() + + result["totalsourcewordssum"] = result["translatedsourcewords"] + result["fuzzysourcewords"] + result[ + "untranslatedsourcewords"] + result["totalsourcewords_d"] = total_release_source_words + + # prevent a Runtime warning for languages with no content + if result["totalsourcewordssum"] > 0: + result["progress"] = (result["translatedsourcewords"] / result["totalsourcewordssum"]) * 100 + else: + result["progress"] = 0.0 + + result["progress_d"] = (result["translatedsourcewords"] / result["totalsourcewords_d"]) * 100 + + packages_stats = df[package_fields].groupby("package").sum() + packages_stats["totalsourcewordssum"] = packages_stats["translatedsourcewords"] + packages_stats["fuzzysourcewords"] + packages_stats["untranslatedsourcewords"] + + packages_stats["progress"] = (packages_stats["translatedsourcewords"] / packages_stats["totalsourcewordssum"]) * 100 + # prevent NaN values when a package have total source words = 0 + packages_stats.fillna(0, inplace=True) + packages_stats["team"] = df[["metadata_language_team", "package"]].groupby("package").first() + result["packages"] = packages_stats.to_dict(orient="index") + + results_languages[code].update(result) + + return results_languages + + +def compute_package_statistics(df: pd.DataFrame) -> dict: + """ For each package, per language statistics + + global statistics target: + "lang_code": language code + "team": language team info + "progress": current translation progress (in percents), + "translated": total translated words (source words, it can vary in target language) + 
"filename": list of files considered for statistics + """ + log = logging.getLogger("buildStats.compute_language_statistics") + results = dict() + index = "lang_code_chosen" + po_fields = ["translatedsourcewords", "fuzzysourcewords", "untranslatedsourcewords", "translated", "fuzzy", + "untranslated", "translatedtargetwords", index] + + stats = df[po_fields].groupby(index).sum() + stats["totalsourcewordssum"] = stats["translatedsourcewords"] + stats["fuzzysourcewords"] + stats["untranslatedsourcewords"] + stats["progress"] = (stats["translatedsourcewords"] / stats["totalsourcewordssum"]) * 100 + # prevent NaN values when a package have total source words = 0 + stats.fillna(0, inplace=True) + stats["team"] = df[["metadata_language_team", index]].groupby(index).first() + df['filename'] = df.index + stats["filename"] = df[["filename", index]].groupby(index).sum() + results["languages"] = stats.to_dict(orient="index") + + return results + def main(): """Handle params""" parser = argparse.ArgumentParser( - description="Computes stats for each package with translations" + description="Computes stats for each package with translations and each detected language" ) parser.add_argument( "--results", required=True, help="Set the results folder to use" @@ -40,18 +130,10 @@ def main(): utils.set_logging(args.verbose, args.results) log = logging.getLogger("buildStats") - results_folder = "./results/{v}/".format(v=args.results) - packages_folder = "./results/{v}/packages/".format(v=args.results) - packages_stats_folder = "./results/{v}/packages-stats/".format(v=args.results) - languages_folder = "./results/{v}/languages/".format(v=args.results) - languages_stats_folder = "./results/{v}/languages-stats/".format(v=args.results) - - for folder in [ - packages_stats_folder, - languages_stats_folder - ]: - if args.refresh and os.path.isdir(folder): - shutil.rmtree(folder) + results_folder = f"./results/{args.results}/" + packages_folder = f"./results/{args.results}/packages/" + 
languages_stats_folder = f"./results/{args.results}/languages/" + os.makedirs(languages_stats_folder, exist_ok=True) log.info("Computing packages stats") packages = [ @@ -60,187 +142,144 @@ def main(): if os.path.isdir(os.path.join(packages_folder, f)) ] count = 0 - distribution_stats = dict() - - if not os.path.exists(packages_stats_folder): - os.makedirs(packages_stats_folder) + all_stats = list() for package in sorted(packages): count += 1 - log.info(" {c}/{t} - {p}".format(c=count, t=len(packages), p=package)) + log.info(f" {count}/{len(packages)} - {package}") + stats_file = os.path.join(packages_folder, package, "stats.json") - src_folder = os.path.join(packages_folder, package) - stats_file = os.path.join(packages_stats_folder, package + ".json") + with open(stats_file, "r") as f: + stats = json.load(f) - if os.path.isfile(stats_file) is False: - with open(os.path.join(packages_folder, package, "discover.json"), "r") as f: - discoveries = json.load(f) + stats["package"] = package - results = dict() - for discover in discoveries: - files = glob.glob(os.path.join(src_folder, discover["filemask"])) + # some packages have no detected po files + if "po" not in stats.keys(): + continue - if discover["file_format"] == "po": - results[discover["filemask"]] = get_po_translation_level( - files, stats_file - ) + for file in stats["po"].keys(): + if "translated" in stats["po"][file].keys() \ + and args.refresh is False: + log.debug(f"{file} is already processed") + continue - if len(results) == 0: - log.warning("No translation file found?") - else: - with open(stats_file, "w") as f: - json.dump(results, f, indent=2) - else: - with open(stats_file, "r") as f: - results = json.load(f) + stats["po"][file].update(get_po_translation_level(file)) - distribution_stats = extract_release_stats(distribution_stats, results) + df = pd.DataFrame.from_dict(stats["po"], orient='index') + stats["stats"] = compute_package_statistics(df) + stats["totalsourcewords"] = 
df[["translatedsourcewords", "fuzzysourcewords", "untranslatedsourcewords"]].sum().sum() - log.info("Computing language stats") - languages = [f for f in os.listdir(languages_folder)] - count = 0 + with open(stats_file, "w") as f: + json.dump(stats, f, indent=2, cls=NumpyEncoder) - languages_stats_folder = languages_stats_folder - if not os.path.exists(languages_stats_folder): - os.makedirs(languages_stats_folder) + all_stats.append(stats) - for language in sorted(languages): - count += 1 - lang = language[:-5] + log.info("Aggregating language stats") + languages = defaultdict(list) + total_distribution_source_words = 0 + for package in all_stats: + max_languages = defaultdict(int) + for filename, stats in package["po"].items(): + lang_code = stats["lang_code_chosen"] + stats["filename"] = filename + stats["package"] = package["package"] - log.info(" {c}/{t} - {l}".format(c=count, t=len(languages), l=lang)) - with open(os.path.join(languages_folder, language), "r") as f: - discoveries = json.load(f) + languages[lang_code].append(stats) - stats_file = os.path.join(languages_stats_folder, lang + ".json") + max_languages[lang_code] += stats["translatedsourcewords"] + max_languages[lang_code] += stats["untranslatedsourcewords"] - if os.path.isfile(stats_file): - continue + try: + del max_languages["error"] + except KeyError: + pass + total_distribution_source_words += max(max_languages.values()) - files = discoveries.get("po", []) - if files: - with open(stats_file, "w") as f: - json.dump(get_po_translation_level(files, stats_file), f, indent=2) + log.info("Storing language stats") + for lang_code, language in languages.items(): + stats_file = os.path.join(languages_stats_folder, f"{lang_code}.json") + with open(stats_file, "w") as f: + json.dump(language, f, indent=2) - log.info("Storing distribution stats") + log.info("Computing language stats") + languages = compute_language_statistics(languages, total_distribution_source_words) - distribution_file = 
os.path.join(results_folder, "release.json") - with open(os.path.join(results_folder, "data.json"), "r") as f: - distribution_stats["total_release_packages"] = len(json.load(f)) + log.info("Storing language stats") + for lang_code, language in languages.items(): + stats_file = os.path.join(languages_stats_folder, f"{lang_code}.json") + with open(stats_file, "w") as f: + json.dump(language, f, indent=2, cls=NumpyEncoder) - total_packages_files = list() - for base, dirs, files in os.walk(packages_folder): - for file in files: - if file != "discover.json": - total_packages_files.append(os.path.join(base, file)) - distribution_stats["total_packages_files"] = len(total_packages_files) - distribution_stats["total_packages"] = len(packages) - distribution_stats["nb_files"] = len(list(set(distribution_stats["files"]))) + log.info("Processing distribution stats") - packages_with_stats = [f for f in os.listdir(packages_stats_folder) if os.path.isfile(os.path.join(packages_stats_folder, f))] - distribution_stats["total_packages_with_stats"] = len(packages_with_stats) - distribution_stats["total_languages"] = len(languages) - with open(distribution_file, "w") as f: - json.dump(distribution_stats, f, indent=2) + distribution_file = os.path.join(results_folder, "release.json") + distribution_stats = dict() + with open(os.path.join(results_folder, "data.json"), "r") as f: + distribution_stats["packages_count"] = len(json.load(f)) - log.info("Searching for bugs ;)") - used_files = list(set(distribution_stats["files"])) - if len(total_packages_files) != len(used_files): - log.debug("source:{s} used: {u}".format(s=len(total_packages_files), u=len(used_files))) - missing_files = [source for source in total_packages_files if not source in used_files] - missing_files_po = [file for file in missing_files if file.endswith(".po")] - if len(missing_files_po) > 0: - log.debug("Some po files are missing") - distribution_file = os.path.join(results_folder, "build_stats.missing_po_files.json") 
- with open(distribution_file, "w") as f: - json.dump(missing_files_po, f, indent=2) + # detected = identified with translation files + distribution_stats["packages_detected_count"] = len(packages) + distribution_stats["files_detected_count"] = sum([len(package["po"]) for package in all_stats]) - log.info("done") + # processed = what we were able to use + distribution_stats["packages_processed_count"] = 0 + distribution_stats["files_processed_count"] = 0 + for package in sorted(packages): + log.info(package) + stats_file = os.path.join(packages_folder, package, "stats.json") -def get_po_translation_level(files, stats_file): - """ Compute results """ - log = logging.getLogger("buildStats.get_po_translation_level") - stats = dict() + with open(stats_file, "r") as f: + stats = json.load(f) - for file in files: - # remove non standard comments - # taken from: https://github.com/translate/translate/blob/master/tools/pocommentclean - command = ["sed", "-i", "/^#$/d;/^#[^\:\~,\.]/d", file] - subprocess.run(command, check=True, capture_output=True) + # if there is no source words, it means we were not able to process anything + if "totalsourcewords" in stats.keys(): + if stats["totalsourcewords"] > 0: + distribution_stats["packages_processed_count"] += 1 - try: - stat = calcstats(file) - except Exception as e: - log.error(" {f} triggered an {t} exception: {e}".format(f=file, t=type(e).__name__, e=e)) - continue + for _, detected in stats["po"].items(): + if detected["lang_code_chosen"] != "error": + distribution_stats["files_processed_count"] += 1 + + distribution_stats["totalsourcewords"] = total_distribution_source_words + distribution_stats["languages_processed_count"] = len(languages) - keys = [ - "translatedsourcewords", - "fuzzysourcewords", - "untranslatedsourcewords", - "translated", - "fuzzy", - "untranslated", - "translatedtargetwords", - ] - results = dict() - for key in keys: - results[key] = stat.get(key, 0) + log.info(distribution_stats) - results["team"] = 
get_language_team(file) + log.info("Storing distribution stats") + with open(distribution_file, "w") as f: + json.dump(distribution_stats, f, indent=2) - stats[file] = results + log.info("done") - return stats +def get_po_translation_level(file: str) -> dict: + """ Call pocount to get translation stats for a file """ + log = logging.getLogger("buildStats.get_po_translation_level") + command = ["sed", "-i", "/^#$/d;/^#[^\:\~,\.]/d", file] + subprocess.run(command, check=True, capture_output=True) -def get_language_team(file): - log = logging.getLogger("buildStats.get_language_team") - metadata = dict() - try: - metadata = polib.pofile(file).metadata - except OSError: - # maybe a polib bug? to investigate before using it in TM - metadata["Language"] = "error-os" - except UnicodeDecodeError: - # encoding error, to investigate before using it in TM - metadata["Language"] = "error-unicode" - except TypeError: - # TypeError: '>' not supported between instances of 'str' and 'int' - metadata["Language"] = "error-valuerror" - - team = "Unknown..." try: - team = metadata["Language-Team"] - except KeyError: - log.debug("The file {f} have no Language team? 
Here are the metadata: {m}".format(f=file, m=metadata)) - return team - - -def extract_release_stats(results, files_stats): - log = logging.getLogger("buildStats.extract_release_stats") - number_of_packages = results.get("nb_packages", 0) - number_of_packages += 1 - files = results.get("files", list()) - total_source_words = results.get("totalsourcewords", 0) - - for template in files_stats: - maxresult = 0 - for file in files_stats[template]: - translated = files_stats[template][file]["translatedsourcewords"] - untranslated = files_stats[template][file]["untranslatedsourcewords"] - maxresult = max(maxresult, translated + untranslated) - files.append(file) - - total_source_words += maxresult - - results = { - "nb_packages": number_of_packages, - "files": files, - "totalsourcewords": total_source_words, - } + stat = calcstats(file) + except Exception as e: + log.error(f" {file} triggered an {type(e).__name__} exception: {e}") + stat = {"could_not_process": 1} + + keys = [ + "translatedsourcewords", + "fuzzysourcewords", + "untranslatedsourcewords", + "translated", + "fuzzy", + "untranslated", + "translatedtargetwords", + "could_not_process" + ] + results = dict() + for key in keys: + results[key] = stat.get(key, 0) return results diff --git a/build_tm.py b/build_tm.py index 238c2f0..4be4d93 100755 --- a/build_tm.py +++ b/build_tm.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Consolidate each po files into compendium""" +""" Creates useful translator files for every language """ import argparse import gzip @@ -17,7 +17,7 @@ def main(): """Handle params""" parser = argparse.ArgumentParser( - description="Creates useful translator files for every languages" + description="Creates useful translator files for every language" ) parser.add_argument( @@ -39,13 +39,11 @@ def main(): utils.set_logging(args.verbose, args.results) log = logging.getLogger("buildTm") - results_folder = "./results/{v}/".format(v=args.results) + results_folder = f"./results/{args.results}/" 
lang_path = os.path.join(results_folder, "languages/") tm_folder = os.path.join(results_folder, "languages-tm/") - debug_folder = os.path.join(results_folder, "debug_folder/") os.makedirs(tm_folder, exist_ok=True) - os.makedirs(debug_folder, exist_ok=True) # clean destination folders if args.refresh and os.path.isdir(tm_folder): @@ -54,58 +52,51 @@ def main(): if os.path.exists(tm_folder) is False: os.makedirs(tm_folder) - log.info("Building the translation memory for every languages") + log.info("Find detected languages") - langs = [ + languages = [ f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f)) ] - for lang in sorted(langs): - lang_code = lang[: -len(".json")] + for language in sorted(languages): + language_code = language[: -len(".json")] - log.info("Processing {l}".format(l=lang_code)) + log.info(f"Processing {language_code}") - with open(os.path.join(lang_path, lang), "r") as read_file: + with open(os.path.join(lang_path, language), "r") as read_file: files = json.load(read_file)["po"] + files = [f["filename"] for f in files] - compendium_file = os.path.join(tm_folder, lang_code + ".po") + compendium_file = os.path.join(tm_folder, f"{language_code}.po") compendium_file = os.path.join( os.path.dirname(os.path.abspath(__file__)), compendium_file ) - compendium_archive = compendium_file + ".gz" + compendium_archive = f"{compendium_file}.gz" if os.path.isfile(compendium_file) is False and os.path.isfile(compendium_archive) is False: log.info("Compendium generation") - process_compendium(files, compendium_file, debug_folder) + process_compendium(files, compendium_file, tm_folder, language_code) # remove non standard comments # taken from: https://github.com/translate/translate/blob/master/tools/pocommentclean command = ["sed", "-i", "/^#$/d;/^#[^\:\~,\.]/d", compendium_file] subprocess.run(command, check=True, capture_output=True) - tmx_file = os.path.join(tm_folder, lang_code + ".tmx") - tmx_archive = tmx_file + ".gz" + tmx_file = 
os.path.join(tm_folder, f"{language_code}.tmx") + tmx_archive = f"{tmx_file}.gz" if os.path.isfile(tmx_file) is False and os.path.isfile(tmx_archive) is False: log.info("TMX generation") try: - process_tmx(lang_code, compendium_file, tmx_file) + process_tmx(language_code, compendium_file, tmx_file) except Exception as e: - log.error( - " TMX generation triggered an {t} exception: {e}".format( - t=type(e).__name__, e=e - ) - ) - - terminology_file = os.path.join(tm_folder, lang_code + ".terminology.po") - terminology_archive = terminology_file + ".gz" + log.error(f" TMX generation triggered an {type(e)} exception: {e}") + + terminology_file = os.path.join(tm_folder, f"{language_code}.terminology.po") + terminology_archive = f"{terminology_file}.gz" if os.path.isfile(terminology_file) is False and os.path.isfile(terminology_archive) is False: log.info("Terminology generation") try: process_terminology(compendium_file, terminology_file) except Exception as e: - log.error( - " Terminology generation triggered an {t} exception: {e}".format( - t=type(e).__name__, e=e - ) - ) + log.error(f" Terminology generation triggered an {type(e)} exception: {e}") if args.compress: if os.path.isfile(compendium_file): @@ -120,28 +111,26 @@ def main(): log.info("All languages are processed") log.info("Detecting missing files") - for lang in sorted(langs): - check_lang(lang[: -len(".json")], tm_folder, args.compress) + for language in sorted(languages): + check_lang(language[: -len(".json")], tm_folder, args.compress) log.info("done") -def process_compendium(langfiles, dest, debug_folder): +def process_compendium(po_files: list, destination_file, debug_folder: str, language_code: str) -> None: """ Generate a compendium (a concatenation of all po files) """ log = logging.getLogger("buildTm.process_compendium") - pofiles = [ - os.path.join(os.path.dirname(os.path.abspath(__file__)), f) for f in langfiles - ] + po_files = [os.path.join(os.path.dirname(os.path.abspath(__file__)), f) for f 
in po_files] count = 0 with tempfile.TemporaryDirectory(prefix="l10n-tm") as tmp: - for i in pofiles: + for file in po_files: try: command = [ "msguniq", - i, + file, "--output-file", count.__str__(), "--no-location", @@ -151,7 +140,7 @@ def process_compendium(langfiles, dest, debug_folder): try: command = [ "msguniq", - i, + file, "--output-file", count.__str__(), "--to-code", @@ -160,22 +149,22 @@ def process_compendium(langfiles, dest, debug_folder): ] subprocess.run(command, check=True, cwd=tmp, capture_output=True) except subprocess.CalledProcessError as e: - debug_filename = "tm-msguniq-{lang}-{name}".format(lang=dest.split("/")[-1], name=count.__str__()) - log.error(" msguniq error with {i} a copy of this file is into {d} as {n}".format(i=i, e=e.output, - d=debug_folder, - n=debug_filename)) - shutil.copyfile(i, os.path.join(debug_folder, debug_filename)) + short_filename = os.path.relpath(file, os.path.dirname(os.path.abspath(__file__))) + short_filename = "_".join(short_filename.split(sep=os.path.sep)) + debug_filename = os.path.join(debug_folder, f"{language_code}-tm-msguniq-{short_filename}") + log.error(f" msguniq error, a copy of this file is into {debug_filename}") + shutil.copyfile(file, debug_filename) count += 1 all_files = [f for f in os.listdir(tmp) if os.path.isfile(os.path.join(tmp, f))] if len(all_files) == 1: - shutil.copyfile(os.path.join(tmp, all_files[0]), dest) + shutil.copyfile(os.path.join(tmp, all_files[0]), destination_file) else: - msgcat_loop(dest, tmp, debug_folder, all_files) + msgcat_loop(destination_file, tmp, debug_folder, all_files, language_code) -def msgcat(files, destination, path): +def msgcat(files: list[str], destination: str, path: str): """ Call the msgcat command on a list of po files Return stderr, if any """ command = [ @@ -195,15 +184,8 @@ def msgcat(files, destination, path): stderr = e.stderr.decode('utf8') return stderr -def store_debug_file(path, name, file, debug_folder): - """ Move the temporary move file in 
debug folder """ - log = logging.getLogger("buildTm.store_debug_file") - target = os.path.join(debug_folder, "{n}-{f}".format(n=name, f=file)) - log.error("The file {f} were moved into {t}".format(f=file, t=target)) - shutil.move(os.path.join(path, file), target) - -def msgcat_loop(destination, path, debug_folder, files): +def msgcat_loop(destination: str, path: str, debug_folder: str, files: list[str], language: str) -> None: """ call msgcat, and exclude any problematic files """ log = logging.getLogger("buildTm.msgcat_loop") log.debug("Starting msgcat loop") @@ -213,68 +195,72 @@ def msgcat_loop(destination, path, debug_folder, files): ids += re.findall(r"\d+:\d+: (\d+): input is not valid in", ret) if ids: file = ids[0] - log.debug("This file raised a msgcat bug: {f}".format(f=file)) - store_debug_file(path, "tm-msgcat-" + destination.split("/")[-1], file, debug_folder) + short_filename = os.path.relpath(file, os.path.dirname(os.path.abspath(__file__))) + short_filename = "_".join(short_filename.split(sep=os.path.sep)) + destination_file = f"{language}-tm-msgcat-{short_filename}" + target = os.path.join(debug_folder, f"{destination_file}") + log.error(f"msgcat error, a copy of this file is into {target}") + shutil.move(os.path.join(path, file), target) files.remove(file) else: # nothing found in stderr if os.path.isfile(destination) is False: # and destination not here : unhandled exception # TODO: maybe actually throw an exception here? 
- log.error("Error with msgcat: {e}".format(e=ret)) - return False + log.error(f"Error with msgcat: {ret}") + return # no stderr and final file is here : all good break log.debug("next try") log.debug("msgcat loop over") -def process_tmx(lang, source, dest): +def process_tmx(lang: str, source: str, destination: str) -> None: """ Generate a translation memory from a po file """ - command = ["po2tmx", "--language=" + lang, "--progress=none", source, "--output=" + dest] + command = ["po2tmx", f"--language={lang}", "--progress=none", source, f"--output={destination}"] subprocess.run(command, check=True, capture_output=True) -def process_terminology(source, dest): - """ Generate a termonology from a po file """ +def process_terminology(compendium: str, destination: str) -> None: + """ Generate a terminology from a po file """ command = ["poterminology", "--ignore-case", "--fold-titlecase", "--inputs-needed", "1", - "--progress=none", source, "--output=" + dest] + "--progress=none", compendium, f"--output={destination}"] subprocess.run(command, check=True, capture_output=True) -def check_lang(lang, tm_folder, compress): +def check_lang(lang: str, tm_folder: str, to_compress: bool) -> None: """ Check if expected files were generated """ log = logging.getLogger("buildTm.check_lang") - compendium_file = os.path.join(tm_folder, lang + ".po") - tmx_file = os.path.join(tm_folder, lang + ".tmx") - terminology_file = os.path.join(tm_folder, lang + ".terminology.po") + compendium_file = os.path.join(tm_folder, f"{lang}.po") + tmx_file = os.path.join(tm_folder, f"{lang}.tmx") + terminology_file = os.path.join(tm_folder, f"{lang}.terminology.po") - if compress is True: + if to_compress is True: compendium_file += ".gz" tmx_file += ".gz" terminology_file += ".gz" if os.path.isfile(compendium_file) is False: - log.warning("{l}-compendium is missing".format(l=lang)) + log.warning(f"{lang}-compendium is missing") if os.path.isfile(tmx_file) is False: - log.warning("{l}-tmx is 
missing".format(l=lang)) + log.warning(f"{lang}-tmx is missing") if os.path.isfile(terminology_file) is False: - log.warning("{l}-terminology is missing".format(l=lang)) + log.warning(f"{lang}-terminology is missing") -def compress(source, archive): - """ Compress files uzing gzip """ +def compress(source: str, destination_file: str) -> None: + """ Compress files using gzip """ log = logging.getLogger("buildTm.compress") - log.debug("Compressing {s}".format(s=source)) + log.debug(f"Compressing {source}") with open(source, "rb") as file_in: - with gzip.open(archive, "wb") as file_out: + with gzip.open(destination_file, "wb") as file_out: file_out.writelines(file_in) os.remove(source) diff --git a/build_website.py b/build_website.py index 450a706..362d914 100755 --- a/build_website.py +++ b/build_website.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Consolidate each po files into compendium""" +""" Generate static asciidoc pages from generated statistics """ import argparse import datetime @@ -15,10 +15,31 @@ import logging import utils +def get_territories_for_language(language_name: str, cldr_languages: dict) -> list: + log = logging.getLogger("buildWebsite.get_territory") + + code = language_name.split("_", 1)[0] # ro_MD or zh_Hant_HK + code = code.split("@", 1)[0] # ca@valencia + + territories = cldr_languages.get(code, {}).get("_territories", []) + territories = territories + cldr_languages.get(code + "-alt-secondary", {}).get("_territories", []) + + # if language contains a territory code, then only keep this one + if len(language_name.split("_")) > 1: + if language_name.split("_")[-1] in territories: + territories = [language_name.split("_")[-1]] + + if len(territories) == 0: + territories = ["not-found-in-cldr"] + log.warning(f"The language {code} does not exist in territories data from CLDR") + + return territories + + def main(): """Handle params""" - parser = argparse.ArgumentParser(description="") + parser = argparse.ArgumentParser(description="Generate 
static asciidoc pages from generated statistics") parser.add_argument( "--results", @@ -45,26 +66,20 @@ def main(): utils.set_logging(args.verbose, args.results) log = logging.getLogger("buildWebsite") - results_folder = "./results/{v}/".format(v=args.results) - langs_log = os.path.join(results_folder, "build_language_list.log") - langs_stats = os.path.join(results_folder, "languages-stats") - packages_stats = os.path.join(results_folder, "packages-stats") - - data_langs_folder = os.path.join(results_folder, "languages-website") - data_pkgs_folder = os.path.join(results_folder, "packages-website") + results_folder = f"./results/{args.results}/" + langs_stats = os.path.join(results_folder, "languages") + packages_stats = os.path.join(results_folder, "packages") tm_folder = os.path.join(results_folder, "languages-tm") - static_folder = "./website/content/{v}/".format(v=args.results) + static_folder = f"./website/content/{args.results}/" static_territories_folder = "./website/content/territories" static_langs_folder = os.path.join(static_folder, "language") static_pkgs_folder = os.path.join(static_folder, "package") - static_tm_folder = "./website/static/{v}/".format(v=args.results) + static_tm_folder = f"./website/static/{args.results}/" # clean destination folders for folder in [ - data_langs_folder, - data_pkgs_folder, static_langs_folder, static_pkgs_folder, static_tm_folder, @@ -77,61 +92,10 @@ def main(): log.info("Get distribution stats") distribution_stats = json.load(open(os.path.join(results_folder, "release.json"))) - log.info("Prepare json files for packages") - packages = [ - d - for d in os.listdir(packages_stats) - if os.path.isfile(os.path.join(packages_stats, d)) - ] - log_files = pd.read_csv(langs_log, header=None, skipinitialspace=True) - log_files = log_files.iloc[:, [0, 4]] - log_files.columns = ["filename", "lang_code"] - - packages_langs_results = dict() - count = 0 - total = len(packages) - for package in sorted(packages): - count += 1 - 
log.debug("Preparing package {c}/{t} - {p}".format(c=count, t=total, p=package)) - package_name = package[: -len(".json")] - package_statistics_file = os.path.join(data_pkgs_folder, package_name + ".json") - file_stats = os.path.join(packages_stats, package_name + ".json") - - results = consolidate_package_stats(file_stats, log_files, os.path.join(results_folder, "package", package_name)) - store_json_file(results, package_statistics_file) - for lang in results.get("stats", []): - val = packages_langs_results.get(lang["lang_code"], []) - val.append( - { - "name": package_name, - "progress": lang["progress"], - "translated": lang["translated"], - "team": lang["team"], - } - ) - packages_langs_results[lang["lang_code"]] = val - - log.info("Prepare json files for languages") - languages = [ - f - for f in os.listdir(langs_stats) - if os.path.isfile(os.path.join(langs_stats, f)) - ] - for lang in sorted(languages): - if lang.endswith(".json"): - code = lang[: -len(".json")] - package_statistics_file = os.path.join(data_langs_folder, code + ".json") - - if os.path.isfile(package_statistics_file): - continue - - results = consolidate_language_stats(os.path.join(langs_stats, lang), distribution_stats) - results["packages"] = packages_langs_results.get(code, dict()) - store_json_file(results, package_statistics_file) - log.info("Load CLDR data") with open("CLDR-raw/languageData.json", "r") as read_file: cldr_languages = json.load(read_file) + cldr_version = cldr_languages["supplemental"]["version"]["_cldrVersion"] cldr_languages = cldr_languages["supplemental"]["languageData"] with open("CLDR-raw/territories.json", "r") as read_file: @@ -145,53 +109,38 @@ def main(): log.info("Generate static content for languages") languages = [ f - for f in os.listdir(data_langs_folder) - if os.path.isfile(os.path.join(data_langs_folder, f)) + for f in os.listdir(langs_stats) + if os.path.isfile(os.path.join(langs_stats, f)) ] - for lang in sorted(languages): - code = lang[: 
-len(".json")] - package_statistics_file = os.path.join(static_langs_folder, code + ".adoc") + for language_file in sorted(languages): + language = language_file[: -len(".json")] + stats_file = os.path.join(langs_stats, language_file) + destination_file = os.path.join(static_langs_folder, f"{language}.adoc") - if os.path.isfile(package_statistics_file): - continue - - with open(os.path.join(data_langs_folder, lang), "r") as read_file: + with open(stats_file, "r") as read_file: content = json.load(read_file) - pd.DataFrame(content["packages"]).to_csv(os.path.join(static_tm_folder, f"{code}.csv"), index=False) - - cldr_code = code.split("_", 1)[0] # ro_MD or zh_Hant_HK - cldr_code = cldr_code.split("@", 1)[0] # ca@valencia - - territories = cldr_languages.get(cldr_code, {}).get("_territories", []) \ - + cldr_languages.get(cldr_code + "-alt-secondary", {}).get("_territories", []) - - # if language contains a territory code, then only keep this one - if len(code.split("_")) > 1: - if code.split("_")[-1] in territories: - territories = [code.split("_")[-1]] + pd.DataFrame\ + .from_dict(content['packages'], orient="index")\ + .to_csv(os.path.join(static_tm_folder, f"{language}.csv"), index_label="package") - if len(territories) == 0: - log.warning("The language {l} does not exist in territories data from CLDR".format(l=code)) - generate_static_pages_langs(args.results, code, content, package_statistics_file, territories, tm_folder, static_tm_folder) + territories = get_territories_for_language(language, cldr_languages) + generate_static_pages_langs(args.results, language, content, destination_file, territories, tm_folder, static_tm_folder) log.info("Generate static content for packages") packages = [ f - for f in os.listdir(data_pkgs_folder) - if os.path.isfile(os.path.join(data_pkgs_folder, f)) + for f in os.listdir(packages_stats) + if os.path.isdir(os.path.join(packages_stats, f)) ] for package in sorted(packages): - code = package[: -len(".json")] - 
package_statistics_file = os.path.join(static_pkgs_folder, code + ".adoc") + stats_file = os.path.join(packages_stats, package, "stats.json") + destination_file = os.path.join(static_pkgs_folder, f"{package}.adoc") - if os.path.isfile(package_statistics_file): - continue - - with open(os.path.join(data_pkgs_folder, package), "r") as read_file: + with open(stats_file, "r") as read_file: content = json.load(read_file) - generate_static_pages_packages(args.results, code, content, package_statistics_file) + generate_static_pages_packages(args.results, package, content, destination_file) log.info("Generating indexes") package_statistics_file = os.path.join(static_folder, "_index.adoc") @@ -207,7 +156,7 @@ def main(): # prevent containers and alternative names to be included if code in cldr_territories_info.keys(): package_statistics_file = os.path.join(static_territories_folder, code, "_index.adoc") - generate_territory_index(package_statistics_file, cldr_territories[code], code, cldr_territories_info.get(code, {})) + generate_territory_index(package_statistics_file, cldr_territories[code], code, cldr_territories_info.get(code, {}), cldr_version) log.info("Copy translation memories") languages = [ @@ -222,172 +171,8 @@ def main(): log.info("done") -def consolidate_language_stats(stats_file, distribution_stats): - """ From a CSV file, return key indicators """ - log = logging.getLogger("buildWebsite.consolidate_language_stats") - results = dict() - total_words_distrib = distribution_stats.get("totalsourcewords", 0) - - fieldnames = { - "filename": "str", - "translatedsourcewords": "int", - "fuzzysourcewords": "int", - "untranslatedsourcewords": "int", - "translated": "int", - "fuzzy": "int", - "untranslated": "int", - "translatedtargetwords": "int", - "team": "str", - "totalsourcewords": "int", - } - - stats_df = pd.read_json(stats_file, orient="index") - stats_df.fillna(0, inplace=True) - stats_df.reset_index(level=0, inplace=True) - stats_df["totalsourcewords"] = ( - 
stats_df["untranslatedsourcewords"] + stats_df["translatedsourcewords"] - ) - stats_df.columns = fieldnames.keys() - - stats_df["package"] = stats_df["filename"].str.split("/", expand=True)[4] - - results["packages"] = stats_df["package"].unique().tolist() - results["progress"] = round( - stats_df["translatedsourcewords"].sum() - / stats_df["totalsourcewords"].sum() - * 100, - 1, - ) - results["progress_d"] = round( - stats_df["translatedsourcewords"].sum() / total_words_distrib * 100, 1 - ) - results["totalsourcewords_d"] = total_words_distrib - - for kpi in ["totalsourcewords", "translatedsourcewords"]: - results[kpi + "sum"] = int(stats_df[kpi].sum()) - - return results - - -def consolidate_package_stats(stats_file, log_files, package_folder): - """ From a CSV file, return key indicators """ - log = logging.getLogger("buildWebsite.consolidate_package_stats") - results = dict() - - fieldnames = { - "filename": "str", - "translatedsourcewords": "int", - "fuzzysourcewords": "int", - "untranslatedsourcewords": "int", - "translated": "int", - "fuzzy": "int", - "untranslated": "int", - "translatedtargetwords": "int", - "team": "str", - "totalsourcewords": "int", - } - - _json = json.load(open(stats_file)) - dfs = [] - total_source_words = 0 - - for template in _json.keys(): - tmp_df = pd.DataFrame.from_dict(_json.get(template), orient="index") - tmp_df.fillna(0, inplace=True) - tmp_df.reset_index(level=0, inplace=True) - - # sometimes, no file were found, which means no stats can be used - if len(tmp_df) == 0: - log.debug(" The template {t} for {f} is empty".format(t=template, f=stats_file)) - continue - - tmp_df["totalsourcewords"] = ( - tmp_df["untranslatedsourcewords"] + tmp_df["translatedsourcewords"] - ) - tmp_df.columns = fieldnames.keys() - - total_source_words += max(tmp_df["totalsourcewords"]) - - dfs.append(tmp_df) - - if len(dfs) > 1: - stats_df = pd.concat(dfs) - elif len(dfs) == 0: - log.debug("There is no stats for {f}".format(f=stats_file)) - return 
results - else: - stats_df = dfs[0] - - stats_df_w_lang = pd.merge(stats_df, log_files, how="inner", on="filename") - stats_df_no_lang = pd.merge(stats_df, log_files, how="outer", indicator=True).loc[ - lambda x: x["_merge"] == "left_only" - ] - - stats_df_w_lang["filename"] = stats_df_w_lang["filename"].apply( - lambda s: s[len(package_folder) + 2:] - ) - - temp_translated = ( - stats_df_w_lang.groupby(["lang_code"]) - .agg( - { - "translatedsourcewords": ["sum"], - } - ) - .reset_index() - .droplevel(1, axis=1) - ) - - temp_teams = stats_df_w_lang.groupby("lang_code")["team"].apply( - lambda x: ", ".join(x.drop_duplicates()) - ) - temp_files = stats_df_w_lang.groupby("lang_code")["filename"].apply( - lambda x: ",".join(x) - ) - temp_bis = pd.merge(temp_teams, temp_files, how="inner", on="lang_code") - temp = pd.merge(temp_translated, temp_bis, how="inner", on="lang_code").to_dict( - orient="records" - ) - - for line in temp: - line["progress"] = 0 - line["translated"] = line["translatedsourcewords"] - - if total_source_words == 0: - log.info( - " File {f} for file has translatedsourcewords = 0 in line {l}".format( - f=stats_file, l=line - ) - ) - line["progress"] = 0 - continue - try: - line["progress"] = round( - (int(line["translatedsourcewords"]) / total_source_words) * 100 - ) - except OverflowError: - log.info( - " File {f} has Translated={t} and Source={tot}".format( - f=stats_file, - t=line["translatedsourcewords"], - tot=total_source_words, - ) - ) - - line["filename"] = line["filename"].split(",") - - results["stats"] = list() - for line in sorted(temp, key=lambda k: k["progress"], reverse=True): - del line["translatedsourcewords"] - results["stats"].append(line) - - results["totalsourcewords"] = total_source_words - results["no_languages"] = stats_df_no_lang["filename"].tolist() - - return results - - def generate_static_pages_langs(results: str, code: str, content: dict, destination_file: str, territories: list[str], tm_folder: str, static_tm_folder: 
str) -> None: + """ Aggregate info and call language template """ log = logging.getLogger("buildWebsite.generate_static_pages_langs") data = content data["lang_name_en"] = langtable.language_name( @@ -413,17 +198,36 @@ def generate_static_pages_langs(results: str, code: str, content: dict, destinat apply_jinja_template(data, destination_file, "language.adoc") -def generate_static_pages_packages(results, code, content, destination_file): +def generate_static_pages_packages(release: str, package: str, statistics: dict, destination_file: str) -> None: + """ Aggregate info and call package template """ log = logging.getLogger("buildWebsite.generate_static_pages_packages") - data = content - data["results"] = results - data["package"] = code + data = statistics + data["results"] = release + data["package"] = package data["now"] = datetime.datetime.utcnow() + # in some rare cases, a package may have no translation progress + if "stats" not in statistics.keys(): + data["stats"] = {} + data["stats"]["languages"] = {} + + if "error" in data["stats"]["languages"].keys(): + data["started_languages"] = len(data["stats"]["languages"]) - 1 + data["no_languages"] = len(data["stats"]["languages"]["error"]["filename"].split("./")) - 1 + else: + data["started_languages"] = len(data["stats"]["languages"]) + data["no_languages"] = 0 + + # remove local path + for lang in data["stats"]["languages"].keys(): + path = f"./results/{release}/packages/{package}/" + data["stats"]["languages"][lang]["filename"] = data["stats"]["languages"][lang]["filename"].replace(path, " ") + apply_jinja_template(data, destination_file, "package.adoc") -def generate_release_index(release, destination_file, data): +def generate_release_index(release: str, destination_file: str, data: dict) -> None: + """ Aggregate info and call release index template """ log = logging.getLogger("buildWebsite.generate_release_index") data["release"] = release data["now"] = datetime.datetime.utcnow() @@ -431,7 +235,8 @@ def 
generate_release_index(release, destination_file, data): apply_jinja_template(data, destination_file, "_index.release.adoc") -def generate_language_index(release, destination_file): +def generate_language_index(release: str, destination_file: str) -> None: + """ Aggregate info and call language index template """ log = logging.getLogger("buildWebsite.generate_language_index") data = dict() data["release"] = release @@ -440,7 +245,8 @@ def generate_language_index(release, destination_file): apply_jinja_template(data, destination_file, "_index.language.adoc") -def generate_package_index(distribution, destination_file): +def generate_package_index(distribution: str, destination_file: str) -> None: + """ Aggregate info and call package index template """ log = logging.getLogger("buildWebsite.generate_package_index") data = dict() data["distribution"] = distribution @@ -449,26 +255,34 @@ def generate_package_index(distribution, destination_file): apply_jinja_template(data, destination_file, "_index.package.adoc") -def generate_territory_index(destination_file: str, name: list[str], code: str, data: dict): +def generate_territory_index(destination_file: str, name: list[str], code: str, data: dict, cldr_version: str) -> None: + """ Aggregate info and call territory index template """ log = logging.getLogger("buildWebsite.generate_package_index") data["name"] = name data["code"] = code + data["cldr_version"] = cldr_version apply_jinja_template(data, destination_file, "_index.territory.adoc") -def store_json_file(content, destination_file): +def store_json_file(content: dict, destination_file: str) -> None: + """ Store a json file""" with open(destination_file, "w") as f: f.write(json.dumps(content, indent=2)) -def apply_jinja_template(data: dict, destination_file: str, template_file: str): +def apply_jinja_template(data: dict, destination_file: str, template_file: str) -> None: + """ Call a jinja template with a data dictionary """ 
os.makedirs(os.path.dirname(os.path.abspath(destination_file)), exist_ok=True) template_loader = jinja2.FileSystemLoader(searchpath="./templates/") template_env = jinja2.Environment(loader=template_loader, undefined=jinja2.Undefined) template = template_env.get_template(template_file) - output_text = template.render(data) + try: + output_text = template.render(data) + except jinja2.exceptions.UndefinedError as e: + logging.error(f"Error with {destination_file}: {e}") + raise with open(destination_file, "w") as write_out: write_out.write(output_text) diff --git a/check_dnf_files.sh b/check_dnf_files.sh index 916afe1..3da7c48 100755 --- a/check_dnf_files.sh +++ b/check_dnf_files.sh @@ -26,9 +26,6 @@ function call_sall { /src/build_language_list.py --results "$results" podman run -it --rm -v ./:/src:z -v "$WORK_DIR"/results:/src/results:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G -e DNF_CONF=/src/"$dnf_conf" fedlocstats:latest \ - /src/build_language_list.py --results "$results" --analyzealllang - - podman run -it --rm -v ./:/src:z -v "$WORK_DIR"/results:/src/results:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G -e DNF_CONF=/src/"$dnf_conf" fedlocstats:latest \ /src/build_tm.py --results "$results" --compress podman run -it --rm -v ./:/src:z -v "$WORK_DIR"/results:/src/results:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G -e DNF_CONF=/src/"$dnf_conf" fedlocstats:latest \ diff --git a/requirements.txt b/requirements.txt index e5941dc..af51268 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,5 @@ polib weblate-language-data langtable translate-toolkit + +numpyencoder \ No newline at end of file diff --git a/runall.sh b/runall.sh index 5951544..cd04eae 100755 --- a/runall.sh +++ b/runall.sh @@ -20,9 +20,6 @@ podman run -it --rm -v ./:/src:z -v ./results:/src/results:z -v ./srpms:/srpms:z # ~ 18 m ./build_language_list.py --results "$results" -# ~ 18 m -./build_language_list.py --results "$results" --analyzealllang - # Creates useful translator files for every 
languages # ~ 3 h 00 LANG=C ./build_tm.py --results "$results" --compress diff --git a/templates/_index.release.adoc b/templates/_index.release.adoc index 74919f8..60d0de3 100644 --- a/templates/_index.release.adoc +++ b/templates/_index.release.adoc @@ -5,14 +5,14 @@ layout: "release" --- Fedora {{ release }}:: -* contains {{ total_release_packages }} packages, -* we identified {{ total_packages }} packages with translations files, -* it represents {{ total_packages_files }} translations files (po). +* contains {{ packages_count }} packages, +* we identified {{ packages_detected_count }} packages with translations files, +* it represents {{ files_detected_count }} translations files (po). What we were able to process:: -* {{ total_packages_with_stats }} packages, -* {{ nb_files }} translation files containing {{ totalsourcewords }} words to translate, -* {{ total_languages }} languages. +* {{ packages_processed_count }} packages, +* {{ files_processed_count }} translation files containing {{ totalsourcewords }} words to translate, +* {{ languages_processed_count }} languages. Why such gaps?:: . 
File reading was not possible (encoding or format issue), diff --git a/templates/_index.territory.adoc b/templates/_index.territory.adoc index 4886457..ad6e959 100644 --- a/templates/_index.territory.adoc +++ b/templates/_index.territory.adoc @@ -1,7 +1,7 @@ --- title: "{{ code }} {{ name }}" --- -Data coming from Unicode consortium (CLDR 38): +Data coming from Unicode consortium (CLDR {{ cldr_version }}): * Population: {{ _population }} * Literacy percent: {{_literacyPercent}} diff --git a/templates/language.adoc b/templates/language.adoc index 849481d..55e6b33 100644 --- a/templates/language.adoc +++ b/templates/language.adoc @@ -4,7 +4,8 @@ date: {{ now }} code: {{ lang_code }} name_english: {{ lang_name_en }} name_local: {{ lang_name_local }} -progress_d: {{ progress_d }} +progress: {{ '{:.2f}'.format(progress) }} +progress_d: {{ '{:.2f}'.format(progress_d) }} release: {{ results }} {%- if territories %} territories: @@ -16,14 +17,14 @@ territories: Language progress for {{ lang_name_en }} ({{ lang_code }}) in Fedora {{ results }} is: -* {{ progress }}% when we only look on started packages for this language. -* {{ progress_d }}% when we compare to every single translatable string in Fedora {{ results }}. +* {{ '{:.2f}'.format(progress) }}% when we only look on started packages for this language. +* {{ '{:.2f}'.format(progress_d) }}% when we compare to every single translatable string in Fedora {{ results }}. 
Possible scripts are: {% for script in scripts -%}{{ script }} {%- endfor %} * Total translatable string in Fedora {{ results }}: {{ totalsourcewords_d }} * Source words to translate in started packages: {{ totalsourcewordssum }} -* Translated words: {{ translatedsourcewordssum }} +* Translated words: {{ translatedsourcewords }} Download: @@ -34,17 +35,19 @@ Download: Packages: -[cols="1a,1,1,3", options="header"] +[cols="1a,1,1,1,3", options="header"] |=== | Name | Translated words -| Progress -| Language team +| Total source words +| Progress (%) +| Language teams {% for package in packages -%} -| link:{{ '{{' }}< ref "/{{ results }}/package/{{ package.name }}.adoc" >{{ '}}' }}[{{ package.name }}] ->| {{ package.translated }} ->| {{ package.progress }} -| {{ package.team }} +| link:{{ '{{' }}< ref "/{{ results }}/package/{{ package }}.adoc" >{{ '}}' }}[{{ package }}] +>| {{ packages[package].translatedsourcewords }} +>| {{ packages[package].totalsourcewordssum }} +>| {{ '{:.1f}'.format(packages[package].progress) }} +| {{ packages[package].team }} {% endfor %} |=== \ No newline at end of file diff --git a/templates/package.adoc b/templates/package.adoc index 19d546f..06d6da6 100644 --- a/templates/package.adoc +++ b/templates/package.adoc @@ -1,37 +1,30 @@ --- title: "{{ package }}" date: {{ now }} -started_languages: {{ stats|length }} -no_languages: {{ no_languages|length }} +started_languages: {{ started_languages }} +no_languages: {{ no_languages }} --- The package {{ package }}: * represents {{ totalsourcewords }} source words to be translated, -* is translated into {{ stats|length }} languages in Fedora {{ results }}, -* contains {{ no_languages|length }} files for which no languages could be deducted. +* is translated into {{ stats.languages|length }} languages in Fedora {{ results }}, +* contains {{ no_languages }} files for which no languages could be deducted. 
-[cols="1a,1,1,3a", options="header"] +[cols="1a,1,1,1,3", options="header"] |=== | Language | Translated words -| Progress +| Total source words +| Progress (%) | Files -{% for stat in stats|sort(attribute="lang_code") -%} -| link:{{ '{{' }}< ref "/{{ results }}/language/{{ stat.lang_code }}.adoc" >{{ '}}' }}[{{ stat.lang_code }}] ->| {{ stat.translated }} ->| {{ stat.progress }} -| {% for file in stat.filename -%}{{ file }}{{ " " }}{%- endfor %} +{% for stat in stats.languages|sort -%} +| link:{{ '{{' }}< ref "/{{ results }}/language/{{ stat }}.adoc" >{{ '}}' }}[{{ stat }}] +>| {{ stats.languages[stat].translatedsourcewords }} +>| {{ stats.languages[stat].totalsourcewordssum }} +>| {{ '{:.1f}'.format(stats.languages[stat].progress) }} +| {{ stats.languages[stat].filename }} + {% endfor %} |=== - -## Errors -{% if no_languages %} -List of files for which language detection were impossible: -{% for missing in no_languages -%} -* {{ missing }} -{% endfor %} -{% else %} -None -{% endif %} diff --git a/todo.md b/todo.md index cd426a3..a300187 100644 --- a/todo.md +++ b/todo.md @@ -11,17 +11,18 @@ direct call to: # build_tm.py -Detecting missing files -- en-compendium is missing -- error-compendium is missing -- gl-compendium is missing -- nb_no-compendium is missing -- sk-compendium is missing -- zh_hant-compendium is missing +move error detection (check_lang) into %language%/stats.json and display erros +move error files into %language%/stats.json and make these accessible via website +remove terminology (someone who wants it can do it locally) # build_stats.py -roxterm triggers an error +when %package%/stats.json is empty, make sure it is counted as an existing package for which we were not able to extract anything (release stats) + +# website + +list why we could not deduct error files +allow sort on all tables # global diff --git a/website/themes/beautifulhugo/layouts/_default/list_languages.html b/website/themes/beautifulhugo/layouts/_default/list_languages.html 
index 4f3fec7..ec74b82 100644 --- a/website/themes/beautifulhugo/layouts/_default/list_languages.html +++ b/website/themes/beautifulhugo/layouts/_default/list_languages.html @@ -12,7 +12,7 @@ code English name Local name - Progress + Progress (%) {{ range sort .Pages "Title" "asc" }} From 49b98d7bbe6511c8f52cc2bb1e2120008ea03a0d Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 09 2023 12:43:10 +0000 Subject: [PATCH 13/26] move from asciidoctor to markdown hugo does not interact so well with asciidoctor, making everything slower. with f37 content, asciidoctor takes about 90 seconds while markdown does this in 10 seconds. --- diff --git a/templates/_index.language.adoc b/templates/_index.language.adoc deleted file mode 100644 index 75291bc..0000000 --- a/templates/_index.language.adoc +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: "Languages for {{ release }}" -date: {{ now }} -layout: "list_languages" ---- \ No newline at end of file diff --git a/templates/_index.language.md b/templates/_index.language.md new file mode 100644 index 0000000..75291bc --- /dev/null +++ b/templates/_index.language.md @@ -0,0 +1,5 @@ +--- +title: "Languages for {{ release }}" +date: {{ now }} +layout: "list_languages" +--- \ No newline at end of file diff --git a/templates/_index.package.adoc b/templates/_index.package.adoc deleted file mode 100644 index 0c1f25b..0000000 --- a/templates/_index.package.adoc +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: "Packages for {{ distribution }}" -date: {{ now }} -layout: "list_packages" ---- - -This listing aims at making it easy to find packages with files for which no languages could be deducted. 
\ No newline at end of file diff --git a/templates/_index.package.md b/templates/_index.package.md new file mode 100644 index 0000000..0c1f25b --- /dev/null +++ b/templates/_index.package.md @@ -0,0 +1,7 @@ +--- +title: "Packages for {{ distribution }}" +date: {{ now }} +layout: "list_packages" +--- + +This listing aims at making it easy to find packages with files for which no languages could be deducted. \ No newline at end of file diff --git a/templates/_index.release.adoc b/templates/_index.release.adoc deleted file mode 100644 index 60d0de3..0000000 --- a/templates/_index.release.adoc +++ /dev/null @@ -1,19 +0,0 @@ ---- -title: "Statistics for {{ release }}" -date: {{ now }} -layout: "release" ---- - -Fedora {{ release }}:: -* contains {{ packages_count }} packages, -* we identified {{ packages_detected_count }} packages with translations files, -* it represents {{ files_detected_count }} translations files (po). - -What we were able to process:: -* {{ packages_processed_count }} packages, -* {{ files_processed_count }} translation files containing {{ totalsourcewords }} words to translate, -* {{ languages_processed_count }} languages. - -Why such gaps?:: -. File reading was not possible (encoding or format issue), -. Language detection failed (missing information). \ No newline at end of file diff --git a/templates/_index.release.md b/templates/_index.release.md new file mode 100644 index 0000000..d083669 --- /dev/null +++ b/templates/_index.release.md @@ -0,0 +1,19 @@ +--- +title: "Statistics for {{ release }}" +date: {{ now }} +layout: "release" +--- + +Fedora {{ release }}: +* contains {{ packages_count }} packages, +* we identified {{ packages_detected_count }} packages with translations files, +* it represents {{ files_detected_count }} translations files (po). 
+ +What we were able to process: +* {{ packages_processed_count }} packages, +* {{ files_processed_count }} translation files containing {{ totalsourcewords }} words to translate, +* {{ languages_processed_count }} languages. + +Why such gaps? +* File reading was not possible (encoding or format issue), +* Language detection failed (missing information). \ No newline at end of file diff --git a/templates/_index.territory.adoc b/templates/_index.territory.adoc deleted file mode 100644 index ad6e959..0000000 --- a/templates/_index.territory.adoc +++ /dev/null @@ -1,16 +0,0 @@ ---- -title: "{{ code }} {{ name }}" ---- -Data coming from Unicode consortium (CLDR {{ cldr_version }}): - -* Population: {{ _population }} -* Literacy percent: {{_literacyPercent}} - -{%- if languagePopulation %} -{% for lang, value in languagePopulation.items() %} -* {{ lang }} -{%- if value._officialStatus %} *{{ value._officialStatus }}*{%- endif %} -{%- if value._populationPercent %} ({{ value._populationPercent }} %){%- endif %} -{%- endfor %} - -{%- endif %} \ No newline at end of file diff --git a/templates/_index.territory.md b/templates/_index.territory.md new file mode 100644 index 0000000..ad6e959 --- /dev/null +++ b/templates/_index.territory.md @@ -0,0 +1,16 @@ +--- +title: "{{ code }} {{ name }}" +--- +Data coming from Unicode consortium (CLDR {{ cldr_version }}): + +* Population: {{ _population }} +* Literacy percent: {{_literacyPercent}} + +{%- if languagePopulation %} +{% for lang, value in languagePopulation.items() %} +* {{ lang }} +{%- if value._officialStatus %} *{{ value._officialStatus }}*{%- endif %} +{%- if value._populationPercent %} ({{ value._populationPercent }} %){%- endif %} +{%- endfor %} + +{%- endif %} \ No newline at end of file diff --git a/templates/language.adoc b/templates/language.adoc deleted file mode 100644 index 55e6b33..0000000 --- a/templates/language.adoc +++ /dev/null @@ -1,53 +0,0 @@ ---- -title: "{{ lang_code }}-{{ lang_name_en }} ({{ 
lang_name_local }})" -date: {{ now }} -code: {{ lang_code }} -name_english: {{ lang_name_en }} -name_local: {{ lang_name_local }} -progress: {{ '{:.2f}'.format(progress) }} -progress_d: {{ '{:.2f}'.format(progress_d) }} -release: {{ results }} -{%- if territories %} -territories: -{%- for territory in territories %} - - {{ territory }} -{%- endfor %} -{%- endif %} ---- - -Language progress for {{ lang_name_en }} ({{ lang_code }}) in Fedora {{ results }} is: - -* {{ '{:.2f}'.format(progress) }}% when we only look on started packages for this language. -* {{ '{:.2f}'.format(progress_d) }}% when we compare to every single translatable string in Fedora {{ results }}. - -Possible scripts are: {% for script in scripts -%}{{ script }} {%- endfor %} - -* Total translatable string in Fedora {{ results }}: {{ totalsourcewords_d }} -* Source words to translate in started packages: {{ totalsourcewordssum }} -* Translated words: {{ translatedsourcewords }} - -Download: - -* link:{{ "{{% resource url=" }}"{{ files["compendium"]["url"] }}" {{ "%}}" }}[{{ lang_code }} compendium ({{ files["compendium"]["size"]|filesizeformat() }})] (aggregation of all strings found in po files) -* link:{{ "{{% resource url=" }}"{{ files["terminology"]["url"] }}" {{ "%}}" }}[{{ lang_code }} terminology ({{ files["terminology"]["size"]|filesizeformat() }})] see https://docs.translatehouse.org/projects/translate-toolkit/en/latest/commands/poterminology.html[poterminology] -* link:{{ "{{% resource url=" }}"{{ files["tmx"]["url"] }}" {{ "%}}" }}[{{ lang_code }} translation memory ({{ files["tmx"]["size"]|filesizeformat() }})] see https://en.wikipedia.org/wiki/Translation_Memory_eXchange[tmx] -* link:{{ "{{% resource url=" }}"{{ files["csv"]["url"] }}" {{ "%}}" }}[{{ lang_code }} generated stats ({{ files["csv"]["size"]|filesizeformat() }})] - -Packages: - -[cols="1a,1,1,1,3", options="header"] -|=== -| Name -| Translated words -| Total source words -| Progress (%) -| Language teams - -{% for package in 
packages -%} -| link:{{ '{{' }}< ref "/{{ results }}/package/{{ package }}.adoc" >{{ '}}' }}[{{ package }}] ->| {{ packages[package].translatedsourcewords }} ->| {{ packages[package].totalsourcewordssum }} ->| {{ '{:.1f}'.format(packages[package].progress) }} -| {{ packages[package].team }} -{% endfor %} -|=== \ No newline at end of file diff --git a/templates/language.md b/templates/language.md new file mode 100644 index 0000000..ed449ec --- /dev/null +++ b/templates/language.md @@ -0,0 +1,47 @@ +--- +title: "{{ lang_code }}-{{ lang_name_en }} ({{ lang_name_local }})" +date: {{ now }} +code: {{ lang_code }} +name_english: {{ lang_name_en }} +name_local: {{ lang_name_local }} +progress: {{ '{:.2f}'.format(progress) }} +progress_d: {{ '{:.2f}'.format(progress_d) }} +release: {{ results }} +{%- if territories %} +territories: +{%- for territory in territories %} + - {{ territory }} +{%- endfor %} +{%- endif %} +--- + +Language progress for {{ lang_name_en }} ({{ lang_code }}) in Fedora {{ results }} is: + +* {{ '{:.2f}'.format(progress) }}% when we only look on started packages for this language. +* {{ '{:.2f}'.format(progress_d) }}% when we compare to every single translatable string in Fedora {{ results }}. 
+ +Possible scripts are: {% for script in scripts -%}{{ script }} {%- endfor %} + +* Total translatable string in Fedora {{ results }}: {{ totalsourcewords_d }} +* Source words to translate in started packages: {{ totalsourcewordssum }} +* Translated words: {{ translatedsourcewords }} + +Download: +* [{{ lang_code }} compendium ({{ files["compendium"]["size"]|filesizeformat() }})]({{ "{{% resource url=" }}"{{ files["compendium"]["url"] }}" {{ "%}}" }}) (aggregation of all strings found in po files) +* [{{ lang_code }} terminology ({{ files["terminology"]["size"]|filesizeformat() }})]({{ "{{% resource url=" }}"{{ files["terminology"]["url"] }}" {{ "%}}" }}) see [poterminology](https://docs.translatehouse.org/projects/translate-toolkit/en/latest/commands/poterminology.html) +* [{{ lang_code }} translation memory ({{ files["tmx"]["size"]|filesizeformat() }})]({{ "{{% resource url=" }}"{{ files["tmx"]["url"] }}" {{ "%}}" }}) see [tmx](https://en.wikipedia.org/wiki/Translation_Memory_eXchange) +* [{{ lang_code }} generated stats ({{ files["csv"]["size"]|filesizeformat() }})]({{ "{{% resource url=" }}"{{ files["csv"]["url"] }}" {{ "%}}" }}) + +Packages: + +| Name | Translated words | Total source words | Progress (%) | Language teams | +|------|------------------:|-------------------:|-------------:|----------------| +{% for package in packages %} +{%- set output = " | [" ~ package ~ "]({{< ref \"/" ~ results ~ "/package/" ~ package ~ ".md\" >}})" -%} +{%- set output = output ~ " | " ~ packages[package].translatedsourcewords -%} +{%- set output = output ~ " | " ~ packages[package].totalsourcewordssum -%} +{%- set output = output ~ " | " ~ '{:.1f}'.format(packages[package].progress) -%} +{%- set output = output ~ " | " ~ packages[package].team -%} +{%- set output = output ~ " | " -%} +{{ output }} +{% endfor -%} \ No newline at end of file diff --git a/templates/package.adoc b/templates/package.adoc deleted file mode 100644 index 06d6da6..0000000 --- 
a/templates/package.adoc +++ /dev/null @@ -1,30 +0,0 @@ ---- -title: "{{ package }}" -date: {{ now }} -started_languages: {{ started_languages }} -no_languages: {{ no_languages }} ---- -The package {{ package }}: - -* represents {{ totalsourcewords }} source words to be translated, -* is translated into {{ stats.languages|length }} languages in Fedora {{ results }}, -* contains {{ no_languages }} files for which no languages could be deducted. - -[cols="1a,1,1,1,3", options="header"] -|=== -| Language -| Translated words -| Total source words -| Progress (%) -| Files - -{% for stat in stats.languages|sort -%} -| link:{{ '{{' }}< ref "/{{ results }}/language/{{ stat }}.adoc" >{{ '}}' }}[{{ stat }}] ->| {{ stats.languages[stat].translatedsourcewords }} ->| {{ stats.languages[stat].totalsourcewordssum }} ->| {{ '{:.1f}'.format(stats.languages[stat].progress) }} -| {{ stats.languages[stat].filename }} - -{% endfor %} - -|=== diff --git a/templates/package.md b/templates/package.md new file mode 100644 index 0000000..e589cc3 --- /dev/null +++ b/templates/package.md @@ -0,0 +1,23 @@ +--- +title: "{{ package }}" +date: {{ now }} +started_languages: {{ started_languages }} +no_languages: {{ no_languages }} +--- +The package {{ package }}: + +* represents {{ totalsourcewords }} source words to be translated, +* is translated into {{ stats.languages|length }} languages in Fedora {{ results }}, +* contains {{ no_languages }} files for which no languages could be deducted. 
+ +| Language | Translated words | Total source words | Progress (%) | Files | +|----------|-----------------:|-------------------:|-------------:|-------| +{% for stat in stats.languages|sort -%} +{%- set output = " | [" ~ stat ~ "]({{< ref \"/" ~ results ~ "/language/" ~ stat ~ ".md\" >}})" -%} +{%- set output = output ~ " | " ~ stats.languages[stat].translatedsourcewords -%} +{%- set output = output ~ " | " ~ stats.languages[stat].totalsourcewordssum -%} +{%- set output = output ~ " | " ~ '{:.1f}'.format(stats.languages[stat].progress) -%} +{%- set output = output ~ " | " ~ stats.languages[stat].filename -%} +{%- set output = output ~ " | " -%} +{{ output }} +{% endfor -%} \ No newline at end of file diff --git a/website/config.toml b/website/config.toml index 278ae5e..812f0ea 100644 --- a/website/config.toml +++ b/website/config.toml @@ -11,15 +11,4 @@ unsafe= true since = "2021" [taxonomies] -countries = "territories" - -[security] - enableInlineShortcodes = false - [security.exec] - allow = ['^dart-sass-embedded$', '^go$', '^npx$', '^postcss$', '^asciidoctor$'] - osEnv = ['(?i)^(PATH|PATHEXT|APPDATA|TMP|TEMP|TERM)$'] - [security.funcs] - getenv = ['^HUGO_'] - [security.http] - methods = ['(?i)GET|POST'] - urls = ['.*'] +countries = "territories" \ No newline at end of file From ce7133e034ce85fd38af2f58755c4afc87d67cba Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 09 2023 12:45:53 +0000 Subject: [PATCH 14/26] move from asciidoctor to markdown --- diff --git a/build_website.py b/build_website.py index 362d914..ab00ca0 100755 --- a/build_website.py +++ b/build_website.py @@ -115,7 +115,7 @@ def main(): for language_file in sorted(languages): language = language_file[: -len(".json")] stats_file = os.path.join(langs_stats, language_file) - destination_file = os.path.join(static_langs_folder, f"{language}.adoc") + destination_file = os.path.join(static_langs_folder, f"{language}.md") with open(stats_file, "r") as read_file: content = 
json.load(read_file) @@ -135,7 +135,7 @@ def main(): ] for package in sorted(packages): stats_file = os.path.join(packages_stats, package, "stats.json") - destination_file = os.path.join(static_pkgs_folder, f"{package}.adoc") + destination_file = os.path.join(static_pkgs_folder, f"{package}.md") with open(stats_file, "r") as read_file: content = json.load(read_file) @@ -143,23 +143,23 @@ def main(): generate_static_pages_packages(args.results, package, content, destination_file) log.info("Generating indexes") - package_statistics_file = os.path.join(static_folder, "_index.adoc") + package_statistics_file = os.path.join(static_folder, "_index.md") generate_release_index(args.results, package_statistics_file, distribution_stats) - package_statistics_file = os.path.join(static_langs_folder, "_index.adoc") + package_statistics_file = os.path.join(static_langs_folder, "_index.md") generate_language_index(args.results, package_statistics_file) - package_statistics_file = os.path.join(static_pkgs_folder, "_index.adoc") + package_statistics_file = os.path.join(static_pkgs_folder, "_index.md") generate_package_index(args.results, package_statistics_file) for code in cldr_territories.keys(): # prevent containers and alternative names to be included if code in cldr_territories_info.keys(): - package_statistics_file = os.path.join(static_territories_folder, code, "_index.adoc") + package_statistics_file = os.path.join(static_territories_folder, code, "_index.md") generate_territory_index(package_statistics_file, cldr_territories[code], code, cldr_territories_info.get(code, {}), cldr_version) log.info("Copy translation memories") - languages = [ + """languages = [ f for f in os.listdir(tm_folder) if os.path.isfile(os.path.join(tm_folder, f)) ] for lang in languages: @@ -167,7 +167,7 @@ def main(): shutil.copyfile( os.path.join(tm_folder, lang), os.path.join(static_tm_folder, lang) ) - +""" log.info("done") @@ -195,7 +195,7 @@ def generate_static_pages_langs(results: str, code: 
str, content: dict, destinat if len(territories) > 0: data["territories"] = territories - apply_jinja_template(data, destination_file, "language.adoc") + apply_jinja_template(data, destination_file, "language.md") def generate_static_pages_packages(release: str, package: str, statistics: dict, destination_file: str) -> None: @@ -223,7 +223,7 @@ def generate_static_pages_packages(release: str, package: str, statistics: dict, path = f"./results/{release}/packages/{package}/" data["stats"]["languages"][lang]["filename"] = data["stats"]["languages"][lang]["filename"].replace(path, " ") - apply_jinja_template(data, destination_file, "package.adoc") + apply_jinja_template(data, destination_file, "package.md") def generate_release_index(release: str, destination_file: str, data: dict) -> None: @@ -232,7 +232,7 @@ def generate_release_index(release: str, destination_file: str, data: dict) -> N data["release"] = release data["now"] = datetime.datetime.utcnow() - apply_jinja_template(data, destination_file, "_index.release.adoc") + apply_jinja_template(data, destination_file, "_index.release.md") def generate_language_index(release: str, destination_file: str) -> None: @@ -242,7 +242,7 @@ def generate_language_index(release: str, destination_file: str) -> None: data["release"] = release data["now"] = datetime.datetime.utcnow() - apply_jinja_template(data, destination_file, "_index.language.adoc") + apply_jinja_template(data, destination_file, "_index.language.md") def generate_package_index(distribution: str, destination_file: str) -> None: @@ -252,7 +252,7 @@ def generate_package_index(distribution: str, destination_file: str) -> None: data["distribution"] = distribution data["now"] = datetime.datetime.utcnow() - apply_jinja_template(data, destination_file, "_index.package.adoc") + apply_jinja_template(data, destination_file, "_index.package.md") def generate_territory_index(destination_file: str, name: list[str], code: str, data: dict, cldr_version: str) -> None: @@ -262,7 
+262,7 @@ def generate_territory_index(destination_file: str, name: list[str], code: str, data["code"] = code data["cldr_version"] = cldr_version - apply_jinja_template(data, destination_file, "_index.territory.adoc") + apply_jinja_template(data, destination_file, "_index.territory.md") def store_json_file(content: dict, destination_file: str) -> None: From 6a19d3cf49d02a8c357b5e3833874abad6439def Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 09 2023 12:46:41 +0000 Subject: [PATCH 15/26] add favicon --- diff --git a/website/config.toml b/website/config.toml index 812f0ea..7f37113 100644 --- a/website/config.toml +++ b/website/config.toml @@ -9,6 +9,7 @@ unsafe= true [Params] since = "2021" +favicon = "img/favicon.ico" [taxonomies] countries = "territories" \ No newline at end of file diff --git a/website/themes/beautifulhugo/static/img/favicon.ico b/website/themes/beautifulhugo/static/img/favicon.ico index 523bc99..204a34a 100644 Binary files a/website/themes/beautifulhugo/static/img/favicon.ico and b/website/themes/beautifulhugo/static/img/favicon.ico differ From 31a8dae733dc39fafd54528d1a020ac19c91cf02 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 09 2023 12:49:58 +0000 Subject: [PATCH 16/26] simplify templating remove a lot of complex stuff we do not use and makes it hard to know where to change something in our website. adding a little bit of cache too, the impact is really low, but still, no reason not to use it --- diff --git a/website/themes/beautifulhugo/layouts/_default/baseof.html b/website/themes/beautifulhugo/layouts/_default/baseof.html index a67d0c4..bbe3512 100644 --- a/website/themes/beautifulhugo/layouts/_default/baseof.html +++ b/website/themes/beautifulhugo/layouts/_default/baseof.html @@ -1,13 +1,13 @@ - {{ partial "head.html" . }} + {{ partialCached "head.html" . (.Title | default .Site.Title) }} - {{ partial "nav.html" . }} - {{ block "header" . }}{{ partial "header.html" . 
}}{{ end }} + {{ partialCached "nav.html" . }} + {{ block "header" . }}{{ partialCached "header.html" . (.Title | default .Site.Title) }}{{ end }} {{ block "main" . }}{{ end }} - {{ partial "footer.html" . }} + {{ partialCached "footer.html" . }} {{ block "footer" . }}{{ end }} diff --git a/website/themes/beautifulhugo/layouts/partials/footer.html b/website/themes/beautifulhugo/layouts/partials/footer.html index 7ddef70..41ad5f3 100644 --- a/website/themes/beautifulhugo/layouts/partials/footer.html +++ b/website/themes/beautifulhugo/layouts/partials/footer.html @@ -1,53 +1,16 @@ - {{ if eq .Type "page" }} - {{ partial "page_meta.html" . }} - {{ end }}
- - -

- {{ i18n "poweredBy" . | safeHTML }} -

- -{{- partial "footer_custom.html" . }} +{{- partialCached "footer_custom.html" . }} diff --git a/website/themes/beautifulhugo/layouts/partials/head.html b/website/themes/beautifulhugo/layouts/partials/head.html index 7f8d4bb..29c0f3b 100644 --- a/website/themes/beautifulhugo/layouts/partials/head.html +++ b/website/themes/beautifulhugo/layouts/partials/head.html @@ -1,41 +1,11 @@ -{{- if eq .Kind "taxonomyTerm" }} - {{- range $key, $value := .Data.Terms.ByCount }} - {{- $.Scratch.Add "most_used" (slice $value.Name) }} - {{- end }} - {{- if not ($.Scratch.Get "most_used") }} - {{- $description := printf "A full overview of all pages with %s, ordered by %s" .Data.Plural .Data.Singular | truncate 180 }} - {{- $.Scratch.Set "Description" $description }} - {{- else }} - {{- $description := printf "A full overview of all pages with %s, ordered by %s, such as: %s" .Data.Plural .Data.Singular ( delimit ( $.Scratch.Get "most_used" ) ", " ", and " ) | truncate 180 }} - {{- $.Scratch.Set "Description" $description }} - {{- end }} - - {{- $title := printf "Overview of all pages with %s, ordered by %s" .Data.Plural .Data.Singular }} - {{- $.Scratch.Set "Title" $title }} -{{- else if eq .Kind "taxonomy" }} - {{- $description := printf "Overview of all pages with the %s #%s, such as: %s" .Data.Singular $.Title ( index .Pages 0).Title | truncate 160 }} - {{- $.Scratch.Set "Description" $description }} - - {{- $title := printf "Overview of all pages with the %s #%s" .Data.Singular $.Title }} - {{- $.Scratch.Set "Title" $title }} -{{- else }} - {{- $.Scratch.Set "Description" ( .Description | default .Params.subtitle | default .Summary ) }} - {{- $.Scratch.Set "Title" ( .Title | default .Site.Title ) }} -{{- end }} - - -{{- with ($.Scratch.Get "Title") }} - {{ . 
}} - {{ $.Site.Title }} -{{- end }} -{{- with ($.Scratch.Get "Description") }} + + {{ .Title | default .Site.Title }} - {{ $.Site.Title }} +{{- with .Description }} {{- end }} -{{- with .Site.Author.name }} - -{{- end }} {{- with .Site.Params.favicon }} {{- end -}} @@ -48,4 +18,4 @@ -{{- partial "head_custom.html" . }} \ No newline at end of file +{{- partialCached "head_custom.html" . }} \ No newline at end of file diff --git a/website/themes/beautifulhugo/layouts/partials/header.html b/website/themes/beautifulhugo/layouts/partials/header.html index a841f79..8264234 100644 --- a/website/themes/beautifulhugo/layouts/partials/header.html +++ b/website/themes/beautifulhugo/layouts/partials/header.html @@ -1,85 +1,12 @@ -{{ if .IsHome }} - {{ if .Site.Params.homeTitle }}{{ $.Scratch.Set "title" .Site.Params.homeTitle }}{{ else }}{{ $.Scratch.Set "title" .Site.Title }}{{ end }} - {{ if .Site.Params.subtitle }}{{ $.Scratch.Set "subtitle" .Site.Params.subtitle }}{{ end }} - {{ if .Site.Params.bigimg }}{{ $.Scratch.Set "bigimg" .Site.Params.bigimg }}{{ end }} -{{ else }} - {{ $.Scratch.Set "title" .Title }} - {{ if .Params.subtitle }}{{ $.Scratch.Set "subtitle" .Params.subtitle }}{{ end }} - {{ if .Params.bigimg }}{{ $.Scratch.Set "bigimg" .Params.bigimg }}{{ end }} -{{ end }} -{{ $bigimg := $.Scratch.Get "bigimg" }} -{{ $title := $.Scratch.Get "title" }} -{{ $subtitle := $.Scratch.Get "subtitle" }} - -{{ if or $bigimg $title }} - {{ if $bigimg }} -
- {{ end }} - -
- {{ if $bigimg }} -
- {{ $subtitle := $.Scratch.Get "subtitle" }} -
-
-
-
-

{{ with $.Scratch.Get "title" }}{{.}}{{ else }}
{{ end }}

- {{ if $subtitle }} - {{ if eq .Type "page" }} -
- {{ $subtitle }} - {{ else }} -

{{ $subtitle }}

- {{ end }} - {{ end }} - {{ if eq .Type "post" }} - {{ partial "post_meta.html" . }} - {{ end }} -
-
-
-
- -
- {{end}} -
-
-
-
-
- {{ if eq .Type "list" }} -

{{ if .Data.Singular }}#{{ end }}{{ .Title }}

- {{ else }} -

{{ with $title }}{{.}}{{ else }}
{{ end }}

- {{ end }} - {{ if ne .Type "post" }} -
- {{ end }} - {{ if $subtitle }} - {{ if eq .Type "page" }} - {{ $subtitle }} - {{ else }} -

{{ $subtitle }}

- {{ end }} - {{ end }} - {{ if eq .Type "post" }} - {{ partial "post_meta.html" . }} - {{ end }} -
-
+
+
+
+
+
+

{{ .Title | default .Site.Title }}

+
-
-{{ else }} -
-{{ end }} +
+
\ No newline at end of file diff --git a/website/themes/beautifulhugo/layouts/partials/page_meta.html b/website/themes/beautifulhugo/layouts/partials/page_meta.html deleted file mode 100644 index ac9661b..0000000 --- a/website/themes/beautifulhugo/layouts/partials/page_meta.html +++ /dev/null @@ -1,8 +0,0 @@ -
- {{ $lastmodstr := default (i18n "dateFormat") .Site.Params.dateformat | .Lastmod.Format }} - {{ $datestr := default (i18n "dateFormat") .Site.Params.dateformat | .Date.Format }} - {{ if ne $datestr $lastmodstr }} - {{ $lastmodstr | i18n "lastModified" }} - {{ end }} -
- diff --git a/website/themes/beautifulhugo/layouts/partials/post_meta.html b/website/themes/beautifulhugo/layouts/partials/post_meta.html deleted file mode 100644 index a43e844..0000000 --- a/website/themes/beautifulhugo/layouts/partials/post_meta.html +++ /dev/null @@ -1,29 +0,0 @@ - - diff --git a/website/themes/beautifulhugo/layouts/partials/post_preview.html b/website/themes/beautifulhugo/layouts/partials/post_preview.html index 0411585..9df51ef 100644 --- a/website/themes/beautifulhugo/layouts/partials/post_preview.html +++ b/website/themes/beautifulhugo/layouts/partials/post_preview.html @@ -1,24 +1,8 @@

{{ .Title }}

- {{ if .Params.subtitle }} -

- {{ .Params.subtitle }} -

- {{ end }} - {{ if .Params.image }} - {{ .Title }} - {{ end }} - {{ if .Params.video }} - - {{ end }}
-
{{ if .Truncated }} {{ .Summary }} From 22415181812a34c7a5417ff34973ab88fa6bde0a Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 09 2023 12:58:15 +0000 Subject: [PATCH 17/26] add parameter to keep srpm in results --- diff --git a/build.py b/build.py index 045a544..4ebf4c2 100755 --- a/build.py +++ b/build.py @@ -49,6 +49,13 @@ def main(): help="Keep SRPMs in /srpms", ) parser.add_argument( + "--store-srpms-in-results", + default=False, + action="store_true", + dest="srpms_in_results", + help="Store srpms in results folder (useful for automation)", + ) + parser.add_argument( "-f", "--force", default=False, @@ -76,7 +83,10 @@ def main(): srpm_regex = re.compile(f"^{args.filter}$") packages_folder = f"./results/{args.results}/packages/" - srpms_path = os.path.abspath("./srpms/") + if args.srpms_in_results: + srpms_path = f"./results/{args.results}/srpms/" + else: + srpms_path = os.path.abspath("./srpms/") if not os.path.exists(packages_folder): os.makedirs(packages_folder) From 9833845e4bada0e8d5e8adf96e235ffc70887b1d Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 09 2023 13:18:55 +0000 Subject: [PATCH 18/26] reduce the header size --- diff --git a/website/themes/beautifulhugo/static/css/main.css b/website/themes/beautifulhugo/static/css/main.css index f2ae82a..401aa73 100644 --- a/website/themes/beautifulhugo/static/css/main.css +++ b/website/themes/beautifulhugo/static/css/main.css @@ -176,7 +176,7 @@ img { @media only screen and (min-width: 768px) { .navbar-custom { - padding: 20px 0; + height: 52px; -webkit-transition: background .5s ease-in-out,padding .5s ease-in-out; -moz-transition: background .5s ease-in-out,padding .5s ease-in-out; transition: background .5s ease-in-out,padding .5s ease-in-out; @@ -556,7 +556,7 @@ footer .theme-by { } @media only screen and (min-width: 768px) { .intro-header { - margin-top: 130px; + margin-top: 60px; } .intro-header.big-img { margin-top: 91px; /* Full navbar is small navbar + 20px padding on 
each side when expanded */ @@ -565,11 +565,6 @@ footer .theme-by { .intro-header.big-img .post-heading { padding: 150px 0; } - .intro-header .page-heading h1, - .intro-header .tags-heading h1, - .intro-header .categories-heading h1 { - font-size: 80px; - } .intro-header .post-heading h1 { font-size: 50px; } From 9a1e08fddcf416ee805447d6ef0c2197bd11d98c Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 09 2023 14:31:58 +0000 Subject: [PATCH 19/26] restore translation memories --- diff --git a/build_website.py b/build_website.py index ab00ca0..65e39dd 100755 --- a/build_website.py +++ b/build_website.py @@ -159,7 +159,7 @@ def main(): generate_territory_index(package_statistics_file, cldr_territories[code], code, cldr_territories_info.get(code, {}), cldr_version) log.info("Copy translation memories") - """languages = [ + languages = [ f for f in os.listdir(tm_folder) if os.path.isfile(os.path.join(tm_folder, f)) ] for lang in languages: @@ -167,7 +167,7 @@ def main(): shutil.copyfile( os.path.join(tm_folder, lang), os.path.join(static_tm_folder, lang) ) -""" + log.info("done") From 259ad21c8c6d2b8e9762c6e026656749d8114674 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 09 2023 14:34:04 +0000 Subject: [PATCH 20/26] allow to sort all tables add default sorting in python use global javascript to do so add class to all tables use new hugo >= 0.108 to add a class to a block. 
Here we add the sortable class by adding {.sortable} the line after the table --- diff --git a/Dockerfile b/Dockerfile index d21d76e..80cb12e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ -FROM registry.fedoraproject.org/fedora:38 +FROM registry.fedoraproject.org/fedora:39 -RUN dnf install -y lbzip2 unzip xz cpio dnf-plugins-core rsync python3-pip hugo gettext git rubygem-asciidoctor glibc-gconv-extra +RUN dnf install -y lbzip2 unzip xz cpio dnf-plugins-core rsync python3-pip hugo gettext git glibc-gconv-extra COPY requirements.txt /src/requirements.txt RUN pip install --no-cache -r /src/requirements.txt diff --git a/README.md b/README.md index f060db3..c3541c7 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Please refer to each Fedora package licenses to get all licenses. ```bash dnf install podman -./runall.sh 34 +./runall.sh 39 ```` This takes from 10 to 20 hours to process, you may wish to reduce the number of packages to scan. @@ -29,5 +29,5 @@ To do that, read the `./runall.sh` comments. 
# Run it anywhere ```bash -podman run -it --rm -v ./:/src:z -v ./results:/src/results:z -e DNF_CONF=dnf_${release}.conf -e TMP_DIR=/src/results/f${release}/tmp fedlocstats:34 $script +podman run -it --rm -v ./:/src:z -v ./results:/src/results:z -e DNF_CONF=dnf_${release}.conf -e TMP_DIR=/src/results/f${release}/tmp fedlocstats:39 $script ``` diff --git a/build_website.py b/build_website.py index 65e39dd..6eb47f3 100755 --- a/build_website.py +++ b/build_website.py @@ -195,6 +195,9 @@ def generate_static_pages_langs(results: str, code: str, content: dict, destinat if len(territories) > 0: data["territories"] = territories + # sort content + data["packages"] = sorted(data["packages"].items(), key=lambda x: x[1]['progress'], reverse=True) + apply_jinja_template(data, destination_file, "language.md") @@ -223,6 +226,9 @@ def generate_static_pages_packages(release: str, package: str, statistics: dict, path = f"./results/{release}/packages/{package}/" data["stats"]["languages"][lang]["filename"] = data["stats"]["languages"][lang]["filename"].replace(path, " ") + # sort content + data["stats"]["languages"] = sorted(data["stats"]["languages"].items(), key=lambda x: x[1]['progress'], reverse=True) + apply_jinja_template(data, destination_file, "package.md") diff --git a/templates/language.md b/templates/language.md index ed449ec..32fb6a0 100644 --- a/templates/language.md +++ b/templates/language.md @@ -36,12 +36,13 @@ Packages: | Name | Translated words | Total source words | Progress (%) | Language teams | |------|------------------:|-------------------:|-------------:|----------------| -{% for package in packages %} +{% for package, stats in packages %} {%- set output = " | [" ~ package ~ "]({{< ref \"/" ~ results ~ "/package/" ~ package ~ ".md\" >}})" -%} -{%- set output = output ~ " | " ~ packages[package].translatedsourcewords -%} -{%- set output = output ~ " | " ~ packages[package].totalsourcewordssum -%} -{%- set output = output ~ " | " ~ 
'{:.1f}'.format(packages[package].progress) -%} -{%- set output = output ~ " | " ~ packages[package].team -%} +{%- set output = output ~ " | " ~ stats.translatedsourcewords -%} +{%- set output = output ~ " | " ~ stats.totalsourcewordssum -%} +{%- set output = output ~ " | " ~ '{:.1f}'.format(stats.progress) -%} +{%- set output = output ~ " | " ~ stats.team -%} {%- set output = output ~ " | " -%} {{ output }} -{% endfor -%} \ No newline at end of file +{% endfor -%} +{.sortable} \ No newline at end of file diff --git a/templates/package.md b/templates/package.md index e589cc3..33ecfb0 100644 --- a/templates/package.md +++ b/templates/package.md @@ -12,12 +12,13 @@ The package {{ package }}: | Language | Translated words | Total source words | Progress (%) | Files | |----------|-----------------:|-------------------:|-------------:|-------| -{% for stat in stats.languages|sort -%} -{%- set output = " | [" ~ stat ~ "]({{< ref \"/" ~ results ~ "/language/" ~ stat ~ ".md\" >}})" -%} -{%- set output = output ~ " | " ~ stats.languages[stat].translatedsourcewords -%} -{%- set output = output ~ " | " ~ stats.languages[stat].totalsourcewordssum -%} -{%- set output = output ~ " | " ~ '{:.1f}'.format(stats.languages[stat].progress) -%} -{%- set output = output ~ " | " ~ stats.languages[stat].filename -%} +{% for lang, stat in stats.languages -%} +{%- set output = " | [" ~ lang ~ "]({{< ref \"/" ~ results ~ "/language/" ~ lang ~ ".md\" >}})" -%} +{%- set output = output ~ " | " ~ stat.translatedsourcewords -%} +{%- set output = output ~ " | " ~ stat.totalsourcewordssum -%} +{%- set output = output ~ " | " ~ '{:.1f}'.format(stat.progress) -%} +{%- set output = output ~ " | " ~ stat.filename -%} {%- set output = output ~ " | " -%} {{ output }} -{% endfor -%} \ No newline at end of file +{% endfor -%} +{.sortable} \ No newline at end of file diff --git a/todo.md b/todo.md index a300187..e2e843a 100644 --- a/todo.md +++ b/todo.md @@ -15,14 +15,9 @@ move error detection (check_lang) 
into %language%/stats.json and display erros move error files into %language%/stats.json and make these accessible via website remove terminology (someone who wants it can do it locally) -# build_stats.py - -when %package%/stats.json is empty, make sure it is counted as an existing package for which we were not able to extract anything (release stats) - # website list why we could not deduct error files -allow sort on all tables # global diff --git a/website/config.toml b/website/config.toml index 7f37113..b98b170 100644 --- a/website/config.toml +++ b/website/config.toml @@ -12,4 +12,12 @@ since = "2021" favicon = "img/favicon.ico" [taxonomies] -countries = "territories" \ No newline at end of file +countries = "territories" + + +[markup] + [markup.goldmark] + [markup.goldmark.parser] + wrapStandAloneImageWithinParagraph = false + [markup.goldmark.parser.attribute] + block = true \ No newline at end of file diff --git a/website/themes/beautifulhugo/layouts/_default/list_languages.html b/website/themes/beautifulhugo/layouts/_default/list_languages.html index ec74b82..4e862b4 100644 --- a/website/themes/beautifulhugo/layouts/_default/list_languages.html +++ b/website/themes/beautifulhugo/layouts/_default/list_languages.html @@ -6,13 +6,13 @@
{{ .Content }} - +
- - - - + + + + {{ range sort .Pages "Title" "asc" }} @@ -25,60 +25,4 @@
Click on columns headers to sort values
codeEnglish nameLocal nameProgress (%)codeEnglish nameLocal nameProgress (%)
- {{ end }} \ No newline at end of file diff --git a/website/themes/beautifulhugo/layouts/_default/list_packages.html b/website/themes/beautifulhugo/layouts/_default/list_packages.html index beb53b5..8276d0f 100644 --- a/website/themes/beautifulhugo/layouts/_default/list_packages.html +++ b/website/themes/beautifulhugo/layouts/_default/list_packages.html @@ -13,7 +13,7 @@
  • No language: number of translation files for which no language could be deducted
  • - +
    diff --git a/website/themes/beautifulhugo/layouts/partials/head_custom.html b/website/themes/beautifulhugo/layouts/partials/head_custom.html index 833d64a..f727da0 100644 --- a/website/themes/beautifulhugo/layouts/partials/head_custom.html +++ b/website/themes/beautifulhugo/layouts/partials/head_custom.html @@ -4,6 +4,7 @@ Do not put anything in this file - it's only here so that hugo won't throw an er --> + diff --git a/website/themes/beautifulhugo/layouts/territories/list.html b/website/themes/beautifulhugo/layouts/territories/list.html index a99eb32..d82182a 100644 --- a/website/themes/beautifulhugo/layouts/territories/list.html +++ b/website/themes/beautifulhugo/layouts/territories/list.html @@ -13,7 +13,7 @@
  • First Fedora version: first fedora version with at least a partial coverage of this territory
  • -
    Package Language
    +
    From fe35c1cea58005d6498bd40088e5485fe19a8de3 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 09 2023 19:20:54 +0000 Subject: [PATCH 21/26] package stats: fix total source word computing the way pandas were used were summing all values, let's keep it simple by using loops --- diff --git a/build_stats.py b/build_stats.py index 1d840c6..32ab76d 100755 --- a/build_stats.py +++ b/build_stats.py @@ -105,6 +105,22 @@ def compute_package_statistics(df: pd.DataFrame) -> dict: return results +def compute_package_totalsourcewords(stats: dict) -> int: + """ compute the total source words for a package """ + log = logging.getLogger("buildStats.compute_package_totalsourcewords") + + langs = {} + for file, stat in stats.items(): + total = langs.get(stat["lang_code_chosen"], 0) + + langs[stat["lang_code_chosen"]] = total + stat["translatedsourcewords"] + stat["fuzzysourcewords"] + stat["untranslatedsourcewords"] + + if "error" in langs.keys(): + del langs["error"] + + return max(langs.values()) + + def main(): """Handle params""" @@ -168,7 +184,7 @@ def main(): df = pd.DataFrame.from_dict(stats["po"], orient='index') stats["stats"] = compute_package_statistics(df) - stats["totalsourcewords"] = df[["translatedsourcewords", "fuzzysourcewords", "untranslatedsourcewords"]].sum().sum() + stats["totalsourcewords"] = compute_package_totalsourcewords(stats["po"]) with open(stats_file, "w") as f: json.dump(stats, f, indent=2, cls=NumpyEncoder) From dad0cbcc5363895ce110d35d9a4c723238e5e1d6 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 09 2023 19:41:44 +0000 Subject: [PATCH 22/26] website: display errors different types of errors can be seen while processing the stats let's share this information on the website, to make debugging easier also took the opportunity to have the release define on all hugo content --- diff --git a/build_website.py b/build_website.py index 6eb47f3..5df4da4 100755 --- a/build_website.py +++ b/build_website.py @@ -156,7 +156,7 
@@ def main(): # prevent containers and alternative names to be included if code in cldr_territories_info.keys(): package_statistics_file = os.path.join(static_territories_folder, code, "_index.md") - generate_territory_index(package_statistics_file, cldr_territories[code], code, cldr_territories_info.get(code, {}), cldr_version) + generate_territory_index(args.results, package_statistics_file, cldr_territories[code], code, cldr_territories_info.get(code, {}), cldr_version) log.info("Copy translation memories") languages = [ @@ -171,7 +171,7 @@ def main(): log.info("done") -def generate_static_pages_langs(results: str, code: str, content: dict, destination_file: str, territories: list[str], tm_folder: str, static_tm_folder: str) -> None: +def generate_static_pages_langs(release: str, code: str, content: dict, destination_file: str, territories: list[str], tm_folder: str, static_tm_folder: str) -> None: """ Aggregate info and call language template """ log = logging.getLogger("buildWebsite.generate_static_pages_langs") data = content @@ -180,21 +180,29 @@ def generate_static_pages_langs(results: str, code: str, content: dict, destinat ) data["lang_name_local"] = langtable.language_name(languageId=code) data["scripts"] = langtable.list_scripts(languageId=code) - data["results"] = results + data["release"] = release data["lang_code"] = code data["now"] = datetime.datetime.utcnow() data["files"] = defaultdict(dict) - data["files"]["compendium"]["url"] = f"/{results}/{code}.po.gz" + data["files"]["compendium"]["url"] = f"/{release}/{code}.po.gz" data["files"]["compendium"]["size"] = os.path.getsize(os.path.join(tm_folder, f"{code}.po.gz")) - data["files"]["terminology"]["url"] = f"/{results}/{code}.terminology.po.gz" + data["files"]["terminology"]["url"] = f"/{release}/{code}.terminology.po.gz" data["files"]["terminology"]["size"] = os.path.getsize(os.path.join(tm_folder, f"{code}.terminology.po.gz")) - data["files"]["tmx"]["url"] = f"/{results}/{code}.tmx.gz" + 
data["files"]["tmx"]["url"] = f"/{release}/{code}.tmx.gz" data["files"]["tmx"]["size"] = os.path.getsize(os.path.join(tm_folder, f"{code}.tmx.gz")) - data["files"]["csv"]["url"] = f"/{results}/{code}.csv" + data["files"]["csv"]["url"] = f"/{release}/{code}.csv" data["files"]["csv"]["size"] = os.path.getsize(os.path.join(static_tm_folder, f"{code}.csv")) if len(territories) > 0: data["territories"] = territories + data["could_not_process_count"] = sum(value["could_not_process"] == 1 for value in data["po"]) + data["polib_error_count"] = sum(value["polib_error"] != "" for value in data["po"]) + + # remove local path + for file in data["po"]: + path = f"./results/{release}/packages/{file['package']}/" + file["filename"] = file["filename"].replace(path, " ") + # sort content data["packages"] = sorted(data["packages"].items(), key=lambda x: x[1]['progress'], reverse=True) @@ -205,7 +213,7 @@ def generate_static_pages_packages(release: str, package: str, statistics: dict, """ Aggregate info and call package template """ log = logging.getLogger("buildWebsite.generate_static_pages_packages") data = statistics - data["results"] = release + data["release"] = release data["package"] = package data["now"] = datetime.datetime.utcnow() @@ -214,6 +222,10 @@ def generate_static_pages_packages(release: str, package: str, statistics: dict, data["stats"] = {} data["stats"]["languages"] = {} + # in some rare cases, a package may have no file + if "po" not in statistics.keys(): + data["po"] = {} + if "error" in data["stats"]["languages"].keys(): data["started_languages"] = len(data["stats"]["languages"]) - 1 data["no_languages"] = len(data["stats"]["languages"]["error"]["filename"].split("./")) - 1 @@ -221,11 +233,19 @@ def generate_static_pages_packages(release: str, package: str, statistics: dict, data["started_languages"] = len(data["stats"]["languages"]) data["no_languages"] = 0 + data["could_not_process_count"] = sum(data["po"][value]["could_not_process"] == 1 for value in 
data["po"]) + data["polib_error_count"] = sum(data["po"][value]["polib_error"] != "" for value in data["po"]) + # remove local path + path = f"./results/{release}/packages/{package}/" + for lang in data["stats"]["languages"].keys(): - path = f"./results/{release}/packages/{package}/" data["stats"]["languages"][lang]["filename"] = data["stats"]["languages"][lang]["filename"].replace(path, " ") + for file, stats in data["po"].copy().items(): + data["po"][file.replace(path, " ")] = stats + del data["po"][file] + # sort content data["stats"]["languages"] = sorted(data["stats"]["languages"].items(), key=lambda x: x[1]['progress'], reverse=True) @@ -255,18 +275,19 @@ def generate_package_index(distribution: str, destination_file: str) -> None: """ Aggregate info and call package index template """ log = logging.getLogger("buildWebsite.generate_package_index") data = dict() - data["distribution"] = distribution + data["release"] = distribution data["now"] = datetime.datetime.utcnow() apply_jinja_template(data, destination_file, "_index.package.md") -def generate_territory_index(destination_file: str, name: list[str], code: str, data: dict, cldr_version: str) -> None: +def generate_territory_index(release: str, destination_file: str, name: list[str], code: str, data: dict, cldr_version: str) -> None: """ Aggregate info and call territory index template """ log = logging.getLogger("buildWebsite.generate_package_index") data["name"] = name data["code"] = code data["cldr_version"] = cldr_version + data["release"] = release apply_jinja_template(data, destination_file, "_index.territory.md") diff --git a/templates/_index.language.md b/templates/_index.language.md index 75291bc..5ad6d2a 100644 --- a/templates/_index.language.md +++ b/templates/_index.language.md @@ -1,5 +1,6 @@ --- title: "Languages for {{ release }}" +release: {{ release }} date: {{ now }} layout: "list_languages" --- \ No newline at end of file diff --git a/templates/_index.package.md 
b/templates/_index.package.md index 0c1f25b..3560416 100644 --- a/templates/_index.package.md +++ b/templates/_index.package.md @@ -1,7 +1,8 @@ --- -title: "Packages for {{ distribution }}" +title: "Packages for {{ release }}" date: {{ now }} layout: "list_packages" +release: {{ release }} --- This listing aims at making it easy to find packages with files for which no languages could be deducted. \ No newline at end of file diff --git a/templates/_index.release.md b/templates/_index.release.md index d083669..2a51ce4 100644 --- a/templates/_index.release.md +++ b/templates/_index.release.md @@ -2,6 +2,7 @@ title: "Statistics for {{ release }}" date: {{ now }} layout: "release" +release: {{ release }} --- Fedora {{ release }}: diff --git a/templates/_index.territory.md b/templates/_index.territory.md index ad6e959..6cce853 100644 --- a/templates/_index.territory.md +++ b/templates/_index.territory.md @@ -1,5 +1,6 @@ --- title: "{{ code }} {{ name }}" +release: {{ release }} --- Data coming from Unicode consortium (CLDR {{ cldr_version }}): diff --git a/templates/language.md b/templates/language.md index 32fb6a0..bd3042e 100644 --- a/templates/language.md +++ b/templates/language.md @@ -1,28 +1,30 @@ --- -title: "{{ lang_code }}-{{ lang_name_en }} ({{ lang_name_local }})" +title: "{{ lang_code }}-{{ lang_name_en }} ({{ lang_name_local }}) - translation progress for {{ release }}" date: {{ now }} code: {{ lang_code }} name_english: {{ lang_name_en }} name_local: {{ lang_name_local }} progress: {{ '{:.2f}'.format(progress) }} progress_d: {{ '{:.2f}'.format(progress_d) }} -release: {{ results }} +release: {{ release }} {%- if territories %} territories: {%- for territory in territories %} - {{ territory }} {%- endfor %} {%- endif %} +polib_error_count: {{ polib_error_count }} +could_not_process_count: {{ could_not_process_count }} --- -Language progress for {{ lang_name_en }} ({{ lang_code }}) in Fedora {{ results }} is: +Language progress for {{ lang_name_en }} ({{ 
lang_code }}) in Fedora {{ release }} is: * {{ '{:.2f}'.format(progress) }}% when we only look on started packages for this language. -* {{ '{:.2f}'.format(progress_d) }}% when we compare to every single translatable string in Fedora {{ results }}. +* {{ '{:.2f}'.format(progress_d) }}% when we compare to every single translatable string in Fedora {{ release }}. Possible scripts are: {% for script in scripts -%}{{ script }} {%- endfor %} -* Total translatable string in Fedora {{ results }}: {{ totalsourcewords_d }} +* Total translatable string in Fedora {{ release }}: {{ totalsourcewords_d }} * Source words to translate in started packages: {{ totalsourcewordssum }} * Translated words: {{ translatedsourcewords }} @@ -37,7 +39,7 @@ Packages: | Name | Translated words | Total source words | Progress (%) | Language teams | |------|------------------:|-------------------:|-------------:|----------------| {% for package, stats in packages %} -{%- set output = " | [" ~ package ~ "]({{< ref \"/" ~ results ~ "/package/" ~ package ~ ".md\" >}})" -%} +{%- set output = " | [" ~ package ~ "]({{< ref \"/" ~ release ~ "/package/" ~ package ~ ".md\" >}})" -%} {%- set output = output ~ " | " ~ stats.translatedsourcewords -%} {%- set output = output ~ " | " ~ stats.totalsourcewordssum -%} {%- set output = output ~ " | " ~ '{:.1f}'.format(stats.progress) -%} @@ -45,4 +47,43 @@ Packages: {%- set output = output ~ " | " -%} {{ output }} {% endfor -%} -{.sortable} \ No newline at end of file +{.sortable} + +{% if polib_error_count > 0 or could_not_process_count > 0 %} +# Errors on PO files + +{% if polib_error_count > 0 %} +## Error with polib +We use the po metadata to get the language code and the team, but sometimes it fails, here are the files. 
+ +| Package | Lang from file path/name | polib error | Team | +|---------|--------------------------|-------------|------| +{% for file in po if file.polib_error != "" -%} +{%- set output = " | [" ~ file.package ~ "]({{< ref \"/" ~ release ~ "/package/" ~ file.package ~ ".md\" >}})" -%} +{%- set output = output ~ " | " ~ file.filename -%} +{%- set output = output ~ " | " ~ file.polib_error -%} +{%- set output = output ~ " | " ~ file.metadata_language_team -%} +{%- set output = output ~ " | " -%} +{{ output }} +{% endfor -%} +{.sortable} +{% endif %} + +{% if could_not_process_count > 0 %} +## Error with calcstats + +We use the calcstats from translate toolkit to get the translation progress, but sometimes it fails, here are the files: + +| Package | Lang from file path/name | Team | +|---------|--------------------------|------| +{% for file in po if file.could_not_process == 1 -%} +{%- set output = " | [" ~ file.package ~ "]({{< ref \"/" ~ release ~ "/package/" ~ file.package ~ ".md\" >}})" -%} +{%- set output = output ~ " | " ~ file.filename -%} +{%- set output = output ~ " | " ~ file.metadata_language_team -%} +{%- set output = output ~ " | " -%} +{{ output }} +{% endfor -%} +{.sortable} +{% endif %} + +{% endif %} \ No newline at end of file diff --git a/templates/package.md b/templates/package.md index 33ecfb0..b6abc6d 100644 --- a/templates/package.md +++ b/templates/package.md @@ -1,19 +1,25 @@ --- -title: "{{ package }}" +title: "{{ package }} - translation progress for {{ release }}" +package: {{ package }} date: {{ now }} started_languages: {{ started_languages }} no_languages: {{ no_languages }} +polib_error_count: {{ polib_error_count }} +could_not_process_count: {{ could_not_process_count }} +release: {{ release }} --- The package {{ package }}: -* represents {{ totalsourcewords }} source words to be translated, -* is translated into {{ stats.languages|length }} languages in Fedora {{ results }}, -* contains {{ no_languages }} files for which no 
languages could be deducted. +* represents {{ totalsourcewords }} source words to be translated +* is translated into {{ stats.languages|length }} languages in Fedora {{ release }} +{% if no_languages > 0 %}* no languages could be deducted for {{ no_languages }} files {% endif %} +{% if polib_error_count > 0 %}* polib could not open {{ polib_error_count }} files to extract metadata{% endif %} +{% if could_not_process_count > 0 %}* contains {{ could_not_process_count }} po stats could not be processes{% endif %} | Language | Translated words | Total source words | Progress (%) | Files | |----------|-----------------:|-------------------:|-------------:|-------| {% for lang, stat in stats.languages -%} -{%- set output = " | [" ~ lang ~ "]({{< ref \"/" ~ results ~ "/language/" ~ lang ~ ".md\" >}})" -%} +{%- set output = " | [" ~ lang ~ "]({{< ref \"/" ~ release ~ "/language/" ~ lang ~ ".md\" >}})" -%} {%- set output = output ~ " | " ~ stat.translatedsourcewords -%} {%- set output = output ~ " | " ~ stat.totalsourcewordssum -%} {%- set output = output ~ " | " ~ '{:.1f}'.format(stat.progress) -%} @@ -21,4 +27,59 @@ The package {{ package }}: {%- set output = output ~ " | " -%} {{ output }} {% endfor -%} -{.sortable} \ No newline at end of file +{.sortable} + +{% if polib_error_count > 0 or could_not_process_count > 0 or no_languages > 0 %} +# Errors on PO files +{% if no_languages > 0 %} +## Errors on language deduction +It were not possible to properly deduct the language code for the following files. 
+ +| Lang from file path/name | Lang from file path/name | team from po metadata | polib error | +|--------------------------|--------------------------|-----------------------|-------------| +{% for file in po if po[file].lang_code_chosen == "error" -%} +{%- set output = " | " ~ file -%} +{%- set output = output ~ " | " ~ po[file].lang_in_path -%} +{%- set output = output ~ " | " ~ po[file].metadata_language_team -%} +{%- set output = output ~ " | " ~ po[file].polib_error -%} +{%- set output = output ~ " | " -%} +{{ output }} +{% endfor -%} +{.sortable} +{% endif %} + +{% if polib_error_count > 0 %} +## Error with polib +We use the po metadata to get the language code and the team, but sometimes it fails, here are the files. + +| Lang from file path/name | Lang code chosen | polib error | Team | +|--------------------------|------------------|-------------|------| +{% for file in po if po[file].polib_error != "" -%} +{%- set output = output ~ " | " ~ file -%} +{%- set output = output ~ " | " ~ po[file].lang_code_chosen -%} +{%- set output = output ~ " | " ~ po[file].polib_error -%} +{%- set output = output ~ " | " ~ po[file].metadata_language_team -%} +{%- set output = output ~ " | " -%} +{{ output }} +{% endfor -%} +{.sortable} +{% endif %} + +{% if could_not_process_count > 0 %} +## Error with calcstats + +We use the calcstats from translate toolkit to get the translation progress, but sometimes it fails, here are the files: + +| Package | Lang code chosen | Team | +|---------|------------------|------| +{% for file in po if po[file].could_not_process == 1 -%} +{%- set output = output ~ " | " ~ file -%} +{%- set output = output ~ " | " ~ po[file].lang_code_chosen -%} +{%- set output = output ~ " | " ~ po[file].metadata_language_team -%} +{%- set output = output ~ " | " -%} +{{ output }} +{% endfor -%} +{.sortable} +{% endif %} + +{% endif %} \ No newline at end of file diff --git a/todo.md b/todo.md index e2e843a..632c09f 100644 --- a/todo.md +++ b/todo.md @@ 
-11,14 +11,9 @@ direct call to: # build_tm.py -move error detection (check_lang) into %language%/stats.json and display erros -move error files into %language%/stats.json and make these accessible via website remove terminology (someone who wants it can do it locally) # website -list why we could not deduct error files - -# global - - +territory: table per spoken languages +language: table across releases \ No newline at end of file diff --git a/website/themes/beautifulhugo/layouts/_default/list_languages.html b/website/themes/beautifulhugo/layouts/_default/list_languages.html index 4e862b4..19d365b 100644 --- a/website/themes/beautifulhugo/layouts/_default/list_languages.html +++ b/website/themes/beautifulhugo/layouts/_default/list_languages.html @@ -5,21 +5,35 @@
    {{ .Content }} - +

    + Column explanation: +

      +
    • Progress vs started packages (%): compare translated source words to every source word detected in each package
    • +
    • Progress vs release: compare translated source words to every source word detected in {{ .Params.release }}
    • +
    • File parsing errors: file for which po stats could not be extracted
    • +
    • Polib errors: polib could not open the file to extract metadata
    • +
    +

    Territory First Fedora version
    - + - + + + + {{ range sort .Pages "Title" "asc" }} + + + {{ end }}
    Click on columns headers to sort values
    codeCode English name Local nameProgress (%)Progress vs started packages (%)Progress vs release (%)File parsing errorsPolib errors
    {{ .Params.code }} {{ .Params.name_english }} {{ .Params.name_local }}{{ .Params.progress }} {{ .Params.progress_d }}{{ .Params.could_not_process_count }}{{ .Params.polib_error_count }}
    diff --git a/website/themes/beautifulhugo/layouts/_default/list_packages.html b/website/themes/beautifulhugo/layouts/_default/list_packages.html index 8276d0f..0ff2f9f 100644 --- a/website/themes/beautifulhugo/layouts/_default/list_packages.html +++ b/website/themes/beautifulhugo/layouts/_default/list_packages.html @@ -11,19 +11,26 @@
  • Package: package name in Fedora operating system
  • Languages: number of identified languages
  • No language: number of translation files for which no language could be deducted
  • +
  • File parsing errors: file for which po stats could not be extracted
  • +
  • Polib errors: polib could not open the file to extract metadata
  • + + + {{ range sort .Pages "Title" "asc" }} - + + + {{ end }}
    Click on columns headers to sort values
    Package Language No languageFile parsing errorsPolib errors
    {{ .Title }}{{ .Params.package }} {{ .Params.started_languages }} {{ .Params.no_languages }}{{ .Params.could_not_process_count }}{{ .Params.polib_error_count }}
    From d422d20dfd154ee54e2888fdc86a973dc34a1e4f Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 10 2023 05:36:32 +0000 Subject: [PATCH 23/26] Merge #46 `add parameter to keep srpm in results` --- diff --git a/build.py b/build.py index 045a544..4ebf4c2 100755 --- a/build.py +++ b/build.py @@ -49,6 +49,13 @@ def main(): help="Keep SRPMs in /srpms", ) parser.add_argument( + "--store-srpms-in-results", + default=False, + action="store_true", + dest="srpms_in_results", + help="Store srpms in results folder (useful for automation)", + ) + parser.add_argument( "-f", "--force", default=False, @@ -76,7 +83,10 @@ def main(): srpm_regex = re.compile(f"^{args.filter}$") packages_folder = f"./results/{args.results}/packages/" - srpms_path = os.path.abspath("./srpms/") + if args.srpms_in_results: + srpms_path = f"./results/{args.results}/srpms/" + else: + srpms_path = os.path.abspath("./srpms/") if not os.path.exists(packages_folder): os.makedirs(packages_folder) From 5153be1ad121b26c97c0b47dff2f67c28d940a5f Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 10 2023 10:56:05 +0000 Subject: [PATCH 24/26] rollback to f38 image with hugo 0.111.3 --- diff --git a/Dockerfile b/Dockerfile index 80cb12e..6adc98b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,12 @@ -FROM registry.fedoraproject.org/fedora:39 +FROM registry.fedoraproject.org/fedora:38 + +RUN dnf install -y lbzip2 unzip xz cpio dnf-plugins-core rsync python3-pip gettext git glibc-gconv-extra + +# f39 release don't work yet (pip install issue) +# use the same version as in f39 https://packages.fedoraproject.org/pkgs/hugo/hugo/ +ADD https://github.com/gohugoio/hugo/releases/download/v0.111.3/hugo_0.111.3_linux-amd64.tar.gz /tmp +RUN tar -C /usr/bin -xvf /tmp/hugo_0.111.3_linux-amd64.tar.gz -RUN dnf install -y lbzip2 unzip xz cpio dnf-plugins-core rsync python3-pip hugo gettext git glibc-gconv-extra COPY requirements.txt /src/requirements.txt RUN pip install --no-cache -r 
/src/requirements.txt From 01563f16573fae00c03e5bffeec69524cceae501 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 15 2023 19:07:37 +0000 Subject: [PATCH 25/26] use absolute path for srpms --- diff --git a/build.py b/build.py index 4ebf4c2..731ca7d 100755 --- a/build.py +++ b/build.py @@ -84,7 +84,7 @@ def main(): packages_folder = f"./results/{args.results}/packages/" if args.srpms_in_results: - srpms_path = f"./results/{args.results}/srpms/" + srpms_path = os.path.abspath(f"./results/{args.results}/srpms/") else: srpms_path = os.path.abspath("./srpms/") diff --git a/todo.md b/todo.md index 632c09f..3b44d20 100644 --- a/todo.md +++ b/todo.md @@ -16,4 +16,5 @@ remove terminology (someone who wants it can do it locally) # website territory: table per spoken languages -language: table across releases \ No newline at end of file +language: table across releases +add all languages without territories in CLDR to "ZZ/Unknown Region" \ No newline at end of file From f8d8b2191af5eb1c39e5d874f8eed8adba705351 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 16 2023 06:34:12 +0000 Subject: [PATCH 26/26] add missing js script sorttable is a MIT licensed code to allow sorting all our tables --- diff --git a/website/themes/beautifulhugo/static/js/sorttable.js b/website/themes/beautifulhugo/static/js/sorttable.js new file mode 100644 index 0000000..2383da1 --- /dev/null +++ b/website/themes/beautifulhugo/static/js/sorttable.js @@ -0,0 +1,494 @@ +/* + SortTable + version 2 + 7th April 2007 + Stuart Langridge, http://www.kryogenix.org/code/browser/sorttable/ + + Instructions: + Download this file + Add to your HTML + Add class="sortable" to any table you'd like to make sortable + Click on the headers to sort + + Thanks to many, many people for contributions and suggestions. + Licenced as X11: http://www.kryogenix.org/code/browser/licence.html + This basically means: do what you want with it. 
+*/ + + +var stIsIE = /*@cc_on!@*/false; + +sorttable = { + init: function() { + // quit if this function has already been called + if (arguments.callee.done) return; + // flag this function so we don't do the same thing twice + arguments.callee.done = true; + // kill the timer + if (_timer) clearInterval(_timer); + + if (!document.createElement || !document.getElementsByTagName) return; + + sorttable.DATE_RE = /^(\d\d?)[\/\.-](\d\d?)[\/\.-]((\d\d)?\d\d)$/; + + forEach(document.getElementsByTagName('table'), function(table) { + if (table.className.search(/\bsortable\b/) != -1) { + sorttable.makeSortable(table); + } + }); + + }, + + makeSortable: function(table) { + if (table.getElementsByTagName('thead').length == 0) { + // table doesn't have a tHead. Since it should have, create one and + // put the first table row in it. + the = document.createElement('thead'); + the.appendChild(table.rows[0]); + table.insertBefore(the,table.firstChild); + } + // Safari doesn't support table.tHead, sigh + if (table.tHead == null) table.tHead = table.getElementsByTagName('thead')[0]; + + if (table.tHead.rows.length != 1) return; // can't cope with two header rows + + // Sorttable v1 put rows with a class of "sortbottom" at the bottom (as + // "total" rows, for example). This is B&R, since what you're supposed + // to do is put them in a tfoot. So, if there are sortbottom rows, + // for backwards compatibility, move them to tfoot (creating it if needed). 
+ sortbottomrows = []; + for (var i=0; i5' : ' ▴'; + this.appendChild(sortrevind); + return; + } + if (this.className.search(/\bsorttable_sorted_reverse\b/) != -1) { + // if we're already sorted by this column in reverse, just + // re-reverse the table, which is quicker + sorttable.reverse(this.sorttable_tbody); + this.className = this.className.replace('sorttable_sorted_reverse', + 'sorttable_sorted'); + this.removeChild(document.getElementById('sorttable_sortrevind')); + sortfwdind = document.createElement('span'); + sortfwdind.id = "sorttable_sortfwdind"; + sortfwdind.innerHTML = stIsIE ? ' 6' : ' ▾'; + this.appendChild(sortfwdind); + return; + } + + // remove sorttable_sorted classes + theadrow = this.parentNode; + forEach(theadrow.childNodes, function(cell) { + if (cell.nodeType == 1) { // an element + cell.className = cell.className.replace('sorttable_sorted_reverse',''); + cell.className = cell.className.replace('sorttable_sorted',''); + } + }); + sortfwdind = document.getElementById('sorttable_sortfwdind'); + if (sortfwdind) { sortfwdind.parentNode.removeChild(sortfwdind); } + sortrevind = document.getElementById('sorttable_sortrevind'); + if (sortrevind) { sortrevind.parentNode.removeChild(sortrevind); } + + this.className += ' sorttable_sorted'; + sortfwdind = document.createElement('span'); + sortfwdind.id = "sorttable_sortfwdind"; + sortfwdind.innerHTML = stIsIE ? ' 6' : ' ▾'; + this.appendChild(sortfwdind); + + // build an array to sort. 
This is a Schwartzian transform thing, + // i.e., we "decorate" each row with the actual sort key, + // sort based on the sort keys, and then put the rows back in order + // which is a lot faster because you only do getInnerText once per row + row_array = []; + col = this.sorttable_columnindex; + rows = this.sorttable_tbody.rows; + for (var j=0; j 12) { + // definitely dd/mm + return sorttable.sort_ddmm; + } else if (second > 12) { + return sorttable.sort_mmdd; + } else { + // looks like a date, but we can't tell which, so assume + // that it's dd/mm (English imperialism!) and keep looking + sortfn = sorttable.sort_ddmm; + } + } + } + } + return sortfn; + }, + + getInnerText: function(node) { + // gets the text we want to use for sorting for a cell. + // strips leading and trailing whitespace. + // this is *not* a generic getInnerText function; it's special to sorttable. + // for example, you can override the cell text with a customkey attribute. + // it also gets .value for fields. + + if (!node) return ""; + + hasInputs = (typeof node.getElementsByTagName == 'function') && + node.getElementsByTagName('input').length; + + if (node.getAttribute("sorttable_customkey") != null) { + return node.getAttribute("sorttable_customkey"); + } + else if (typeof node.textContent != 'undefined' && !hasInputs) { + return node.textContent.replace(/^\s+|\s+$/g, ''); + } + else if (typeof node.innerText != 'undefined' && !hasInputs) { + return node.innerText.replace(/^\s+|\s+$/g, ''); + } + else if (typeof node.text != 'undefined' && !hasInputs) { + return node.text.replace(/^\s+|\s+$/g, ''); + } + else { + switch (node.nodeType) { + case 3: + if (node.nodeName.toLowerCase() == 'input') { + return node.value.replace(/^\s+|\s+$/g, ''); + } + case 4: + return node.nodeValue.replace(/^\s+|\s+$/g, ''); + break; + case 1: + case 11: + var innerText = ''; + for (var i = 0; i < node.childNodes.length; i++) { + innerText += sorttable.getInnerText(node.childNodes[i]); + } + return 
innerText.replace(/^\s+|\s+$/g, ''); + break; + default: + return ''; + } + } + }, + + reverse: function(tbody) { + // reverse the rows in a tbody + newrows = []; + for (var i=0; i=0; i--) { + tbody.appendChild(newrows[i]); + } + delete newrows; + }, + + /* sort functions + each sort function takes two parameters, a and b + you are comparing a[0] and b[0] */ + sort_numeric: function(a,b) { + aa = parseFloat(a[0].replace(/[^0-9.-]/g,'')); + if (isNaN(aa)) aa = 0; + bb = parseFloat(b[0].replace(/[^0-9.-]/g,'')); + if (isNaN(bb)) bb = 0; + return aa-bb; + }, + sort_alpha: function(a,b) { + if (a[0]==b[0]) return 0; + if (a[0] 0 ) { + var q = list[i]; list[i] = list[i+1]; list[i+1] = q; + swap = true; + } + } // for + t--; + + if (!swap) break; + + for(var i = t; i > b; --i) { + if ( comp_func(list[i], list[i-1]) < 0 ) { + var q = list[i]; list[i] = list[i-1]; list[i-1] = q; + swap = true; + } + } // for + b++; + + } // while(swap) + } +} + +/* ****************************************************************** + Supporting functions: bundled here to avoid depending on a library + ****************************************************************** */ + +// Dean Edwards/Matthias Miller/John Resig + +/* for Mozilla/Opera9 */ +if (document.addEventListener) { + document.addEventListener("DOMContentLoaded", sorttable.init, false); +} + +/* for Internet Explorer */ +/*@cc_on @*/ +/*@if (@_win32) + document.write("