From 96b6938f992cd3885340e88fb5f3f9f075fc2d63 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Nov 20 2020 06:31:40 +0000 Subject: [PATCH 1/7] fix empty folders issues --- diff --git a/build_stats.py b/build_stats.py index 4d9851b..ec93756 100755 --- a/build_stats.py +++ b/build_stats.py @@ -38,6 +38,7 @@ def main(): src_folder = os.path.join(packages_folder, package) dest_folder = os.path.join(packages_stats_folder, package) + if not os.path.exists(dest_folder): os.makedirs(dest_folder) @@ -60,26 +61,30 @@ def main(): for package in sorted(packages): count += 1 print(" {c}/{t} - {p}".format(c=count, t=len(packages), p=package)) - input_file = packages_folder + "{p}/stats.csv".format(p=package) - - try: - with open(input_file, 'r') as f: - lines = f.readlines() - - seen_lines = set() - with open(input_file, 'w') as f: - for line in lines: - if line not in seen_lines: - seen_lines.add(line) - f.write(line) - except FileNotFoundError: + package_dir = os.path.join(packages_stats_folder, "{p}".format(p=package)) + input_file = os.path.join(package_dir, "stats.csv") + + if not os.listdir(package_dir): + os.rmdir(package_dir) continue + with open(input_file, 'r') as f: + lines = f.readlines() + + seen_lines = set() + with open(input_file, 'w') as f: + for line in lines: + if line not in seen_lines: + seen_lines.add(line) + f.write(line) + print("Computing language stats") languages = [f for f in os.listdir(languages_folder)] count = 0 dest_folder = languages_stats_folder + if not os.path.exists(dest_folder): + os.makedirs(dest_folder) for language in sorted(languages): count += 1 From f05cd5df7f24a9a4fe595040718a83482573f24d Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Nov 22 2020 12:07:07 +0000 Subject: [PATCH 2/7] use direct call to pocount and store result in json --- diff --git a/build_stats.py b/build_stats.py index ec93756..02a4b97 100755 --- a/build_stats.py +++ b/build_stats.py @@ -9,6 +9,7 @@ import subprocess import shutil import tempfile +from translate.tools.pocount import calcstats def main(): """Handle params""" @@ -42,8 +43,7 @@ def main(): if not os.path.exists(dest_folder): os.makedirs(dest_folder) - stats_file = os.path.join(dest_folder, "stats.csv") - error_file = os.path.join(dest_folder, "stats.errors.txt") + stats_file = os.path.join(dest_folder, "stats.json") if os.path.isfile(stats_file): continue @@ -52,31 +52,7 @@ def main(): files = glob.glob(os.path.join(src_folder, discover["filemask"])) if discover["file_format"] == "po": - get_po_translation_level(files, stats_file, error_file) - elif discover["file_format"] == "json": - get_json_translation_level(files, os.path.join(src_folder, discover["template"]), stats_file, error_file) - - print(" Removing duplicates") - count = 0 - for package in sorted(packages): - count += 1 - print(" {c}/{t} - {p}".format(c=count, t=len(packages), p=package)) - package_dir = os.path.join(packages_stats_folder, "{p}".format(p=package)) - input_file = os.path.join(package_dir, "stats.csv") - - if not os.listdir(package_dir): - os.rmdir(package_dir) - continue - - with open(input_file, 'r') as f: - lines = f.readlines() - - seen_lines = set() - with open(input_file, 'w') as f: - for line in lines: - if line not in seen_lines: - seen_lines.add(line) - f.write(line) + get_po_translation_level(files, stats_file) print("Computing language stats") languages = [f for f in os.listdir(languages_folder)] @@ -94,59 +70,43 @@ def main(): with open(os.path.join(languages_folder, language), 'r') as f: discoveries = json.load(f) - stats_file = os.path.join(dest_folder, lang + ".stats.csv") - error_file = os.path.join(dest_folder, lang + ".stats.errors.txt") + stats_file = os.path.join(dest_folder, lang + ".stats.json") if os.path.isfile(stats_file): continue files = discoveries.get("po", []) if files: - get_po_translation_level(files, stats_file, error_file) + get_po_translation_level(files, stats_file) -def get_po_translation_level(files, stats_file, error_file): +def get_po_translation_level(files, stats_file): """ Compute results """ - - with open(stats_file, 'a') as stats: - with open(error_file, 'a') as error: - try: - subprocess.run(["pocount", "--csv"] + files, - stdout=stats, stderr=error, check=True) - except subprocess.CalledProcessError as e: - print(" Pocount --csv failed.") - print(e) - print(files) - exit() - - -def get_json_translation_level(files, template, stats_file, error_file): - """ convert json files into po and call get_po_translation_level """ - - # move only related json files to a temporary folder - with tempfile.TemporaryDirectory(prefix="l10n-stats") as tmpjson: - error = open(error_file, 'a') - pofiles = [] - for filename in files: - # if filesare in language subfolder, reproduce the hierarchy - dest = filename.replace(os.path.basename(filename), "") - os.makedirs(tmpjson + "/" + dest, exist_ok=True) - - if os.path.isfile(template): - po = os.path.join(tmpjson, filename.replace(".json", ".po")) - subprocess.run(["json2po", - "-t", - template, - filename, - po, - "--progress=none"], - stderr=error, - check=True) - pofiles.append(po) - else: - print(" {t} missing, translation-finder bug?".format(t=template)) - error.close() - get_po_translation_level(pofiles, stats_file, error_file) + stats = dict() + + if os.path.isfile(stats_file): + with open(stats_file, "r") as read_file: + stats = json.load(read_file) + + for file in files: + stat = calcstats(file) + keys = [ + "translatedsourcewords", + "fuzzysourcewords", + "untranslatedsourcewords", + "translated", + "fuzzy", + "untranslated", + "translatedtargetwords", + ] + results = dict() + for key in keys: + results[key] = stat.get(key, 0) + + stats[file] = results + + with open(stats_file, "w") as f: + json.dump(stats, f, indent=2) if __name__ == '__main__': diff --git a/build_website.py b/build_website.py index 88b41c9..f75294f 100755 --- a/build_website.py +++ b/build_website.py @@ -46,11 +46,11 @@ def main(): packages = [d for d in os.listdir(packages_stats) if os.path.isdir(os.path.join(packages_stats, d))] log_files = pd.read_csv(langs_log, header=None, skipinitialspace=True) log_files = log_files.iloc[:, [0, 4]] - log_files.columns = ["Filename", "lang_code"] + log_files.columns = ["filename", "lang_code"] packages_langs_results = dict() for package in sorted(packages): - file_stats = os.path.join(packages_stats, package, "stats.csv") + file_stats = os.path.join(packages_stats, package, "stats.json") if not os.path.isfile(file_stats): print(" Package: {p} missing stats file {f}".format(p=package, f=file_stats)) continue @@ -69,8 +69,8 @@ def main(): print("prepare json files for languages") langs = [f for f in os.listdir(langs_stats) if os.path.isfile(os.path.join(langs_stats, f))] for lang in sorted(langs): - if lang.endswith(".stats.csv"): - code = lang[:-len(".stats.csv")] + if lang.endswith(".stats.json"): + code = lang[:-len(".stats.json")] results = consolidate_language_stats(os.path.join(langs_stats, lang)) results["packages"] = packages_langs_results.get(code, dict()) store_json_file(code, results, data_langs_folder) @@ -98,96 +98,98 @@ def main(): generate_static_pages_packages(args.release, code, content, dest) -def consolidate_language_stats(csv_file): +def consolidate_language_stats(stats_file): """ From a CSV file, return key indicators """ results = dict() - fieldnames = {"Filename": "str", - "TranslatedMessages": "int", - "TranslatedSourceWords": "int", - "TranslatedTargetWords": "int", - "FuzzyMessages": "int", - "FuzzySourceWords": "int", - "UntranslatedMessages": "int", - "UntranslatedSource Words": "int", - "TotalMessage": "int", - "TotalSourceWords": "int", - "ReviewMessages": "int", - "ReviewSourceWords": "int"} - - stats_df = pd.read_csv(csv_file, header=0, skipinitialspace=True) + fieldnames = { + 'filename': "str", + 'translatedsourcewords': "int", + 'fuzzysourcewords': "int", + 'untranslatedsourcewords': "int", + 'translated': "int", + 'fuzzy': "int", + 'untranslated': "int", + 'translatedtargetwords': "int", + "totalsourcewords": "int" + } + + stats_df = pd.read_json(stats_file, orient='index') stats_df.fillna(0, inplace=True) + stats_df.reset_index(level=0, inplace=True) + stats_df["totalsourcewords"] = stats_df["untranslatedsourcewords"] + stats_df["translatedsourcewords"] stats_df.columns = fieldnames.keys() - stats_df["package"] = stats_df["Filename"].str.split("/", expand=True)[4] + stats_df["package"] = stats_df["filename"].str.split("/", expand=True)[4] results["packages"] = stats_df["package"].unique().tolist() - results["progress"] = round(stats_df["TranslatedSourceWords"].sum() / stats_df["TotalSourceWords"].sum() * 100, 1) + results["progress"] = round(stats_df["translatedsourcewords"].sum() / stats_df["totalsourcewords"].sum() * 100, 1) - for kpi in ["TotalSourceWords", "TranslatedSourceWords"]: + for kpi in ["totalsourcewords", "translatedsourcewords"]: results[kpi + "Sum"] = int(stats_df[kpi].sum()) return results -def consolidate_package_stats(csv_file, log_files): +def consolidate_package_stats(stats_file, log_files): """ From a CSV file, return key indicators """ results = dict() - fieldnames = {"Filename": "str", - "TranslatedMessages": "int", - "TranslatedSourceWords": "int", - "TranslatedTargetWords": "int", - "FuzzyMessages": "int", - "FuzzySourceWords": "int", - "UntranslatedMessages": "int", - "UntranslatedSource Words": "int", - "TotalMessage": "int", - "TotalSourceWords": "int", - "ReviewMessages": "int", - "ReviewSourceWords": "int"} + fieldnames = { + 'filename': "str", + 'translatedsourcewords': "int", + 'fuzzysourcewords': "int", + 'untranslatedsourcewords': "int", + 'translated': "int", + 'fuzzy': "int", + 'untranslated': "int", + 'translatedtargetwords': "int", + "totalsourcewords": "int" + } try: - stats_df = pd.read_csv(csv_file, header=0, skipinitialspace=True) + stats_df = pd.read_json(stats_file, orient='index') except pd.errors.EmptyDataError as e: - print(" File {f} raised {e}".format(f=csv_file, e=e)) + print(" File {f} raised {e}".format(f=stats_file, e=e)) return results stats_df.fillna(0, inplace=True) + stats_df.reset_index(level=0, inplace=True) + stats_df["totalsourcewords"] = stats_df["untranslatedsourcewords"] + stats_df["translatedsourcewords"] stats_df.columns = fieldnames.keys() - stats_df_w_lang = pd.merge(stats_df, log_files, how="inner", on="Filename") + stats_df_w_lang = pd.merge(stats_df, log_files, how="inner", on="filename") stats_df_no_lang = pd.merge(stats_df, log_files, how="outer", indicator=True).loc[lambda x: x["_merge"] == "left_only"] try: - total_source_words = int(max(stats_df_w_lang["TotalSourceWords"])) + total_source_words = int(max(stats_df_w_lang["totalsourcewords"])) except ValueError as e: - print(" File {f} raised ValueError {e}".format(f=csv_file, e=e)) + print(" File {f} raised ValueError {e}".format(f=stats_file, e=e)) return results - temp = stats_df_w_lang.groupby(["lang_code"]).agg({"TranslatedSourceWords": ["sum"], }).reset_index().droplevel(1, axis=1).to_dict(orient="records") + temp = stats_df_w_lang.groupby(["lang_code"]).agg({"translatedsourcewords": ["sum"], }).reset_index().droplevel(1, axis=1).to_dict(orient="records") for line in temp: line["progress"] = 0 p = 0 if total_source_words == 0: - print(" File {f} has TranslatedSourceWords = 0".format(f=csv_file)) + print(" File {f} has translatedsourcewords = 0".format(f=stats_file)) line["progress"] = p continue try: - p = round((int(line["TranslatedSourceWords"]) / total_source_words)*100) + p = round((int(line["translatedsourcewords"]) / total_source_words)*100) except OverflowError: print(" File {f} has Translated={t} and Source={tot}".format( - f=csv_file, - t=line["TranslatedSourceWords"], + f=stats_file, + t=line["translatedsourcewords"], tot=total_source_words)) line["progress"] = p - results["TotalSourceWords"] = total_source_words + results["totalsourcewords"] = total_source_words results["count_languages"] = len(pd.unique(stats_df_w_lang["lang_code"])) for line in sorted(temp, key=lambda k: k["progress"], reverse=True): - del line["TranslatedSourceWords"] + del line["translatedsourcewords"] if line["progress"] <= 50: hop = results.get("lessorequalto50percent", []) hop.append(line) @@ -201,7 +203,7 @@ def consolidate_package_stats(csv_file, log_files): hop.append(line) results["equalsormorethan80percent"] = hop - results["no_languages"] = stats_df_no_lang["Filename"].tolist() + results["no_languages"] = stats_df_no_lang["filename"].tolist() return results From 5323564b4f3592551358efc83b5b6020c238d86e Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Nov 22 2020 19:26:05 +0000 Subject: [PATCH 3/7] simplify packages stats storage --- diff --git a/build_stats.py b/build_stats.py index 02a4b97..ad7f778 100755 --- a/build_stats.py +++ b/build_stats.py @@ -5,7 +5,6 @@ import argparse import glob import json import os -import subprocess import shutil import tempfile @@ -31,6 +30,9 @@ def main(): packages = [f for f in os.listdir(packages_folder) if os.path.isdir(os.path.join(packages_folder, f))] count = 0 + if not os.path.exists(packages_stats_folder): + os.makedirs(packages_stats_folder) + for package in sorted(packages): count += 1 print(" {c}/{t} - {p}".format(c=count, t=len(packages), p=package)) @@ -38,12 +40,7 @@ def main(): discoveries = json.load(f) src_folder = os.path.join(packages_folder, package) - dest_folder = os.path.join(packages_stats_folder, package) - - if not os.path.exists(dest_folder): - os.makedirs(dest_folder) - - stats_file = os.path.join(dest_folder, "stats.json") + stats_file = os.path.join(packages_stats_folder, package + ".json") if os.path.isfile(stats_file): continue @@ -58,9 +55,9 @@ def main(): languages = [f for f in os.listdir(languages_folder)] count = 0 - dest_folder = languages_stats_folder - if not os.path.exists(dest_folder): - os.makedirs(dest_folder) + languages_stats_folder = languages_stats_folder + if not os.path.exists(languages_stats_folder): + os.makedirs(languages_stats_folder) for language in sorted(languages): count += 1 @@ -70,7 +67,7 @@ def main(): with open(os.path.join(languages_folder, language), 'r') as f: discoveries = json.load(f) - stats_file = os.path.join(dest_folder, lang + ".stats.json") + stats_file = os.path.join(languages_stats_folder, lang + ".json") if os.path.isfile(stats_file): continue diff --git a/build_website.py b/build_website.py index f75294f..cedaa69 100755 --- a/build_website.py +++ b/build_website.py @@ -43,34 +43,32 @@ def main(): # prepare json files for packages print("prepare json files for packages") - packages = [d for d in os.listdir(packages_stats) if os.path.isdir(os.path.join(packages_stats, d))] + packages = [d for d in os.listdir(packages_stats) if os.path.isfile(os.path.join(packages_stats, d))] log_files = pd.read_csv(langs_log, header=None, skipinitialspace=True) log_files = log_files.iloc[:, [0, 4]] log_files.columns = ["filename", "lang_code"] packages_langs_results = dict() for package in sorted(packages): - file_stats = os.path.join(packages_stats, package, "stats.json") - if not os.path.isfile(file_stats): - print(" Package: {p} missing stats file {f}".format(p=package, f=file_stats)) - continue + name = package[:-len(".json")] + file_stats = os.path.join(packages_stats, name + ".json") results = consolidate_package_stats(file_stats, log_files) - store_json_file(package, results, data_pkgs_folder) + store_json_file(name, results, data_pkgs_folder) langs_results = results.get("equalsormorethan80percent", []) + results.get("between50and80percent", []) + results.get("lessorequalto50percent", []) for langs in langs_results: val = packages_langs_results.get(langs["lang_code"], []) - val.append({"name": package, "progress": langs["progress"]}) + val.append({"name": name, "progress": langs["progress"]}) packages_langs_results[langs["lang_code"]] = val # prepare json files for languages print("prepare json files for languages") langs = [f for f in os.listdir(langs_stats) if os.path.isfile(os.path.join(langs_stats, f))] for lang in sorted(langs): - if lang.endswith(".stats.json"): - code = lang[:-len(".stats.json")] + if lang.endswith(".json"): + code = lang[:-len(".json")] results = consolidate_language_stats(os.path.join(langs_stats, lang)) results["packages"] = packages_langs_results.get(code, dict()) store_json_file(code, results, data_langs_folder) diff --git a/todo.md b/todo.md index 8d8e4e1..a3d1c09 100644 --- a/todo.md +++ b/todo.md @@ -1,14 +1,13 @@ # global support for json files -stats computation in CSV isn't so useful, maybe a direct storage in JSON would make more sense. results should be stored by discovered translation files, so that progress computation per language makes sens both at package level and file level. # optimization direct call to: -- pocount: https://github.com/translate/translate/blob/master/translate/tools/pocount.py -- +* po2tmx +* poterminology # build_tm.py From 36ddef9d321a3d56b1cada1ae790bf126b90eed5 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Nov 23 2020 21:00:09 +0000 Subject: [PATCH 4/7] use filemask to store stats --- diff --git a/build_stats.py b/build_stats.py index ad7f778..d116bd7 100755 --- a/build_stats.py +++ b/build_stats.py @@ -36,7 +36,7 @@ def main(): for package in sorted(packages): count += 1 print(" {c}/{t} - {p}".format(c=count, t=len(packages), p=package)) - with open(os.path.join(packages_folder, package, "discover.json"), 'r') as f: + with open(os.path.join(packages_folder, package, "discover.json"), "r") as f: discoveries = json.load(f) src_folder = os.path.join(packages_folder, package) @@ -45,11 +45,17 @@ def main(): if os.path.isfile(stats_file): continue + results = dict() for discover in discoveries: files = glob.glob(os.path.join(src_folder, discover["filemask"])) if discover["file_format"] == "po": - get_po_translation_level(files, stats_file) + results[discover["filemask"]] = get_po_translation_level(files, stats_file) + + if len(results) > 0: + with open(stats_file, "w") as f: + json.dump(results, f, indent=2) + print("Computing language stats") languages = [f for f in os.listdir(languages_folder)] @@ -64,7 +70,7 @@ def main(): lang = language[:-5] print(" {c}/{t} - {l}".format(c=count, t=len(languages), l=lang)) - with open(os.path.join(languages_folder, language), 'r') as f: + with open(os.path.join(languages_folder, language), "r") as f: discoveries = json.load(f) stats_file = os.path.join(languages_stats_folder, lang + ".json") @@ -74,19 +80,21 @@ def main(): files = discoveries.get("po", []) if files: - get_po_translation_level(files, stats_file) + with open(stats_file, "w") as f: + json.dump(get_po_translation_level(files, stats_file), f, indent=2) -def get_po_translation_level(files, stats_file): +def get_po_translation_level(files, stats_file): """ Compute results """ stats = dict() - if os.path.isfile(stats_file): - with open(stats_file, "r") as read_file: - stats = json.load(read_file) - for file in files: - stat = calcstats(file) + try: + stat = calcstats(file) + except Exception as e: + print(" {f} triggered an {t} exception: {e}".format(f=file, t=type(e).__name__, e=e)) + continue + keys = [ "translatedsourcewords", "fuzzysourcewords", @@ -102,9 +110,8 @@ def get_po_translation_level(files, stats_file): stats[file] = results - with open(stats_file, "w") as f: - json.dump(stats, f, indent=2) + return stats -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/build_website.py b/build_website.py index cedaa69..c4d98a7 100755 --- a/build_website.py +++ b/build_website.py @@ -101,18 +101,18 @@ def consolidate_language_stats(stats_file): results = dict() fieldnames = { - 'filename': "str", - 'translatedsourcewords': "int", - 'fuzzysourcewords': "int", - 'untranslatedsourcewords': "int", - 'translated': "int", - 'fuzzy': "int", - 'untranslated': "int", - 'translatedtargetwords': "int", + "filename": "str", + "translatedsourcewords": "int", + "fuzzysourcewords": "int", + "untranslatedsourcewords": "int", + "translated": "int", + "fuzzy": "int", + "untranslated": "int", + "translatedtargetwords": "int", "totalsourcewords": "int" } - stats_df = pd.read_json(stats_file, orient='index') + stats_df = pd.read_json(stats_file, orient="index") stats_df.fillna(0, inplace=True) stats_df.reset_index(level=0, inplace=True) stats_df["totalsourcewords"] = stats_df["untranslatedsourcewords"] + stats_df["translatedsourcewords"] @@ -134,36 +134,48 @@ def consolidate_package_stats(stats_file, log_files): results = dict() fieldnames = { - 'filename': "str", - 'translatedsourcewords': "int", - 'fuzzysourcewords': "int", - 'untranslatedsourcewords': "int", - 'translated': "int", - 'fuzzy': "int", - 'untranslated': "int", - 'translatedtargetwords': "int", + "filename": "str", + "translatedsourcewords": "int", + "fuzzysourcewords": "int", + "untranslatedsourcewords": "int", + "translated": "int", + "fuzzy": "int", + "untranslated": "int", + "translatedtargetwords": "int", "totalsourcewords": "int" } - try: - stats_df = pd.read_json(stats_file, orient='index') - except pd.errors.EmptyDataError as e: - print(" File {f} raised {e}".format(f=stats_file, e=e)) - return results + _json = json.load(open(stats_file)) + dfs = [] + total_source_words = 0 - stats_df.fillna(0, inplace=True) - stats_df.reset_index(level=0, inplace=True) - stats_df["totalsourcewords"] = stats_df["untranslatedsourcewords"] + stats_df["translatedsourcewords"] - stats_df.columns = fieldnames.keys() + for template in _json.keys(): + tmp_df = pd.DataFrame.from_dict(_json.get(template), orient="index") + tmp_df.fillna(0, inplace=True) + tmp_df.reset_index(level=0, inplace=True) - stats_df_w_lang = pd.merge(stats_df, log_files, how="inner", on="filename") - stats_df_no_lang = pd.merge(stats_df, log_files, how="outer", indicator=True).loc[lambda x: x["_merge"] == "left_only"] + # sometimes, no file were found, which means no stats can be used + if len(tmp_df) == 0: + print(" The template {t} for {f} is empty".format(t=template, f=stats_file)) + continue + + tmp_df["totalsourcewords"] = tmp_df["untranslatedsourcewords"] + tmp_df["translatedsourcewords"] + tmp_df.columns = fieldnames.keys() + + total_source_words += max(tmp_df["totalsourcewords"]) + + dfs.append(tmp_df) - try: - total_source_words = int(max(stats_df_w_lang["totalsourcewords"])) - except ValueError as e: - print(" File {f} raised ValueError {e}".format(f=stats_file, e=e)) + if len(dfs) > 1: + stats_df = pd.concat(dfs) + elif len(dfs) == 0: + print("There is no stats for {f}".format(f=stats_file)) return results + else: + stats_df = dfs[0] + + stats_df_w_lang = pd.merge(stats_df, log_files, how="inner", on="filename") + stats_df_no_lang = pd.merge(stats_df, log_files, how="outer", indicator=True).loc[lambda x: x["_merge"] == "left_only"] temp = stats_df_w_lang.groupby(["lang_code"]).agg({"translatedsourcewords": ["sum"], }).reset_index().droplevel(1, axis=1).to_dict(orient="records") for line in temp: diff --git a/todo.md b/todo.md index a3d1c09..cd426a3 100644 --- a/todo.md +++ b/todo.md @@ -1,7 +1,7 @@ # global support for json files -results should be stored by discovered translation files, so that progress computation per language makes sens both at package level and file level. +default behavior: continue computing, and refresh force to re-compute # optimization @@ -26,5 +26,3 @@ roxterm triggers an error # global - -we may detect anomalies From ac9308096d3e4852652b6a0625b2dcfb502eb5d1 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Nov 24 2020 20:47:34 +0000 Subject: [PATCH 5/7] add language files in websites --- diff --git a/.gitignore b/.gitignore index afebc54..ab40337 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ srpms/ results/ website/content/* website/public/ +website/static/* srpms_*.lst diff --git a/build_language_list.py b/build_language_list.py index 04f1f04..8b8ead6 100755 --- a/build_language_list.py +++ b/build_language_list.py @@ -89,7 +89,7 @@ def analyze_lang(lang_folder, analized_lang): files = [] results = dict() with open(os.path.join(lang_folder, analized_lang + ".json"), "r") as read_file: - files = json.load(read_file) + files = json.load(read_file)["po"] print(" Analysing language {l}, with {c} files".format(l=analized_lang, c=len(files))) diff --git a/build_tm.py b/build_tm.py index 55ff5d8..1f31b2d 100755 --- a/build_tm.py +++ b/build_tm.py @@ -7,7 +7,6 @@ import json import os import subprocess import tempfile -import time def main(): @@ -37,7 +36,6 @@ def main(): os.makedirs(tm_folder, exist_ok=True) print("Building the translation memory for every languages") - start_time_search = time.time() if args.lang: with open(os.path.join(lang_path, args.lang + ".json"), "r") as read_file: @@ -61,9 +59,6 @@ def main(): print("Compressing files") compress(tm_folder) - search_duration = round(time.time() - start_time_search, 1) - print(" Done in {d} seconds".format(d=search_duration)) - def compute_lang(lang, langfiles, tm_folder, refresh): """ Generate compendium and convert it to tmx """ @@ -71,7 +66,7 @@ def compute_lang(lang, langfiles, tm_folder, refresh): print(" Computing: " + lang) # po consolidation - compendium_file = tm_folder + lang + ".po" + compendium_file = os.path.join(tm_folder, lang + ".po") compendium_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), compendium_file) if not os.path.isfile(compendium_file) or refresh is True: @@ -101,14 +96,14 @@ def compute_lang(lang, langfiles, tm_folder, refresh): print(" msgcat exception...") # po to tmx convertion - tmx_file = tm_folder + lang + ".tmx" + tmx_file = os.path.join(tm_folder, lang + ".tmx") command = ["po2tmx", "--language="+lang, "--progress=none", compendium_file, "--output="+tmx_file] if not os.path.isfile(tmx_file) or refresh is True: subprocess.run(command, check=True, capture_output=True) # language terminology - terminology_file = tm_folder + lang + ".terminology.po" + terminology_file = os.path.join(tm_folder, lang + ".terminology.po") command = ["poterminology", "--ignore-case", "--fold-titlecase", "--inputs-needed", "1", "--progress=none", compendium_file, "--output=" + terminology_file] @@ -119,9 +114,9 @@ def compute_lang(lang, langfiles, tm_folder, refresh): def check_lang(lang, tm_folder): """ make sure the files were generated """ - compendium_file = tm_folder + lang + ".po" - tmx_file = tm_folder + lang + ".tmx" - terminology_file = tm_folder + lang + ".terminology.po" + compendium_file = os.path.join(tm_folder, lang + ".po") + tmx_file = os.path.join(tm_folder, lang + ".tmx") + terminology_file = os.path.join(tm_folder, lang + ".terminology.po") if not os.path.isfile(compendium_file): print(" {l}-compendium is missing".format(l=lang)) @@ -139,7 +134,12 @@ def compress(folder): files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))] for file in sorted(files): + if file.endswith(".gz"): + continue + dest = file + ".gz" + if os.path.isfile(os.path.join(folder, dest)): + continue with open(os.path.join(folder, file), "rb") as file_in: with gzip.open(os.path.join(folder, dest), "wb") as file_out: diff --git a/build_website.py b/build_website.py index c4d98a7..df2e288 100755 --- a/build_website.py +++ b/build_website.py @@ -30,19 +30,21 @@ def main(): data_langs_folder = os.path.join(data_folder, "languages") data_pkgs_folder = os.path.join(data_folder, "packages") + tm_folder = os.path.join(release_folder, "languages-tm") + static_folder = "./website/content/f{v}/".format(v=args.release) static_langs_folder = os.path.join(static_folder, "language") static_pkgs_folder = os.path.join(static_folder, "package") + static_tm_folder = "./website/static/f{v}/".format(v=args.release) # clean destination folders - for folder in [data_langs_folder, data_pkgs_folder, static_langs_folder, static_pkgs_folder]: - if os.path.isdir(folder): - shutil.rmtree(folder) + for folder in [data_langs_folder, data_pkgs_folder, static_langs_folder, static_pkgs_folder, static_tm_folder]: + # if os.path.isdir(folder): + # shutil.rmtree(folder) - os.makedirs(folder) + os.makedirs(folder, exist_ok=True) - # prepare json files for packages - print("prepare json files for packages") + print("Prepare json files for packages") packages = [d for d in os.listdir(packages_stats) if os.path.isfile(os.path.join(packages_stats, d))] log_files = pd.read_csv(langs_log, header=None, skipinitialspace=True) log_files = log_files.iloc[:, [0, 4]] @@ -51,10 +53,11 @@ def main(): packages_langs_results = dict() for package in sorted(packages): name = package[:-len(".json")] + dest_file = os.path.join(data_pkgs_folder, name + ".json") file_stats = os.path.join(packages_stats, name + ".json") results = consolidate_package_stats(file_stats, log_files) - store_json_file(name, results, data_pkgs_folder) + store_json_file(results, dest_file) langs_results = results.get("equalsormorethan80percent", []) + results.get("between50and80percent", []) + results.get("lessorequalto50percent", []) @@ -63,37 +66,54 @@ def main(): val.append({"name": name, "progress": langs["progress"]}) packages_langs_results[langs["lang_code"]] = val - # prepare json files for languages - print("prepare json files for languages") + print("Prepare json files for languages") langs = [f for f in os.listdir(langs_stats) if os.path.isfile(os.path.join(langs_stats, f))] for lang in sorted(langs): if lang.endswith(".json"): code = lang[:-len(".json")] + dest_file = os.path.join(data_langs_folder, code + ".json") + + if os.path.isfile(dest_file): + continue + results = consolidate_language_stats(os.path.join(langs_stats, lang)) results["packages"] = packages_langs_results.get(code, dict()) - store_json_file(code, results, data_langs_folder) + store_json_file(results, dest_file) # generate static content for languages - print("generate static content for languages") + print("Generate static content for languages") langs = [f for f in os.listdir(data_langs_folder) if os.path.isfile(os.path.join(data_langs_folder, f))] for lang in sorted(langs): code = lang[:-len(".json")] - dest = os.path.join(static_langs_folder, code + ".md") + dest_file = os.path.join(static_langs_folder, code + ".md") + + if os.path.isfile(dest_file): + continue + with open(os.path.join(data_langs_folder, lang), "r") as read_file: content = json.load(read_file) - generate_static_pages_langs(args.release, code, content, dest) + generate_static_pages_langs(args.release, code, content, dest_file) - print("generate static content for packages") - # generate static content for packages + print("Generate static content for packages") packages = [f for f in os.listdir(data_pkgs_folder) if os.path.isfile(os.path.join(data_pkgs_folder, f))] for package in sorted(packages): code = package[:-len(".json")] - dest = os.path.join(static_pkgs_folder, code + ".md") + dest_file = os.path.join(static_pkgs_folder, code + ".md") + + if os.path.isfile(dest_file): + continue + with open(os.path.join(data_pkgs_folder, package), "r") as read_file: content = json.load(read_file) - generate_static_pages_packages(args.release, code, content, dest) + generate_static_pages_packages(args.release, code, content, dest_file) + + print("Copy translation memories") + langs = [f for f in os.listdir(tm_folder) if os.path.isfile(os.path.join(tm_folder, f))] + for lang in langs: + if lang.endswith(".gz"): + shutil.copyfile(os.path.join(tm_folder, lang), os.path.join(static_tm_folder, lang)) def consolidate_language_stats(stats_file): @@ -124,7 +144,7 @@ def consolidate_language_stats(stats_file): results["progress"] = round(stats_df["translatedsourcewords"].sum() / stats_df["totalsourcewords"].sum() * 100, 1) for kpi in ["totalsourcewords", "translatedsourcewords"]: - results[kpi + "Sum"] = int(stats_df[kpi].sum()) + results[kpi + "sum"] = int(stats_df[kpi].sum()) return results @@ -182,7 +202,7 @@ def consolidate_package_stats(stats_file, log_files): line["progress"] = 0 p = 0 if total_source_words == 0: - print(" File {f} has translatedsourcewords = 0".format(f=stats_file)) + print(" File {f} for file has translatedsourcewords = 0 in line {l}".format(f=stats_file, l=line)) line["progress"] = p continue try: @@ -218,38 +238,38 @@ def consolidate_package_stats(stats_file, log_files): return results -def generate_static_pages_langs(release, code, content, dest): +def generate_static_pages_langs(release, code, content, dest_file): data = content data["release"] = release data["lang_code"] = code templateLoader = jinja2.FileSystemLoader(searchpath="./templates/") - templateEnv = jinja2.Environment(loader=templateLoader) + templateEnv = jinja2.Environment(loader=templateLoader, undefined=jinja2.Undefined) TEMPLATE_FILE = "language.md" template = templateEnv.get_template(TEMPLATE_FILE) outputText = template.render(data) - with open(dest, "w") as write_out: + with open(dest_file, "w") as write_out: write_out.write(outputText) -def generate_static_pages_packages(release, code, content, dest): +def generate_static_pages_packages(release, code, content, dest_file): data = content data["release"] = release data["package"] = code templateLoader = jinja2.FileSystemLoader(searchpath="./templates/") - templateEnv = jinja2.Environment(loader=templateLoader) + templateEnv = jinja2.Environment(loader=templateLoader, undefined=jinja2.Undefined) TEMPLATE_FILE = "package.md" template = templateEnv.get_template(TEMPLATE_FILE) outputText = template.render(data) - with open(dest, "w") as write_out: + with open(dest_file, "w") as write_out: write_out.write(outputText) -def store_json_file(code, content, dest): - with open(os.path.join(dest, code + ".json"), "w") as f: +def store_json_file(content, dest_file): + with open(dest_file, "w") as f: f.write(json.dumps(content, indent=2)) diff --git a/templates/language.md b/templates/language.md index 140fd3c..c8f7183 100644 --- a/templates/language.md +++ b/templates/language.md @@ -6,7 +6,13 @@ Global progress for {{ lang_code }} in Fedora {{ release }} is {{ progress }}%. | Source words to translate | Translated words | |---------------------------:|-----------------:| -| {{ TotalSourceWordsSum }} | {{ TranslatedSourceWordsSum }} | +| {{ totalsourcewordssum }} | {{ translatedsourcewordssum }} | + +Download: + +* {{ "{{%" }} link "/f32/{{ lang_code }}.po.gz" {{ "%}}" }}{{ lang_code }} compendium{{ "{{%" }} /link {{ "%}}" }} (aggregation of all strings found in po files) +* {{ "{{%" }} link "/f32/{{ lang_code }}.terminology.po.gz" {{ "%}}" }}{{ lang_code }} terminology{{ "{{%" }} /link {{ "%}}" }} see [poterminology](https://docs.translatehouse.org/projects/translate-toolkit/en/latest/commands/poterminology.html) +* {{ "{{%" }} link "/f32/{{ lang_code }}.tmx.gz" {{ "%}}" }}{{ lang_code }} translation memory{{ "{{%" }} /link {{ "%}}" }} see [tmx](https://en.wikipedia.org/wiki/Translation_Memory_eXchange) Packages: {% for package in packages %} diff --git a/templates/package.md b/templates/package.md index 328adbd..e8480eb 100644 --- a/templates/package.md +++ b/templates/package.md @@ -4,21 +4,33 @@ date: 2020-11-18T18:20:46+01:00 --- The package {{ package }} is transtlated into {{ count_languages }} languages in Fedora {{ release }}. -## Languages with ≥80% words translated +## Languages with ≥80% words translated +{% if equalsormorethan80percent %} {% for stat in equalsormorethan80percent -%} [{{ stat.lang_code }}]({{ '{{' }}< ref "/f{{ release }}/language/{{ stat.lang_code }}.md" >{{ '}}' }}) ({{ stat.progress }}) {% endfor %} +{% else %} +None +{% endif %} ## Languages with >50% and <80% words translated +{% if between50and80percent %} {% for stat in between50and80percent -%} [{{ stat.lang_code }}]({{ '{{' }}< ref "/f{{ release }}/language/{{ stat.lang_code }}.md" >{{ '}}' }}) ({{ stat.progress }}) {% endfor %} +{% else %} +None +{% endif %} + ## Languages with ≤50% words translated +{% if lessorequalto50percent %} {% for stat in lessorequalto50percent -%} [{{ stat.lang_code }}]({{ '{{' }}< ref "/f{{ release }}/language/{{ stat.lang_code }}.md" >{{ '}}' }}) ({{ stat.progress }}) {% endfor %} - +{% else %} +None +{% endif %} List of files for which language detection were impossible: {% for missing in no_languages -%} diff --git a/website/config.toml b/website/config.toml index 6a95d7b..984d1ca 100644 --- a/website/config.toml +++ b/website/config.toml @@ -2,3 +2,7 @@ baseURL = "https://jibecfed.fedorapeople.org/partage/fedora-localization-statist languageCode = "en-us" title = "Temporary demo" theme = "ananke" +staticDir = "static" + +[markup.goldmark.renderer] +unsafe= true diff --git a/website/layouts/shortcodes/link.html b/website/layouts/shortcodes/link.html new file mode 100644 index 0000000..25266d4 --- /dev/null +++ b/website/layouts/shortcodes/link.html @@ -0,0 +1 @@ +{{ .Inner }} From 9b92ae9caeeb181b84818ac8847d2873330100b1 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Nov 25 2020 08:00:04 +0000 Subject: [PATCH 6/7] add language name in generated pages and generation date --- diff --git a/build_language_list.py b/build_language_list.py index 8b8ead6..4f575e2 100755 --- a/build_language_list.py +++ b/build_language_list.py @@ -218,6 +218,12 @@ def choose_lang(filename, metadata, error): # 1 is language name codes[language[1].lower()] = language[0].lower() + code_capitalized = dict() + for language in languages.LANGUAGES: + # 0 is language code + # 1 is language name + code_capitalized[language[0].lower()] = language[0] + file_name = filename.lower().replace("-", "_") meta_language = metadata.get("Language", "").lower().replace("-", "_") @@ -267,6 +273,8 @@ def choose_lang(filename, metadata, error): if lang in aliases.ALIASES.keys(): lang = aliases.ALIASES[lang].lower() + lang = code_capitalized.get(lang, lang) + return lang, decision diff --git a/build_website.py b/build_website.py index df2e288..c5ebc8c 100755 --- a/build_website.py +++ b/build_website.py @@ -2,8 +2,10 @@ """Consolidate each po files into compendium""" import argparse +import datetime import jinja2 import json +import langtable import os import pandas as pd import shutil @@ -19,6 +21,9 @@ def main(): choices=[30, 31, 32], help="Provide the Fedora release to analyze") + parser.add_argument("--refresh", action="store_true", + help="Force refresh of files") + args = parser.parse_args() release_folder = "./results/f{v}/".format(v=args.release) @@ -39,8 +44,8 @@ def main(): # clean destination folders for folder in [data_langs_folder, data_pkgs_folder, static_langs_folder, static_pkgs_folder, static_tm_folder]: - # if os.path.isdir(folder): - # shutil.rmtree(folder) + if args.refresh and os.path.isdir(folder): + shutil.rmtree(folder) os.makedirs(folder, exist_ok=True) @@ -240,8 +245,12 @@ def consolidate_package_stats(stats_file, log_files): def generate_static_pages_langs(release, code, content, dest_file): data = content + data["lang_name_en"] = langtable.language_name(languageId = code, languageIdQuery = "en") + data["lang_name_local"] = langtable.language_name(languageId = code) + data["scripts"] = langtable.list_scripts(languageId = code) data["release"] = release data["lang_code"] = code + data["now"] = datetime.datetime.utcnow() templateLoader = jinja2.FileSystemLoader(searchpath="./templates/") templateEnv = jinja2.Environment(loader=templateLoader, undefined=jinja2.Undefined) @@ -257,6 +266,7 @@ def generate_static_pages_packages(release, code, content, dest_file): data = content data["release"] = release data["package"] = code + data["now"] = datetime.datetime.utcnow() templateLoader = jinja2.FileSystemLoader(searchpath="./templates/") templateEnv = jinja2.Environment(loader=templateLoader, undefined=jinja2.Undefined) diff --git a/requirements.txt b/requirements.txt index 1862971..7783663 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ pandas polib weblate-language-data +langtable diff --git a/runall.sh b/runall.sh index e4d0bb7..91458e6 100755 --- a/runall.sh +++ b/runall.sh @@ -4,10 +4,13 @@ set -xe # this file is useful for end to end tests on a short corpus rm -rf ./results/f32/ +rm -rf ./website/static/* +rm -rf ./website/content/* # parcourir tous les fichiers rpm d'une version et en extraire tous les fichiers de traduction # ~ 3 h (without downloading time) # time podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:32 /src/build.py --keep-srpms gco.* +# time podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:32 /src/build.py --keep-srpms col.* time podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:32 /src/build.py --keep-srpms # déduire la liste de toutes les langues @@ -19,15 +22,16 @@ time ./build_language_list.py --release 32 --refresh time ./build_language_list.py --release 32 --analyzealllang # générer par langue un compendium, une mémoire de traduction et une terminologie -# ~ 3 h +# ~ 3 h 20 time ./build_tm.py --release 32 --compress # calculer des pourcentages d'avancement par paquet et langue # ~ time ./build_stats.py --release 32 +rm -rf ~/.translate_toolkit/ # générer le site statique -# +# ~ 7 m time ./build_website.py --release 32 ( diff --git a/templates/language.md b/templates/language.md index c8f7183..197ccd4 100644 --- a/templates/language.md +++ b/templates/language.md @@ -1,8 +1,11 @@ --- -title: "{{ lang_code }}" -date: 2020-11-18T18:20:46+01:00 +title: "{{ lang_name_en }} ({{ lang_name_local }})" +date: {{ now }} --- -Global progress for {{ lang_code }} in Fedora {{ release }} is {{ progress }}%. + +Global progress for {{ lang_name_en }} ({{ lang_code }}) in Fedora {{ release }} is {{ progress }}%. + +Possible scripts are: {% for script in scripts -%}{{ script }} {%- endfor %} | Source words to translate | Translated words | |---------------------------:|-----------------:| @@ -15,6 +18,6 @@ Download: * {{ "{{%" }} link "/f32/{{ lang_code }}.tmx.gz" {{ "%}}" }}{{ lang_code }} translation memory{{ "{{%" }} /link {{ "%}}" }} see [tmx](https://en.wikipedia.org/wiki/Translation_Memory_eXchange) Packages: -{% for package in packages %} +{% for package in packages -%} * [{{ package.name }}]({{ '{{' }}< ref "/f{{ release }}/package/{{ package.name }}.md" >{{ '}}' }}) ({{ package.progress }}) {% endfor %} diff --git a/templates/package.md b/templates/package.md index e8480eb..bc377a9 100644 --- a/templates/package.md +++ b/templates/package.md @@ -1,6 +1,6 @@ --- title: "{{ package }}" -date: 2020-11-18T18:20:46+01:00 +date: {{ now }} --- The package {{ package }} is transtlated into {{ count_languages }} languages in Fedora {{ release }}. @@ -32,7 +32,12 @@ None None {% endif %} +## Errors +{% if no_languages %} List of files for which language detection were impossible: {% for missing in no_languages -%} * {{ missing }} {% endfor %} +{% else %} +None +{% endif %} From f8d0d9b548d66057e524192ad0f6dfea1cd5188a Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Nov 25 2020 11:57:49 +0000 Subject: [PATCH 7/7] add pip depedencies for Fedora 30 --- diff --git a/docker/Dockerfile.30 b/docker/Dockerfile.30 index 0626703..db6ed22 100644 --- a/docker/Dockerfile.30 +++ b/docker/Dockerfile.30 @@ -4,6 +4,9 @@ RUN dnf install -y lbzip2 unzip xz git cpio translate-toolkit dnf-plugins-core p COPY requirements.txt /src/requirements.txt RUN pip3 install --no-cache -r /src/requirements.txt +RUN pip3 install --upgrade https://github.com/WeblateOrg/language-data/archive/master.zip +RUN pip3 install charamel +RUN pip3 install git+https://github.com/WeblateOrg/translation-finder.git # Fix missing metalink for f30 COPY docker/fedora-updates-modular.repo /etc/yum.repos.d/fedora-updates-modular.repo