From c982fb2789bc11e087f1e0c1101b6c48be088ac0 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 27 2020 06:50:38 +0000 Subject: [PATCH 1/9] remove duplicates results in package stats --- diff --git a/build_packages_stats.py b/build_packages_stats.py index 2310f77..bc306c2 100755 --- a/build_packages_stats.py +++ b/build_packages_stats.py @@ -23,7 +23,12 @@ def main(): filenames = [f for f in os.listdir(packages_folder) if os.path.isdir(os.path.join(packages_folder, f))] + print("Computing stats") + count = 0 + for package in sorted(filenames): + count +=1 + print(" {c}/{t} - {p}".format(c=count, t=len(filenames),p=package)) with open(os.path.join(packages_folder, package, "discover.json"), 'r') as f: translation_files = json.load(f) @@ -40,6 +45,26 @@ def main(): # it's a detection of .tx configuration continue + print("Removing duplicates") + count = 0 + for package in sorted(filenames): + count +=1 + print(" {c}/{t} - {p}".format(c=count, t=len(filenames),p=package)) + input_file = packages_folder + "{p}/stats.csv".format(p=package) + + try: + with open(input_file, 'r') as f: + lines = f.readlines() + + seen_lines = set() + with open(input_file, 'w') as f: + for line in lines: + if line not in seen_lines: + seen_lines.add(line) + f.write(line) + except FileNotFoundError: + continue + def get_po_translation_level(path, discover, name, packages_folder): filemask = discover["filemask"] stats_file = packages_folder + "/{p}/stats.csv".format(p=name) From 5272fea021d2cfad519b4610cd2d4905984eb55d Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 27 2020 06:53:22 +0000 Subject: [PATCH 2/9] build stats package per language --- diff --git a/build_global_stats.py b/build_global_stats.py index bc4b5cb..708e72d 100755 --- a/build_global_stats.py +++ b/build_global_stats.py @@ -2,8 +2,12 @@ """Consolidate and clean result files""" import argparse +import csv +import itertools +import json import os import pandas +import time RESULT_FOLDER = "" @@ 
-21,13 +25,71 @@ def main(): args = parser.parse_args() + lang_path = "./results/f{r}/languages/".format(r=args.release) RESULT_FOLDER = "./results/f{r}/stats/".format(r=args.release) packages_folder = "./results/f{r}/packages/".format(r=args.release) concat_csv(packages_folder, RESULT_FOLDER) file = RESULT_FOLDER + "/_concat.csv" - parse(file) + + # parse(file) + + langs = [f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f))] + + for lang in sorted(langs): + lang_code = lang[:-len('.json')] + + with open(os.path.join(lang_path, lang), "r") as read_file: + files = json.load(read_file) + + compute_lang(lang_code, files, RESULT_FOLDER, packages_folder) + + +def compute_lang(lang_code, files, RESULT_FOLDER, packages_folder): + print("Computing: {l} ({c} files)".format(l=lang_code, c=len(files))) + start_time_search = time.time() + stats = [] + packages = {} + + # step 1: get package lists + for file in files: + po_file = file.replace(packages_folder, "") + package = po_file.split("/")[0] + po_file = po_file.replace(package + "/", "") + try: + packages[package].append(po_file) + except KeyError: + packages[package] = list() + packages[package].append(po_file) + + # step 2: remove duplicates + for package in packages.keys(): + packages[package] = list(set(packages[package])) + + # step 3: parse package files + for package in packages.keys(): + po_files = packages[package] + stats_file = os.path.join(packages_folder, package, "stats.csv") + + with open(stats_file, newline='') as csvfile: + csv_dict_reader = csv.DictReader(csvfile) + + [stats.append([package] + list(row.values())) for row in csv_dict_reader if row["Filename"] in po_files] + + # step 4: store results + lang_stats_file = os.path.join(RESULT_FOLDER, lang_code + ".stats.csv") + with open(lang_stats_file, 'w', newline='') as csvfile: + spamwriter = csv.writer(csvfile) + + header = ['Package', 'Filename', ' Translated Messages', ' Translated Source Words', ' Translated Target Words', ' 
Fuzzy Messages', ' Fuzzy Source Words', ' Untranslated Messages', ' Untranslated Source Words', ' Total Message', ' Total Source Words', ' Review Messages', ' Review Source Words'] + + spamwriter.writerow(header) + + [spamwriter.writerow(row) for row in stats] + + search_duration = round(time.time() - start_time_search, 1) + print(" Done in {d} seconds".format(d=search_duration)) def parse(file): From 17b3c31bc8c211865eedfb69569dfa5a75ddfdc4 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 29 2020 06:43:21 +0000 Subject: [PATCH 3/9] improve language detection --- diff --git a/build_language_list.py b/build_language_list.py index f7f03fb..d847a97 100755 --- a/build_language_list.py +++ b/build_language_list.py @@ -8,7 +8,7 @@ import time import polib from shutil import rmtree -from weblate_language_data import aliases, languages, language_codes +from weblate_language_data import aliases, languages, language_codes, countries def main(): """Handle params""" @@ -82,7 +82,7 @@ def choose_lang(filename, metadata, error): """ Returns: a language code """ lang = "" - file_name = filename.lower() + file_name = filename.lower().replace("-", "_") meta_language = "" meta_team = "" try: @@ -118,6 +118,17 @@ def choose_lang(filename, metadata, error): lang = aliases.ALIASES[meta_language].lower() elif file_name in aliases.ALIASES.keys(): lang = aliases.ALIASES[file_name].lower() + + if lang == "noresult": + if meta_language in countries.DEFAULT_LANGS: + lang = meta_language.split("_", 1)[0] + elif file_name in countries.DEFAULT_LANGS: + lang = file_name.split("_", 1)[0] + + if lang == "noresult": + if file_name == meta_language: + print("Language/Team {l}/({t}) is missing in weblate_language_data".format(l=meta_language, t=meta_team)) + lang = file_name else: lang = "error" From 028f9550235e9e4a781c788160d606aa8d2f5533 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 29 2020 06:44:01 +0000 Subject: [PATCH 4/9] fix terminology generation and check 
results --- diff --git a/build_tm.py b/build_tm.py index 062318d..d2ee0d9 100755 --- a/build_tm.py +++ b/build_tm.py @@ -49,6 +49,10 @@ def main(): compute_lang(lang[:-len('.json')], files, tm_folder, args.refresh) + print("Detecting missing files") + for lang in sorted(langs): + check_lang(lang[:-len('.json')], tm_folder) + search_duration = round(time.time() - start_time_search, 1) print(" Done in {d} seconds".format(d=search_duration)) @@ -101,9 +105,27 @@ def compute_lang(lang, langfiles, tm_folder, refresh): command = ["poterminology", "--ignore-case", "--fold-titlecase", "--inputs-needed", "1", "--progress=none", compendium_file, "--output="+terminology_file] - if not os.path.isfile(tmx_file) or refresh is True: + if not os.path.isfile(terminology_file) or refresh is True: subprocess.run(command, check=True, capture_output=True) + +def check_lang(lang, tm_folder): + """ make sure the files were generated """ + + compendium_file = tm_folder + lang + ".po" + tmx_file = tm_folder + lang + ".tmx" + terminology_file = tm_folder + lang + ".terminology.po" + + if not os.path.isfile(compendium_file): + print(" {l}-compendium is missing".format(l=lang)) + + if not os.path.isfile(tmx_file): + print(" {l}-tmx is missing".format(l=lang)) + + if not os.path.isfile(terminology_file): + print(" {l}-terminology is missing".format(l=lang)) + + if __name__ == '__main__': main() From 258588265d158b76c090e2787e03ca7a930d7752 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 31 2020 13:40:44 +0000 Subject: [PATCH 5/9] add refresh and describe option for language detector --- diff --git a/build_language_list.py b/build_language_list.py index d847a97..b434421 100755 --- a/build_language_list.py +++ b/build_language_list.py @@ -23,26 +23,44 @@ def main(): parser.add_argument("--refresh", action="store_true", help="Force refresh") + parser.add_argument("--describe", action="store_true", + help="Describe the current list of languages") + args = parser.parse_args() 
release_folder = "./results/f{v}/".format(v=args.release) lang_path = os.path.join(release_folder, "languages/") packages_path = os.path.join(release_folder, "packages/") - print("Refreshing the list of languages") - rmtree(lang_path, ignore_errors=True) - os.mkdir(lang_path) + if args.describe: + print("Describing detecting languages") + describe(lang_path) + elif args.refresh: + print("Refreshing the list of languages") + rmtree(lang_path, ignore_errors=True) + os.mkdir(lang_path) + + start_time_search = time.time() + + po_langs = detect_languages(packages_path) + + for lang in po_langs.keys(): + with open(os.path.join(lang_path, lang + '.json'), 'w') as f: + f.write(json.dumps(po_langs[lang], indent=2)) + + search_duration = round(time.time() - start_time_search, 1) + print(" Done in {d} seconds".format(d=search_duration)) - start_time_search = time.time() - po_langs = detect_languages(packages_path) +def describe(lang_path): + """ Provide the number of files per language """ + langs = [f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f))] - for lang in po_langs.keys(): - with open(os.path.join(lang_path, lang + '.json'), 'w') as f: - f.write(json.dumps(po_langs[lang], indent=2)) + for lang in sorted(langs): + with open(os.path.join(lang_path, lang), "r") as read_file: + files = json.load(read_file) - search_duration = round(time.time() - start_time_search, 1) - print(" Done in {d} seconds".format(d=search_duration)) + print(" {l}:{c}".format(l=lang[:-len('.json')],c=len(files))) def detect_languages(tm_folder): From f055e539dd127321a82fffaabcd25568d058c350 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Nov 02 2020 22:40:53 +0000 Subject: [PATCH 6/9] add language analysis --- diff --git a/build_language_list.py b/build_language_list.py index b434421..2190a91 100755 --- a/build_language_list.py +++ b/build_language_list.py @@ -26,15 +26,41 @@ def main(): parser.add_argument("--describe", action="store_true", help="Describe 
the current list of languages") + parser.add_argument("--analyzelang", type=str, + help="Produce an analyze file for a language") + + parser.add_argument("--analyzealllangs", action="store_true", + help="Produce an analyze file for all languages") + args = parser.parse_args() release_folder = "./results/f{v}/".format(v=args.release) lang_path = os.path.join(release_folder, "languages/") packages_path = os.path.join(release_folder, "packages/") + lang_analyze_folder = os.path.join(release_folder, "languages-analyses/") if args.describe: print("Describing detecting languages") describe(lang_path) + + elif args.analyzealllangs: + rmtree(lang_analyze_folder, ignore_errors=True) + os.mkdir(lang_analyze_folder) + + langs = [f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f))] + for lang in sorted(langs): + analyze = analyze_lang(lang_path, lang[:-len('.json')]) + + with open(os.path.join(lang_analyze_folder, lang), 'w') as f: + f.write(json.dumps(analyze, indent=2)) + + elif args.analyzelang: + print("Provide more data to analyze errors") + analyze = analyze_lang(lang_path, args.analyzelang) + + with open(os.path.join(lang_analyze_folder, args.analyzelang + '.json'), 'w') as f: + f.write(json.dumps(analyze, indent=2)) + elif args.refresh: print("Refreshing the list of languages") rmtree(lang_path, ignore_errors=True) @@ -52,6 +78,70 @@ def main(): print(" Done in {d} seconds".format(d=search_duration)) +def analyze_lang(lang_path, analized_lang): + """ Analyze one lang """ + files = [] + results = dict() + with open(os.path.join(lang_path, analized_lang + ".json"), "r") as read_file: + files = json.load(read_file) + + print(" Analysing language {l}, with {c} files".format(l=analized_lang,c=len(files))) + + for file in files: + lang = "error" + metadata = dict() + error = "" + try: + metadata = polib.pofile(file).metadata + except UnicodeDecodeError as e: + # encoding error, to investigate before using it in TM + metadata["Language"] = 
"error-unicode" + except OSError as e: + # maybe a polib bug? to investigate before using it in TM + metadata["Language"] = "error-os" + + if "Language" not in metadata.keys(): + metadata["Language"] = "zzz_null" + elif metadata["Language"] == "": + metadata["Language"] = "zzz_empty" + + if analized_lang != "error": + lang = choose_lang("", metadata, error) + + if metadata.get("Language") not in results.keys(): + results[metadata.get("Language")] = dict() + + try: + results[metadata.get("Language")]["Count"] += 1 + except KeyError: + results[metadata.get("Language")]["Count"] = 1 + + try: + results[metadata.get("Language")]["Files"].append(file) + except KeyError: + results[metadata.get("Language")]["Files"] = [] + results[metadata.get("Language")]["Files"].append(file) + + try: + results[metadata.get("Language")]["Plural-Forms"].append(metadata.get("Plural-Forms")) + results[metadata.get("Language")]["Plural-Forms"] = list(set(results[metadata.get("Language")]["Plural-Forms"])) + except KeyError: + results[metadata.get("Language")]["Plural-Forms"] = [] + results[metadata.get("Language")]["Plural-Forms"].append(metadata.get("Plural-Forms")) + + + try: + results[metadata.get("Language")]["Language-Team"].append(metadata.get("Language-Team")) + results[metadata.get("Language")]["Language-Team"] = list(set(results[metadata.get("Language")]["Language-Team"])) + except KeyError: + results[metadata.get("Language")]["Language-Team"] = [] + results[metadata.get("Language")]["Language-Team"].append(metadata.get("Language-Team")) + + results = dict(sorted(results.items(), key=lambda item: item[0])) + + return results + + def describe(lang_path): """ Provide the number of files per language """ langs = [f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f))] @@ -101,17 +191,14 @@ def choose_lang(filename, metadata, error): lang = "" file_name = filename.lower().replace("-", "_") - meta_language = "" - meta_team = "" - try: - meta_language = 
metadata.get("Language").lower() - except AttributeError: - pass - - try: - meta_team = metadata.get("Language-Team").lower() - except AttributeError: - pass + + meta_language = metadata.get("Language") + if isinstance(meta_language, str): + meta_language = meta_language.lower().replace("-", "_") + + meta_team = metadata.get("Language-Team") + if isinstance(meta_team, str): + meta_team = meta_team.lower().replace("-", "_") if meta_language in language_codes.LANGUAGES: lang = meta_language @@ -142,11 +229,6 @@ def choose_lang(filename, metadata, error): lang = meta_language.split("_", 1)[0] elif file_name in countries.DEFAULT_LANGS: lang = file_name.split("_", 1)[0] - - if lang == "noresult": - if file_name == meta_language: - print("Language/Team {l}/({t}) is missing in weblate_language_data".format(l=meta_language, t=meta_team)) - lang = file_name else: lang = "error" diff --git a/requirements.txt b/requirements.txt index edde0f9..8c8d2f4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ pandas geopandas matplotlib descartes +polib +weblate-language-data From 25847a3a336a713e35b5b83596140b90576655c2 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Nov 05 2020 23:11:44 +0000 Subject: [PATCH 7/9] add support for language name in folder --- diff --git a/build.py b/build.py index 6fc3d24..6ebcf02 100755 --- a/build.py +++ b/build.py @@ -164,7 +164,7 @@ def extract_srpm(tmp, name, result_folder): stdout=out, stderr=error, check=True) -def discover_translations(tmp, name, result_folder, packages_folder): +def discover_translations(tmp, pkg_name, result_folder, packages_folder): """find po file""" print("discover_translations: " + tmp) translation_files = [] @@ -177,21 +177,21 @@ def discover_translations(tmp, name, result_folder, packages_folder): translation_files = discover(tmp) except OSError: with open(result_folder + "/errors.txt", "a") as file: - file.write(name + " on discover_translations\n") + file.write(pkg_name + " on 
discover_translations\n") tsearch = round(time.time() - tsearch, 1) tcopy = time.time() if translation_files: - if not os.path.exists(os.path.join(packages_folder, name)): - os.makedirs(os.path.join(packages_folder, name)) + if not os.path.exists(os.path.join(packages_folder, pkg_name)): + os.makedirs(os.path.join(packages_folder, pkg_name)) - with open(os.path.join(packages_folder, name, "discover.json"), 'w') as f: + with open(os.path.join(packages_folder, pkg_name, "discover.json"), 'w') as f: f.write(json.dumps(translation_files, indent=2)) for translation in translation_files: - copy_translations(tmp, translation, name, result_folder, packages_folder) + copy_translations(tmp, translation, pkg_name, result_folder, packages_folder) tcopy = round(time.time() - tcopy, 1) @@ -202,19 +202,19 @@ def discover_translations(tmp, name, result_folder, packages_folder): return (tsearch, tcopy, cresults) -def copy_translations(tmp, translation, name, result_folder, packages_folder): - # translation, name, result_folder, packages_folder +def copy_translations(tmp, translation, pkg_name, result_folder, packages_folder): filemask = translation["filemask"] print("copy translations " + filemask) if translation["file_format"] in ["po", "json"]: for po in glob.glob(tmp + "/" + filemask): - dest = packages_folder + "/" + name + "/" + filemask.split("*")[0] - os.makedirs(dest, exist_ok=True) + dest = packages_folder + "/" + pkg_name + "/" + po.replace(tmp, "") + dest_folder = dest.replace(os.path.basename(dest), "") + os.makedirs(dest_folder, exist_ok=True) # use copyfile instead of copy2 to handle read-only files in rpm - copyfile(po, os.path.join(dest, os.path.basename(po))) + copyfile(po, dest) if __name__ == '__main__': main() From d247ba619b2feea78dee5c3bcf1f64c42e38506f Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Nov 07 2020 15:34:06 +0000 Subject: [PATCH 8/9] try to use language name to deduct language code --- diff --git a/build_language_list.py 
b/build_language_list.py index 2190a91..db9263c 100755 --- a/build_language_list.py +++ b/build_language_list.py @@ -7,6 +7,8 @@ import os import time import polib +from pprint import pprint + from shutil import rmtree from weblate_language_data import aliases, languages, language_codes, countries @@ -71,7 +73,7 @@ def main(): po_langs = detect_languages(packages_path) for lang in po_langs.keys(): - with open(os.path.join(lang_path, lang + '.json'), 'w') as f: + with open(os.path.join(lang_path, str(lang) + '.json'), 'w') as f: f.write(json.dumps(po_langs[lang], indent=2)) search_duration = round(time.time() - start_time_search, 1) @@ -108,34 +110,27 @@ def analyze_lang(lang_path, analized_lang): if analized_lang != "error": lang = choose_lang("", metadata, error) - if metadata.get("Language") not in results.keys(): - results[metadata.get("Language")] = dict() + language = results.get(metadata.get("Language"), dict()) - try: - results[metadata.get("Language")]["Count"] += 1 - except KeyError: - results[metadata.get("Language")]["Count"] = 1 + count = language.get("Count", 0) + count += 1 + language["Count"] = count - try: - results[metadata.get("Language")]["Files"].append(file) - except KeyError: - results[metadata.get("Language")]["Files"] = [] - results[metadata.get("Language")]["Files"].append(file) + lang_files = language.get("Files", []) + lang_files.append(file) + language["Files"] = lang_files - try: - results[metadata.get("Language")]["Plural-Forms"].append(metadata.get("Plural-Forms")) - results[metadata.get("Language")]["Plural-Forms"] = list(set(results[metadata.get("Language")]["Plural-Forms"])) - except KeyError: - results[metadata.get("Language")]["Plural-Forms"] = [] - results[metadata.get("Language")]["Plural-Forms"].append(metadata.get("Plural-Forms")) + plurals = language.get("Plural-Forms", []) + plurals.append(metadata.get("Plural-Forms")) + plurals = list(set(plurals)) + language["Plural-Forms"] = plurals + teams = 
language.get("Language-Team", []) + teams.append(metadata.get("Language-Team")) + teams = list(set(teams)) + language["Language-Team"] = teams - try: - results[metadata.get("Language")]["Language-Team"].append(metadata.get("Language-Team")) - results[metadata.get("Language")]["Language-Team"] = list(set(results[metadata.get("Language")]["Language-Team"])) - except KeyError: - results[metadata.get("Language")]["Language-Team"] = [] - results[metadata.get("Language")]["Language-Team"].append(metadata.get("Language-Team")) + results[metadata.get("Language")] = language results = dict(sorted(results.items(), key=lambda item: item[0])) @@ -192,30 +187,35 @@ def choose_lang(filename, metadata, error): lang = "" file_name = filename.lower().replace("-", "_") - meta_language = metadata.get("Language") - if isinstance(meta_language, str): - meta_language = meta_language.lower().replace("-", "_") + meta_language = metadata.get("Language","").lower().replace("-", "_") - meta_team = metadata.get("Language-Team") - if isinstance(meta_team, str): - meta_team = meta_team.lower().replace("-", "_") + meta_team = metadata.get("Language-Team","").lower().replace("-", "_") if meta_language in language_codes.LANGUAGES: lang = meta_language elif file_name in language_codes.LANGUAGES: lang = file_name + else: lang = "noresult" # try languages (some codes here are exclused from languages_codes) if lang == "noresult": - loc = [ lang[0] for lang in languages.LANGUAGES ] + codes = dict() + for language in languages.LANGUAGES: + # 0 is language code + # 1 is language name + codes[language[1].lower()] = language[0].lower() + + if meta_language in codes.values(): + lang = meta_language + + elif file_name in codes.values(): + lang = file_name - if meta_language in loc: - lang = meta_language.lower() - elif file_name in loc: - lang = file_name.lower() + elif meta_language in codes.keys(): + lang = codes.get(meta_language) # try ALIASES if lang == "noresult": From 
32f1678a3380471a4e27c3999f0522c43ddd406c Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Nov 07 2020 15:34:56 +0000 Subject: [PATCH 9/9] compress output files --- diff --git a/build_tm.py b/build_tm.py index d2ee0d9..9b6d906 100755 --- a/build_tm.py +++ b/build_tm.py @@ -2,6 +2,7 @@ """Consolidate each po files into compendium""" import argparse +import gzip import json import os import subprocess @@ -21,6 +22,9 @@ def main(): parser.add_argument("--refresh", action="store_true", help="Force refresh of files") + parser.add_argument("--compress", action="store_true", + help="Compress output files") + parser.add_argument("--lang", required=False, type=str, help="Filter a language to analyze") @@ -53,6 +57,9 @@ def main(): for lang in sorted(langs): check_lang(lang[:-len('.json')], tm_folder) + if args.compress: + compress(tm_folder) + search_duration = round(time.time() - start_time_search, 1) print(" Done in {d} seconds".format(d=search_duration)) @@ -125,6 +132,21 @@ def check_lang(lang, tm_folder): if not os.path.isfile(terminology_file): print(" {l}-terminology is missing".format(l=lang)) +def compress(folder): + """ Compress files using gzip """ + + files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))] + count = 0 + total = len(files) + + for file in sorted(files): + count += 1 + dest = file + ".gz" + print(" {c}/{t}".format(c=count, t=total)) + + with open(os.path.join(folder, file), "rb") as file_in: + with gzip.open(os.path.join(folder, dest), "wb") as file_out: + file_out.writelines(file_in) if __name__ == '__main__': main()