From 0b62df10db1e1542fc1459448c48c249d0423225 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Holcroft
Date: Oct 09 2020 19:43:12 +0000
Subject: [PATCH 1/7] run compendium in a temporary folder


as we now have hundreds of files, the function is quite slow

as we now have a hierarchy of files for each package, language detection
has to be smarter

---

diff --git a/build_tm.py b/build_tm.py
index fd8f63b..2f57757 100755
--- a/build_tm.py
+++ b/build_tm.py
@@ -7,6 +7,8 @@ import os
 import subprocess
 import tempfile
 
+from shutil import copyfile
+
 
 def main():
     """Handle params"""
@@ -17,7 +19,7 @@ def main():
                         choices=[30, 31, 32],
                         help="Provide the Fedora release to analyze")
 
-    parser.add_argument("--lang", required=False, type=str,
+    parser.add_argument("--lang", required=True, type=str,
                         help="Filter a language to analyze")
 
     args = parser.parse_args()
@@ -28,11 +30,12 @@ def main():
     po_langs = list(set([ os.path.basename(po) for po in pofiles ]))
     po_langs.sort()
 
-    if len(args.lang) > 0:
+    if args.lang:
         compute_lang(args.lang+".po", tm_folder)
     else:
-        for langfile in po_langs:
-            compute_lang(langfile, tm_folder)
+        print("not yet working")
+        # for langfile in po_langs:
+        # compute_lang(langfile, tm_folder)
 
 def compute_lang(langfile, tm_folder):
     """ Generate compendium and convert it to tmx"""
@@ -43,16 +46,29 @@ def compute_lang(langfile, tm_folder):
     print(" - po consolidation")
     compendium_file = tm_folder + lang + ".po"
-    command = ["pocompendium", compendium_file] + pofiles
-    subprocess.run(command, check=True)
+    count = 1
+    total_files = len(pofiles)
+
+    with tempfile.TemporaryDirectory(prefix="l10n-tm") as tmp:
+        tmp_compendium_file = os.path.join(tmp, "compendium." + lang + ".po")
+        for file in pofiles:
+            count += 1
+            pofile = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                                  file)
+            print(" {c}/{t} processing {n}".format(c=count, t=total_files, n=pofile))
+
+            command = ["pocompendium", tmp_compendium_file, tmp_compendium_file, pofile]
+            subprocess.run(command, check=True, cwd=tmp)
+
+        copyfile(tmp_compendium_file, compendium_file)
 
-    print(" - po to tmx convertion")
+    print(" - po to tmx convertion")
     tmx_file = tm_folder + lang + ".tmx"
     command = ["po2tmx", "--language="+lang, "--progress=none",
                compendium_file, "--output="+tmx_file]
     subprocess.run(command, check=True)
 
-    print(" - language terminology")
+    print(" - language terminology")
     terminology_file = tm_folder + lang + ".terminology.po"
     command = ["poterminology", "--ignore-case", "--fold-titlecase",
                "--inputs-needed", "1",
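Note on the call above: pocompendium takes the output file as its first argument and merges the remaining po files into it, so passing the temporary compendium twice folds one more catalogue into it at each iteration. Done by hand it would look roughly like this (file names are only an example):

    pocompendium compendium.fr.po compendium.fr.po some-package/fr.po
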
From 5c9d08bc94bdea64279edb2d123a59be578aaab4 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Holcroft
Date: Oct 10 2020 07:19:58 +0000
Subject: [PATCH 2/7] analyse languages and related files before generating compendium and tmx


as we now have a hierarchy of files with multiple values, we need to detect
the language of each file

language detection is done by using polib, and the result is stored as json
(a lot of cleaning is required here)

then, parsing is done per group of files, because creating a compendium with
one single buggy file makes the whole process fail

as we can have more than a thousand files per language, it also takes quite
a lot of time

the whole process is slow, really slow

---

diff --git a/build.py b/build.py
index 9d2ffeb..4e9e0c2 100755
--- a/build.py
+++ b/build.py
@@ -42,7 +42,7 @@ def main():
     (distname, distrel, distid) = distro.linux_distribution()
 
     result_folder = "./results/f{v}/".format(v=distrel)
-    tm_folder = "./tm/f{v}/".format(v=distrel)
+    tm_folder = "./tm/f{v}/packages/".format(v=distrel)
     srpms_path = "/srpms"
 
     if not os.path.exists(result_folder):
diff --git a/build_tm.py b/build_tm.py
index 2f57757..7e29d9b 100755
--- a/build_tm.py
+++ b/build_tm.py
@@ -3,11 +3,14 @@ import argparse
 import glob
+import json
 import os
+import polib
 import subprocess
 import tempfile
 
 from shutil import copyfile
+from shutil import rmtree
 
 def main():
     """Handle params"""
@@ -19,47 +22,118 @@ def main():
                         choices=[30, 31, 32],
                         help="Provide the Fedora release to analyze")
 
-    parser.add_argument("--lang", required=True, type=str,
+    parser.add_argument("--lang", required=False, type=str,
                         help="Filter a language to analyze")
 
+    parser.add_argument("--refresh", action="store_true",
+                        help="Refresh list of available languages to analyze")
+
     args = parser.parse_args()
 
-    tm_folder="./tm/f{v}/".format(v=args.release)
+    release_folder = "./tm/f{v}/".format(v=args.release)
+    lang_path = os.path.join(release_folder, "languages/")
+    packages_path = os.path.join(release_folder, "packages/")
+    tm_folder = os.path.join(release_folder, "out/")
+
+    if args.refresh:
+        rmtree(lang_path)
+        os.mkdir(lang_path)
 
-    pofiles = glob.glob(tm_folder+'*/*/*.po')
-    po_langs = list(set([ os.path.basename(po) for po in pofiles ]))
-    po_langs.sort()
+        po_langs = detect_languages(packages_path)
+
+        for lang in po_langs.keys():
+            with open(os.path.join(lang_path, lang + '.json'), 'w') as f:
+                f.write(json.dumps(po_langs[lang], indent=2))
 
     if args.lang:
-        compute_lang(args.lang+".po", tm_folder)
+        with open(os.path.join(lang_path, args.lang + ".json"), "r") as read_file:
+            print("Converting JSON encoded data into Python dictionary")
+            files = json.load(read_file)
+        compute_lang(args.lang, files, tm_folder)
     else:
-        print("not yet working")
-        # for langfile in po_langs:
-        # compute_lang(langfile, tm_folder)
+        langs = [f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f))]
+
+        for lang in langs:
+            with open(os.path.join(lang_path, lang), "r") as read_file:
+                print("Converting JSON encoded data into Python dictionary")
+                files = json.load(read_file)
+
+            compute_lang(lang[:-len('.json')], files, tm_folder)
+
+def detect_languages(tm_folder):
+    langs = {}
+
+    count = 1
+    total_file = 0
+    for root, directories, files in os.walk(tm_folder):
+        total_file += len(files)
 
-def compute_lang(langfile, tm_folder):
+    for root, directories, files in os.walk(tm_folder):
+
+        for file in files:
+            print("{c}/{t}".format(c=count, t=total_file))
+            count += 1
+
+            racine, ext = os.path.splitext(file)
+            if ext == ".po":
+                lang = "error"
+                try:
+                    lang = polib.pofile(os.path.join(root, file)).metadata['Language']
+                except KeyError as e:
+                    # no Language in properties
+                    lang = racine
+                except UnicodeDecodeError as e:
+                    # encoding error, to investigate before using it in TM
+                    lang = "error-unicode"
+                except OSError as e:
+                    # maybe a polib bug? to investigate before using it in TM
+                    lang = "error-os"
+
+                if lang == "":
+                    lang = racine
+
+                try:
+                    langs[lang].append(os.path.join(root, file))
+                except KeyError:
+                    langs[lang] = list()
+                    langs[lang].append(os.path.join(root, file))
+
+    return langs
+
+def compute_lang(lang, langfiles, tm_folder):
     """ Generate compendium and convert it to tmx"""
-    lang = os.path.splitext(langfile)[0]
-    print("Creating lang: " + lang)
+    print("Computing lang: " + lang)
 
-    pofiles = glob.glob(tm_folder+'*/*/'+langfile)
+    pofiles = langfiles
 
     print(" - po consolidation")
     compendium_file = tm_folder + lang + ".po"
     count = 1
     total_files = len(pofiles)
+    pofiles = [os.path.join(os.path.dirname(os.path.abspath(__file__)),f) for f in pofiles]
 
     with tempfile.TemporaryDirectory(prefix="l10n-tm") as tmp:
         tmp_compendium_file = os.path.join(tmp, "compendium." + lang + ".po")
-        for file in pofiles:
-            count += 1
-            pofile = os.path.join(os.path.dirname(os.path.abspath(__file__)),
-                                  file)
-            print(" {c}/{t} processing {n}".format(c=count, t=total_files, n=pofile))
-
-            command = ["pocompendium", tmp_compendium_file, tmp_compendium_file, pofile]
+        with open(tmp_compendium_file, 'w') as fp:
+            pass
+
+        step = 10
+        entier = total_files // step
+        modulo = total_files % step
+        start = 0
+        for i in range(entier):
+            stop = (i + 1) * step
+
+            command = ["pocompendium", tmp_compendium_file, tmp_compendium_file]
+            command += pofiles[start:stop]
             subprocess.run(command, check=True, cwd=tmp)
+            start += step
+
+        command = ["pocompendium", tmp_compendium_file, tmp_compendium_file]
+        command += pofiles[start:len(pofiles)]
+        subprocess.run(command, check=True, cwd=tmp)
+
         copyfile(tmp_compendium_file, compendium_file)
 
     print(" - po to tmx convertion")
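After a --refresh run, each languages/<code>.json file is simply the list of po files detected for that language. For French, tm/f32/languages/fr.json would look something like this (the package paths are only an example):

    [
      "./tm/f32/packages/anaconda/fr.po",
      "./tm/f32/packages/abrt/fr.po"
    ]
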
From b27aaabfaa9d0a5da5c5de88663a3f25baeb5273 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Holcroft
Date: Oct 12 2020 05:07:16 +0000
Subject: [PATCH 3/7] smarter way to detect languages


as we can't trust the filename, try first to get the Language metadata
from the po file

the language list comes from Weblate's language lists

---

diff --git a/build_tm.py b/build_tm.py
index 7e29d9b..e6f8049 100755
--- a/build_tm.py
+++ b/build_tm.py
@@ -8,9 +8,11 @@ import os
 import polib
 import subprocess
 import tempfile
+import time
 
 from shutil import copyfile
 from shutil import rmtree
+from weblate_language_data import aliases, languages, language_codes
 
 def main():
     """Handle params"""
@@ -35,16 +37,24 @@ def main():
     packages_path = os.path.join(release_folder, "packages/")
     tm_folder = os.path.join(release_folder, "out/")
 
+    # Step 1: compute the list of languages
     if args.refresh:
+        print("Refresh the list of languages")
         rmtree(lang_path)
         os.mkdir(lang_path)
 
+        start_time_search = time.time()
+
         po_langs = detect_languages(packages_path)
 
         for lang in po_langs.keys():
             with open(os.path.join(lang_path, lang + '.json'), 'w') as f:
                 f.write(json.dumps(po_langs[lang], indent=2))
 
+        search_duration = round(time.time() - start_time_search, 1)
+        print(" Done in {d} seconds".format(d=search_duration))
+
+    # Step 2: call TM activities
     if args.lang:
         with open(os.path.join(lang_path, args.lang + ".json"), "r") as read_file:
             print("Converting JSON encoded data into Python dictionary")
@@ -61,6 +71,9 @@ def main():
             compute_lang(lang[:-len('.json')], files, tm_folder)
 
 def detect_languages(tm_folder):
+    """ For each po file, detect metadatas and deduct the language """
+    """ Requires: a file hierarchy with po files """
+    """ Returns: a dictionary of lists, key=lang code, value=file list """
     langs = {}
 
     count = 1
     total_file = 0
     for root, directories, files in os.walk(tm_folder):
         total_file += len(files)
@@ -76,21 +89,18 @@ def detect_languages(tm_folder):
             racine, ext = os.path.splitext(file)
             if ext == ".po":
-                lang = "error"
+                metadata = dict()
+                error = ""
                 try:
-                    lang = polib.pofile(os.path.join(root, file)).metadata['Language']
-                except KeyError as e:
-                    # no Language in properties
-                    lang = racine
+                    metadata = polib.pofile(os.path.join(root, file)).metadata
                 except UnicodeDecodeError as e:
                     # encoding error, to investigate before using it in TM
-                    lang = "error-unicode"
+                    error = "error-unicode"
                 except OSError as e:
                     # maybe a polib bug? to investigate before using it in TM
-                    lang = "error-os"
+                    error = "error-os"
 
-                if lang == "":
-                    lang = racine
+                lang = choose_lang(racine, metadata, error)
 
                 try:
                     langs[lang].append(os.path.join(root, file))
@@ -100,13 +110,62 @@ def detect_languages(tm_folder):
 
     return langs
 
+def choose_lang(filename, metadata, error):
+    """ From a po file and its medata, choose the most likely language code """
+    """ By priority: the Language medata """
+    """ Returns: a language code """
+
+    lang = ""
+    file_name = filename.lower()
+    meta_language = ""
+    meta_team = ""
+    try:
+        meta_language = metadata.get("Language").lower()
+    except AttributeError:
+        pass
+
+    try:
+        meta_team = metadata.get("Language-Team").lower()
+    except AttributeError:
+        pass
+
+    if meta_language in language_codes.LANGUAGES:
+        lang = meta_language
+
+    elif file_name in language_codes.LANGUAGES:
+        lang = file_name
+    else:
+        lang = "noresult"
+
+    # try languages (some codes here are exclused from languages_codes)
+    if lang == "noresult":
+        loc = [ lang[0] for lang in languages.LANGUAGES ]
+
+        if meta_language in loc:
+            lang = meta_language
+        elif file_name in loc:
+            lang = file_name
+
+    # try ALIASES
+    if lang == "noresult":
+        if meta_language in aliases.ALIASES.keys():
+            lang = aliases.ALIASES[meta_language]
+        elif file_name in aliases.ALIASES.keys():
+            lang = aliases.ALIASES[file_name]
+        else:
+            lang = "error"
+
+    return lang
+
 def compute_lang(lang, langfiles, tm_folder):
-    """ Generate compendium and convert it to tmx"""
+    """ Generate compendium and convert it to tmx """
+    """ """
     print("Computing lang: " + lang)
 
     pofiles = langfiles
 
     print(" - po consolidation")
+    start_time_compendium = time.time()
     compendium_file = tm_folder + lang + ".po"
     count = 1
     total_files = len(pofiles)
@@ -123,19 +182,24 @@ def compute_lang(lang, langfiles, tm_folder):
         start = 0
         for i in range(entier):
             stop = (i + 1) * step
+            print(" {s}/{t} {d}".format(s=start,
+                                        t=total_files))
 
             command = ["pocompendium", tmp_compendium_file, tmp_compendium_file]
             command += pofiles[start:stop]
-            subprocess.run(command, check=True, cwd=tmp)
+            subprocess.run(command, check=True, cwd=tmp, capture_output=True)
             start += step
 
         command = ["pocompendium", tmp_compendium_file, tmp_compendium_file]
         command += pofiles[start:len(pofiles)]
-        subprocess.run(command, check=True, cwd=tmp)
+        subprocess.run(command, check=True, cwd=tmp, capture_output=True)
 
         copyfile(tmp_compendium_file, compendium_file)
 
+    compendium_duration = round(time.time() - start_time_compendium, 1)
+    print(" Done in {d} seconds".format(d=search_duration))
+
     print(" - po to tmx convertion")
     tmx_file = tm_folder + lang + ".tmx"
     command = ["po2tmx", "--language="+lang, "--progress=none",
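In other words, the detection now trusts what the po file declares about itself before falling back to the file name; the header it reads is just a line in the file's metadata block, visible with something like this (the path is only an example):

    grep '"Language:' tm/f32/packages/some-package/fr.po
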
From f601da580e9b1eee4996a7b699bbd50b81fe78de Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Holcroft
Date: Oct 12 2020 16:42:51 +0000
Subject: [PATCH 4/7] optimize translation memory generation


the previous optimization was slowing everything down...

let's keep it simple by using more RAM

---

diff --git a/build_tm.py b/build_tm.py
index e6f8049..be89294 100755
--- a/build_tm.py
+++ b/build_tm.py
@@ -57,15 +57,14 @@ def main():
     # Step 2: call TM activities
     if args.lang:
         with open(os.path.join(lang_path, args.lang + ".json"), "r") as read_file:
-            print("Converting JSON encoded data into Python dictionary")
             files = json.load(read_file)
+
         compute_lang(args.lang, files, tm_folder)
     else:
         langs = [f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f))]
 
-        for lang in langs:
+        for lang in sorted(langs):
             with open(os.path.join(lang_path, lang), "r") as read_file:
-                print("Converting JSON encoded data into Python dictionary")
                 files = json.load(read_file)
 
             compute_lang(lang[:-len('.json')], files, tm_folder)
@@ -160,58 +159,36 @@ def choose_lang(filename, metadata, error):
 def compute_lang(lang, langfiles, tm_folder):
     """ Generate compendium and convert it to tmx """
     """ """
-    print("Computing lang: " + lang)
-
-    pofiles = langfiles
+    print("Computing: " + lang)
 
-    print(" - po consolidation")
-    start_time_compendium = time.time()
+    # po consolidation
     compendium_file = tm_folder + lang + ".po"
-    count = 1
-    total_files = len(pofiles)
-    pofiles = [os.path.join(os.path.dirname(os.path.abspath(__file__)),f) for f in pofiles]
+    pofiles = [os.path.join(os.path.dirname(os.path.abspath(__file__)),f) for f in langfiles]
 
     with tempfile.TemporaryDirectory(prefix="l10n-tm") as tmp:
         tmp_compendium_file = os.path.join(tmp, "compendium." + lang + ".po")
-        with open(tmp_compendium_file, 'w') as fp:
-            pass
-
-        step = 10
-        entier = total_files // step
-        modulo = total_files % step
-        start = 0
-        for i in range(entier):
-            stop = (i + 1) * step
-            print(" {s}/{t} {d}".format(s=start,
-                                        t=total_files))
-
-            command = ["pocompendium", tmp_compendium_file, tmp_compendium_file]
-            command += pofiles[start:stop]
-            subprocess.run(command, check=True, cwd=tmp, capture_output=True)
+        copyfile(pofiles.pop(), tmp_compendium_file)
 
-            start += step
+        # prevent error if only one file is available for the language
+        if len(pofiles):
+            command = ["pocompendium", tmp_compendium_file] + pofiles
 
-        command = ["pocompendium", tmp_compendium_file, tmp_compendium_file]
-        command += pofiles[start:len(pofiles)]
-        subprocess.run(command, check=True, cwd=tmp, capture_output=True)
+            subprocess.run(command, check=True, cwd=tmp, capture_output=True)
 
         copyfile(tmp_compendium_file, compendium_file)
 
-    compendium_duration = round(time.time() - start_time_compendium, 1)
-    print(" Done in {d} seconds".format(d=search_duration))
-
-    print(" - po to tmx convertion")
+    # po to tmx convertion
     tmx_file = tm_folder + lang + ".tmx"
     command = ["po2tmx", "--language="+lang, "--progress=none",
                compendium_file, "--output="+tmx_file]
-    subprocess.run(command, check=True)
+    subprocess.run(command, check=True, capture_output=True)
 
-    print(" - language terminology")
+    # language terminology
     terminology_file = tm_folder + lang + ".terminology.po"
     command = ["poterminology", "--ignore-case", "--fold-titlecase",
                "--inputs-needed", "1",
                "--progress=verbose", compendium_file, "--output="+terminology_file]
-    subprocess.run(command, check=True)
+    subprocess.run(command, check=True, capture_output=True)
 
 if __name__ == '__main__':
     main()

From c7df77557908c49b936295f8f1bdeb83c699ae10 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Holcroft
Date: Oct 12 2020 20:08:25 +0000
Subject: [PATCH 5/7] less text


---

diff --git a/build_tm.py b/build_tm.py
index be89294..ca761fb 100755
--- a/build_tm.py
+++ b/build_tm.py
@@ -75,17 +75,8 @@ def detect_languages(tm_folder):
     """ Returns: a dictionary of lists, key=lang code, value=file list """
     langs = {}
 
-    count = 1
-    total_file = 0
     for root, directories, files in os.walk(tm_folder):
-        total_file += len(files)
-
-    for root, directories, files in os.walk(tm_folder):
-
         for file in files:
-            print("{c}/{t}".format(c=count, t=total_file))
-            count += 1
-
             racine, ext = os.path.splitext(file)
             if ext == ".po":
                 metadata = dict()
                 error = ""

From f3e04f2c8719f1b1384bdda9c355d7565080fc65 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Holcroft
Date: Oct 13 2020 07:12:12 +0000
Subject: [PATCH 6/7] use msgcat instead of pocompendium


pocompendium crashes and stops everything in case of error, while msgcat
continues to aggregate

but use msguniq to prevent msgcat from stopping when two messages are the
same ;)

---

diff --git a/build_tm.py b/build_tm.py
index ca761fb..2883db6 100755
--- a/build_tm.py
+++ b/build_tm.py
@@ -154,19 +154,34 @@ def compute_lang(lang, langfiles, tm_folder):
     # po consolidation
     compendium_file = tm_folder + lang + ".po"
+    compendium_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), compendium_file)
     pofiles = [os.path.join(os.path.dirname(os.path.abspath(__file__)),f) for f in langfiles]
+
+    count = 0
+
     with tempfile.TemporaryDirectory(prefix="l10n-tm") as tmp:
-        tmp_compendium_file = os.path.join(tmp, "compendium." + lang + ".po")
-        copyfile(pofiles.pop(), tmp_compendium_file)
+        for i in pofiles:
+            try:
+                command = ["msguniq", i, "--output-file", count.__str__(), "--no-location"]
+                subprocess.run(command, check=True, cwd=tmp, capture_output=True)
+            except subprocess.CalledProcessError as e:
+                try:
+                    command = ["msguniq", i, "--output-file", count.__str__(), "--to-code", "utf-8", "--no-location"]
+                    subprocess.run(command, check=True, cwd=tmp, capture_output=True)
+                except subprocess.CalledProcessError as e:
+                    print("Error with msguniq {i}, error: {e}".format(i=i, e=e))
+
+            count += 1
 
-        # prevent error if only one file is available for the language
-        if len(pofiles):
-            command = ["pocompendium", tmp_compendium_file] + pofiles
+        onlyfiles = [f for f in os.listdir(tmp) if os.path.isfile(os.path.join(tmp, f))]
+        command = ["msgcat", "--force-po", "--no-location", "--output-file", compendium_file] + onlyfiles
 
+        try:
             subprocess.run(command, check=True, cwd=tmp, capture_output=True)
+        except subprocess.CalledProcessError as e:
+            print(" msgcat exception...")
 
-        copyfile(tmp_compendium_file, compendium_file)
 
     # po to tmx convertion
     tmx_file = tm_folder + lang + ".tmx"
@@ -178,7 +193,7 @@ def compute_lang(lang, langfiles, tm_folder):
     terminology_file = tm_folder + lang + ".terminology.po"
     command = ["poterminology", "--ignore-case", "--fold-titlecase",
                "--inputs-needed", "1",
-               "--progress=verbose", compendium_file, "--output="+terminology_file]
+               "--progress=none", compendium_file, "--output="+terminology_file]
     subprocess.run(command, check=True, capture_output=True)
 
 if __name__ == '__main__':

From 642c89320fc4dee507f4264034a790e4c6f63b77 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Holcroft
Date: Oct 15 2020 20:05:11 +0000
Subject: [PATCH 7/7] add debug file for translation finder


---

diff --git a/debug/debug_translation_finder.py b/debug/debug_translation_finder.py
new file mode 100755
index 0000000..3ddda8d
--- /dev/null
+++ b/debug/debug_translation_finder.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+# Allow to test the result of translation_finder on a local folder
+
+import argparse
+import os
+import pprint
+
+from translation_finder import discover
+
+def main():
+    """Handle params"""
+
+    parser = argparse.ArgumentParser(
+        description="Detect translation files")
+    parser.add_argument("--folder", required=True,
+                        help="the folder to analyze")
+    args = parser.parse_args()
+
+    folderToScan = args.folder
+
+    if os.path.exists(folderToScan):
+        discover_translations(folderToScan)
+
+
+def discover_translations(folder):
+    """find po file"""
+    print("discover_translations: "+folder)
+    translation_files = []
+
+    try:
+        translation_files = discover(folder)
+    except OSError:
+        print("error while searching for new")
+
+
+    pp = pprint.PrettyPrinter(indent=2)
+    pp.pprint(translation_files)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/debug/launch.sh b/debug/launch.sh
new file mode 100755
index 0000000..2f0d8f5
--- /dev/null
+++ b/debug/launch.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+rm -rf venv
+virtualenv venv
+source venv/bin/activate
+pip install -r ../requirements.txt
+pip install --upgrade https://github.com/WeblateOrg/language-data/archive/master.zip
+pip install ruamel.yaml charamel
+pip install --upgrade https://github.com/WeblateOrg/translation-finder/archive/master.zip