#8 compendium generation
Merged 3 years ago by jibecfed. Opened 3 years ago by jibecfed.

file modified
+1 -1
@@ -42,7 +42,7 @@ 

  

      (distname, distrel, distid) = distro.linux_distribution()

      result_folder = "./results/f{v}/".format(v=distrel)

-     tm_folder = "./tm/f{v}/".format(v=distrel)

+     tm_folder = "./tm/f{v}/packages/".format(v=distrel)

      srpms_path = "/srpms"

  

      if not os.path.exists(result_folder):

file modified
+159 -22
@@ -3,9 +3,16 @@ 

  

  import argparse

  import glob

+ import json

  import os

+ import polib

  import subprocess

  import tempfile

+ import time

+ 

+ from shutil import copyfile

+ from shutil import rmtree

+ from weblate_language_data import aliases, languages, language_codes

  

  def main():

      """Handle params"""
@@ -20,44 +27,174 @@ 

      parser.add_argument("--lang", required=False, type=str,

                          help="Filter a language to analyze")

  

+     parser.add_argument("--refresh", action="store_true",

+                         help="Refresh list of available languages to analyze")

+ 

      args = parser.parse_args()

  

-     tm_folder="./tm/f{v}/".format(v=args.release)

+     release_folder = "./tm/f{v}/".format(v=args.release)

+     lang_path = os.path.join(release_folder, "languages/")

+     packages_path = os.path.join(release_folder, "packages/")

+     tm_folder = os.path.join(release_folder, "out/")

  

-     pofiles = glob.glob(tm_folder+'*/*/*.po')

-     po_langs = list(set([ os.path.basename(po) for po in pofiles ]))

-     po_langs.sort()

+     # Step 1: compute the list of languages

+     if args.refresh:

+         print("Refresh the list of languages")

+         rmtree(lang_path)

+         os.mkdir(lang_path)

  

-     if len(args.lang) > 0:

-         compute_lang(args.lang+".po", tm_folder)

-     else:

-         for langfile in po_langs:

-             compute_lang(langfile, tm_folder)

+         start_time_search = time.time()

+ 

+         po_langs = detect_languages(packages_path)

  

- def compute_lang(langfile, tm_folder):

-     """ Generate compendium and convert it to tmx"""

-     lang = os.path.splitext(langfile)[0]

-     print("Creating lang: " + lang)

+         for lang in po_langs.keys():

+             with open(os.path.join(lang_path, lang + '.json'), 'w') as f:

+                 f.write(json.dumps(po_langs[lang], indent=2))

  

-     pofiles = glob.glob(tm_folder+'*/*/'+langfile)

+         search_duration = round(time.time() - start_time_search, 1)

+         print(" Done in {d} seconds".format(d=search_duration))

  

-     print(" - po consolidation")

+     # Step 2: call TM activities

+     if args.lang:

+         with open(os.path.join(lang_path, args.lang + ".json"), "r") as read_file:

+             files = json.load(read_file)

+ 

+         compute_lang(args.lang, files, tm_folder)

+     else:

+         langs = [f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f))]

+ 

+         for lang in sorted(langs):

+             with open(os.path.join(lang_path, lang), "r") as read_file:

+                 files = json.load(read_file)

+ 

+             compute_lang(lang[:-len('.json')], files, tm_folder)

+ 

+ def detect_languages(tm_folder):

+     """ For each po file, detect metadata and deduce the language      """

+     """ Requires: a file hierarchy with po files                       """

+     """ Returns: a dictionary of lists, key=lang code, value=file list """

+     langs = {}

+ 

+     for root, directories, files in os.walk(tm_folder):

+         for file in files:

+             racine, ext = os.path.splitext(file)

+             if ext == ".po":

+                 metadata = dict()

+                 error = ""

+                 try:

+                     metadata = polib.pofile(os.path.join(root, file)).metadata

+                 except UnicodeDecodeError as e:

+                     # encoding error, to investigate before using it in TM

+                     error = "error-unicode"

+                 except OSError as e:

+                     # maybe a polib bug? to investigate before using it in TM

+                     error = "error-os"

+ 

+                 lang = choose_lang(racine, metadata, error)

+ 

+                 try:

+                     langs[lang].append(os.path.join(root, file))

+                 except KeyError:

+                     langs[lang] = list()

+                     langs[lang].append(os.path.join(root, file))

+ 

+     return langs

+ 

+ def choose_lang(filename, metadata, error):

+     """ From a po file and its metadata, choose the most likely language code """

+     """ By priority: the Language medata """

+     """ Returns: a language code """

+ 

+     lang = ""

+     file_name = filename.lower()

+     meta_language = ""

+     meta_team = ""

+     try:

+         meta_language = metadata.get("Language").lower()

+     except AttributeError:

+         pass

+ 

+     try:

+         meta_team = metadata.get("Language-Team").lower()

+     except AttributeError:

+         pass

+ 

+     if meta_language in language_codes.LANGUAGES:

+         lang = meta_language

+ 

+     elif file_name in language_codes.LANGUAGES:

+         lang = file_name

+     else:

+         lang = "noresult"

+ 

+     # try languages (some codes here are excluded from language_codes)

+     if lang == "noresult":

+         loc = [ lang[0] for lang in languages.LANGUAGES ]

+ 

+         if meta_language in loc:

+             lang = meta_language

+         elif file_name in loc:

+             lang = file_name

+ 

+     # try ALIASES

+     if lang == "noresult":

+         if meta_language in aliases.ALIASES.keys():

+             lang = aliases.ALIASES[meta_language]

+         elif file_name in aliases.ALIASES.keys():

+             lang = aliases.ALIASES[file_name]

+         else:

+             lang = "error"

+ 

+     return lang

+ 

+ def compute_lang(lang, langfiles, tm_folder):

+     """ Generate compendium and convert it to tmx """

+     """  """

+     print("Computing: " + lang)

+ 

+     # po consolidation

      compendium_file = tm_folder + lang + ".po"

-     command = ["pocompendium", compendium_file] + pofiles

-     subprocess.run(command, check=True)

+     compendium_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), compendium_file)

+ 

+     pofiles = [os.path.join(os.path.dirname(os.path.abspath(__file__)),f) for f in langfiles]

+ 

+     count = 0

+ 

+     with tempfile.TemporaryDirectory(prefix="l10n-tm") as tmp:

+         for i in pofiles:

+             try:

+                 command = ["msguniq", i, "--output-file", count.__str__(), "--no-location"]

+                 subprocess.run(command, check=True, cwd=tmp, capture_output=True)

+             except subprocess.CalledProcessError as e:

+                 try:

+                     command = ["msguniq", i, "--output-file", count.__str__(), "--to-code", "utf-8", "--no-location"]

+                     subprocess.run(command, check=True, cwd=tmp, capture_output=True)

+                 except subprocess.CalledProcessError as e:

+                     print("Error with msguniq {i}, error: {e}".format(i=i, e=e))

+ 

+             count += 1

+ 

+         onlyfiles = [f for f in os.listdir(tmp) if os.path.isfile(os.path.join(tmp, f))]

+         command = ["msgcat", "--force-po", "--no-location", "--output-file", compendium_file] + onlyfiles

+ 

+         try:

+             subprocess.run(command, check=True, cwd=tmp, capture_output=True)

+         except subprocess.CalledProcessError as e:

+             print(" msgcat exception...")

+ 

  

-     print("  - po to tmx convertion")

+     # po to tmx conversion

      tmx_file = tm_folder + lang + ".tmx"

      command = ["po2tmx", "--language="+lang, "--progress=none",

                 compendium_file, "--output="+tmx_file]

-     subprocess.run(command, check=True)

+     subprocess.run(command, check=True, capture_output=True)

  

-     print("  - language terminology")

+     # language terminology

      terminology_file = tm_folder + lang + ".terminology.po"

      command = ["poterminology", "--ignore-case", "--fold-titlecase",

                  "--inputs-needed", "1",

-                 "--progress=verbose", compendium_file, "--output="+terminology_file]

-     subprocess.run(command, check=True)

+                 "--progress=none", compendium_file, "--output="+terminology_file]

+     subprocess.run(command, check=True, capture_output=True)

  

  if __name__ == '__main__':

      main()

@@ -0,0 +1,41 @@ 

+ #!/usr/bin/env python3

+ # Allows testing the result of translation_finder on a local folder

+ 

+ import argparse

+ import os

+ import pprint

+ 

+ from translation_finder import discover

+ 

+ def main():

+     """Handle params"""

+ 

+     parser = argparse.ArgumentParser(

+         description="Detect translation files")

+     parser.add_argument("--folder", required=True,

+                         help="the folder to analyze")

+     args = parser.parse_args()

+ 

+     folderToScan = args.folder

+ 

+     if os.path.exists(folderToScan):

+         discover_translations(folderToScan)

+ 

+ 

+ def discover_translations(folder):

+     """find po file"""

+     print("discover_translations: "+folder)

+     translation_files = []

+ 

+     try:

+         translation_files = discover(folder)

+     except OSError:

+         print("error while searching for new")

+ 

+ 

+     pp = pprint.PrettyPrinter(indent=2)

+     pp.pprint(translation_files)

+ 

+ 

+ if __name__ == '__main__':

+     main()

file added
+9
@@ -0,0 +1,9 @@ 

+ #!/bin/bash

+ 

+ rm -rf venv

+ virtualenv venv

+ source venv/bin/activate

+ pip install -r ../requirements.txt

+ pip install --upgrade https://github.com/WeblateOrg/language-data/archive/master.zip

+ pip install ruamel.yaml charamel

+ pip install --upgrade https://github.com/WeblateOrg/translation-finder/archive/master.zip

Run compendium generation in a temporary folder and add "smart" language detection.

I would love to get some help to make this faster

Language detection is a little slow (about 10 minutes),
but compendium generation is really slow — maybe replace pocompendium from the "Translate Toolkit" (Python) with msgcat from "gettext" (a native binary)?

https://www.gnu.org/software/gettext/manual/html_node/msgcat-Invocation.html#msgcat-Invocation ?

1 new commit added

  • smarter way to detect languages
3 years ago

1 new commit added

  • optimize translation memory generation
3 years ago

2 new commits added

  • use msgcat instead of pocompendium
  • less text
3 years ago

1 new commit added

  • add debug file for translation finder
3 years ago

Pull-Request has been merged by jibecfed

3 years ago