| |
@@ -6,14 +6,20 @@
|
| |
import json
|
| |
import os
|
| |
import subprocess
|
| |
+ import shutil
|
| |
import tempfile
|
| |
|
| |
+ from io import BytesIO
|
| |
+ from translate.convert import po2tmx
|
| |
+ from translate.storage import factory, po
|
| |
+ from translate.tools import poterminology
|
| |
+
|
| |
|
| |
def main():
|
| |
"""Handle params"""
|
| |
|
| |
parser = argparse.ArgumentParser(
|
| |
- description="Creates compendium for every languages")
|
| |
+ description="Creates useful translator files for every language")
|
| |
|
| |
parser.add_argument("--results", required=True,
|
| |
help="Set the results folder to use")
|
| |
@@ -24,9 +30,6 @@
|
| |
parser.add_argument("--compress", action="store_true",
|
| |
help="Compress output files")
|
| |
|
| |
- parser.add_argument("--lang", required=False, type=str,
|
| |
- help="Filter a language to analyze")
|
| |
-
|
| |
args = parser.parse_args()
|
| |
|
| |
results_folder = "./results/{v}/".format(v=args.results)
|
| |
@@ -34,84 +37,134 @@
|
| |
tm_folder = os.path.join(results_folder, "languages-tm/")
|
| |
os.makedirs(tm_folder, exist_ok=True)
|
| |
|
| |
+ # clean destination folders
|
| |
+ if args.refresh and os.path.isdir(tm_folder):
|
| |
+ shutil.rmtree(tm_folder)
|
| |
+
|
| |
+ if not os.path.exists(tm_folder):
|
| |
+ os.makedirs(tm_folder)
|
| |
+
|
| |
print("Building the translation memory for every languages")
|
| |
|
| |
- if args.lang:
|
| |
- with open(os.path.join(lang_path, args.lang + ".json"), "r") as read_file:
|
| |
+ langs = [f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f))]
|
| |
+
|
| |
+ for lang in sorted(langs):
|
| |
+ lang_code = lang[:-len('.json')]
|
| |
+
|
| |
+ print(" {l}".format(l=lang_code))
|
| |
+
|
| |
+ with open(os.path.join(lang_path, lang), "r") as read_file:
|
| |
files = json.load(read_file)["po"]
|
| |
|
| |
- compute_lang(args.lang, files, tm_folder)
|
| |
- else:
|
| |
- langs = [f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f))]
|
| |
+ compendium_file = os.path.join(tm_folder, lang_code + ".po")
|
| |
+ compendium_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), compendium_file)
|
| |
+ if not os.path.isfile(compendium_file):
|
| |
+ try:
|
| |
+ process_compendium(files, compendium_file)
|
| |
+ except Exception as e:
|
| |
+ print(" Compendium generation triggered an {t} exception: {e}".format(t=type(e).__name__, e=e))
|
| |
|
| |
- for lang in sorted(langs):
|
| |
- with open(os.path.join(lang_path, lang), "r") as read_file:
|
| |
- files = json.load(read_file)["po"]
|
| |
+ tmx_file = os.path.join(tm_folder, lang_code + ".tmx")
|
| |
+ if not os.path.isfile(tmx_file):
|
| |
+ try:
|
| |
+ process_tmx(lang_code, compendium_file, tmx_file)
|
| |
+ except Exception as e:
|
| |
+ print(" TMX generation triggered an {t} exception: {e}".format(t=type(e).__name__, e=e))
|
| |
|
| |
- compute_lang(lang[:-len('.json')], files, tm_folder, args.refresh)
|
| |
+ terminology_file = os.path.join(tm_folder, lang_code + ".terminology.po")
|
| |
+ if not os.path.isfile(terminology_file):
|
| |
+ try:
|
| |
+ process_terminology(compendium_file, terminology_file)
|
| |
+ except Exception as e:
|
| |
+ print(" Terminology generation triggered an {t} exception: {e}".format(t=type(e).__name__, e=e))
|
| |
|
| |
- print("Detecting missing files")
|
| |
- for lang in sorted(langs):
|
| |
- check_lang(lang[:-len('.json')], tm_folder)
|
| |
+ print("Detecting missing files")
|
| |
+ for lang in sorted(langs):
|
| |
+ check_lang(lang[:-len('.json')], tm_folder)
|
| |
|
| |
if args.compress:
|
| |
print("Compressing files")
|
| |
compress(tm_folder)
|
| |
|
| |
|
| |
- def compute_lang(lang, langfiles, tm_folder, refresh):
|
| |
- """ Generate compendium and convert it to tmx """
|
| |
- """ """
|
| |
- print(" Computing: " + lang)
|
| |
-
|
| |
- # po consolidation
|
| |
- compendium_file = os.path.join(tm_folder, lang + ".po")
|
| |
- compendium_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), compendium_file)
|
| |
+ def process_compendium(langfiles, dest):
|
| |
+ """ Generate a compendium (a concatenation of all po files) """
|
| |
|
| |
- if not os.path.isfile(compendium_file) or refresh is True:
|
| |
- pofiles = [os.path.join(os.path.dirname(os.path.abspath(__file__)), f) for f in langfiles]
|
| |
- count = 0
|
| |
+ pofiles = [os.path.join(os.path.dirname(os.path.abspath(__file__)), f) for f in langfiles]
|
| |
+ count = 0
|
| |
|
| |
- with tempfile.TemporaryDirectory(prefix="l10n-tm") as tmp:
|
| |
- for i in pofiles:
|
| |
+ with tempfile.TemporaryDirectory(prefix="l10n-tm") as tmp:
|
| |
+ for i in pofiles:
|
| |
+ try:
|
| |
+ command = ["msguniq", i, "--output-file", count.__str__(), "--no-location"]
|
| |
+ subprocess.run(command, check=True, cwd=tmp, capture_output=True)
|
| |
+ except subprocess.CalledProcessError:
|
| |
try:
|
| |
- command = ["msguniq", i, "--output-file", count.__str__(), "--no-location"]
|
| |
+ command = ["msguniq", i, "--output-file", count.__str__(), "--to-code", "utf-8", "--no-location"]
|
| |
subprocess.run(command, check=True, cwd=tmp, capture_output=True)
|
| |
- except subprocess.CalledProcessError:
|
| |
- try:
|
| |
- command = ["msguniq", i, "--output-file", count.__str__(), "--to-code", "utf-8", "--no-location"]
|
| |
- subprocess.run(command, check=True, cwd=tmp, capture_output=True)
|
| |
- except subprocess.CalledProcessError as e:
|
| |
- print("Error with msguniq {i}, error: {e}".format(i=i, e=e))
|
| |
+ except subprocess.CalledProcessError as e:
|
| |
+ print("Error with msguniq {i}, error: {e}".format(i=i, e=e))
|
| |
|
| |
- count += 1
|
| |
+ count += 1
|
| |
|
| |
- onlyfiles = [f for f in os.listdir(tmp) if os.path.isfile(os.path.join(tmp, f))]
|
| |
- command = ["msgcat", "--force-po", "--no-location", "--output-file", compendium_file] + onlyfiles
|
| |
+ onlyfiles = [f for f in os.listdir(tmp) if os.path.isfile(os.path.join(tmp, f))]
|
| |
+ command = ["msgcat", "--force-po", "--no-location", "--output-file", dest] + onlyfiles
|
| |
|
| |
- try:
|
| |
- subprocess.run(command, check=True, cwd=tmp, capture_output=True)
|
| |
- except subprocess.CalledProcessError:
|
| |
- print(" msgcat exception...")
|
| |
+ try:
|
| |
+ subprocess.run(command, check=True, cwd=tmp, capture_output=True)
|
| |
+ except subprocess.CalledProcessError:
|
| |
+ print(" msgcat exception...")
|
| |
|
| |
- # po to tmx convertion
|
| |
- tmx_file = os.path.join(tm_folder, lang + ".tmx")
|
| |
- command = ["po2tmx", "--language="+lang, "--progress=none",
|
| |
- compendium_file, "--output="+tmx_file]
|
| |
- if not os.path.isfile(tmx_file) or refresh is True:
|
| |
- subprocess.run(command, check=True, capture_output=True)
|
| |
|
| |
- # language terminology
|
| |
- terminology_file = os.path.join(tm_folder, lang + ".terminology.po")
|
| |
- command = ["poterminology", "--ignore-case", "--fold-titlecase",
|
| |
- "--inputs-needed", "1",
|
| |
- "--progress=none", compendium_file, "--output=" + terminology_file]
|
| |
- if not os.path.isfile(terminology_file) or refresh is True:
|
| |
- subprocess.run(command, check=True, capture_output=True)
|
| |
+ def process_tmx(lang, source, dest):
|
| |
+ """ Generate a translation memory from a po file """
|
| |
+
|
| |
+ outputfile = po2tmx.tmxmultifile(dest)
|
| |
+ po2tmx.convertpo(
|
| |
+ inputfile=BytesIO(open(source, "r").read().encode()),
|
| |
+ outputfile=outputfile,
|
| |
+ templatefile=None,
|
| |
+ sourcelanguage="en",
|
| |
+ targetlanguage=lang,
|
| |
+ comment="source"
|
| |
+ )
|
| |
+
|
| |
+ outputfile.tmxfile.savefile(dest)
|
| |
+
|
| |
+
|
| |
+ def process_terminology(source, dest):
|
| |
+ """ Generate a terminology from a po file """
|
| |
+
|
| |
+ extractor = poterminology.TerminologyExtractor()
|
| |
+ options = {
|
| |
+ "inputmin": "1",
|
| |
+ "fullmsgmin": "1",
|
| |
+ "substrmin": "2",
|
| |
+ "locmin": "2",
|
| |
+ "nonstopmin": 1,
|
| |
+ "sortorders": ["frequency", "dictionary", "length"],
|
| |
+ "output": dest,
|
| |
+ }
|
| |
+
|
| |
+ with open(source, "rb") as fh:
|
| |
+ inputfile = factory.getobject(fh)
|
| |
+
|
| |
+ extractor.processunits(inputfile.units, source)
|
| |
+ terms = extractor.extract_terms()
|
| |
+
|
| |
+ termfile = po.pofile()
|
| |
+ termitems = extractor.filter_terms(
|
| |
+ terms, nonstopmin=options["nonstopmin"], sortorders=options["sortorders"]
|
| |
+ )
|
| |
+ for count, unit in termitems:
|
| |
+ termfile.units.append(unit)
|
| |
+
|
| |
+ with open(options["output"], "wb") as fh:
|
| |
+ termfile.serialize(fh)
|
| |
|
| |
|
| |
def check_lang(lang, tm_folder):
|
| |
- """ make sure the files were generated """
|
| |
+ """ Check if expected files were generated """
|
| |
|
| |
compendium_file = os.path.join(tm_folder, lang + ".po")
|
| |
tmx_file = os.path.join(tm_folder, lang + ".tmx")
|
| |
on a few packages, this saves 20% of execution time