| |
@@ -6,14 +6,20 @@
|
| |
import json
|
| |
import os
|
| |
import subprocess
|
| |
+ import shutil
|
| |
import tempfile
|
| |
|
| |
+ from io import BytesIO
|
| |
+ from translate.convert import po2tmx
|
| |
+ from translate.storage import factory, po
|
| |
+ from translate.tools import poterminology
|
| |
+
|
| |
|
| |
def main():
|
| |
"""Handle params"""
|
| |
|
| |
parser = argparse.ArgumentParser(
|
| |
- description="Creates compendium for every languages")
|
| |
+ description="Creates useful translator files for every language")
|
| |
|
| |
parser.add_argument("--results", required=True,
|
| |
help="Set the results folder to use")
|
| |
@@ -24,9 +30,6 @@
|
| |
parser.add_argument("--compress", action="store_true",
|
| |
help="Compress output files")
|
| |
|
| |
- parser.add_argument("--lang", required=False, type=str,
|
| |
- help="Filter a language to analyze")
|
| |
-
|
| |
args = parser.parse_args()
|
| |
|
| |
results_folder = "./results/{v}/".format(v=args.results)
|
| |
@@ -34,84 +37,134 @@
|
| |
tm_folder = os.path.join(results_folder, "languages-tm/")
|
| |
os.makedirs(tm_folder, exist_ok=True)
|
| |
|
| |
+ # clean destination folders
|
| |
+ if args.refresh and os.path.isdir(tm_folder):
|
| |
+ shutil.rmtree(tm_folder)
|
| |
+
|
| |
+ if not os.path.exists(tm_folder):
|
| |
+ os.makedirs(tm_folder)
|
| |
+
|
| |
print("Building the translation memory for every languages")
|
| |
|
| |
- if args.lang:
|
| |
- with open(os.path.join(lang_path, args.lang + ".json"), "r") as read_file:
|
| |
+ langs = [f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f))]
|
| |
+
|
| |
+ for lang in sorted(langs):
|
| |
+ lang_code = lang[:-len('.json')]
|
| |
+
|
| |
+ print(" {l}".format(l=lang_code))
|
| |
+
|
| |
+ with open(os.path.join(lang_path, lang), "r") as read_file:
|
| |
files = json.load(read_file)["po"]
|
| |
|
| |
- compute_lang(args.lang, files, tm_folder)
|
| |
- else:
|
| |
- langs = [f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f))]
|
| |
+ compendium_file = os.path.join(tm_folder, lang_code + ".po")
|
| |
+ compendium_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), compendium_file)
|
| |
+ if not os.path.isfile(compendium_file):
|
| |
+ try:
|
| |
+ process_compendium(files, compendium_file)
|
| |
+ except Exception as e:
|
| |
+ print(" Compendium generation triggered an {t} exception: {e}".format(t=type(e).__name__, e=e))
|
| |
|
| |
- for lang in sorted(langs):
|
| |
- with open(os.path.join(lang_path, lang), "r") as read_file:
|
| |
- files = json.load(read_file)["po"]
|
| |
+ tmx_file = os.path.join(tm_folder, lang_code + ".tmx")
|
| |
+ if not os.path.isfile(tmx_file):
|
| |
+ try:
|
| |
+ process_tmx(lang_code, compendium_file, tmx_file)
|
| |
+ except Exception as e:
|
| |
+ print(" TMX generation triggered an {t} exception: {e}".format(t=type(e).__name__, e=e))
|
| |
|
| |
- compute_lang(lang[:-len('.json')], files, tm_folder, args.refresh)
|
| |
+ terminology_file = os.path.join(tm_folder, lang_code + ".terminology.po")
|
| |
+ if not os.path.isfile(terminology_file):
|
| |
+ try:
|
| |
+ process_terminology(compendium_file, terminology_file)
|
| |
+ except Exception as e:
|
| |
+ print(" Terminology generation triggered an {t} exception: {e}".format(t=type(e).__name__, e=e))
|
| |
|
| |
- print("Detecting missing files")
|
| |
- for lang in sorted(langs):
|
| |
- check_lang(lang[:-len('.json')], tm_folder)
|
| |
+ print("Detecting missing files")
|
| |
+ for lang in sorted(langs):
|
| |
+ check_lang(lang[:-len('.json')], tm_folder)
|
| |
|
| |
if args.compress:
|
| |
print("Compressing files")
|
| |
compress(tm_folder)
|
| |
|
| |
|
| |
- def compute_lang(lang, langfiles, tm_folder, refresh):
|
| |
- """ Generate compendium and convert it to tmx """
|
| |
- """ """
|
| |
- print(" Computing: " + lang)
|
| |
-
|
| |
- # po consolidation
|
| |
- compendium_file = os.path.join(tm_folder, lang + ".po")
|
| |
- compendium_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), compendium_file)
|
| |
+ def process_compendium(langfiles, dest):
|
| |
+ """ Generate a compendium (a concatenation of all po files) """
|
| |
|
| |
- if not os.path.isfile(compendium_file) or refresh is True:
|
| |
- pofiles = [os.path.join(os.path.dirname(os.path.abspath(__file__)), f) for f in langfiles]
|
| |
- count = 0
|
| |
+ pofiles = [os.path.join(os.path.dirname(os.path.abspath(__file__)), f) for f in langfiles]
|
| |
+ count = 0
|
| |
|
| |
- with tempfile.TemporaryDirectory(prefix="l10n-tm") as tmp:
|
| |
- for i in pofiles:
|
| |
+ with tempfile.TemporaryDirectory(prefix="l10n-tm") as tmp:
|
| |
+ for i in pofiles:
|
| |
+ try:
|
| |
+ command = ["msguniq", i, "--output-file", count.__str__(), "--no-location"]
|
| |
+ subprocess.run(command, check=True, cwd=tmp, capture_output=True)
|
| |
+ except subprocess.CalledProcessError:
|
| |
try:
|
| |
- command = ["msguniq", i, "--output-file", count.__str__(), "--no-location"]
|
| |
+ command = ["msguniq", i, "--output-file", count.__str__(), "--to-code", "utf-8", "--no-location"]
|
| |
subprocess.run(command, check=True, cwd=tmp, capture_output=True)
|
| |
- except subprocess.CalledProcessError:
|
| |
- try:
|
| |
- command = ["msguniq", i, "--output-file", count.__str__(), "--to-code", "utf-8", "--no-location"]
|
| |
- subprocess.run(command, check=True, cwd=tmp, capture_output=True)
|
| |
- except subprocess.CalledProcessError as e:
|
| |
- print("Error with msguniq {i}, error: {e}".format(i=i, e=e))
|
| |
+ except subprocess.CalledProcessError as e:
|
| |
+ print("Error with msguniq {i}, error: {e}".format(i=i, e=e))
|
| |
|
| |
- count += 1
|
| |
+ count += 1
|
| |
|
| |
- onlyfiles = [f for f in os.listdir(tmp) if os.path.isfile(os.path.join(tmp, f))]
|
| |
- command = ["msgcat", "--force-po", "--no-location", "--output-file", compendium_file] + onlyfiles
|
| |
+ onlyfiles = [f for f in os.listdir(tmp) if os.path.isfile(os.path.join(tmp, f))]
|
| |
+ command = ["msgcat", "--force-po", "--no-location", "--output-file", dest] + onlyfiles
|
| |
|
| |
- try:
|
| |
- subprocess.run(command, check=True, cwd=tmp, capture_output=True)
|
| |
- except subprocess.CalledProcessError:
|
| |
- print(" msgcat exception...")
|
| |
+ try:
|
| |
+ subprocess.run(command, check=True, cwd=tmp, capture_output=True)
|
| |
+ except subprocess.CalledProcessError:
|
| |
+ print(" msgcat exception...")
|
| |
|
| |
- # po to tmx convertion
|
| |
- tmx_file = os.path.join(tm_folder, lang + ".tmx")
|
| |
- command = ["po2tmx", "--language="+lang, "--progress=none",
|
| |
- compendium_file, "--output="+tmx_file]
|
| |
- if not os.path.isfile(tmx_file) or refresh is True:
|
| |
- subprocess.run(command, check=True, capture_output=True)
|
| |
|
| |
- # language terminology
|
| |
- terminology_file = os.path.join(tm_folder, lang + ".terminology.po")
|
| |
- command = ["poterminology", "--ignore-case", "--fold-titlecase",
|
| |
- "--inputs-needed", "1",
|
| |
- "--progress=none", compendium_file, "--output=" + terminology_file]
|
| |
- if not os.path.isfile(terminology_file) or refresh is True:
|
| |
- subprocess.run(command, check=True, capture_output=True)
|
| |
+ def process_tmx(lang, source, dest):
|
| |
+ """ Generate a translation memory from a po file """
|
| |
+
|
| |
+ outputfile = po2tmx.tmxmultifile(dest)
|
| |
+ po2tmx.convertpo(
|
| |
+ inputfile=BytesIO(open(source, "r").read().encode()),
|
| |
+ outputfile=outputfile,
|
| |
+ templatefile=None,
|
| |
+ sourcelanguage="en",
|
| |
+ targetlanguage=lang,
|
| |
+ comment="source"
|
| |
+ )
|
| |
+
|
| |
+ outputfile.tmxfile.savefile(dest)
|
| |
+
|
| |
+
|
| |
+ def process_terminology(source, dest):
|
| |
+ """ Generate a terminology from a po file """
|
| |
+
|
| |
+ extractor = poterminology.TerminologyExtractor()
|
| |
+ options = {
|
| |
+ "inputmin": "1",
|
| |
+ "fullmsgmin": "1",
|
| |
+ "substrmin": "2",
|
| |
+ "locmin": "2",
|
| |
+ "nonstopmin": 1,
|
| |
+ "sortorders": ["frequency", "dictionary", "length"],
|
| |
+ "output": dest,
|
| |
+ }
|
| |
+
|
| |
+ with open(source, "rb") as fh:
|
| |
+ inputfile = factory.getobject(fh)
|
| |
+
|
| |
+ extractor.processunits(inputfile.units, source)
|
| |
+ terms = extractor.extract_terms()
|
| |
+
|
| |
+ termfile = po.pofile()
|
| |
+ termitems = extractor.filter_terms(
|
| |
+ terms, nonstopmin=options["nonstopmin"], sortorders=options["sortorders"]
|
| |
+ )
|
| |
+ for count, unit in termitems:
|
| |
+ termfile.units.append(unit)
|
| |
+
|
| |
+ with open(options["output"], "wb") as fh:
|
| |
+ termfile.serialize(fh)
|
| |
|
| |
|
| |
def check_lang(lang, tm_folder):
|
| |
- """ make sure the files were generated """
|
| |
+ """ Check if expected files were generated """
|
| |
|
| |
compendium_file = os.path.join(tm_folder, lang + ".po")
|
| |
tmx_file = os.path.join(tm_folder, lang + ".tmx")
|
| |
on a few packages, this saves 20% of execution time