| |
@@ -10,11 +10,6 @@
|
| |
import tempfile
|
| |
import logging
|
| |
|
| |
- from io import BytesIO
|
| |
- from translate.convert import po2tmx
|
| |
- from translate.storage import factory, po
|
| |
- from translate.tools import poterminology
|
| |
-
|
| |
|
| |
def main():
|
| |
"""Handle params"""
|
| |
@@ -41,7 +36,7 @@
|
| |
|
| |
loglevel = logging.INFO
|
| |
if args.verbose:
|
| |
- loglevel = logging.DEBUG
|
| |
+ loglevel = logging.DEBUG
|
| |
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=loglevel)
|
| |
log = logging.getLogger("buildTm")
|
| |
|
| |
@@ -69,7 +64,7 @@
|
| |
for lang in sorted(langs):
|
| |
lang_code = lang[: -len(".json")]
|
| |
|
| |
- log.info(" {l}".format(l=lang_code))
|
| |
+ log.info("Processing {l}".format(l=lang_code))
|
| |
|
| |
with open(os.path.join(lang_path, lang), "r") as read_file:
|
| |
files = json.load(read_file)["po"]
|
| |
@@ -78,18 +73,19 @@
|
| |
compendium_file = os.path.join(
|
| |
os.path.dirname(os.path.abspath(__file__)), compendium_file
|
| |
)
|
| |
- if not os.path.isfile(compendium_file):
|
| |
- try:
|
| |
- process_compendium(files, compendium_file, debug_folder)
|
| |
- except Exception as e:
|
| |
- log.error(
|
| |
- " Compendium generation triggered an {t} exception: {e}".format(
|
| |
- t=type(e).__name__, e=e
|
| |
- )
|
| |
- )
|
| |
+ compendium_archive = compendium_file + ".gz"
|
| |
+ if os.path.isfile(compendium_file) is False and os.path.isfile(compendium_archive) is False:
|
| |
+ log.info("Compendium generation")
|
| |
+ process_compendium(files, compendium_file, debug_folder)
|
| |
+ # remove non standard comments
|
| |
+ # taken from: https://github.com/translate/translate/blob/master/tools/pocommentclean
|
| |
+ command = ["sed", "-i", "/^#$/d;/^#[^\:\~,\.]/d", compendium_file]
|
| |
+ subprocess.run(command, check=True, capture_output=True)
|
| |
|
| |
tmx_file = os.path.join(tm_folder, lang_code + ".tmx")
|
| |
- if not os.path.isfile(tmx_file):
|
| |
+ tmx_archive = tmx_file + ".gz"
|
| |
+ if os.path.isfile(tmx_file) is False and os.path.isfile(tmx_archive) is False:
|
| |
+ log.info("TMX generation")
|
| |
try:
|
| |
process_tmx(lang_code, compendium_file, tmx_file)
|
| |
except Exception as e:
|
| |
@@ -100,7 +96,9 @@
|
| |
)
|
| |
|
| |
terminology_file = os.path.join(tm_folder, lang_code + ".terminology.po")
|
| |
- if not os.path.isfile(terminology_file):
|
| |
+ terminology_archive = terminology_file + ".gz"
|
| |
+ if os.path.isfile(terminology_file) is False and os.path.isfile(terminology_archive) is False:
|
| |
+ log.info("Terminology generation")
|
| |
try:
|
| |
process_terminology(compendium_file, terminology_file)
|
| |
except Exception as e:
|
| |
@@ -110,13 +108,21 @@
|
| |
)
|
| |
)
|
| |
|
| |
+ if args.compress:
|
| |
+ if os.path.isfile(compendium_file):
|
| |
+ compress(compendium_file, compendium_archive)
|
| |
+
|
| |
+ if os.path.isfile(tmx_file):
|
| |
+ compress(tmx_file, tmx_archive)
|
| |
+
|
| |
+ if os.path.isfile(terminology_file):
|
| |
+ compress(terminology_file, terminology_archive)
|
| |
+
|
| |
+ log.info("All languages are processed")
|
| |
+
|
| |
log.info("Detecting missing files")
|
| |
for lang in sorted(langs):
|
| |
- check_lang(lang[: -len(".json")], tm_folder)
|
| |
-
|
| |
- if args.compress:
|
| |
- log.info("Compressing files")
|
| |
- compress(tm_folder)
|
| |
+ check_lang(lang[: -len(".json")], tm_folder, args.compress)
|
| |
|
| |
|
| |
def process_compendium(langfiles, dest, debug_folder):
|
| |
@@ -129,6 +135,7 @@
|
| |
count = 0
|
| |
|
| |
with tempfile.TemporaryDirectory(prefix="l10n-tm") as tmp:
|
| |
+
|
| |
for i in pofiles:
|
| |
try:
|
| |
command = [
|
| |
@@ -153,35 +160,24 @@
|
| |
subprocess.run(command, check=True, cwd=tmp, capture_output=True)
|
| |
except subprocess.CalledProcessError as e:
|
| |
debug_filename = "tm-msguniq-{lang}-{name}".format(lang=dest.split("/")[-1], name=count.__str__())
|
| |
- log.error(" msguniq error with {i} a copy of this file is into {d} as {n}".format(i=i, e=e.output, d=debug_folder, n=debug_filename))
|
| |
+ log.error(" msguniq error with {i} a copy of this file is into {d} as {n}".format(i=i, e=e.output,
|
| |
+ d=debug_folder,
|
| |
+ n=debug_filename))
|
| |
shutil.copyfile(i, os.path.join(debug_folder, debug_filename))
|
| |
|
| |
count += 1
|
| |
|
| |
- # search every file that were successful
|
| |
- search_guilty_file(tmp, dest, debug_folder)
|
| |
-
|
| |
+ all_files = [f for f in os.listdir(tmp) if os.path.isfile(os.path.join(tmp, f))]
|
| |
+ if len(all_files) == 1:
|
| |
+ shutil.copyfile(os.path.join(tmp, all_files[0]), dest)
|
| |
+ else:
|
| |
+ msgcat_recursive(dest, tmp, debug_folder, all_files, list(), list())
|
| |
|
| |
- def search_guilty_file(path, dest, debug_folder):
|
| |
- log = logging.getLogger("buildTm.process_compendium.guilty")
|
| |
- all_files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
|
| |
|
| |
- try_msgcat(all_files, dest, path)
|
| |
-
|
| |
- guilty_file = None
|
| |
- while os.path.isfile(dest) is False:
|
| |
- guilty_file = all_files.pop()
|
| |
- try_msgcat(all_files, dest, path)
|
| |
-
|
| |
- if guilty_file is not None:
|
| |
- debug_filename = "tm-msgcat-{lang}-{name}".format(lang=dest.split("/")[-1], name=guilty_file)
|
| |
- log.error("the file {f} raised error with msgcat, a copy of this file is into {d} as {n}".format(f=guilty_file, d=debug_folder, n=debug_filename))
|
| |
- shutil.move(os.path.join(path, guilty_file), os.path.join(debug_folder, debug_filename))
|
| |
- os.remove(dest)
|
| |
- search_guilty_file(path, dest, debug_folder)
|
| |
-
|
| |
-
|
| |
- def try_msgcat(files, destination, cwd):
|
| |
+ def msgcat(files, destination, path, doubt=False):
|
| |
+ """ Call the msgcat command on a list of po files
|
| |
+ Only print output if a bug is suspected """
|
| |
+ log = logging.getLogger("buildTm.msgcat")
|
| |
command = [
|
| |
"msgcat",
|
| |
"--force-po",
|
| |
@@ -191,60 +187,87 @@
|
| |
] + files
|
| |
|
| |
try:
|
| |
- subprocess.run(command, check=True, cwd=cwd, capture_output=True)
|
| |
- except subprocess.CalledProcessError:
|
| |
+ subprocess.run(command, check=True, cwd=path, capture_output=True)
|
| |
+ except subprocess.CalledProcessError as e:
|
| |
# msgcat often raise exception but continues its processing
|
| |
+ if doubt is not False:
|
| |
+ log.error("Error with file {d}: {e}".format(d=doubt, e=e.stderr.decode('utf8')))
|
| |
pass
|
| |
|
| |
|
| |
def store_debug_file(path, name, file, debug_folder):
    """Move a temporary file into the debug folder.

    The file ``file`` found in ``path`` is moved to ``debug_folder`` and
    renamed ``"{name}-{file}"``.

    Args:
        path: directory that currently holds the file.
        name: prefix used to build the name of the debug copy.
        file: base name of the file to move.
        debug_folder: directory receiving the file.

    Raises:
        OSError: propagated from ``shutil.move`` (for instance when the
            source file does not exist).
    """
    log = logging.getLogger("buildTm.store_debug_file")
    target = os.path.join(debug_folder, "{n}-{f}".format(n=name, f=file))
    # Move first, report afterwards: the message states that the move
    # happened, so it must not be emitted when shutil.move raises.
    shutil.move(os.path.join(path, file), target)
    log.error("The file {f} were moved into {t}".format(f=file, t=target))
|
| |
+
|
| |
+
|
| |
def msgcat_recursive(destination, path, debug_folder, backlog, ongoing, ok):
    """Merge po files with msgcat, bisecting the input to isolate bad files.

    The files in ``ongoing`` (or the whole ``backlog`` when ``ongoing`` is
    empty) are passed to ``msgcat``. Success is detected by the presence of
    ``destination`` afterwards. On failure the batch is split in two and the
    function recurses; a file that fails on its own is moved to
    ``debug_folder`` with ``store_debug_file``. Once nothing is left to try,
    every file known to be good is merged into ``destination``.

    Args:
        destination: path of the merged po file to produce.
        path: working directory holding the input files.
        debug_folder: directory receiving the files that break msgcat.
        backlog: file names still waiting to be tried.
        ongoing: file names to try in this call (may be empty).
        ok: file names already known to be accepted by msgcat.
    """
    log = logging.getLogger("buildTm.msgcat_recursive")
    doubt = False
    log.debug("backlog={b}, ongoing={o}, ok={ok}".format(b=len(backlog), o=len(ongoing), ok=len(ok)))
    if len(ongoing) == 0:
        ongoing = backlog.copy()
        backlog = []

    # A lone file is the suspect; the original code never runs msgcat on a
    # single file and pairs it with a known-good one.
    if len(ongoing) == 1:
        doubt = ongoing[0]
        if ok:
            ongoing = ongoing + [ok[0]]
        # NOTE(review): with no known-good file yet, ok[0] used to raise
        # IndexError. The suspect is now tried alone -- confirm that the
        # msgcat invocation accepts a single input file.

    msgcat(ongoing, destination, path, doubt)

    if os.path.isfile(destination) is True:
        processed = len(ongoing)
        # The known-good file added above may already be in 'ok'. Deduplicate
        # while keeping insertion order so the final msgcat call receives the
        # files in a reproducible order (a set would shuffle them per run).
        ok = list(dict.fromkeys(ok + ongoing))
        ongoing = []
        if len(ok) == processed and len(backlog) == 0:
            log.debug("First generation worked")
        else:
            # Intermediate result only: the final merge is redone below.
            os.remove(destination)
    else:
        if doubt is not False:
            log.debug("This file raised a msgcat bug: {f}".format(f=doubt))
            store_debug_file(path, "tm-msgcat-" + destination.split("/")[-1], doubt, debug_folder)
            ongoing = []

        # Keep the first half for the next attempt, queue the second half.
        half = int(len(ongoing) / 2)
        backlog += ongoing[half:]
        ongoing = ongoing[:half]

    if len(backlog) + len(ongoing) > 0:
        msgcat_recursive(destination, path, debug_folder, backlog, ongoing, ok)
    else:
        if os.path.isfile(destination) is False:
            log.debug("Generating remaining files")
            msgcat(ok, destination, path)

        if os.path.isfile(destination) is False:
            log.error("weird, some files raising bugs were missed?")
            # Retrying with an empty list can never succeed and used to
            # recurse until RecursionError (empty input or only bad files).
            if ok:
                msgcat_recursive(destination, path, debug_folder, ok, list(), list())
|
| |
+
|
| |
+
|
| |
def process_tmx(lang, source, dest):
    """Build the TMX translation memory ``dest`` from the po file ``source``.

    The conversion is delegated to the external ``po2tmx`` command, called
    with ``lang`` as its ``--language`` value. Its output is captured, and a
    non-zero exit status raises ``subprocess.CalledProcessError``.
    """
    po2tmx_args = ["--language=" + lang, "--progress=none", source, "--output=" + dest]
    subprocess.run(["po2tmx"] + po2tmx_args, capture_output=True, check=True)
|
| |
|
| |
|
| |
def process_terminology(source, dest):
    """Extract the terminology po file ``dest`` from the po file ``source``.

    The extraction is delegated to the external ``poterminology`` command.
    Its output is captured, and a non-zero exit status raises
    ``subprocess.CalledProcessError``.
    """
    options = [
        "--ignore-case",
        "--fold-titlecase",
        "--inputs-needed",
        "1",
        "--progress=none",
    ]
    invocation = ["poterminology", *options, source, "--output=" + dest]
    subprocess.run(invocation, capture_output=True, check=True)
|
| |
|
| |
|
| |
- def check_lang(lang, tm_folder):
|
| |
+ def check_lang(lang, tm_folder, compress):
|
| |
""" Check if expected files were generated """
|
| |
log = logging.getLogger("buildTm.check_lang")
|
| |
|
| |
@@ -252,33 +275,31 @@
|
| |
tmx_file = os.path.join(tm_folder, lang + ".tmx")
|
| |
terminology_file = os.path.join(tm_folder, lang + ".terminology.po")
|
| |
|
| |
- if not os.path.isfile(compendium_file):
|
| |
- log.warning(" {l}-compendium is missing".format(l=lang))
|
| |
+ if compress is True:
|
| |
+ compendium_file += ".gz"
|
| |
+ tmx_file += ".gz"
|
| |
+ terminology_file += ".gz"
|
| |
|
| |
- if not os.path.isfile(tmx_file):
|
| |
- log.warning(" {l}-tmx is missing".format(l=lang))
|
| |
+ if os.path.isfile(compendium_file) is False:
|
| |
+ log.warning("{l}-compendium is missing".format(l=lang))
|
| |
|
| |
- if not os.path.isfile(terminology_file):
|
| |
- log.warning(" {l}-terminology is missing".format(l=lang))
|
| |
+ if os.path.isfile(tmx_file) is False:
|
| |
+ log.warning("{l}-tmx is missing".format(l=lang))
|
| |
|
| |
+ if os.path.isfile(terminology_file) is False:
|
| |
+ log.warning("{l}-terminology is missing".format(l=lang))
|
| |
|
| |
- def compress(folder):
|
| |
+
|
| |
def compress(source, archive):
    """Gzip ``source`` into ``archive``, then delete ``source``.

    Args:
        source: path of the file to compress.
        archive: path of the gzip file to create (overwritten if present).

    Raises:
        OSError: when reading ``source`` or writing ``archive`` fails. In
            that case ``source`` is kept, and an archive left incomplete by
            the failed write is removed.
    """
    log = logging.getLogger("buildTm.compress")

    log.info("Compressing")
    with open(source, "rb") as file_in:
        try:
            with gzip.open(archive, "wb") as file_out:
                # Fixed-size chunks: iterating a binary file "by line" would
                # load a file without newline bytes entirely into memory.
                shutil.copyfileobj(file_in, file_out)
        except BaseException:
            # A truncated archive must not survive: an existing archive would
            # otherwise look like a finished one on the next run.
            if os.path.isfile(archive):
                os.remove(archive)
            raise

    os.remove(source)
|
| |
|
| |
|
| |
if __name__ == "__main__":
|
| |