#27 improve error management and reduce disk space
Merged 3 years ago by darknao. Opened 3 years ago by jibecfed.

file modified
+19 -13
@@ -94,11 +94,12 @@ 

          with open(result_file, "w") as f:

              f.write(json.dumps(analyze, indent=2))

  

-     elif args.refresh:

-         log.info("Refreshing the list of languages")

-         rmtree(lang_folder, ignore_errors=True)

-         os.mkdir(lang_folder)

+     if args.refresh and os.path.isdir(lang_folder):

+         rmtree(lang_folder)

  

+     if os.path.exists(lang_folder) is False:

+         log.info("Detecting the list of languages")

+         os.makedirs(lang_folder)

          po_langs = detect_languages(package_folder, results_folder)

  

          for lang in po_langs.keys():
@@ -120,12 +121,14 @@ 

          metadata = dict()

          try:

              metadata = polib.pofile(file).metadata

-         except UnicodeDecodeError:

-             # encoding error, to investigate before using it in TM

-             metadata["Language"] = "error-unicode"

          except OSError:

              # maybe a polib bug? to investigate before using it in TM

              metadata["Language"] = "error-os"

+         except TypeError:
+             # polib bug on malformed metadata; to investigate before using it in TM
+             metadata["Language"] = "error-type"

+         except UnicodeDecodeError:

+             # encoding error, to investigate before using it in TM

+             metadata["Language"] = "error-unicode"

  

          if "Language" not in metadata.keys():

              metadata["Language"] = "zzz_null"
@@ -154,9 +157,7 @@ 

  

          results[metadata.get("Language")] = language

  

-     results = dict(sorted(results.items(), key=lambda item: item[0]))

- 

-     return results

+     return dict(sorted(results.items(), key=lambda item: item[0]))

  

  

  def describe(lang_folder):
@@ -189,8 +190,11 @@ 

  

      log_file = os.path.join(results_folder, "build_language_list.log")

      file_object = open(log_file, "w")

- 

+     count = 0

+     total = len(packages)

      for package in packages:

+         count += 1

+         log.debug("{c}/{t}".format(c=count, t=total))

          discovery_file = os.path.join(package_folder, package, "discover.json")

  

          with open(discovery_file, "r") as read_file:
@@ -212,13 +216,15 @@ 

                  except UnicodeDecodeError:

                      # encoding error, to investigate before using it in TM

                      error = "error-unicode"

+                 except TypeError:

+                     error = "error-type"

                  except OSError:

                      # maybe a polib bug? to investigate before using it in TM

                      error = "error-os"

  

                  lang, decision = choose_lang(lang_code, metadata, error)

  

-                 log = ",".join(

+                 debug = ",".join(

                      [

                          po,

                          lang_code,
@@ -228,7 +234,7 @@ 

                          str(decision),

                      ]

                  )

-                 file_object.write(log + "\n")

+                 file_object.write(debug + "\n")

  

                  lang_result = langs.get(lang, dict())

                  po_results = lang_result.get("po", list())

file modified
+30 -19
@@ -6,6 +6,8 @@ 

  import json

  import os

  import shutil

+ import subprocess

+ 

  import polib

  import logging

  
@@ -71,30 +73,31 @@ 

      for package in sorted(packages):

          count += 1

          log.info(" {c}/{t} - {p}".format(c=count, t=len(packages), p=package))

-         with open(os.path.join(packages_folder, package, "discover.json"), "r") as f:

-             discoveries = json.load(f)

  

          src_folder = os.path.join(packages_folder, package)

          stats_file = os.path.join(packages_stats_folder, package + ".json")

  

-         if os.path.isfile(stats_file):

-             continue

+         if os.path.isfile(stats_file) is False:

+             with open(os.path.join(packages_folder, package, "discover.json"), "r") as f:

+                 discoveries = json.load(f)

  

-         results = dict()

-         for discover in discoveries:

-             files = glob.glob(os.path.join(src_folder, discover["filemask"]))

+             results = dict()

+             for discover in discoveries:

+                 files = glob.glob(os.path.join(src_folder, discover["filemask"]))

  

-             if discover["file_format"] == "po":

-                 results[discover["filemask"]] = get_po_translation_level(

-                     files, stats_file

-                 )

+                 if discover["file_format"] == "po":

+                     results[discover["filemask"]] = get_po_translation_level(

+                         files, stats_file

+                     )

  

-         if len(results) > 0:

-             distribution_stats = extract_release_stats(distribution_stats, results)

+             if len(results) > 0:

+                 with open(stats_file, "w") as f:

+                     json.dump(results, f, indent=2)

+         else:

+             with open(stats_file, "r") as f:

+                 results = json.load(f)

  

-         if len(results) > 0:

-             with open(stats_file, "w") as f:

-                 json.dump(results, f, indent=2)

+         distribution_stats = extract_release_stats(distribution_stats, results)

  

      log.info("Storing distribution stats")

      if not os.path.exists(distribution_stats_folder):
@@ -137,6 +140,11 @@ 

      stats = dict()

  

      for file in files:

+         # remove non standard comments

+         # taken from: https://github.com/translate/translate/blob/master/tools/pocommentclean

+         command = ["sed", "-i", "/^#$/d;/^#[^\:\~,\.]/d", file]

+         subprocess.run(command, check=True, capture_output=True)

+ 

          try:

              stat = calcstats(file)

          except Exception as e:
@@ -168,12 +176,15 @@ 

      metadata = dict()

      try:

          metadata = polib.pofile(file).metadata

-     except UnicodeDecodeError:

-         # encoding error, to investigate before using it in TM

-         metadata["Language"] = "error-unicode"

      except OSError:

          # maybe a polib bug? to investigate before using it in TM

          metadata["Language"] = "error-os"

+     except UnicodeDecodeError:

+         # encoding error, to investigate before using it in TM

+         metadata["Language"] = "error-unicode"

+     except TypeError:
+         # TypeError: '>' not supported between instances of 'str' and 'int'
+         metadata["Language"] = "error-type"

  

      team = "Unknown..."

      try:

file modified
+47 -79
@@ -10,11 +10,6 @@ 

  import tempfile

  import logging

  

- from io import BytesIO

- from translate.convert import po2tmx

- from translate.storage import factory, po

- from translate.tools import poterminology

- 

  

  def main():

      """Handle params"""
@@ -57,7 +52,7 @@ 

      if args.refresh and os.path.isdir(tm_folder):

          shutil.rmtree(tm_folder)

  

-     if not os.path.exists(tm_folder):

+     if os.path.exists(tm_folder) is False:

          os.makedirs(tm_folder)

  

      log.info("Building the translation memory for every languages")
@@ -78,11 +73,19 @@ 

          compendium_file = os.path.join(

              os.path.dirname(os.path.abspath(__file__)), compendium_file

          )

-         if not os.path.isfile(compendium_file):

+         compendium_archive = compendium_file + ".gz"

+         if os.path.isfile(compendium_file) is False and os.path.isfile(compendium_archive) is False:

+             log.info("Compendium generation")

              process_compendium(files, compendium_file, debug_folder)

+             # remove non standard comments

+             # taken from: https://github.com/translate/translate/blob/master/tools/pocommentclean

+             command = ["sed", "-i", "/^#$/d;/^#[^\:\~,\.]/d", compendium_file]

+             subprocess.run(command, check=True, capture_output=True)

  

          tmx_file = os.path.join(tm_folder, lang_code + ".tmx")

-         if not os.path.isfile(tmx_file):

+         tmx_archive = tmx_file + ".gz"

+         if os.path.isfile(tmx_file) is False and os.path.isfile(tmx_archive) is False:

+             log.info("TMX generation")

              try:

                  process_tmx(lang_code, compendium_file, tmx_file)

              except Exception as e:
@@ -93,7 +96,9 @@ 

                  )

  

          terminology_file = os.path.join(tm_folder, lang_code + ".terminology.po")

-         if not os.path.isfile(terminology_file):

+         terminology_archive = terminology_file + ".gz"

+         if os.path.isfile(terminology_file) is False and os.path.isfile(terminology_archive) is False:

+             log.info("Terminology generation")

              try:

                  process_terminology(compendium_file, terminology_file)

              except Exception as e:
@@ -103,15 +108,21 @@ 

                      )

                  )

  

+         if args.compress:

+             if os.path.isfile(compendium_file):

+                 compress(compendium_file, compendium_archive)

+ 

+             if os.path.isfile(tmx_file):

+                 compress(tmx_file, tmx_archive)

+ 

+             if os.path.isfile(terminology_file):

+                 compress(terminology_file, terminology_archive)

+ 

      log.info("All languages are processed")

  

      log.info("Detecting missing files")

      for lang in sorted(langs):

-         check_lang(lang[: -len(".json")], tm_folder)

- 

-     if args.compress:

-         log.info("Compressing files")

-         compress(tm_folder)

+         check_lang(lang[: -len(".json")], tm_folder, args.compress)

  

  

  def process_compendium(langfiles, dest, debug_folder):
@@ -149,7 +160,9 @@ 

                      subprocess.run(command, check=True, cwd=tmp, capture_output=True)

                  except subprocess.CalledProcessError as e:

                      debug_filename = "tm-msguniq-{lang}-{name}".format(lang=dest.split("/")[-1], name=count.__str__())

-                     log.error(" msguniq error with {i} a copy of this file is into {d} as {n}".format(i=i, e=e.output, d=debug_folder, n=debug_filename))

+                     log.error(" msguniq error with {i} a copy of this file is into {d} as {n}".format(i=i, e=e.output,

+                                                                                                       d=debug_folder,

+                                                                                                       n=debug_filename))

                      shutil.copyfile(i, os.path.join(debug_folder, debug_filename))

  

              count += 1
@@ -219,7 +232,7 @@ 

      else:

          if doubt is not False:

              log.debug("This file raised a msgcat bug: {f}".format(f=doubt))

-             store_debug_file(path, "tm-msgcat-"+destination.split("/")[-1], doubt, debug_folder)

+             store_debug_file(path, "tm-msgcat-" + destination.split("/")[-1], doubt, debug_folder)

              ongoing = []

  

          half = int(len(ongoing) / 2)
@@ -241,63 +254,20 @@ 

  def process_tmx(lang, source, dest):

      """ Generate a translation memory from a po file """

  

-     """

-     outputfile = po2tmx.tmxmultifile(dest)

-     po2tmx.convertpo(

-         inputfile=BytesIO(open(source, "r").read().encode()),

-         outputfile=outputfile,

-         templatefile=None,

-         sourcelanguage="en",

-         targetlanguage=lang,

-         comment="source",

-     )

- 

-     outputfile.tmxfile.savefile(dest)

-     """

- 

      command = ["po2tmx", "--language=" + lang, "--progress=none", source, "--output=" + dest]

      subprocess.run(command, check=True, capture_output=True)

  

+ 

  def process_terminology(source, dest):

      """ Generate a termonology from a po file """

  

- 

-     """

-     extractor = poterminology.TerminologyExtractor()

-     options = {

-         "inputmin": "1",

-         "fullmsgmin": "1",

-         "substrmin": "2",

-         "locmin": "2",

-         "nonstopmin": 1,

-         "sortorders": ["frequency", "dictionary", "length"],

-         "output": dest,

-     }

- 

-     with open(source, "rb") as fh:

-         inputfile = factory.getobject(fh)

- 

-     extractor.processunits(inputfile.units, source)

-     terms = extractor.extract_terms()

- 

-     termfile = po.pofile()

-     termitems = extractor.filter_terms(

-         terms, nonstopmin=options["nonstopmin"], sortorders=options["sortorders"]

-     )

-     for count, unit in termitems:

-         termfile.units.append(unit)

- 

-     with open(options["output"], "wb") as fh:

-         termfile.serialize(fh)

-     

-     """

      command = ["poterminology", "--ignore-case", "--fold-titlecase",

-                                 "--inputs-needed", "1",

-                                 "--progress=none", source, "--output=" + dest]

+                "--inputs-needed", "1",

+                "--progress=none", source, "--output=" + dest]

      subprocess.run(command, check=True, capture_output=True)

  

  

- def check_lang(lang, tm_folder):

+ def check_lang(lang, tm_folder, compress):

      """ Check if expected files were generated """

      log = logging.getLogger("buildTm.check_lang")

  
@@ -305,33 +275,31 @@ 

      tmx_file = os.path.join(tm_folder, lang + ".tmx")

      terminology_file = os.path.join(tm_folder, lang + ".terminology.po")

  

-     if not os.path.isfile(compendium_file):

+     if compress is True:

+         compendium_file += ".gz"

+         tmx_file += ".gz"

+         terminology_file += ".gz"

+ 

+     if os.path.isfile(compendium_file) is False:

          log.warning("{l}-compendium is missing".format(l=lang))

  

-     if not os.path.isfile(tmx_file):

+     if os.path.isfile(tmx_file) is False:

          log.warning("{l}-tmx is missing".format(l=lang))

  

-     if not os.path.isfile(terminology_file):

+     if os.path.isfile(terminology_file) is False:

          log.warning("{l}-terminology is missing".format(l=lang))

  

  

- def compress(folder):

+ def compress(source, archive):

      """ Compress files uzing gzip """

      log = logging.getLogger("buildTm.compress")

  

-     files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]

- 

-     for file in sorted(files):

-         if file.endswith(".gz"):

-             continue

- 

-         dest = file + ".gz"

-         if os.path.isfile(os.path.join(folder, dest)):

-             continue

+     log.info("Compressing")

+     with open(source, "rb") as file_in:

+         with gzip.open(archive, "wb") as file_out:

+             file_out.writelines(file_in)

  

-         with open(os.path.join(folder, file), "rb") as file_in:

-             with gzip.open(os.path.join(folder, dest), "wb") as file_out:

-                 file_out.writelines(file_in)

+     os.remove(source)

  

  

  if __name__ == "__main__":

file modified
+4 -4
@@ -13,13 +13,13 @@ 

  

  # parcourir tous les fichiers rpm d'une version et en extraire tous les fichiers de traduction

  # ~ 3 h (without downloading time)

- # time podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:$release /src/build.py --keep-srpms gco.* --results "$results" --verbose

- # time podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:$release /src/build.py --keep-srpms col.* --results "$results" --verbose

- podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:$release /src/build.py --keep-srpms --results "$results" --verbose | tee log.1.srpms

+ podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:$release /src/build.py --keep-srpms gco.* --results "$results" --verbose

+ # podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:$release /src/build.py --keep-srpms col.* --results "$results" --verbose

+ # podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:$release /src/build.py --keep-srpms --results "$results" --verbose | tee log.1.srpms

  

  # déduire la liste de toutes les langues

  # ~ 25 m

- ./build_language_list.py --results "$results" --refresh --verbose 2>&1 | tee log.2.languages

+ ./build_language_list.py --results "$results" --verbose 2>&1 | tee log.2.languages

  

  # générer un fichier d'analyse de la langue (quels fichiers, équipes, pluriels, etc.)

  # ~ 3 m

no initial comment

1 new commit added

  • fix shutil.rmtree call
3 years ago

15G less on f34 with this, looks good :D

Pull-Request has been merged by darknao

3 years ago