#27 improve error management and reduce disk space
Merged 3 years ago by darknao. Opened 3 years ago by jibecfed.

file modified
+19 -13
@@ -94,11 +94,12 @@ 

          with open(result_file, "w") as f:

              f.write(json.dumps(analyze, indent=2))

  

-     elif args.refresh:

-         log.info("Refreshing the list of languages")

-         rmtree(lang_folder, ignore_errors=True)

-         os.mkdir(lang_folder)

+     if args.refresh and os.path.isdir(lang_folder):

+         rmtree(lang_folder)

  

+     if os.path.exists(lang_folder) is False:

+         log.info("Detecting the list of languages")

+         os.makedirs(lang_folder)

          po_langs = detect_languages(package_folder, results_folder)

  

          for lang in po_langs.keys():
@@ -120,12 +121,14 @@ 

          metadata = dict()

          try:

              metadata = polib.pofile(file).metadata

-         except UnicodeDecodeError:

-             # encoding error, to investigate before using it in TM

-             metadata["Language"] = "error-unicode"

          except OSError:

              # maybe a polib bug? to investigate before using it in TM

              metadata["Language"] = "error-os"

+         except TypeError:
+             # polib bug on malformed metadata; to investigate before using it in TM
+             metadata["Language"] = "error-type"

+         except UnicodeDecodeError:

+             # encoding error, to investigate before using it in TM

+             metadata["Language"] = "error-unicode"

  

          if "Language" not in metadata.keys():

              metadata["Language"] = "zzz_null"
@@ -154,9 +157,7 @@ 

  

          results[metadata.get("Language")] = language

  

-     results = dict(sorted(results.items(), key=lambda item: item[0]))

- 

-     return results

+     return dict(sorted(results.items(), key=lambda item: item[0]))

  

  

  def describe(lang_folder):
@@ -189,8 +190,11 @@ 

  

      log_file = os.path.join(results_folder, "build_language_list.log")

      file_object = open(log_file, "w")

- 

+     count = 0

+     total = len(packages)

      for package in packages:

+         count += 1

+         log.debug("{c}/{t}".format(c=count, t=total))

          discovery_file = os.path.join(package_folder, package, "discover.json")

  

          with open(discovery_file, "r") as read_file:
@@ -212,13 +216,15 @@ 

                  except UnicodeDecodeError:

                      # encoding error, to investigate before using it in TM

                      error = "error-unicode"

+                 except TypeError:

+                     error = "error-type"

                  except OSError:

                      # maybe a polib bug? to investigate before using it in TM

                      error = "error-os"

  

                  lang, decision = choose_lang(lang_code, metadata, error)

  

-                 log = ",".join(

+                 debug = ",".join(

                      [

                          po,

                          lang_code,
@@ -228,7 +234,7 @@ 

                          str(decision),

                      ]

                  )

-                 file_object.write(log + "\n")

+                 file_object.write(debug + "\n")

  

                  lang_result = langs.get(lang, dict())

                  po_results = lang_result.get("po", list())

file modified
+30 -19
@@ -6,6 +6,8 @@ 

  import json

  import os

  import shutil

+ import subprocess

+ 

  import polib

  import logging

  
@@ -71,30 +73,31 @@ 

      for package in sorted(packages):

          count += 1

          log.info(" {c}/{t} - {p}".format(c=count, t=len(packages), p=package))

-         with open(os.path.join(packages_folder, package, "discover.json"), "r") as f:

-             discoveries = json.load(f)

  

          src_folder = os.path.join(packages_folder, package)

          stats_file = os.path.join(packages_stats_folder, package + ".json")

  

-         if os.path.isfile(stats_file):

-             continue

+         if os.path.isfile(stats_file) is False:

+             with open(os.path.join(packages_folder, package, "discover.json"), "r") as f:

+                 discoveries = json.load(f)

  

-         results = dict()

-         for discover in discoveries:

-             files = glob.glob(os.path.join(src_folder, discover["filemask"]))

+             results = dict()

+             for discover in discoveries:

+                 files = glob.glob(os.path.join(src_folder, discover["filemask"]))

  

-             if discover["file_format"] == "po":

-                 results[discover["filemask"]] = get_po_translation_level(

-                     files, stats_file

-                 )

+                 if discover["file_format"] == "po":

+                     results[discover["filemask"]] = get_po_translation_level(

+                         files, stats_file

+                     )

  

-         if len(results) > 0:

-             distribution_stats = extract_release_stats(distribution_stats, results)

+             if len(results) > 0:

+                 with open(stats_file, "w") as f:

+                     json.dump(results, f, indent=2)

+         else:

+             with open(stats_file, "r") as f:

+                 results = json.load(f)

  

-         if len(results) > 0:

-             with open(stats_file, "w") as f:

-                 json.dump(results, f, indent=2)

+         distribution_stats = extract_release_stats(distribution_stats, results)

  

      log.info("Storing distribution stats")

      if not os.path.exists(distribution_stats_folder):
@@ -137,6 +140,11 @@ 

      stats = dict()

  

      for file in files:

+         # remove non standard comments

+         # taken from: https://github.com/translate/translate/blob/master/tools/pocommentclean

+         command = ["sed", "-i", "/^#$/d;/^#[^\:\~,\.]/d", file]

+         subprocess.run(command, check=True, capture_output=True)

+ 

          try:

              stat = calcstats(file)

          except Exception as e:
@@ -168,12 +176,15 @@ 

      metadata = dict()

      try:

          metadata = polib.pofile(file).metadata

-     except UnicodeDecodeError:

-         # encoding error, to investigate before using it in TM

-         metadata["Language"] = "error-unicode"

      except OSError:

          # maybe a polib bug? to investigate before using it in TM

          metadata["Language"] = "error-os"

+     except UnicodeDecodeError:

+         # encoding error, to investigate before using it in TM

+         metadata["Language"] = "error-unicode"

+     except TypeError:
+         # TypeError: '>' not supported between instances of 'str' and 'int'
+         metadata["Language"] = "error-type"

  

      team = "Unknown..."

      try:

file modified
+47 -79
@@ -10,11 +10,6 @@ 

  import tempfile

  import logging

  

- from io import BytesIO

- from translate.convert import po2tmx

- from translate.storage import factory, po

- from translate.tools import poterminology

- 

  

  def main():

      """Handle params"""
@@ -57,7 +52,7 @@ 

      if args.refresh and os.path.isdir(tm_folder):

          shutil.rmtree(tm_folder)

  

-     if not os.path.exists(tm_folder):

+     if os.path.exists(tm_folder) is False:

          os.makedirs(tm_folder)

  

      log.info("Building the translation memory for every languages")
@@ -78,11 +73,19 @@ 

          compendium_file = os.path.join(

              os.path.dirname(os.path.abspath(__file__)), compendium_file

          )

-         if not os.path.isfile(compendium_file):

+         compendium_archive = compendium_file + ".gz"

+         if os.path.isfile(compendium_file) is False and os.path.isfile(compendium_archive) is False:

+             log.info("Compendium generation")

              process_compendium(files, compendium_file, debug_folder)

+             # remove non standard comments

+             # taken from: https://github.com/translate/translate/blob/master/tools/pocommentclean

+             command = ["sed", "-i", "/^#$/d;/^#[^\:\~,\.]/d", compendium_file]

+             subprocess.run(command, check=True, capture_output=True)

  

          tmx_file = os.path.join(tm_folder, lang_code + ".tmx")

-         if not os.path.isfile(tmx_file):

+         tmx_archive = tmx_file + ".gz"

+         if os.path.isfile(tmx_file) is False and os.path.isfile(tmx_archive) is False:

+             log.info("TMX generation")

              try:

                  process_tmx(lang_code, compendium_file, tmx_file)

              except Exception as e:
@@ -93,7 +96,9 @@ 

                  )

  

          terminology_file = os.path.join(tm_folder, lang_code + ".terminology.po")

-         if not os.path.isfile(terminology_file):

+         terminology_archive = terminology_file + ".gz"

+         if os.path.isfile(terminology_file) is False and os.path.isfile(terminology_archive) is False:

+             log.info("Terminology generation")

              try:

                  process_terminology(compendium_file, terminology_file)

              except Exception as e:
@@ -103,15 +108,21 @@ 

                      )

                  )

  

+         if args.compress:

+             if os.path.isfile(compendium_file):

+                 compress(compendium_file, compendium_archive)

+ 

+             if os.path.isfile(tmx_file):

+                 compress(tmx_file, tmx_archive)

+ 

+             if os.path.isfile(terminology_file):

+                 compress(terminology_file, terminology_archive)

+ 

      log.info("All languages are processed")

  

      log.info("Detecting missing files")

      for lang in sorted(langs):

-         check_lang(lang[: -len(".json")], tm_folder)

- 

-     if args.compress:

-         log.info("Compressing files")

-         compress(tm_folder)

+         check_lang(lang[: -len(".json")], tm_folder, args.compress)

  

  

  def process_compendium(langfiles, dest, debug_folder):
@@ -149,7 +160,9 @@ 

                      subprocess.run(command, check=True, cwd=tmp, capture_output=True)

                  except subprocess.CalledProcessError as e:

                      debug_filename = "tm-msguniq-{lang}-{name}".format(lang=dest.split("/")[-1], name=count.__str__())

-                     log.error(" msguniq error with {i} a copy of this file is into {d} as {n}".format(i=i, e=e.output, d=debug_folder, n=debug_filename))

+                     log.error(" msguniq error with {i} a copy of this file is into {d} as {n}".format(i=i, e=e.output,

+                                                                                                       d=debug_folder,

+                                                                                                       n=debug_filename))

                      shutil.copyfile(i, os.path.join(debug_folder, debug_filename))

  

              count += 1
@@ -219,7 +232,7 @@ 

      else:

          if doubt is not False:

              log.debug("This file raised a msgcat bug: {f}".format(f=doubt))

-             store_debug_file(path, "tm-msgcat-"+destination.split("/")[-1], doubt, debug_folder)

+             store_debug_file(path, "tm-msgcat-" + destination.split("/")[-1], doubt, debug_folder)

              ongoing = []

  

          half = int(len(ongoing) / 2)
@@ -241,63 +254,20 @@ 

  def process_tmx(lang, source, dest):

      """ Generate a translation memory from a po file """

  

-     """

-     outputfile = po2tmx.tmxmultifile(dest)

-     po2tmx.convertpo(

-         inputfile=BytesIO(open(source, "r").read().encode()),

-         outputfile=outputfile,

-         templatefile=None,

-         sourcelanguage="en",

-         targetlanguage=lang,

-         comment="source",

-     )

- 

-     outputfile.tmxfile.savefile(dest)

-     """

- 

      command = ["po2tmx", "--language=" + lang, "--progress=none", source, "--output=" + dest]

      subprocess.run(command, check=True, capture_output=True)

  

+ 

  def process_terminology(source, dest):

      """ Generate a termonology from a po file """

  

- 

-     """

-     extractor = poterminology.TerminologyExtractor()

-     options = {

-         "inputmin": "1",

-         "fullmsgmin": "1",

-         "substrmin": "2",

-         "locmin": "2",

-         "nonstopmin": 1,

-         "sortorders": ["frequency", "dictionary", "length"],

-         "output": dest,

-     }

- 

-     with open(source, "rb") as fh:

-         inputfile = factory.getobject(fh)

- 

-     extractor.processunits(inputfile.units, source)

-     terms = extractor.extract_terms()

- 

-     termfile = po.pofile()

-     termitems = extractor.filter_terms(

-         terms, nonstopmin=options["nonstopmin"], sortorders=options["sortorders"]

-     )

-     for count, unit in termitems:

-         termfile.units.append(unit)

- 

-     with open(options["output"], "wb") as fh:

-         termfile.serialize(fh)

-     

-     """

      command = ["poterminology", "--ignore-case", "--fold-titlecase",

-                                 "--inputs-needed", "1",

-                                 "--progress=none", source, "--output=" + dest]

+                "--inputs-needed", "1",

+                "--progress=none", source, "--output=" + dest]

      subprocess.run(command, check=True, capture_output=True)

  

  

- def check_lang(lang, tm_folder):

+ def check_lang(lang, tm_folder, compress):

      """ Check if expected files were generated """

      log = logging.getLogger("buildTm.check_lang")

  
@@ -305,33 +275,31 @@ 

      tmx_file = os.path.join(tm_folder, lang + ".tmx")

      terminology_file = os.path.join(tm_folder, lang + ".terminology.po")

  

-     if not os.path.isfile(compendium_file):

+     if compress is True:

+         compendium_file += ".gz"

+         tmx_file += ".gz"

+         terminology_file += ".gz"

+ 

+     if os.path.isfile(compendium_file) is False:

          log.warning("{l}-compendium is missing".format(l=lang))

  

-     if not os.path.isfile(tmx_file):

+     if os.path.isfile(tmx_file) is False:

          log.warning("{l}-tmx is missing".format(l=lang))

  

-     if not os.path.isfile(terminology_file):

+     if os.path.isfile(terminology_file) is False:

          log.warning("{l}-terminology is missing".format(l=lang))

  

  

- def compress(folder):

+ def compress(source, archive):

      """ Compress files uzing gzip """

      log = logging.getLogger("buildTm.compress")

  

-     files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]

- 

-     for file in sorted(files):

-         if file.endswith(".gz"):

-             continue

- 

-         dest = file + ".gz"

-         if os.path.isfile(os.path.join(folder, dest)):

-             continue

+     log.info("Compressing")

+     with open(source, "rb") as file_in:

+         with gzip.open(archive, "wb") as file_out:

+             file_out.writelines(file_in)

  

-         with open(os.path.join(folder, file), "rb") as file_in:

-             with gzip.open(os.path.join(folder, dest), "wb") as file_out:

-                 file_out.writelines(file_in)

+     os.remove(source)

  

  

  if __name__ == "__main__":

file modified
+4 -4
@@ -13,13 +13,13 @@ 

  

  # parcourir tous les fichiers rpm d'une version et en extraire tous les fichiers de traduction

  # ~ 3 h (without downloading time)

- # time podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:$release /src/build.py --keep-srpms gco.* --results "$results" --verbose

- # time podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:$release /src/build.py --keep-srpms col.* --results "$results" --verbose

- podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:$release /src/build.py --keep-srpms --results "$results" --verbose | tee log.1.srpms

+ podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:$release /src/build.py --keep-srpms gco.* --results "$results" --verbose

+ # podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:$release /src/build.py --keep-srpms col.* --results "$results" --verbose

+ # podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:$release /src/build.py --keep-srpms --results "$results" --verbose | tee log.1.srpms

  

  # déduire la liste de toutes les langues

  # ~ 25 m

- ./build_language_list.py --results "$results" --refresh --verbose 2>&1 | tee log.2.languages

+ ./build_language_list.py --results "$results" --verbose 2>&1 | tee log.2.languages

  

  # générer un fichier d'analyse de la langue (quels fichiers, équipes, pluriels, etc.)

  # ~ 3 m

no initial comment

1 new commit added

  • fix shutil.rmtree call
3 years ago

15G less on f34 with this, looks good :D

Pull-Request has been merged by darknao

3 years ago