#8 compendium generation
Merged 3 years ago by jibecfed. Opened 3 years ago by jibecfed.

file modified
+1 -1
@@ -42,7 +42,7 @@ 

  

      (distname, distrel, distid) = distro.linux_distribution()

      result_folder = "./results/f{v}/".format(v=distrel)

-     tm_folder = "./tm/f{v}/".format(v=distrel)

+     tm_folder = "./tm/f{v}/packages/".format(v=distrel)

      srpms_path = "/srpms"

  

      if not os.path.exists(result_folder):

file modified
+159 -22
@@ -3,9 +3,16 @@ 

  

  import argparse

  import glob

+ import json

  import os

+ import polib

  import subprocess

  import tempfile

+ import time

+ 

+ from shutil import copyfile

+ from shutil import rmtree

+ from weblate_language_data import aliases, languages, language_codes

  

  def main():

      """Handle params"""
@@ -20,44 +27,174 @@ 

      parser.add_argument("--lang", required=False, type=str,

                          help="Filter a language to analyze")

  

+     parser.add_argument("--refresh", action="store_true",

+                         help="Refresh list of available languages to analyze")

+ 

      args = parser.parse_args()

  

-     tm_folder="./tm/f{v}/".format(v=args.release)

+     release_folder = "./tm/f{v}/".format(v=args.release)

+     lang_path = os.path.join(release_folder, "languages/")

+     packages_path = os.path.join(release_folder, "packages/")

+     tm_folder = os.path.join(release_folder, "out/")

  

-     pofiles = glob.glob(tm_folder+'*/*/*.po')

-     po_langs = list(set([ os.path.basename(po) for po in pofiles ]))

-     po_langs.sort()

+     # Step 1: compute the list of languages

+     if args.refresh:

+         print("Refresh the list of languages")

+         rmtree(lang_path)

+         os.mkdir(lang_path)

  

-     if len(args.lang) > 0:

-         compute_lang(args.lang+".po", tm_folder)

-     else:

-         for langfile in po_langs:

-             compute_lang(langfile, tm_folder)

+         start_time_search = time.time()

+ 

+         po_langs = detect_languages(packages_path)

  

- def compute_lang(langfile, tm_folder):

-     """ Generate compendium and convert it to tmx"""

-     lang = os.path.splitext(langfile)[0]

-     print("Creating lang: " + lang)

+         for lang in po_langs.keys():

+             with open(os.path.join(lang_path, lang + '.json'), 'w') as f:

+                 f.write(json.dumps(po_langs[lang], indent=2))

  

-     pofiles = glob.glob(tm_folder+'*/*/'+langfile)

+         search_duration = round(time.time() - start_time_search, 1)

+         print(" Done in {d} seconds".format(d=search_duration))

  

-     print(" - po consolidation")

+     # Step 2: call TM activities

+     if args.lang:

+         with open(os.path.join(lang_path, args.lang + ".json"), "r") as read_file:

+             files = json.load(read_file)

+ 

+         compute_lang(args.lang, files, tm_folder)

+     else:

+         langs = [f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f))]

+ 

+         for lang in sorted(langs):

+             with open(os.path.join(lang_path, lang), "r") as read_file:

+                 files = json.load(read_file)

+ 

+             compute_lang(lang[:-len('.json')], files, tm_folder)

+ 

+ def detect_languages(tm_folder):

+     """ For each po file, detect metadata and deduce the language      """

+     """ Requires: a file hierarchy with po files                       """

+     """ Returns: a dictionary of lists, key=lang code, value=file list """

+     langs = {}

+ 

+     for root, directories, files in os.walk(tm_folder):

+         for file in files:

+             racine, ext = os.path.splitext(file)

+             if ext == ".po":

+                 metadata = dict()

+                 error = ""

+                 try:

+                     metadata = polib.pofile(os.path.join(root, file)).metadata

+                 except UnicodeDecodeError as e:

+                     # encoding error, to investigate before using it in TM

+                     error = "error-unicode"

+                 except OSError as e:

+                     # maybe a polib bug? to investigate before using it in TM

+                     error = "error-os"

+ 

+                 lang = choose_lang(racine, metadata, error)

+ 

+                 try:

+                     langs[lang].append(os.path.join(root, file))

+                 except KeyError:

+                     langs[lang] = list()

+                     langs[lang].append(os.path.join(root, file))

+ 

+     return langs

+ 

+ def choose_lang(filename, metadata, error):

+     """ From a po file and its metadata, choose the most likely language code """

+     """ By priority: the Language medata """

+     """ Returns: a language code """

+ 

+     lang = ""

+     file_name = filename.lower()

+     meta_language = ""

+     meta_team = ""

+     try:

+         meta_language = metadata.get("Language").lower()

+     except AttributeError:

+         pass

+ 

+     try:

+         meta_team = metadata.get("Language-Team").lower()

+     except AttributeError:

+         pass

+ 

+     if meta_language in language_codes.LANGUAGES:

+         lang = meta_language

+ 

+     elif file_name in language_codes.LANGUAGES:

+         lang = file_name

+     else:

+         lang = "noresult"

+ 

+     # try languages (some codes here are excluded from language_codes)

+     if lang == "noresult":

+         loc = [ lang[0] for lang in languages.LANGUAGES ]

+ 

+         if meta_language in loc:

+             lang = meta_language

+         elif file_name in loc:

+             lang = file_name

+ 

+     # try ALIASES

+     if lang == "noresult":

+         if meta_language in aliases.ALIASES.keys():

+             lang = aliases.ALIASES[meta_language]

+         elif file_name in aliases.ALIASES.keys():

+             lang = aliases.ALIASES[file_name]

+         else:

+             lang = "error"

+ 

+     return lang

+ 

+ def compute_lang(lang, langfiles, tm_folder):

+     """ Generate compendium and convert it to tmx """

+     """  """

+     print("Computing: " + lang)

+ 

+     # po consolidation

      compendium_file = tm_folder + lang + ".po"

-     command = ["pocompendium", compendium_file] + pofiles

-     subprocess.run(command, check=True)

+     compendium_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), compendium_file)

+ 

+     pofiles = [os.path.join(os.path.dirname(os.path.abspath(__file__)),f) for f in langfiles]

+ 

+     count = 0

+ 

+     with tempfile.TemporaryDirectory(prefix="l10n-tm") as tmp:

+         for i in pofiles:

+             try:

+                 command = ["msguniq", i, "--output-file", count.__str__(), "--no-location"]

+                 subprocess.run(command, check=True, cwd=tmp, capture_output=True)

+             except subprocess.CalledProcessError as e:

+                 try:

+                     command = ["msguniq", i, "--output-file", count.__str__(), "--to-code", "utf-8", "--no-location"]

+                     subprocess.run(command, check=True, cwd=tmp, capture_output=True)

+                 except subprocess.CalledProcessError as e:

+                     print("Error with msguniq {i}, error: {e}".format(i=i, e=e))

+ 

+             count += 1

+ 

+         onlyfiles = [f for f in os.listdir(tmp) if os.path.isfile(os.path.join(tmp, f))]

+         command = ["msgcat", "--force-po", "--no-location", "--output-file", compendium_file] + onlyfiles

+ 

+         try:

+             subprocess.run(command, check=True, cwd=tmp, capture_output=True)

+         except subprocess.CalledProcessError as e:

+             print(" msgcat exception...")

+ 

  

-     print("  - po to tmx convertion")

+     # po to tmx conversion

      tmx_file = tm_folder + lang + ".tmx"

      command = ["po2tmx", "--language="+lang, "--progress=none",

                 compendium_file, "--output="+tmx_file]

-     subprocess.run(command, check=True)

+     subprocess.run(command, check=True, capture_output=True)

  

-     print("  - language terminology")

+     # language terminology

      terminology_file = tm_folder + lang + ".terminology.po"

      command = ["poterminology", "--ignore-case", "--fold-titlecase",

                  "--inputs-needed", "1",

-                 "--progress=verbose", compendium_file, "--output="+terminology_file]

-     subprocess.run(command, check=True)

+                 "--progress=none", compendium_file, "--output="+terminology_file]

+     subprocess.run(command, check=True, capture_output=True)

  

  if __name__ == '__main__':

      main()

@@ -0,0 +1,41 @@ 

+ #!/usr/bin/env python3

+ # Allows testing the result of translation_finder on a local folder

+ 

+ import argparse

+ import os

+ import pprint

+ 

+ from translation_finder import discover

+ 

+ def main():

+     """Handle params"""

+ 

+     parser = argparse.ArgumentParser(

+         description="Detect translation files")

+     parser.add_argument("--folder", required=True,

+                         help="the folder to analyze")

+     args = parser.parse_args()

+ 

+     folderToScan = args.folder

+ 

+     if os.path.exists(folderToScan):

+         discover_translations(folderToScan)

+ 

+ 

+ def discover_translations(folder):

+     """find po file"""

+     print("discover_translations: "+folder)

+     translation_files = []

+ 

+     try:

+         translation_files = discover(folder)

+     except OSError:

+         print("error while searching for new")

+ 

+ 

+     pp = pprint.PrettyPrinter(indent=2)

+     pp.pprint(translation_files)

+ 

+ 

+ if __name__ == '__main__':

+     main()

file added
+9
@@ -0,0 +1,9 @@ 

+ #!/bin/bash

+ 

+ rm -rf venv

+ virtualenv venv

+ source venv/bin/activate

+ pip install -r ../requirements.txt

+ pip install --upgrade https://github.com/WeblateOrg/language-data/archive/master.zip

+ pip install ruamel.yaml charamel

+ pip install --upgrade https://github.com/WeblateOrg/translation-finder/archive/master.zip

Run compendium generation in a temporary folder and add "smart" language detection.

I would love to get some help to make this faster

Language detection is a little slow (about 10 minutes),
but compendium generation is really slow — maybe replace pocompendium from the "Translate Toolkit" (Python) with msgcat from "gettext" (a native binary)?

https://www.gnu.org/software/gettext/manual/html_node/msgcat-Invocation.html#msgcat-Invocation ?

1 new commit added

  • smarter way to detect languages
3 years ago

1 new commit added

  • optimize translation memory generation
3 years ago

2 new commits added

  • use msgcat instead of pocompendium
  • less text
3 years ago

1 new commit added

  • add debug file for translation finder
3 years ago

Pull-Request has been merged by jibecfed

3 years ago