#11 compute statistics per language
Merged 3 years ago by jibecfed. Opened 3 years ago by jibecfed.

file modified
+11 -11
@@ -164,7 +164,7 @@ 

                             stdout=out, stderr=error, check=True)

  

  

- def discover_translations(tmp, name, result_folder, packages_folder):

+ def discover_translations(tmp, pkg_name, result_folder, packages_folder):

      """find po file"""

      print("discover_translations: " + tmp)

      translation_files = []
@@ -177,21 +177,21 @@ 

          translation_files = discover(tmp)

      except OSError:

          with open(result_folder + "/errors.txt", "a") as file:

-             file.write(name + " on discover_translations\n")

+             file.write(pkg_name + " on discover_translations\n")

  

      tsearch = round(time.time() - tsearch, 1)

  

      tcopy = time.time()

  

      if translation_files:

-         if not os.path.exists(os.path.join(packages_folder, name)):

-             os.makedirs(os.path.join(packages_folder, name))

+         if not os.path.exists(os.path.join(packages_folder, pkg_name)):

+             os.makedirs(os.path.join(packages_folder, pkg_name))

  

-         with open(os.path.join(packages_folder, name, "discover.json"), 'w') as f:

+         with open(os.path.join(packages_folder, pkg_name, "discover.json"), 'w') as f:

              f.write(json.dumps(translation_files, indent=2))

  

          for translation in translation_files:

-             copy_translations(tmp, translation, name, result_folder, packages_folder)

+             copy_translations(tmp, translation, pkg_name, result_folder, packages_folder)

  

      tcopy = round(time.time() - tcopy, 1)

  
@@ -202,19 +202,19 @@ 

      return (tsearch, tcopy, cresults)

  

  

- def copy_translations(tmp, translation, name, result_folder, packages_folder):

-     # translation, name, result_folder, packages_folder

+ def copy_translations(tmp, translation, pkg_name, result_folder, packages_folder):

      filemask = translation["filemask"]

  

      print("copy translations " + filemask)

  

      if translation["file_format"] in ["po", "json"]:

          for po in glob.glob(tmp + "/" + filemask):

-             dest = packages_folder + "/" + name + "/" + filemask.split("*")[0]

-             os.makedirs(dest, exist_ok=True)

+             dest = packages_folder + "/" + pkg_name + "/" + po.replace(tmp, "")

+             dest_folder = dest.replace(os.path.basename(dest), "")

+             os.makedirs(dest_folder, exist_ok=True)

  

              # use copyfile instead of copy2 to handle read-only files in rpm

-             copyfile(po, os.path.join(dest, os.path.basename(po)))

+             copyfile(po, dest)

  

  if __name__ == '__main__':

      main()

file modified
+63 -1
@@ -2,8 +2,12 @@ 

  """Consolidate and clean result files"""

  

  import argparse

+ import csv

+ import itertools

+ import json

  import os

  import pandas

+ import time

  

  RESULT_FOLDER = ""

  
@@ -21,13 +25,71 @@ 

  

      args = parser.parse_args()

  

+     lang_path = "./results/f{r}/languages/".format(r=args.release)

      RESULT_FOLDER = "./results/f{r}/stats/".format(r=args.release)

      packages_folder = "./results/f{r}/packages/".format(r=args.release)

  

      concat_csv(packages_folder, RESULT_FOLDER)

  

      file = RESULT_FOLDER + "/_concat.csv"

-     parse(file)

+ 

+     # parse(file)

+ 

+     langs = [f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f))]

+ 

+     for lang in sorted(langs):

+         lang_code = lang[:-len('.json')]

+ 

+         with open(os.path.join(lang_path, lang), "r") as read_file:

+             files = json.load(read_file)

+ 

+         compute_lang(lang_code, files, RESULT_FOLDER, packages_folder)

+ 

+ 

+ def compute_lang(lang_code, files, RESULT_FOLDER, packages_folder):

+     print("Computing: {l} ({c} files)".format(l=lang_code, c=len(files)))

+     start_time_search = time.time()

+     stats = []

+     packages = {}

+ 

+     # step 1: get package lists

+     for file in files:

+         po_file = file.replace(packages_folder, "")

+         package = po_file.split("/")[0]

+         po_file = po_file.replace(package + "/", "")

+         try:

+             packages[package].append(po_file)

+         except KeyError:

+             packages[package] = list()

+             packages[package].append(po_file)

+ 

+     # step 2: remove duplicates

+     for package in packages.keys():

+         packages[package] = list(set(packages[package]))

+ 

+     # step 3: parse package files

+     for package in packages.keys():

+         po_files = packages[package]

+         stats_file = os.path.join(packages_folder, package, "stats.csv")

+ 

+         with open(stats_file, newline='') as csvfile:

+             csv_dict_reader = csv.DictReader(csvfile)

+ 

+             [stats.append([package] + list(row.values())) for row in csv_dict_reader if row["Filename"] in po_files]

+ 

+     # step 4: store results

+     lang_stats_file = os.path.join(RESULT_FOLDER, lang_code + ".stats.csv")

+     with open(lang_stats_file, 'w', newline='') as csvfile:

+         spamwriter = csv.writer(csvfile)

+ 

+         header = ['Package', 'Filename', ' Translated Messages', ' Translated Source Words', ' Translated Target Words', ' Fuzzy Messages', ' Fuzzy Source Words', ' Untranslated Messages', ' Untranslated Source Words', ' Total Message', ' Total Source Words', ' Review Messages', ' Review Source Words']

+ 

+         spamwriter.writerow(header)

+ 

+         spamwriter.writerows(stats)

+ 

+     search_duration = round(time.time() - start_time_search, 1)

+     print(" Done in {d} seconds".format(d=search_duration))

  

  

  def parse(file):

file modified
+139 -28
@@ -7,8 +7,10 @@ 

  import time

  import polib

  

+ from pprint import pprint

+ 

  from shutil import rmtree

- from weblate_language_data import aliases, languages, language_codes

+ from weblate_language_data import aliases, languages, language_codes, countries

  

  def main():

      """Handle params"""
@@ -23,26 +25,127 @@ 

      parser.add_argument("--refresh", action="store_true",

                          help="Force refresh")

  

+     parser.add_argument("--describe", action="store_true",

+                         help="Describe the current list of languages")

+ 

+     parser.add_argument("--analyzelang", type=str,

+                         help="Produce an analyze file for a language")

+ 

+     parser.add_argument("--analyzealllangs", action="store_true",

+                         help="Produce an analyze file for all languages")

+ 

      args = parser.parse_args()

  

      release_folder = "./results/f{v}/".format(v=args.release)

      lang_path = os.path.join(release_folder, "languages/")

      packages_path = os.path.join(release_folder, "packages/")

+     lang_analyze_folder = os.path.join(release_folder, "languages-analyses/")

+ 

+     if args.describe:

+         print("Describing detected languages")

+         describe(lang_path)

+ 

+     elif args.analyzealllangs:

+         rmtree(lang_analyze_folder, ignore_errors=True)

+         os.mkdir(lang_analyze_folder)

+ 

+         langs = [f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f))]

+         for lang in sorted(langs):

+             analyze = analyze_lang(lang_path, lang[:-len('.json')])

+ 

+             with open(os.path.join(lang_analyze_folder, lang), 'w') as f:

+                 f.write(json.dumps(analyze, indent=2))

+ 

+     elif args.analyzelang:

+         print("Provide more data to analyze errors")

+         analyze = analyze_lang(lang_path, args.analyzelang)

+ 

+         with open(os.path.join(lang_analyze_folder, args.analyzelang + '.json'), 'w') as f:

+             f.write(json.dumps(analyze, indent=2))

+ 

+     elif args.refresh:

+         print("Refreshing the list of languages")

+         rmtree(lang_path, ignore_errors=True)

+         os.mkdir(lang_path)

+ 

+         start_time_search = time.time()

+ 

+         po_langs = detect_languages(packages_path)

+ 

+         for lang in po_langs.keys():

+             with open(os.path.join(lang_path, str(lang) + '.json'), 'w') as f:

+                 f.write(json.dumps(po_langs[lang], indent=2))

+ 

+         search_duration = round(time.time() - start_time_search, 1)

+         print(" Done in {d} seconds".format(d=search_duration))

+ 

+ 

+ def analyze_lang(lang_path, analized_lang):

+     """ Analyze one lang """

+     files = []

+     results = dict()

+     with open(os.path.join(lang_path, analized_lang + ".json"), "r") as read_file:

+         files = json.load(read_file)

+ 

+     print(" Analysing language {l}, with {c} files".format(l=analized_lang,c=len(files)))

  

-     print("Refreshing the list of languages")

-     rmtree(lang_path, ignore_errors=True)

-     os.mkdir(lang_path)

+     for file in files:

+         lang = "error"

+         metadata = dict()

+         error = ""

+         try:

+             metadata = polib.pofile(file).metadata

+         except UnicodeDecodeError as e:

+             # encoding error, to investigate before using it in TM

+             metadata["Language"] = "error-unicode"

+         except OSError as e:

+             # maybe a polib bug? to investigate before using it in TM

+             metadata["Language"] = "error-os"

  

-     start_time_search = time.time()

+         if "Language" not in metadata.keys():

+             metadata["Language"] = "zzz_null"

+         elif metadata["Language"] == "":

+             metadata["Language"] = "zzz_empty"

  

-     po_langs = detect_languages(packages_path)

+         if analized_lang != "error":

+             lang = choose_lang("", metadata, error)

  

-     for lang in po_langs.keys():

-         with open(os.path.join(lang_path, lang + '.json'), 'w') as f:

-             f.write(json.dumps(po_langs[lang], indent=2))

+         language = results.get(metadata.get("Language"), dict())

  

-     search_duration = round(time.time() - start_time_search, 1)

-     print(" Done in {d} seconds".format(d=search_duration))

+         count = language.get("Count", 0)

+         count += 1

+         language["Count"] = count

+ 

+         lang_files = language.get("Files", [])

+         lang_files.append(file)

+         language["Files"] = lang_files

+ 

+         plurals = language.get("Plural-Forms", [])

+         plurals.append(metadata.get("Plural-Forms"))

+         plurals = list(set(plurals))

+         language["Plural-Forms"] = plurals

+ 

+         teams = language.get("Language-Team", [])

+         teams.append(metadata.get("Language-Team"))

+         teams = list(set(teams))

+         language["Language-Team"] = teams

+ 

+         results[metadata.get("Language")] = language

+ 

+     results = dict(sorted(results.items(), key=lambda item: item[0]))

+ 

+     return results

+ 

+ 

+ def describe(lang_path):

+     """ Provide the number of files per language """

+     langs = [f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f))]

+ 

+     for lang in sorted(langs):

+         with open(os.path.join(lang_path, lang), "r") as read_file:

+             files = json.load(read_file)

+ 

+         print(" {l}:{c}".format(l=lang[:-len('.json')],c=len(files)))

  

  

  def detect_languages(tm_folder):
@@ -82,35 +185,37 @@ 

      """ Returns: a language code """

  

      lang = ""

-     file_name = filename.lower()

-     meta_language = ""

-     meta_team = ""

-     try:

-         meta_language = metadata.get("Language").lower()

-     except AttributeError:

-         pass

- 

-     try:

-         meta_team = metadata.get("Language-Team").lower()

-     except AttributeError:

-         pass

+     file_name = filename.lower().replace("-", "_")

+ 

+     meta_language = metadata.get("Language","").lower().replace("-", "_")

+ 

+     meta_team = metadata.get("Language-Team","").lower().replace("-", "_")

  

      if meta_language in language_codes.LANGUAGES:

          lang = meta_language

  

      elif file_name in language_codes.LANGUAGES:

          lang = file_name

+ 

      else:

          lang = "noresult"

  

      # try languages (some codes here are excluded from language_codes)

      if lang == "noresult":

-         loc = [ lang[0] for lang in languages.LANGUAGES ]

+         codes = dict()

+         for language in languages.LANGUAGES:

+             # 0 is language code

+             # 1 is language name

+             codes[language[1].lower()] = language[0].lower()

  

-         if meta_language in loc:

-             lang = meta_language.lower()

-         elif file_name in loc:

-             lang = file_name.lower()

+         if meta_language in codes.values():

+             lang = meta_language

+ 

+         elif file_name in codes.values():

+             lang = file_name

+ 

+         elif meta_language in codes.keys():

+             lang = codes.get(meta_language)

  

      # try ALIASES

      if lang == "noresult":
@@ -118,6 +223,12 @@ 

              lang = aliases.ALIASES[meta_language].lower()

          elif file_name in aliases.ALIASES.keys():

              lang = aliases.ALIASES[file_name].lower()

+ 

+     if lang == "noresult":

+         if meta_language in countries.DEFAULT_LANGS:

+             lang = meta_language.split("_", 1)[0]

+         elif file_name in countries.DEFAULT_LANGS:

+             lang = file_name.split("_", 1)[0]

          else:

              lang = "error"

  

file modified
+25
@@ -23,7 +23,12 @@ 

  

      filenames = [f for f in os.listdir(packages_folder) if os.path.isdir(os.path.join(packages_folder, f))]

  

+     print("Computing stats")

+     count = 0

+ 

      for package in sorted(filenames):

+         count +=1

+         print(" {c}/{t} - {p}".format(c=count, t=len(filenames),p=package))

          with open(os.path.join(packages_folder, package, "discover.json"), 'r') as f:

              translation_files = json.load(f)

  
@@ -40,6 +45,26 @@ 

                  # it's a detection of .tx configuration

                  continue

  

+     print("Removing duplicates")

+     count = 0

+     for package in sorted(filenames):

+         count +=1

+         print(" {c}/{t} - {p}".format(c=count, t=len(filenames),p=package))

+         input_file = packages_folder + "{p}/stats.csv".format(p=package)

+ 

+         try:

+             with open(input_file, 'r') as f:

+                 lines = f.readlines()

+ 

+             seen_lines = set()

+             with open(input_file, 'w') as f:

+                 for line in lines:

+                     if line not in seen_lines:

+                         seen_lines.add(line)

+                         f.write(line)

+         except FileNotFoundError:

+             continue

+ 

  def get_po_translation_level(path, discover, name, packages_folder):

      filemask = discover["filemask"]

      stats_file = packages_folder + "/{p}/stats.csv".format(p=name)

file modified
+45 -1
@@ -2,6 +2,7 @@ 

  """Consolidate each po files into compendium"""

  

  import argparse

+ import gzip

  import json

  import os

  import subprocess
@@ -21,6 +22,9 @@ 

      parser.add_argument("--refresh", action="store_true",

                      help="Force refresh of files")

  

+     parser.add_argument("--compress", action="store_true",

+                     help="Compress output files")

+ 

      parser.add_argument("--lang", required=False, type=str,

                          help="Filter a language to analyze")

  
@@ -49,6 +53,13 @@ 

  

              compute_lang(lang[:-len('.json')], files, tm_folder, args.refresh)

  

+         print("Detecting missing files")

+         for lang in sorted(langs):

+             check_lang(lang[:-len('.json')], tm_folder)

+ 

+     if args.compress:

+         compress(tm_folder)

+ 

      search_duration = round(time.time() - start_time_search, 1)

      print(" Done in {d} seconds".format(d=search_duration))

  
@@ -101,9 +112,42 @@ 

      command = ["poterminology", "--ignore-case", "--fold-titlecase",

                  "--inputs-needed", "1",

                  "--progress=none", compendium_file, "--output="+terminology_file]

-     if not os.path.isfile(tmx_file) or refresh is True:

+     if not os.path.isfile(terminology_file) or refresh is True:

          subprocess.run(command, check=True, capture_output=True)

  

+ 

+ def check_lang(lang, tm_folder):

+     """ make sure the files were generated """

+ 

+     compendium_file = tm_folder + lang + ".po"

+     tmx_file = tm_folder + lang + ".tmx"

+     terminology_file = tm_folder + lang + ".terminology.po"

+ 

+     if not os.path.isfile(compendium_file):

+         print(" {l}-compendium is missing".format(l=lang))

+ 

+     if not os.path.isfile(tmx_file):

+         print(" {l}-tmx is missing".format(l=lang))

+ 

+     if not os.path.isfile(terminology_file):

+         print(" {l}-terminology is missing".format(l=lang))

+ 

+ def compress(folder):

+     """ Compress files using gzip """

+ 

+     files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]

+     count = 0

+     total = len(files)

+ 

+     for file in sorted(files):

+         count += 1

+         dest = file + ".gz"

+         print(" {c}/{t}".format(c=count, t=total))

+ 

+         with open(os.path.join(folder, file), "rb") as file_in:

+             with gzip.open(os.path.join(folder, dest), "wb") as file_out:

+                 file_out.writelines(file_in)

+ 

  if __name__ == '__main__':

      main()

  

file modified
+2
@@ -3,3 +3,5 @@ 

  geopandas

  matplotlib

  descartes

+ polib

+ weblate-language-data

no initial comment

The idea is to reuse the language detection mechanism (one JSON file per language) to aggregate language statistics.
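
For reference, a minimal self-contained sketch of that aggregation flow, assuming the layout used in the diffs above: results/fNN/languages/<lang>.json lists the PO files detected for a language, and each results/fNN/packages/<package>/ folder holds a stats.csv with a Filename column. The helper name aggregate_language is made up for illustration, not part of the PR.

    import csv
    import json
    import os

    def aggregate_language(lang_code, lang_path, packages_folder, result_folder):
        """Illustrative sketch: gather stats.csv rows for every PO file of one language."""
        with open(os.path.join(lang_path, lang_code + ".json")) as read_file:
            po_files = json.load(read_file)

        # group the detected files by the package they were extracted from
        per_package = {}
        for path in po_files:
            relative = path.replace(packages_folder, "")
            package, _, po_file = relative.partition("/")
            per_package.setdefault(package, set()).add(po_file)

        # pick the matching rows out of each package's stats.csv
        rows = []
        for package, wanted in per_package.items():
            stats_csv = os.path.join(packages_folder, package, "stats.csv")
            with open(stats_csv, newline="") as csvfile:
                for row in csv.DictReader(csvfile):
                    if row["Filename"] in wanted:
                        rows.append([package] + list(row.values()))

        # write one <lang>.stats.csv per language; a header row matching the
        # pocount columns shown in the diff above would typically be prepended
        with open(os.path.join(result_folder, lang_code + ".stats.csv"), "w", newline="") as out:
            csv.writer(out).writerows(rows)

Looping that helper over every JSON file in results/fNN/languages/ would yield one stats CSV per language, which is essentially what the compute_lang function added in the second file does.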

2 new commits added

  • fix terminology generation and check results
  • improve language detection
3 years ago

1 new commit added

  • add refresh and describe option for language detector
3 years ago

2 new commits added

  • add support for language name in folder
  • add language analysis
3 years ago

2 new commits added

  • compress output files
  • try to use language name to deduct language code
3 years ago

Pull-Request has been merged by jibecfed

3 years ago