#45 merge language stats files together
Merged 8 months ago by jibecfed. Opened 9 months ago by jibecfed.

file modified
+1 -1
@@ -4,7 +4,7 @@ 

  

  * Motivation is described in https://fedoraproject.org/wiki/Changes/LocalizationMeasurementAndTooling

  * It is deployed in https://languages.fedoraproject.org and https://languages.stg.fedoraproject.org

- * Infrastructure is hosted by https://console-openshift-console.apps.ocp.fedoraproject.org/

+ * Infrastructure is hosted on https://console-openshift-console.apps.ocp.fedoraproject.org/ and https://console-openshift-console.apps.ocp.stg.fedoraproject.org

  * Infrastructure code is in https://pagure.io/fedora-infra/ansible/blob/main/f/roles/openshift-apps/languages

  

  # Licensing

file modified
+9 -9
@@ -73,18 +73,18 @@ 

  

      srpm_regex = None

      if args.filter:

-         srpm_regex = re.compile("^{}$".format(args.filter))

+         srpm_regex = re.compile(f"^{args.filter}$")

  

-     packages_folder = "./results/{v}/packages/".format(v=args.results)

-     srpms_path = os.path.abspath("./results/{v}/srpms/".format(v=args.results))

+     packages_folder = f"./results/{args.results}/packages/"

+     srpms_path = os.path.abspath("./srpms/")

  

      if not os.path.exists(packages_folder):

          os.makedirs(packages_folder)

      if not os.path.exists(srpms_path):

          os.makedirs(srpms_path)

  

-     data_file = os.path.join("./results/{v}/".format(v=args.results), "data.json")

-     srpm_list_file = os.path.join(srpms_path, "{v}.txt".format(v=args.results))

+     data_file = os.path.join(f"./results/{args.results}/", "data.json")

+     srpm_list_file = os.path.join(srpms_path, f"{args.results}.txt")

      url_list = None

  

      if os.path.isfile(srpm_list_file):
@@ -102,12 +102,12 @@ 

          if dnf_file:

              dnf_fp = os.path.join("dnf", dnf_file)

              if os.path.isfile(dnf_fp):

-                 dnf_args = "-c {}".format(dnf_fp)

-                 log.info("Using dnf conf {}".format(dnf_file))

+                 dnf_args = f"-c {dnf_fp}"

+                 log.info(f"Using dnf conf {dnf_file}")

              else:

-                 log.warning("dnf conf {} not found".format(dnf_file))

+                 log.warning(f"dnf conf {dnf_file} not found")

          p = subprocess.Popen(

-             "dnf {dnf_args} download --source --skip-broken --url '*' | grep src.rpm".format(dnf_args=dnf_args),

+             f"dnf {dnf_args} download --source --skip-broken --url '*' | grep src.rpm",

              stdout=subprocess.PIPE,

              shell=True)

  

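Note on the dnf pipeline above: the command's stdout is the list of source RPM URLs, one per line. A minimal sketch of consuming that output, assuming the same command and flags as in this change (the helper name is illustrative, not part of this PR):

    import subprocess

    def list_source_rpm_urls(dnf_args: str = "") -> list:
        # run the same pipeline as above and split stdout into URLs
        p = subprocess.Popen(
            f"dnf {dnf_args} download --source --skip-broken --url '*' | grep src.rpm",
            stdout=subprocess.PIPE,
            shell=True)
        out, _ = p.communicate()
        return [line for line in out.decode("utf-8").splitlines() if line]
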
file modified
+61 -169
@@ -1,8 +1,7 @@ 

  #!/usr/bin/env python3

- """ Parse translation files to deduct language list """

+ """ Detect language for each translation file """

  

  import argparse

- import csv

  import glob

  import json

  import os
@@ -11,7 +10,6 @@ 

  import logging

  import utils

  

- from shutil import rmtree

  from weblate_language_data import aliases, languages, language_codes, countries

  

  LOCAL_ALIASES = {"ca_valencia": "ca@valencia"}
@@ -21,7 +19,7 @@ 

      """Handle params"""

  

      parser = argparse.ArgumentParser(

-         description="Creates a list of languages form translation files"

+         description="Detect language for each translation file"

      )

  

      parser.add_argument(
@@ -31,20 +29,6 @@ 

      parser.add_argument("--refresh", action="store_true", help="Force refresh")

  

      parser.add_argument(

-         "--describe", action="store_true", help="Describe the current list of languages"

-     )

- 

-     parser.add_argument(

-         "--analyzelang", type=str, help="Produce an analyze file for a language"

-     )

- 

-     parser.add_argument(

-         "--analyzealllangs",

-         action="store_true",

-         help="Produce an analyze file for all languages",

-     )

- 

-     parser.add_argument(

          "-v",

          "--verbose",

          default=False,
@@ -58,130 +42,20 @@ 

      utils.set_logging(args.verbose, args.results)

      log = logging.getLogger("buildLanguageList")

  

-     results_folder = "./results/{v}/".format(v=args.results)

-     lang_folder = os.path.join(results_folder, "languages/")

+     results_folder = f"./results/{args.results}/"

      package_folder = os.path.join(results_folder, "packages/")

-     lang_analyze_folder = os.path.join(results_folder, "languages-analyses/")

  

-     if args.describe:

-         log.info("Describing detecting languages")

-         describe(lang_folder)

- 

-     elif args.analyzealllangs:

-         log.info("Provide more data to analyze errors")

-         rmtree(lang_analyze_folder, ignore_errors=True)

-         os.mkdir(lang_analyze_folder)

- 

-         langs = [

-             f

-             for f in os.listdir(lang_folder)

-             if os.path.isfile(os.path.join(lang_folder, f))

-         ]

-         for lang in sorted(langs):

-             analyze = analyze_lang(lang_folder, lang[: -len(".json")])

- 

-             with open(os.path.join(lang_analyze_folder, lang), "w") as f:

-                 f.write(json.dumps(analyze, indent=2))

- 

-     elif args.analyzelang:

-         log.info("Provide more data to analyze errors")

-         if not os.path.exists(lang_analyze_folder):

-             os.makedirs(lang_analyze_folder)

- 

-         analyze = analyze_lang(lang_folder, args.analyzelang)

-         result_file = os.path.join(lang_analyze_folder, args.analyzelang + ".json")

- 

-         with open(result_file, "w") as f:

-             f.write(json.dumps(analyze, indent=2))

- 

-     if args.refresh and os.path.isdir(lang_folder):

-         rmtree(lang_folder)

- 

-     if os.path.exists(lang_folder) is False:

-         log.info("Detecting the list of languages")

-         os.makedirs(lang_folder)

-         po_langs = detect_languages(package_folder, results_folder)

- 

-         for lang in po_langs.keys():

-             with open(os.path.join(lang_folder, str(lang) + ".json"), "w") as f:

-                 f.write(json.dumps(po_langs[lang], indent=2))

+     scan_packages(package_folder, args.refresh)

  

      log.info("done")

  

  

- def analyze_lang(lang_folder, analized_lang):

-     """ Analyze one lang """

-     log = logging.getLogger("buildLanguageList.analyze_lang")

-     files = []

-     results = dict()

-     with open(os.path.join(lang_folder, analized_lang + ".json"), "r") as read_file:

-         files = json.load(read_file)["po"]

- 

-     log.info(" Analysing language {l}, with {c} files".format(l=analized_lang, c=len(files)))

- 

-     for file in files:

-         metadata = dict()

-         try:

-             metadata = polib.pofile(file).metadata

-         except OSError:

-             # maybe a polib bug? to investigate before using it in TM

-             metadata["Language"] = "error-os"

-         except TypeError:

-             metadata["Language"] = "error-type"

-         except UnicodeDecodeError:

-             # encoding error, to investigate before using it in TM

-             metadata["Language"] = "error-unicode"

- 

-         if "Language" not in metadata.keys():

-             metadata["Language"] = "zzz_null"

-         elif metadata["Language"] == "":

-             metadata["Language"] = "zzz_empty"

- 

-         language = results.get(metadata.get("Language"), dict())

- 

-         count = language.get("Count", 0)

-         count += 1

-         language["Count"] = count

- 

-         lang_files = language.get("Files", [])

-         lang_files.append(file)

-         language["Files"] = sorted(lang_files)

- 

-         plurals = language.get("Plural-Forms", [])

-         plurals.append(metadata.get("Plural-Forms"))

-         plurals = list(set(plurals))

-         language["Plural-Forms"] = plurals

- 

-         teams = language.get("Language-Team", [])

-         teams.append(metadata.get("Language-Team"))

-         teams = list(set(teams))

-         language["Language-Team"] = teams

- 

-         results[metadata.get("Language")] = language

- 

-     return dict(sorted(results.items(), key=lambda item: item[0]))

- 

- 

- def describe(lang_folder):

-     """ Provide the number of files per language """

-     log = logging.getLogger("buildLanguageList.describe")

-     langs = [

-         f

-         for f in os.listdir(lang_folder)

-         if os.path.isfile(os.path.join(lang_folder, f))

-     ]

- 

-     for lang in sorted(langs):

-         with open(os.path.join(lang_folder, lang), "r") as read_file:

-             files = json.load(read_file)

- 

-         log.info(" {l}:{c}".format(l=lang[:-len('.json')], c=len(files)))

- 

- 

- def detect_languages(package_folder, results_folder):

-     """ For each po file, detect metadatas and deduct the language     """

-     """ Requires: a file hierarchy with po files                       """

-     """ Returns: a dictionary of lists, key=lang code, value=file list """

+ def scan_packages(package_folder: str, refresh: bool):

+     """ For each po file, detect metadata and deduct the language

+     :param refresh: force to compute again the values

+     :param package_folder: where to find packages hierarchy with discover.json

+     :return: a dictionary of lists, key=lang code, value=file lis

+     """

      log = logging.getLogger("buildLanguageList.detect_languages")

      langs = {}

      packages = [
@@ -190,31 +64,48 @@ 

          if os.path.isdir(os.path.join(package_folder, f))

      ]

  

-     log_file = os.path.join(results_folder, "build_language_list.log")

-     debug_file = list()

      count = 0

+     processed_files_count = 0

+     processed_files_duplicates_count = 0

      total = len(packages)

      for package in sorted(packages):

          count += 1

-         log.info("{c}/{t} {p}".format(c=count, t=total, p=package))

+         log.info(f"{count}/{total} {package}")

          discovery_file = os.path.join(package_folder, package, "discover.json")

+         languages_file = os.path.join(package_folder, package, "stats.json")

+ 

+         if os.path.isfile(languages_file) and not refresh:

+             log.info("Language file already exists, no need to process")

+             continue

+ 

+         processed_files = dict()

  

          with open(discovery_file, "r") as read_file:

-             alls = json.load(read_file)

+             discover_patterns = json.load(read_file)

  

-         to_process = [p for p in alls if p["file_format"] == "po"]

+         po_patterns = [p for p in discover_patterns if p["file_format"] == "po"]

  

-         for pattern in to_process:

-             mask = os.path.join(package_folder, package, pattern["filemask"])

-             p = re.compile(mask.replace("*", "(.*)").replace("+", r"\+"))

+         for pattern in po_patterns:

+             filemask = os.path.join(package_folder, package, pattern["filemask"])

+             p = re.compile(filemask.replace("*", "(.*)").replace("+", r"\+"))

  

-             for po in glob.glob(mask):

-                 result = p.search(po)

-                 lang_code = result.group(1)

+             for po_file in glob.glob(filemask):

+ 

+                 if po_file in processed_files.get("po", {}):

+                     # no need to process the file, it was already processed

+                     log.debug(f"{po_file} was already processed")

+                     processed_files_duplicates_count += 1

+                     continue

+ 

+                 processed_files_count += 1

+ 

+                 result = p.search(po_file)

+                 path_lang_code = result.group(1)

                  metadata = dict()

                  error = ""

                  try:

-                     metadata = polib.pofile(po).metadata

+                     metadata = polib.pofile(po_file).metadata

                  except UnicodeDecodeError:

                      # encoding error, to investigate before using it in TM

                      error = "error-unicode"
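Reviewer note on the filemask handling above: the filemask from discover.json is turned into a regex whose first group captures the language code from the file path. A standalone sketch with made-up paths:

    import re

    # a Weblate-style filemask, as found in discover.json (illustrative value)
    filemask = "results/f35/packages/gawk/po/*.po"
    pattern = re.compile(filemask.replace("*", "(.*)").replace("+", r"\+"))

    for po_file in ["results/f35/packages/gawk/po/fr.po",
                    "results/f35/packages/gawk/po/pt_BR.po"]:
        print(pattern.search(po_file).group(1))  # "fr", then "pt_BR"
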
@@ -224,35 +115,36 @@ 

                      # maybe a polib bug? to investigate before using it in TM

                      error = "error-os"

  

-                 lang, decision = choose_lang(lang_code, metadata, error)

+                 lang, decision = choose_language_code_from_po(path_lang_code, metadata)

  

-                 debug_file.append([

-                     po,

-                     lang_code,

-                     metadata.get("Language", ""),

-                     error,

-                     lang,

-                     str(decision),

-                 ])

+                 debug_file = {"lang_in_path": path_lang_code,

+                               "metadata_lang": metadata.get("Language", ""),

+                               "metadata_plurals": metadata.get("Plural-Forms", ""),

+                               "metadata_language_team": metadata.get("Language-Team", ""),

+                               "polib_error": error,

+                               "lang_code_chosen": lang,

+                               "lang_code_decision": str(decision)

+                               }

  

-                 lang_result = langs.get(lang, dict())

-                 po_results = lang_result.get("po", list())

-                 po_results.append(po)

-                 lang_result["po"] = po_results

+                 processed_po_files = processed_files.get("po", {})

+                 processed_po_files[po_file] = debug_file

+                 processed_files["po"] = processed_po_files

  

-                 langs[lang] = lang_result

+         with open(languages_file, "w") as f:

+             json.dump(processed_files, f, indent=2)

  

-     with open(log_file, "w") as file_object:

-         write_file_object = csv.writer(file_object)

-         write_file_object.writerows(debug_file)

+     log.info(f"Done {processed_files_count} files were processed, we skipped {processed_files_duplicates_count} duplicates")

  

      return langs

  

  

- def choose_lang(filename, metadata, error):

-     """ From a po file and its medata, choose the most likely language code """

-     """ By priority: the Language medata """

-     """ Returns: a language code """

+ def choose_language_code_from_po(filename: str, metadata: dict[str, str]) -> tuple[str, int]:

+     """ Deduce a language code from a filename and its metadata

+ 

+     :param filename: po filename

+     :param metadata: po metadata

+     :return: a language code, a decision path

+     """

      log = logging.getLogger("buildLanguageList.choose_lang")

  

      lang = "noresult"

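To make the new per-package layout concrete, this is the shape of the packages/<name>/stats.json file that scan_packages writes at this stage (field names from the diff, values illustrative):

    example_stats_json = {
        "po": {
            "results/f35/packages/gawk/po/fr.po": {
                "lang_in_path": "fr",
                "metadata_lang": "fr",
                "metadata_plurals": "nplurals=2; plural=(n > 1);",
                "metadata_language_team": "French <trans-fr@lists.fedoraproject.org>",
                "polib_error": "",
                "lang_code_chosen": "fr",
                "lang_code_decision": "1",
            }
        }
    }
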
file modified
+205 -166
@@ -1,25 +1,115 @@ 

  #!/usr/bin/env python3

- """For each package, compute stats"""

+ """ Computes stats for each package with translations and each detected language """

  

  import argparse

- import glob

  import json

+ import logging

  import os

- import shutil

  import subprocess

+ from collections import defaultdict

  

- import polib

- import logging

- import utils

+ from numpyencoder import NumpyEncoder

  

+ import pandas as pd

  from translate.tools.pocount import calcstats

  

+ import utils

+ 

+ 

+ def compute_language_statistics(languages_stats: dict, total_release_source_words: int) -> dict:

+     """ For each language, produce global statistics and per package statistics

+ 

+     global statistics target:

+         "totalsourcewordssum": total words on started packages

+         "totalsourcewords_d": total words in release

+         "translatedsourcewordssum": total translated words

+         "progress": current translation progress on started packages (in percents)

+         "progress_d": current translation progress on all strings in release (in percents)

+ 

+     per package statistics target:

+         "name": package name

+         "progress": current translation progress (in percents)

+         "translated": total translated words (source words, it can vary in target language)

+         "team": language team info

+     """

+     log = logging.getLogger("buildStats.compute_language_statistics")

+ 

+     results_languages = dict()

+     po_fields = ["translatedsourcewords", "fuzzysourcewords", "untranslatedsourcewords", "translated", "fuzzy",

+                  "untranslated", "translatedtargetwords"]

+     package_fields = ["translatedsourcewords", "fuzzysourcewords", "untranslatedsourcewords", "translated", "fuzzy",

+                       "untranslated", "translatedtargetwords", "package"]

+ 

+     for code, stats in languages_stats.items():

+         results_languages[code] = {}

+         results_languages[code]["po"] = stats

+         result = {}

+ 

+         df = pd.DataFrame.from_records(stats)

+ 

+         for kpi in po_fields:

+             result[kpi] = df[kpi].sum()

+ 

+         result["totalsourcewordssum"] = result["translatedsourcewords"] + result["fuzzysourcewords"] + result[

+             "untranslatedsourcewords"]

+         result["totalsourcewords_d"] = total_release_source_words

+ 

+         # prevent a RuntimeWarning for languages with no content

+         if result["totalsourcewordssum"] > 0:

+             result["progress"] = (result["translatedsourcewords"] / result["totalsourcewordssum"]) * 100

+         else:

+             result["progress"] = 0.0

+ 

+         result["progress_d"] = (result["translatedsourcewords"] / result["totalsourcewords_d"]) * 100

+ 

+         packages_stats = df[package_fields].groupby("package").sum()

+         packages_stats["totalsourcewordssum"] = packages_stats["translatedsourcewords"] + packages_stats["fuzzysourcewords"] + packages_stats["untranslatedsourcewords"]

+ 

+         packages_stats["progress"] = (packages_stats["translatedsourcewords"] / packages_stats["totalsourcewordssum"]) * 100

+         # prevent NaN values when a package has total source words = 0

+         packages_stats.fillna(0, inplace=True)

+         packages_stats["team"] = df[["metadata_language_team", "package"]].groupby("package").first()

+         result["packages"] = packages_stats.to_dict(orient="index")

+ 

+         results_languages[code].update(result)

+ 

+     return results_languages

+ 

+ 

+ def compute_package_statistics(df: pd.DataFrame) -> dict:

+     """ For each package, per language statistics

+ 

+     global statistics target:

+         "lang_code": language code

+         "team": language team info

+         "progress": current translation progress (in percents),

+         "translated": total translated words (source words, it can vary in target language)

+         "filename": list of files considered for statistics

+     """

+     log = logging.getLogger("buildStats.compute_package_statistics")

+     results = dict()

+     index = "lang_code_chosen"

+     po_fields = ["translatedsourcewords", "fuzzysourcewords", "untranslatedsourcewords", "translated", "fuzzy",

+                  "untranslated", "translatedtargetwords", index]

+ 

+     stats = df[po_fields].groupby(index).sum()

+     stats["totalsourcewordssum"] = stats["translatedsourcewords"] + stats["fuzzysourcewords"] + stats["untranslatedsourcewords"]

+     stats["progress"] = (stats["translatedsourcewords"] / stats["totalsourcewordssum"]) * 100

+     # prevent NaN values when a package has total source words = 0

+     stats.fillna(0, inplace=True)

+     stats["team"] = df[["metadata_language_team", index]].groupby(index).first()

+     df['filename'] = df.index

+     stats["filename"] = df[["filename", index]].groupby(index).sum()

+     results["languages"] = stats.to_dict(orient="index")

+ 

+     return results

+ 

  

  def main():

      """Handle params"""

  

      parser = argparse.ArgumentParser(

-         description="Computes stats for each package with translations"

+         description="Computes stats for each package with translations and each detected language"

      )

      parser.add_argument(

          "--results", required=True, help="Set the results folder to use"
@@ -40,18 +130,10 @@ 

      utils.set_logging(args.verbose, args.results)

      log = logging.getLogger("buildStats")

  

-     results_folder = "./results/{v}/".format(v=args.results)

-     packages_folder = "./results/{v}/packages/".format(v=args.results)

-     packages_stats_folder = "./results/{v}/packages-stats/".format(v=args.results)

-     languages_folder = "./results/{v}/languages/".format(v=args.results)

-     languages_stats_folder = "./results/{v}/languages-stats/".format(v=args.results)

- 

-     for folder in [

-         packages_stats_folder,

-         languages_stats_folder

-     ]:

-         if args.refresh and os.path.isdir(folder):

-             shutil.rmtree(folder)

+     results_folder = f"./results/{args.results}/"

+     packages_folder = f"./results/{args.results}/packages/"

+     languages_stats_folder = f"./results/{args.results}/languages/"

+     os.makedirs(languages_stats_folder, exist_ok=True)

  

      log.info("Computing packages stats")

      packages = [
@@ -60,187 +142,144 @@ 

          if os.path.isdir(os.path.join(packages_folder, f))

      ]

      count = 0

-     distribution_stats = dict()

- 

-     if not os.path.exists(packages_stats_folder):

-         os.makedirs(packages_stats_folder)

+     all_stats = list()

  

      for package in sorted(packages):

          count += 1

-         log.info(" {c}/{t} - {p}".format(c=count, t=len(packages), p=package))

+         log.info(f" {count}/{len(packages)} - {package}")

+         stats_file = os.path.join(packages_folder, package, "stats.json")

  

-         src_folder = os.path.join(packages_folder, package)

-         stats_file = os.path.join(packages_stats_folder, package + ".json")

+         with open(stats_file, "r") as f:

+             stats = json.load(f)

  

-         if os.path.isfile(stats_file) is False:

-             with open(os.path.join(packages_folder, package, "discover.json"), "r") as f:

-                 discoveries = json.load(f)

+         stats["package"] = package

  

-             results = dict()

-             for discover in discoveries:

-                 files = glob.glob(os.path.join(src_folder, discover["filemask"]))

+         # some packages have no detected po files

+         if "po" not in stats.keys():

+             continue

  

-                 if discover["file_format"] == "po":

-                     results[discover["filemask"]] = get_po_translation_level(

-                         files, stats_file

-                     )

+         for file in stats["po"].keys():

+             if "translated" in stats["po"][file].keys() \

+                     and args.refresh is False:

+                 log.debug(f"{file} is already processed")

+                 continue

  

-             if len(results) == 0:

-                 log.warning("No translation file found?")

-             else:

-                 with open(stats_file, "w") as f:

-                     json.dump(results, f, indent=2)

-         else:

-             with open(stats_file, "r") as f:

-                 results = json.load(f)

+             stats["po"][file].update(get_po_translation_level(file))

  

-         distribution_stats = extract_release_stats(distribution_stats, results)

+         df = pd.DataFrame.from_dict(stats["po"], orient='index')

+         stats["stats"] = compute_package_statistics(df)

+         stats["totalsourcewords"] = df[["translatedsourcewords", "fuzzysourcewords", "untranslatedsourcewords"]].sum().sum()

  

-     log.info("Computing language stats")

-     languages = [f for f in os.listdir(languages_folder)]

-     count = 0

+         with open(stats_file, "w") as f:

+             json.dump(stats, f, indent=2, cls=NumpyEncoder)

  

-     languages_stats_folder = languages_stats_folder

-     if not os.path.exists(languages_stats_folder):

-         os.makedirs(languages_stats_folder)

+         all_stats.append(stats)

  

-     for language in sorted(languages):

-         count += 1

-         lang = language[:-5]

+     log.info("Aggregating language stats")

+     languages = defaultdict(list)

+     total_distribution_source_words = 0

+     for package in all_stats:

+         max_languages = defaultdict(int)

+         for filename, stats in package["po"].items():

+             lang_code = stats["lang_code_chosen"]

+             stats["filename"] = filename

+             stats["package"] = package["package"]

  

-         log.info(" {c}/{t} - {l}".format(c=count, t=len(languages), l=lang))

-         with open(os.path.join(languages_folder, language), "r") as f:

-             discoveries = json.load(f)

+             languages[lang_code].append(stats)

  

-         stats_file = os.path.join(languages_stats_folder, lang + ".json")

+             max_languages[lang_code] += stats["translatedsourcewords"]

+             max_languages[lang_code] += stats["untranslatedsourcewords"]

  

-         if os.path.isfile(stats_file):

-             continue

+         max_languages.pop("error", None)

+         if max_languages:

+             total_distribution_source_words += max(max_languages.values())

  

-         files = discoveries.get("po", [])

-         if files:

-             with open(stats_file, "w") as f:

-                 json.dump(get_po_translation_level(files, stats_file), f, indent=2)

+     log.info("Storing language stats")

+     for lang_code, language in languages.items():

+         stats_file = os.path.join(languages_stats_folder, f"{lang_code}.json")

+         with open(stats_file, "w") as f:

+             json.dump(language, f, indent=2)

  

-     log.info("Storing distribution stats")

+     log.info("Computing language stats")

+     languages = compute_language_statistics(languages, total_distribution_source_words)

  

-     distribution_file = os.path.join(results_folder, "release.json")

-     with open(os.path.join(results_folder, "data.json"), "r") as f:

-         distribution_stats["total_release_packages"] = len(json.load(f))

+     log.info("Storing language stats")

+     for lang_code, language in languages.items():

+         stats_file = os.path.join(languages_stats_folder, f"{lang_code}.json")

+         with open(stats_file, "w") as f:

+             json.dump(language, f, indent=2, cls=NumpyEncoder)

  

-     total_packages_files = list()

-     for base, dirs, files in os.walk(packages_folder):

-         for file in files:

-             if file != "discover.json":

-                 total_packages_files.append(os.path.join(base, file))

  

-     distribution_stats["total_packages_files"] = len(total_packages_files)

-     distribution_stats["total_packages"] = len(packages)

-     distribution_stats["nb_files"] = len(list(set(distribution_stats["files"])))

+     log.info("Processing distribution stats")

  

-     packages_with_stats = [f for f in os.listdir(packages_stats_folder) if os.path.isfile(os.path.join(packages_stats_folder, f))]

-     distribution_stats["total_packages_with_stats"] = len(packages_with_stats)

-     distribution_stats["total_languages"] = len(languages)

-     with open(distribution_file, "w") as f:

-         json.dump(distribution_stats, f, indent=2)

+     distribution_file = os.path.join(results_folder, "release.json")

+     distribution_stats = dict()

+     with open(os.path.join(results_folder, "data.json"), "r") as f:

+         distribution_stats["packages_count"] = len(json.load(f))

  

-     log.info("Searching for bugs ;)")

-     used_files = list(set(distribution_stats["files"]))

-     if len(total_packages_files) != len(used_files):

-         log.debug("source:{s} used: {u}".format(s=len(total_packages_files), u=len(used_files)))

-         missing_files = [source for source in total_packages_files if not source in used_files]

-         missing_files_po = [file for file in missing_files if file.endswith(".po")]

-         if len(missing_files_po) > 0:

-             log.debug("Some po files are missing")

-             distribution_file = os.path.join(results_folder, "build_stats.missing_po_files.json")

-             with open(distribution_file, "w") as f:

-                 json.dump(missing_files_po, f, indent=2)

+     # detected = identified with translation files

+     distribution_stats["packages_detected_count"] = len(packages)

+     distribution_stats["files_detected_count"] = sum([len(package["po"]) for package in all_stats])

  

-     log.info("done")

+     # processed = what we were able to use

+     distribution_stats["packages_processed_count"] = 0

+     distribution_stats["files_processed_count"] = 0

  

+     for package in sorted(packages):

+         log.info(package)

+         stats_file = os.path.join(packages_folder, package, "stats.json")

  

- def get_po_translation_level(files, stats_file):

-     """ Compute results """

-     log = logging.getLogger("buildStats.get_po_translation_level")

-     stats = dict()

+         with open(stats_file, "r") as f:

+             stats = json.load(f)

  

-     for file in files:

-         # remove non standard comments

-         # taken from: https://github.com/translate/translate/blob/master/tools/pocommentclean

-         command = ["sed", "-i", "/^#$/d;/^#[^\:\~,\.]/d", file]

-         subprocess.run(command, check=True, capture_output=True)

+         # if there are no source words, it means we could not process anything

+         if "totalsourcewords" in stats.keys():

+             if stats["totalsourcewords"] > 0:

+                 distribution_stats["packages_processed_count"] += 1

  

-         try:

-             stat = calcstats(file)

-         except Exception as e:

-             log.error(" {f} triggered an {t} exception: {e}".format(f=file, t=type(e).__name__, e=e))

-             continue

+             for _, detected in stats["po"].items():

+                 if detected["lang_code_chosen"] != "error":

+                     distribution_stats["files_processed_count"] += 1

+ 

+     distribution_stats["totalsourcewords"] = total_distribution_source_words

+     distribution_stats["languages_processed_count"] = len(languages)

  

-         keys = [

-             "translatedsourcewords",

-             "fuzzysourcewords",

-             "untranslatedsourcewords",

-             "translated",

-             "fuzzy",

-             "untranslated",

-             "translatedtargetwords",

-         ]

-         results = dict()

-         for key in keys:

-             results[key] = stat.get(key, 0)

+     log.info(distribution_stats)

  

-         results["team"] = get_language_team(file)

+     log.info("Storing distribution stats")

+     with open(distribution_file, "w") as f:

+         json.dump(distribution_stats, f, indent=2)

  

-         stats[file] = results

+     log.info("done")

  

-     return stats

  

+ def get_po_translation_level(file: str) -> dict:

+     """ Call pocount to get translation stats for a file """

+     log = logging.getLogger("buildStats.get_po_translation_level")

+     command = ["sed", "-i", "/^#$/d;/^#[^\:\~,\.]/d", file]

+     subprocess.run(command, check=True, capture_output=True)

  

- def get_language_team(file):

-     log = logging.getLogger("buildStats.get_language_team")

-     metadata = dict()

-     try:

-         metadata = polib.pofile(file).metadata

-     except OSError:

-         # maybe a polib bug? to investigate before using it in TM

-         metadata["Language"] = "error-os"

-     except UnicodeDecodeError:

-         # encoding error, to investigate before using it in TM

-         metadata["Language"] = "error-unicode"

-     except TypeError:

-         # TypeError: '>' not supported between instances of 'str' and 'int'

-         metadata["Language"] = "error-valuerror"

- 

-     team = "Unknown..."

      try:

-         team = metadata["Language-Team"]

-     except KeyError:

-         log.debug("The file {f} have no Language team? Here are the metadata: {m}".format(f=file, m=metadata))

-     return team

- 

- 

- def extract_release_stats(results, files_stats):

-     log = logging.getLogger("buildStats.extract_release_stats")

-     number_of_packages = results.get("nb_packages", 0)

-     number_of_packages += 1

-     files = results.get("files", list())

-     total_source_words = results.get("totalsourcewords", 0)

- 

-     for template in files_stats:

-         maxresult = 0

-         for file in files_stats[template]:

-             translated = files_stats[template][file]["translatedsourcewords"]

-             untranslated = files_stats[template][file]["untranslatedsourcewords"]

-             maxresult = max(maxresult, translated + untranslated)

-             files.append(file)

- 

-         total_source_words += maxresult

- 

-     results = {

-         "nb_packages": number_of_packages,

-         "files": files,

-         "totalsourcewords": total_source_words,

-     }

+         stat = calcstats(file)

+     except Exception as e:

+         log.error(f" {file} triggered an {type(e).__name__} exception: {e}")

+         stat = {"could_not_process": 1}

+ 

+     keys = [

+         "translatedsourcewords",

+         "fuzzysourcewords",

+         "untranslatedsourcewords",

+         "translated",

+         "fuzzy",

+         "untranslated",

+         "translatedtargetwords",

+         "could_not_process"

+     ]

+     results = dict()

+     for key in keys:

+         results[key] = stat.get(key, 0)

  

      return results

  

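For reference, release.json now only carries distribution-level counters; an illustrative instance (key names from the diff, numbers invented):

    release_json = {
        "packages_count": 2500,
        "packages_detected_count": 1200,
        "files_detected_count": 150000,
        "packages_processed_count": 1150,
        "files_processed_count": 148000,
        "totalsourcewords": 9000000,
        "languages_processed_count": 180,
    }
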
file modified
+62 -76
@@ -1,5 +1,5 @@ 

  #!/usr/bin/env python3

- """Consolidate each po files into compendium"""

+ """ Creates useful translator files for every language """

  

  import argparse

  import gzip
@@ -17,7 +17,7 @@ 

      """Handle params"""

  

      parser = argparse.ArgumentParser(

-         description="Creates useful translator files for every languages"

+         description="Creates useful translator files for every language"

      )

  

      parser.add_argument(
@@ -39,13 +39,11 @@ 

      utils.set_logging(args.verbose, args.results)

      log = logging.getLogger("buildTm")

  

-     results_folder = "./results/{v}/".format(v=args.results)

+     results_folder = f"./results/{args.results}/"

      lang_path = os.path.join(results_folder, "languages/")

      tm_folder = os.path.join(results_folder, "languages-tm/")

-     debug_folder = os.path.join(results_folder, "debug_folder/")

  

      os.makedirs(tm_folder, exist_ok=True)

-     os.makedirs(debug_folder, exist_ok=True)

  

      # clean destination folders

      if args.refresh and os.path.isdir(tm_folder):
@@ -54,58 +52,51 @@ 

      if os.path.exists(tm_folder) is False:

          os.makedirs(tm_folder)

  

-     log.info("Building the translation memory for every languages")

+     log.info("Find detected languages")

  

-     langs = [

+     languages = [

          f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f))

      ]

  

-     for lang in sorted(langs):

-         lang_code = lang[: -len(".json")]

+     for language in sorted(languages):

+         language_code = language[: -len(".json")]

  

-         log.info("Processing {l}".format(l=lang_code))

+         log.info(f"Processing {language_code}")

  

-         with open(os.path.join(lang_path, lang), "r") as read_file:

+         with open(os.path.join(lang_path, language), "r") as read_file:

              files = json.load(read_file)["po"]

+             files = [f["filename"] for f in files]

  

-         compendium_file = os.path.join(tm_folder, lang_code + ".po")

+         compendium_file = os.path.join(tm_folder, f"{language_code}.po")

          compendium_file = os.path.join(

              os.path.dirname(os.path.abspath(__file__)), compendium_file

          )

-         compendium_archive = compendium_file + ".gz"

+         compendium_archive = f"{compendium_file}.gz"

          if os.path.isfile(compendium_file) is False and os.path.isfile(compendium_archive) is False:

              log.info("Compendium generation")

-             process_compendium(files, compendium_file, debug_folder)

+             process_compendium(files, compendium_file, tm_folder, language_code)

              # remove non standard comments

              # taken from: https://github.com/translate/translate/blob/master/tools/pocommentclean

              command = ["sed", "-i", "/^#$/d;/^#[^\:\~,\.]/d", compendium_file]

              subprocess.run(command, check=True, capture_output=True)

  

-         tmx_file = os.path.join(tm_folder, lang_code + ".tmx")

-         tmx_archive = tmx_file + ".gz"

+         tmx_file = os.path.join(tm_folder, f"{language_code}.tmx")

+         tmx_archive = f"{tmx_file}.gz"

          if os.path.isfile(tmx_file) is False and os.path.isfile(tmx_archive) is False:

              log.info("TMX generation")

              try:

-                 process_tmx(lang_code, compendium_file, tmx_file)

+                 process_tmx(language_code, compendium_file, tmx_file)

              except Exception as e:

-                 log.error(

-                     " TMX generation triggered an {t} exception: {e}".format(

-                         t=type(e).__name__, e=e

-                     )

-                 )

- 

-         terminology_file = os.path.join(tm_folder, lang_code + ".terminology.po")

-         terminology_archive = terminology_file + ".gz"

+                 log.error(f" TMX generation triggered an {type(e)} exception: {e}")

+ 

+         terminology_file = os.path.join(tm_folder, f"{language_code}.terminology.po")

+         terminology_archive = f"{terminology_file}.gz"

          if os.path.isfile(terminology_file) is False and os.path.isfile(terminology_archive) is False:

              log.info("Terminology generation")

              try:

                  process_terminology(compendium_file, terminology_file)

              except Exception as e:

-                 log.error(

-                     " Terminology generation triggered an {t} exception: {e}".format(

-                         t=type(e).__name__, e=e

-                     )

-                 )

+                 log.error(f" Terminology generation triggered an {type(e)} exception: {e}")

  

          if args.compress:

              if os.path.isfile(compendium_file):
@@ -120,28 +111,26 @@ 

      log.info("All languages are processed")

  

      log.info("Detecting missing files")

-     for lang in sorted(langs):

-         check_lang(lang[: -len(".json")], tm_folder, args.compress)

+     for language in sorted(languages):

+         check_lang(language[: -len(".json")], tm_folder, args.compress)

  

      log.info("done")

  

  

- def process_compendium(langfiles, dest, debug_folder):

+ def process_compendium(po_files: list[str], destination_file: str, debug_folder: str, language_code: str) -> None:

      """ Generate a compendium (a concatenation of all po files) """

      log = logging.getLogger("buildTm.process_compendium")

  

-     pofiles = [

-         os.path.join(os.path.dirname(os.path.abspath(__file__)), f) for f in langfiles

-     ]

+     po_files = [os.path.join(os.path.dirname(os.path.abspath(__file__)), f) for f in po_files]

      count = 0

  

      with tempfile.TemporaryDirectory(prefix="l10n-tm") as tmp:

  

-         for i in pofiles:

+         for file in po_files:

              try:

                  command = [

                      "msguniq",

-                     i,

+                     file,

                      "--output-file",

                      count.__str__(),

                      "--no-location",
@@ -151,7 +140,7 @@ 

                  try:

                      command = [

                          "msguniq",

-                         i,

+                         file,

                          "--output-file",

                          count.__str__(),

                          "--to-code",
@@ -160,22 +149,22 @@ 

                      ]

                      subprocess.run(command, check=True, cwd=tmp, capture_output=True)

                  except subprocess.CalledProcessError as e:

-                     debug_filename = "tm-msguniq-{lang}-{name}".format(lang=dest.split("/")[-1], name=count.__str__())

-                     log.error(" msguniq error with {i} a copy of this file is into {d} as {n}".format(i=i, e=e.output,

-                                                                                                       d=debug_folder,

-                                                                                                       n=debug_filename))

-                     shutil.copyfile(i, os.path.join(debug_folder, debug_filename))

+                     short_filename = os.path.relpath(file, os.path.dirname(os.path.abspath(__file__)))

+                     short_filename = "_".join(short_filename.split(sep=os.path.sep))

+                     debug_filename = os.path.join(debug_folder, f"{language_code}-tm-msguniq-{short_filename}")

+                     log.error(f" msguniq error, a copy of this file is into {debug_filename}")

+                     shutil.copyfile(file, debug_filename)

  

              count += 1

  

          all_files = [f for f in os.listdir(tmp) if os.path.isfile(os.path.join(tmp, f))]

          if len(all_files) == 1:

-             shutil.copyfile(os.path.join(tmp, all_files[0]), dest)

+             shutil.copyfile(os.path.join(tmp, all_files[0]), destination_file)

          else:

-             msgcat_loop(dest, tmp, debug_folder, all_files)

+             msgcat_loop(destination_file, tmp, debug_folder, all_files, language_code)

  

  

- def msgcat(files, destination, path):

+ def msgcat(files: list[str], destination: str, path: str):

      """ Call the msgcat command on a list of po files

      Return stderr, if any """

      command = [
@@ -195,15 +184,8 @@ 

          stderr = e.stderr.decode('utf8')

      return stderr

  

- def store_debug_file(path, name, file, debug_folder):

-     """ Move the temporary move file in debug folder """

-     log = logging.getLogger("buildTm.store_debug_file")

-     target = os.path.join(debug_folder, "{n}-{f}".format(n=name, f=file))

-     log.error("The file {f} were moved into {t}".format(f=file, t=target))

-     shutil.move(os.path.join(path, file), target)

- 

  

- def msgcat_loop(destination, path, debug_folder, files):

+ def msgcat_loop(destination: str, path: str, debug_folder: str, files: list[str], language: str) -> None:

      """ call msgcat, and exclude any problematic files """

      log = logging.getLogger("buildTm.msgcat_loop")

      log.debug("Starting msgcat loop")
@@ -213,68 +195,72 @@ 

          ids += re.findall(r"\d+:\d+: (\d+): input is not valid in", ret)

          if ids:

              file = ids[0]

-             log.debug("This file raised a msgcat bug: {f}".format(f=file))

-             store_debug_file(path, "tm-msgcat-" + destination.split("/")[-1], file, debug_folder)

+             short_filename = os.path.relpath(file, os.path.dirname(os.path.abspath(__file__)))

+             short_filename = "_".join(short_filename.split(sep=os.path.sep))

+             destination_file = f"{language}-tm-msgcat-{short_filename}"

+             target = os.path.join(debug_folder, destination_file)

+             log.error(f"msgcat error, the file was moved to {target}")

+             shutil.move(os.path.join(path, file), target)

              files.remove(file)

          else:

              # nothing found in stderr

              if os.path.isfile(destination) is False:

                  # and destination not here : unhandled exception

                  # TODO: maybe actually throw an exception here?

-                 log.error("Error with msgcat: {e}".format(e=ret))

-                 return False

+                 log.error(f"Error with msgcat: {ret}")

+                 return

              # no stderr and final file is here : all good

              break

          log.debug("next try")

      log.debug("msgcat loop over")

  

  

- def process_tmx(lang, source, dest):

+ def process_tmx(lang: str, source: str, destination: str) -> None:

      """ Generate a translation memory from a po file """

  

-     command = ["po2tmx", "--language=" + lang, "--progress=none", source, "--output=" + dest]

+     command = ["po2tmx", f"--language={lang}", "--progress=none", source, f"--output={destination}"]

      subprocess.run(command, check=True, capture_output=True)

  

  

- def process_terminology(source, dest):

-     """ Generate a termonology from a po file """

+ def process_terminology(compendium: str, destination: str) -> None:

+     """ Generate a terminology from a po file """

  

      command = ["poterminology", "--ignore-case", "--fold-titlecase",

                 "--inputs-needed", "1",

-                "--progress=none", source, "--output=" + dest]

+                "--progress=none", compendium, f"--output={destination}"]

      subprocess.run(command, check=True, capture_output=True)

  

  

- def check_lang(lang, tm_folder, compress):

+ def check_lang(lang: str, tm_folder: str, to_compress: bool) -> None:

      """ Check if expected files were generated """

      log = logging.getLogger("buildTm.check_lang")

  

-     compendium_file = os.path.join(tm_folder, lang + ".po")

-     tmx_file = os.path.join(tm_folder, lang + ".tmx")

-     terminology_file = os.path.join(tm_folder, lang + ".terminology.po")

+     compendium_file = os.path.join(tm_folder, f"{lang}.po")

+     tmx_file = os.path.join(tm_folder, f"{lang}.tmx")

+     terminology_file = os.path.join(tm_folder, f"{lang}.terminology.po")

  

-     if compress is True:

+     if to_compress is True:

          compendium_file += ".gz"

          tmx_file += ".gz"

          terminology_file += ".gz"

  

      if os.path.isfile(compendium_file) is False:

-         log.warning("{l}-compendium is missing".format(l=lang))

+         log.warning(f"{lang}-compendium is missing")

  

      if os.path.isfile(tmx_file) is False:

-         log.warning("{l}-tmx is missing".format(l=lang))

+         log.warning(f"{lang}-tmx is missing")

  

      if os.path.isfile(terminology_file) is False:

-         log.warning("{l}-terminology is missing".format(l=lang))

+         log.warning(f"{lang}-terminology is missing")

  

  

- def compress(source, archive):

-     """ Compress files uzing gzip """

+ def compress(source: str, destination_file: str) -> None:

+     """ Compress files using gzip """

      log = logging.getLogger("buildTm.compress")

  

-     log.debug("Compressing {s}".format(s=source))

+     log.debug(f"Compressing {source}")

      with open(source, "rb") as file_in:

-         with gzip.open(archive, "wb") as file_out:

+         with gzip.open(destination_file, "wb") as file_out:

              file_out.writelines(file_in)

  

      os.remove(source)

file modified
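About the msgcat loop above: the regex extracts the name of the offending temporary file (a plain counter) from msgcat's stderr, so the loop can move it aside and retry. A standalone check against a fabricated stderr line:

    import re

    # fabricated msgcat stderr, shaped like the lines the loop matches
    stderr = '12:3: 42: input is not valid in "UTF-8" encoding'
    ids = re.findall(r"\d+:\d+: (\d+): input is not valid in", stderr)
    print(ids)  # ['42'] -> file "42" is moved to the debug folder and msgcat retried
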
+89 -275
@@ -1,5 +1,5 @@ 

  #!/usr/bin/env python3

- """Consolidate each po files into compendium"""

+ """ Generate static asciidoc pages from generated statistics """

  

  import argparse

  import datetime
@@ -15,10 +15,31 @@ 

  import utils

  

  

+ def get_territories_for_language(language_name: str, cldr_languages: dict) -> list:

+     """ Deduce the likely territories for a language code from CLDR languageData """

+     log = logging.getLogger("buildWebsite.get_territories_for_language")

+ 

+     code = language_name.split("_", 1)[0]  # ro_MD or zh_Hant_HK

+     code = code.split("@", 1)[0]  # ca@valencia

+ 

+     territories = cldr_languages.get(code, {}).get("_territories", [])

+     territories = territories + cldr_languages.get(code + "-alt-secondary", {}).get("_territories", [])

+ 

+     # if the language name contains a territory code, then keep only that one

+     if len(language_name.split("_")) > 1:

+         if language_name.split("_")[-1] in territories:

+             territories = [language_name.split("_")[-1]]

+ 

+     if len(territories) == 0:

+         territories = ["not-found-in-cldr"]

+         log.warning(f"The language {code} does not exist in territories data from CLDR")

+ 

+     return territories

+ 

+ 

  def main():

      """Handle params"""

  

-     parser = argparse.ArgumentParser(description="")

+     parser = argparse.ArgumentParser(description="Generate static asciidoc pages from generated statistics")

  

      parser.add_argument(

          "--results",
@@ -45,26 +66,20 @@ 

      utils.set_logging(args.verbose, args.results)

      log = logging.getLogger("buildWebsite")

  

-     results_folder = "./results/{v}/".format(v=args.results)

-     langs_log = os.path.join(results_folder, "build_language_list.log")

-     langs_stats = os.path.join(results_folder, "languages-stats")

-     packages_stats = os.path.join(results_folder, "packages-stats")

- 

-     data_langs_folder = os.path.join(results_folder, "languages-website")

-     data_pkgs_folder = os.path.join(results_folder, "packages-website")

+     results_folder = f"./results/{args.results}/"

+     langs_stats = os.path.join(results_folder, "languages")

+     packages_stats = os.path.join(results_folder, "packages")

  

      tm_folder = os.path.join(results_folder, "languages-tm")

  

-     static_folder = "./website/content/{v}/".format(v=args.results)

+     static_folder = f"./website/content/{args.results}/"

      static_territories_folder = "./website/content/territories"

      static_langs_folder = os.path.join(static_folder, "language")

      static_pkgs_folder = os.path.join(static_folder, "package")

-     static_tm_folder = "./website/static/{v}/".format(v=args.results)

+     static_tm_folder = f"./website/static/{args.results}/"

  

      # clean destination folders

      for folder in [

-         data_langs_folder,

-         data_pkgs_folder,

          static_langs_folder,

          static_pkgs_folder,

          static_tm_folder,
@@ -77,61 +92,10 @@ 

      log.info("Get distribution stats")

      distribution_stats = json.load(open(os.path.join(results_folder, "release.json")))

  

-     log.info("Prepare json files for packages")

-     packages = [

-         d

-         for d in os.listdir(packages_stats)

-         if os.path.isfile(os.path.join(packages_stats, d))

-     ]

-     log_files = pd.read_csv(langs_log, header=None, skipinitialspace=True)

-     log_files = log_files.iloc[:, [0, 4]]

-     log_files.columns = ["filename", "lang_code"]

- 

-     packages_langs_results = dict()

-     count = 0

-     total = len(packages)

-     for package in sorted(packages):

-         count += 1

-         log.debug("Preparing package {c}/{t} - {p}".format(c=count, t=total, p=package))

-         package_name = package[: -len(".json")]

-         package_statistics_file = os.path.join(data_pkgs_folder, package_name + ".json")

-         file_stats = os.path.join(packages_stats, package_name + ".json")

- 

-         results = consolidate_package_stats(file_stats, log_files, os.path.join(results_folder, "package", package_name))

-         store_json_file(results, package_statistics_file)

-         for lang in results.get("stats", []):

-             val = packages_langs_results.get(lang["lang_code"], [])

-             val.append(

-                 {

-                     "name": package_name,

-                     "progress": lang["progress"],

-                     "translated": lang["translated"],

-                     "team": lang["team"],

-                 }

-             )

-             packages_langs_results[lang["lang_code"]] = val

- 

-     log.info("Prepare json files for languages")

-     languages = [

-         f

-         for f in os.listdir(langs_stats)

-         if os.path.isfile(os.path.join(langs_stats, f))

-     ]

-     for lang in sorted(languages):

-         if lang.endswith(".json"):

-             code = lang[: -len(".json")]

-             package_statistics_file = os.path.join(data_langs_folder, code + ".json")

- 

-             if os.path.isfile(package_statistics_file):

-                 continue

- 

-             results = consolidate_language_stats(os.path.join(langs_stats, lang), distribution_stats)

-             results["packages"] = packages_langs_results.get(code, dict())

-             store_json_file(results, package_statistics_file)

- 

      log.info("Load CLDR data")

      with open("CLDR-raw/languageData.json", "r") as read_file:

          cldr_languages = json.load(read_file)

+         cldr_version = cldr_languages["supplemental"]["version"]["_cldrVersion"]

          cldr_languages = cldr_languages["supplemental"]["languageData"]

  

      with open("CLDR-raw/territories.json", "r") as read_file:
@@ -145,53 +109,38 @@ 

      log.info("Generate static content for languages")

      languages = [

          f

-         for f in os.listdir(data_langs_folder)

-         if os.path.isfile(os.path.join(data_langs_folder, f))

+         for f in os.listdir(langs_stats)

+         if os.path.isfile(os.path.join(langs_stats, f))

      ]

-     for lang in sorted(languages):

-         code = lang[: -len(".json")]

-         package_statistics_file = os.path.join(static_langs_folder, code + ".adoc")

+     for language_file in sorted(languages):

+         language = language_file[: -len(".json")]

+         stats_file = os.path.join(langs_stats, language_file)

+         destination_file = os.path.join(static_langs_folder, f"{language}.adoc")

  

-         if os.path.isfile(package_statistics_file):

-             continue

- 

-         with open(os.path.join(data_langs_folder, lang), "r") as read_file:

+         with open(stats_file, "r") as read_file:

              content = json.load(read_file)

  

-         pd.DataFrame(content["packages"]).to_csv(os.path.join(static_tm_folder, f"{code}.csv"), index=False)

- 

-         cldr_code = code.split("_", 1)[0]  # ro_MD or zh_Hant_HK

-         cldr_code = cldr_code.split("@", 1)[0]  # ca@valencia

- 

-         territories = cldr_languages.get(cldr_code, {}).get("_territories", []) \

-                       + cldr_languages.get(cldr_code + "-alt-secondary", {}).get("_territories", [])

- 

-         # if language contains a territory code, then only keep this one

-         if len(code.split("_")) > 1:

-             if code.split("_")[-1] in territories:

-                 territories = [code.split("_")[-1]]

+         df = pd.DataFrame.from_dict(content["packages"], orient="index")

+         df.to_csv(os.path.join(static_tm_folder, f"{language}.csv"), index_label="package")

  

-         if len(territories) == 0:

-             log.warning("The language {l} does not exist in territories data from CLDR".format(l=code))

-         generate_static_pages_langs(args.results, code, content, package_statistics_file, territories, tm_folder, static_tm_folder)

+         territories = get_territories_for_language(language, cldr_languages)

+         generate_static_pages_langs(args.results, language, content, destination_file, territories, tm_folder, static_tm_folder)

  

      log.info("Generate static content for packages")

      packages = [

          f

-         for f in os.listdir(data_pkgs_folder)

-         if os.path.isfile(os.path.join(data_pkgs_folder, f))

+         for f in os.listdir(packages_stats)

+         if os.path.isdir(os.path.join(packages_stats, f))

      ]

      for package in sorted(packages):

-         code = package[: -len(".json")]

-         package_statistics_file = os.path.join(static_pkgs_folder, code + ".adoc")

+         stats_file = os.path.join(packages_stats, package, "stats.json")

+         destination_file = os.path.join(static_pkgs_folder, f"{package}.adoc")

  

-         if os.path.isfile(package_statistics_file):

-             continue

- 

-         with open(os.path.join(data_pkgs_folder, package), "r") as read_file:

+         with open(stats_file, "r") as read_file:

              content = json.load(read_file)

  

-         generate_static_pages_packages(args.results, code, content, package_statistics_file)

+         generate_static_pages_packages(args.results, package, content, destination_file)

  

      log.info("Generating indexes")

      package_statistics_file = os.path.join(static_folder, "_index.adoc")
@@ -207,7 +156,7 @@ 

          # prevent containers and alternative names to be included

          if code in cldr_territories_info.keys():

              package_statistics_file = os.path.join(static_territories_folder, code, "_index.adoc")

-             generate_territory_index(package_statistics_file, cldr_territories[code], code, cldr_territories_info.get(code, {}))

+             generate_territory_index(package_statistics_file, cldr_territories[code], code, cldr_territories_info.get(code, {}), cldr_version)

  

      log.info("Copy translation memories")

      languages = [
@@ -222,172 +171,8 @@ 

      log.info("done")

  

  

- def consolidate_language_stats(stats_file, distribution_stats):

-     """ From a CSV file, return key indicators """

-     log = logging.getLogger("buildWebsite.consolidate_language_stats")

-     results = dict()

-     total_words_distrib = distribution_stats.get("totalsourcewords", 0)

- 

-     fieldnames = {

-         "filename": "str",

-         "translatedsourcewords": "int",

-         "fuzzysourcewords": "int",

-         "untranslatedsourcewords": "int",

-         "translated": "int",

-         "fuzzy": "int",

-         "untranslated": "int",

-         "translatedtargetwords": "int",

-         "team": "str",

-         "totalsourcewords": "int",

-     }

- 

-     stats_df = pd.read_json(stats_file, orient="index")

-     stats_df.fillna(0, inplace=True)

-     stats_df.reset_index(level=0, inplace=True)

-     stats_df["totalsourcewords"] = (

-             stats_df["untranslatedsourcewords"] + stats_df["translatedsourcewords"]

-     )

-     stats_df.columns = fieldnames.keys()

- 

-     stats_df["package"] = stats_df["filename"].str.split("/", expand=True)[4]

- 

-     results["packages"] = stats_df["package"].unique().tolist()

-     results["progress"] = round(

-         stats_df["translatedsourcewords"].sum()

-         / stats_df["totalsourcewords"].sum()

-         * 100,

-         1,

-     )

-     results["progress_d"] = round(

-         stats_df["translatedsourcewords"].sum() / total_words_distrib * 100, 1

-     )

-     results["totalsourcewords_d"] = total_words_distrib

- 

-     for kpi in ["totalsourcewords", "translatedsourcewords"]:

-         results[kpi + "sum"] = int(stats_df[kpi].sum())

- 

-     return results

- 

- 

- def consolidate_package_stats(stats_file, log_files, package_folder):

-     """ From a CSV file, return key indicators """

-     log = logging.getLogger("buildWebsite.consolidate_package_stats")

-     results = dict()

- 

-     fieldnames = {

-         "filename": "str",

-         "translatedsourcewords": "int",

-         "fuzzysourcewords": "int",

-         "untranslatedsourcewords": "int",

-         "translated": "int",

-         "fuzzy": "int",

-         "untranslated": "int",

-         "translatedtargetwords": "int",

-         "team": "str",

-         "totalsourcewords": "int",

-     }

- 

-     _json = json.load(open(stats_file))

-     dfs = []

-     total_source_words = 0

- 

-     for template in _json.keys():

-         tmp_df = pd.DataFrame.from_dict(_json.get(template), orient="index")

-         tmp_df.fillna(0, inplace=True)

-         tmp_df.reset_index(level=0, inplace=True)

- 

-         # sometimes, no file were found, which means no stats can be used

-         if len(tmp_df) == 0:

-             log.debug(" The template {t} for {f} is empty".format(t=template, f=stats_file))

-             continue

- 

-         tmp_df["totalsourcewords"] = (

-                 tmp_df["untranslatedsourcewords"] + tmp_df["translatedsourcewords"]

-         )

-         tmp_df.columns = fieldnames.keys()

- 

-         total_source_words += max(tmp_df["totalsourcewords"])

- 

-         dfs.append(tmp_df)

- 

-     if len(dfs) > 1:

-         stats_df = pd.concat(dfs)

-     elif len(dfs) == 0:

-         log.debug("There is no stats for {f}".format(f=stats_file))

-         return results

-     else:

-         stats_df = dfs[0]

- 

-     stats_df_w_lang = pd.merge(stats_df, log_files, how="inner", on="filename")

-     stats_df_no_lang = pd.merge(stats_df, log_files, how="outer", indicator=True).loc[

-         lambda x: x["_merge"] == "left_only"

-     ]

- 

-     stats_df_w_lang["filename"] = stats_df_w_lang["filename"].apply(

-         lambda s: s[len(package_folder) + 2:]

-     )

- 

-     temp_translated = (

-         stats_df_w_lang.groupby(["lang_code"])

-             .agg(

-             {

-                 "translatedsourcewords": ["sum"],

-             }

-         )

-             .reset_index()

-             .droplevel(1, axis=1)

-     )

- 

-     temp_teams = stats_df_w_lang.groupby("lang_code")["team"].apply(

-         lambda x: ", ".join(x.drop_duplicates())

-     )

-     temp_files = stats_df_w_lang.groupby("lang_code")["filename"].apply(

-         lambda x: ",".join(x)

-     )

-     temp_bis = pd.merge(temp_teams, temp_files, how="inner", on="lang_code")

-     temp = pd.merge(temp_translated, temp_bis, how="inner", on="lang_code").to_dict(

-         orient="records"

-     )

- 

-     for line in temp:

-         line["progress"] = 0

-         line["translated"] = line["translatedsourcewords"]

- 

-         if total_source_words == 0:

-             log.info(

-                 " File {f} for file has translatedsourcewords = 0 in line {l}".format(

-                     f=stats_file, l=line

-                 )

-             )

-             line["progress"] = 0

-             continue

-         try:

-             line["progress"] = round(

-                 (int(line["translatedsourcewords"]) / total_source_words) * 100

-             )

-         except OverflowError:

-             log.info(

-                 " File {f} has Translated={t} and Source={tot}".format(

-                     f=stats_file,

-                     t=line["translatedsourcewords"],

-                     tot=total_source_words,

-                 )

-             )

- 

-         line["filename"] = line["filename"].split(",")

- 

-     results["stats"] = list()

-     for line in sorted(temp, key=lambda k: k["progress"], reverse=True):

-         del line["translatedsourcewords"]

-         results["stats"].append(line)

- 

-     results["totalsourcewords"] = total_source_words

-     results["no_languages"] = stats_df_no_lang["filename"].tolist()

- 

-     return results

- 

- 

  def generate_static_pages_langs(results: str, code: str, content: dict, destination_file: str, territories: list[str], tm_folder: str, static_tm_folder: str) -> None:

+     """ Aggregate info and call language template """

      log = logging.getLogger("buildWebsite.generate_static_pages_langs")

      data = content

      data["lang_name_en"] = langtable.language_name(
@@ -413,17 +198,36 @@ 

      apply_jinja_template(data, destination_file, "language.adoc")

  

  

- def generate_static_pages_packages(results, code, content, destination_file):

+ def generate_static_pages_packages(release: str, package: str, statistics: dict, destination_file: str) -> None:

+     """ Aggregate info and call package template """

      log = logging.getLogger("buildWebsite.generate_static_pages_packages")

-     data = content

-     data["results"] = results

-     data["package"] = code

+     data = statistics

+     data["results"] = release

+     data["package"] = package

      data["now"] = datetime.datetime.utcnow()

  

+     # in some rare cases, a package may have no translation progress

+     if "stats" not in statistics.keys():

+         data["stats"] = {}

+         data["stats"]["languages"] = {}

+ 

+     if "error" in data["stats"]["languages"].keys():

+         data["started_languages"] = len(data["stats"]["languages"]) - 1

+         data["no_languages"] = len(data["stats"]["languages"]["error"]["filename"].split("./")) - 1

+     else:

+         data["started_languages"] = len(data["stats"]["languages"])

+         data["no_languages"] = 0

+ 

+     # remove local path

+     for lang in data["stats"]["languages"].keys():

+         path = f"./results/{release}/packages/{package}/"

+         data["stats"]["languages"][lang]["filename"] = data["stats"]["languages"][lang]["filename"].replace(path, " ")

+ 

      apply_jinja_template(data, destination_file, "package.adoc")

  

  

- def generate_release_index(release, destination_file, data):

+ def generate_release_index(release: str, destination_file: str, data: dict) -> None:

+     """ Aggregate info and call release index template """

      log = logging.getLogger("buildWebsite.generate_release_index")

      data["release"] = release

      data["now"] = datetime.datetime.utcnow()
@@ -431,7 +235,8 @@ 

      apply_jinja_template(data, destination_file, "_index.release.adoc")

  

  

- def generate_language_index(release, destination_file):

+ def generate_language_index(release: str, destination_file: str) -> None:

+     """ Aggregate info and call language index template """

      log = logging.getLogger("buildWebsite.generate_language_index")

      data = dict()

      data["release"] = release
@@ -440,7 +245,8 @@ 

      apply_jinja_template(data, destination_file, "_index.language.adoc")

  

  

- def generate_package_index(distribution, destination_file):

+ def generate_package_index(distribution: str, destination_file: str) -> None:

+     """ Aggregate info and call package index template """

      log = logging.getLogger("buildWebsite.generate_package_index")

      data = dict()

      data["distribution"] = distribution
@@ -449,26 +255,34 @@ 

      apply_jinja_template(data, destination_file, "_index.package.adoc")

  

  

- def generate_territory_index(destination_file: str, name: list[str], code: str, data: dict):

+ def generate_territory_index(destination_file: str, name: list[str], code: str, data: dict, cldr_version: str) -> None:

+     """ Aggregate info and call territory index template """

      log = logging.getLogger("buildWebsite.generate_territory_index")

      data["name"] = name

      data["code"] = code

+     data["cldr_version"] = cldr_version

  

      apply_jinja_template(data, destination_file, "_index.territory.adoc")

  

  

- def store_json_file(content, destination_file):

+ def store_json_file(content: dict, destination_file: str) -> None:

+     """ Store a json file"""

      with open(destination_file, "w") as f:

          f.write(json.dumps(content, indent=2))

  

  

- def apply_jinja_template(data: dict, destination_file: str, template_file: str):

+ def apply_jinja_template(data: dict, destination_file: str, template_file: str) -> None:

+     """ Call a jinja template with a data dictionary """

      os.makedirs(os.path.dirname(os.path.abspath(destination_file)), exist_ok=True)

  

      template_loader = jinja2.FileSystemLoader(searchpath="./templates/")

      template_env = jinja2.Environment(loader=template_loader, undefined=jinja2.Undefined)

      template = template_env.get_template(template_file)

-     output_text = template.render(data)

+     try:

+         output_text = template.render(data)

+     except jinja2.exceptions.UndefinedError as e:

+         logging.error(f"Error with {destination_file}: {e}")

+         raise

  

      with open(destination_file, "w") as write_out:

          write_out.write(output_text)
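
A note on the try/except added above: with undefined=jinja2.Undefined (the default), a variable that is merely missing renders as an empty string; jinja2.exceptions.UndefinedError only fires when the template actually operates on the undefined value (arithmetic, iteration, and so on). A minimal sketch of the failure mode the new logging surfaces, using a made-up inline template:

    import jinja2

    env = jinja2.Environment(undefined=jinja2.Undefined)
    template = env.from_string("started: {{ started_languages + 0 }}")

    try:
        template.render({})  # "started_languages" was never put in the data dict
    except jinja2.exceptions.UndefinedError as err:
        # apply_jinja_template now logs the destination file, then re-raises.
        print(f"would be logged: {err}")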

file modified
-3
@@ -26,9 +26,6 @@ 

        /src/build_language_list.py --results "$results"

  

      podman run -it --rm -v ./:/src:z -v "$WORK_DIR"/results:/src/results:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G -e DNF_CONF=/src/"$dnf_conf" fedlocstats:latest \

-       /src/build_language_list.py --results "$results" --analyzealllang

- 

-     podman run -it --rm -v ./:/src:z -v "$WORK_DIR"/results:/src/results:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G -e DNF_CONF=/src/"$dnf_conf" fedlocstats:latest \

        /src/build_tm.py --results "$results" --compress

  

      podman run -it --rm -v ./:/src:z -v "$WORK_DIR"/results:/src/results:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G -e DNF_CONF=/src/"$dnf_conf" fedlocstats:latest \

file modified
+2
@@ -5,3 +5,5 @@ 

  weblate-language-data

  langtable

  translate-toolkit

+ 

+ numpyencoder

\ No newline at end of file

file modified
-3
@@ -20,9 +20,6 @@ 

  # ~ 18 m

  ./build_language_list.py --results "$results"

  

- # ~ 18 m

- ./build_language_list.py --results "$results" --analyzealllang

- 

  # Creates useful translator files for every language

  # ~ 3 h 00

  LANG=C ./build_tm.py --results "$results" --compress

@@ -5,14 +5,14 @@ 

  ---

  

  Fedora {{ release }}::

- * contains {{ total_release_packages }} packages,

- * we identified {{ total_packages }} packages with translations files,

- * it represents {{ total_packages_files }} translations files (po).

+ * contains {{ packages_count }} packages,

+ * we identified {{ packages_detected_count }} packages with translation files,

+ * it represents {{ files_detected_count }} translation files (po).

  

  What we were able to process::

- * {{ total_packages_with_stats }} packages,

- * {{ nb_files }} translation files containing {{ totalsourcewords }} words to translate,

- * {{ total_languages }} languages.

+ * {{ packages_processed_count }} packages,

+ * {{ files_processed_count }} translation files containing {{ totalsourcewords }} words to translate,

+ * {{ languages_processed_count }} languages.

  

  Why such gaps?::

  . File reading was not possible (encoding or format issue),

@@ -1,7 +1,7 @@ 

  ---

  title: "{{ code }} {{ name }}"

  ---

- Data coming from Unicode consortium (CLDR 38):

+ Data coming from the Unicode Consortium (CLDR {{ cldr_version }}):

  

  * Population: {{ _population }}

  * Literacy percent: {{_literacyPercent}}

file modified
+14 -11
@@ -4,7 +4,8 @@ 

  code: {{ lang_code }}

  name_english: {{ lang_name_en }}

  name_local: {{ lang_name_local }}

- progress_d: {{ progress_d }}

+ progress: {{ '{:.2f}'.format(progress) }}

+ progress_d: {{ '{:.2f}'.format(progress_d) }}

  release: {{ results }}

  {%- if territories %}

  territories:
@@ -16,14 +17,14 @@ 

  

  Language progress for {{ lang_name_en }} ({{ lang_code }}) in Fedora {{ results }} is:

  

- * {{ progress }}% when we only look on started packages for this language.

- * {{ progress_d }}% when we compare to every single translatable string in Fedora {{ results }}.

+ * {{ '{:.2f}'.format(progress) }}% when we only look at started packages for this language.

+ * {{ '{:.2f}'.format(progress_d) }}% when we compare to every single translatable string in Fedora {{ results }}.

  

  Possible scripts are: {% for script in scripts -%}{{ script }} {%- endfor %}

  

  * Total translatable strings in Fedora {{ results }}: {{ totalsourcewords_d }}

  * Source words to translate in started packages: {{ totalsourcewordssum }}

- * Translated words: {{ translatedsourcewordssum }}

+ * Translated words: {{ translatedsourcewords }}

  

  Download:

  
@@ -34,17 +35,19 @@ 

  

  Packages:

  

- [cols="1a,1,1,3", options="header"]

+ [cols="1a,1,1,1,3", options="header"]

  |===

  | Name

  | Translated words

- | Progress

- | Language team

+ | Total source words

+ | Progress (%)

+ | Language teams

  

  {% for package in packages -%}

- | link:{{ '{{' }}< ref "/{{ results }}/package/{{ package.name }}.adoc" >{{ '}}' }}[{{ package.name }}]

- >| {{ package.translated }}

- >| {{ package.progress }}

- | {{ package.team }}

+ | link:{{ '{{' }}< ref "/{{ results }}/package/{{ package }}.adoc" >{{ '}}' }}[{{ package }}]

+ >| {{ packages[package].translatedsourcewords }}

+ >| {{ packages[package].totalsourcewordssum }}

+ >| {{ '{:.1f}'.format(packages[package].progress) }}

+ | {{ packages[package].team }}

  {% endfor %}

  |=== 

\ No newline at end of file
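
For reference, a quick sketch of the '{:.2f}'.format(...) idiom the template now uses for percentages; Jinja exposes Python's str.format, so a float renders with two decimals (the value below is made up):

    import jinja2

    line = jinja2.Environment().from_string(
        "* {{ '{:.2f}'.format(progress) }}% when we only look at started packages"
    )
    print(line.render(progress=85.4321))
    # * 85.43% when we only look at started packages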

file modified
+14 -21
@@ -1,37 +1,30 @@ 

  ---

  title: "{{ package }}"

  date: {{ now }}

- started_languages: {{ stats|length }}

- no_languages: {{ no_languages|length }}

+ started_languages: {{ started_languages }}

+ no_languages: {{ no_languages }}

  ---

  The package {{ package }}:

  

  * represents {{ totalsourcewords }} source words to be translated,

- * is translated into {{ stats|length }} languages in Fedora {{ results }},

- * contains {{ no_languages|length }} files for which no languages could be deducted.

+ * is translated into {{ stats.languages|length }} languages in Fedora {{ results }},

+ * contains {{ no_languages }} files for which no language could be deduced.

  

- [cols="1a,1,1,3a", options="header"]

+ [cols="1a,1,1,1,3", options="header"]

  |===

  | Language

  | Translated words

- | Progress

+ | Total source words

+ | Progress (%)

  | Files

  

- {% for stat in stats|sort(attribute="lang_code") -%}

- | link:{{ '{{' }}< ref "/{{ results }}/language/{{ stat.lang_code }}.adoc" >{{ '}}' }}[{{ stat.lang_code }}]

- >| {{ stat.translated }}

- >| {{ stat.progress }}

- | {% for file in stat.filename -%}{{ file }}{{ " " }}{%- endfor %}

+ {% for stat in stats.languages|sort -%}

+ | link:{{ '{{' }}< ref "/{{ results }}/language/{{ stat }}.adoc" >{{ '}}' }}[{{ stat }}]

+ >| {{ stats.languages[stat].translatedsourcewords }}

+ >| {{ stats.languages[stat].totalsourcewordssum }}

+ >| {{ '{:.1f}'.format(stats.languages[stat].progress) }}

+ | {{ stats.languages[stat].filename }}

+ 

  {% endfor %}

  

  |===

- 

- ## Errors

- {% if no_languages %}

- List of files for which language detection were impossible:

- {% for missing in no_languages -%}

- * {{ missing }}

- {% endfor %}

- {% else %}

- None

- {% endif %}
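
One behavior worth noting in the loop above: applying Jinja's sort filter to a dict iterates its keys, so stats.languages|sort yields the language codes alphabetically. A minimal sketch with dummy data:

    import jinja2

    tmpl = jinja2.Environment().from_string(
        "{% for lang in stats.languages|sort %}{{ lang }} {% endfor %}"
    )
    print(tmpl.render(stats={"languages": {"fr": {}, "de": {}, "ja": {}}}))
    # de fr ja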

file modified
+9 -8
@@ -11,17 +11,18 @@ 

  

  # build_tm.py

  

- Detecting missing files

- - en-compendium is missing

- - error-compendium is missing

- - gl-compendium is missing

- - nb_no-compendium is missing

- - sk-compendium is missing

- - zh_hant-compendium is missing

+ move error detection (check_lang) into %language%/stats.json and display errors

+ move error files into %language%/stats.json and make these accessible via the website

+ remove terminology (someone who wants it can do it locally)

  

  # build_stats.py

  

- roxterm triggers an error

+ when %package%/stats.json is empty, make sure it is counted as an existing package for which we were not able to extract anything (release stats)

+ 

+ # website

+ 

+ list why we could not deduce the language of error files

+ allow sorting on all tables

  

  # global

  

@@ -12,7 +12,7 @@ 

            <th onclick="sortTable(0)">code</th>

            <th onclick="sortTable(1)">English name</th>

            <th onclick="sortTable(2)">Local name</th>

-           <th onclick="sortTable(3)">Progress</th>

+           <th onclick="sortTable(3)">Progress (%)</th>

        </tr>

        {{ range sort .Pages "Title" "asc" }}

          <tr>

move all stats generation into build_stats.py to simplify code (and increase performance)

1 new commit added

  • make build_tm use the new file
9 months ago

2 new commits added

  • use f-strings
  • merge all stats in one file
9 months ago

1 new commit added

  • document staging in readme
9 months ago

3 new commits added

  • support empty package stats
  • remove package stats in build_website
  • improve documentation
9 months ago

1 new commit added

  • pivot per language csv file content
9 months ago

1 new commit added

  • display file names as string
9 months ago

1 new commit added

  • display progress as percents
8 months ago

2 new commits added

  • add fake territory languages not in CLDR
  • fix territories
8 months ago

1 new commit added

  • website: package: compute metrics in build_website
8 months ago

1 new commit added

  • website: display progress as percents
8 months ago

1 new commit added

  • adapt release wide stats
8 months ago

Pull-Request has been merged by jibecfed

8 months ago