#12 global rewrite and stats computation
Merged 3 years ago by jibecfed. Opened 3 years ago by jibecfed.

file modified
-2
@@ -64,8 +64,6 @@ 

  Detect the list of languages

  Aggregate all files for a language and produce a compendium, a terminology and a translation memory.

  

- TODO: language detection should probably be a in a dedicated build file.

- 

  # Output files

  

  * `0.error.language not in cldr.csv` contains unknown languages (lines are removed)
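
The per-language aggregation described above is what the reworked `build_tm.py` drives through gettext and translate-toolkit. A minimal sketch of that step, assuming the f32 results layout used in this pull request (the `fr` code, the `languages/fr.json` file with its `"po"` key and the `languages-tm/` output folder are illustrative, not taken verbatim from the PR):

```python
#!/usr/bin/env python3
# Minimal sketch of the per-language aggregation: one compendium,
# one TMX translation memory and one terminology file per language.
# Paths and the "po" key follow this PR's results layout (illustrative).
import json
import os
import subprocess

lang = "fr"
tm_folder = "./results/f32/languages-tm/"
os.makedirs(tm_folder, exist_ok=True)

with open("./results/f32/languages/{}.json".format(lang)) as f:
    po_files = json.load(f)["po"]

compendium = os.path.join(tm_folder, lang + ".po")
tmx_file = os.path.join(tm_folder, lang + ".tmx")
terminology_file = os.path.join(tm_folder, lang + ".terminology.po")

# compendium: concatenate every po file detected for the language (gettext)
subprocess.run(["msgcat", "--output-file", compendium] + po_files, check=True)

# translation memory: convert the compendium to TMX (translate-toolkit)
subprocess.run(["po2tmx", "--language=" + lang, "--progress=none",
                compendium, "--output=" + tmx_file], check=True)

# terminology: extract recurring terms from the compendium
subprocess.run(["poterminology", "--ignore-case", "--fold-titlecase",
                "--inputs-needed", "1", "--progress=none",
                compendium, "--output=" + terminology_file], check=True)
```

`build_tm.py` additionally runs `msguniq` on each file first (retrying with `--to-code utf-8` on encoding errors) before `msgcat`, which this sketch skips.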

file modified
+44 -41
@@ -1,8 +1,10 @@ 

  #!/usr/bin/env python3

+ 

  # For each package in the src.rpms folder:

  #   extract srpm

  #   run the translation_finder

- # Then, concat csv files

+ #   copy translation files

+ 

  import argparse

  import dnf

  import json
@@ -26,13 +28,13 @@ 

  

      parser = argparse.ArgumentParser(

          description="Computes stats for each srpm detected")

-     parser.add_argument("filter", default=None, nargs='?',

+     parser.add_argument("filter", default=None, nargs="?",

                          help="package name filter (regex)")

      parser.add_argument("-k", "--keep-srpms", default=False,

-                         action='store_true', dest='keep',

+                         action="store_true", dest="keep",

                          help="Keep SRPMs in /srpms")

      parser.add_argument("-f", "--force", default=False,

-                         action='store_true', dest='force',

+                         action="store_true", dest="force",

                          help="Ignore past progression state")

      args = parser.parse_args()

  
@@ -41,16 +43,13 @@ 

          srpm_regex = re.compile("^{}$".format(args.filter))

  

      (distname, distrel, distid) = distro.linux_distribution()

-     result_folder = "./results/f{v}/stats/".format(v=distrel)

      packages_folder = "./results/f{v}/packages/".format(v=distrel)

      srpms_path = "/srpms"

  

-     if not os.path.exists(result_folder):

-         os.makedirs(result_folder)

      if not os.path.exists(packages_folder):

          os.makedirs(packages_folder)

  

-     processing_file = os.path.join("./results/f{v}/".format(v=distrel), "data.json")

+     data_file = os.path.join("./results/f{v}/".format(v=distrel), "data.json")

      srpm_list_file = os.path.join(srpms_path, "srpm.txt")

      url_list = None

  
@@ -63,18 +62,18 @@ 

  

      if not url_list:

          print("Fetching SRPMs url list")

-         p = subprocess.Popen('dnf download --source --skip-broken --url "*" | grep src.rpm',

+         p = subprocess.Popen("dnf download --source --skip-broken --url '*' | grep src.rpm",

                               stdout=subprocess.PIPE,

                               shell=True)

  

          urls = str(p.stdout.read(), "utf-8")

-         with open(srpm_list_file, 'w') as f:

+         with open(srpm_list_file, "w") as f:

              f.write(urls)

          url_list = urls.splitlines()

  

      # Load processing data, if any

      try:

-         with open(processing_file) as f:

+         with open(data_file) as f:

              data = json.load(f)

      except BaseException:

          data = {}
@@ -106,25 +105,24 @@ 

                      (known_package.epoch,

                       known_package.version,

                       known_package.release)) <= 0:

-                     print("{c}/{t} skipping already processed {n}".format(

+                     print(" {c}/{t} skipping already processed {n}".format(

                          c=count, t=total_urls, n=package.name))

                      continue

  

-             print("{c}/{t} processing {n}".format(

+             print(" {c}/{t} processing {n}".format(

                  c=count, t=total_urls, n=package.name))

  

              srpm_path = os.path.join(srpms_path, srpm_filename)

              if not os.path.isfile(srpm_path):

-                 print("downloading {}".format(srpm_filename))

                  if url.scheme == "rsync":

                      dl = subprocess.run(

-                             ['rsync', url.geturl(), srpms_path],

+                             ["rsync", url.geturl(), srpms_path],

                              stdin=subprocess.PIPE,

                              stdout=subprocess.PIPE,

                              stderr=subprocess.STDOUT)

                  else:

                      dl = subprocess.run(

-                             ['curl', '-L', '--remote-name', url.geturl()],

+                             ["curl", "-L", "--remote-name", url.geturl()],

                              stdin=subprocess.PIPE,

                              stdout=subprocess.PIPE,

                              stderr=subprocess.STDOUT,
@@ -135,9 +133,9 @@ 

                      print(dl.stdout)

                      continue

  

-             extract_srpm(tmp, srpm_path, result_folder)

+             extract_srpm(tmp, srpm_path, packages_folder)

              (tsearch, tcopy, results) = discover_translations(

-                 tmp, package.name, result_folder, packages_folder)

+                 tmp, package.name, packages_folder)

  

              if not args.keep:

                  os.unlink(srpm_path)
@@ -149,24 +147,25 @@ 

                  "tcopy": tcopy,

                  "results": results}

  

-             with open(processing_file, "w") as f:

+             with open(data_file, "w") as f:

                  json.dump(data, f, indent=2)

-             print("")

  

  

- def extract_srpm(tmp, name, result_folder):

+ def extract_srpm(tmp, name, dest_folder):

      """extract srpm page"""

-     print("extract_srpm: " + name)

  

-     with open(result_folder + '/_srpm.out.txt', 'a') as out:

-         with open(result_folder + '_srpm.errors.txt', 'a') as error:

-             subprocess.run(['./extract_srpm.sh', tmp, name],

+     srpm_log_file = os.path.join(dest_folder, "..", "extract_srpm.log")

+     srpm_error_file = os.path.join(dest_folder, "..", "extract_srpm_error.log")

+ 

+     with open(srpm_log_file, "a") as out:

+         with open(srpm_error_file, "a") as error:

+             subprocess.run(["./extract_srpm.sh", tmp, name],

                             stdout=out, stderr=error, check=True)

  

  

- def discover_translations(tmp, pkg_name, result_folder, packages_folder):

+ def discover_translations(tmp, pkg_name, dest_folder):

      """find po file"""

-     print("discover_translations: " + tmp)

+ 

      translation_files = []

      tsearch = 0

      tcopy = 0
@@ -176,7 +175,7 @@ 

      try:

          translation_files = discover(tmp)

      except OSError:

-         with open(result_folder + "/errors.txt", "a") as file:

+         with open(dest_folder + "/errors.txt", "a") as file:

              file.write(pkg_name + " on discover_translations\n")

  

      tsearch = round(time.time() - tsearch, 1)
@@ -184,14 +183,14 @@ 

      tcopy = time.time()

  

      if translation_files:

-         if not os.path.exists(os.path.join(packages_folder, pkg_name)):

-             os.makedirs(os.path.join(packages_folder, pkg_name))

+         if not os.path.exists(os.path.join(dest_folder, pkg_name)):

+             os.makedirs(os.path.join(dest_folder, pkg_name))

  

-         with open(os.path.join(packages_folder, pkg_name, "discover.json"), 'w') as f:

+         with open(os.path.join(dest_folder, pkg_name, "discover.json"), "w") as f:

              f.write(json.dumps(translation_files, indent=2))

  

          for translation in translation_files:

-             copy_translations(tmp, translation, pkg_name, result_folder, packages_folder)

+             copy_translations(tmp, translation, pkg_name, dest_folder)

  

      tcopy = round(time.time() - tcopy, 1)

  
@@ -202,19 +201,23 @@ 

      return (tsearch, tcopy, cresults)

  

  

- def copy_translations(tmp, translation, pkg_name, result_folder, packages_folder):

+ def copy_translations(tmp, translation, pkg_name, dest_folder):

      filemask = translation["filemask"]

  

-     print("copy translations " + filemask)

+     if translation["file_format"] in ["po", "json", "json-nested"]:

+         files = glob.glob(os.path.join(tmp, filemask))

+ 

+         if "template" in translation.keys():

+             files.append(os.path.join(tmp, translation["template"]))

  

-     if translation["file_format"] in ["po", "json"]:

-         for po in glob.glob(tmp + "/" + filemask):

-             dest = packages_folder + "/" + pkg_name + "/" + po.replace(tmp, "")

-             dest_folder = dest.replace(os.path.basename(dest), "")

-             os.makedirs(dest_folder, exist_ok=True)

+         for file in files:

+             dest = dest_folder + "/" + pkg_name + "/" + file.replace(tmp, "")

+             dest_path = dest.replace(os.path.basename(dest), "")

+             os.makedirs(dest_path, exist_ok=True)

  

              # use copyfile instead of copy2 to handle read-only files in rpm

-             copyfile(po, dest)

+             copyfile(file, dest)

+ 

  

- if __name__ == '__main__':

+ if __name__ == "__main__":

      main()

file removed
-457
@@ -1,457 +0,0 @@ 

- #!/usr/bin/env python3

- """Consolidate and clean result files"""

- 

- import argparse

- import csv

- import itertools

- import json

- import os

- import pandas

- import time

- 

- RESULT_FOLDER = ""

- 

- 

- def main():

-     """Handle params"""

- 

-     global RESULT_FOLDER

- 

-     parser = argparse.ArgumentParser(

-         description="Consolidate every result files and produce a clean concatenated update")

-     parser.add_argument("--release", required=True, type=int, default=31,

-                         choices=[30, 31, 32],

-                         help="Provide the Fedora release to analyze")

- 

-     args = parser.parse_args()

- 

-     lang_path = "./results/f{r}/languages/".format(r=args.release)

-     RESULT_FOLDER = "./results/f{r}/stats/".format(r=args.release)

-     packages_folder = "./results/f{r}/packages/".format(r=args.release)

- 

-     concat_csv(packages_folder, RESULT_FOLDER)

- 

-     file = RESULT_FOLDER + "/_concat.csv"

- 

-     # parse(file)

- 

-     langs = [f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f))]

- 

-     for lang in sorted(langs):

-         lang_code = lang[:-len('.json')]

- 

-         with open(os.path.join(lang_path, lang), "r") as read_file:

-             files = json.load(read_file)

- 

-         compute_lang(lang_code, files, RESULT_FOLDER, packages_folder)

- 

- 

- def compute_lang(lang_code, files, RESULT_FOLDER, packages_folder):

-     print("Computing: {l} ({c} files)".format(l=lang_code, c=len(files)))

-     start_time_search = time.time()

-     stats = []

-     packages = {}

- 

-     # step 1: get package lists

-     for file in files:

-         po_file = file.replace(packages_folder, "")

-         package = po_file.split("/")[0]

-         po_file = po_file.replace(package + "/", "")

-         try:

-             packages[package].append(po_file)

-         except KeyError:

-             packages[package] = list()

-             packages[package].append(po_file)

- 

-     # step 2: remove duplicates

-     for package in packages.keys():

-         packages[package] = list(set(packages[package]))

- 

-     # step 3: parse package files

-     for package in packages.keys():

-         po_files = packages[package]

-         stats_file = os.path.join(packages_folder, package, "stats.csv")

- 

-         with open(stats_file, newline='') as csvfile:

-             csv_dict_reader = csv.DictReader(csvfile)

- 

-             [stats.append([package] + list(row.values())) for row in csv_dict_reader if row["Filename"] == po_file]

- 

-     # step 4: store results

-     lang_stats_file = os.path.join(RESULT_FOLDER, lang_code + ".stats.csv")

-     with open(lang_stats_file, 'w', newline='') as csvfile:

-         spamwriter = csv.writer(csvfile)

- 

-         header = ['Package', 'Filename', ' Translated Messages', ' Translated Source Words', ' Translated Target Words', ' Fuzzy Messages', ' Fuzzy Source Words', ' Untranslated Messages', ' Untranslated Source Words', ' Total Message', ' Total Source Words', ' Review Messages', ' Review Source Words']

- 

-         spamwriter.writerow(header)

- 

-         [spamwriter.writerow(row) for row in stats]

- 

-     search_duration = round(time.time() - start_time_search, 1)

-     print(" Done in {d} seconds".format(d=search_duration))

- 

- 

- def parse(file):

-     """Call all cleaning functions"""

-     data = pandas.read_csv(file)

-     data.columns = ['src',

-                     'filename',

-                     'translatedMessages',

-                     'translatedSourceWords',

-                     'translatedTargetWords',

-                     'fuzzyMessages',

-                     'fuzzySourceWords',

-                     'untranslatedMessages',

-                     'untranslatedSourceWords',

-                     'totalMessage',

-                     'totalSourceWords',

-                     'reviewMessages',

-                     'reviewSourceWords']

- 

-     info(data, "CSV loaded")

- 

-     data = clean_0_basic(data)

- 

-     data = clean_1_dupplication(data)

-     info(data, "Deduplication is done")

- 

-     data = data.astype({

-         'translatedMessages': int,

-         'translatedSourceWords': int,

-         'translatedTargetWords': int,

-         'fuzzyMessages': int,

-         'fuzzySourceWords': int,

-         'untranslatedMessages': int,

-         'untranslatedSourceWords': int,

-         'totalMessage': int,

-         'totalSourceWords': int

-     })

- 

-     data = clean_2_remove(data)

-     info(data, "Removal is done")

- 

-     data = guess_bcp47(data)

-     info(data, "bcp47 data are guessed")

- 

-     data = clean_bcp47(data)

-     info(data, "bcp47 data are cleaned")

- 

-     data = add_cldr(data)

-     info(data, "cldr data are added")

- 

-     data = clean_cldr(data)

-     info(data, "cldr data are cleaned")

- 

-     data = check_lang_territory_consistency(data)

-     info(data, "cldr consistency are done")

- 

-     data = harmonize_totals(data)

-     info(data, "data harmonization are done")

- 

-     data = summary(data)

- 

-     store(data, "3.result.csv")

- 

- 

- def clean_0_basic(data):

-     """String cleaning and drops useless 'review' columns"""

- 

-     # remove useless spaces and use lowercase

-     for column in list(data):

-         data[column] = data[column].str.strip()

-         data[column] = data[column].str.lower()

- 

-     # these columns never have values

-     data = data.drop('reviewMessages', 1)

-     data = data.drop('reviewSourceWords', 1)

- 

-     return data

- 

- 

- def clean_1_dupplication(data):

-     """Removes duplicates"""

- 

-     # remove all headers from indivual stat result

-     data = data[data.filename != "filename"]

-     info(data, "* duplicated headers are removed")

- 

-     # remove duplicated stats from bad translation file patterns

-     data = data.drop_duplicates(['src', 'filename'], keep='last')

-     info(data, "* duplicated po files are removed")

- 

-     return data

- 

- 

- def clean_2_remove(data):

-     """Removes obvious useless strings"""

- 

-     # remove pot files

-     data = data[~(data.filename.str.endswith(".pot"))]

-     info(data, "* remove pot files")

- 

-     # remove gmo files

-     data = data[~(data.filename.str.endswith(".gmo"))]

-     info(data, "* remove gmo files")

- 

-     # remove when no result

-     store(data[data.totalMessage == 0], "1.debug.total message = 0.csv")

-     data = data[data.totalMessage != 0]

-     info(data, "* remove files with 'totalMessage'=0")

- 

-     return data

- 

- 

- def guess_bcp47(data):

-     """Guess Language, Territory and Script from filename"""

-     data['basename'] = data['filename'].apply(os.path.basename)

- 

-     data['full_lang'] = data['basename'].str.rsplit('.', 1, expand=True)[0]

- 

-     # a few lang naming are wrong

-     data.full_lang = data.full_lang.replace(

-         {'kmr_latn': 'kmr@latn', 'fr-braille': 'fr@braille', })

- 

-     data['lang'] = data['full_lang'].str.rsplit('@', 1, expand=True)[0]

- 

-     data['script'] = data['full_lang'].str.rsplit('@', 1, expand=True)[1]

- 

-     # these are just re-encoded translations

-     data = data[~(

-         data.lang.str.endswith(".big5") |

-         data.lang.str.endswith(".cp936") |

-         data.lang.str.endswith(".cp1250") |

-         data.lang.str.endswith(".cp1251") |

-         data.lang.str.endswith(".euc-jp") |

-         data.lang.str.endswith(".gb2312") |

-         data.lang.str.endswith(".sjis") |

-         data.lang.str.endswith(".utf-8")

-     )]

-     info(data, "* remove if lang endswith encoding values")

- 

-     # these are just re-encoded translations

-     store(data[data.lang.str.contains(".", regex=False)],

-           '0.error.lang with point.csv')

-     data = data[~(data.lang.str.contains(".", regex=False))]

-     info(data, "* remove if lang contains a point")

- 

-     data['language'] = data['lang'].str.rsplit('_', 1, expand=True)[0]

- 

-     data['territory'] = data['lang'].str.rsplit('_', 1, expand=True)[1]

- 

-     # store all unique values for debug

-     store(data.drop_duplicates('lang', keep='last'), "1.debug.lang.csv")

-     store(data.drop_duplicates('language', keep='last'), "1.debug.language.csv")

-     store(data.drop_duplicates('territory', keep='last'), "1.debug.territory.csv")

-     store(data.drop_duplicates('script', keep='last'), "1.debug.script.csv")

- 

-     # remove temporary columns

-     data = data.drop('full_lang', 1)

-     data = data.drop('lang', 1)

- 

-     return data

- 

- 

- def clean_bcp47(data):

-     """Remove impossible values for language and territory"""

-     # remove territory longer than 2 chars

-     store(data[data.territory.str.len() > 2], '0.error.len(territory)>2.csv')

-     data = data[~(data.territory.str.len() > 2)]

-     info(data, "* remove if len(territory)>2")

- 

-     # remove languages longer than 3 chars

-     store(data[data.language.str.len() > 3], '0.error.len(language)>3.csv')

-     data = data[~(data.language.str.len() > 3)]

-     info(data, "* remove if len(language)>3")

- 

-     # remove numeric languages

-     store(data[data.language.str.isdigit()],

-           '0.error.languages is numeric.csv')

-     data = data[~(data.language.str.isdigit())]

-     info(data, "* remove if language.isdigit()")

- 

-     # set types

-     data.territory = data.territory.fillna('')

-     data.script = data.script.fillna('')

-     data = data.astype(

-         {'territory': 'str', 'language': 'str', 'script': 'str'})

- 

-     return data

- 

- 

- def add_cldr(data):

-     """Load cldr data, merge it with"""

-     cldr_language = pandas.read_csv("CLDR-raw/language.csv")

-     cldr_language.name = cldr_language.name.str.lower()

- 

-     cldr_script = pandas.read_csv("CLDR-raw/script.csv")

-     cldr_script.code = cldr_script.code.str.lower()

-     cldr_script.name = cldr_script.name.str.lower()

- 

-     cldr_territory = pandas.read_csv("CLDR-raw/territory.csv")

-     cldr_territory.code = cldr_territory.code.str.lower()

-     cldr_territory.name = cldr_territory.name.str.lower()

- 

-     data = data.merge(cldr_language, how='left',

-                       left_on='language', right_on='code')

-     data = data.rename(columns={'name': 'language_name'})

-     data = data.drop('code', 1)

- 

-     data = data.merge(cldr_script, how='left', left_on='script',

-                       right_on='code', suffixes=(False, 'script_'))

-     data = data.rename(columns={'name': 'script_name'})

-     data = data.drop('code', 1)

- 

-     data = data.merge(cldr_territory, how='left', left_on='territory',

-                       right_on='code', suffixes=(False, 'territory_'))

-     data = data.rename(columns={'name': 'territory_name'})

-     data = data.drop('code', 1)

- 

-     data['full_language_code'] = data.apply(get_full_language_code, axis=1)

- 

-     return data

- 

- 

- def clean_cldr(data):

-     """Remove """

-     # remove numeric languages

-     store(data[data.language_name.isnull()].drop_duplicates(

-         'language'), '0.error.language not in cldr.csv')

-     data = data[~(data.language_name.isnull())]

-     info(data, "* remove languages non existing in CLDR")

- 

-     return data

- 

- 

- def check_lang_territory_consistency(data):

-     """ use pop per lang_script and territory to detect potential errors """

- 

-     cldr_data = pandas.read_csv("CLDR-raw/country_language_population_raw.txt",

-                                 sep="\t")

-     cldr_data.CName = cldr_data.CName.str.lower()

- 

-     # in this file, Azerbaijani (Arabic) is written az_Arab, we only keep lang for now

-     cldr_data['Language'] = cldr_data['Language'].str.rsplit('_', 1, expand=True)[

-         0]

- 

-     cldr_data = cldr_data[["CName", 'Language']]

-     # as we may have duplicated values now

-     cldr_data = cldr_data.drop_duplicates()

-     cldr_data = cldr_data.rename(

-         columns={'CName': 'terr', 'Language': 'language'})

- 

-     data = data.merge(cldr_data, how='left',

-                       left_on=('language', 'territory'),

-                       right_on=('language', 'terr'),

-                       suffixes=(False, '_cldr'))

- 

-     error = data[['language', 'territory', 'terr']]

-     error = error[~(error.territory.isnull())]

-     error = error[~(error.territory == '')]

-     error = error[error.terr.isnull()].drop_duplicates()

-     store(error, '0.error.no population for this language-territory couple.csv')

- 

-     data = data.drop('terr', 1)

- 

-     return data

- 

- 

- def get_full_language_code(row):

-     """ full language code using this naming: lang_territory@script """

-     val = row.language

-     if row.territory:

-         val = val + "_" + row.territory

-     if row.script:

-         val = val + "@" + row.script

- 

-     return val

- 

- 

- def clean_dirname(row):

-     """ strip full_language_code from dirname """

-     val = row.dirname

- 

-     if val.endswith(row.full_language_code):

-         val = row.dirname[:-len(row.full_language_code)]

- 

-     return val

- 

- 

- def harmonize_totals(data):

-     """ po files may be outdate, hypothese: max(source string)=truth"""

-     # there could be multiple translation files for one language on a same project

-     data['dirname'] = data['filename'].apply(os.path.dirname)

- 

-     # sometimes, the $lang.po file is inside a $lang folder, remove this

-     data['dirname'] = data.apply(clean_dirname, axis=1)

- 

-     # calculate the real totalMessage

-     tmp = data.groupby(['src', 'dirname'])['totalMessage'].max().rename(

-         "totalMessageMax").reset_index()

-     data = data.merge(tmp)

-     data['untranslatedMessages'] += data['totalMessageMax'] - data['totalMessage']

-     data['totalMessage'] = data['totalMessageMax']

- 

-     # calculate the real totalSourceWords

-     tmp = data.groupby(['src', 'dirname'])['totalSourceWords'].max().rename(

-         "totalSourceWordsMax").reset_index()

-     data = data.merge(tmp)

-     data['untranslatedSourceWords'] += data['totalSourceWordsMax'] - \

-         data['totalSourceWords']

-     data['totalSourceWords'] = data['totalSourceWordsMax']

- 

-     data = data.drop('totalMessageMax', 1)

-     data = data.drop('totalSourceWordsMax', 1)

- 

-     return data

- 

- 

- def summary(data):

- 

-     stat1 = data[['language', 'territory', 'script']

-                  ].drop_duplicates().count().max()

-     stat2 = data['src'].drop_duplicates().count()

-     stat3 = data[['src', 'filename']].drop_duplicates().count().max()

-     stat4 = data['language'].drop_duplicates().count()

-     stat5 = data.groupby(['src', 'dirname'])['totalMessage'].max().sum()

-     stat6 = data.groupby(['src', 'dirname'])['totalSourceWords'].max().sum()

- 

-     print("")

-     print("We have:")

-     print("  * number of upstream sources: "+str(stat2))

-     print("  * number of distinct lang-script-territory: "+str(stat1))

-     print("  * number of languages: "+str(stat4))

-     print("  * translation files: "+str(stat3))

-     print("This represents:")

-     print("  * Total messages: "+str(stat5))

-     print("  * Total words: "+str(stat6))

-     print("")

- 

-     return data

- 

- 

- def info(dataset, step):

-     """Print basic informations about current dataset"""

-     print(" * "+step+" → we now have "+str(len(dataset))+" rows")

- 

- 

- def store(dataset, name):

-     """Store dataset to csv"""

-     global RESULT_FOLDER

-     dataset.to_csv(RESULT_FOLDER+"/"+name, index=False)

- 

- 

- def concat_csv(packages_folder, stats_folder):

-     dirs = [f for f in os.listdir(packages_folder) if os.path.isdir(os.path.join(packages_folder, f))]

- 

-     with open(os.path.join(stats_folder, "_concat.csv"), "w") as outfile:

-         for name in dirs:

-             try:

-                 with open(os.path.join(packages_folder, name, "stats.csv")) as infile:

-                     for line in infile:

-                         outfile.write(line)

-             except FileNotFoundError:

-                 pass

- 

- if __name__ == '__main__':

-     main()

file modified
+119 -89
@@ -2,21 +2,25 @@ 

  """ Parse translation files to deduct language list """

  

  import argparse

+ import glob

  import json

  import os

- import time

  import polib

- 

- from pprint import pprint

+ import re

  

  from shutil import rmtree

  from weblate_language_data import aliases, languages, language_codes, countries

  

+ LOCAL_ALIASES = {

+     "ca_valencia": "ca@valencia"

+ }

+ 

+ 

  def main():

      """Handle params"""

  

      parser = argparse.ArgumentParser(

-     description="Creates a list of languages form translation files")

+         description="Creates a list of languages from translation files")

  

      parser.add_argument("--release", required=True, type=int, default=31,

                          choices=[30, 31, 32],
@@ -37,68 +41,66 @@ 

      args = parser.parse_args()

  

      release_folder = "./results/f{v}/".format(v=args.release)

-     lang_path = os.path.join(release_folder, "languages/")

-     packages_path = os.path.join(release_folder, "packages/")

+     lang_folder = os.path.join(release_folder, "languages/")

+     package_folder = os.path.join(release_folder, "packages/")

      lang_analyze_folder = os.path.join(release_folder, "languages-analyses/")

  

      if args.describe:

          print("Describing detecting languages")

-         describe(lang_path)

+         describe(lang_folder)

  

      elif args.analyzealllangs:

+         print("Provide more data to analyze errors")

          rmtree(lang_analyze_folder, ignore_errors=True)

          os.mkdir(lang_analyze_folder)

  

-         langs = [f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f))]

+         langs = [f for f in os.listdir(lang_folder) if os.path.isfile(os.path.join(lang_folder, f))]

          for lang in sorted(langs):

-             analyze = analyze_lang(lang_path, lang[:-len('.json')])

+             analyze = analyze_lang(lang_folder, lang[:-len('.json')])

  

              with open(os.path.join(lang_analyze_folder, lang), 'w') as f:

                  f.write(json.dumps(analyze, indent=2))

  

      elif args.analyzelang:

          print("Provide more data to analyze errors")

-         analyze = analyze_lang(lang_path, args.analyzelang)

+         if not os.path.exists(lang_analyze_folder):

+             os.makedirs(lang_analyze_folder)

+ 

+         analyze = analyze_lang(lang_folder, args.analyzelang)

+         result_file = os.path.join(lang_analyze_folder, args.analyzelang + '.json')

  

-         with open(os.path.join(lang_analyze_folder, args.analyzelang + '.json'), 'w') as f:

+         with open(result_file, 'w') as f:

              f.write(json.dumps(analyze, indent=2))

  

      elif args.refresh:

          print("Refreshing the list of languages")

-         rmtree(lang_path, ignore_errors=True)

-         os.mkdir(lang_path)

+         rmtree(lang_folder, ignore_errors=True)

+         os.mkdir(lang_folder)

  

-         start_time_search = time.time()

- 

-         po_langs = detect_languages(packages_path)

+         po_langs = detect_languages(package_folder, release_folder)

  

          for lang in po_langs.keys():

-             with open(os.path.join(lang_path, str(lang) + '.json'), 'w') as f:

+             with open(os.path.join(lang_folder, str(lang) + '.json'), 'w') as f:

                  f.write(json.dumps(po_langs[lang], indent=2))

  

-         search_duration = round(time.time() - start_time_search, 1)

-         print(" Done in {d} seconds".format(d=search_duration))

- 

  

- def analyze_lang(lang_path, analized_lang):

+ def analyze_lang(lang_folder, analized_lang):

      """ Analyze one lang """

      files = []

      results = dict()

-     with open(os.path.join(lang_path, analized_lang + ".json"), "r") as read_file:

+     with open(os.path.join(lang_folder, analized_lang + ".json"), "r") as read_file:

          files = json.load(read_file)

  

-     print(" Analysing language {l}, with {c} files".format(l=analized_lang,c=len(files)))

+     print(" Analysing language {l}, with {c} files".format(l=analized_lang, c=len(files)))

  

      for file in files:

-         lang = "error"

          metadata = dict()

-         error = ""

          try:

              metadata = polib.pofile(file).metadata

-         except UnicodeDecodeError as e:

+         except UnicodeDecodeError:

              # encoding error, to investigate before using it in TM

              metadata["Language"] = "error-unicode"

-         except OSError as e:

+         except OSError:

              # maybe a polib bug? to investigate before using it in TM

              metadata["Language"] = "error-os"

  
@@ -107,9 +109,6 @@ 

          elif metadata["Language"] == "":

              metadata["Language"] = "zzz_empty"

  

-         if analized_lang != "error":

-             lang = choose_lang("", metadata, error)

- 

          language = results.get(metadata.get("Language"), dict())

  

          count = language.get("Count", 0)
@@ -118,7 +117,7 @@ 

  

          lang_files = language.get("Files", [])

          lang_files.append(file)

-         language["Files"] = lang_files

+         language["Files"] = sorted(lang_files)

  

          plurals = language.get("Plural-Forms", [])

          plurals.append(metadata.get("Plural-Forms"))
@@ -137,108 +136,139 @@ 

      return results

  

  

- def describe(lang_path):

+ def describe(lang_folder):

      """ Provide the number of files per language """

-     langs = [f for f in os.listdir(lang_path) if os.path.isfile(os.path.join(lang_path, f))]

+     langs = [f for f in os.listdir(lang_folder) if os.path.isfile(os.path.join(lang_folder, f))]

  

      for lang in sorted(langs):

-         with open(os.path.join(lang_path, lang), "r") as read_file:

+         with open(os.path.join(lang_folder, lang), "r") as read_file:

              files = json.load(read_file)

  

-         print(" {l}:{c}".format(l=lang[:-len('.json')],c=len(files)))

+         print(" {l}:{c}".format(l=lang[:-len('.json')], c=len(files)))

  

  

- def detect_languages(tm_folder):

+ def detect_languages(package_folder, release_folder):

      """ For each po file, detect metadatas and deduct the language     """

      """ Requires: a file hierarchy with po files                       """

      """ Returns: a dictionary of lists, key=lang code, value=file list """

      langs = {}

+     packages = [f for f in os.listdir(package_folder) if os.path.isdir(os.path.join(package_folder, f))]

  

-     for root, directories, files in os.walk(tm_folder):

-         for file in files:

-             racine, ext = os.path.splitext(file)

-             if ext == ".po":

+     log_file = os.path.join(release_folder, "build_language_list.log")

+     file_object = open(log_file, "w")

+ 

+     for package in packages:

+         discovery_file = os.path.join(package_folder, package, "discover.json")

+ 

+         with open(discovery_file, "r") as read_file:

+             alls = json.load(read_file)

+ 

+         to_process = [p for p in alls if p["file_format"] == "po"]

+ 

+         for pattern in to_process:

+             mask = os.path.join(package_folder, package, pattern["filemask"])

+             p = re.compile(mask.replace("*", "(.*)").replace("+", r"\+"))

+ 

+             for po in glob.glob(mask):

+                 result = p.search(po)

+                 lang_code = result.group(1)

                  metadata = dict()

                  error = ""

                  try:

-                     metadata = polib.pofile(os.path.join(root, file)).metadata

-                 except UnicodeDecodeError as e:

+                     metadata = polib.pofile(po).metadata

+                 except UnicodeDecodeError:

                      # encoding error, to investigate before using it in TM

                      error = "error-unicode"

-                 except OSError as e:

+                 except OSError:

                      # maybe a polib bug? to investigate before using it in TM

                      error = "error-os"

  

-                 lang = choose_lang(racine, metadata, error)

+                 lang, decision = choose_lang(lang_code, metadata, error)

  

-                 try:

-                     langs[lang].append(os.path.join(root, file))

-                 except KeyError:

-                     langs[lang] = list()

-                     langs[lang].append(os.path.join(root, file))

+                 log = ",".join([po,

+                                 lang_code,

+                                 metadata.get("Language", ""),

+                                 error,

+                                 lang,

+                                 str(decision)])

+                 file_object.write(log + "\n")

+ 

+                 lang_result = langs.get(lang, dict())

+                 po_results = lang_result.get("po", list())

+                 po_results.append(po)

+                 lang_result["po"] = po_results

+ 

+                 langs[lang] = lang_result

+ 

+     file_object.close()

  

      return langs

  

+ 

  def choose_lang(filename, metadata, error):

      """ From a po file and its medata, choose the most likely language code """

      """ By priority: the Language medata """

      """ Returns: a language code """

  

-     lang = ""

-     file_name = filename.lower().replace("-", "_")

- 

-     meta_language = metadata.get("Language","").lower().replace("-", "_")

+     lang = "noresult"

+     decision = 0

+     codes = dict()

+     for language in languages.LANGUAGES:

+         # 0 is language code

+         # 1 is language name

+         codes[language[1].lower()] = language[0].lower()

  

-     meta_team = metadata.get("Language-Team","").lower().replace("-", "_")

+     file_name = filename.lower().replace("-", "_")

+     meta_language = metadata.get("Language", "").lower().replace("-", "_")

  

      if meta_language in language_codes.LANGUAGES:

          lang = meta_language

+         decision = 1

+ 

+     elif meta_language in codes.values():

+         lang = meta_language

+         decision = 2

+ 

+     elif meta_language in codes.keys():

+         lang = codes.get(meta_language)

+         decision = 3

+ 

+     elif meta_language in LOCAL_ALIASES.keys():

+         lang = LOCAL_ALIASES[meta_language].lower()

+         decision = 4

+ 

+     elif meta_language in aliases.ALIASES.keys():

+         lang = aliases.ALIASES[meta_language].lower()

+         decision = 5

+ 

+     elif meta_language in countries.DEFAULT_LANGS:

+         lang = meta_language.split("_", 1)[0]

+         decision = 6

  

      elif file_name in language_codes.LANGUAGES:

          lang = file_name

+         decision = 7

+ 

+     elif file_name in codes.values():

+         lang = file_name

+         decision = 8

+ 

+     elif file_name in aliases.ALIASES.keys():

+         lang = aliases.ALIASES[file_name].lower()

+         decision = 9

  

+     elif file_name in countries.DEFAULT_LANGS:

+         lang = file_name.split("_", 1)[0]

+         decision = 10

      else:

-         lang = "noresult"

- 

-     # try languages (some codes here are exclused from languages_codes)

-     if lang == "noresult":

-         codes = dict()

-         for language in languages.LANGUAGES:

-             # 0 is language code

-             # 1 is language name

-             codes[language[1].lower()] = language[0].lower()

- 

-         if meta_language in codes.values():

-             lang = meta_language

- 

-         elif file_name in codes.values():

-             lang = file_name

- 

-         elif meta_language in codes.keys():

-             lang = codes.get(meta_language)

- 

-     # try ALIASES

-     if lang == "noresult":

-         if meta_language in aliases.ALIASES.keys():

-             lang = aliases.ALIASES[meta_language].lower()

-         elif file_name in aliases.ALIASES.keys():

-             lang = aliases.ALIASES[file_name].lower()

- 

-     if lang == "noresult":

-         if meta_language in countries.DEFAULT_LANGS:

-             lang = meta_language.split("_", 1)[0]

-         elif file_name in countries.DEFAULT_LANGS:

-             lang = file_name.split("_", 1)[0]

-         else:

-             lang = "error"

+         lang = "error"

  

      # harmonization (example: mo = ro_MD)

      if lang in aliases.ALIASES.keys():

          lang = aliases.ALIASES[lang].lower()

  

-     return lang

+     return lang, decision

  

  

  if __name__ == '__main__':

      main()

- 

file removed
-161
@@ -1,161 +0,0 @@ 

- #!/usr/bin/env python3

- """For each package, compute stats"""

- 

- import argparse

- import glob

- import json

- import os

- import subprocess

- import tempfile

- 

- def main():

-     """Handle params"""

- 

-     parser = argparse.ArgumentParser(

-         description="Computes stats for each srpm detected")

-     parser.add_argument("--release", required=True, type=int, default=31,

-                         choices=[30, 31, 32],

-                         help="Provide the Fedora release to analyze")

- 

-     args = parser.parse_args()

- 

-     packages_folder = "./results/f{v}/packages/".format(v=args.release)

- 

-     filenames = [f for f in os.listdir(packages_folder) if os.path.isdir(os.path.join(packages_folder, f))]

- 

-     print("Computing stats")

-     count = 0

- 

-     for package in sorted(filenames):

-         count +=1

-         print(" {c}/{t} - {p}".format(c=count, t=len(filenames),p=package))

-         with open(os.path.join(packages_folder, package, "discover.json"), 'r') as f:

-             translation_files = json.load(f)

- 

-         tmp = os.path.join(packages_folder, package)

- 

-         for translation in translation_files:

-             if translation["file_format"] == "po":

-                 get_po_translation_level(tmp, translation, package, packages_folder)

-             elif translation["file_format"] == "ts":

-                 get_ts_translation_level(tmp, translation, package, packages_folder)

-             elif translation["file_format"] == "json":

-                 get_json_translation_level(tmp, translation, package, packages_folder)

-             elif translation["file_format"] == "auto":

-                 # it's a detection of .tx configuration

-                 continue

- 

-     print("Removing duplicates")

-     count = 0

-     for package in sorted(filenames):

-         count +=1

-         print(" {c}/{t} - {p}".format(c=count, t=len(filenames),p=package))

-         input_file = packages_folder + "{p}/stats.csv".format(p=package)

- 

-         try:

-             with open(input_file, 'r') as f:

-                 lines = f.readlines()

- 

-             seen_lines = set()

-             with open(input_file, 'w') as f:

-                 for line in lines:

-                     if line not in seen_lines:

-                         seen_lines.add(line)

-                         f.write(line)

-         except FileNotFoundError:

-             continue

- 

- def get_po_translation_level(path, discover, name, packages_folder):

-     filemask = discover["filemask"]

-     stats_file = packages_folder + "/{p}/stats.csv".format(p=name)

-     error_file = packages_folder + "/{p}/stats.errors.txt".format(p=name)

- 

-     with open(stats_file, 'a') as stats:

-         with open(error_file, 'a') as error:

-             subprocess.run(["pocount", filemask.split("*")[0], "--csv"],

-                            stdout=stats, stderr=error, check=True, cwd=path)

- 

-     subprocess.run(["sed",

-                     "-i",

-                     "-e",

-                     "s|{p}|.|g".format(p=path),

-                     error_file],

-                    check=True)

- 

- 

- def get_ts_translation_level(path, discover, name, packages_folder):

-     filemask = discover["filemask"]

-     stats_file = packages_folder + "/{p}/stats.csv".format(p=name)

-     error_file = packages_folder + "/{p}/stats.errors.txt".format(p=name)

- 

-     with open(stats_file, 'a') as stats:

-         with open(error_file, 'a') as error:

-             subprocess.run(["pocount", filemask.split("*")[0], "--csv"],

-                            stdout=stats, stderr=error, check=True, cwd=path)

- 

-     subprocess.run(["sed",

-                     "-i",

-                     "-e",

-                     "s|{p}|.|g".format(p=path),

-                     error_file],

-                    check=True)

- 

- 

- def get_json_translation_level(path, discover, name, packages_folder):

-     filemask = discover["filemask"]

- 

-     stats_file = packages_folder + "/{p}/stats.csv".format(p=name)

-     error_file = packages_folder + "/{p}/stats.errors.txt".format(p=name)

- 

-     stats = open(stats_file, 'a')

-     error = open(error_file, 'a')

- 

-     # move only related json files to a temporary folder

-     with tempfile.TemporaryDirectory(prefix="l10n-stats") as tmpjson:

-         for filename in glob.iglob(path + "/" + filemask):

-             # if filesare in language subfolder, reproduce the hierarchy

-             dest = os.path.join(

-                 *(os.path.dirname(filename).split(os.path.sep)[3:]))

-             os.makedirs(tmpjson + "/" + dest, exist_ok=True)

- 

-         # convert json files to po files

-         with tempfile.TemporaryDirectory(prefix="l10n-stats") as tmppo:

-             # use existing template, in not existing (probably a bug), try "en"

-             template_file = tmpjson + "/" + \

-                 discover.get("template", filemask.replace("*", "en"))

- 

-             if os.path.isfile(template_file):

-                 subprocess.run(["json2po",

-                                 "-t",

-                                 template_file,

-                                 tmpjson,

-                                 tmppo,

-                                 "--progress=none"],

-                                stderr=error,

-                                check=True,

-                                cwd=tmppo)

- 

-                 # compute stats

-                 subprocess.run(["pocount",

-                                 filemask.split("*")[0],

-                                 "--csv"],

-                                stdout=stats,

-                                stderr=error,

-                                check=True,

-                                cwd=tmppo)

-             else:

-                 print("  template doesn't exist, is it a translation-finder bug?")

- 

-     stats.close()

-     error.close()

- 

-     subprocess.run(["sed",

-                     "-i",

-                     "-e",

-                     "s|{p}|.|g".format(p=path),

-                     error_file],

-                    check=True)

- 

- 

- if __name__ == '__main__':

-     main()

file added
+145
@@ -0,0 +1,145 @@ 

+ #!/usr/bin/env python3

+ """For each package, compute stats"""

+ 

+ import argparse

+ import glob

+ import json

+ import os

+ import subprocess

+ import shutil

+ import tempfile

+ 

+ 

+ def main():

+     """Handle params"""

+ 

+     parser = argparse.ArgumentParser(

+         description="Computes stats for each srpm detected")

+     parser.add_argument("--release", required=True, type=int, default=31,

+                         choices=[30, 31, 32],

+                         help="Provide the Fedora release to analyze")

+ 

+     args = parser.parse_args()

+ 

+     packages_folder = "./results/f{v}/packages/".format(v=args.release)

+     packages_stats_folder = "./results/f{v}/packages-stats/".format(v=args.release)

+     languages_folder = "./results/f{v}/languages/".format(v=args.release)

+     languages_stats_folder = "./results/f{v}/languages-stats/".format(v=args.release)

+ 

+     print("Computing packages stats")

+     packages = [f for f in os.listdir(packages_folder) if os.path.isdir(os.path.join(packages_folder, f))]

+     count = 0

+ 

+     for package in sorted(packages):

+         count += 1

+         print(" {c}/{t} - {p}".format(c=count, t=len(packages), p=package))

+         with open(os.path.join(packages_folder, package, "discover.json"), 'r') as f:

+             discoveries = json.load(f)

+ 

+         src_folder = os.path.join(packages_folder, package)

+         dest_folder = os.path.join(packages_stats_folder, package)

+         if not os.path.exists(dest_folder):

+             os.makedirs(dest_folder)

+ 

+         stats_file = os.path.join(dest_folder, "stats.csv")

+         error_file = os.path.join(dest_folder, "stats.errors.txt")

+ 

+         if os.path.isfile(stats_file):

+             os.remove(stats_file)

+ 

+         if os.path.isfile(error_file):

+             os.remove(error_file)

+ 

+         for discover in discoveries:

+             files = glob.glob(os.path.join(src_folder, discover["filemask"]))

+ 

+             if discover["file_format"] == "po":

+                 get_po_translation_level(files, stats_file, error_file)

+             elif discover["file_format"] == "json":

+                 get_json_translation_level(files, os.path.join(src_folder, discover["template"]), stats_file, error_file)

+ 

+     print(" Removing duplicates")

+     count = 0

+     for package in sorted(packages):

+         count += 1

+         print(" {c}/{t} - {p}".format(c=count, t=len(packages), p=package))

+         input_file = packages_folder + "{p}/stats.csv".format(p=package)

+ 

+         try:

+             with open(input_file, 'r') as f:

+                 lines = f.readlines()

+ 

+             seen_lines = set()

+             with open(input_file, 'w') as f:

+                 for line in lines:

+                     if line not in seen_lines:

+                         seen_lines.add(line)

+                         f.write(line)

+         except FileNotFoundError:

+             continue

+ 

+     print("Computing language stats")

+     languages = [f for f in os.listdir(languages_folder)]

+     count = 0

+ 

+     dest_folder = languages_stats_folder

+     if os.path.isdir(dest_folder):

+         shutil.rmtree(dest_folder)

+     os.makedirs(dest_folder)

+ 

+     for language in sorted(languages):

+         count += 1

+         lang = language[:-5]

+ 

+         print(" {c}/{t} - {l}".format(c=count, t=len(languages), l=lang))

+         with open(os.path.join(languages_folder, language), 'r') as f:

+             discoveries = json.load(f)

+ 

+         stats_file = os.path.join(dest_folder, lang + ".stats.csv")

+         error_file = os.path.join(dest_folder, lang + ".stats.errors.txt")

+ 

+         files = discoveries.get("po", [])

+         if files:

+             get_po_translation_level(files, stats_file, error_file)

+ 

+ 

+ def get_po_translation_level(files, stats_file, error_file):

+     """ Compute results """

+ 

+     with open(stats_file, 'a') as stats:

+         with open(error_file, 'a') as error:

+             subprocess.run(["pocount", "--csv"] + files,

+                            stdout=stats, stderr=error, check=True)

+ 

+ 

+ def get_json_translation_level(files, template, stats_file, error_file):

+     """ convert json files into po and call get_po_translation_level """

+ 

+     # move only related json files to a temporary folder

+     with tempfile.TemporaryDirectory(prefix="l10n-stats") as tmpjson:

+         error = open(error_file, 'a')

+         pofiles = []

+         for filename in files:

+             # if files are in a language subfolder, reproduce the hierarchy

+             dest = filename.replace(os.path.basename(filename), "")

+             os.makedirs(tmpjson + "/" + dest, exist_ok=True)

+ 

+             if os.path.isfile(template):

+                 po = os.path.join(tmpjson, filename.replace(".json", ".po"))

+                 subprocess.run(["json2po",

+                                 "-t",

+                                 template,

+                                 filename,

+                                 po,

+                                 "--progress=none"],

+                                stderr=error,

+                                check=True)

+                 pofiles.append(po)

+             else:

+                 print("  {t} missing, translation-finder bug?".format(t=template))

+         error.close()

+         get_po_translation_level(pofiles, stats_file, error_file)

+ 

+ 

+ if __name__ == '__main__':

+     main()

file modified
+15 -18
@@ -9,21 +9,22 @@ 

  import tempfile

  import time

  

+ 

  def main():

      """Handle params"""

  

      parser = argparse.ArgumentParser(

-     description="Creates compendium for every languages")

+         description="Creates a compendium for every language")

  

      parser.add_argument("--release", required=True, type=int, default=31,

                          choices=[30, 31, 32],

                          help="Provide the Fedora release to analyze")

  

      parser.add_argument("--refresh", action="store_true",

-                     help="Force refresh of files")

+                         help="Force refresh of files")

  

      parser.add_argument("--compress", action="store_true",

-                     help="Compress output files")

+                         help="Compress output files")

  

      parser.add_argument("--lang", required=False, type=str,

                          help="Filter a language to analyze")
@@ -32,8 +33,7 @@ 

  

      release_folder = "./results/f{v}/".format(v=args.release)

      lang_path = os.path.join(release_folder, "languages/")

-     packages_path = os.path.join(release_folder, "packages/")

-     tm_folder = os.path.join(release_folder, "out/")

+     tm_folder = os.path.join(release_folder, "languages-tm/")

      os.makedirs(tm_folder, exist_ok=True)

  

      print("Building the translation memory for every language")
@@ -41,7 +41,7 @@ 

  

      if args.lang:

          with open(os.path.join(lang_path, args.lang + ".json"), "r") as read_file:

-             files = json.load(read_file)

+             files = json.load(read_file)["po"]

  

          compute_lang(args.lang, files, tm_folder)

      else:
@@ -49,7 +49,7 @@ 

  

          for lang in sorted(langs):

              with open(os.path.join(lang_path, lang), "r") as read_file:

-                 files = json.load(read_file)

+                 files = json.load(read_file)["po"]

  

              compute_lang(lang[:-len('.json')], files, tm_folder, args.refresh)

  
@@ -58,6 +58,7 @@ 

              check_lang(lang[:-len('.json')], tm_folder)

  

      if args.compress:

+         print("Compressing files")

          compress(tm_folder)

  

      search_duration = round(time.time() - start_time_search, 1)
@@ -74,7 +75,7 @@ 

      compendium_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), compendium_file)

  

      if not os.path.isfile(compendium_file) or refresh is True:

-         pofiles = [os.path.join(os.path.dirname(os.path.abspath(__file__)),f) for f in langfiles]

+         pofiles = [os.path.join(os.path.dirname(os.path.abspath(__file__)), f) for f in langfiles]

          count = 0

  

          with tempfile.TemporaryDirectory(prefix="l10n-tm") as tmp:
@@ -82,7 +83,7 @@ 

                  try:

                      command = ["msguniq", i, "--output-file", count.__str__(), "--no-location"]

                      subprocess.run(command, check=True, cwd=tmp, capture_output=True)

-                 except subprocess.CalledProcessError as e:

+                 except subprocess.CalledProcessError:

                      try:

                          command = ["msguniq", i, "--output-file", count.__str__(), "--to-code", "utf-8", "--no-location"]

                          subprocess.run(command, check=True, cwd=tmp, capture_output=True)
@@ -96,10 +97,9 @@ 

  

              try:

                  subprocess.run(command, check=True, cwd=tmp, capture_output=True)

-             except subprocess.CalledProcessError as e:

+             except subprocess.CalledProcessError:

                  print(" msgcat exception...")

  

- 

      # po to tmx conversion

      tmx_file = tm_folder + lang + ".tmx"

      command = ["po2tmx", "--language="+lang, "--progress=none",
@@ -110,8 +110,8 @@ 

      # language terminology

      terminology_file = tm_folder + lang + ".terminology.po"

      command = ["poterminology", "--ignore-case", "--fold-titlecase",

-                 "--inputs-needed", "1",

-                 "--progress=none", compendium_file, "--output="+terminology_file]

+                "--inputs-needed", "1",

+                "--progress=none", compendium_file, "--output=" + terminology_file]

      if not os.path.isfile(terminology_file) or refresh is True:

          subprocess.run(command, check=True, capture_output=True)

  
@@ -132,22 +132,19 @@ 

      if not os.path.isfile(terminology_file):

          print(" {l}-terminology is missing".format(l=lang))

  

+ 

  def compress(folder):

      """ Compress files uzing gzip """

  

      files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]

-     count = 0

-     total = len(files)

  

      for file in sorted(files):

-         count += 1

          dest = file + ".gz"

-         print(" {c}/{t}")

  

          with open(os.path.join(folder, file), "rb") as file_in:

              with gzip.open(os.path.join(folder, dest), "wb") as file_out:

                  file_out.writelines(file_in)

  

+ 

  if __name__ == '__main__':

      main()

- 

file modified
-4
@@ -1,7 +1,3 @@ 

- pyyaml

  pandas

- geopandas

- matplotlib

- descartes

  polib

  weblate-language-data

file added
+24
@@ -0,0 +1,24 @@ 

+ #!/bin/bash

+ 

+ # this file is useful for end-to-end tests on a short corpus

+ rm -rf ./results/f32/

+ 

+ # go through all the rpm files of a release and extract all their translation files

+ # ~ 8 h

+ podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:32 /src/build.py --keep-srpms gco.*

+ 

+ # deduce the list of all languages

+ # ~ 25 m

+ ./build_language_list.py --release 32 --refresh

+ 

+ # generate a per-language analysis file (which files, teams, plural forms, etc.)

+ # ~ 25 m

+ ./build_language_list.py --release 32 --analyzealllang

+ 

+ # generate a compendium, a translation memory and a terminology for each language

+ # ~ 3 h

+ ./build_tm.py --release 32 --refresh

+ 

+ # compute progress percentages per package and per language

+ # ~ 41 m

+ ./build_stats.py --release 32

file added
+13
@@ -0,0 +1,13 @@ 

+ # global

+ 

+ support for json files

+ 

+ # build_tm.py

+ 

+ Detecting missing files

+ - en-compendium is missing

+ - error-compendium is missing

+ - gl-compendium is missing

+ - nb_no-compendium is missing

+ - sk-compendium is missing

+ - zh_hant-compendium is missing
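
The "support for json files" entry corresponds to the `json2po`/`pocount` path added in `build_stats.py`. A minimal standalone sketch of that flow, assuming translate-toolkit is installed (the `locales/*.json` paths and the `fr.po`/`stats.csv` names are hypothetical):

```python
#!/usr/bin/env python3
# Sketch of the json support noted above: convert one translated json file
# to po with its template, then append pocount statistics to a csv file.
# The locales/*.json paths and fr.po/stats.csv names are hypothetical.
import subprocess

template = "locales/en.json"    # source strings (the "template" from discover.json)
translated = "locales/fr.json"  # one translated file matching the filemask
po_file = "fr.po"

# json2po pairs source and target strings through the template
subprocess.run(["json2po", "-t", template, translated, po_file,
                "--progress=none"], check=True)

# pocount --csv emits the per-file columns that stats.csv stores
with open("stats.csv", "a") as stats:
    subprocess.run(["pocount", "--csv", po_file], stdout=stats, check=True)
```

`build_stats.py` applies this conversion per filemask match inside a temporary directory before appending the `pocount --csv` rows to the package's stats.csv.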