From 83b997ca1899821c8361ae598d6e97ac4628d982 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 18 2020 18:43:06 +0000 Subject: [PATCH 1/5] split data extraction from stats calculation this reduces the execution time by a factor of 4 this allows to make the scripts more distribution independant --- diff --git a/README.md b/README.md index 611b123..45b483a 100644 --- a/README.md +++ b/README.md @@ -30,13 +30,17 @@ with `$script`, one of the following: ## Compute data -`./build.py` +`./build.py` get srpm lists, apply discover and compute progression stats -The result will be in multiple files inside the results folder. +## Produce per package stats -## Produce stats +`./build_packages_stats.py` -`./build_stats.py` +For each package, produce progression stats. + +## Produce global stats + +`./build_global_stats.py` Applies data cleanups and enhancements (cldr name). @@ -47,6 +51,15 @@ Applies data cleanups and enhancements (cldr name). Agregate the data per language, then apply it on territories (it uses stats from CLDR with language per territory). +## Produce translation memories + +`./build_tm.py` + +Detect the list of languages +Aggregate all files for a language and produce a compendium, a terminology and a translation memory. + +TODO: language detection should probably be a in a dedicated build file. + # Output files * `0.error.language not in cldr.csv` contains unknown languages (lines are removed) diff --git a/build.py b/build.py index f6c1c18..6fc3d24 100755 --- a/build.py +++ b/build.py @@ -42,13 +42,13 @@ def main(): (distname, distrel, distid) = distro.linux_distribution() result_folder = "./results/f{v}/stats/".format(v=distrel) - tm_folder = "./results/f{v}/packages/".format(v=distrel) + packages_folder = "./results/f{v}/packages/".format(v=distrel) srpms_path = "/srpms" if not os.path.exists(result_folder): os.makedirs(result_folder) - if not os.path.exists(tm_folder): - os.makedirs(tm_folder) + if not os.path.exists(packages_folder): + os.makedirs(packages_folder) processing_file = os.path.join("./results/f{v}/".format(v=distrel), "data.json") srpm_list_file = os.path.join(srpms_path, "srpm.txt") @@ -137,7 +137,7 @@ def main(): extract_srpm(tmp, srpm_path, result_folder) (tsearch, tcopy, results) = discover_translations( - tmp, package.name, result_folder, tm_folder) + tmp, package.name, result_folder, packages_folder) if not args.keep: os.unlink(srpm_path) @@ -153,8 +153,6 @@ def main(): json.dump(data, f, indent=2) print("") - concat_csv(result_folder) - def extract_srpm(tmp, name, result_folder): """extract srpm page""" @@ -165,7 +163,8 @@ def extract_srpm(tmp, name, result_folder): subprocess.run(['./extract_srpm.sh', tmp, name], stdout=out, stderr=error, check=True) -def discover_translations(tmp, name, result_folder, tm_folder): + +def discover_translations(tmp, name, result_folder, packages_folder): """find po file""" print("discover_translations: " + tmp) translation_files = [] @@ -185,30 +184,15 @@ def discover_translations(tmp, name, result_folder, tm_folder): tcopy = time.time() if translation_files: - if not os.path.exists(os.path.join(tm_folder, name)): - os.makedirs(os.path.join(tm_folder, name)) + if not os.path.exists(os.path.join(packages_folder, name)): + os.makedirs(os.path.join(packages_folder, name)) - with open(os.path.join(tm_folder, name, "discover.json"), 'w') as f: + with open(os.path.join(packages_folder, name, "discover.json"), 'w') as f: f.write(json.dumps(translation_files, indent=2)) for translation in translation_files: - if 
translation["file_format"] == "po": - get_po_translation_level( - tmp, translation, name, result_folder, tm_folder) - elif translation["file_format"] == "ts": - get_ts_translation_level(tmp, translation, name, result_folder) - elif translation["file_format"] == "json": - get_json_translation_level( - tmp, translation, name, result_folder) - elif translation["file_format"] == "auto": - # it's a detection of .tx configuration - continue - else: - unknown_format( - translation, - name, - translation["file_format"], - result_folder) + copy_translations(tmp, translation, name, result_folder, packages_folder) + tcopy = round(time.time() - tcopy, 1) cresults = dict() @@ -218,121 +202,19 @@ def discover_translations(tmp, name, result_folder, tm_folder): return (tsearch, tcopy, cresults) -def get_po_translation_level(path, mask, name, result_folder, tm_folder): - filemask = mask["filemask"] - - with open(result_folder + '/{p}.stats.csv'.format(p=name), 'a') as stats: - with open(result_folder + '/{p}.errors.txt'.format(p=name), 'a') as error: - subprocess.run(["pocount", filemask.split("*")[0], "--csv"], - stdout=stats, stderr=error, check=True, cwd=path) - - # Copy translation files in translation memory - for po in glob.glob(path + "/" + filemask): - dest = tm_folder + "/" + name + "/" + filemask.split("*")[0] - os.makedirs(dest, exist_ok=True) - # use copyfile instead of copy2 to handle read-only files in rpm - copyfile(po, os.path.join(dest, os.path.basename(po))) - - subprocess.run(["sed", - "-i", - "-e", - "s|{p}|.|g".format(p=path), - result_folder + '/{p}.errors.txt'.format(p=name)], - check=True) - - -def get_ts_translation_level(path, mask, name, result_folder): - filemask = mask["filemask"] - - with open(result_folder + '/{p}.stats.csv'.format(p=name), 'a') as stats: - with open(result_folder + '/{p}.errors.txt'.format(p=name), 'a') as error: - subprocess.run(["pocount", filemask.split("*")[0], "--csv"], - stdout=stats, stderr=error, check=True, cwd=path) - - subprocess.run(["sed", - "-i", - "-e", - "s|{p}|.|g".format(p=path), - result_folder + '/{p}.errors.txt'.format(p=name)], - check=True) - - -def get_json_translation_level(path, mask, name, result_folder): - filemask = mask["filemask"] - - stats = open(result_folder + '/{p}.stats.csv'.format(p=name), 'a') - error = open(result_folder + '/{p}.errors.txt'.format(p=name), 'a') - - # move only related json files to a temporary folder - with tempfile.TemporaryDirectory(prefix="l10n-stats") as tmpjson: - for filename in glob.iglob(path + "/" + filemask): - # if filesare in language subfolder, reproduce the hierarchy - dest = os.path.join( - *(os.path.dirname(filename).split(os.path.sep)[3:])) - os.makedirs(tmpjson + "/" + dest, exist_ok=True) - - copyfile( - filename, - tmpjson + - "/" + - dest + - "/" + - os.path.basename(filename)) - - # convert json files to po files - with tempfile.TemporaryDirectory(prefix="l10n-stats") as tmppo: - # use existing template, in not existing (probably a bug), try "en" - template_file = tmpjson + "/" + \ - mask.get("template", filemask.replace("*", "en")) - - if os.path.isfile(template_file): - subprocess.run(["json2po", - "-t", - template_file, - tmpjson, - tmppo, - "--progress=none"], - stderr=error, - check=True, - cwd=tmppo) - - # compute stats - subprocess.run(["pocount", - filemask.split("*")[0], - "--csv"], - stdout=stats, - stderr=error, - check=True, - cwd=tmppo) - else: - print(" template doesn't exist, is it a translation-finder bug?") - - stats.close() - error.close() - - 
subprocess.run(["sed", - "-i", - "-e", - "s|{p}|.|g".format(p=path), - result_folder + '/{p}.errors.txt'.format(p=name)], - check=True) - - -def unknown_format(results, srpm, tformat, result_folder): - with open(result_folder + "/todo_" + tformat + ".txt", "a") as file: - file.write(srpm + " " + results["filemask"] + "\n") - - -def concat_csv(result_folder): - filenames = [f for f in os.listdir(result_folder) if os.path.isfile(os.path.join(result_folder, f))] - filenames = [f for f in filenames if f.endswith(".stats.csv")] - - with open(os.path.join(result_folder, "_concat.csv"), "w") as outfile: - for fname in filenames: - with open(os.path.join(result_folder, fname)) as infile: - for line in infile: - outfile.write(line) +def copy_translations(tmp, translation, name, result_folder, packages_folder): + # translation, name, result_folder, packages_folder + filemask = translation["filemask"] + + print("copy translations " + filemask) + + if translation["file_format"] in ["po", "json"]: + for po in glob.glob(tmp + "/" + filemask): + dest = packages_folder + "/" + name + "/" + filemask.split("*")[0] + os.makedirs(dest, exist_ok=True) + # use copyfile instead of copy2 to handle read-only files in rpm + copyfile(po, os.path.join(dest, os.path.basename(po))) if __name__ == '__main__': main() diff --git a/build_global_stats.py b/build_global_stats.py new file mode 100755 index 0000000..bc4b5cb --- /dev/null +++ b/build_global_stats.py @@ -0,0 +1,395 @@ +#!/usr/bin/env python3 +"""Consolidate and clean result files""" + +import argparse +import os +import pandas + +RESULT_FOLDER = "" + + +def main(): + """Handle params""" + + global RESULT_FOLDER + + parser = argparse.ArgumentParser( + description="Consolidate every result files and produce a clean concatenated update") + parser.add_argument("--release", required=True, type=int, default=31, + choices=[30, 31, 32], + help="Provide the Fedora release to analyze") + + args = parser.parse_args() + + RESULT_FOLDER = "./results/f{r}/stats/".format(r=args.release) + packages_folder = "./results/f{r}/packages/".format(r=args.release) + + concat_csv(packages_folder, RESULT_FOLDER) + + file = RESULT_FOLDER + "/_concat.csv" + parse(file) + + +def parse(file): + """Call all cleaning functions""" + data = pandas.read_csv(file) + data.columns = ['src', + 'filename', + 'translatedMessages', + 'translatedSourceWords', + 'translatedTargetWords', + 'fuzzyMessages', + 'fuzzySourceWords', + 'untranslatedMessages', + 'untranslatedSourceWords', + 'totalMessage', + 'totalSourceWords', + 'reviewMessages', + 'reviewSourceWords'] + + info(data, "CSV loaded") + + data = clean_0_basic(data) + + data = clean_1_dupplication(data) + info(data, "Deduplication is done") + + data = data.astype({ + 'translatedMessages': int, + 'translatedSourceWords': int, + 'translatedTargetWords': int, + 'fuzzyMessages': int, + 'fuzzySourceWords': int, + 'untranslatedMessages': int, + 'untranslatedSourceWords': int, + 'totalMessage': int, + 'totalSourceWords': int + }) + + data = clean_2_remove(data) + info(data, "Removal is done") + + data = guess_bcp47(data) + info(data, "bcp47 data are guessed") + + data = clean_bcp47(data) + info(data, "bcp47 data are cleaned") + + data = add_cldr(data) + info(data, "cldr data are added") + + data = clean_cldr(data) + info(data, "cldr data are cleaned") + + data = check_lang_territory_consistency(data) + info(data, "cldr consistency are done") + + data = harmonize_totals(data) + info(data, "data harmonization are done") + + data = summary(data) + + 
store(data, "3.result.csv") + + +def clean_0_basic(data): + """String cleaning and drops useless 'review' columns""" + + # remove useless spaces and use lowercase + for column in list(data): + data[column] = data[column].str.strip() + data[column] = data[column].str.lower() + + # these columns never have values + data = data.drop('reviewMessages', 1) + data = data.drop('reviewSourceWords', 1) + + return data + + +def clean_1_dupplication(data): + """Removes duplicates""" + + # remove all headers from indivual stat result + data = data[data.filename != "filename"] + info(data, "* duplicated headers are removed") + + # remove duplicated stats from bad translation file patterns + data = data.drop_duplicates(['src', 'filename'], keep='last') + info(data, "* duplicated po files are removed") + + return data + + +def clean_2_remove(data): + """Removes obvious useless strings""" + + # remove pot files + data = data[~(data.filename.str.endswith(".pot"))] + info(data, "* remove pot files") + + # remove gmo files + data = data[~(data.filename.str.endswith(".gmo"))] + info(data, "* remove gmo files") + + # remove when no result + store(data[data.totalMessage == 0], "1.debug.total message = 0.csv") + data = data[data.totalMessage != 0] + info(data, "* remove files with 'totalMessage'=0") + + return data + + +def guess_bcp47(data): + """Guess Language, Territory and Script from filename""" + data['basename'] = data['filename'].apply(os.path.basename) + + data['full_lang'] = data['basename'].str.rsplit('.', 1, expand=True)[0] + + # a few lang naming are wrong + data.full_lang = data.full_lang.replace( + {'kmr_latn': 'kmr@latn', 'fr-braille': 'fr@braille', }) + + data['lang'] = data['full_lang'].str.rsplit('@', 1, expand=True)[0] + + data['script'] = data['full_lang'].str.rsplit('@', 1, expand=True)[1] + + # these are just re-encoded translations + data = data[~( + data.lang.str.endswith(".big5") | + data.lang.str.endswith(".cp936") | + data.lang.str.endswith(".cp1250") | + data.lang.str.endswith(".cp1251") | + data.lang.str.endswith(".euc-jp") | + data.lang.str.endswith(".gb2312") | + data.lang.str.endswith(".sjis") | + data.lang.str.endswith(".utf-8") + )] + info(data, "* remove if lang endswith encoding values") + + # these are just re-encoded translations + store(data[data.lang.str.contains(".", regex=False)], + '0.error.lang with point.csv') + data = data[~(data.lang.str.contains(".", regex=False))] + info(data, "* remove if lang contains a point") + + data['language'] = data['lang'].str.rsplit('_', 1, expand=True)[0] + + data['territory'] = data['lang'].str.rsplit('_', 1, expand=True)[1] + + # store all unique values for debug + store(data.drop_duplicates('lang', keep='last'), "1.debug.lang.csv") + store(data.drop_duplicates('language', keep='last'), "1.debug.language.csv") + store(data.drop_duplicates('territory', keep='last'), "1.debug.territory.csv") + store(data.drop_duplicates('script', keep='last'), "1.debug.script.csv") + + # remove temporary columns + data = data.drop('full_lang', 1) + data = data.drop('lang', 1) + + return data + + +def clean_bcp47(data): + """Remove impossible values for language and territory""" + # remove territory longer than 2 chars + store(data[data.territory.str.len() > 2], '0.error.len(territory)>2.csv') + data = data[~(data.territory.str.len() > 2)] + info(data, "* remove if len(territory)>2") + + # remove languages longer than 3 chars + store(data[data.language.str.len() > 3], '0.error.len(language)>3.csv') + data = data[~(data.language.str.len() > 3)] + 
info(data, "* remove if len(language)>3") + + # remove numeric languages + store(data[data.language.str.isdigit()], + '0.error.languages is numeric.csv') + data = data[~(data.language.str.isdigit())] + info(data, "* remove if language.isdigit()") + + # set types + data.territory = data.territory.fillna('') + data.script = data.script.fillna('') + data = data.astype( + {'territory': 'str', 'language': 'str', 'script': 'str'}) + + return data + + +def add_cldr(data): + """Load cldr data, merge it with""" + cldr_language = pandas.read_csv("CLDR-raw/language.csv") + cldr_language.name = cldr_language.name.str.lower() + + cldr_script = pandas.read_csv("CLDR-raw/script.csv") + cldr_script.code = cldr_script.code.str.lower() + cldr_script.name = cldr_script.name.str.lower() + + cldr_territory = pandas.read_csv("CLDR-raw/territory.csv") + cldr_territory.code = cldr_territory.code.str.lower() + cldr_territory.name = cldr_territory.name.str.lower() + + data = data.merge(cldr_language, how='left', + left_on='language', right_on='code') + data = data.rename(columns={'name': 'language_name'}) + data = data.drop('code', 1) + + data = data.merge(cldr_script, how='left', left_on='script', + right_on='code', suffixes=(False, 'script_')) + data = data.rename(columns={'name': 'script_name'}) + data = data.drop('code', 1) + + data = data.merge(cldr_territory, how='left', left_on='territory', + right_on='code', suffixes=(False, 'territory_')) + data = data.rename(columns={'name': 'territory_name'}) + data = data.drop('code', 1) + + data['full_language_code'] = data.apply(get_full_language_code, axis=1) + + return data + + +def clean_cldr(data): + """Remove """ + # remove numeric languages + store(data[data.language_name.isnull()].drop_duplicates( + 'language'), '0.error.language not in cldr.csv') + data = data[~(data.language_name.isnull())] + info(data, "* remove languages non existing in CLDR") + + return data + + +def check_lang_territory_consistency(data): + """ use pop per lang_script and territory to detect potential errors """ + + cldr_data = pandas.read_csv("CLDR-raw/country_language_population_raw.txt", + sep="\t") + cldr_data.CName = cldr_data.CName.str.lower() + + # in this file, Azerbaijani (Arabic) is written az_Arab, we only keep lang for now + cldr_data['Language'] = cldr_data['Language'].str.rsplit('_', 1, expand=True)[ + 0] + + cldr_data = cldr_data[["CName", 'Language']] + # as we may have duplicated values now + cldr_data = cldr_data.drop_duplicates() + cldr_data = cldr_data.rename( + columns={'CName': 'terr', 'Language': 'language'}) + + data = data.merge(cldr_data, how='left', + left_on=('language', 'territory'), + right_on=('language', 'terr'), + suffixes=(False, '_cldr')) + + error = data[['language', 'territory', 'terr']] + error = error[~(error.territory.isnull())] + error = error[~(error.territory == '')] + error = error[error.terr.isnull()].drop_duplicates() + store(error, '0.error.no population for this language-territory couple.csv') + + data = data.drop('terr', 1) + + return data + + +def get_full_language_code(row): + """ full language code using this naming: lang_territory@script """ + val = row.language + if row.territory: + val = val + "_" + row.territory + if row.script: + val = val + "@" + row.script + + return val + + +def clean_dirname(row): + """ strip full_language_code from dirname """ + val = row.dirname + + if val.endswith(row.full_language_code): + val = row.dirname[:-len(row.full_language_code)] + + return val + + +def harmonize_totals(data): + """ po files may be 
outdate, hypothese: max(source string)=truth""" + # there could be multiple translation files for one language on a same project + data['dirname'] = data['filename'].apply(os.path.dirname) + + # sometimes, the $lang.po file is inside a $lang folder, remove this + data['dirname'] = data.apply(clean_dirname, axis=1) + + # calculate the real totalMessage + tmp = data.groupby(['src', 'dirname'])['totalMessage'].max().rename( + "totalMessageMax").reset_index() + data = data.merge(tmp) + data['untranslatedMessages'] += data['totalMessageMax'] - data['totalMessage'] + data['totalMessage'] = data['totalMessageMax'] + + # calculate the real totalSourceWords + tmp = data.groupby(['src', 'dirname'])['totalSourceWords'].max().rename( + "totalSourceWordsMax").reset_index() + data = data.merge(tmp) + data['untranslatedSourceWords'] += data['totalSourceWordsMax'] - \ + data['totalSourceWords'] + data['totalSourceWords'] = data['totalSourceWordsMax'] + + data = data.drop('totalMessageMax', 1) + data = data.drop('totalSourceWordsMax', 1) + + return data + + +def summary(data): + + stat1 = data[['language', 'territory', 'script'] + ].drop_duplicates().count().max() + stat2 = data['src'].drop_duplicates().count() + stat3 = data[['src', 'filename']].drop_duplicates().count().max() + stat4 = data['language'].drop_duplicates().count() + stat5 = data.groupby(['src', 'dirname'])['totalMessage'].max().sum() + stat6 = data.groupby(['src', 'dirname'])['totalSourceWords'].max().sum() + + print("") + print("We have:") + print(" * number of upstream sources: "+str(stat2)) + print(" * number of distinct lang-script-territory: "+str(stat1)) + print(" * number of languages: "+str(stat4)) + print(" * translation files: "+str(stat3)) + print("This represents:") + print(" * Total messages: "+str(stat5)) + print(" * Total words: "+str(stat6)) + print("") + + return data + + +def info(dataset, step): + """Print basic informations about current dataset""" + print(" * "+step+" → we now have "+str(len(dataset))+" rows") + + +def store(dataset, name): + """Store dataset to csv""" + global RESULT_FOLDER + dataset.to_csv(RESULT_FOLDER+"/"+name, index=False) + + +def concat_csv(packages_folder, stats_folder): + dirs = [f for f in os.listdir(packages_folder) if os.path.isdir(os.path.join(packages_folder, f))] + + with open(os.path.join(stats_folder, "_concat.csv"), "w") as outfile: + for name in dirs: + try: + with open(os.path.join(packages_folder, name, "stats.csv")) as infile: + for line in infile: + outfile.write(line) + except FileNotFoundError: + pass + +if __name__ == '__main__': + main() diff --git a/build_map.py b/build_map.py index 39579bf..decf900 100755 --- a/build_map.py +++ b/build_map.py @@ -27,7 +27,7 @@ def main(): args = parser.parse_args() - RESULT_FOLDER = "./results/f{r}".format(r=args.release) + RESULT_FOLDER = "./results/f{r}/stats/".format(r=args.release) file = RESULT_FOLDER + "/3.result.csv" parse(file, args.include_english, args.include_nonofficial) diff --git a/build_packages_stats.py b/build_packages_stats.py new file mode 100755 index 0000000..2310f77 --- /dev/null +++ b/build_packages_stats.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +"""For each package, compute stats""" + +import argparse +import glob +import json +import os +import subprocess +import tempfile + +def main(): + """Handle params""" + + parser = argparse.ArgumentParser( + description="Computes stats for each srpm detected") + parser.add_argument("--release", required=True, type=int, default=31, + choices=[30, 31, 32], + help="Provide the 
Fedora release to analyze") + + args = parser.parse_args() + + packages_folder = "./results/f{v}/packages/".format(v=args.release) + + filenames = [f for f in os.listdir(packages_folder) if os.path.isdir(os.path.join(packages_folder, f))] + + for package in sorted(filenames): + with open(os.path.join(packages_folder, package, "discover.json"), 'r') as f: + translation_files = json.load(f) + + tmp = os.path.join(packages_folder, package) + + for translation in translation_files: + if translation["file_format"] == "po": + get_po_translation_level(tmp, translation, package, packages_folder) + elif translation["file_format"] == "ts": + get_ts_translation_level(tmp, translation, package, packages_folder) + elif translation["file_format"] == "json": + get_json_translation_level(tmp, translation, package, packages_folder) + elif translation["file_format"] == "auto": + # it's a detection of .tx configuration + continue + +def get_po_translation_level(path, discover, name, packages_folder): + filemask = discover["filemask"] + stats_file = packages_folder + "/{p}/stats.csv".format(p=name) + error_file = packages_folder + "/{p}/stats.errors.txt".format(p=name) + + with open(stats_file, 'a') as stats: + with open(error_file, 'a') as error: + subprocess.run(["pocount", filemask.split("*")[0], "--csv"], + stdout=stats, stderr=error, check=True, cwd=path) + + subprocess.run(["sed", + "-i", + "-e", + "s|{p}|.|g".format(p=path), + error_file], + check=True) + + +def get_ts_translation_level(path, discover, name, packages_folder): + filemask = discover["filemask"] + stats_file = packages_folder + "/{p}/stats.csv".format(p=name) + error_file = packages_folder + "/{p}/stats.errors.txt".format(p=name) + + with open(stats_file, 'a') as stats: + with open(error_file, 'a') as error: + subprocess.run(["pocount", filemask.split("*")[0], "--csv"], + stdout=stats, stderr=error, check=True, cwd=path) + + subprocess.run(["sed", + "-i", + "-e", + "s|{p}|.|g".format(p=path), + error_file], + check=True) + + +def get_json_translation_level(path, discover, name, packages_folder): + filemask = discover["filemask"] + + stats_file = packages_folder + "/{p}/stats.csv".format(p=name) + error_file = packages_folder + "/{p}/stats.errors.txt".format(p=name) + + stats = open(stats_file, 'a') + error = open(error_file, 'a') + + # move only related json files to a temporary folder + with tempfile.TemporaryDirectory(prefix="l10n-stats") as tmpjson: + for filename in glob.iglob(path + "/" + filemask): + # if filesare in language subfolder, reproduce the hierarchy + dest = os.path.join( + *(os.path.dirname(filename).split(os.path.sep)[3:])) + os.makedirs(tmpjson + "/" + dest, exist_ok=True) + + # convert json files to po files + with tempfile.TemporaryDirectory(prefix="l10n-stats") as tmppo: + # use existing template, in not existing (probably a bug), try "en" + template_file = tmpjson + "/" + \ + discover.get("template", filemask.replace("*", "en")) + + if os.path.isfile(template_file): + subprocess.run(["json2po", + "-t", + template_file, + tmpjson, + tmppo, + "--progress=none"], + stderr=error, + check=True, + cwd=tmppo) + + # compute stats + subprocess.run(["pocount", + filemask.split("*")[0], + "--csv"], + stdout=stats, + stderr=error, + check=True, + cwd=tmppo) + else: + print(" template doesn't exist, is it a translation-finder bug?") + + stats.close() + error.close() + + subprocess.run(["sed", + "-i", + "-e", + "s|{p}|.|g".format(p=path), + error_file], + check=True) + + +if __name__ == '__main__': + main() diff --git 
a/build_stats.py b/build_stats.py deleted file mode 100755 index 50457b5..0000000 --- a/build_stats.py +++ /dev/null @@ -1,380 +0,0 @@ -#!/usr/bin/env python3 -"""Consolidate and clean result files""" - -import argparse -import os -import pandas - -RESULT_FOLDER = "" - - -def main(): - """Handle params""" - - global RESULT_FOLDER - - parser = argparse.ArgumentParser( - description="Consolidate every result files and produce a clean concatenated update") - parser.add_argument("--release", required=True, type=int, default=31, - choices=[30, 31, 32], - help="Provide the Fedora release to analyze") - - args = parser.parse_args() - - RESULT_FOLDER = "./results/f{r}".format(r=args.release) - - file = RESULT_FOLDER + "/_concat.csv" - parse(file) - - -def parse(file): - """Call all cleaning functions""" - data = pandas.read_csv(file) - data.columns = ['src', - 'filename', - 'translatedMessages', - 'translatedSourceWords', - 'translatedTargetWords', - 'fuzzyMessages', - 'fuzzySourceWords', - 'untranslatedMessages', - 'untranslatedSourceWords', - 'totalMessage', - 'totalSourceWords', - 'reviewMessages', - 'reviewSourceWords'] - - info(data, "CSV loaded") - - data = clean_0_basic(data) - - data = clean_1_dupplication(data) - info(data, "Deduplication is done") - - data = data.astype({ - 'translatedMessages': int, - 'translatedSourceWords': int, - 'translatedTargetWords': int, - 'fuzzyMessages': int, - 'fuzzySourceWords': int, - 'untranslatedMessages': int, - 'untranslatedSourceWords': int, - 'totalMessage': int, - 'totalSourceWords': int - }) - - data = clean_2_remove(data) - info(data, "Removal is done") - - data = guess_bcp47(data) - info(data, "bcp47 data are guessed") - - data = clean_bcp47(data) - info(data, "bcp47 data are cleaned") - - data = add_cldr(data) - info(data, "cldr data are added") - - data = clean_cldr(data) - info(data, "cldr data are cleaned") - - data = check_lang_territory_consistency(data) - info(data, "cldr consistency are done") - - data = harmonize_totals(data) - info(data, "data harmonization are done") - - data = summary(data) - - store(data, "3.result.csv") - - -def clean_0_basic(data): - """String cleaning and drops useless 'review' columns""" - - # remove useless spaces and use lowercase - for column in list(data): - data[column] = data[column].str.strip() - data[column] = data[column].str.lower() - - # these columns never have values - data = data.drop('reviewMessages', 1) - data = data.drop('reviewSourceWords', 1) - - return data - - -def clean_1_dupplication(data): - """Removes duplicates""" - - # remove all headers from indivual stat result - data = data[data.filename != "filename"] - info(data, "* duplicated headers are removed") - - # remove duplicated stats from bad translation file patterns - data = data.drop_duplicates(['src', 'filename'], keep='last') - info(data, "* duplicated po files are removed") - - return data - - -def clean_2_remove(data): - """Removes obvious useless strings""" - - # remove pot files - data = data[~(data.filename.str.endswith(".pot"))] - info(data, "* remove pot files") - - # remove gmo files - data = data[~(data.filename.str.endswith(".gmo"))] - info(data, "* remove gmo files") - - # remove when no result - store(data[data.totalMessage == 0], "1.debug.total message = 0.csv") - data = data[data.totalMessage != 0] - info(data, "* remove files with 'totalMessage'=0") - - return data - - -def guess_bcp47(data): - """Guess Language, Territory and Script from filename""" - data['basename'] = data['filename'].apply(os.path.basename) - - 
data['full_lang'] = data['basename'].str.rsplit('.', 1, expand=True)[0] - - # a few lang naming are wrong - data.full_lang = data.full_lang.replace( - {'kmr_latn': 'kmr@latn', 'fr-braille': 'fr@braille', }) - - data['lang'] = data['full_lang'].str.rsplit('@', 1, expand=True)[0] - - data['script'] = data['full_lang'].str.rsplit('@', 1, expand=True)[1] - - # these are just re-encoded translations - data = data[~( - data.lang.str.endswith(".big5") | - data.lang.str.endswith(".cp936") | - data.lang.str.endswith(".cp1250") | - data.lang.str.endswith(".cp1251") | - data.lang.str.endswith(".euc-jp") | - data.lang.str.endswith(".gb2312") | - data.lang.str.endswith(".sjis") | - data.lang.str.endswith(".utf-8") - )] - info(data, "* remove if lang endswith encoding values") - - # these are just re-encoded translations - store(data[data.lang.str.contains(".", regex=False)], - '0.error.lang with point.csv') - data = data[~(data.lang.str.contains(".", regex=False))] - info(data, "* remove if lang contains a point") - - data['language'] = data['lang'].str.rsplit('_', 1, expand=True)[0] - - data['territory'] = data['lang'].str.rsplit('_', 1, expand=True)[1] - - # store all unique values for debug - store(data.drop_duplicates('lang', keep='last'), "1.debug.lang.csv") - store(data.drop_duplicates('language', keep='last'), "1.debug.language.csv") - store(data.drop_duplicates('territory', keep='last'), "1.debug.territory.csv") - store(data.drop_duplicates('script', keep='last'), "1.debug.script.csv") - - # remove temporary columns - data = data.drop('full_lang', 1) - data = data.drop('lang', 1) - - return data - - -def clean_bcp47(data): - """Remove impossible values for language and territory""" - # remove territory longer than 2 chars - store(data[data.territory.str.len() > 2], '0.error.len(territory)>2.csv') - data = data[~(data.territory.str.len() > 2)] - info(data, "* remove if len(territory)>2") - - # remove languages longer than 3 chars - store(data[data.language.str.len() > 3], '0.error.len(language)>3.csv') - data = data[~(data.language.str.len() > 3)] - info(data, "* remove if len(language)>3") - - # remove numeric languages - store(data[data.language.str.isdigit()], - '0.error.languages is numeric.csv') - data = data[~(data.language.str.isdigit())] - info(data, "* remove if language.isdigit()") - - # set types - data.territory = data.territory.fillna('') - data.script = data.script.fillna('') - data = data.astype( - {'territory': 'str', 'language': 'str', 'script': 'str'}) - - return data - - -def add_cldr(data): - """Load cldr data, merge it with""" - cldr_language = pandas.read_csv("CLDR-raw/language.csv") - cldr_language.name = cldr_language.name.str.lower() - - cldr_script = pandas.read_csv("CLDR-raw/script.csv") - cldr_script.code = cldr_script.code.str.lower() - cldr_script.name = cldr_script.name.str.lower() - - cldr_territory = pandas.read_csv("CLDR-raw/territory.csv") - cldr_territory.code = cldr_territory.code.str.lower() - cldr_territory.name = cldr_territory.name.str.lower() - - data = data.merge(cldr_language, how='left', - left_on='language', right_on='code') - data = data.rename(columns={'name': 'language_name'}) - data = data.drop('code', 1) - - data = data.merge(cldr_script, how='left', left_on='script', - right_on='code', suffixes=(False, 'script_')) - data = data.rename(columns={'name': 'script_name'}) - data = data.drop('code', 1) - - data = data.merge(cldr_territory, how='left', left_on='territory', - right_on='code', suffixes=(False, 'territory_')) - data = 
data.rename(columns={'name': 'territory_name'}) - data = data.drop('code', 1) - - data['full_language_code'] = data.apply(get_full_language_code, axis=1) - - return data - - -def clean_cldr(data): - """Remove """ - # remove numeric languages - store(data[data.language_name.isnull()].drop_duplicates( - 'language'), '0.error.language not in cldr.csv') - data = data[~(data.language_name.isnull())] - info(data, "* remove languages non existing in CLDR") - - return data - - -def check_lang_territory_consistency(data): - """ use pop per lang_script and territory to detect potential errors """ - - cldr_data = pandas.read_csv("CLDR-raw/country_language_population_raw.txt", - sep="\t") - cldr_data.CName = cldr_data.CName.str.lower() - - # in this file, Azerbaijani (Arabic) is written az_Arab, we only keep lang for now - cldr_data['Language'] = cldr_data['Language'].str.rsplit('_', 1, expand=True)[ - 0] - - cldr_data = cldr_data[["CName", 'Language']] - # as we may have duplicated values now - cldr_data = cldr_data.drop_duplicates() - cldr_data = cldr_data.rename( - columns={'CName': 'terr', 'Language': 'language'}) - - data = data.merge(cldr_data, how='left', - left_on=('language', 'territory'), - right_on=('language', 'terr'), - suffixes=(False, '_cldr')) - - error = data[['language', 'territory', 'terr']] - error = error[~(error.territory.isnull())] - error = error[~(error.territory == '')] - error = error[error.terr.isnull()].drop_duplicates() - store(error, '0.error.no population for this language-territory couple.csv') - - data = data.drop('terr', 1) - - return data - - -def get_full_language_code(row): - """ full language code using this naming: lang_territory@script """ - val = row.language - if row.territory: - val = val + "_" + row.territory - if row.script: - val = val + "@" + row.script - - return val - - -def clean_dirname(row): - """ strip full_language_code from dirname """ - val = row.dirname - - if val.endswith(row.full_language_code): - val = row.dirname[:-len(row.full_language_code)] - - return val - - -def harmonize_totals(data): - """ po files may be outdate, hypothese: max(source string)=truth""" - # there could be multiple translation files for one language on a same project - data['dirname'] = data['filename'].apply(os.path.dirname) - - # sometimes, the $lang.po file is inside a $lang folder, remove this - data['dirname'] = data.apply(clean_dirname, axis=1) - - # calculate the real totalMessage - tmp = data.groupby(['src', 'dirname'])['totalMessage'].max().rename( - "totalMessageMax").reset_index() - data = data.merge(tmp) - data['untranslatedMessages'] += data['totalMessageMax'] - data['totalMessage'] - data['totalMessage'] = data['totalMessageMax'] - - # calculate the real totalSourceWords - tmp = data.groupby(['src', 'dirname'])['totalSourceWords'].max().rename( - "totalSourceWordsMax").reset_index() - data = data.merge(tmp) - data['untranslatedSourceWords'] += data['totalSourceWordsMax'] - \ - data['totalSourceWords'] - data['totalSourceWords'] = data['totalSourceWordsMax'] - - data = data.drop('totalMessageMax', 1) - data = data.drop('totalSourceWordsMax', 1) - - return data - - -def summary(data): - - stat1 = data[['language', 'territory', 'script'] - ].drop_duplicates().count().max() - stat2 = data['src'].drop_duplicates().count() - stat3 = data[['src', 'filename']].drop_duplicates().count().max() - stat4 = data['language'].drop_duplicates().count() - stat5 = data.groupby(['src', 'dirname'])['totalMessage'].max().sum() - stat6 = data.groupby(['src', 
'dirname'])['totalSourceWords'].max().sum() - - print("") - print("We have:") - print(" * number of upstream sources: "+str(stat2)) - print(" * number of distinct lang-script-territory: "+str(stat1)) - print(" * number of languages: "+str(stat4)) - print(" * translation files: "+str(stat3)) - print("This represents:") - print(" * Total messages: "+str(stat5)) - print(" * Total words: "+str(stat6)) - print("") - - return data - - -def info(dataset, step): - """Print basic informations about current dataset""" - print(" * "+step+" → we now have "+str(len(dataset))+" rows") - - -def store(dataset, name): - """Store dataset to csv""" - global RESULT_FOLDER - dataset.to_csv(RESULT_FOLDER+"/"+name, index=False) - - -if __name__ == '__main__': - main() diff --git a/build_tm.py b/build_tm.py index 2883db6..6031900 100755 --- a/build_tm.py +++ b/build_tm.py @@ -32,15 +32,15 @@ def main(): args = parser.parse_args() - release_folder = "./tm/f{v}/".format(v=args.release) + release_folder = "./results/f{v}/".format(v=args.release) lang_path = os.path.join(release_folder, "languages/") packages_path = os.path.join(release_folder, "packages/") tm_folder = os.path.join(release_folder, "out/") # Step 1: compute the list of languages if args.refresh: - print("Refresh the list of languages") - rmtree(lang_path) + print("Refreshing the list of languages") + rmtree(lang_path, ignore_errors=True) os.mkdir(lang_path) start_time_search = time.time() From 00abb611daf19c211e2e1e834105c30c73fa2a73 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 19 2020 18:15:45 +0000 Subject: [PATCH 2/5] move language detection in a dedicated file this language detection should be used to build per package and per language stats --- diff --git a/README.md b/README.md index 45b483a..5cb11ac 100644 --- a/README.md +++ b/README.md @@ -28,10 +28,16 @@ podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedloc with `$script`, one of the following: -## Compute data +## Get the source packages `./build.py` get srpm lists, apply discover and compute progression stats +## Detect languages + +`./build_language_list.py` + +For each package, produce progression stats. 
+ ## Produce per package stats `./build_packages_stats.py` diff --git a/build_language_list.py b/build_language_list.py new file mode 100755 index 0000000..5d0098b --- /dev/null +++ b/build_language_list.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +""" Parse translation files to deduct language list """ + +import argparse +import json +import os +import time +import polib + +from shutil import rmtree +from weblate_language_data import aliases, languages, language_codes + +def main(): + """Handle params""" + + parser = argparse.ArgumentParser( + description="Creates compendium for every languages") + + parser.add_argument("--release", required=True, type=int, default=31, + choices=[30, 31, 32], + help="Provide the Fedora release to analyze") + + parser.add_argument("--refresh", action="store_true", + help="Refresh list of available languages to analyze") + + args = parser.parse_args() + + release_folder = "./results/f{v}/".format(v=args.release) + lang_path = os.path.join(release_folder, "languages/") + packages_path = os.path.join(release_folder, "packages/") + + print("Refreshing the list of languages") + rmtree(lang_path, ignore_errors=True) + os.mkdir(lang_path) + + start_time_search = time.time() + + po_langs = detect_languages(packages_path) + + for lang in po_langs.keys(): + with open(os.path.join(lang_path, lang + '.json'), 'w') as f: + f.write(json.dumps(po_langs[lang], indent=2)) + + search_duration = round(time.time() - start_time_search, 1) + print(" Done in {d} seconds".format(d=search_duration)) + + +def detect_languages(tm_folder): + """ For each po file, detect metadatas and deduct the language """ + """ Requires: a file hierarchy with po files """ + """ Returns: a dictionary of lists, key=lang code, value=file list """ + langs = {} + + for root, directories, files in os.walk(tm_folder): + for file in files: + racine, ext = os.path.splitext(file) + if ext == ".po": + metadata = dict() + error = "" + try: + metadata = polib.pofile(os.path.join(root, file)).metadata + except UnicodeDecodeError as e: + # encoding error, to investigate before using it in TM + error = "error-unicode" + except OSError as e: + # maybe a polib bug? 
to investigate before using it in TM + error = "error-os" + + lang = choose_lang(racine, metadata, error) + + try: + langs[lang].append(os.path.join(root, file)) + except KeyError: + langs[lang] = list() + langs[lang].append(os.path.join(root, file)) + + return langs + +def choose_lang(filename, metadata, error): + """ From a po file and its medata, choose the most likely language code """ + """ By priority: the Language medata """ + """ Returns: a language code """ + + lang = "" + file_name = filename.lower() + meta_language = "" + meta_team = "" + try: + meta_language = metadata.get("Language").lower() + except AttributeError: + pass + + try: + meta_team = metadata.get("Language-Team").lower() + except AttributeError: + pass + + if meta_language in language_codes.LANGUAGES: + lang = meta_language + + elif file_name in language_codes.LANGUAGES: + lang = file_name + else: + lang = "noresult" + + # try languages (some codes here are exclused from languages_codes) + if lang == "noresult": + loc = [ lang[0] for lang in languages.LANGUAGES ] + + if meta_language in loc: + lang = meta_language + elif file_name in loc: + lang = file_name + + # try ALIASES + if lang == "noresult": + if meta_language in aliases.ALIASES.keys(): + lang = aliases.ALIASES[meta_language] + elif file_name in aliases.ALIASES.keys(): + lang = aliases.ALIASES[file_name] + else: + lang = "error" + + return lang + + +if __name__ == '__main__': + main() + diff --git a/build_tm.py b/build_tm.py index 6031900..917e2df 100755 --- a/build_tm.py +++ b/build_tm.py @@ -2,18 +2,12 @@ """Consolidate each po files into compendium""" import argparse -import glob import json import os -import polib import subprocess import tempfile import time -from shutil import copyfile -from shutil import rmtree -from weblate_language_data import aliases, languages, language_codes - def main(): """Handle params""" @@ -27,34 +21,17 @@ def main(): parser.add_argument("--lang", required=False, type=str, help="Filter a language to analyze") - parser.add_argument("--refresh", action="store_true", - help="Refresh list of available languages to analyze") - args = parser.parse_args() release_folder = "./results/f{v}/".format(v=args.release) lang_path = os.path.join(release_folder, "languages/") packages_path = os.path.join(release_folder, "packages/") tm_folder = os.path.join(release_folder, "out/") + os.mkdir(tm_folder) - # Step 1: compute the list of languages - if args.refresh: - print("Refreshing the list of languages") - rmtree(lang_path, ignore_errors=True) - os.mkdir(lang_path) - - start_time_search = time.time() - - po_langs = detect_languages(packages_path) + print("Building the translation memory for every languages") + start_time_search = time.time() - for lang in po_langs.keys(): - with open(os.path.join(lang_path, lang + '.json'), 'w') as f: - f.write(json.dumps(po_langs[lang], indent=2)) - - search_duration = round(time.time() - start_time_search, 1) - print(" Done in {d} seconds".format(d=search_duration)) - - # Step 2: call TM activities if args.lang: with open(os.path.join(lang_path, args.lang + ".json"), "r") as read_file: files = json.load(read_file) @@ -69,95 +46,20 @@ def main(): compute_lang(lang[:-len('.json')], files, tm_folder) -def detect_languages(tm_folder): - """ For each po file, detect metadatas and deduct the language """ - """ Requires: a file hierarchy with po files """ - """ Returns: a dictionary of lists, key=lang code, value=file list """ - langs = {} - - for root, directories, files in os.walk(tm_folder): - for file in 
files: - racine, ext = os.path.splitext(file) - if ext == ".po": - metadata = dict() - error = "" - try: - metadata = polib.pofile(os.path.join(root, file)).metadata - except UnicodeDecodeError as e: - # encoding error, to investigate before using it in TM - error = "error-unicode" - except OSError as e: - # maybe a polib bug? to investigate before using it in TM - error = "error-os" + search_duration = round(time.time() - start_time_search, 1) + print(" Done in {d} seconds".format(d=search_duration)) - lang = choose_lang(racine, metadata, error) - - try: - langs[lang].append(os.path.join(root, file)) - except KeyError: - langs[lang] = list() - langs[lang].append(os.path.join(root, file)) - - return langs - -def choose_lang(filename, metadata, error): - """ From a po file and its medata, choose the most likely language code """ - """ By priority: the Language medata """ - """ Returns: a language code """ - - lang = "" - file_name = filename.lower() - meta_language = "" - meta_team = "" - try: - meta_language = metadata.get("Language").lower() - except AttributeError: - pass - - try: - meta_team = metadata.get("Language-Team").lower() - except AttributeError: - pass - - if meta_language in language_codes.LANGUAGES: - lang = meta_language - - elif file_name in language_codes.LANGUAGES: - lang = file_name - else: - lang = "noresult" - - # try languages (some codes here are exclused from languages_codes) - if lang == "noresult": - loc = [ lang[0] for lang in languages.LANGUAGES ] - - if meta_language in loc: - lang = meta_language - elif file_name in loc: - lang = file_name - - # try ALIASES - if lang == "noresult": - if meta_language in aliases.ALIASES.keys(): - lang = aliases.ALIASES[meta_language] - elif file_name in aliases.ALIASES.keys(): - lang = aliases.ALIASES[file_name] - else: - lang = "error" - - return lang def compute_lang(lang, langfiles, tm_folder): """ Generate compendium and convert it to tmx """ """ """ - print("Computing: " + lang) + print(" Computing: " + lang) # po consolidation compendium_file = tm_folder + lang + ".po" compendium_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), compendium_file) pofiles = [os.path.join(os.path.dirname(os.path.abspath(__file__)),f) for f in langfiles] - count = 0 with tempfile.TemporaryDirectory(prefix="l10n-tm") as tmp: From 745c1c81126122a9cefc7a0699771227966f8bac Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 19 2020 20:48:26 +0000 Subject: [PATCH 3/5] use aliases to reduce the number of languages --- diff --git a/build_language_list.py b/build_language_list.py index 5d0098b..8e9a56c 100755 --- a/build_language_list.py +++ b/build_language_list.py @@ -14,14 +14,14 @@ def main(): """Handle params""" parser = argparse.ArgumentParser( - description="Creates compendium for every languages") + description="Creates a list of languages form translation files") parser.add_argument("--release", required=True, type=int, default=31, choices=[30, 31, 32], help="Provide the Fedora release to analyze") parser.add_argument("--refresh", action="store_true", - help="Refresh list of available languages to analyze") + help="Force refresh") args = parser.parse_args() @@ -108,9 +108,9 @@ def choose_lang(filename, metadata, error): loc = [ lang[0] for lang in languages.LANGUAGES ] if meta_language in loc: - lang = meta_language + lang = meta_language.lower() elif file_name in loc: - lang = file_name + lang = file_name.lower() # try ALIASES if lang == "noresult": @@ -121,6 +121,10 @@ def choose_lang(filename, metadata, error): 
else: lang = "error" + # harmonization (example: mo = ro_MD) + if lang in aliases.ALIASES.keys(): + lang = aliases.ALIASES[lang] + return lang From ca4aa1bd185c3dc12ed8b0c288d48f0727a59087 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 19 2020 20:50:05 +0000 Subject: [PATCH 4/5] only generate if no file and add refresh option --- diff --git a/build_tm.py b/build_tm.py index 917e2df..062318d 100755 --- a/build_tm.py +++ b/build_tm.py @@ -18,6 +18,9 @@ def main(): choices=[30, 31, 32], help="Provide the Fedora release to analyze") + parser.add_argument("--refresh", action="store_true", + help="Force refresh of files") + parser.add_argument("--lang", required=False, type=str, help="Filter a language to analyze") @@ -27,7 +30,7 @@ def main(): lang_path = os.path.join(release_folder, "languages/") packages_path = os.path.join(release_folder, "packages/") tm_folder = os.path.join(release_folder, "out/") - os.mkdir(tm_folder) + os.makedirs(tm_folder, exist_ok=True) print("Building the translation memory for every languages") start_time_search = time.time() @@ -44,13 +47,13 @@ def main(): with open(os.path.join(lang_path, lang), "r") as read_file: files = json.load(read_file) - compute_lang(lang[:-len('.json')], files, tm_folder) + compute_lang(lang[:-len('.json')], files, tm_folder, args.refresh) search_duration = round(time.time() - start_time_search, 1) print(" Done in {d} seconds".format(d=search_duration)) -def compute_lang(lang, langfiles, tm_folder): +def compute_lang(lang, langfiles, tm_folder, refresh): """ Generate compendium and convert it to tmx """ """ """ print(" Computing: " + lang) @@ -59,44 +62,47 @@ def compute_lang(lang, langfiles, tm_folder): compendium_file = tm_folder + lang + ".po" compendium_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), compendium_file) - pofiles = [os.path.join(os.path.dirname(os.path.abspath(__file__)),f) for f in langfiles] - count = 0 + if not os.path.isfile(compendium_file) or refresh is True: + pofiles = [os.path.join(os.path.dirname(os.path.abspath(__file__)),f) for f in langfiles] + count = 0 - with tempfile.TemporaryDirectory(prefix="l10n-tm") as tmp: - for i in pofiles: - try: - command = ["msguniq", i, "--output-file", count.__str__(), "--no-location"] - subprocess.run(command, check=True, cwd=tmp, capture_output=True) - except subprocess.CalledProcessError as e: + with tempfile.TemporaryDirectory(prefix="l10n-tm") as tmp: + for i in pofiles: try: - command = ["msguniq", i, "--output-file", count.__str__(), "--to-code", "utf-8", "--no-location"] + command = ["msguniq", i, "--output-file", count.__str__(), "--no-location"] subprocess.run(command, check=True, cwd=tmp, capture_output=True) except subprocess.CalledProcessError as e: - print("Error with msguniq {i}, error: {e}".format(i=i, e=e)) + try: + command = ["msguniq", i, "--output-file", count.__str__(), "--to-code", "utf-8", "--no-location"] + subprocess.run(command, check=True, cwd=tmp, capture_output=True) + except subprocess.CalledProcessError as e: + print("Error with msguniq {i}, error: {e}".format(i=i, e=e)) - count += 1 + count += 1 - onlyfiles = [f for f in os.listdir(tmp) if os.path.isfile(os.path.join(tmp, f))] - command = ["msgcat", "--force-po", "--no-location", "--output-file", compendium_file] + onlyfiles + onlyfiles = [f for f in os.listdir(tmp) if os.path.isfile(os.path.join(tmp, f))] + command = ["msgcat", "--force-po", "--no-location", "--output-file", compendium_file] + onlyfiles - try: - subprocess.run(command, check=True, cwd=tmp, 
capture_output=True) - except subprocess.CalledProcessError as e: - print(" msgcat exception...") + try: + subprocess.run(command, check=True, cwd=tmp, capture_output=True) + except subprocess.CalledProcessError as e: + print(" msgcat exception...") # po to tmx convertion tmx_file = tm_folder + lang + ".tmx" command = ["po2tmx", "--language="+lang, "--progress=none", compendium_file, "--output="+tmx_file] - subprocess.run(command, check=True, capture_output=True) + if not os.path.isfile(tmx_file) or refresh is True: + subprocess.run(command, check=True, capture_output=True) # language terminology terminology_file = tm_folder + lang + ".terminology.po" command = ["poterminology", "--ignore-case", "--fold-titlecase", "--inputs-needed", "1", "--progress=none", compendium_file, "--output="+terminology_file] - subprocess.run(command, check=True, capture_output=True) + if not os.path.isfile(tmx_file) or refresh is True: + subprocess.run(command, check=True, capture_output=True) if __name__ == '__main__': main() From 9e2222286b7aecc02f96f64e52fd08cffd6ee1a3 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 20 2020 05:06:45 +0000 Subject: [PATCH 5/5] mising lower on some language code --- diff --git a/build_language_list.py b/build_language_list.py index 8e9a56c..f7f03fb 100755 --- a/build_language_list.py +++ b/build_language_list.py @@ -115,15 +115,15 @@ def choose_lang(filename, metadata, error): # try ALIASES if lang == "noresult": if meta_language in aliases.ALIASES.keys(): - lang = aliases.ALIASES[meta_language] + lang = aliases.ALIASES[meta_language].lower() elif file_name in aliases.ALIASES.keys(): - lang = aliases.ALIASES[file_name] + lang = aliases.ALIASES[file_name].lower() else: lang = "error" # harmonization (example: mo = ro_MD) if lang in aliases.ALIASES.keys(): - lang = aliases.ALIASES[lang] + lang = aliases.ALIASES[lang].lower() return lang