| |
@@ -1,457 +0,0 @@
|
| |
- #!/usr/bin/env python3
|
| |
- """Consolidate and clean result files"""
|
| |
-
|
| |
- import argparse
|
| |
- import csv
|
| |
- import itertools
|
| |
- import json
|
| |
- import os
|
| |
- import pandas
|
| |
- import time
|
| |
-
|
| |
# Output folder for every generated CSV; filled in by main() from --release.
RESULT_FOLDER = ""
|
| |
-
|
| |
-
|
| |
def main():
    """Parse command-line options and drive the consolidation.

    Concatenates every package's stats.csv into RESULT_FOLDER, then reads
    ./results/f<release>/languages/*.json and computes one stats file per
    language.
    """
    global RESULT_FOLDER

    parser = argparse.ArgumentParser(
        description="Consolidate every result files and produce a clean concatenated update")
    # NOTE: the option is required, so a `default=` value would never be
    # used — the dead `default=31` was removed.
    parser.add_argument("--release", required=True, type=int,
                        choices=[30, 31, 32],
                        help="Provide the Fedora release to analyze")

    args = parser.parse_args()

    lang_path = "./results/f{r}/languages/".format(r=args.release)
    RESULT_FOLDER = "./results/f{r}/stats/".format(r=args.release)
    packages_folder = "./results/f{r}/packages/".format(r=args.release)

    concat_csv(packages_folder, RESULT_FOLDER)

    # The concatenated file lands in RESULT_FOLDER/_concat.csv; uncomment
    # to run the full pandas cleaning pipeline on it.
    # parse(RESULT_FOLDER + "/_concat.csv")

    # one JSON file per language, each listing the po files for that language
    langs = [f for f in os.listdir(lang_path)
             if os.path.isfile(os.path.join(lang_path, f))]

    for lang in sorted(langs):
        lang_code = lang[:-len('.json')]

        with open(os.path.join(lang_path, lang), "r") as read_file:
            files = json.load(read_file)

        compute_lang(lang_code, files, RESULT_FOLDER, packages_folder)
|
| |
-
|
| |
-
|
| |
def compute_lang(lang_code, files, RESULT_FOLDER, packages_folder):
    """Collect the per-package stats rows for one language and write them
    to <RESULT_FOLDER>/<lang_code>.stats.csv.

    :param lang_code: language code, used to name the output file
    :param files: po file paths (all under packages_folder) for this language
    :param RESULT_FOLDER: output folder for the per-language CSV
    :param packages_folder: root folder containing one subfolder per package
    """
    print("Computing: {l} ({c} files)".format(l=lang_code, c=len(files)))
    start_time_search = time.time()
    stats = []
    packages = {}

    # step 1: group po files by package (first path component)
    for file in files:
        po_file = file.replace(packages_folder, "")
        package = po_file.split("/")[0]
        po_file = po_file.replace(package + "/", "")
        packages.setdefault(package, []).append(po_file)

    # step 2: remove duplicates (sets also give O(1) membership in step 3)
    for package in packages.keys():
        packages[package] = set(packages[package])

    # step 3: pick the matching rows out of each package's stats.csv
    for package, po_files in packages.items():
        stats_file = os.path.join(packages_folder, package, "stats.csv")

        with open(stats_file, newline='') as csvfile:
            csv_dict_reader = csv.DictReader(csvfile)

            # BUG FIX: the original compared row["Filename"] against
            # `po_file`, the leftover loop variable from step 1 (i.e. the
            # last file processed), so almost every row was dropped. Match
            # against the package's whole file set instead.
            for row in csv_dict_reader:
                if row["Filename"] in po_files:
                    stats.append([package] + list(row.values()))

    # step 4: store results
    lang_stats_file = os.path.join(RESULT_FOLDER, lang_code + ".stats.csv")
    with open(lang_stats_file, 'w', newline='') as csvfile:
        spamwriter = csv.writer(csvfile)

        header = ['Package', 'Filename', ' Translated Messages', ' Translated Source Words', ' Translated Target Words', ' Fuzzy Messages', ' Fuzzy Source Words', ' Untranslated Messages', ' Untranslated Source Words', ' Total Message', ' Total Source Words', ' Review Messages', ' Review Source Words']

        spamwriter.writerow(header)

        for row in stats:
            spamwriter.writerow(row)

    search_duration = round(time.time() - start_time_search, 1)
    print(" Done in {d} seconds".format(d=search_duration))
|
| |
-
|
| |
-
|
| |
def parse(file):
    """Run the whole cleaning pipeline on the concatenated CSV `file`."""
    column_names = ['src', 'filename',
                    'translatedMessages',
                    'translatedSourceWords',
                    'translatedTargetWords',
                    'fuzzyMessages',
                    'fuzzySourceWords',
                    'untranslatedMessages',
                    'untranslatedSourceWords',
                    'totalMessage',
                    'totalSourceWords',
                    'reviewMessages',
                    'reviewSourceWords']

    data = pandas.read_csv(file)
    data.columns = column_names
    info(data, "CSV loaded")

    data = clean_0_basic(data)

    data = clean_1_dupplication(data)
    info(data, "Deduplication is done")

    # once duplicated headers are gone, every counter can become an int
    counter_columns = ['translatedMessages',
                       'translatedSourceWords',
                       'translatedTargetWords',
                       'fuzzyMessages',
                       'fuzzySourceWords',
                       'untranslatedMessages',
                       'untranslatedSourceWords',
                       'totalMessage',
                       'totalSourceWords']
    data = data.astype({name: int for name in counter_columns})

    data = clean_2_remove(data)
    info(data, "Removal is done")

    data = guess_bcp47(data)
    info(data, "bcp47 data are guessed")

    data = clean_bcp47(data)
    info(data, "bcp47 data are cleaned")

    data = add_cldr(data)
    info(data, "cldr data are added")

    data = clean_cldr(data)
    info(data, "cldr data are cleaned")

    data = check_lang_territory_consistency(data)
    info(data, "cldr consistency are done")

    data = harmonize_totals(data)
    info(data, "data harmonization are done")

    data = summary(data)

    store(data, "3.result.csv")
|
| |
-
|
| |
-
|
| |
def clean_0_basic(data):
    """String-clean every column and drop the useless 'review' columns.

    All columns are expected to still be strings at this point (the int
    conversion happens later in parse()).
    """
    # remove useless spaces and use lowercase
    for column in list(data):
        data[column] = data[column].str.strip()
        data[column] = data[column].str.lower()

    # these columns never have values; `drop(label, 1)` used the
    # positional `axis` argument, which was removed in pandas 2.0 —
    # use the explicit `columns=` form instead
    data = data.drop(columns=['reviewMessages', 'reviewSourceWords'])

    return data
|
| |
-
|
| |
-
|
| |
def clean_1_dupplication(data):
    """Drop duplicated rows (leftover headers and repeated po files)."""
    # every per-package stats file contributed its own header row
    without_headers = data[data.filename != "filename"]
    info(without_headers, "* duplicated headers are removed")

    # keep only the last stats row per (src, filename) couple
    deduplicated = without_headers.drop_duplicates(['src', 'filename'],
                                                   keep='last')
    info(deduplicated, "* duplicated po files are removed")

    return deduplicated
|
| |
-
|
| |
-
|
| |
def clean_2_remove(data):
    """Drop rows that obviously carry no translation data."""
    # .pot files are translation templates, not translations
    data = data[~data.filename.str.endswith(".pot")]
    info(data, "* remove pot files")

    # .gmo files are compiled catalogs, not sources
    data = data[~data.filename.str.endswith(".gmo")]
    info(data, "* remove gmo files")

    # keep a debug trace of the empty files, then drop them
    empty_files = data[data.totalMessage == 0]
    store(empty_files, "1.debug.total message = 0.csv")
    data = data[data.totalMessage != 0]
    info(data, "* remove files with 'totalMessage'=0")

    return data
|
| |
-
|
| |
-
|
| |
def guess_bcp47(data):
    """Guess language, territory and script from each row's filename.

    Derives temporary `basename`/`full_lang`/`lang` columns, produces the
    final `language`, `territory` and `script` columns, and drops rows
    whose "language" is really just a re-encoded duplicate.
    """
    data['basename'] = data['filename'].apply(os.path.basename)

    # strip the file extension ("fr_FR@latin.po" -> "fr_FR@latin");
    # `n` must be passed by keyword — the positional form was removed
    # from Series.str.rsplit in pandas 2.0
    data['full_lang'] = data['basename'].str.rsplit('.', n=1, expand=True)[0]

    # a few lang naming are wrong
    data.full_lang = data.full_lang.replace(
        {'kmr_latn': 'kmr@latn', 'fr-braille': 'fr@braille', })

    # split "lang@script" once instead of two identical rsplit calls
    lang_and_script = data['full_lang'].str.rsplit('@', n=1, expand=True)
    data['lang'] = lang_and_script[0]
    data['script'] = lang_and_script[1]

    # these are just re-encoded translations
    data = data[~(
        data.lang.str.endswith(".big5") |
        data.lang.str.endswith(".cp936") |
        data.lang.str.endswith(".cp1250") |
        data.lang.str.endswith(".cp1251") |
        data.lang.str.endswith(".euc-jp") |
        data.lang.str.endswith(".gb2312") |
        data.lang.str.endswith(".sjis") |
        data.lang.str.endswith(".utf-8")
    )]
    info(data, "* remove if lang endswith encoding values")

    # anything left with a point in its code is also a re-encoding artefact
    store(data[data.lang.str.contains(".", regex=False)],
          '0.error.lang with point.csv')
    data = data[~(data.lang.str.contains(".", regex=False))]
    info(data, "* remove if lang contains a point")

    # split "language_territory"
    lang_and_territory = data['lang'].str.rsplit('_', n=1, expand=True)
    data['language'] = lang_and_territory[0]
    data['territory'] = lang_and_territory[1]

    # store all unique values for debug
    store(data.drop_duplicates('lang', keep='last'), "1.debug.lang.csv")
    store(data.drop_duplicates('language', keep='last'), "1.debug.language.csv")
    store(data.drop_duplicates('territory', keep='last'), "1.debug.territory.csv")
    store(data.drop_duplicates('script', keep='last'), "1.debug.script.csv")

    # remove temporary columns (positional `axis` removed in pandas 2.0)
    data = data.drop(columns=['full_lang', 'lang'])

    return data
|
| |
-
|
| |
-
|
| |
def clean_bcp47(data):
    """Remove impossible values for language and territory."""
    # territory codes are at most 2 characters
    bad_territory = data.territory.str.len() > 2
    store(data[bad_territory], '0.error.len(territory)>2.csv')
    data = data[~bad_territory]
    info(data, "* remove if len(territory)>2")

    # language codes are at most 3 characters
    bad_language = data.language.str.len() > 3
    store(data[bad_language], '0.error.len(language)>3.csv')
    data = data[~bad_language]
    info(data, "* remove if len(language)>3")

    # a purely numeric "language" is an extraction artefact
    numeric_language = data.language.str.isdigit()
    store(data[numeric_language], '0.error.languages is numeric.csv')
    data = data[~numeric_language]
    info(data, "* remove if language.isdigit()")

    # normalize missing values and force string dtypes
    data.territory = data.territory.fillna('')
    data.script = data.script.fillna('')
    data = data.astype(
        {'territory': 'str', 'language': 'str', 'script': 'str'})

    return data
|
| |
-
|
| |
-
|
| |
def add_cldr(data):
    """Attach CLDR names for the language, script and territory codes.

    Loads the three CLDR reference tables, lower-cases their codes/names
    so they match our normalized data, left-joins each of them, and builds
    the `full_language_code` column (lang_territory@script).
    """
    cldr_language = pandas.read_csv("CLDR-raw/language.csv")
    cldr_language.name = cldr_language.name.str.lower()

    cldr_script = pandas.read_csv("CLDR-raw/script.csv")
    cldr_script.code = cldr_script.code.str.lower()
    cldr_script.name = cldr_script.name.str.lower()

    cldr_territory = pandas.read_csv("CLDR-raw/territory.csv")
    cldr_territory.code = cldr_territory.code.str.lower()
    cldr_territory.name = cldr_territory.name.str.lower()

    data = data.merge(cldr_language, how='left',
                      left_on='language', right_on='code')
    data = data.rename(columns={'name': 'language_name'})
    # positional `axis` was removed in pandas 2.0 — use columns=
    data = data.drop(columns=['code'])

    # `suffixes` entries must be str or None in modern pandas (False was
    # never a valid value); no columns actually overlap in these merges
    # because 'name'/'code' are renamed/dropped after each join
    data = data.merge(cldr_script, how='left', left_on='script',
                      right_on='code', suffixes=(None, 'script_'))
    data = data.rename(columns={'name': 'script_name'})
    data = data.drop(columns=['code'])

    data = data.merge(cldr_territory, how='left', left_on='territory',
                      right_on='code', suffixes=(None, 'territory_'))
    data = data.rename(columns={'name': 'territory_name'})
    data = data.drop(columns=['code'])

    data['full_language_code'] = data.apply(get_full_language_code, axis=1)

    return data
|
| |
-
|
| |
-
|
| |
def clean_cldr(data):
    """Remove rows whose language code is unknown to CLDR."""
    # a missing merged name means the code did not match any CLDR language
    unknown = data.language_name.isnull()
    store(data[unknown].drop_duplicates('language'),
          '0.error.language not in cldr.csv')
    data = data[~unknown]
    info(data, "* remove languages non existing in CLDR")

    return data
|
| |
-
|
| |
-
|
| |
def check_lang_territory_consistency(data):
    """Use CLDR population data per language/territory to flag suspicious
    couples (a territory with no recorded speakers of that language)."""
    cldr_data = pandas.read_csv("CLDR-raw/country_language_population_raw.txt",
                                sep="\t")
    cldr_data.CName = cldr_data.CName.str.lower()

    # in this file, Azerbaijani (Arabic) is written az_Arab, we only keep
    # lang for now (`n` must be passed by keyword in modern pandas)
    cldr_data['Language'] = cldr_data['Language'].str.rsplit(
        '_', n=1, expand=True)[0]

    cldr_data = cldr_data[["CName", 'Language']]
    # as we may have duplicated values now
    cldr_data = cldr_data.drop_duplicates()
    cldr_data = cldr_data.rename(
        columns={'CName': 'terr', 'Language': 'language'})

    # `suffixes` entries must be str or None in modern pandas (False was
    # never a valid value)
    data = data.merge(cldr_data, how='left',
                      left_on=('language', 'territory'),
                      right_on=('language', 'terr'),
                      suffixes=(None, '_cldr'))

    # rows with a territory but no matching population entry are suspicious
    error = data[['language', 'territory', 'terr']]
    error = error[~(error.territory.isnull())]
    error = error[~(error.territory == '')]
    error = error[error.terr.isnull()].drop_duplicates()
    store(error, '0.error.no population for this language-territory couple.csv')

    # positional `axis` was removed in pandas 2.0 — use columns=
    data = data.drop(columns=['terr'])

    return data
|
| |
-
|
| |
-
|
| |
def get_full_language_code(row):
    """Build the full language code as lang_territory@script.

    Territory and script are appended only when present (non-empty).
    """
    parts = [row.language]
    if row.territory:
        parts.append("_" + row.territory)
    if row.script:
        parts.append("@" + row.script)
    return "".join(parts)
|
| |
-
|
| |
-
|
| |
def clean_dirname(row):
    """Strip the trailing full_language_code from a row's dirname.

    Some projects nest the $lang.po file inside a $lang folder; removing
    the code makes dirnames comparable across languages.
    """
    code = row.full_language_code
    if row.dirname.endswith(code):
        return row.dirname[:-len(code)]
    return row.dirname
|
| |
-
|
| |
-
|
| |
def harmonize_totals(data):
    """Harmonize totals: po files may be outdated, hypothesis is that
    max(source strings) per (src, dirname) group is the truth.

    Any shortfall against the group maximum is counted as untranslated.
    """
    # there could be multiple translation files for one language on a same
    # project, so group by the file's directory
    data['dirname'] = data['filename'].apply(os.path.dirname)

    # sometimes, the $lang.po file is inside a $lang folder, remove this
    data['dirname'] = data.apply(clean_dirname, axis=1)

    data = _align_to_group_max(data, 'totalMessage', 'untranslatedMessages')
    data = _align_to_group_max(data, 'totalSourceWords',
                               'untranslatedSourceWords')

    return data


def _align_to_group_max(data, total_column, untranslated_column):
    """Raise `total_column` to its (src, dirname) group maximum and add the
    difference to `untranslated_column`."""
    max_column = total_column + "Max"
    tmp = data.groupby(['src', 'dirname'])[total_column].max().rename(
        max_column).reset_index()
    data = data.merge(tmp)
    data[untranslated_column] += data[max_column] - data[total_column]
    data[total_column] = data[max_column]
    # positional `axis` was removed in pandas 2.0 — use columns=
    return data.drop(columns=[max_column])
|
| |
-
|
| |
-
|
| |
def summary(data):
    """Print a short overview of the consolidated dataset, return it as-is."""
    distinct_locales = data[['language', 'territory', 'script']
                            ].drop_duplicates().count().max()
    sources = data['src'].drop_duplicates().count()
    translation_files = data[['src', 'filename']].drop_duplicates().count().max()
    languages = data['language'].drop_duplicates().count()
    total_messages = data.groupby(['src', 'dirname'])['totalMessage'].max().sum()
    total_words = data.groupby(['src', 'dirname'])['totalSourceWords'].max().sum()

    print("")
    print("We have:")
    print(" * number of upstream sources: " + str(sources))
    print(" * number of distinct lang-script-territory: " + str(distinct_locales))
    print(" * number of languages: " + str(languages))
    print(" * translation files: " + str(translation_files))
    print("This represents:")
    print(" * Total messages: " + str(total_messages))
    print(" * Total words: " + str(total_words))
    print("")

    return data
|
| |
-
|
| |
-
|
| |
def info(dataset, step):
    """Print the current row count after the given pipeline step."""
    row_count = len(dataset)
    print(" * " + step + " → we now have " + str(row_count) + " rows")
|
| |
-
|
| |
-
|
| |
def store(dataset, name):
    """Write the dataset as CSV into the module-wide RESULT_FOLDER."""
    # reading a module global needs no `global` statement
    destination = RESULT_FOLDER + "/" + name
    dataset.to_csv(destination, index=False)
|
| |
-
|
| |
-
|
| |
def concat_csv(packages_folder, stats_folder):
    """Append every per-package stats.csv into <stats_folder>/_concat.csv.

    Packages without a stats.csv are silently skipped.
    """
    package_dirs = [entry for entry in os.listdir(packages_folder)
                    if os.path.isdir(os.path.join(packages_folder, entry))]

    target = os.path.join(stats_folder, "_concat.csv")
    with open(target, "w") as outfile:
        for package in package_dirs:
            stats_path = os.path.join(packages_folder, package, "stats.csv")
            try:
                with open(stats_path) as infile:
                    outfile.writelines(infile)
            except FileNotFoundError:
                # best-effort: not every package produced stats
                pass
|
| |
-
|
| |
# Script entry point: run only when executed directly, not on import.
if __name__ == '__main__':
    main()
|
| |