| |
@@ -1,5 +1,5 @@
|
| |
#!/usr/bin/env python3
|
| |
- """Consolidate each po files into compendium"""
|
| |
+ """ Generate static asciidoc pages from generated statistics """
|
| |
|
| |
import argparse
|
| |
import datetime
|
| |
@@ -15,10 +15,31 @@
|
| |
import utils
|
| |
|
| |
|
| |
def get_territories_for_language(language_name: str, cldr_languages: dict) -> list:
    """ Return the CLDR territory codes where a language is spoken.

    The language name may carry a territory and/or variant suffix
    (e.g. "ro_MD", "zh_Hant_HK", "ca@valencia"); only the bare language
    code is looked up in the CLDR data, secondary usage included.

    :param language_name: language code, possibly suffixed
    :param cldr_languages: CLDR "languageData" mapping
    :return: territory codes, or ["not-found-in-cldr"] when unknown
    """
    log = logging.getLogger("buildWebsite.get_territory")

    code = language_name.split("_", 1)[0]  # ro_MD or zh_Hant_HK
    code = code.split("@", 1)[0]  # ca@valencia

    territories = cldr_languages.get(code, {}).get("_territories", [])
    territories = territories + cldr_languages.get(code + "-alt-secondary", {}).get("_territories", [])

    # if language contains a territory code, then only keep this one
    # (split computed once instead of three times)
    parts = language_name.split("_")
    if len(parts) > 1 and parts[-1] in territories:
        territories = [parts[-1]]

    if not territories:
        territories = ["not-found-in-cldr"]
        # lazy %-style args: the message is only formatted when emitted
        log.warning("The language %s does not exist in territories data from CLDR", code)

    return territories
|
| |
+
|
| |
+
|
| |
def main():
|
| |
"""Handle params"""
|
| |
|
| |
- parser = argparse.ArgumentParser(description="")
|
| |
+ parser = argparse.ArgumentParser(description="Generate static asciidoc pages from generated statistics")
|
| |
|
| |
parser.add_argument(
|
| |
"--results",
|
| |
@@ -45,26 +66,20 @@
|
| |
utils.set_logging(args.verbose, args.results)
|
| |
log = logging.getLogger("buildWebsite")
|
| |
|
| |
- results_folder = "./results/{v}/".format(v=args.results)
|
| |
- langs_log = os.path.join(results_folder, "build_language_list.log")
|
| |
- langs_stats = os.path.join(results_folder, "languages-stats")
|
| |
- packages_stats = os.path.join(results_folder, "packages-stats")
|
| |
-
|
| |
- data_langs_folder = os.path.join(results_folder, "languages-website")
|
| |
- data_pkgs_folder = os.path.join(results_folder, "packages-website")
|
| |
+ results_folder = f"./results/{args.results}/"
|
| |
+ langs_stats = os.path.join(results_folder, "languages")
|
| |
+ packages_stats = os.path.join(results_folder, "packages")
|
| |
|
| |
tm_folder = os.path.join(results_folder, "languages-tm")
|
| |
|
| |
- static_folder = "./website/content/{v}/".format(v=args.results)
|
| |
+ static_folder = f"./website/content/{args.results}/"
|
| |
static_territories_folder = "./website/content/territories"
|
| |
static_langs_folder = os.path.join(static_folder, "language")
|
| |
static_pkgs_folder = os.path.join(static_folder, "package")
|
| |
- static_tm_folder = "./website/static/{v}/".format(v=args.results)
|
| |
+ static_tm_folder = f"./website/static/{args.results}/"
|
| |
|
| |
# clean destination folders
|
| |
for folder in [
|
| |
- data_langs_folder,
|
| |
- data_pkgs_folder,
|
| |
static_langs_folder,
|
| |
static_pkgs_folder,
|
| |
static_tm_folder,
|
| |
@@ -77,61 +92,10 @@
|
| |
log.info("Get distribution stats")
|
| |
distribution_stats = json.load(open(os.path.join(results_folder, "release.json")))
|
| |
|
| |
- log.info("Prepare json files for packages")
|
| |
- packages = [
|
| |
- d
|
| |
- for d in os.listdir(packages_stats)
|
| |
- if os.path.isfile(os.path.join(packages_stats, d))
|
| |
- ]
|
| |
- log_files = pd.read_csv(langs_log, header=None, skipinitialspace=True)
|
| |
- log_files = log_files.iloc[:, [0, 4]]
|
| |
- log_files.columns = ["filename", "lang_code"]
|
| |
-
|
| |
- packages_langs_results = dict()
|
| |
- count = 0
|
| |
- total = len(packages)
|
| |
- for package in sorted(packages):
|
| |
- count += 1
|
| |
- log.debug("Preparing package {c}/{t} - {p}".format(c=count, t=total, p=package))
|
| |
- package_name = package[: -len(".json")]
|
| |
- package_statistics_file = os.path.join(data_pkgs_folder, package_name + ".json")
|
| |
- file_stats = os.path.join(packages_stats, package_name + ".json")
|
| |
-
|
| |
- results = consolidate_package_stats(file_stats, log_files, os.path.join(results_folder, "package", package_name))
|
| |
- store_json_file(results, package_statistics_file)
|
| |
- for lang in results.get("stats", []):
|
| |
- val = packages_langs_results.get(lang["lang_code"], [])
|
| |
- val.append(
|
| |
- {
|
| |
- "name": package_name,
|
| |
- "progress": lang["progress"],
|
| |
- "translated": lang["translated"],
|
| |
- "team": lang["team"],
|
| |
- }
|
| |
- )
|
| |
- packages_langs_results[lang["lang_code"]] = val
|
| |
-
|
| |
- log.info("Prepare json files for languages")
|
| |
- languages = [
|
| |
- f
|
| |
- for f in os.listdir(langs_stats)
|
| |
- if os.path.isfile(os.path.join(langs_stats, f))
|
| |
- ]
|
| |
- for lang in sorted(languages):
|
| |
- if lang.endswith(".json"):
|
| |
- code = lang[: -len(".json")]
|
| |
- package_statistics_file = os.path.join(data_langs_folder, code + ".json")
|
| |
-
|
| |
- if os.path.isfile(package_statistics_file):
|
| |
- continue
|
| |
-
|
| |
- results = consolidate_language_stats(os.path.join(langs_stats, lang), distribution_stats)
|
| |
- results["packages"] = packages_langs_results.get(code, dict())
|
| |
- store_json_file(results, package_statistics_file)
|
| |
-
|
| |
log.info("Load CLDR data")
|
| |
with open("CLDR-raw/languageData.json", "r") as read_file:
|
| |
cldr_languages = json.load(read_file)
|
| |
+ cldr_version = cldr_languages["supplemental"]["version"]["_cldrVersion"]
|
| |
cldr_languages = cldr_languages["supplemental"]["languageData"]
|
| |
|
| |
with open("CLDR-raw/territories.json", "r") as read_file:
|
| |
@@ -145,53 +109,38 @@
|
| |
log.info("Generate static content for languages")
|
| |
languages = [
|
| |
f
|
| |
- for f in os.listdir(data_langs_folder)
|
| |
- if os.path.isfile(os.path.join(data_langs_folder, f))
|
| |
+ for f in os.listdir(langs_stats)
|
| |
+ if os.path.isfile(os.path.join(langs_stats, f))
|
| |
]
|
| |
- for lang in sorted(languages):
|
| |
- code = lang[: -len(".json")]
|
| |
- package_statistics_file = os.path.join(static_langs_folder, code + ".adoc")
|
| |
+ for language_file in sorted(languages):
|
| |
+ language = language_file[: -len(".json")]
|
| |
+ stats_file = os.path.join(langs_stats, language_file)
|
| |
+ destination_file = os.path.join(static_langs_folder, f"{language}.adoc")
|
| |
|
| |
- if os.path.isfile(package_statistics_file):
|
| |
- continue
|
| |
-
|
| |
- with open(os.path.join(data_langs_folder, lang), "r") as read_file:
|
| |
+ with open(stats_file, "r") as read_file:
|
| |
content = json.load(read_file)
|
| |
|
| |
- pd.DataFrame(content["packages"]).to_csv(os.path.join(static_tm_folder, f"{code}.csv"), index=False)
|
| |
-
|
| |
- cldr_code = code.split("_", 1)[0] # ro_MD or zh_Hant_HK
|
| |
- cldr_code = cldr_code.split("@", 1)[0] # ca@valencia
|
| |
-
|
| |
- territories = cldr_languages.get(cldr_code, {}).get("_territories", []) \
|
| |
- + cldr_languages.get(cldr_code + "-alt-secondary", {}).get("_territories", [])
|
| |
-
|
| |
- # if language contains a territory code, then only keep this one
|
| |
- if len(code.split("_")) > 1:
|
| |
- if code.split("_")[-1] in territories:
|
| |
- territories = [code.split("_")[-1]]
|
| |
+ pd.DataFrame\
|
| |
+ .from_dict(content['packages'], orient="index")\
|
| |
+ .to_csv(os.path.join(static_tm_folder, f"{language}.csv"), index_label="package")
|
| |
|
| |
- if len(territories) == 0:
|
| |
- log.warning("The language {l} does not exist in territories data from CLDR".format(l=code))
|
| |
- generate_static_pages_langs(args.results, code, content, package_statistics_file, territories, tm_folder, static_tm_folder)
|
| |
+ territories = get_territories_for_language(language, cldr_languages)
|
| |
+ generate_static_pages_langs(args.results, language, content, destination_file, territories, tm_folder, static_tm_folder)
|
| |
|
| |
log.info("Generate static content for packages")
|
| |
packages = [
|
| |
f
|
| |
- for f in os.listdir(data_pkgs_folder)
|
| |
- if os.path.isfile(os.path.join(data_pkgs_folder, f))
|
| |
+ for f in os.listdir(packages_stats)
|
| |
+ if os.path.isdir(os.path.join(packages_stats, f))
|
| |
]
|
| |
for package in sorted(packages):
|
| |
- code = package[: -len(".json")]
|
| |
- package_statistics_file = os.path.join(static_pkgs_folder, code + ".adoc")
|
| |
+ stats_file = os.path.join(packages_stats, package, "stats.json")
|
| |
+ destination_file = os.path.join(static_pkgs_folder, f"{package}.adoc")
|
| |
|
| |
- if os.path.isfile(package_statistics_file):
|
| |
- continue
|
| |
-
|
| |
- with open(os.path.join(data_pkgs_folder, package), "r") as read_file:
|
| |
+ with open(stats_file, "r") as read_file:
|
| |
content = json.load(read_file)
|
| |
|
| |
- generate_static_pages_packages(args.results, code, content, package_statistics_file)
|
| |
+ generate_static_pages_packages(args.results, package, content, destination_file)
|
| |
|
| |
log.info("Generating indexes")
|
| |
package_statistics_file = os.path.join(static_folder, "_index.adoc")
|
| |
@@ -207,7 +156,7 @@
|
| |
# prevent containers and alternative names to be included
|
| |
if code in cldr_territories_info.keys():
|
| |
package_statistics_file = os.path.join(static_territories_folder, code, "_index.adoc")
|
| |
- generate_territory_index(package_statistics_file, cldr_territories[code], code, cldr_territories_info.get(code, {}))
|
| |
+ generate_territory_index(package_statistics_file, cldr_territories[code], code, cldr_territories_info.get(code, {}), cldr_version)
|
| |
|
| |
log.info("Copy translation memories")
|
| |
languages = [
|
| |
@@ -222,172 +171,8 @@
|
| |
log.info("done")
|
| |
|
| |
|
| |
- def consolidate_language_stats(stats_file, distribution_stats):
|
| |
- """ From a CSV file, return key indicators """
|
| |
- log = logging.getLogger("buildWebsite.consolidate_language_stats")
|
| |
- results = dict()
|
| |
- total_words_distrib = distribution_stats.get("totalsourcewords", 0)
|
| |
-
|
| |
- fieldnames = {
|
| |
- "filename": "str",
|
| |
- "translatedsourcewords": "int",
|
| |
- "fuzzysourcewords": "int",
|
| |
- "untranslatedsourcewords": "int",
|
| |
- "translated": "int",
|
| |
- "fuzzy": "int",
|
| |
- "untranslated": "int",
|
| |
- "translatedtargetwords": "int",
|
| |
- "team": "str",
|
| |
- "totalsourcewords": "int",
|
| |
- }
|
| |
-
|
| |
- stats_df = pd.read_json(stats_file, orient="index")
|
| |
- stats_df.fillna(0, inplace=True)
|
| |
- stats_df.reset_index(level=0, inplace=True)
|
| |
- stats_df["totalsourcewords"] = (
|
| |
- stats_df["untranslatedsourcewords"] + stats_df["translatedsourcewords"]
|
| |
- )
|
| |
- stats_df.columns = fieldnames.keys()
|
| |
-
|
| |
- stats_df["package"] = stats_df["filename"].str.split("/", expand=True)[4]
|
| |
-
|
| |
- results["packages"] = stats_df["package"].unique().tolist()
|
| |
- results["progress"] = round(
|
| |
- stats_df["translatedsourcewords"].sum()
|
| |
- / stats_df["totalsourcewords"].sum()
|
| |
- * 100,
|
| |
- 1,
|
| |
- )
|
| |
- results["progress_d"] = round(
|
| |
- stats_df["translatedsourcewords"].sum() / total_words_distrib * 100, 1
|
| |
- )
|
| |
- results["totalsourcewords_d"] = total_words_distrib
|
| |
-
|
| |
- for kpi in ["totalsourcewords", "translatedsourcewords"]:
|
| |
- results[kpi + "sum"] = int(stats_df[kpi].sum())
|
| |
-
|
| |
- return results
|
| |
-
|
| |
-
|
| |
- def consolidate_package_stats(stats_file, log_files, package_folder):
|
| |
- """ From a CSV file, return key indicators """
|
| |
- log = logging.getLogger("buildWebsite.consolidate_package_stats")
|
| |
- results = dict()
|
| |
-
|
| |
- fieldnames = {
|
| |
- "filename": "str",
|
| |
- "translatedsourcewords": "int",
|
| |
- "fuzzysourcewords": "int",
|
| |
- "untranslatedsourcewords": "int",
|
| |
- "translated": "int",
|
| |
- "fuzzy": "int",
|
| |
- "untranslated": "int",
|
| |
- "translatedtargetwords": "int",
|
| |
- "team": "str",
|
| |
- "totalsourcewords": "int",
|
| |
- }
|
| |
-
|
| |
- _json = json.load(open(stats_file))
|
| |
- dfs = []
|
| |
- total_source_words = 0
|
| |
-
|
| |
- for template in _json.keys():
|
| |
- tmp_df = pd.DataFrame.from_dict(_json.get(template), orient="index")
|
| |
- tmp_df.fillna(0, inplace=True)
|
| |
- tmp_df.reset_index(level=0, inplace=True)
|
| |
-
|
| |
- # sometimes, no file were found, which means no stats can be used
|
| |
- if len(tmp_df) == 0:
|
| |
- log.debug(" The template {t} for {f} is empty".format(t=template, f=stats_file))
|
| |
- continue
|
| |
-
|
| |
- tmp_df["totalsourcewords"] = (
|
| |
- tmp_df["untranslatedsourcewords"] + tmp_df["translatedsourcewords"]
|
| |
- )
|
| |
- tmp_df.columns = fieldnames.keys()
|
| |
-
|
| |
- total_source_words += max(tmp_df["totalsourcewords"])
|
| |
-
|
| |
- dfs.append(tmp_df)
|
| |
-
|
| |
- if len(dfs) > 1:
|
| |
- stats_df = pd.concat(dfs)
|
| |
- elif len(dfs) == 0:
|
| |
- log.debug("There is no stats for {f}".format(f=stats_file))
|
| |
- return results
|
| |
- else:
|
| |
- stats_df = dfs[0]
|
| |
-
|
| |
- stats_df_w_lang = pd.merge(stats_df, log_files, how="inner", on="filename")
|
| |
- stats_df_no_lang = pd.merge(stats_df, log_files, how="outer", indicator=True).loc[
|
| |
- lambda x: x["_merge"] == "left_only"
|
| |
- ]
|
| |
-
|
| |
- stats_df_w_lang["filename"] = stats_df_w_lang["filename"].apply(
|
| |
- lambda s: s[len(package_folder) + 2:]
|
| |
- )
|
| |
-
|
| |
- temp_translated = (
|
| |
- stats_df_w_lang.groupby(["lang_code"])
|
| |
- .agg(
|
| |
- {
|
| |
- "translatedsourcewords": ["sum"],
|
| |
- }
|
| |
- )
|
| |
- .reset_index()
|
| |
- .droplevel(1, axis=1)
|
| |
- )
|
| |
-
|
| |
- temp_teams = stats_df_w_lang.groupby("lang_code")["team"].apply(
|
| |
- lambda x: ", ".join(x.drop_duplicates())
|
| |
- )
|
| |
- temp_files = stats_df_w_lang.groupby("lang_code")["filename"].apply(
|
| |
- lambda x: ",".join(x)
|
| |
- )
|
| |
- temp_bis = pd.merge(temp_teams, temp_files, how="inner", on="lang_code")
|
| |
- temp = pd.merge(temp_translated, temp_bis, how="inner", on="lang_code").to_dict(
|
| |
- orient="records"
|
| |
- )
|
| |
-
|
| |
- for line in temp:
|
| |
- line["progress"] = 0
|
| |
- line["translated"] = line["translatedsourcewords"]
|
| |
-
|
| |
- if total_source_words == 0:
|
| |
- log.info(
|
| |
- " File {f} for file has translatedsourcewords = 0 in line {l}".format(
|
| |
- f=stats_file, l=line
|
| |
- )
|
| |
- )
|
| |
- line["progress"] = 0
|
| |
- continue
|
| |
- try:
|
| |
- line["progress"] = round(
|
| |
- (int(line["translatedsourcewords"]) / total_source_words) * 100
|
| |
- )
|
| |
- except OverflowError:
|
| |
- log.info(
|
| |
- " File {f} has Translated={t} and Source={tot}".format(
|
| |
- f=stats_file,
|
| |
- t=line["translatedsourcewords"],
|
| |
- tot=total_source_words,
|
| |
- )
|
| |
- )
|
| |
-
|
| |
- line["filename"] = line["filename"].split(",")
|
| |
-
|
| |
- results["stats"] = list()
|
| |
- for line in sorted(temp, key=lambda k: k["progress"], reverse=True):
|
| |
- del line["translatedsourcewords"]
|
| |
- results["stats"].append(line)
|
| |
-
|
| |
- results["totalsourcewords"] = total_source_words
|
| |
- results["no_languages"] = stats_df_no_lang["filename"].tolist()
|
| |
-
|
| |
- return results
|
| |
-
|
| |
-
|
| |
def generate_static_pages_langs(results: str, code: str, content: dict, destination_file: str, territories: list[str], tm_folder: str, static_tm_folder: str) -> None:
|
| |
+ """ Aggregate info and call language template """
|
| |
log = logging.getLogger("buildWebsite.generate_static_pages_langs")
|
| |
data = content
|
| |
data["lang_name_en"] = langtable.language_name(
|
| |
@@ -413,17 +198,36 @@
|
| |
apply_jinja_template(data, destination_file, "language.adoc")
|
| |
|
| |
|
| |
def generate_static_pages_packages(release: str, package: str, statistics: dict, destination_file: str) -> None:
    """ Aggregate info and call package template

    :param release: release identifier the page belongs to
    :param package: package name
    :param statistics: package statistics (mutated in place with template fields)
    :param destination_file: path of the asciidoc file to generate
    """
    log = logging.getLogger("buildWebsite.generate_static_pages_packages")
    data = statistics
    data["results"] = release
    data["package"] = package
    data["now"] = datetime.datetime.utcnow()

    # in some rare cases, a package may have no translation progress
    # setdefault also covers a "stats" dict missing the "languages" key,
    # which previously raised a KeyError below
    data.setdefault("stats", {})
    data["stats"].setdefault("languages", {})

    languages = data["stats"]["languages"]
    if "error" in languages:
        # the "error" pseudo-language stores files not mapped to a language
        data["started_languages"] = len(languages) - 1
        data["no_languages"] = len(languages["error"]["filename"].split("./")) - 1
    else:
        data["started_languages"] = len(languages)
        data["no_languages"] = 0

    # remove local path (hoisted out of the loop: it does not depend on lang)
    path = f"./results/{release}/packages/{package}/"
    for lang_stats in languages.values():
        # NOTE(review): replaces the path with a space, not an empty string — confirm intended
        lang_stats["filename"] = lang_stats["filename"].replace(path, " ")

    apply_jinja_template(data, destination_file, "package.adoc")
|
| |
|
| |
|
| |
- def generate_release_index(release, destination_file, data):
|
| |
+ def generate_release_index(release: str, destination_file: str, data: dict) -> None:
|
| |
+ """ Aggregate info and call release index template """
|
| |
log = logging.getLogger("buildWebsite.generate_release_index")
|
| |
data["release"] = release
|
| |
data["now"] = datetime.datetime.utcnow()
|
| |
@@ -431,7 +235,8 @@
|
| |
apply_jinja_template(data, destination_file, "_index.release.adoc")
|
| |
|
| |
|
| |
- def generate_language_index(release, destination_file):
|
| |
+ def generate_language_index(release: str, destination_file: str) -> None:
|
| |
+ """ Aggregate info and call language index template """
|
| |
log = logging.getLogger("buildWebsite.generate_language_index")
|
| |
data = dict()
|
| |
data["release"] = release
|
| |
@@ -440,7 +245,8 @@
|
| |
apply_jinja_template(data, destination_file, "_index.language.adoc")
|
| |
|
| |
|
| |
- def generate_package_index(distribution, destination_file):
|
| |
+ def generate_package_index(distribution: str, destination_file: str) -> None:
|
| |
+ """ Aggregate info and call package index template """
|
| |
log = logging.getLogger("buildWebsite.generate_package_index")
|
| |
data = dict()
|
| |
data["distribution"] = distribution
|
| |
@@ -449,26 +255,34 @@
|
| |
apply_jinja_template(data, destination_file, "_index.package.adoc")
|
| |
|
| |
|
| |
def generate_territory_index(destination_file: str, name: list[str], code: str, data: dict, cldr_version: str) -> None:
    """ Aggregate info and call territory index template

    :param destination_file: path of the asciidoc file to generate
    :param name: territory name(s) from CLDR
    :param code: territory code
    :param data: territory info, mutated in place with template fields
    :param cldr_version: CLDR version string displayed on the page
    """
    # fixed copy/paste: the logger was named after generate_package_index
    log = logging.getLogger("buildWebsite.generate_territory_index")
    data["name"] = name
    data["code"] = code
    data["cldr_version"] = cldr_version

    apply_jinja_template(data, destination_file, "_index.territory.adoc")
|
| |
|
| |
|
| |
def store_json_file(content: dict, destination_file: str) -> None:
    """ Serialize content to destination_file as pretty-printed JSON """
    with open(destination_file, "w") as handle:
        json.dump(content, handle, indent=2)
|
| |
|
| |
|
| |
def apply_jinja_template(data: dict, destination_file: str, template_file: str) -> None:
    """ Call a jinja template with a data dictionary

    :param data: values made available to the template
    :param destination_file: file to write; parent folders are created
    :param template_file: template name, looked up in ./templates/
    :raises jinja2.exceptions.UndefinedError: when rendering hits a missing value
    """
    # consistency: every other function uses a named child logger,
    # this one used to log through the root logger
    log = logging.getLogger("buildWebsite.apply_jinja_template")
    os.makedirs(os.path.dirname(os.path.abspath(destination_file)), exist_ok=True)

    template_loader = jinja2.FileSystemLoader(searchpath="./templates/")
    template_env = jinja2.Environment(loader=template_loader, undefined=jinja2.Undefined)
    template = template_env.get_template(template_file)
    try:
        output_text = template.render(data)
    except jinja2.exceptions.UndefinedError as e:
        # lazy %-style args: the message is only formatted when emitted
        log.error("Error with %s: %s", destination_file, e)
        raise

    with open(destination_file, "w") as write_out:
        write_out.write(output_text)
|
| |
move all stats generation into build_stats.py to simplify code (and increase performance)