From fcc481d5402cd23e929546b917f727083ef431ae Mon Sep 17 00:00:00 2001
From: darknao
Date: Sep 12 2020 21:01:17 +0000
Subject: [PATCH 1/14] SRPMs download overhaul

Everything is done inside containers, one specific to each release.
Every SRPM available for each release is downloaded, processed, and removed.
Progress is saved in results/$release/data.json so each SRPM is processed
only once.

Signed-off-by: darknao

---

diff --git a/README.md b/README.md
index 700e263..c6a4f0a 100644
--- a/README.md
+++ b/README.md
@@ -4,32 +4,29 @@ Global statistics on translation levels of fedora products
 
 # Requirements
 
-`dnf install translate-toolkit podman`
+`dnf install podman`
 
-## Create needed folders
+## Create needed container images
+Each release needs its own image.
+
+```bash
+podman build . -f docker/Dockerfile.$release -t fedlocstats:$release
 ```
-mkdir -p ./src.rpms/f30/ ./results/f30/
-virtualenv venv
-source venv/bin/activate
-pip install -r requirements.txt
+
+```bash
+podman build . -f docker/Dockerfile.31 -t fedlocstats:31
+podman build . -f docker/Dockerfile.32 -t fedlocstats:32
+podman build . -f docker/Dockerfile.33 -t fedlocstats:33
 ```
 
 # Run the scripts
 
-## Get package list
-
-This step is for now manual, I took list of DNF packages from Koji:
-
-* For F30: https://koji.fedoraproject.org/koji/buildinfo?buildID=1252912
-* For F31: https://kojipkgs.fedoraproject.org//packages/Fedora-Workstation-Live/30/20190421.n.0/data/logs/image/x86_64/root.log
-
-## Get the rpm sources
-
-`./download-f%%-srpm-in-container.sh` where %% is the fedora version (30 or 31)
+```bash
+podman run -it --rm -v ./:/src --tmpfs /tmp:size=4G fedlocstats:$release $script
+```
 
-Downloading the file is done inside a container so we can produce stats even if
-using Fedora 29. This represents about 7 GB for Fedora 30 and takes some time.
+with `$script`, one of the following:
 
 ## Compute data
 
@@ -78,18 +75,18 @@ Data in CLDR-raw folder comes from https://github.com/unicode-org/cldr/blob/mast
 
 ## Ideas
 
 1. CLDR supplementalData.xml: https://github.com/unicode-org/cldr/blob/master/common/supplemental/supplementalData.xml
-  1. use territoryContainment to build geographic groups
-  2. use languageData to detect default script
-  3. use languageData to have basic stats about territories
-  4. use territoryInfo to have advanced stats about territories
+   1. use territoryContainment to build geographic groups
+   2. use languageData to detect default script
+   3. use languageData to have basic stats about territories
+   4. use territoryInfo to have advanced stats about territories
 2. CLDR supplementalMetadata.xml: https://github.com/unicode-org/cldr/blob/master/common/supplemental/supplementalMetadata.xml
-  1. use the replacement values harmonize content
+   1. use the replacement values to harmonize content
 3. CLDR likelySubtags.xml: https://github.com/unicode-org/cldr/blob/master/common/supplemental/likelySubtags.xml
-  1. use the replacement advanced harmonization?
+   1. use the replacement for advanced harmonization?
 4. CLDR languageInfo.xml: https://github.com/unicode-org/cldr/blob/master/common/supplemental/languageInfo.xml
-  1. can we say if language is >= 90% close to another one, we can consider we propagate translation statistics?
+   1. if a language is >= 90% close to another one, can we consider propagating its translation statistics?
 5. CLDR languageGroup.xml: https://github.com/unicode-org/cldr/blob/master/common/supplemental/languageGroup.xml
-  1. what is it?
+   1. what is it?
 automatic calculation (group by territory + spoken percentage * spoken )
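The README hunk above leaves `$release` and `$script` as placeholders. As an illustrative sketch only (the release numbers and the choice of `./build.py` as `$script` are assumptions, not part of this patch), a full run over several releases could look like:

```bash
# Hypothetical driver loop: build one image per release, then run build.py inside it.
for release in 31 32 33; do
    podman build . -f docker/Dockerfile.$release -t fedlocstats:$release
    podman run -it --rm -v ./:/src --tmpfs /tmp:size=4G \
        fedlocstats:$release ./build.py
done
```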
diff --git a/build.py b/build.py
index 8b6989d..b4b0af3 100755
--- a/build.py
+++ b/build.py
@@ -1,9 +1,8 @@
 #!/usr/bin/env python3
-""" For each packages in src.rpms folder :"""
-""" extract srpm """
-""" run the translation_finder """
-""" Then, concat csv files """
-
+# For each package in the src.rpms folder:
+# extract srpm
+# run the translation_finder
+# Then, concat csv files
 import argparse
 import glob
 import os
@@ -11,9 +10,16 @@
 import subprocess
 import tempfile
 import yaml
 
+from urllib.parse import urlparse
+import dnf
+import json
+import distro
+
+
 from shutil import copyfile, copy2
 from translation_finder import discover
 
+
 def main():
     """Handle params"""
 
@@ -21,51 +27,95 @@ def main():
         description="Computes stats for each srpm detected")
     parser.add_argument("--srpm", required=False,
                         help="Only work on one SRPM, if selected")
-    parser.add_argument("--offset", required=False, type=int,
-                        help="Provide the number of packages to ignore")
-    parser.add_argument("--release", required=True, type=int, default=31,
-                        choices=[30, 31],
-                        help="Provide the Fedora release to analyze")
     args = parser.parse_args()
 
-    srpm_folder="./src.rpms/f{v}/".format(v=args.release)
-    result_folder="./results/f{v}/".format(v=args.release)
-    tm_folder="./tm/f{v}/".format(v=args.release)
+    (distname, distrel, distid) = distro.linux_distribution()
+    result_folder = "./results/f{v}/".format(v=distrel)
+    tm_folder = "./tm/f{v}/".format(v=distrel)
+
+    if not os.path.exists(result_folder):
+        os.mkdir(result_folder)
+    if not os.path.exists(tm_folder):
+        os.mkdir(tm_folder)
+
+    processing_file = os.path.join(result_folder, "data.json")
+
+    print("Fetching SRPMs url list")
+    p = subprocess.Popen('dnf download --source --skip-broken --url "*" | grep src.rpm',
+                         stdout=subprocess.PIPE,
+                         shell=True)
+    url_list = str(p.stdout.read(), "utf-8").splitlines()
+
+    # Load processing data, if any
+    try:
+        with open(processing_file) as f:
+            data = json.load(f)
+    except:
+        data = {}
 
     pkgs = []
-    for (dirpath, dirnames, filenames) in os.walk(srpm_folder):
-        pkgs.extend(filenames)
-        break
     count = 0
+    total_urls = len(url_list)
 
-    if args.srpm:
-        print("argument srpm is provided: " + args.srpm)
-        with tempfile.TemporaryDirectory() as tmp:
-            package = [x for x in pkgs if x == args.srpm][0]
-            srpm_file = "{srpm}/{a}".format(srpm=srpm_folder, a=package)
-            extract_srpm(tmp, srpm_file, result_folder)
-            discover_translations(tmp, package, result_folder, tm_folder)
-    else:
-        with tempfile.TemporaryDirectory() as tmp:
-            if args.offset:
-                pkgs = pkgs[slice(args.offset, len(pkgs))]
+    with tempfile.TemporaryDirectory() as tmp:
+        for line in url_list:
+            count += 1
+            url = urlparse(line.strip())
+            if not url.scheme:
+                continue
+            srpm_filename = os.path.basename(url.path)
+            srpm_data = dnf.subject.Subject(srpm_filename)
+            package = srpm_data.get_nevra_possibilities(forms=1)[0]
 
-            for package in pkgs:
-                count += 1
-                print("")
-                print("{c}/{m}".format(c=count, m=len(pkgs)))
+            if args.srpm and args.srpm != package.name:
+                continue
 
-                if package.startswith("libreoffice"):
-                    print("package ignored because really slow, please use --srpm")
+            if package.name not in data or data[package.name] != srpm_filename:
+                print("{c}/{t} processing {n}".format(
+                    c=count, t=total_urls, n=package.name))
+
+                if url.scheme == "rsync":
+                    dl = subprocess.run(
+                        ['rsync', url.geturl(), '/tmp'],
+                        stdin=subprocess.PIPE,
+                        stdout=subprocess.PIPE,
+                        stderr=subprocess.STDOUT)
+                else:
+                    dl = subprocess.run(
+ ['curl', '-L', '--remote-name', url.geturl()], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + cwd='/tmp') + + if dl.returncode: + print("error downloading srpm:") + print(dl.stdout) continue - srpm_file = "{srpm}/{a}".format(srpm=srpm_folder, a=package) - extract_srpm(tmp, srpm_file, result_folder) - discover_translations(tmp, package, result_folder, tm_folder) + srpm_path = "{srpm}/{a}".format(srpm='/tmp', a=srpm_filename) + extract_srpm(tmp, srpm_path, result_folder) + discover_translations(tmp, package.name, result_folder, tm_folder) + + os.unlink(srpm_path) + + # save processed srpm name & version + data[package.name] = srpm_filename + with open(processing_file, "w") as f: + json.dump(data, f, indent=2) + print("") + else: + print("{c}/{t} skipping already processed {n}".format( + c=count, t=total_urls, n=package.name)) + + # if package.startswith("libreoffice"): + # print("package ignored because really slow, please use --srpm") + # continue subprocess.run(['./concat_csv.sh', result_folder], check=True) + def extract_srpm(tmp, name, result_folder): """extract srpm page""" print("extract_srpm: " + name) @@ -79,6 +129,7 @@ def extract_srpm(tmp, name, result_folder): out.close() error.close() + def discover_translations(tmp, name, result_folder, tm_folder): """find po file""" print("discover_translations: "+tmp) @@ -114,6 +165,7 @@ def discover_translations(tmp, name, result_folder, tm_folder): else: unknown_format(tmp, translation, name, translation["file_format"], result_folder) + def get_po_translation_level(path, mask, name, result_folder, tm_folder): filemask = mask["filemask"] print("get_po_translation_level: " + filemask) @@ -127,8 +179,8 @@ def get_po_translation_level(path, mask, name, result_folder, tm_folder): error.close() # Copy translation files in translation memory - for po in glob.glob(path +"/"+ filemask): - dest = tm_folder +"/"+ name +"/"+ filemask.split("*")[0] + for po in glob.glob(path + "/" + filemask): + dest = tm_folder + "/" + name + "/" + filemask.split("*")[0] os.makedirs(dest, exist_ok=True) copy2(po, dest) @@ -151,6 +203,7 @@ def get_ts_translation_level(path, mask, name, result_folder): subprocess.run(["sed", "-i", "-e", "s|{p}|.|g".format(p=path), result_folder + '/{p}.errors.txt'.format(p=name)], check=True) + def get_json_translation_level(path, mask, name, result_folder): filemask = mask["filemask"] print("get_json_translation_level: " + filemask) @@ -188,12 +241,13 @@ def get_json_translation_level(path, mask, name, result_folder): subprocess.run(["sed", "-i", "-e", "s|{p}|.|g".format(p=path), result_folder + '/{p}.errors.txt'.format(p=name)], check=True) + def unknown_format(path, results, srpm, tformat, result_folder): print("unknown_format:") with open(result_folder + "/todo_"+tformat+".txt", "a") as file: file.write(srpm + " " + results["filemask"] + "\n") + if __name__ == '__main__': main() - diff --git a/docker/Dockerfile.30 b/docker/Dockerfile.30 new file mode 100644 index 0000000..0626703 --- /dev/null +++ b/docker/Dockerfile.30 @@ -0,0 +1,12 @@ +FROM registry.fedoraproject.org/fedora:30 + +RUN dnf install -y lbzip2 unzip xz git cpio translate-toolkit dnf-plugins-core python3-pip rsync vim + +COPY requirements.txt /src/requirements.txt +RUN pip3 install --no-cache -r /src/requirements.txt + +# Fix missing metalink for f30 +COPY docker/fedora-updates-modular.repo /etc/yum.repos.d/fedora-updates-modular.repo + +VOLUME /src +WORKDIR /src diff --git a/docker/Dockerfile.32 b/docker/Dockerfile.32 new file mode 
100644 index 0000000..c2276ea --- /dev/null +++ b/docker/Dockerfile.32 @@ -0,0 +1,9 @@ +FROM registry.fedoraproject.org/fedora:32 + +RUN dnf install -y lbzip2 unzip xz git cpio translate-toolkit dnf-plugins-core python3-pip rsync vim + +COPY requirements.txt /src/requirements.txt +RUN pip install --no-cache -r /src/requirements.txt + +VOLUME /src +WORKDIR /src diff --git a/docker/fedora-updates-modular.repo b/docker/fedora-updates-modular.repo new file mode 100644 index 0000000..988724f --- /dev/null +++ b/docker/fedora-updates-modular.repo @@ -0,0 +1,38 @@ +[updates-modular] +name=Fedora Modular $releasever - $basearch - Updates +failovermethod=priority +#baseurl=http://download.fedoraproject.org/pub/fedora/linux/updates/$releasever/Modular/$basearch/ +metalink=https://mirrors.fedoraproject.org/metalink?repo=updates-released-modular-f$releasever&arch=$basearch +enabled=1 +repo_gpgcheck=0 +type=rpm +gpgcheck=1 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-fedora-$releasever-$basearch +skip_if_unavailable=False + +[updates-modular-debuginfo] +name=Fedora Modular $releasever - $basearch - Updates - Debug +failovermethod=priority +#baseurl=http://download.fedoraproject.org/pub/fedora/linux/updates/$releasever/Modular/$basearch/debug/ +metalink=https://mirrors.fedoraproject.org/metalink?repo=updates-released-modular-debug-f$releasever&arch=$basearch +enabled=0 +repo_gpgcheck=0 +type=rpm +gpgcheck=1 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-fedora-$releasever-$basearch +skip_if_unavailable=False + +[updates-modular-source] +name=Fedora Modular $releasever - Updates Source +failovermethod=priority +baseurl=http://download.fedoraproject.org/pub/fedora/linux/updates/$releasever/Modular/SRPMS/ +#metalink=https://mirrors.fedoraproject.org/metalink?repo=updates-released-modular-source-f$releasever&arch=$basearch +enabled=0 +repo_gpgcheck=0 +type=rpm +gpgcheck=1 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-fedora-$releasever-$basearch +skip_if_unavailable=False diff --git a/extract_srpm.sh b/extract_srpm.sh index e7910f9..3b71ba5 100755 --- a/extract_srpm.sh +++ b/extract_srpm.sh @@ -3,7 +3,7 @@ # extract every existing archives (that most probably are source code) folder=$1 -package=$(pwd)/$2 +package=$2 hop=$(pwd)/ls.txt tmp=$(mktemp -d) @@ -13,12 +13,15 @@ if [ ! 
-e "$package" ] ; then exit 1 fi -rm -rf -- "$folder"/* +# please, don't do that, ever +# rm -rf -- "$folder"/* # remove hidden files -rm -rf -- "$folder"/.* 2> /dev/null +# rm -rf -- "$folder"/.* 2> /dev/null +rm -rf $folder +mkdir -p $folder pushd "$tmp" > /dev/null - +echo "extract $package" rpm2cpio "$package" | cpio -idm --no-preserve-owner --quiet # TODO: multiple archives in one srpm sqlite-3.26.0-3.fc30.src.rpm From 077c5d0bc4bc269a08dba76a7618cdaa5bd459d1 Mon Sep 17 00:00:00 2001 From: darknao Date: Sep 21 2020 20:29:56 +0000 Subject: [PATCH 2/14] check rpm version Signed-off-by: darknao --- diff --git a/build.py b/build.py index b4b0af3..8ef5b63 100755 --- a/build.py +++ b/build.py @@ -12,6 +12,7 @@ import yaml from urllib.parse import urlparse import dnf +import rpm import json import distro @@ -70,43 +71,49 @@ def main(): if args.srpm and args.srpm != package.name: continue - if package.name not in data or data[package.name] != srpm_filename: - print("{c}/{t} processing {n}".format( - c=count, t=total_urls, n=package.name)) - - if url.scheme == "rsync": - dl = subprocess.run( - ['rsync', url.geturl(), '/tmp'], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - else: - dl = subprocess.run( - ['curl', '-L', '--remote-name', url.geturl()], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - cwd='/tmp') - - if dl.returncode: - print("error downloading srpm:") - print(dl.stdout) + if package.name in data: + # Compare version + known_package = dnf.subject.Subject(data[package.name]).get_nevra_possibilities(forms=1)[0] + if rpm.labelCompare( + (package.epoch, package.version, package.release), + (known_package.epoch, known_package.version, known_package.release)) <= 0: + print("{c}/{t} skipping already processed {n}".format( + c=count, t=total_urls, n=package.name)) continue - srpm_path = "{srpm}/{a}".format(srpm='/tmp', a=srpm_filename) - extract_srpm(tmp, srpm_path, result_folder) - discover_translations(tmp, package.name, result_folder, tm_folder) + print("{c}/{t} processing {n}".format( + c=count, t=total_urls, n=package.name)) - os.unlink(srpm_path) - - # save processed srpm name & version - data[package.name] = srpm_filename - with open(processing_file, "w") as f: - json.dump(data, f, indent=2) - print("") + if url.scheme == "rsync": + dl = subprocess.run( + ['rsync', url.geturl(), '/tmp'], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) else: - print("{c}/{t} skipping already processed {n}".format( - c=count, t=total_urls, n=package.name)) + dl = subprocess.run( + ['curl', '-L', '--remote-name', url.geturl()], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + cwd='/tmp') + + if dl.returncode: + print("error downloading srpm:") + print(dl.stdout) + continue + + srpm_path = "{srpm}/{a}".format(srpm='/tmp', a=srpm_filename) + extract_srpm(tmp, srpm_path, result_folder) + discover_translations(tmp, package.name, result_folder, tm_folder) + + os.unlink(srpm_path) + + # save processed srpm name & version + data[package.name] = srpm_filename + with open(processing_file, "w") as f: + json.dump(data, f, indent=2) + print("") # if package.startswith("libreoffice"): # print("package ignored because really slow, please use --srpm") @@ -182,7 +189,8 @@ def get_po_translation_level(path, mask, name, result_folder, tm_folder): for po in glob.glob(path + "/" + filemask): dest = tm_folder + "/" + name + "/" + filemask.split("*")[0] os.makedirs(dest, exist_ok=True) - copy2(po, 
dest) + # use copyfile instead of copy2 to handle read-only files in rpm + copyfile(po, os.path.join(dest, os.path.basename(po))) subprocess.run(["sed", "-i", "-e", "s|{p}|.|g".format(p=path), result_folder + '/{p}.errors.txt'.format(p=name)], check=True) diff --git a/build_map.py b/build_map.py index 2344dc2..39579bf 100755 --- a/build_map.py +++ b/build_map.py @@ -18,7 +18,7 @@ def main(): parser = argparse.ArgumentParser( description="From a result file, build a json file for map rendering") parser.add_argument("--release", required=True, type=int, default=31, - choices=[30, 31], + choices=[30, 31, 32], help="Provide the Fedora release to analyze") parser.add_argument("--include_english", required=False, default=False, type=bool, help="Include english language in statistics?") diff --git a/build_stats.py b/build_stats.py index dca2c01..50457b5 100755 --- a/build_stats.py +++ b/build_stats.py @@ -16,7 +16,7 @@ def main(): parser = argparse.ArgumentParser( description="Consolidate every result files and produce a clean concatenated update") parser.add_argument("--release", required=True, type=int, default=31, - choices=[30, 31], + choices=[30, 31, 32], help="Provide the Fedora release to analyze") args = parser.parse_args() diff --git a/build_tm.py b/build_tm.py index 582cc54..fd8f63b 100755 --- a/build_tm.py +++ b/build_tm.py @@ -14,7 +14,7 @@ def main(): description="Creates compendium for every languages") parser.add_argument("--release", required=True, type=int, default=31, - choices=[30, 31], + choices=[30, 31, 32], help="Provide the Fedora release to analyze") parser.add_argument("--lang", required=False, type=str, From acd38e140bc054591389e750b291b1183f2022ba Mon Sep 17 00:00:00 2001 From: darknao Date: Sep 29 2020 20:45:58 +0000 Subject: [PATCH 3/14] README: add :z flag to volume mount --- diff --git a/README.md b/README.md index c6a4f0a..2c82919 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ podman build . 
-f docker/Dockerfile.33 -t fedlocstats:33 # Run the scripts ```bash -podman run -it --rm -v ./:/src --tmpfs /tmp:size=4G fedlocstats:$release $script +podman run -it --rm -v ./:/src:z --tmpfs /tmp:size=4G fedlocstats:$release $script ``` with `$script`, one of the following: From 4293cc4fa6d7a6849c0cd9e92161b8c0af65a717 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 30 2020 16:47:16 +0000 Subject: [PATCH 4/14] create the whole directory hierarchy if missing --- diff --git a/build.py b/build.py index 8ef5b63..4fd24d2 100755 --- a/build.py +++ b/build.py @@ -35,9 +35,9 @@ def main(): tm_folder = "./tm/f{v}/".format(v=distrel) if not os.path.exists(result_folder): - os.mkdir(result_folder) + os.makedirs(result_folder) if not os.path.exists(tm_folder): - os.mkdir(tm_folder) + os.makedirs(tm_folder) processing_file = os.path.join(result_folder, "data.json") From 06ab3b8aa39e643f5aee334ce2079c54a8acac92 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 30 2020 18:54:36 +0000 Subject: [PATCH 5/14] inform about the unknown format in output --- diff --git a/build.py b/build.py index 4fd24d2..cea111d 100755 --- a/build.py +++ b/build.py @@ -251,7 +251,7 @@ def get_json_translation_level(path, mask, name, result_folder): def unknown_format(path, results, srpm, tformat, result_folder): - print("unknown_format:") + print("unknown_format: " + tformat) with open(result_folder + "/todo_"+tformat+".txt", "a") as file: file.write(srpm + " " + results["filemask"] + "\n") From 8d5d8245a2ec00b84b9f3ca8a1262eaf793f2f4f Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 01 2020 17:11:29 +0000 Subject: [PATCH 6/14] add local build of translation-finder --- diff --git a/docker/Dockerfile.32 b/docker/Dockerfile.32 index c2276ea..8b88287 100644 --- a/docker/Dockerfile.32 +++ b/docker/Dockerfile.32 @@ -4,6 +4,8 @@ RUN dnf install -y lbzip2 unzip xz git cpio translate-toolkit dnf-plugins-core p COPY requirements.txt /src/requirements.txt RUN pip install --no-cache -r /src/requirements.txt +RUN pip install --upgrade https://github.com/WeblateOrg/language-data/archive/master.zip +RUN pip install git+https://github.com/WeblateOrg/translation-finder.git@master VOLUME /src WORKDIR /src From 2bdae5a73a0a64c58e482fc09c3ba919594dd05c Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 01 2020 19:53:04 +0000 Subject: [PATCH 7/14] move python packages to requirements.txt --- diff --git a/docker/Dockerfile.32 b/docker/Dockerfile.32 index 8b88287..c2276ea 100644 --- a/docker/Dockerfile.32 +++ b/docker/Dockerfile.32 @@ -4,8 +4,6 @@ RUN dnf install -y lbzip2 unzip xz git cpio translate-toolkit dnf-plugins-core p COPY requirements.txt /src/requirements.txt RUN pip install --no-cache -r /src/requirements.txt -RUN pip install --upgrade https://github.com/WeblateOrg/language-data/archive/master.zip -RUN pip install git+https://github.com/WeblateOrg/translation-finder.git@master VOLUME /src WORKDIR /src diff --git a/requirements.txt b/requirements.txt index f15740a..3dfb040 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ -translation-finder +git+https://github.com/WeblateOrg/language-data.git@master +git+https://github.com/WeblateOrg/translation-finder.git@master pyyaml pandas geopandas From 61ceb9f1c99ccb6fc56dc134e925154dc801f2ef Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 01 2020 20:49:21 +0000 Subject: [PATCH 8/14] add measures in data.json to be able to audit results --- diff --git a/build.py b/build.py index cea111d..3a12bb3 100755 --- 
a/build.py +++ b/build.py @@ -15,6 +15,7 @@ import dnf import rpm import json import distro +import time from shutil import copyfile, copy2 @@ -40,12 +41,21 @@ def main(): os.makedirs(tm_folder) processing_file = os.path.join(result_folder, "data.json") + srpm_list_file = os.path.join(result_folder, "srpm.txt") print("Fetching SRPMs url list") - p = subprocess.Popen('dnf download --source --skip-broken --url "*" | grep src.rpm', - stdout=subprocess.PIPE, - shell=True) - url_list = str(p.stdout.read(), "utf-8").splitlines() + if not os.path.isfile(srpm_list_file): + p = subprocess.Popen('dnf download --source --skip-broken --url "*" | grep src.rpm', + stdout=subprocess.PIPE, + shell=True) + url_list = str(p.stdout.read(), "utf-8").splitlines() + url_list = map(lambda x:x+'\n', url_list) + + with open (srpm_list_file, "w") as f: + f.writelines(url_list) + + with open(srpm_list_file, "r") as f: + url_list = f.readlines() # Load processing data, if any try: @@ -73,7 +83,7 @@ def main(): if package.name in data: # Compare version - known_package = dnf.subject.Subject(data[package.name]).get_nevra_possibilities(forms=1)[0] + known_package = dnf.subject.Subject(data[package.name]["srpm"]).get_nevra_possibilities(forms=1)[0] if rpm.labelCompare( (package.epoch, package.version, package.release), (known_package.epoch, known_package.version, known_package.release)) <= 0: @@ -105,12 +115,17 @@ def main(): srpm_path = "{srpm}/{a}".format(srpm='/tmp', a=srpm_filename) extract_srpm(tmp, srpm_path, result_folder) - discover_translations(tmp, package.name, result_folder, tm_folder) + (tsearch, tcopy, results) = discover_translations(tmp, package.name, result_folder, tm_folder) os.unlink(srpm_path) # save processed srpm name & version - data[package.name] = srpm_filename + data[package.name] = { + "srpm": srpm_filename, + "tsearch": tsearch, + "tcopy": tcopy, + "results": results} + with open(processing_file, "w") as f: json.dump(data, f, indent=2) print("") @@ -141,7 +156,12 @@ def discover_translations(tmp, name, result_folder, tm_folder): """find po file""" print("discover_translations: "+tmp) translation_files = [] + tsearch = 0 + tcopy = 0 + cresults = 0 + + tsearch = time.time() # Check if there is a manual rule (like libreoffice) manual = "manual-discover/" + name + ".json" if os.path.isfile(manual): @@ -155,8 +175,12 @@ def discover_translations(tmp, name, result_folder, tm_folder): with open(result_folder + "/errors.txt", "a") as file: file.write(name + " on discover_translations\n") + tsearch = round(time.time() - tsearch, 1) + print(translation_files) + tcopy = time.time() + if translation_files: for translation in translation_files: # TODO: multiple translation files for same package gnome-clocks-3.32.0-1.fc30.src.rpm @@ -171,6 +195,10 @@ def discover_translations(tmp, name, result_folder, tm_folder): continue else: unknown_format(tmp, translation, name, translation["file_format"], result_folder) + tcopy = round(time.time() - tcopy, 1) + cresults = len(translation_files) + + return (tsearch, tcopy, cresults) def get_po_translation_level(path, mask, name, result_folder, tm_folder): From ee22baf5ae9a1d78e6a45cabb43a67911a6cbde3 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 01 2020 21:16:26 +0000 Subject: [PATCH 9/14] remove manual calculation and apply autopep --- diff --git a/build.py b/build.py index 3a12bb3..cf2bdc0 100755 --- a/build.py +++ b/build.py @@ -4,22 +4,19 @@ # run the translation_finder # Then, concat csv files import argparse +import dnf +import json import glob 
+import distro import os +import rpm import subprocess import tempfile -import yaml - -from urllib.parse import urlparse -import dnf -import rpm -import json -import distro import time - -from shutil import copyfile, copy2 +from shutil import copyfile from translation_finder import discover +from urllib.parse import urlparse def main(): @@ -45,13 +42,14 @@ def main(): print("Fetching SRPMs url list") if not os.path.isfile(srpm_list_file): - p = subprocess.Popen('dnf download --source --skip-broken --url "*" | grep src.rpm', - stdout=subprocess.PIPE, - shell=True) + p = subprocess.Popen( + 'dnf download --source --skip-broken --url "*" | grep src.rpm', + stdout=subprocess.PIPE, + shell=True) url_list = str(p.stdout.read(), "utf-8").splitlines() - url_list = map(lambda x:x+'\n', url_list) + url_list = map(lambda x: x + '\n', url_list) - with open (srpm_list_file, "w") as f: + with open(srpm_list_file, "w") as f: f.writelines(url_list) with open(srpm_list_file, "r") as f: @@ -61,7 +59,7 @@ def main(): try: with open(processing_file) as f: data = json.load(f) - except: + except BaseException: data = {} pkgs = [] @@ -83,10 +81,15 @@ def main(): if package.name in data: # Compare version - known_package = dnf.subject.Subject(data[package.name]["srpm"]).get_nevra_possibilities(forms=1)[0] + known_package = dnf.subject.Subject( + data[package.name]["srpm"]).get_nevra_possibilities(forms=1)[0] if rpm.labelCompare( - (package.epoch, package.version, package.release), - (known_package.epoch, known_package.version, known_package.release)) <= 0: + (package.epoch, + package.version, + package.release), + (known_package.epoch, + known_package.version, + known_package.release)) <= 0: print("{c}/{t} skipping already processed {n}".format( c=count, t=total_urls, n=package.name)) continue @@ -96,17 +99,17 @@ def main(): if url.scheme == "rsync": dl = subprocess.run( - ['rsync', url.geturl(), '/tmp'], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) + ['rsync', url.geturl(), '/tmp'], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) else: dl = subprocess.run( - ['curl', '-L', '--remote-name', url.geturl()], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - cwd='/tmp') + ['curl', '-L', '--remote-name', url.geturl()], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + cwd='/tmp') if dl.returncode: print("error downloading srpm:") @@ -115,7 +118,8 @@ def main(): srpm_path = "{srpm}/{a}".format(srpm='/tmp', a=srpm_filename) extract_srpm(tmp, srpm_path, result_folder) - (tsearch, tcopy, results) = discover_translations(tmp, package.name, result_folder, tm_folder) + (tsearch, tcopy, results) = discover_translations( + tmp, package.name, result_folder, tm_folder) os.unlink(srpm_path) @@ -154,47 +158,44 @@ def extract_srpm(tmp, name, result_folder): def discover_translations(tmp, name, result_folder, tm_folder): """find po file""" - print("discover_translations: "+tmp) + print("discover_translations: " + tmp) translation_files = [] tsearch = 0 tcopy = 0 cresults = 0 - tsearch = time.time() - # Check if there is a manual rule (like libreoffice) - manual = "manual-discover/" + name + ".json" - if os.path.isfile(manual): - with open(manual, 'r') as stream: - translation_files = yaml.load(stream, Loader=yaml.SafeLoader) - else: - try: - translation_files = discover(tmp) - except OSError: - print("error while searching for new") - with open(result_folder + "/errors.txt", "a") as file: - file.write(name + 
" on discover_translations\n") + try: + translation_files = discover(tmp) + except OSError: + with open(result_folder + "/errors.txt", "a") as file: + file.write(name + " on discover_translations\n") tsearch = round(time.time() - tsearch, 1) - print(translation_files) - tcopy = time.time() if translation_files: for translation in translation_files: - # TODO: multiple translation files for same package gnome-clocks-3.32.0-1.fc30.src.rpm + # TODO: multiple translation files for same package + # gnome-clocks-3.32.0-1.fc30.src.rpm if translation["file_format"] == "po": - get_po_translation_level(tmp, translation, name, result_folder, tm_folder) + get_po_translation_level( + tmp, translation, name, result_folder, tm_folder) elif translation["file_format"] == "ts": get_ts_translation_level(tmp, translation, name, result_folder) elif translation["file_format"] == "json": - get_json_translation_level(tmp, translation, name, result_folder) + get_json_translation_level( + tmp, translation, name, result_folder) elif translation["file_format"] == "auto": # it's a detection of .tx configuration continue else: - unknown_format(tmp, translation, name, translation["file_format"], result_folder) + unknown_format( + translation, + name, + translation["file_format"], + result_folder) tcopy = round(time.time() - tcopy, 1) cresults = len(translation_files) @@ -220,8 +221,12 @@ def get_po_translation_level(path, mask, name, result_folder, tm_folder): # use copyfile instead of copy2 to handle read-only files in rpm copyfile(po, os.path.join(dest, os.path.basename(po))) - subprocess.run(["sed", "-i", "-e", "s|{p}|.|g".format(p=path), - result_folder + '/{p}.errors.txt'.format(p=name)], check=True) + subprocess.run(["sed", + "-i", + "-e", + "s|{p}|.|g".format(p=path), + result_folder + '/{p}.errors.txt'.format(p=name)], + check=True) def get_ts_translation_level(path, mask, name, result_folder): @@ -236,8 +241,12 @@ def get_ts_translation_level(path, mask, name, result_folder): stats.close() error.close() - subprocess.run(["sed", "-i", "-e", "s|{p}|.|g".format(p=path), - result_folder + '/{p}.errors.txt'.format(p=name)], check=True) + subprocess.run(["sed", + "-i", + "-e", + "s|{p}|.|g".format(p=path), + result_folder + '/{p}.errors.txt'.format(p=name)], + check=True) def get_json_translation_level(path, mask, name, result_folder): @@ -249,39 +258,63 @@ def get_json_translation_level(path, mask, name, result_folder): # move only related json files to a temporary folder with tempfile.TemporaryDirectory() as tmpjson: - for filename in glob.iglob(path+"/"+filemask): + for filename in glob.iglob(path + "/" + filemask): # if filesare in language subfolder, reproduce the hierarchy - dest = os.path.join(*(os.path.dirname(filename).split(os.path.sep)[3:])) + dest = os.path.join( + *(os.path.dirname(filename).split(os.path.sep)[3:])) os.makedirs(tmpjson + "/" + dest, exist_ok=True) - copyfile(filename, tmpjson + "/" + dest + "/" + os.path.basename(filename)) + copyfile( + filename, + tmpjson + + "/" + + dest + + "/" + + os.path.basename(filename)) # convert json files to po files with tempfile.TemporaryDirectory() as tmppo: # use existing template, in not existing (probably a bug), try "en" - template_file = tmpjson+"/"+mask.get("template", filemask.replace("*", "en")) + template_file = tmpjson + "/" + \ + mask.get("template", filemask.replace("*", "en")) if os.path.isfile(template_file): - subprocess.run(["json2po", "-t", template_file, tmpjson, tmppo, "--progress=none"], - stderr=error, check=True, cwd=tmppo) + 
subprocess.run(["json2po", + "-t", + template_file, + tmpjson, + tmppo, + "--progress=none"], + stderr=error, + check=True, + cwd=tmppo) # compute stats - subprocess.run(["pocount", filemask.split("*")[0], "--csv"], - stdout=stats, stderr=error, check=True, cwd=tmppo) + subprocess.run(["pocount", + filemask.split("*")[0], + "--csv"], + stdout=stats, + stderr=error, + check=True, + cwd=tmppo) else: print(" template doesn't exist, is it a translation-finder bug?") stats.close() error.close() - subprocess.run(["sed", "-i", "-e", "s|{p}|.|g".format(p=path), - result_folder + '/{p}.errors.txt'.format(p=name)], check=True) + subprocess.run(["sed", + "-i", + "-e", + "s|{p}|.|g".format(p=path), + result_folder + '/{p}.errors.txt'.format(p=name)], + check=True) -def unknown_format(path, results, srpm, tformat, result_folder): +def unknown_format(results, srpm, tformat, result_folder): print("unknown_format: " + tformat) - with open(result_folder + "/todo_"+tformat+".txt", "a") as file: + with open(result_folder + "/todo_" + tformat + ".txt", "a") as file: file.write(srpm + " " + results["filemask"] + "\n") diff --git a/manual-discover/libreoffice-6.2.2.2-4.fc30.src.rpm.json b/manual-discover/libreoffice-6.2.2.2-4.fc30.src.rpm.json deleted file mode 100644 index 9a2cbd2..0000000 --- a/manual-discover/libreoffice-6.2.2.2-4.fc30.src.rpm.json +++ /dev/null @@ -1,6 +0,0 @@ -[ - { - "filemask":"translations/source/*/accessibility/messages.po", - "file_format":"po" - } -] \ No newline at end of file From a48051bf95ae90ab39de71d181e1e254a72c2c47 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 01 2020 21:33:38 +0000 Subject: [PATCH 10/14] add a simple json to csv converter --- diff --git a/convertCSV.py b/convertCSV.py new file mode 100755 index 0000000..dfa8fc8 --- /dev/null +++ b/convertCSV.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 + +import argparse +import csv +import json + +def main(): + """Handle params""" + + parser = argparse.ArgumentParser( + description="Convert a data.json into csv") + parser.add_argument("--json", required=True, + help="Json file to convert") + args = parser.parse_args() + + with open(args.json) as f: + data = json.load(f) + + with open('out.csv', mode='w') as csv_file: + csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) + csv_writer.writerow(["package", "srpm", "tsearch", "tcopy", "results"]) + + for d in data: + csv_writer.writerow([d, data[d]["srpm"], data[d]["tsearch"], data[d]["tcopy"], data[d]["results"]]) + +if __name__ == '__main__': + main() From 3cdfccf3741a761e500f0067c940086b62c6debe Mon Sep 17 00:00:00 2001 From: darknao Date: Oct 01 2020 21:54:13 +0000 Subject: [PATCH 11/14] Cache srpm urls list for 24h, and add few options: -k, --keep : Keep SRPMs in /srpms volume to avoid redownloading them -f, --force : Force the processing of all packages filter : Filter packages list by name using regex, like "libreoffice", or "gnome.*" --- diff --git a/.gitignore b/.gitignore index 91f4d69..d80c98e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ venv/ rpms/ -src.rpms/ \ No newline at end of file +src.rpms/ +results/ +srpms_*.lst +tm/ diff --git a/README.md b/README.md index 2c82919..611b123 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ podman build . 
-f docker/Dockerfile.33 -t fedlocstats:33 # Run the scripts ```bash -podman run -it --rm -v ./:/src:z --tmpfs /tmp:size=4G fedlocstats:$release $script +podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:$release $script ``` with `$script`, one of the following: diff --git a/build.py b/build.py index cf2bdc0..b36893a 100755 --- a/build.py +++ b/build.py @@ -13,6 +13,8 @@ import rpm import subprocess import tempfile import time +import datetime as dt +import re from shutil import copyfile from translation_finder import discover @@ -24,13 +26,24 @@ def main(): parser = argparse.ArgumentParser( description="Computes stats for each srpm detected") - parser.add_argument("--srpm", required=False, - help="Only work on one SRPM, if selected") + parser.add_argument("filter", default=None, nargs='?', + help="package name filter (regex)") + parser.add_argument("-k", "--keep-srpms", default=False, + action='store_true', dest='keep', + help="Keep SRPMs in /srpms") + parser.add_argument("-f", "--force", default=False, + action='store_true', dest='force', + help="Ignore past progression state") args = parser.parse_args() + srpm_regex = None + if args.filter: + srpm_regex = re.compile("^{}$".format(args.filter)) + (distname, distrel, distid) = distro.linux_distribution() result_folder = "./results/f{v}/".format(v=distrel) tm_folder = "./tm/f{v}/".format(v=distrel) + srpms_path = "/srpms" if not os.path.exists(result_folder): os.makedirs(result_folder) @@ -39,21 +52,25 @@ def main(): processing_file = os.path.join(result_folder, "data.json") srpm_list_file = os.path.join(result_folder, "srpm.txt") - - print("Fetching SRPMs url list") - if not os.path.isfile(srpm_list_file): - p = subprocess.Popen( - 'dnf download --source --skip-broken --url "*" | grep src.rpm', - stdout=subprocess.PIPE, - shell=True) - url_list = str(p.stdout.read(), "utf-8").splitlines() - url_list = map(lambda x: x + '\n', url_list) - - with open(srpm_list_file, "w") as f: - f.writelines(url_list) - - with open(srpm_list_file, "r") as f: - url_list = f.readlines() + url_list = None + + if os.path.isfile(srpm_list_file): + list_file_stats = os.stat(srpm_list_file) + last_mod = dt.datetime.fromtimestamp(list_file_stats.st_mtime) + if dt.datetime.now() - last_mod < dt.timedelta(hours=24): + with open(srpm_list_file) as f: + url_list = f.readlines() + + if not url_list: + print("Fetching SRPMs url list") + p = subprocess.Popen('dnf download --source --skip-broken --url "*" | grep src.rpm', + stdout=subprocess.PIPE, + shell=True) + + urls = str(p.stdout.read(), "utf-8") + with open(srpm_list_file, 'w') as f: + f.write(urls) + url_list = urls.splitlines() # Load processing data, if any try: @@ -62,7 +79,6 @@ def main(): except BaseException: data = {} - pkgs = [] count = 0 total_urls = len(url_list) @@ -76,10 +92,10 @@ def main(): srpm_data = dnf.subject.Subject(srpm_filename) package = srpm_data.get_nevra_possibilities(forms=1)[0] - if args.srpm and args.srpm != package.name: + if srpm_regex and not srpm_regex.match(package.name): continue - if package.name in data: + if package.name in data and not args.force: # Compare version known_package = dnf.subject.Subject( data[package.name]["srpm"]).get_nevra_possibilities(forms=1)[0] @@ -97,31 +113,34 @@ def main(): print("{c}/{t} processing {n}".format( c=count, t=total_urls, n=package.name)) - if url.scheme == "rsync": - dl = subprocess.run( - ['rsync', url.geturl(), '/tmp'], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - else: 
- dl = subprocess.run( - ['curl', '-L', '--remote-name', url.geturl()], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - cwd='/tmp') - - if dl.returncode: - print("error downloading srpm:") - print(dl.stdout) - continue + srpm_path = os.path.join(srpms_path, srpm_filename) + if not os.path.isfile(srpm_path): + print("downloading {}".format(srpm_filename)) + if url.scheme == "rsync": + dl = subprocess.run( + ['rsync', url.geturl(), srpms_path], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + else: + dl = subprocess.run( + ['curl', '-L', '--remote-name', url.geturl()], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + cwd=srpms_path) + + if dl.returncode: + print("error downloading srpm:") + print(dl.stdout) + continue - srpm_path = "{srpm}/{a}".format(srpm='/tmp', a=srpm_filename) extract_srpm(tmp, srpm_path, result_folder) (tsearch, tcopy, results) = discover_translations( tmp, package.name, result_folder, tm_folder) - os.unlink(srpm_path) + if not args.keep: + os.unlink(srpm_path) # save processed srpm name & version data[package.name] = { From 4bec9db8bd48a180d4500fd81897a7fd0e455e39 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 06 2020 19:23:25 +0000 Subject: [PATCH 12/14] add prefix to tmp dirs --- diff --git a/build.py b/build.py index cf2bdc0..6a9a005 100755 --- a/build.py +++ b/build.py @@ -66,7 +66,7 @@ def main(): count = 0 total_urls = len(url_list) - with tempfile.TemporaryDirectory() as tmp: + with tempfile.TemporaryDirectory(prefix="l10n-stats") as tmp: for line in url_list: count += 1 url = urlparse(line.strip()) @@ -257,7 +257,7 @@ def get_json_translation_level(path, mask, name, result_folder): error = open(result_folder + '/{p}.errors.txt'.format(p=name), 'a') # move only related json files to a temporary folder - with tempfile.TemporaryDirectory() as tmpjson: + with tempfile.TemporaryDirectory(prefix="l10n-stats") as tmpjson: for filename in glob.iglob(path + "/" + filemask): # if filesare in language subfolder, reproduce the hierarchy dest = os.path.join( @@ -273,7 +273,7 @@ def get_json_translation_level(path, mask, name, result_folder): os.path.basename(filename)) # convert json files to po files - with tempfile.TemporaryDirectory() as tmppo: + with tempfile.TemporaryDirectory(prefix="l10n-stats") as tmppo: # use existing template, in not existing (probably a bug), try "en" template_file = tmpjson + "/" + \ mask.get("template", filemask.replace("*", "en")) From 188e5b9abfa6624e6079969660b3ba7789d31bc0 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 06 2020 19:23:47 +0000 Subject: [PATCH 13/14] Merge branch 'wholedistribution' of ssh://pagure.io/fedora-localization-statistics into wholedistribution --- diff --git a/.gitignore b/.gitignore index 91f4d69..d80c98e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ venv/ rpms/ -src.rpms/ \ No newline at end of file +src.rpms/ +results/ +srpms_*.lst +tm/ diff --git a/README.md b/README.md index 2c82919..611b123 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ podman build . 
-f docker/Dockerfile.33 -t fedlocstats:33 # Run the scripts ```bash -podman run -it --rm -v ./:/src:z --tmpfs /tmp:size=4G fedlocstats:$release $script +podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:$release $script ``` with `$script`, one of the following: diff --git a/build.py b/build.py index 6a9a005..03114d1 100755 --- a/build.py +++ b/build.py @@ -13,6 +13,8 @@ import rpm import subprocess import tempfile import time +import datetime as dt +import re from shutil import copyfile from translation_finder import discover @@ -24,13 +26,24 @@ def main(): parser = argparse.ArgumentParser( description="Computes stats for each srpm detected") - parser.add_argument("--srpm", required=False, - help="Only work on one SRPM, if selected") + parser.add_argument("filter", default=None, nargs='?', + help="package name filter (regex)") + parser.add_argument("-k", "--keep-srpms", default=False, + action='store_true', dest='keep', + help="Keep SRPMs in /srpms") + parser.add_argument("-f", "--force", default=False, + action='store_true', dest='force', + help="Ignore past progression state") args = parser.parse_args() + srpm_regex = None + if args.filter: + srpm_regex = re.compile("^{}$".format(args.filter)) + (distname, distrel, distid) = distro.linux_distribution() result_folder = "./results/f{v}/".format(v=distrel) tm_folder = "./tm/f{v}/".format(v=distrel) + srpms_path = "/srpms" if not os.path.exists(result_folder): os.makedirs(result_folder) @@ -39,21 +52,25 @@ def main(): processing_file = os.path.join(result_folder, "data.json") srpm_list_file = os.path.join(result_folder, "srpm.txt") - - print("Fetching SRPMs url list") - if not os.path.isfile(srpm_list_file): - p = subprocess.Popen( - 'dnf download --source --skip-broken --url "*" | grep src.rpm', - stdout=subprocess.PIPE, - shell=True) - url_list = str(p.stdout.read(), "utf-8").splitlines() - url_list = map(lambda x: x + '\n', url_list) - - with open(srpm_list_file, "w") as f: - f.writelines(url_list) - - with open(srpm_list_file, "r") as f: - url_list = f.readlines() + url_list = None + + if os.path.isfile(srpm_list_file): + list_file_stats = os.stat(srpm_list_file) + last_mod = dt.datetime.fromtimestamp(list_file_stats.st_mtime) + if dt.datetime.now() - last_mod < dt.timedelta(hours=24): + with open(srpm_list_file) as f: + url_list = f.readlines() + + if not url_list: + print("Fetching SRPMs url list") + p = subprocess.Popen('dnf download --source --skip-broken --url "*" | grep src.rpm', + stdout=subprocess.PIPE, + shell=True) + + urls = str(p.stdout.read(), "utf-8") + with open(srpm_list_file, 'w') as f: + f.write(urls) + url_list = urls.splitlines() # Load processing data, if any try: @@ -62,7 +79,6 @@ def main(): except BaseException: data = {} - pkgs = [] count = 0 total_urls = len(url_list) @@ -76,10 +92,10 @@ def main(): srpm_data = dnf.subject.Subject(srpm_filename) package = srpm_data.get_nevra_possibilities(forms=1)[0] - if args.srpm and args.srpm != package.name: + if srpm_regex and not srpm_regex.match(package.name): continue - if package.name in data: + if package.name in data and not args.force: # Compare version known_package = dnf.subject.Subject( data[package.name]["srpm"]).get_nevra_possibilities(forms=1)[0] @@ -97,31 +113,34 @@ def main(): print("{c}/{t} processing {n}".format( c=count, t=total_urls, n=package.name)) - if url.scheme == "rsync": - dl = subprocess.run( - ['rsync', url.geturl(), '/tmp'], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - else: 
- dl = subprocess.run( - ['curl', '-L', '--remote-name', url.geturl()], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - cwd='/tmp') - - if dl.returncode: - print("error downloading srpm:") - print(dl.stdout) - continue + srpm_path = os.path.join(srpms_path, srpm_filename) + if not os.path.isfile(srpm_path): + print("downloading {}".format(srpm_filename)) + if url.scheme == "rsync": + dl = subprocess.run( + ['rsync', url.geturl(), srpms_path], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + else: + dl = subprocess.run( + ['curl', '-L', '--remote-name', url.geturl()], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + cwd=srpms_path) + + if dl.returncode: + print("error downloading srpm:") + print(dl.stdout) + continue - srpm_path = "{srpm}/{a}".format(srpm='/tmp', a=srpm_filename) extract_srpm(tmp, srpm_path, result_folder) (tsearch, tcopy, results) = discover_translations( tmp, package.name, result_folder, tm_folder) - os.unlink(srpm_path) + if not args.keep: + os.unlink(srpm_path) # save processed srpm name & version data[package.name] = { From a1d379180be6545d22ad2991c240f3e20eb33d75 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 06 2020 21:13:26 +0000 Subject: [PATCH 14/14] save result per file type this helps to detect what really is important (po files for now) taking translation-finder from git also requires to make multiple pip steps, including manual requirement installation... --- diff --git a/.gitignore b/.gitignore index d80c98e..0695f40 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ venv/ rpms/ -src.rpms/ +srpms/ results/ srpms_*.lst tm/ diff --git a/build.py b/build.py index 03114d1..9d2ffeb 100755 --- a/build.py +++ b/build.py @@ -216,7 +216,10 @@ def discover_translations(tmp, name, result_folder, tm_folder): translation["file_format"], result_folder) tcopy = round(time.time() - tcopy, 1) - cresults = len(translation_files) + + cresults = dict() + for file in translation_files: + cresults[file["file_format"]] = cresults.get(file["file_format"], 0) + 1 return (tsearch, tcopy, cresults) diff --git a/convertCSV.py b/convertCSV.py index dfa8fc8..9d1d5d3 100755 --- a/convertCSV.py +++ b/convertCSV.py @@ -4,6 +4,27 @@ import argparse import csv import json +EXTENSION_MAP = ( + (".po", "po"), + ("strings.xml", "aresource"), + (".ini", "joomla"), + (".csv", "csv"), + (".json", "json-nested"), + (".dtd", "dtd"), + (".php", "php"), + (".xlf", "xliff"), + (".xliff", "xliff"), + (".ts", "ts"), + (".resx", "resx"), + (".resw", "resx"), + (".xlsx", "xlsx"), + (".yml", "yaml"), + (".yaml", "yaml"), + (".properties", "properties"), + (".strings", "strings"), +) + + def main(): """Handle params""" @@ -18,10 +39,20 @@ def main(): with open('out.csv', mode='w') as csv_file: csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) - csv_writer.writerow(["package", "srpm", "tsearch", "tcopy", "results"]) + + row = ["package", "srpm", "tsearch", "tcopy"] + for end, result in EXTENSION_MAP: + row.append(end) + + csv_writer.writerow(row) + for d in data: - csv_writer.writerow([d, data[d]["srpm"], data[d]["tsearch"], data[d]["tcopy"], data[d]["results"]]) + row = [d, data[d]["srpm"], data[d]["tsearch"], data[d]["tcopy"]] + for end, result in EXTENSION_MAP: + row.append(data[d]["results"].get(result,0)) + + csv_writer.writerow(row) if __name__ == '__main__': main() diff --git a/docker/Dockerfile.32 b/docker/Dockerfile.32 index c2276ea..b6f5622 
100644
--- a/docker/Dockerfile.32
+++ b/docker/Dockerfile.32
@@ -4,6 +4,9 @@ RUN dnf install -y lbzip2 unzip xz git cpio translate-toolkit dnf-plugins-core p
 COPY requirements.txt /src/requirements.txt
 RUN pip install --no-cache -r /src/requirements.txt
+RUN pip install --upgrade https://github.com/WeblateOrg/language-data/archive/master.zip
+RUN pip install charamel
+RUN pip install git+https://github.com/WeblateOrg/translation-finder.git
 
 VOLUME /src
 WORKDIR /src
diff --git a/requirements.txt b/requirements.txt
index 3dfb040..edde0f9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,3 @@
-git+https://github.com/WeblateOrg/language-data.git@master
-git+https://github.com/WeblateOrg/translation-finder.git@master
 pyyaml
 pandas
 geopandas
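With the full series applied, each release ends up with per-package timing measures and per-format counts in results/f*/data.json. A minimal end-to-end usage sketch, assuming a Fedora 32 run and the paths used throughout these patches (the exact invocation is an assumption, not prescribed by the series):

```bash
# Build the Fedora 32 image and run the whole pipeline in it,
# keeping downloaded SRPMs in ./srpms so reruns skip the download.
podman build . -f docker/Dockerfile.32 -t fedlocstats:32
mkdir -p srpms
podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G \
    fedlocstats:32 ./build.py --keep-srpms
# Flatten the collected measures into out.csv, one row per package.
./convertCSV.py --json results/f32/data.json
```

Per the last patch, convertCSV.py emits one column per known file format (from EXTENSION_MAP), which makes it easy to see that po files dominate the results.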