From fcc481d5402cd23e929546b917f727083ef431ae Mon Sep 17 00:00:00 2001
From: darknao
Date: Sep 12 2020 21:01:17 +0000
Subject: [PATCH 1/14] SRPMs download overhaul

Everything is done inside containers, one specific to each release.
Every SRPM available for each release is downloaded, processed, and removed.
Progress is saved in results/$release/data.json so each SRPM is processed
only once.

Signed-off-by: darknao

---

diff --git a/README.md b/README.md
index 700e263..c6a4f0a 100644
--- a/README.md
+++ b/README.md
@@ -4,32 +4,29 @@ Global statistics on translation levels of fedora products
 
 # Requirements
 
-`dnf install translate-toolkit podman`
+`dnf install podman`
 
-## Create needed folders
+## Create needed container images
+Each release needs its own image.
+
+```bash
+podman build . -f docker/Dockerfile.$release -t fedlocstats:$release
 ```
-mkdir -p ./src.rpms/f30/ ./results/f30/
-virtualenv venv
-source venv/bin/activate
-pip install -r requirements.txt
+
+```bash
+podman build . -f docker/Dockerfile.31 -t fedlocstats:31
+podman build . -f docker/Dockerfile.32 -t fedlocstats:32
+podman build . -f docker/Dockerfile.33 -t fedlocstats:33
 ```
 
 # Run the scripts
 
-## Get package list
-
-This step is for now manual, I took list of DNF packages from Koji:
-
-* For F30: https://koji.fedoraproject.org/koji/buildinfo?buildID=1252912
-* For F31: https://kojipkgs.fedoraproject.org//packages/Fedora-Workstation-Live/30/20190421.n.0/data/logs/image/x86_64/root.log
-
-## Get the rpm sources
-
-`./download-f%%-srpm-in-container.sh` where %% is the fedora version (30 or 31)
+```bash
+podman run -it --rm -v ./:/src --tmpfs /tmp:size=4G fedlocstats:$release $script
+```
 
-Downloading the file is done inside a container so we can produce stats even if
-using Fedora 29. This represents about 7 GB for Fedora 30 and takes some time.
+with `$script`, one of the following:
 
 ## Compute data
 
@@ -78,18 +75,18 @@ Data in CLDR-raw folder comes from https://github.com/unicode-org/cldr/blob/mast
 
 ## Ideas
 
 1. CLDR supplementalData.xml: https://github.com/unicode-org/cldr/blob/master/common/supplemental/supplementalData.xml
-  1. use territoryContainment to build geographic groups
-  2. use languageData to detect default script
-  3. use languageData to have basic stats about territories
-  4. use territoryInfo to have advanced stats about territories
+   1. use territoryContainment to build geographic groups
+   2. use languageData to detect default script
+   3. use languageData to have basic stats about territories
+   4. use territoryInfo to have advanced stats about territories
 2. CLDR supplementalMetadata.xml: https://github.com/unicode-org/cldr/blob/master/common/supplemental/supplementalMetadata.xml
-  1. use the replacement values harmonize content
+   1. use the replacement values to harmonize content
 3. CLDR likelySubtags.xml: https://github.com/unicode-org/cldr/blob/master/common/supplemental/likelySubtags.xml
-  1. use the replacement advanced harmonization?
+   1. use the replacement for advanced harmonization?
 4. CLDR languageInfo.xml: https://github.com/unicode-org/cldr/blob/master/common/supplemental/languageInfo.xml
-  1. can we say if language is >= 90% close to another one, we can consider we propagate translation statistics?
+   1. if a language is >= 90% close to another one, can we consider propagating its translation statistics?
 5. CLDR languageGroup.xml: https://github.com/unicode-org/cldr/blob/master/common/supplemental/languageGroup.xml
-  1. what is it?
+   1. what is it?
 automatic calculation (group by territory + spoken percentage * spoken )
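The README hunk above leaves `$release` and `$script` as placeholders. As an illustrative sketch only (the release numbers and the choice of `./build.py` as `$script` are assumptions, not part of this patch), a full run over several releases could look like:

```bash
# Hypothetical driver loop: build one image per release, then run build.py inside it.
for release in 31 32 33; do
    podman build . -f docker/Dockerfile.$release -t fedlocstats:$release
    podman run -it --rm -v ./:/src --tmpfs /tmp:size=4G \
        fedlocstats:$release ./build.py
done
```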
diff --git a/build.py b/build.py
index 8b6989d..b4b0af3 100755
--- a/build.py
+++ b/build.py
@@ -1,9 +1,8 @@
 #!/usr/bin/env python3
-""" For each packages in src.rpms folder :"""
-""" extract srpm """
-""" run the translation_finder """
-""" Then, concat csv files """
-
+# For each package in the src.rpms folder:
+# extract srpm
+# run the translation_finder
+# Then, concat csv files
 import argparse
 import glob
 import os
@@ -11,9 +10,16 @@
 import subprocess
 import tempfile
 import yaml
 
+from urllib.parse import urlparse
+import dnf
+import json
+import distro
+
+
 from shutil import copyfile, copy2
 from translation_finder import discover
 
+
 def main():
     """Handle params"""
 
@@ -21,51 +27,95 @@ def main():
         description="Computes stats for each srpm detected")
     parser.add_argument("--srpm", required=False,
                         help="Only work on one SRPM, if selected")
-    parser.add_argument("--offset", required=False, type=int,
-                        help="Provide the number of packages to ignore")
-    parser.add_argument("--release", required=True, type=int, default=31,
-                        choices=[30, 31],
-                        help="Provide the Fedora release to analyze")
     args = parser.parse_args()
 
-    srpm_folder="./src.rpms/f{v}/".format(v=args.release)
-    result_folder="./results/f{v}/".format(v=args.release)
-    tm_folder="./tm/f{v}/".format(v=args.release)
+    (distname, distrel, distid) = distro.linux_distribution()
+    result_folder = "./results/f{v}/".format(v=distrel)
+    tm_folder = "./tm/f{v}/".format(v=distrel)
+
+    if not os.path.exists(result_folder):
+        os.mkdir(result_folder)
+    if not os.path.exists(tm_folder):
+        os.mkdir(tm_folder)
+
+    processing_file = os.path.join(result_folder, "data.json")
+
+    print("Fetching SRPMs url list")
+    p = subprocess.Popen('dnf download --source --skip-broken --url "*" | grep src.rpm',
+                         stdout=subprocess.PIPE,
+                         shell=True)
+    url_list = str(p.stdout.read(), "utf-8").splitlines()
+
+    # Load processing data, if any
+    try:
+        with open(processing_file) as f:
+            data = json.load(f)
+    except:
+        data = {}
 
     pkgs = []
-    for (dirpath, dirnames, filenames) in os.walk(srpm_folder):
-        pkgs.extend(filenames)
-        break
     count = 0
+    total_urls = len(url_list)
 
-    if args.srpm:
-        print("argument srpm is provided: " + args.srpm)
-        with tempfile.TemporaryDirectory() as tmp:
-            package = [x for x in pkgs if x == args.srpm][0]
-            srpm_file = "{srpm}/{a}".format(srpm=srpm_folder, a=package)
-            extract_srpm(tmp, srpm_file, result_folder)
-            discover_translations(tmp, package, result_folder, tm_folder)
-    else:
-        with tempfile.TemporaryDirectory() as tmp:
-            if args.offset:
-                pkgs = pkgs[slice(args.offset, len(pkgs))]
+    with tempfile.TemporaryDirectory() as tmp:
+        for line in url_list:
+            count += 1
+            url = urlparse(line.strip())
+            if not url.scheme:
+                continue
+            srpm_filename = os.path.basename(url.path)
+            srpm_data = dnf.subject.Subject(srpm_filename)
+            package = srpm_data.get_nevra_possibilities(forms=1)[0]
 
-            for package in pkgs:
-                count += 1
-                print("")
-                print("{c}/{m}".format(c=count, m=len(pkgs)))
+            if args.srpm and args.srpm != package.name:
+                continue
 
-                if package.startswith("libreoffice"):
-                    print("package ignored because really slow, please use --srpm")
+            if package.name not in data or data[package.name] != srpm_filename:
+                print("{c}/{t} processing {n}".format(
+                    c=count, t=total_urls, n=package.name))
+
+                if url.scheme == "rsync":
+                    dl = subprocess.run(
+                        ['rsync', url.geturl(), '/tmp'],
+                        stdin=subprocess.PIPE,
+                        stdout=subprocess.PIPE,
+                        stderr=subprocess.STDOUT)
+                else:
+                    dl = subprocess.run(
+ ['curl', '-L', '--remote-name', url.geturl()], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + cwd='/tmp') + + if dl.returncode: + print("error downloading srpm:") + print(dl.stdout) continue - srpm_file = "{srpm}/{a}".format(srpm=srpm_folder, a=package) - extract_srpm(tmp, srpm_file, result_folder) - discover_translations(tmp, package, result_folder, tm_folder) + srpm_path = "{srpm}/{a}".format(srpm='/tmp', a=srpm_filename) + extract_srpm(tmp, srpm_path, result_folder) + discover_translations(tmp, package.name, result_folder, tm_folder) + + os.unlink(srpm_path) + + # save processed srpm name & version + data[package.name] = srpm_filename + with open(processing_file, "w") as f: + json.dump(data, f, indent=2) + print("") + else: + print("{c}/{t} skipping already processed {n}".format( + c=count, t=total_urls, n=package.name)) + + # if package.startswith("libreoffice"): + # print("package ignored because really slow, please use --srpm") + # continue subprocess.run(['./concat_csv.sh', result_folder], check=True) + def extract_srpm(tmp, name, result_folder): """extract srpm page""" print("extract_srpm: " + name) @@ -79,6 +129,7 @@ def extract_srpm(tmp, name, result_folder): out.close() error.close() + def discover_translations(tmp, name, result_folder, tm_folder): """find po file""" print("discover_translations: "+tmp) @@ -114,6 +165,7 @@ def discover_translations(tmp, name, result_folder, tm_folder): else: unknown_format(tmp, translation, name, translation["file_format"], result_folder) + def get_po_translation_level(path, mask, name, result_folder, tm_folder): filemask = mask["filemask"] print("get_po_translation_level: " + filemask) @@ -127,8 +179,8 @@ def get_po_translation_level(path, mask, name, result_folder, tm_folder): error.close() # Copy translation files in translation memory - for po in glob.glob(path +"/"+ filemask): - dest = tm_folder +"/"+ name +"/"+ filemask.split("*")[0] + for po in glob.glob(path + "/" + filemask): + dest = tm_folder + "/" + name + "/" + filemask.split("*")[0] os.makedirs(dest, exist_ok=True) copy2(po, dest) @@ -151,6 +203,7 @@ def get_ts_translation_level(path, mask, name, result_folder): subprocess.run(["sed", "-i", "-e", "s|{p}|.|g".format(p=path), result_folder + '/{p}.errors.txt'.format(p=name)], check=True) + def get_json_translation_level(path, mask, name, result_folder): filemask = mask["filemask"] print("get_json_translation_level: " + filemask) @@ -188,12 +241,13 @@ def get_json_translation_level(path, mask, name, result_folder): subprocess.run(["sed", "-i", "-e", "s|{p}|.|g".format(p=path), result_folder + '/{p}.errors.txt'.format(p=name)], check=True) + def unknown_format(path, results, srpm, tformat, result_folder): print("unknown_format:") with open(result_folder + "/todo_"+tformat+".txt", "a") as file: file.write(srpm + " " + results["filemask"] + "\n") + if __name__ == '__main__': main() - diff --git a/docker/Dockerfile.30 b/docker/Dockerfile.30 new file mode 100644 index 0000000..0626703 --- /dev/null +++ b/docker/Dockerfile.30 @@ -0,0 +1,12 @@ +FROM registry.fedoraproject.org/fedora:30 + +RUN dnf install -y lbzip2 unzip xz git cpio translate-toolkit dnf-plugins-core python3-pip rsync vim + +COPY requirements.txt /src/requirements.txt +RUN pip3 install --no-cache -r /src/requirements.txt + +# Fix missing metalink for f30 +COPY docker/fedora-updates-modular.repo /etc/yum.repos.d/fedora-updates-modular.repo + +VOLUME /src +WORKDIR /src diff --git a/docker/Dockerfile.32 b/docker/Dockerfile.32 new file mode 
100644 index 0000000..c2276ea --- /dev/null +++ b/docker/Dockerfile.32 @@ -0,0 +1,9 @@ +FROM registry.fedoraproject.org/fedora:32 + +RUN dnf install -y lbzip2 unzip xz git cpio translate-toolkit dnf-plugins-core python3-pip rsync vim + +COPY requirements.txt /src/requirements.txt +RUN pip install --no-cache -r /src/requirements.txt + +VOLUME /src +WORKDIR /src diff --git a/docker/fedora-updates-modular.repo b/docker/fedora-updates-modular.repo new file mode 100644 index 0000000..988724f --- /dev/null +++ b/docker/fedora-updates-modular.repo @@ -0,0 +1,38 @@ +[updates-modular] +name=Fedora Modular $releasever - $basearch - Updates +failovermethod=priority +#baseurl=http://download.fedoraproject.org/pub/fedora/linux/updates/$releasever/Modular/$basearch/ +metalink=https://mirrors.fedoraproject.org/metalink?repo=updates-released-modular-f$releasever&arch=$basearch +enabled=1 +repo_gpgcheck=0 +type=rpm +gpgcheck=1 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-fedora-$releasever-$basearch +skip_if_unavailable=False + +[updates-modular-debuginfo] +name=Fedora Modular $releasever - $basearch - Updates - Debug +failovermethod=priority +#baseurl=http://download.fedoraproject.org/pub/fedora/linux/updates/$releasever/Modular/$basearch/debug/ +metalink=https://mirrors.fedoraproject.org/metalink?repo=updates-released-modular-debug-f$releasever&arch=$basearch +enabled=0 +repo_gpgcheck=0 +type=rpm +gpgcheck=1 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-fedora-$releasever-$basearch +skip_if_unavailable=False + +[updates-modular-source] +name=Fedora Modular $releasever - Updates Source +failovermethod=priority +baseurl=http://download.fedoraproject.org/pub/fedora/linux/updates/$releasever/Modular/SRPMS/ +#metalink=https://mirrors.fedoraproject.org/metalink?repo=updates-released-modular-source-f$releasever&arch=$basearch +enabled=0 +repo_gpgcheck=0 +type=rpm +gpgcheck=1 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-fedora-$releasever-$basearch +skip_if_unavailable=False diff --git a/extract_srpm.sh b/extract_srpm.sh index e7910f9..3b71ba5 100755 --- a/extract_srpm.sh +++ b/extract_srpm.sh @@ -3,7 +3,7 @@ # extract every existing archives (that most probably are source code) folder=$1 -package=$(pwd)/$2 +package=$2 hop=$(pwd)/ls.txt tmp=$(mktemp -d) @@ -13,12 +13,15 @@ if [ ! 
-e "$package" ] ; then exit 1 fi -rm -rf -- "$folder"/* +# please, don't do that, ever +# rm -rf -- "$folder"/* # remove hidden files -rm -rf -- "$folder"/.* 2> /dev/null +# rm -rf -- "$folder"/.* 2> /dev/null +rm -rf $folder +mkdir -p $folder pushd "$tmp" > /dev/null - +echo "extract $package" rpm2cpio "$package" | cpio -idm --no-preserve-owner --quiet # TODO: multiple archives in one srpm sqlite-3.26.0-3.fc30.src.rpm From 077c5d0bc4bc269a08dba76a7618cdaa5bd459d1 Mon Sep 17 00:00:00 2001 From: darknao Date: Sep 21 2020 20:29:56 +0000 Subject: [PATCH 2/14] check rpm version Signed-off-by: darknao --- diff --git a/build.py b/build.py index b4b0af3..8ef5b63 100755 --- a/build.py +++ b/build.py @@ -12,6 +12,7 @@ import yaml from urllib.parse import urlparse import dnf +import rpm import json import distro @@ -70,43 +71,49 @@ def main(): if args.srpm and args.srpm != package.name: continue - if package.name not in data or data[package.name] != srpm_filename: - print("{c}/{t} processing {n}".format( - c=count, t=total_urls, n=package.name)) - - if url.scheme == "rsync": - dl = subprocess.run( - ['rsync', url.geturl(), '/tmp'], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - else: - dl = subprocess.run( - ['curl', '-L', '--remote-name', url.geturl()], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - cwd='/tmp') - - if dl.returncode: - print("error downloading srpm:") - print(dl.stdout) + if package.name in data: + # Compare version + known_package = dnf.subject.Subject(data[package.name]).get_nevra_possibilities(forms=1)[0] + if rpm.labelCompare( + (package.epoch, package.version, package.release), + (known_package.epoch, known_package.version, known_package.release)) <= 0: + print("{c}/{t} skipping already processed {n}".format( + c=count, t=total_urls, n=package.name)) continue - srpm_path = "{srpm}/{a}".format(srpm='/tmp', a=srpm_filename) - extract_srpm(tmp, srpm_path, result_folder) - discover_translations(tmp, package.name, result_folder, tm_folder) + print("{c}/{t} processing {n}".format( + c=count, t=total_urls, n=package.name)) - os.unlink(srpm_path) - - # save processed srpm name & version - data[package.name] = srpm_filename - with open(processing_file, "w") as f: - json.dump(data, f, indent=2) - print("") + if url.scheme == "rsync": + dl = subprocess.run( + ['rsync', url.geturl(), '/tmp'], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) else: - print("{c}/{t} skipping already processed {n}".format( - c=count, t=total_urls, n=package.name)) + dl = subprocess.run( + ['curl', '-L', '--remote-name', url.geturl()], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + cwd='/tmp') + + if dl.returncode: + print("error downloading srpm:") + print(dl.stdout) + continue + + srpm_path = "{srpm}/{a}".format(srpm='/tmp', a=srpm_filename) + extract_srpm(tmp, srpm_path, result_folder) + discover_translations(tmp, package.name, result_folder, tm_folder) + + os.unlink(srpm_path) + + # save processed srpm name & version + data[package.name] = srpm_filename + with open(processing_file, "w") as f: + json.dump(data, f, indent=2) + print("") # if package.startswith("libreoffice"): # print("package ignored because really slow, please use --srpm") @@ -182,7 +189,8 @@ def get_po_translation_level(path, mask, name, result_folder, tm_folder): for po in glob.glob(path + "/" + filemask): dest = tm_folder + "/" + name + "/" + filemask.split("*")[0] os.makedirs(dest, exist_ok=True) - copy2(po, 
dest) + # use copyfile instead of copy2 to handle read-only files in rpm + copyfile(po, os.path.join(dest, os.path.basename(po))) subprocess.run(["sed", "-i", "-e", "s|{p}|.|g".format(p=path), result_folder + '/{p}.errors.txt'.format(p=name)], check=True) diff --git a/build_map.py b/build_map.py index 2344dc2..39579bf 100755 --- a/build_map.py +++ b/build_map.py @@ -18,7 +18,7 @@ def main(): parser = argparse.ArgumentParser( description="From a result file, build a json file for map rendering") parser.add_argument("--release", required=True, type=int, default=31, - choices=[30, 31], + choices=[30, 31, 32], help="Provide the Fedora release to analyze") parser.add_argument("--include_english", required=False, default=False, type=bool, help="Include english language in statistics?") diff --git a/build_stats.py b/build_stats.py index dca2c01..50457b5 100755 --- a/build_stats.py +++ b/build_stats.py @@ -16,7 +16,7 @@ def main(): parser = argparse.ArgumentParser( description="Consolidate every result files and produce a clean concatenated update") parser.add_argument("--release", required=True, type=int, default=31, - choices=[30, 31], + choices=[30, 31, 32], help="Provide the Fedora release to analyze") args = parser.parse_args() diff --git a/build_tm.py b/build_tm.py index 582cc54..fd8f63b 100755 --- a/build_tm.py +++ b/build_tm.py @@ -14,7 +14,7 @@ def main(): description="Creates compendium for every languages") parser.add_argument("--release", required=True, type=int, default=31, - choices=[30, 31], + choices=[30, 31, 32], help="Provide the Fedora release to analyze") parser.add_argument("--lang", required=False, type=str, From acd38e140bc054591389e750b291b1183f2022ba Mon Sep 17 00:00:00 2001 From: darknao Date: Sep 29 2020 20:45:58 +0000 Subject: [PATCH 3/14] README: add :z flag to volume mount --- diff --git a/README.md b/README.md index c6a4f0a..2c82919 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ podman build . 
-f docker/Dockerfile.33 -t fedlocstats:33 # Run the scripts ```bash -podman run -it --rm -v ./:/src --tmpfs /tmp:size=4G fedlocstats:$release $script +podman run -it --rm -v ./:/src:z --tmpfs /tmp:size=4G fedlocstats:$release $script ``` with `$script`, one of the following: From 4293cc4fa6d7a6849c0cd9e92161b8c0af65a717 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 30 2020 16:47:16 +0000 Subject: [PATCH 4/14] create the whole directory hierarchy if missing --- diff --git a/build.py b/build.py index 8ef5b63..4fd24d2 100755 --- a/build.py +++ b/build.py @@ -35,9 +35,9 @@ def main(): tm_folder = "./tm/f{v}/".format(v=distrel) if not os.path.exists(result_folder): - os.mkdir(result_folder) + os.makedirs(result_folder) if not os.path.exists(tm_folder): - os.mkdir(tm_folder) + os.makedirs(tm_folder) processing_file = os.path.join(result_folder, "data.json") From 06ab3b8aa39e643f5aee334ce2079c54a8acac92 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 30 2020 18:54:36 +0000 Subject: [PATCH 5/14] inform about the unknown format in output --- diff --git a/build.py b/build.py index 4fd24d2..cea111d 100755 --- a/build.py +++ b/build.py @@ -251,7 +251,7 @@ def get_json_translation_level(path, mask, name, result_folder): def unknown_format(path, results, srpm, tformat, result_folder): - print("unknown_format:") + print("unknown_format: " + tformat) with open(result_folder + "/todo_"+tformat+".txt", "a") as file: file.write(srpm + " " + results["filemask"] + "\n") From 8d5d8245a2ec00b84b9f3ca8a1262eaf793f2f4f Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 01 2020 17:11:29 +0000 Subject: [PATCH 6/14] add local build of translation-finder --- diff --git a/docker/Dockerfile.32 b/docker/Dockerfile.32 index c2276ea..8b88287 100644 --- a/docker/Dockerfile.32 +++ b/docker/Dockerfile.32 @@ -4,6 +4,8 @@ RUN dnf install -y lbzip2 unzip xz git cpio translate-toolkit dnf-plugins-core p COPY requirements.txt /src/requirements.txt RUN pip install --no-cache -r /src/requirements.txt +RUN pip install --upgrade https://github.com/WeblateOrg/language-data/archive/master.zip +RUN pip install git+https://github.com/WeblateOrg/translation-finder.git@master VOLUME /src WORKDIR /src From 2bdae5a73a0a64c58e482fc09c3ba919594dd05c Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 01 2020 19:53:04 +0000 Subject: [PATCH 7/14] move python packages to requirements.txt --- diff --git a/docker/Dockerfile.32 b/docker/Dockerfile.32 index 8b88287..c2276ea 100644 --- a/docker/Dockerfile.32 +++ b/docker/Dockerfile.32 @@ -4,8 +4,6 @@ RUN dnf install -y lbzip2 unzip xz git cpio translate-toolkit dnf-plugins-core p COPY requirements.txt /src/requirements.txt RUN pip install --no-cache -r /src/requirements.txt -RUN pip install --upgrade https://github.com/WeblateOrg/language-data/archive/master.zip -RUN pip install git+https://github.com/WeblateOrg/translation-finder.git@master VOLUME /src WORKDIR /src diff --git a/requirements.txt b/requirements.txt index f15740a..3dfb040 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ -translation-finder +git+https://github.com/WeblateOrg/language-data.git@master +git+https://github.com/WeblateOrg/translation-finder.git@master pyyaml pandas geopandas From 61ceb9f1c99ccb6fc56dc134e925154dc801f2ef Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 01 2020 20:49:21 +0000 Subject: [PATCH 8/14] add measures in data.json to be able to audit results --- diff --git a/build.py b/build.py index cea111d..3a12bb3 100755 --- 
a/build.py +++ b/build.py @@ -15,6 +15,7 @@ import dnf import rpm import json import distro +import time from shutil import copyfile, copy2 @@ -40,12 +41,21 @@ def main(): os.makedirs(tm_folder) processing_file = os.path.join(result_folder, "data.json") + srpm_list_file = os.path.join(result_folder, "srpm.txt") print("Fetching SRPMs url list") - p = subprocess.Popen('dnf download --source --skip-broken --url "*" | grep src.rpm', - stdout=subprocess.PIPE, - shell=True) - url_list = str(p.stdout.read(), "utf-8").splitlines() + if not os.path.isfile(srpm_list_file): + p = subprocess.Popen('dnf download --source --skip-broken --url "*" | grep src.rpm', + stdout=subprocess.PIPE, + shell=True) + url_list = str(p.stdout.read(), "utf-8").splitlines() + url_list = map(lambda x:x+'\n', url_list) + + with open (srpm_list_file, "w") as f: + f.writelines(url_list) + + with open(srpm_list_file, "r") as f: + url_list = f.readlines() # Load processing data, if any try: @@ -73,7 +83,7 @@ def main(): if package.name in data: # Compare version - known_package = dnf.subject.Subject(data[package.name]).get_nevra_possibilities(forms=1)[0] + known_package = dnf.subject.Subject(data[package.name]["srpm"]).get_nevra_possibilities(forms=1)[0] if rpm.labelCompare( (package.epoch, package.version, package.release), (known_package.epoch, known_package.version, known_package.release)) <= 0: @@ -105,12 +115,17 @@ def main(): srpm_path = "{srpm}/{a}".format(srpm='/tmp', a=srpm_filename) extract_srpm(tmp, srpm_path, result_folder) - discover_translations(tmp, package.name, result_folder, tm_folder) + (tsearch, tcopy, results) = discover_translations(tmp, package.name, result_folder, tm_folder) os.unlink(srpm_path) # save processed srpm name & version - data[package.name] = srpm_filename + data[package.name] = { + "srpm": srpm_filename, + "tsearch": tsearch, + "tcopy": tcopy, + "results": results} + with open(processing_file, "w") as f: json.dump(data, f, indent=2) print("") @@ -141,7 +156,12 @@ def discover_translations(tmp, name, result_folder, tm_folder): """find po file""" print("discover_translations: "+tmp) translation_files = [] + tsearch = 0 + tcopy = 0 + cresults = 0 + + tsearch = time.time() # Check if there is a manual rule (like libreoffice) manual = "manual-discover/" + name + ".json" if os.path.isfile(manual): @@ -155,8 +175,12 @@ def discover_translations(tmp, name, result_folder, tm_folder): with open(result_folder + "/errors.txt", "a") as file: file.write(name + " on discover_translations\n") + tsearch = round(time.time() - tsearch, 1) + print(translation_files) + tcopy = time.time() + if translation_files: for translation in translation_files: # TODO: multiple translation files for same package gnome-clocks-3.32.0-1.fc30.src.rpm @@ -171,6 +195,10 @@ def discover_translations(tmp, name, result_folder, tm_folder): continue else: unknown_format(tmp, translation, name, translation["file_format"], result_folder) + tcopy = round(time.time() - tcopy, 1) + cresults = len(translation_files) + + return (tsearch, tcopy, cresults) def get_po_translation_level(path, mask, name, result_folder, tm_folder): From ee22baf5ae9a1d78e6a45cabb43a67911a6cbde3 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 01 2020 21:16:26 +0000 Subject: [PATCH 9/14] remove manual calculation and apply autopep --- diff --git a/build.py b/build.py index 3a12bb3..cf2bdc0 100755 --- a/build.py +++ b/build.py @@ -4,22 +4,19 @@ # run the translation_finder # Then, concat csv files import argparse +import dnf +import json import glob 
+import distro import os +import rpm import subprocess import tempfile -import yaml - -from urllib.parse import urlparse -import dnf -import rpm -import json -import distro import time - -from shutil import copyfile, copy2 +from shutil import copyfile from translation_finder import discover +from urllib.parse import urlparse def main(): @@ -45,13 +42,14 @@ def main(): print("Fetching SRPMs url list") if not os.path.isfile(srpm_list_file): - p = subprocess.Popen('dnf download --source --skip-broken --url "*" | grep src.rpm', - stdout=subprocess.PIPE, - shell=True) + p = subprocess.Popen( + 'dnf download --source --skip-broken --url "*" | grep src.rpm', + stdout=subprocess.PIPE, + shell=True) url_list = str(p.stdout.read(), "utf-8").splitlines() - url_list = map(lambda x:x+'\n', url_list) + url_list = map(lambda x: x + '\n', url_list) - with open (srpm_list_file, "w") as f: + with open(srpm_list_file, "w") as f: f.writelines(url_list) with open(srpm_list_file, "r") as f: @@ -61,7 +59,7 @@ def main(): try: with open(processing_file) as f: data = json.load(f) - except: + except BaseException: data = {} pkgs = [] @@ -83,10 +81,15 @@ def main(): if package.name in data: # Compare version - known_package = dnf.subject.Subject(data[package.name]["srpm"]).get_nevra_possibilities(forms=1)[0] + known_package = dnf.subject.Subject( + data[package.name]["srpm"]).get_nevra_possibilities(forms=1)[0] if rpm.labelCompare( - (package.epoch, package.version, package.release), - (known_package.epoch, known_package.version, known_package.release)) <= 0: + (package.epoch, + package.version, + package.release), + (known_package.epoch, + known_package.version, + known_package.release)) <= 0: print("{c}/{t} skipping already processed {n}".format( c=count, t=total_urls, n=package.name)) continue @@ -96,17 +99,17 @@ def main(): if url.scheme == "rsync": dl = subprocess.run( - ['rsync', url.geturl(), '/tmp'], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) + ['rsync', url.geturl(), '/tmp'], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) else: dl = subprocess.run( - ['curl', '-L', '--remote-name', url.geturl()], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - cwd='/tmp') + ['curl', '-L', '--remote-name', url.geturl()], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + cwd='/tmp') if dl.returncode: print("error downloading srpm:") @@ -115,7 +118,8 @@ def main(): srpm_path = "{srpm}/{a}".format(srpm='/tmp', a=srpm_filename) extract_srpm(tmp, srpm_path, result_folder) - (tsearch, tcopy, results) = discover_translations(tmp, package.name, result_folder, tm_folder) + (tsearch, tcopy, results) = discover_translations( + tmp, package.name, result_folder, tm_folder) os.unlink(srpm_path) @@ -154,47 +158,44 @@ def extract_srpm(tmp, name, result_folder): def discover_translations(tmp, name, result_folder, tm_folder): """find po file""" - print("discover_translations: "+tmp) + print("discover_translations: " + tmp) translation_files = [] tsearch = 0 tcopy = 0 cresults = 0 - tsearch = time.time() - # Check if there is a manual rule (like libreoffice) - manual = "manual-discover/" + name + ".json" - if os.path.isfile(manual): - with open(manual, 'r') as stream: - translation_files = yaml.load(stream, Loader=yaml.SafeLoader) - else: - try: - translation_files = discover(tmp) - except OSError: - print("error while searching for new") - with open(result_folder + "/errors.txt", "a") as file: - file.write(name + 
" on discover_translations\n") + try: + translation_files = discover(tmp) + except OSError: + with open(result_folder + "/errors.txt", "a") as file: + file.write(name + " on discover_translations\n") tsearch = round(time.time() - tsearch, 1) - print(translation_files) - tcopy = time.time() if translation_files: for translation in translation_files: - # TODO: multiple translation files for same package gnome-clocks-3.32.0-1.fc30.src.rpm + # TODO: multiple translation files for same package + # gnome-clocks-3.32.0-1.fc30.src.rpm if translation["file_format"] == "po": - get_po_translation_level(tmp, translation, name, result_folder, tm_folder) + get_po_translation_level( + tmp, translation, name, result_folder, tm_folder) elif translation["file_format"] == "ts": get_ts_translation_level(tmp, translation, name, result_folder) elif translation["file_format"] == "json": - get_json_translation_level(tmp, translation, name, result_folder) + get_json_translation_level( + tmp, translation, name, result_folder) elif translation["file_format"] == "auto": # it's a detection of .tx configuration continue else: - unknown_format(tmp, translation, name, translation["file_format"], result_folder) + unknown_format( + translation, + name, + translation["file_format"], + result_folder) tcopy = round(time.time() - tcopy, 1) cresults = len(translation_files) @@ -220,8 +221,12 @@ def get_po_translation_level(path, mask, name, result_folder, tm_folder): # use copyfile instead of copy2 to handle read-only files in rpm copyfile(po, os.path.join(dest, os.path.basename(po))) - subprocess.run(["sed", "-i", "-e", "s|{p}|.|g".format(p=path), - result_folder + '/{p}.errors.txt'.format(p=name)], check=True) + subprocess.run(["sed", + "-i", + "-e", + "s|{p}|.|g".format(p=path), + result_folder + '/{p}.errors.txt'.format(p=name)], + check=True) def get_ts_translation_level(path, mask, name, result_folder): @@ -236,8 +241,12 @@ def get_ts_translation_level(path, mask, name, result_folder): stats.close() error.close() - subprocess.run(["sed", "-i", "-e", "s|{p}|.|g".format(p=path), - result_folder + '/{p}.errors.txt'.format(p=name)], check=True) + subprocess.run(["sed", + "-i", + "-e", + "s|{p}|.|g".format(p=path), + result_folder + '/{p}.errors.txt'.format(p=name)], + check=True) def get_json_translation_level(path, mask, name, result_folder): @@ -249,39 +258,63 @@ def get_json_translation_level(path, mask, name, result_folder): # move only related json files to a temporary folder with tempfile.TemporaryDirectory() as tmpjson: - for filename in glob.iglob(path+"/"+filemask): + for filename in glob.iglob(path + "/" + filemask): # if filesare in language subfolder, reproduce the hierarchy - dest = os.path.join(*(os.path.dirname(filename).split(os.path.sep)[3:])) + dest = os.path.join( + *(os.path.dirname(filename).split(os.path.sep)[3:])) os.makedirs(tmpjson + "/" + dest, exist_ok=True) - copyfile(filename, tmpjson + "/" + dest + "/" + os.path.basename(filename)) + copyfile( + filename, + tmpjson + + "/" + + dest + + "/" + + os.path.basename(filename)) # convert json files to po files with tempfile.TemporaryDirectory() as tmppo: # use existing template, in not existing (probably a bug), try "en" - template_file = tmpjson+"/"+mask.get("template", filemask.replace("*", "en")) + template_file = tmpjson + "/" + \ + mask.get("template", filemask.replace("*", "en")) if os.path.isfile(template_file): - subprocess.run(["json2po", "-t", template_file, tmpjson, tmppo, "--progress=none"], - stderr=error, check=True, cwd=tmppo) + 
subprocess.run(["json2po", + "-t", + template_file, + tmpjson, + tmppo, + "--progress=none"], + stderr=error, + check=True, + cwd=tmppo) # compute stats - subprocess.run(["pocount", filemask.split("*")[0], "--csv"], - stdout=stats, stderr=error, check=True, cwd=tmppo) + subprocess.run(["pocount", + filemask.split("*")[0], + "--csv"], + stdout=stats, + stderr=error, + check=True, + cwd=tmppo) else: print(" template doesn't exist, is it a translation-finder bug?") stats.close() error.close() - subprocess.run(["sed", "-i", "-e", "s|{p}|.|g".format(p=path), - result_folder + '/{p}.errors.txt'.format(p=name)], check=True) + subprocess.run(["sed", + "-i", + "-e", + "s|{p}|.|g".format(p=path), + result_folder + '/{p}.errors.txt'.format(p=name)], + check=True) -def unknown_format(path, results, srpm, tformat, result_folder): +def unknown_format(results, srpm, tformat, result_folder): print("unknown_format: " + tformat) - with open(result_folder + "/todo_"+tformat+".txt", "a") as file: + with open(result_folder + "/todo_" + tformat + ".txt", "a") as file: file.write(srpm + " " + results["filemask"] + "\n") diff --git a/manual-discover/libreoffice-6.2.2.2-4.fc30.src.rpm.json b/manual-discover/libreoffice-6.2.2.2-4.fc30.src.rpm.json deleted file mode 100644 index 9a2cbd2..0000000 --- a/manual-discover/libreoffice-6.2.2.2-4.fc30.src.rpm.json +++ /dev/null @@ -1,6 +0,0 @@ -[ - { - "filemask":"translations/source/*/accessibility/messages.po", - "file_format":"po" - } -] \ No newline at end of file From a48051bf95ae90ab39de71d181e1e254a72c2c47 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 01 2020 21:33:38 +0000 Subject: [PATCH 10/14] add a simple json to csv converter --- diff --git a/convertCSV.py b/convertCSV.py new file mode 100755 index 0000000..dfa8fc8 --- /dev/null +++ b/convertCSV.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 + +import argparse +import csv +import json + +def main(): + """Handle params""" + + parser = argparse.ArgumentParser( + description="Convert a data.json into csv") + parser.add_argument("--json", required=True, + help="Json file to convert") + args = parser.parse_args() + + with open(args.json) as f: + data = json.load(f) + + with open('out.csv', mode='w') as csv_file: + csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) + csv_writer.writerow(["package", "srpm", "tsearch", "tcopy", "results"]) + + for d in data: + csv_writer.writerow([d, data[d]["srpm"], data[d]["tsearch"], data[d]["tcopy"], data[d]["results"]]) + +if __name__ == '__main__': + main() From 3cdfccf3741a761e500f0067c940086b62c6debe Mon Sep 17 00:00:00 2001 From: darknao Date: Oct 01 2020 21:54:13 +0000 Subject: [PATCH 11/14] Cache srpm urls list for 24h, and add few options: -k, --keep : Keep SRPMs in /srpms volume to avoid redownloading them -f, --force : Force the processing of all packages filter : Filter packages list by name using regex, like "libreoffice", or "gnome.*" --- diff --git a/.gitignore b/.gitignore index 91f4d69..d80c98e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ venv/ rpms/ -src.rpms/ \ No newline at end of file +src.rpms/ +results/ +srpms_*.lst +tm/ diff --git a/README.md b/README.md index 2c82919..611b123 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ podman build . 
-f docker/Dockerfile.33 -t fedlocstats:33 # Run the scripts ```bash -podman run -it --rm -v ./:/src:z --tmpfs /tmp:size=4G fedlocstats:$release $script +podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:$release $script ``` with `$script`, one of the following: diff --git a/build.py b/build.py index cf2bdc0..b36893a 100755 --- a/build.py +++ b/build.py @@ -13,6 +13,8 @@ import rpm import subprocess import tempfile import time +import datetime as dt +import re from shutil import copyfile from translation_finder import discover @@ -24,13 +26,24 @@ def main(): parser = argparse.ArgumentParser( description="Computes stats for each srpm detected") - parser.add_argument("--srpm", required=False, - help="Only work on one SRPM, if selected") + parser.add_argument("filter", default=None, nargs='?', + help="package name filter (regex)") + parser.add_argument("-k", "--keep-srpms", default=False, + action='store_true', dest='keep', + help="Keep SRPMs in /srpms") + parser.add_argument("-f", "--force", default=False, + action='store_true', dest='force', + help="Ignore past progression state") args = parser.parse_args() + srpm_regex = None + if args.filter: + srpm_regex = re.compile("^{}$".format(args.filter)) + (distname, distrel, distid) = distro.linux_distribution() result_folder = "./results/f{v}/".format(v=distrel) tm_folder = "./tm/f{v}/".format(v=distrel) + srpms_path = "/srpms" if not os.path.exists(result_folder): os.makedirs(result_folder) @@ -39,21 +52,25 @@ def main(): processing_file = os.path.join(result_folder, "data.json") srpm_list_file = os.path.join(result_folder, "srpm.txt") - - print("Fetching SRPMs url list") - if not os.path.isfile(srpm_list_file): - p = subprocess.Popen( - 'dnf download --source --skip-broken --url "*" | grep src.rpm', - stdout=subprocess.PIPE, - shell=True) - url_list = str(p.stdout.read(), "utf-8").splitlines() - url_list = map(lambda x: x + '\n', url_list) - - with open(srpm_list_file, "w") as f: - f.writelines(url_list) - - with open(srpm_list_file, "r") as f: - url_list = f.readlines() + url_list = None + + if os.path.isfile(srpm_list_file): + list_file_stats = os.stat(srpm_list_file) + last_mod = dt.datetime.fromtimestamp(list_file_stats.st_mtime) + if dt.datetime.now() - last_mod < dt.timedelta(hours=24): + with open(srpm_list_file) as f: + url_list = f.readlines() + + if not url_list: + print("Fetching SRPMs url list") + p = subprocess.Popen('dnf download --source --skip-broken --url "*" | grep src.rpm', + stdout=subprocess.PIPE, + shell=True) + + urls = str(p.stdout.read(), "utf-8") + with open(srpm_list_file, 'w') as f: + f.write(urls) + url_list = urls.splitlines() # Load processing data, if any try: @@ -62,7 +79,6 @@ def main(): except BaseException: data = {} - pkgs = [] count = 0 total_urls = len(url_list) @@ -76,10 +92,10 @@ def main(): srpm_data = dnf.subject.Subject(srpm_filename) package = srpm_data.get_nevra_possibilities(forms=1)[0] - if args.srpm and args.srpm != package.name: + if srpm_regex and not srpm_regex.match(package.name): continue - if package.name in data: + if package.name in data and not args.force: # Compare version known_package = dnf.subject.Subject( data[package.name]["srpm"]).get_nevra_possibilities(forms=1)[0] @@ -97,31 +113,34 @@ def main(): print("{c}/{t} processing {n}".format( c=count, t=total_urls, n=package.name)) - if url.scheme == "rsync": - dl = subprocess.run( - ['rsync', url.geturl(), '/tmp'], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - else: 
- dl = subprocess.run( - ['curl', '-L', '--remote-name', url.geturl()], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - cwd='/tmp') - - if dl.returncode: - print("error downloading srpm:") - print(dl.stdout) - continue + srpm_path = os.path.join(srpms_path, srpm_filename) + if not os.path.isfile(srpm_path): + print("downloading {}".format(srpm_filename)) + if url.scheme == "rsync": + dl = subprocess.run( + ['rsync', url.geturl(), srpms_path], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + else: + dl = subprocess.run( + ['curl', '-L', '--remote-name', url.geturl()], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + cwd=srpms_path) + + if dl.returncode: + print("error downloading srpm:") + print(dl.stdout) + continue - srpm_path = "{srpm}/{a}".format(srpm='/tmp', a=srpm_filename) extract_srpm(tmp, srpm_path, result_folder) (tsearch, tcopy, results) = discover_translations( tmp, package.name, result_folder, tm_folder) - os.unlink(srpm_path) + if not args.keep: + os.unlink(srpm_path) # save processed srpm name & version data[package.name] = { From 4bec9db8bd48a180d4500fd81897a7fd0e455e39 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 06 2020 19:23:25 +0000 Subject: [PATCH 12/14] add prefix to tmp dirs --- diff --git a/build.py b/build.py index cf2bdc0..6a9a005 100755 --- a/build.py +++ b/build.py @@ -66,7 +66,7 @@ def main(): count = 0 total_urls = len(url_list) - with tempfile.TemporaryDirectory() as tmp: + with tempfile.TemporaryDirectory(prefix="l10n-stats") as tmp: for line in url_list: count += 1 url = urlparse(line.strip()) @@ -257,7 +257,7 @@ def get_json_translation_level(path, mask, name, result_folder): error = open(result_folder + '/{p}.errors.txt'.format(p=name), 'a') # move only related json files to a temporary folder - with tempfile.TemporaryDirectory() as tmpjson: + with tempfile.TemporaryDirectory(prefix="l10n-stats") as tmpjson: for filename in glob.iglob(path + "/" + filemask): # if filesare in language subfolder, reproduce the hierarchy dest = os.path.join( @@ -273,7 +273,7 @@ def get_json_translation_level(path, mask, name, result_folder): os.path.basename(filename)) # convert json files to po files - with tempfile.TemporaryDirectory() as tmppo: + with tempfile.TemporaryDirectory(prefix="l10n-stats") as tmppo: # use existing template, in not existing (probably a bug), try "en" template_file = tmpjson + "/" + \ mask.get("template", filemask.replace("*", "en")) From 188e5b9abfa6624e6079969660b3ba7789d31bc0 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 06 2020 19:23:47 +0000 Subject: [PATCH 13/14] Merge branch 'wholedistribution' of ssh://pagure.io/fedora-localization-statistics into wholedistribution --- diff --git a/.gitignore b/.gitignore index 91f4d69..d80c98e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ venv/ rpms/ -src.rpms/ \ No newline at end of file +src.rpms/ +results/ +srpms_*.lst +tm/ diff --git a/README.md b/README.md index 2c82919..611b123 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ podman build . 
-f docker/Dockerfile.33 -t fedlocstats:33 # Run the scripts ```bash -podman run -it --rm -v ./:/src:z --tmpfs /tmp:size=4G fedlocstats:$release $script +podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:$release $script ``` with `$script`, one of the following: diff --git a/build.py b/build.py index 6a9a005..03114d1 100755 --- a/build.py +++ b/build.py @@ -13,6 +13,8 @@ import rpm import subprocess import tempfile import time +import datetime as dt +import re from shutil import copyfile from translation_finder import discover @@ -24,13 +26,24 @@ def main(): parser = argparse.ArgumentParser( description="Computes stats for each srpm detected") - parser.add_argument("--srpm", required=False, - help="Only work on one SRPM, if selected") + parser.add_argument("filter", default=None, nargs='?', + help="package name filter (regex)") + parser.add_argument("-k", "--keep-srpms", default=False, + action='store_true', dest='keep', + help="Keep SRPMs in /srpms") + parser.add_argument("-f", "--force", default=False, + action='store_true', dest='force', + help="Ignore past progression state") args = parser.parse_args() + srpm_regex = None + if args.filter: + srpm_regex = re.compile("^{}$".format(args.filter)) + (distname, distrel, distid) = distro.linux_distribution() result_folder = "./results/f{v}/".format(v=distrel) tm_folder = "./tm/f{v}/".format(v=distrel) + srpms_path = "/srpms" if not os.path.exists(result_folder): os.makedirs(result_folder) @@ -39,21 +52,25 @@ def main(): processing_file = os.path.join(result_folder, "data.json") srpm_list_file = os.path.join(result_folder, "srpm.txt") - - print("Fetching SRPMs url list") - if not os.path.isfile(srpm_list_file): - p = subprocess.Popen( - 'dnf download --source --skip-broken --url "*" | grep src.rpm', - stdout=subprocess.PIPE, - shell=True) - url_list = str(p.stdout.read(), "utf-8").splitlines() - url_list = map(lambda x: x + '\n', url_list) - - with open(srpm_list_file, "w") as f: - f.writelines(url_list) - - with open(srpm_list_file, "r") as f: - url_list = f.readlines() + url_list = None + + if os.path.isfile(srpm_list_file): + list_file_stats = os.stat(srpm_list_file) + last_mod = dt.datetime.fromtimestamp(list_file_stats.st_mtime) + if dt.datetime.now() - last_mod < dt.timedelta(hours=24): + with open(srpm_list_file) as f: + url_list = f.readlines() + + if not url_list: + print("Fetching SRPMs url list") + p = subprocess.Popen('dnf download --source --skip-broken --url "*" | grep src.rpm', + stdout=subprocess.PIPE, + shell=True) + + urls = str(p.stdout.read(), "utf-8") + with open(srpm_list_file, 'w') as f: + f.write(urls) + url_list = urls.splitlines() # Load processing data, if any try: @@ -62,7 +79,6 @@ def main(): except BaseException: data = {} - pkgs = [] count = 0 total_urls = len(url_list) @@ -76,10 +92,10 @@ def main(): srpm_data = dnf.subject.Subject(srpm_filename) package = srpm_data.get_nevra_possibilities(forms=1)[0] - if args.srpm and args.srpm != package.name: + if srpm_regex and not srpm_regex.match(package.name): continue - if package.name in data: + if package.name in data and not args.force: # Compare version known_package = dnf.subject.Subject( data[package.name]["srpm"]).get_nevra_possibilities(forms=1)[0] @@ -97,31 +113,34 @@ def main(): print("{c}/{t} processing {n}".format( c=count, t=total_urls, n=package.name)) - if url.scheme == "rsync": - dl = subprocess.run( - ['rsync', url.geturl(), '/tmp'], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - else: 
- dl = subprocess.run( - ['curl', '-L', '--remote-name', url.geturl()], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - cwd='/tmp') - - if dl.returncode: - print("error downloading srpm:") - print(dl.stdout) - continue + srpm_path = os.path.join(srpms_path, srpm_filename) + if not os.path.isfile(srpm_path): + print("downloading {}".format(srpm_filename)) + if url.scheme == "rsync": + dl = subprocess.run( + ['rsync', url.geturl(), srpms_path], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + else: + dl = subprocess.run( + ['curl', '-L', '--remote-name', url.geturl()], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + cwd=srpms_path) + + if dl.returncode: + print("error downloading srpm:") + print(dl.stdout) + continue - srpm_path = "{srpm}/{a}".format(srpm='/tmp', a=srpm_filename) extract_srpm(tmp, srpm_path, result_folder) (tsearch, tcopy, results) = discover_translations( tmp, package.name, result_folder, tm_folder) - os.unlink(srpm_path) + if not args.keep: + os.unlink(srpm_path) # save processed srpm name & version data[package.name] = { From a1d379180be6545d22ad2991c240f3e20eb33d75 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Oct 06 2020 21:13:26 +0000 Subject: [PATCH 14/14] save result per file type this helps to detect what really is important (po files for now) taking translation-finder from git also requires to make multiple pip steps, including manual requirement installation... --- diff --git a/.gitignore b/.gitignore index d80c98e..0695f40 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ venv/ rpms/ -src.rpms/ +srpms/ results/ srpms_*.lst tm/ diff --git a/build.py b/build.py index 03114d1..9d2ffeb 100755 --- a/build.py +++ b/build.py @@ -216,7 +216,10 @@ def discover_translations(tmp, name, result_folder, tm_folder): translation["file_format"], result_folder) tcopy = round(time.time() - tcopy, 1) - cresults = len(translation_files) + + cresults = dict() + for file in translation_files: + cresults[file["file_format"]] = cresults.get(file["file_format"], 0) + 1 return (tsearch, tcopy, cresults) diff --git a/convertCSV.py b/convertCSV.py index dfa8fc8..9d1d5d3 100755 --- a/convertCSV.py +++ b/convertCSV.py @@ -4,6 +4,27 @@ import argparse import csv import json +EXTENSION_MAP = ( + (".po", "po"), + ("strings.xml", "aresource"), + (".ini", "joomla"), + (".csv", "csv"), + (".json", "json-nested"), + (".dtd", "dtd"), + (".php", "php"), + (".xlf", "xliff"), + (".xliff", "xliff"), + (".ts", "ts"), + (".resx", "resx"), + (".resw", "resx"), + (".xlsx", "xlsx"), + (".yml", "yaml"), + (".yaml", "yaml"), + (".properties", "properties"), + (".strings", "strings"), +) + + def main(): """Handle params""" @@ -18,10 +39,20 @@ def main(): with open('out.csv', mode='w') as csv_file: csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) - csv_writer.writerow(["package", "srpm", "tsearch", "tcopy", "results"]) + + row = ["package", "srpm", "tsearch", "tcopy"] + for end, result in EXTENSION_MAP: + row.append(end) + + csv_writer.writerow(row) + for d in data: - csv_writer.writerow([d, data[d]["srpm"], data[d]["tsearch"], data[d]["tcopy"], data[d]["results"]]) + row = [d, data[d]["srpm"], data[d]["tsearch"], data[d]["tcopy"]] + for end, result in EXTENSION_MAP: + row.append(data[d]["results"].get(result,0)) + + csv_writer.writerow(row) if __name__ == '__main__': main() diff --git a/docker/Dockerfile.32 b/docker/Dockerfile.32 index c2276ea..b6f5622 
100644
--- a/docker/Dockerfile.32
+++ b/docker/Dockerfile.32
@@ -4,6 +4,9 @@ RUN dnf install -y lbzip2 unzip xz git cpio translate-toolkit dnf-plugins-core p
 COPY requirements.txt /src/requirements.txt
 RUN pip install --no-cache -r /src/requirements.txt
+RUN pip install --upgrade https://github.com/WeblateOrg/language-data/archive/master.zip
+RUN pip install charamel
+RUN pip install git+https://github.com/WeblateOrg/translation-finder.git
 
 VOLUME /src
 WORKDIR /src
diff --git a/requirements.txt b/requirements.txt
index 3dfb040..edde0f9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,3 @@
-git+https://github.com/WeblateOrg/language-data.git@master
-git+https://github.com/WeblateOrg/translation-finder.git@master
 pyyaml
 pandas
 geopandas
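With the full series applied, each release ends up with per-package timing measures and per-format counts in results/f*/data.json. A minimal end-to-end usage sketch, assuming a Fedora 32 run and the paths used throughout these patches (the exact invocation is an assumption, not prescribed by the series):

```bash
# Build the Fedora 32 image and run the whole pipeline in it,
# keeping downloaded SRPMs in ./srpms so reruns skip the download.
podman build . -f docker/Dockerfile.32 -t fedlocstats:32
mkdir -p srpms
podman run -it --rm -v ./:/src:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G \
    fedlocstats:32 ./build.py --keep-srpms
# Flatten the collected measures into out.csv, one row per package.
./convertCSV.py --json results/f32/data.json
```

Per the last patch, convertCSV.py emits one column per known file format (from EXTENSION_MAP), which makes it easy to see that po files dominate the results.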