From ca284b3fc9f701c1f2d60098709c75a37f605291 Mon Sep 17 00:00:00 2001 From: Jakub Kadlcik Date: Sep 12 2022 10:29:07 +0000 Subject: [PATCH 1/2] backend: decode special characters from URLs Otherwise we will flood the database with values like %40spacewalkproject. Also, ignore double encoded special characters. --- diff --git a/backend/copr_backend/hitcounter.py b/backend/copr_backend/hitcounter.py index 2a2e032..0222462 100644 --- a/backend/copr_backend/hitcounter.py +++ b/backend/copr_backend/hitcounter.py @@ -5,6 +5,7 @@ Shared logic for hitcounter scripts import os import re from datetime import datetime +from requests.utils import unquote from copr_common.request import SafeRequest from copr_backend.helpers import BackendConfigReader @@ -118,6 +119,21 @@ def get_hit_data(accesses, log): url, bot.group(1)) continue + # Convert encoded characters from their %40 values back to @. + url = unquote(url) + + # I don't know how or why but occasionally there is a URL that is + # encoded twice (%2540oamg -> %40oamg -> @oamg), and yet its status + # code is 200. AFAIK these appear only for EPEL-7 chroots and their + # User-Agent is something like urlgrabber/3.10%20yum/3.4.3 + # I wasn't able to reproduce such accesses, and we decided to not count + # them. + if url != unquote(url): + log.warning("Skipping: %s (double encoded URL, user-agent: '%s', " + "status: %s)", access["cs-uri-stem"], + access["cs(User-Agent)"], access["sc-status"]) + continue + # We don't want to count every accessed URL, only those pointing to # RPM files and repo file key_strings = url_to_key_strings(url) From 79e69073a5789470ca2aa0bc17a94dd60936b04c Mon Sep 17 00:00:00 2001 From: Jakub Kadlcik Date: Sep 12 2022 10:29:14 +0000 Subject: [PATCH 2/2] backend: ignore downloaded SRPM files Otherwise we count each SRPM that DistGit downloads from backend, and that inflates the stats of project RPMs downloaded. 
We could specify a User-Agent on DistGit and ignore it in the hitcounter script, but we agreed that we can ignore SRPM downloads altogether. --- diff --git a/backend/copr_backend/hitcounter.py b/backend/copr_backend/hitcounter.py index 0222462..bfe9341 100644 --- a/backend/copr_backend/hitcounter.py +++ b/backend/copr_backend/hitcounter.py @@ -141,6 +141,12 @@ def get_hit_data(accesses, log): log.debug("Skipping: %s", url) continue + if any(x for x in key_strings + if x.startswith("chroot_rpms_dl_stat|") + and x.endswith("|srpm-builds")): + log.debug("Skipping %s (SRPM build)", url) + continue + log.debug("Processing: %s", url) # When counting RPM access, we want to iterate both project hits and