#11796 Remove pdc from get-retired-packages.py script
Merged 5 days ago by humaton. Opened 7 months ago by lenkaseg.
lenkaseg/releng remove_pdc  into  main

@@ -2,108 +2,53 @@ 

  specified branches marked as inactive


  from __future__ import print_function

- try:

-     from urllib import urlencode

- except ImportError:

-     from urllib.parse import urlencode

- import multiprocessing.pool

- from multiprocessing import cpu_count

  import argparse

- from math import ceil

- from functools import partial

  import sys

- import traceback

  import requests


- PDC_URL = 'https://pdc.fedoraproject.org'

- # These are set to 4 so that there is a performance gain but it should be low

- # enough to not overwhelm PDC



+ DISTGIT_URL = "https://src.fedoraproject.org/lookaside/"



- def handle_errors(f):

-     def _wrapper(*args, **kwargs):

-         """ A decorator for `f` that prints tracebacks. """

-         try:

-             return f(*args, **kwargs)

-         except:

-             traceback.print_exc()

-             raise

-     _wrapper.__name__ = f.__name__

-     _wrapper.__doc__ = f.__doc__

-     return _wrapper



- def get_component_branch_page(branch_name, page, page_size=100):

-     query_args = {'type': 'rpm', 'name': branch_name, 'active': False,

-                   'page_size': page_size, 'page': page}

-     pdc_api_query_url = '{0}/rest_api/v1/component-branches/?{1}'.format(

-         PDC_URL.rstrip('/'), urlencode(query_args))

+ def get_retired_pkgs(branch_name):

+     distgit_api_query_url = "{0}/retired_in_{1}.json".format(DISTGIT_URL.rstrip("/"), branch_name)

+     rv = requests.get(distgit_api_query_url, timeout=30)


-         rv = requests.get(pdc_api_query_url, timeout=30)

-     except (requests.ConnectionError, requests.ConnectTimeout):

-         print('The connection to PDC failed', file=sys.stderr)

+         rv.raise_for_status()

+     except requests.exceptions.HTTPError as e:

+         print(

+             f"The connection to distgit failed. Failed url: {distgit_api_query_url}, error: {str(e)}",

+             file=sys.stderr,

+         )




          return rv.json()

      except ValueError:

-         print('The data returned from PDC was not JSON', file=sys.stderr)

+         print(

+             f"The data returned from distgit for branch {branch_name} was not JSON", file=sys.stderr

+         )




- def get_pkgs_from_page(branch_name, page):

-     pkgs_set = set()

-     rv_json = get_component_branch_page(branch_name, page)

-     # Extract the package names from API results

-     for branch_rv in rv_json['results']:

-         pkgs_set.add(str(branch_rv['global_component']))


-     return pkgs_set



- @handle_errors

- def get_pkg_branch_status(branch_name):

-     # Get total number of branches that fit the query

-     component_branch_page_one = \

-         get_component_branch_page(branch_name, page=1, page_size=1)

-     # Get the total number of pages

-     num_pages = int(ceil(component_branch_page_one['count'] / 100.0))

-     # Since we are going to multi-thread, we need to make a partial function

-     # call so that all the function needs is an iterable to run

-     partial_get_pkgs_from_page = partial(get_pkgs_from_page, branch_name)

-     # Start processing NUM_THREADS_PER_PROCESS pages at a time

-     pool = multiprocessing.pool.ThreadPool(NUM_THREADS_PER_PROCESS)

-     pkg_sets = pool.map(partial_get_pkgs_from_page, range(1, num_pages + 1))

-     pool.close()

-     # Return a set of all the packages from the pages queried

-     if pkg_sets:

-         return set.union(*pkg_sets)

-     else:

-         return set()



- if __name__ == '__main__':

+ if __name__ == "__main__":

      parser = argparse.ArgumentParser()

-     help = 'the branches that the returned packages will have retired'

-     parser.add_argument('branches', nargs='+', help=help)

+     help = "the branches that the returned packages will have retired"

+     parser.add_argument("branches", nargs="+", help=help)

      args = parser.parse_args()

-     if cpu_count() > MAX_NUM_PROCESSES:

-         num_processes = MAX_NUM_PROCESSES

-     else:

-         num_processes = cpu_count()

-     # Process up to num_processes branches at a time in separate processes

-     pool = multiprocessing.Pool(processes=num_processes)

-     pkg_sets = pool.map(get_pkg_branch_status, args.branches)

-     pool.close()


+     pkg_sets = []

+     for branch in args.branches:

+         pkg_list = get_retired_pkgs(branch).get(branch)

+         pkg_sets.extend(pkg_list)

+         print(pkg_sets)

      # Return only the packages that have all the specified branches and are

      # retired

-     pkgs = list(set.intersection(*pkg_sets))

+     pkgs = list(set(pkg_sets))

      if pkgs:

          for pkg in sorted(pkgs):



-         print('No retired packages were returned from the branches: {0}'

-               .format(', '.join(args.branches), file=sys.stderr))

+         print(

+             "No retired packages were returned from the branches: {0}".format(

+                 ", ".join(args.branches)

+             )

+         )

Based on PR#11757 and PR#11795, remove PDC from the script and use the link to lookaside, where the JSON of retired packages now lives.

Signed-off-by: Lenka Segura lsegura@redhat.com

Looks good to me. :)

Is this ready to merge? or is there still work before it's ready to go?

I think we should wait until we have the cron job for getting the retired packages deployed to production, which is planned for next Monday.

Can you confirm that, @humaton?

rebased onto 5a8b6be

2 months ago

Yes, the JSON is regenerated nightly and it seems to have correct data in it.

rebased onto 29a0cb0

a month ago

OK, I have just merged changes to the script generating the JSON data. Let's wait until it runs; I will merge this tomorrow and watch for any issues.

rebased onto 3e8c69d

a month ago

rebased onto 1cd6df4

12 days ago

rebased onto 58b73f4

10 days ago

Pull-Request has been merged by humaton

5 days ago