#4224 Match longest extension first for archivetype
Opened 5 months ago by mikem. Modified 6 hours ago

file modified
+8 -5
@@ -7796,15 +7796,18 @@ 

      else:

          raise koji.GenericError('one of filename, type_name, or type_id must be specified')

  

-     parts = filename.split('.')

+     # otherwise match the filename

      query = QueryProcessor(

          tables=['archivetypes'],

          columns=['id', 'name', 'description', 'extensions', 'compression_type'],

-         clauses=['extensions ~* %(pattern)s'],

+         clauses=[r"%(ext)s IN (SELECT lower(s)"

+                  r" FROM unnest(regexp_split_to_array(extensions, '\s+')) AS s)"],

      )

-     for start in range(len(parts) - 1, -1, -1):

+     # match longest extension first. e.g. .tar.gz before .gz

+     parts = filename.lower().split('.')

+     for start in range(len(parts)):

          ext = '.'.join(parts[start:])

-         query.values['pattern'] = r'(\s|^)%s(\s|$)' % ext

+         query.values['ext'] = ext

          results = query.execute()

  

          if len(results) == 1:
@@ -7814,7 +7817,7 @@ 

              raise koji.GenericError('multiple matches for file extension: %s' % ext)

      # otherwise

      if strict:

-         raise koji.GenericError('unsupported file extension: %s' % ext)

+         raise koji.GenericError('unsupported file extension: %s' % filename)

      else:

          return None

  

@@ -61,7 +61,8 @@ 

          archive_info = [{'id': 1, 'name': 'archive-type-1', 'extensions': 'ext'},

                          {'id': 2, 'name': 'archive-type-2', 'extensions': 'ext'}]

          filename = 'test-filename.ext'

-         self.qp_execute_return_value = archive_info

+         self.qp_execute_side_effect = [[], archive_info]

+         # no matches for full name, multiple matches for .ext

          with self.assertRaises(koji.GenericError) as ex:

              kojihub.get_archive_type(filename=filename)

          self.assertEqual("multiple matches for file extension: ext", str(ex.exception))
@@ -70,7 +71,10 @@ 

          query = self.queries[0]

          self.assertEqual(query.tables, ['archivetypes'])

          self.assertEqual(query.joins, None)

-         self.assertEqual(query.clauses, ['extensions ~* %(pattern)s'])

+         _clauses = [

+             "%(ext)s IN (SELECT lower(s) FROM "

+             "unnest(regexp_split_to_array(extensions, '\\s+')) AS s)"]

+         self.assertEqual(query.clauses, _clauses)

          self.assertEqual(query.columns,

                           ['compression_type', 'description', 'extensions', 'id', 'name'])

          get_archive_type_by_name.assert_not_called()
@@ -91,7 +95,10 @@ 

          query = self.queries[0]

          self.assertEqual(query.tables, ['archivetypes'])

          self.assertEqual(query.joins, None)

-         self.assertEqual(query.clauses, ['extensions ~* %(pattern)s'])

+         _clauses = [

+             "%(ext)s IN (SELECT lower(s) FROM "

+             "unnest(regexp_split_to_array(extensions, '\\s+')) AS s)"]

+         self.assertEqual(query.clauses, _clauses)

          self.assertEqual(query.columns,

                           ['compression_type', 'description', 'extensions', 'id', 'name'])

          get_archive_type_by_name.assert_not_called()
@@ -111,7 +118,10 @@ 

          query = self.queries[0]

          self.assertEqual(query.tables, ['archivetypes'])

          self.assertEqual(query.joins, None)

-         self.assertEqual(query.clauses, ['extensions ~* %(pattern)s'])

+         _clauses = [

+             "%(ext)s IN (SELECT lower(s) FROM "

+             "unnest(regexp_split_to_array(extensions, '\\s+')) AS s)"]

+         self.assertEqual(query.clauses, _clauses)

          self.assertEqual(query.columns,

                           ['compression_type', 'description', 'extensions', 'id', 'name'])

          get_archive_type_by_name.assert_not_called()
@@ -129,7 +139,10 @@ 

          query = self.queries[0]

          self.assertEqual(query.tables, ['archivetypes'])

          self.assertEqual(query.joins, None)

-         self.assertEqual(query.clauses, ['extensions ~* %(pattern)s'])

+         _clauses = [

+             "%(ext)s IN (SELECT lower(s) FROM "

+             "unnest(regexp_split_to_array(extensions, '\\s+')) AS s)"]

+         self.assertEqual(query.clauses, _clauses)

          self.assertEqual(query.columns,

                           ['compression_type', 'description', 'extensions', 'id', 'name'])

          get_archive_type_by_name.assert_not_called()

This addresses a longstanding bug in get_archive_type. In cases of layered extensions, the code would match a shorter extension before a longer one. E.g. ".gz" would match before ".tar.gz". We have avoided such overlapping archivetypes for this reason, however we can't control what instances do.

Fixes https://pagure.io/koji/issue/4291

1 new commit added

  • make sure extension matching is case insensitive
5 months ago

1 new commit added

  • fix unit test
5 months ago

Metadata Update from @tkopecek:
- Pull-request tagged with: testing-ready

2 months ago

Metadata Update from @tkopecek:
- Pull-request untagged with: testing-ready
- Pull-request tagged with: testing-basic

6 hours ago