#112 Optimize co-package generation
Merged 4 years ago by pingou. Opened 4 years ago by qulogic.
qulogic/mdapi index  into  master

file modified
+11
@@ -192,6 +192,16 @@ 

          raise NotImplementedError(archive)

  

  

+ def index_db(name, tempdb):

+     print(f'{name.ljust(padding)} Indexing file: {tempdb}')

+ 

+     if tempdb.endswith('primary.sqlite'):

+         conn = sqlite3.connect(tempdb)

+         conn.execute('CREATE INDEX packageSource ON packages (rpm_sourcerpm)')

+         conn.commit()

+         conn.close()

+ 

+ 

  def compare_dbs(name, db1, db2, cache1, cache2):

      print(f'{name.ljust(padding)} Comparing {db1} and {db2}')

  
@@ -412,6 +422,7 @@ 

  

              download_db(name, repomd_url, archive)

              decompress_db(name, archive, tempdb)

+             index_db(name, tempdb)

              if PUBLISH_CHANGES:

                  packages = compare_dbs(name, tempdb, destfile, cache1, cache2)

                  publish_changes(name, packages, repomd_url)

file modified
+1 -1
@@ -153,7 +153,7 @@ 

              if pkg.rpm_sourcerpm:

                  async with db.execute(GET_CO_PACKAGE, (pkg.rpm_sourcerpm,)) as cursor:

                      copkgs = await cursor.fetchall()

-                 out['co-packages'] = list({cpkg[2] for cpkg in copkgs})

+                 out['co-packages'] = [cpkg[0] for cpkg in copkgs]

              else:

                  out['co-packages'] = []

              out['repo'] = repotype if repotype else 'release'

file modified
+1 -11
@@ -46,17 +46,7 @@ 

                        FROM {}

                        WHERE pkgKey = ?"""

  

- GET_CO_PACKAGE = """SELECT pkgKey,

-                            pkgId,

-                            name,

-                            rpm_sourcerpm,

-                            epoch,

-                            version,

-                            release,

-                            arch,

-                            summary,

-                            description,

-                            url

+ GET_CO_PACKAGE = """SELECT DISTINCT(name)

                      FROM packages

                      WHERE rpm_sourcerpm = ?"""

  

The GET_CO_PACKAGE query is used in _expand_pkg_info, which is called by get_pkg, get_src_pkg, and _process_dep; the last of these is in turn used by most other API calls. This means the query runs on every single API call except index and list_branches. It turns out that this query is slower than it needs to be.

Firstly, it returns 10 extra columns that are never used, and deduplication of the package names is done in Python. Both can be avoided by selecting only the name and letting sqlite deduplicate it with DISTINCT. Secondly, co-packages are looked up via the rpm_sourcerpm column, which has no index. This makes lookups on that column very slow.

For Rawhide source packages, 60% produce 1 package, 15% produce 2, 3.3% produce 3, 10.6% produce 4, 4.2% produce 5, and 2.0% produce 6; the remaining counts, anywhere from 7 to 90, are each <1%, a few individual source packages produce 100-300 packages, and of course texlive sits at the far end with 5936. For comparison, I ran ab -c 100 -n 1000 http://127.0.0.1:8080/f31/pkg/${pkg} on master and on each of the two commits here. The results for some packages across that range are shown below; wherever a value differs between runs, three columns are given, in the order master, first commit, second commit:

guake (1 co-package):

Server Software:        Python/3.7
Server Hostname:        127.0.0.1
Server Port:            8080

Document Path:          /f31/pkg/guake
Document Length:        2676 bytes

Concurrency Level:      100
Time taken for tests:   10.738 11.354 4.795 seconds
Complete requests:      1000
Failed requests:        0
Total transferred:      2902000 bytes
HTML transferred:       2676000 bytes
Requests per second:      93.130   88.070 208.570 [#/sec] (mean)
Time per request:       1073.818 1135.412 479.466 [ms] (mean)
Time per request:         10.738   11.354   4.795 [ms] (mean, across all concurrent requests)
Transfer rate:           263.920  249.600 591.070 [Kbytes/sec] received

Connection Times (ms)
              min    |           mean[±sd]            |     median    |       max
Connect:     0  0  0 |    0±  0.6    0±  0.6   0± 0.9 |    0    0   0 |    3    3   5
Processing: 22 30 11 | 1040±102.0 1098±116.7 469±54.5 | 1058 1121 480 | 1290 1348 578
Waiting:    19 27  6 | 1028±101.1 1083±115.3 464±53.4 | 1048 1107 475 | 1284 1315 565
Total:      22 30 11 | 1040±101.9 1098±116.7 469±54.6 | 1059 1121 481 | 1290 1348 581

Percentage of the requests served within a certain time (ms)
  50%   1059 1121 481
  66%   1089 1151 490
  75%   1106 1165 499
  80%   1116 1179 505
  90%   1141 1209 518
  95%   1157 1237 526
  98%   1201 1263 539
  99%   1240 1301 547
 100%   1290 1348 581 (longest request)

gcc (82 co-packages):

Server Software:        Python/3.7
Server Hostname:        127.0.0.1
Server Port:            8080

Document Path:          /f31/pkg/gcc
Document Length:        3803 bytes

Concurrency Level:      100
Time taken for tests:   12.264 8.664 5.030 seconds
Complete requests:      1000
Failed requests:        0
Total transferred:      4029000 bytes
HTML transferred:       3803000 bytes
Requests per second:      81.540 115.420 198.820 [#/sec] (mean)
Time per request:       1226.416 866.374 502.969 [ms] (mean)
Time per request:         12.264   8.664   5.030 [ms] (mean, across all concurrent requests)
Transfer rate:           320.820 454.140 782.270 [Kbytes/sec] received

Connection Times (ms)
              min    |            mean[±sd]          |     median   |       max
Connect:     0  0  0 |    0±  0.6   1±  0.6   1± 0.6 |    0   0   0 |    3    3   3
Processing: 12 13  7 | 1195±170.7 851±105.4 495±47.5 | 1255 864 503 | 1593 1195 588
Waiting:     9 10  4 | 1188±168.9 842±104.7 490±45.7 | 1247 855 498 | 1588 1189 582
Total:      12 14  7 | 1196±170.8 851±105.3 495±47.5 | 1256 864 504 | 1593 1196 588

Percentage of the requests served within a certain time (ms)
  50%   1256  864 504
  66%   1274  885 517
  75%   1284  896 527
  80%   1291  906 532
  90%   1322  928 545
  95%   1370  987 554
  98%   1440 1100 560
  99%   1468 1135 567
 100%   1593 1196 588 (longest request)

glibc (222 co-packages):

Server Software:        Python/3.7
Server Hostname:        127.0.0.1
Server Port:            8080

Document Path:          /f31/pkg/glibc
Document Length:        18019 bytes

Concurrency Level:      100
Time taken for tests:   20.770 11.707 10.725 seconds
Complete requests:      1000
Failed requests:        0
Total transferred:      18246000 bytes
HTML transferred:       18019000 bytes
Requests per second:      48.150   85.420   93.240 [#/sec] (mean)
Time per request:       2076.976 1170.684 1072.459 [ms] (mean)
Time per request:         20.770   11.707   10.725 [ms] (mean, across all concurrent requests)
Transfer rate:           857.900 1522.050 1661.450 [Kbytes/sec] received

Connection Times (ms)
               min    |             mean[±sd]            |     median     |       max
Connect:      0  0  0 |    0±  0.9    0±  0.7    0±  0.8 |    0    0    0 |    4    4    4
Processing:  13 11 10 | 2060±137.3 1148±125.3 1054±113.3 | 2072 1181 1085 | 2397 1351 1216
Waiting:      7  6  5 | 2056±136.0 1143±123.4 1050±111.7 | 2068 1175 1081 | 2386 1344 1206
Total:       13 11 10 | 2060±137.4 1149±125.3 1055±113.4 | 2072 1181 1085 | 2397 1351 1217

Percentage of the requests served within a certain time (ms)
  50%   2072 1181 1085
  66%   2120 1207 1108
  75%   2146 1227 1122
  80%   2160 1233 1133
  90%   2204 1254 1152
  95%   2242 1275 1166
  98%   2276 1298 1184
  99%   2295 1313 1195
 100%   2397 1351 1217 (longest request)

lodash (299 co-packages (max other than texlive)):

Server Software:        Python/3.7
Server Hostname:        127.0.0.1
Server Port:            8080

Document Path:          /f31/pkg/lodash
Document Length:        10603 bytes

Concurrency Level:      100
Time taken for tests:   28.533 17.792 8.471 seconds
Complete requests:      1000
Failed requests:        0
Total transferred:      10830000 bytes
HTML transferred:       10603000 bytes
Requests per second:      35.050   56.200  118.060  [#/sec] (mean)
Time per request:       2853.338 1779.239  847.053  [ms] (mean)
Time per request:         28.533   17.792   8.4710  [ms] (mean, across all concurrent requests)
Transfer rate:           370.660  594.420 1248.580  [Kbytes/sec] received

Connection Times (ms)
              min    |             mean[±sd]           |     median    |       max
Connect:     0  0  0 |    1±  0.6    1±  0.8   0±  0.8 |    0    0   0 |    3    3   4
Processing: 27 30 11 | 2784±402.7 1724±224.3 827±108.9 | 2870 1809 860 | 3574 2132 996
Waiting:    23 25  6 | 2775±401.6 1716±223.1 821±107.3 | 2863 1802 853 | 3564 2121 989
Total:      27 30 11 | 2784±402.6 1725±224.4 827±109.0 | 2871 1810 860 | 3574 2132 996

Percentage of the requests served within a certain time (ms)
  50%   2871 1810 860
  66%   2924 1821 879
  75%   2982 1843 893
  80%   3029 1852 898
  90%   3173 1872 916
  95%   3278 1892 934
  98%   3375 1914 954
  99%   3443 1946 968
 100%   3574 2132 996 (longest request)

texlive (5936 co-packages):

Server Software:        Python/3.7
Server Hostname:        127.0.0.1
Server Port:            8080

Document Path:          /f31/pkg/texlive
Document Length:        139946 bytes

Concurrency Level:      100
Time taken for tests:   331.041 88.963 86.424 seconds
Complete requests:      1000
Failed requests:        0
Total transferred:      140174000 bytes
HTML transferred:       139946000 bytes
Requests per second:        3.020   11.240   11.570 [#/sec] (mean)
Time per request:       33104.143 8896.277 8642.402 [ms] (mean)
Time per request:         331.041   88.963   86.424 [ms] (mean, across all concurrent requests)
Transfer rate:            413.510 1538.720 1583.920 [Kbytes/sec] received

Connection Times (ms)
              min    |               mean[±sd]            |      median     |        max
Connect:     0  0  0 |     0±   0.6    0±  0.6    0±  0.6 |     0    0    0 |     3    3    3
Processing: 32 21 18 | 32934±1324.7 8826±556.3 8574±495.2 | 33101 8911 8564 | 34806 9888 9586
Waiting:    28 17 14 | 32933±1324.7 8824±556.0 8572±495.6 | 33099 8904 8563 | 34806 9888 9586
Total:      32 21 18 | 32934±1324.6 8826±556.2 8575±495.0 | 33101 8911 8564 | 34807 9888 9586

Percentage of the requests served within a certain time (ms)
  50%  33101 8911 8564
  66%  33301 9069 8848
  75%  33468 9155 8931
  80%  33542 9214 8974
  90%  33840 9349 9067
  95%  34041 9471 9160
  98%  34291 9549 9269
  99%  34392 9611 9327
 100%  34807 9888 9586 (longest request)

Thus the first commit is a bit slower (about 5%) for packages with a single co-package, which is the most common case (most source packages are 1-to-1), but 1.42-3.72× faster (in requests per second) for larger packages. The second commit helps all packages, being 1.03-2.37× faster than the first commit. Overall, requests are 1.94-3.83× faster than on master (the smallest gain is for glibc: 48.15 → 93.24 requests per second).

I like this, thanks for tracking this down!

Pull-Request has been merged by pingou

4 years ago