#2334 frontend: no delay after large SRPM upload
Merged 2 years ago by praiskup. Opened 2 years ago by praiskup.
Unknown source no-delay-after-upload  into  main

@@ -16,6 +16,7 @@

  from openid_teams.teams import TeamsResponse

  

  from coprs.redis_session import RedisSessionInterface

+ from coprs.request import get_request_class

  

  app = flask.Flask(__name__)

  if "COPRS_ENVIRON_PRODUCTION" in os.environ:
@@ -96,6 +97,8 @@

  })

  app.cache = cache

  

+ app.request_class = get_request_class(app)

+ 

  from coprs.views import admin_ns

  from coprs.views.admin_ns import admin_general

  from coprs.views import api_ns

@@ -21,6 +21,7 @@

  from coprs import db

  from coprs import models

  from coprs import helpers

+ from coprs.request import save_form_file_field_to

  from coprs.exceptions import (

      ActionInProgressException,

      BadRequest,
@@ -582,12 +583,12 @@

              copr_dirname=copr_dirname, **build_options)

  

      @classmethod

-     def create_new_from_upload(cls, user, copr, f_uploader, orig_filename,

+     def create_new_from_upload(cls, user, copr, form_field, orig_filename,

                                 chroot_names=None, copr_dirname=None, **build_options):

          """

          :type user: models.User

          :type copr: models.Copr

-         :param f_uploader(file_path): function which stores data at the given `file_path`

+         :param form_field: object representing an uploaded file in form

          :return:

          """

          tmp = None
@@ -596,7 +597,7 @@

              tmp_name = os.path.basename(tmp)

              filename = secure_filename(orig_filename)

              file_path = os.path.join(tmp, filename)

-             f_uploader(file_path)

+             save_form_file_field_to(form_field, file_path)

          except OSError as error:

              if tmp:

                  shutil.rmtree(tmp)

@@ -0,0 +1,87 @@

+ """

+ Use NamedTemporaryFile for large file uploads, so we eventually don't have copy

+ the uploaded file to the final destination.

+ """

+ 

+ import os

+ import tempfile

+ import typing as t

+ 

+ from werkzeug.formparser import default_stream_factory

+ import flask

+ 

+ 

+ NAMED_FILE_FROM_BYTES = 50*1024*1024

+ 

+ 

+ def get_request_class(app):

+     """

+     Return a Copr-specific Request class instantiated by Flask for every

+     request.  We want to override the internal _get_file_stream() (HACK!) method

+     to enforce using the NamedTemporaryFile class for the large uploaded files -

+     instead of the werkzeug's default SpooledTemporaryFile.  While

+     SpooledTemporaryFile() by default stores small files into memory and

+     fallbacks to the /tmp directory, NamedTemporaryFile() data destination can

+     be controlled by our Copr logic, so we can store the file into

+     config[STORAGE_DIR] directly.  The slow uploads (typically network limits,

+     from remote users' boxes) are thus directly stored to the target volume, and

+     once uploaded we don't have to do the expensive cross-volume file copy

+     (/tmp => STORAGE_DIR).  For very large source RPMs this could even mean

+     several minutes.

+ 

+     Be careful, we can't simply "move" the temporary file to the target

+     destination because the upper level Flask/Werkzeug logic automatically

+     unlinks the temporary file when the corresponding request is processed.

+     We have to do os.link() instead.

+ 

+     Reported to Flask upstream: https://github.com/pallets/flask/issues/4844

+     """

+ 

+     class _CoprRequestClass(flask.Request):

+         # pylint: disable=no-self-use

+         def _get_file_stream(

+             self,

+             total_content_length: t.Optional[int],

+             content_type: t.Optional[str],

+             filename: t.Optional[str] = None,

+             content_length: t.Optional[int] = None,

+         ) -> t.IO[bytes]:

+             """Called to get a stream for the file upload."""

+ 

+             # We do this hack only for reasonably large uploaded files,

+             # elsewhere we use the default Flask upload mechanism.  This is to

+             # stay as close to the upstream behavior as possible and stay

+             # compatible in the future.

+             if total_content_length and total_content_length >= NAMED_FILE_FROM_BYTES:

+                 # pylint: disable=consider-using-with

+                 tfile = tempfile.NamedTemporaryFile(

+                     "rb+", dir=app.config["STORAGE_DIR"])

+                 setattr(tfile, "named_file", True)

+                 return t.cast(t.IO[bytes], tfile)

+ 

+             # For the smaller RPMs, we keep using the default Temporary files

+             # abstraction.

+             return default_stream_factory(

+                 total_content_length,

+                 content_type,

+                 filename,

+                 content_length,

+             )

+ 

+     return _CoprRequestClass

+ 

+ 

+ def save_form_file_field_to(field, path: str):

+     """

+     Store the uploaded data on Werkzeug file field.  Use this if the hack

+     with _CoprRequestClass above is in action, and when you are sure that the

+     uploaded file is on the same partition with the target path (so we can

+     hardlink).

+     """

+     stream = field.data.stream

+ 

+     # Detect if SpooledTemporaryFile or NamedTemporaryFile is the storage

+     if hasattr(stream, "named_file"):

+         os.link(stream.name, path)

+     else:

+         field.data.save(path)

@@ -188,7 +188,7 @@

      def create_new_build(options):

          return BuildsLogic.create_new_from_upload(

              flask.g.user, copr,

-             f_uploader=lambda path: form.pkgs.data.save(path),

+             form.pkgs,

              orig_filename=secure_filename(form.pkgs.data.filename),

              **options,

          )

@@ -420,7 +420,7 @@

      def factory(**build_options):

          BuildsLogic.create_new_from_upload(

              flask.g.user, copr,

-             f_uploader=lambda path: form.pkgs.data.save(path),

+             form.pkgs,

              orig_filename=form.pkgs.data.filename,

              chroot_names=form.selected_chroots,

              **build_options

@@ -220,8 +220,10 @@

          """ Post API3 form under "user" """

  

          method = self.test_class_object.post_api3_with_auth

-         if "upload_comps" in content:

-             method = self.test_class_object.post_api3_with_auth_multipart

+         for key in content:

+             if isinstance(content[key], FileStorage):

+                 method = self.test_class_object.post_api3_with_auth_multipart

+                 break

          return method(

              url, content, self.test_class_object.transaction_user)

  
@@ -320,6 +322,23 @@

          resp = self.post(route, data)

          return resp

  

+     def submit_uploaded_build(self, project, source_rpm, build_options=None):

+         """

+         Submit a build via source RPM

+         """

+         route = "/api_3/build/create/upload"

+         data = {

+             "ownername": self.transaction_username,

+             "projectname": project,

+             "pkgs": FileStorage(

+                 stream=open(source_rpm, "rb"),

+                 filename=os.path.basename(source_rpm),

+                 content_type="application/rpm",

+             )

+         }

+         data.update(self._form_data_from_build_options(build_options))

+         return self.post(route, data)

+ 

      def create_distgit_package(self, project, pkgname, options=None,

                                 expected_status_code=None):

          route = "/api_3/package/add/{}/{}/{}/distgit".format(

@@ -2,12 +2,15 @@

  

  import json

  import os

+ import tempfile

  import time

+ from unittest import mock

  

  import pytest

  

  from sqlalchemy.orm.exc import NoResultFound

  from coprs import models

+ from coprs.request import NAMED_FILE_FROM_BYTES

  

  from copr_common.enums import StatusEnum

  from coprs.exceptions import (ActionInProgressException,
@@ -429,13 +432,13 @@

          assert "has no active chroots" in str(error.value)

          assert len(self.c1.active_copr_chroots) == 0

  

+     @mock.patch('coprs.logic.builds_logic.save_form_file_field_to')

      @pytest.mark.usefixtures("f_users", "f_coprs", "f_mock_chroots", "f_db")

-     def test_create_new_from_upload_no_space_left(self):

-         def f_uploader(file_path):

-             raise OSError("[Errno 28] No space left on device")

+     def test_create_new_from_upload_no_space_left(self, save_form_patch):

+         save_form_patch.side_effect = OSError("[Errno 28] No space left on device")

  

          with pytest.raises(InsufficientStorage) as error:

-             BuildsLogic.create_new_from_upload(self.u1, self.c1, f_uploader,

+             BuildsLogic.create_new_from_upload(self.u1, self.c1, None,

                                                 "fake.src.rpm",

                                                 chroot_names=["fedora-18-x86_64"],

                                                 copr_dirname=None)
@@ -540,22 +543,56 @@

  

      @TransactionDecorator("u1")

      @pytest.mark.usefixtures("f_users", "f_users_api", "f_mock_chroots", "f_db")

+     # Call SpooledTemporaryFile twice (once in memory, once in file), and

+     # NamedTemporaryFile once.

+     @pytest.mark.parametrize("kbytes", [100, 1024, 100*1024])

+     def test_srpm_upload(self, kbytes):

+         self.web_ui.new_project("test", ["fedora-rawhide-i386"])

+ 

+         srpm_base = "test.src.rpm"

+         workdir = os.path.dirname(__file__)

+         srpm = os.path.join(workdir, srpm_base)

+         with open(srpm, "w", encoding="utf-8") as fd:

+             string = "x"*1024*kbytes

+             fd.write(string)

+ 

+         not_patched = tempfile.NamedTemporaryFile

+         with mock.patch("coprs.request.tempfile.NamedTemporaryFile",

+                         wraps=not_patched) as patch:

+             resp = self.api3.submit_uploaded_build("test", srpm)

+             exp_count = 1 if kbytes*1024 > NAMED_FILE_FROM_BYTES else 0

+             assert patch.call_count == exp_count

+ 

+         assert resp.status_code == 200

+         # url = https://copr.stg.fedoraproject.org/tmp/tmpf_3w8r9i/test.src.rpm'

+         url = json.loads(resp.data)["source_package"]["url"]

+         tmpdir = url.split("/")[-2]

+         storage = os.path.join(self.app.config["STORAGE_DIR"], tmpdir, srpm_base)

+         stat = os.stat(storage)

+         assert stat.st_size == 1024*kbytes

+         assert stat.st_nlink == 1

+ 

+     @TransactionDecorator("u1")

+     @pytest.mark.usefixtures("f_users", "f_users_api", "f_mock_chroots", "f_db")

      @pytest.mark.parametrize("fail", [False, True])

      def test_temporary_data_removed(self, fail):

          self.web_ui.new_project("test", ["fedora-rawhide-i386"])

          content = "content"

          filename = "fake.src.rpm"

-         def f_uploader(file_path):

+         def save_field(_field, file_path):

              assert file_path.endswith(filename)

-             with open(file_path, "w") as fd:

+             with open(file_path, "w", encoding="utf-8") as fd:

                  fd.write(content)

  

          user = models.User.query.get(1)

          copr = models.Copr.query.first()

-         build = BuildsLogic.create_new_from_upload(

-             user, copr, f_uploader, os.path.basename(filename),

-             chroot_names=["fedora-18-x86_64"],

-             copr_dirname=None)

+ 

+         with mock.patch("coprs.logic.builds_logic.save_form_file_field_to",

+                         new=save_field):

+             build = BuildsLogic.create_new_from_upload(

+                 user, copr, None, os.path.basename(filename),

+                 chroot_names=["fedora-18-x86_64"],

+                 copr_dirname=None)

  

          source_dict = build.source_json_dict

          storage = os.path.join(self.app.config["STORAGE_DIR"], source_dict["tmp"])

Don't upload files to /tmp, but directly to STORAGE_DIR. This means we
don't have to do expensive /tmp => STORAGE_DIR copy when the file is
uploaded - so the client doesn't have to wait more (and potentially
timeout).

Fixes: #2284

For now this is just work in progress; (a) the tests need to be fixed, and
(b) this is a hack solving a quite common case, which should be reported
upstream at least (and linked from our hack-code).

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

rebased onto 90b5db392a01c485e4556624caec75899cdc1345

2 years ago

Metadata Update from @praiskup:
- Pull-request tagged with: wip

2 years ago

Build succeeded.

Is there a reason why don't use it for smaller files as well?

This condition deserves an explanation. I know that this is because of the

Be careful, we can't simply "move" the temporary file to the target
destination because the upper level Flask/Werkzeug logic
automatically unlinks the temporary file.

But I still don't understand the condition. I would expect simply calling os.link just before calling form_field.data.save.

Can you please link the upstream flask issue here?

So far the PR looks, I like the approach. Hopefully, an official solution will be provided by flask soon.

In the docs https://flask.palletsprojects.com/en/2.2.x/patterns/fileuploads/#improving-uploads , there is:

So how exactly does Flask handle uploads? Well it will store them in the webserver’s memory if the files are reasonably small, otherwise in a temporary location (as returned by tempfile.gettempdir()).

So an alternative solution might be somehow mocking the tempfile.gettempdir to return STORAGE_DIR instead. Just in case it turns out that we need a different approach.

I should have document this. My though was to go through the default Flask code path most of the time, and special-case only the files where it makes real sense... but I can change it if you have a different preference.

Good point, I'll explain. We do either os.link or form_field.data.save, not both. If stream.name is defined, it is NamedTemporaryFile - and we can os.link() (we can't link SpooledTemporaryFile).

So an alternative solution might be somehow mocking the tempfile.gettempdir to
return STORAGE_DIR instead. Just in case it turns out that we need a different
approach.

That seems possible, but it might well be even slower. With
SpooledTemporaryFile we still can not "move" or "link", we have to copy using
the 'field.data.save()' method. That will be:

STORAGE_DIR (slow) => STORAGE_DIR

Instead of:

/tmp (fast read) => STORAGE_DIR

We could move the STORAGE_DIR to /tmp but it doesn't survive power cycle, and is relatively small.

Heh, my thinking was exactly the opposite - Not having special cases, and therefore not having any special-case bugs :D
But your approach is reasonable, let's keep it as is.

rebased onto 3a444dfadc21cdeb587aa6bae45254820ef1f46d

2 years ago

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

rebased onto 5ea3b52e67688456aa64400ba6c385d8091a0913

2 years ago

Build succeeded.

rebased onto 661f4d657255e0443db828a63694daae04c58bee

2 years ago

Metadata Update from @praiskup:
- Pull-request untagged with: wip

2 years ago

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

rebased onto feed8eb9aa1fc00eb69986c522ace36d0c814066

2 years ago

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

rebased onto d5bea15

2 years ago

Build succeeded.

Thank you very much for the additional comments.
LGTM now.

Commit 38e41db fixes this pull-request

Pull-Request has been merged by praiskup

2 years ago