#1404 backend, frontend: cancel also "starting" builds
Merged 5 years ago by praiskup. Opened 5 years ago by praiskup.
Unknown source cancel-starting-task  into  master

@@ -350,6 +350,18 @@

          self.canceled = bool(self.redis_get_worker_flag("cancel_request"))

          return self.canceled

  

+     def _cancel_if_requested(self):

+         """

+         Raise BuildCanceled exception if there's already a request for

+         cancellation.  This is useful as "quick and cheap check" before starting

+         some expensive task that would have to be later canceled anyways.

+         We can call this multiple times, anytime we feel it is appropriate.

+         """

+         if self._cancel_task_check_request():

+             self.log.warning("Canceling the build early")

+             self._drop_host()

+             raise BuildCanceled

+ 

      def _cancel_vm_allocation(self):

          self.redis_set_worker_flag("canceling", 1)

          self._drop_host()
@@ -659,10 +671,12 @@

          failed = True

  

          self._wait_for_repo()

+         self._cancel_if_requested()

          self._alloc_host()

          self._alloc_ssh_connection()

          self._check_vm()

          self._fill_build_info_file()

+         self._cancel_if_requested()

          self._mark_running(attempt)

          self._start_remote_build()

          transfer_failure = CancellableThreadTask(

@@ -534,6 +534,42 @@

  

  @_patch_bwbuild_object("CANCEL_CHECK_PERIOD", 0.5)

  @mock.patch("copr_backend.sign.SIGN_BINARY", "tests/fake-bin-sign")

+ def test_cancel_before_vm(f_build_rpm_sign_on, caplog):

+     config = f_build_rpm_sign_on

+     worker = config.bw

+     # set this early

+     worker.redis_set_worker_flag("cancel_request", 1)

+     worker.process()

+     assert_logs_exist([

+         "Build was canceled",

+         "Canceling the build early",

+         COMMON_MSGS["not finished"],

+         "Worker failed build",

+     ], caplog)

+     assert_logs_dont_exist(["Releasing VM back to pool"], caplog)

+ 

+ @_patch_bwbuild_object("CANCEL_CHECK_PERIOD", 0.5)

+ @mock.patch("copr_backend.sign.SIGN_BINARY", "tests/fake-bin-sign")

+ def test_cancel_before_start(f_build_rpm_sign_on, caplog):

+     config = f_build_rpm_sign_on

+     worker = config.bw

+ 

+     # cancel request right before starting the build

+     worker._fill_build_info_file = mock.MagicMock()

+     worker._fill_build_info_file.side_effect = \

+         lambda: worker.redis_set_worker_flag("cancel_request", 1)

+ 

+     worker.process()

+     assert_logs_exist([

+         "Build was canceled",

+         "Releasing VM back to pool",

+         "Canceling the build early",

+         COMMON_MSGS["not finished"],

+         "Worker failed build",

+     ], caplog)

+ 

+ @_patch_bwbuild_object("CANCEL_CHECK_PERIOD", 0.5)

+ @mock.patch("copr_backend.sign.SIGN_BINARY", "tests/fake-bin-sign")

  def test_build_retry(f_build_rpm_sign_on):

      config = f_build_rpm_sign_on

      worker = config.bw

@@ -913,13 +913,13 @@

              raise InsufficientRightsException(

                  "You are not allowed to cancel this build.")

          if not build.cancelable:

-             if build.status == StatusEnum("starting"):

-                 # this is not intuitive, that's why we provide more specific message

-                 err_msg = "Cannot cancel build {} in state 'starting'".format(build.id)

-             else:

-                 err_msg = "Cannot cancel build {}".format(build.id)

+             err_msg = "Cannot cancel build {}".format(build.id)

              raise ConflictingRequest(err_msg)

  

+         # No matter the state, we tell backend to cancel this build.  Even when

+         # the build is in pending state (worker manager may be already handling

+         # this build ATM, and creating an asynchronous worker which needs to be

+         # canceled).

          ActionsLogic.send_cancel_build(build)

  

          build.canceled = True

@@ -1148,7 +1148,7 @@

          """

          Find out if this build is cancelable.

          """

-         return not self.finished and self.status != StatusEnum("starting")

+         return not self.finished

  

      @property

      def repeatable(self):

There's no point to not deliver a cancel request when the build is in
"starting" state. At least now.

Also, from the backend perspective, try to check if there's the cancel
request always before we start doing some expensive actions.

Fixes: #1391

Why do we call it twice? Is there a considerable lag caused by some of those functions called in between?

There's no point to not deliver a cancel request when the build is in
"starting" state. At least now.

IIRC we historically weren't able to cancel build in that state. I don't know if it had something to do with our queue architecture or the state of OpenStack though. Maybe @msuchy would know the reason if you are interested in it.

Anyway, except for that one question, LGTM.

Because both _alloc_host() and _transfer_log_file() may take quite some/eat lot of resources when we just attempted to start them, so it's better to do cheap check whether it makes sense to even begin with the task rather than cancel it later.

The _cancel_if_requested() is meant to be called anytime we want.

IIRC we historically weren't able to cancel build in that state. I don't know if it had something to do with our queue architecture or the state of OpenStack though. Maybe @msuchy would know the reason if you are interested in it.

The code we talk about is a very old one, and I think that the build was
marked as non cancellable one because we were not sure whether backend
already started the build or not. IOW, there was a risk that we could
make it "canceled" on FE, but BE would still process the build due to
a race condition (yea, some very similar bug was there anyway, ... but I
think this was the original intention).

I'm not entirely sure, so I don't want to dump uncertain statement into
commit message. But I'm pretty sure we want to make "starting" state a
cancellable one with the current code.

I would maybe put a short comment about it in the code because otherwise someone might see the code after some time, scratch his head why it is there twice and remove it.

rebased onto 65d4c8c06d06dce8bedef4b07cc410d98c2bb707

5 years ago

rebased onto 0041878dfc4aac192f5a1aac84f09616702b9115

5 years ago

rebased onto 06d7629

5 years ago

Pull-Request has been merged by praiskup

5 years ago