#1803 Ensure we stop all phases when exiting abnormally (#1799)
Closed 14 days ago by lsedlar. Opened 6 months ago by adamwill.
adamwill/pungi phases-track-kill  into  master

file modified
+12 -5
@@ -24,6 +24,7 @@ 

      def __init__(self, compose):

          self.compose = compose

          self.msg = "---------- PHASE: %s ----------" % self.name.upper()

+         self.started = False

          self.finished = False

          self._skipped = False

  
@@ -55,6 +56,7 @@ 

          return False

  

      def start(self):

+         self.started = True

          self._skipped = self.skip()

          if self._skipped:

              self.compose.log_warning("[SKIP ] %s" % self.msg)
@@ -104,21 +106,26 @@ 

                  "Note that variants can be excluded in configuration file"

              )

  

-     def stop(self):

-         if self.finished:

+     def stop(self, kill=False):

+         if self.finished or not self.started:

              return

          if hasattr(self, "pool"):

+             if kill:

+                 self.pool.kill()

              self.pool.stop()

          self.finished = True

-         self.compose.log_info("[DONE ] %s" % self.msg)

+         if kill:

+             self.compose.log_info("[KILLED ] %s" % self.msg)

+         else:

+             self.compose.log_info("[DONE ] %s" % self.msg)

  

-         if hasattr(self, "_start_time"):

+         if hasattr(self, "_start_time") and not kill:

              self.compose.log_info(

                  "PHASE %s took %d seconds"

                  % (self.name.upper(), math.ceil(time.time() - self._start_time))

              )

  

-         if self.used_patterns is not None:

+         if self.used_patterns is not None and not kill:

              # We only want to report this if the config was actually queried.

              self.report_unused_patterns()

          self.compose.notifier.send("phase-stop", phase_name=self.name)

file modified
+12 -4
@@ -24,6 +24,7 @@ 

          self.msg = "---------- PHASE: %s ----------" % self.name.upper()

          self.compose = compose

          self.finished = False

+         self.started = False

          self.pool = ThreadPool(logger=self.compose._logger)

          if not phases_schema:

              msg = "No running schema was set for WeaverPhase"
@@ -40,6 +41,7 @@ 

              self.pool.log_error(msg)

              raise RuntimeError(msg)

  

+         self.started = True

          self.compose.log_info("[BEGIN] %s" % self.msg)

          self.run()

  
@@ -50,13 +52,19 @@ 

  

          self.pool.start()

  

-     def stop(self):

-         if self.finished:

+     def stop(self, kill=False):

+         if self.finished or not self.started:

              return

          if hasattr(self, "pool"):

-             self.pool.stop()

+             if kill:

+                 self.pool.kill()

+             else:

+                 self.pool.stop()

          self.finished = True

-         self.compose.log_info("[DONE ] %s" % self.msg)

+         if kill:

+             self.compose.log_info("[KILLED ] %s" % self.msg)

+         else:

+             self.compose.log_info("[DONE ] %s" % self.msg)

  

  

  class PipelineThread(WorkerThread):

@@ -35,10 +35,12 @@ 

  

  

  COMPOSE = None

+ PHASES = []

  

  

  def main():

      global COMPOSE

+     global PHASES

  

      PHASES_NAMES_MODIFIED = PHASES_NAMES + ["productimg"]

  
@@ -456,6 +458,7 @@ 

              continue

          try:

              phase.validate()

+             PHASES.append(phase)

          except ValueError as ex:

              for i in str(ex).splitlines():

                  errors.append("%s: %s" % (phase.name.upper(), i))
@@ -632,6 +635,8 @@ 

      if COMPOSE:

          try_kill_children(signum)

          try_kill_koji_tasks()

+         for phase in PHASES:

+             phase.stop(kill=True)

          COMPOSE.log_error("Compose run failed: signal %s" % signum)

          COMPOSE.log_error("Traceback:\n%s" % "\n".join(traceback.format_stack(frame)))

          COMPOSE.log_critical("Compose failed: %s" % COMPOSE.topdir)
@@ -650,6 +655,8 @@ 

      try:

          main()

      except (Exception, KeyboardInterrupt) as ex:

+         for phase in PHASES:

+             phase.stop(kill=True)

          if COMPOSE:

              COMPOSE.log_error("Compose run failed: %s" % ex)

              COMPOSE.traceback(show_locals=getattr(ex, "show_locals", True))

Add a global list of PHASES, and if we wind up in sigterm_handler
or the except block of cli_main, call stop on every non-skipped
phase, to ensure we can exit.

To avoid actually running through stop() on phases that were
never started, add a started attribute to phases that's set on
start, and exit stop() early if it's not True.

Signed-off-by: Adam Williamson awilliam@redhat.com

hmm, I guess this will just wait for the phase to complete, not kill it...that's probably not what we want. we probably need a kill we can call.

well, hmm, maybe we do want to hang around waiting? I guess that's what we did before? please advise :)

Yeah, this seems like a good idea. Maybe it could be taken one step further: add a kill method to PhaseBase, which would first call self.pool.kill() and then stop(). It's still not going to guarantee shutdown, since it's more "politely asking to please don't start any new tasks" instead of actually killing. It should help stop pkgset, gather and createrepo faster, as those have pools with more tasks than threads. All image building creates a thread per task, so there it won't help.

I don't think there's any point in leaving the process hanging on failure. At least I don't see any value in it. It causes problems with locks, blocks calling services, and potentially complicates debugging if the logs are not fully flushed.

ok. I was actually going to add a 'kill' path to stop, I started that then paused it waiting for your advice. I'll finish that and turn this PR into it, you can see what you think.

rebased onto d95d1f5

6 months ago

rebased onto d95d1f5

6 months ago

@lsedlar did https://pagure.io/pungi/c/8558b74 instead, which does look better, so let's close this.

Pull-Request has been closed by lsedlar

14 days ago