calculate max fail against all hosts in batch

currently it is doing only from the 'active' hosts in the batch which means the percentage goes up as hosts fail instead of staying the same. added debug info for max fail fixes #32255
2024-09-14 20:13:21 +02:00 · 2017-10-30 16:39:15 -04:00 · 2017-10-30 16:39:15 -04:00 · 4fb9e54c50
commit 4fb9e54c50
parent 41685fb516
3 changed files with 13 additions and 23 deletions
--- a/lib/ansible/executor/play_iterator.py
+++ b/lib/ansible/executor/play_iterator.py
@ -203,7 +203,9 @@ class PlayIterator:

        self._host_states = {}
        start_at_matched = False
-        for host in inventory.get_hosts(self._play.hosts):
+        batch = inventory.get_hosts(self._play.hosts)
+        self.batch_size = len(batch)
+        for host in batch:
            self._host_states[host.name] = HostState(blocks=self._blocks)
            # if we're looking to start at a specific task, iterate through
            # the tasks for this host until we find the specified task
--- a/lib/ansible/executor/task_queue_manager.py
+++ b/lib/ansible/executor/task_queue_manager.py
@ -242,22 +242,6 @@ class TaskQueueManager:
            loader=self._loader,
        )

-        # Fork # of forks, # of hosts or serial, whichever is lowest
-        num_hosts = len(self._inventory.get_hosts(new_play.hosts, ignore_restrictions=True))
-
-        max_serial = 0
-        if new_play.serial:
-            # the play has not been post_validated here, so we may need
-            # to convert the scalar value to a list at this point
-            serial_items = new_play.serial
-            if not isinstance(serial_items, list):
-                serial_items = [serial_items]
-            max_serial = max([pct_to_int(x, num_hosts) for x in serial_items])
-
-        contenders = [self._options.forks, max_serial, num_hosts]
-        contenders = [v for v in contenders if v is not None and v > 0]
-        self._initialize_processes(min(contenders))
-
        play_context = PlayContext(new_play, self._options, self.passwords, self._connection_lockfile.fileno())
        for callback_plugin in self._callback_plugins:
            if hasattr(callback_plugin, 'set_play_context'):
@ -268,11 +252,6 @@ class TaskQueueManager:
        # initialize the shared dictionary containing the notified handlers
        self._initialize_notified_handlers(new_play)

-        # load the specified strategy (or the default linear one)
-        strategy = strategy_loader.get(new_play.strategy, self)
-        if strategy is None:
-            raise AnsibleError("Invalid play strategy specified: %s" % new_play.strategy, obj=play._ds)
-
        # build the iterator
        iterator = PlayIterator(
            inventory=self._inventory,
@ -283,6 +262,14 @@ class TaskQueueManager:
            start_at_done=self._start_at_done,
        )

+        # adjust to # of workers to configured forks or size of batch, whatever is lower
+        self._initialize_processes(min(self._options.forks, iterator.batch_size))
+
+        # load the specified strategy (or the default linear one)
+        strategy = strategy_loader.get(new_play.strategy, self)
+        if strategy is None:
+            raise AnsibleError("Invalid play strategy specified: %s" % new_play.strategy, obj=play._ds)
+
        # Because the TQM may survive multiple play runs, we start by marking
        # any hosts as failed in the iterator here which may have been marked
        # as failed in previous runs. Then we clear the internal list of failed
--- a/lib/ansible/plugins/strategy/linear.py
+++ b/lib/ansible/plugins/strategy/linear.py
@ -401,7 +401,7 @@ class StrategyModule(StrategyBase):
                if iterator._play.max_fail_percentage is not None and len(results) > 0:
                    percentage = iterator._play.max_fail_percentage / 100.0

-                    if (len(self._tqm._failed_hosts) / len(results)) > percentage:
+                    if (len(self._tqm._failed_hosts) / iterator.batch_size) > percentage:
                        for host in hosts_left:
                            # don't double-mark hosts, or the iterator will potentially
                            # fail them out of the rescue/always states
@ -410,6 +410,7 @@ class StrategyModule(StrategyBase):
                                iterator.mark_host_failed(host)
                        self._tqm.send_callback('v2_playbook_on_no_hosts_remaining')
                        result |= self._tqm.RUN_FAILED_BREAK_PLAY
+                    display.debug('(%s failed / %s total )> %s max fail' % (len(self._tqm._failed_hosts), iterator.batch_size, percentage))
                display.debug("done checking for max_fail_percentage")

                display.debug("checking to see if all hosts have failed and the running result is not ok")