diff --git a/CHANGES b/CHANGES
index 514deb2ce0a3c441ffb26d2420f0bfd0e25997f7..e48760fe58c0c569c2af6660aaf19c4b8f637b3a 100755
--- a/CHANGES
+++ b/CHANGES
@@ -1,5 +1,10 @@
 = Changes from previous Cobalt Versions =
 
+== Changes to 1.0.30 ==
+ * Fixed a race condition on Cray systems where node draining for a job
+   could be briefly disrupted when another job terminated, allowing other
+   jobs to run instead.
+
 == Changes to 1.0.29 ==
  * A timeout for automatic functions that was forcing a once-per-ten-seconds
   cycle time has been relaxed.  The default is now much faster and should
@@ -9,7 +14,7 @@
   be responsive while ALPS updates are being fetched for Cray systems.
 
 == Changes to 1.0.28 ==
- * Fix an issue where defering a cyclic reservation could result in the
+ * Fix an issue where deferring a cyclic reservation could result in the
   reservation start time being sent two cycle periods forward rather than
   one.
 
diff --git a/src/lib/Components/system/CraySystem.py b/src/lib/Components/system/CraySystem.py
index e614d21cd57ed8ac6b1629c5511fbd7e4ee8e3aa..dac62036648a4efe24ef3f9a6274f72d00be2582 100644
--- a/src/lib/Components/system/CraySystem.py
+++ b/src/lib/Components/system/CraySystem.py
@@ -1133,11 +1133,11 @@ class CraySystem(BaseSystem):
             # 1. idle nodes that are already marked for draining.
             # 2. Nodes that are in an in-use status (busy, allocated).
             # 3. Nodes marked for cleanup that are not allocated to a real
-            #    jobid.  CLEANING_ID is a sentiel jobid value so we can set
-            #    a drain window on cleaning nodes easiliy.  Not sure if this
+            #    jobid.  CLEANING_ID is a sentinel jobid value so we can set
+            #    a drain window on cleaning nodes easily.  Not sure if this
             #    is the right thing to do. --PMR
             candidate_list = []
-            candidate_list = [nid for nid in node_id_list
+            candidate_list = [str(nid) for nid in node_id_list
                     if (not self.nodes[str(nid)].draining and
                         (self.nodes[str(nid)].status in ['idle']) or
                         (self.nodes[str(nid)].status in cleanup_statuses)
@@ -1158,8 +1158,9 @@ class CraySystem(BaseSystem):
                     if (self.nodes[str(nid)].status != 'down' and
                             self.nodes[str(nid)].managed):
                         self.nodes[str(nid)].set_drain(loc_time[1], job['jobid'])
-                candidate_list.extend([nid for nid in running_nodes if
-                    self.nodes[str(nid)].draining])
+                # It's a list, not a set, so we need to ensure uniqueness here.
+                candidate_list.extend([str(nid) for nid in running_nodes if
+                    self.nodes[str(nid)].draining and str(nid) not in candidate_list])
                 candidate_drain_time = int(loc_time[1])
                 if len(candidate_list) >= int(job['nodes']):
                     # Enough nodes have been found to drain for this job
diff --git a/testsuite/TestCobalt/TestComponents/test_cray.py b/testsuite/TestCobalt/TestComponents/test_cray.py
index 8ce147b7a40ca146f3c9072fa51fa4357d67e7b5..a67fded1414ffb682fc230d8b6c1005d2316ca58 100644
--- a/testsuite/TestCobalt/TestComponents/test_cray.py
+++ b/testsuite/TestCobalt/TestComponents/test_cray.py
@@ -691,6 +691,24 @@ class TestCraySystem(object):
             assert_match(self.system.nodes[str(i)].drain_jobid, 1, "Bad drain job")
             assert_match(self.system.nodes[str(i)].drain_until, 100.0, "Bad drain time")
 
+    def test_select_nodes_for_draining_no_exit_flap(self):
+        '''CraySystem._select_nodes_for_draining: do not flap when a job exits while other jobs are running.'''
+        # This reproduces the "draining flap" seen on job exit during a large job drain: it requires
+        # multiple running jobs, an exiting job on a cleanup node that the queue manager still reports
+        # as running, and a duplicate node entering candidate_list while the drain is being selected.
+        # This is based on the local simulator reproduction case.
+        end_times = [[['2'], 100.0], [['3'], 200.0]]
+        self.system.nodes['2'].status = 'cleanup-pending'
+        self.system.nodes['3'].status = 'busy'
+        self.base_job['nodes'] = 5
+        drain_nodes = self.system._select_nodes_for_draining(self.base_job,
+                end_times)
+        assert_match(sorted(drain_nodes), ['1', '2', '3', '4', '5'], "Bad Selection")
+        for i in ['1', '2', '3', '4', '5']:
+            assert_match(self.system.nodes[str(i)].draining, True, "Draining not set")
+            assert_match(self.system.nodes[str(i)].drain_jobid, 1, "Bad drain job")
+            assert_match(self.system.nodes[str(i)].drain_until, 200.0, "Bad drain time")
+
     # common checks for find_job_location
     def assert_draining(self, nid, until, drain_jobid):
         assert self.system.nodes[str(nid)].draining, "Node %s should be draining" % nid
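
Note (not part of the patch): the CraySystem.py hunks above stop the flap by normalizing node ids to strings up front and by skipping ids that are already in candidate_list when the draining running nodes are folded in. The snippet below is a minimal, standalone sketch of that deduplication pattern; the extend_unique helper name is illustrative only and does not exist in Cobalt.

def extend_unique(candidates, running_nodes, is_draining):
    """Append draining running nodes to candidates, skipping duplicates.

    candidates stays an ordered list (not a set), mirroring the patched code,
    so the original selection order is preserved.
    """
    for nid in running_nodes:
        nid = str(nid)
        if is_draining(nid) and nid not in candidates:
            candidates.append(nid)
    return candidates

if __name__ == '__main__':
    # Node '2' is both a cleanup candidate and still reported as running; without
    # the membership check it would be counted twice toward the job's node count.
    print(extend_unique(['1', '2'], ['2', '3'], lambda nid: True))  # ['1', '2', '3']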