diff --git a/README.rst b/README.rst index 08be18a..cfa9195 100644 --- a/README.rst +++ b/README.rst @@ -134,7 +134,7 @@ check_swarm Usage [--connection [//docker.socket|:] | --secure-connection [:]] [--timeout TIMEOUT] - (--swarm | --service SERVICE [SERVICE ...] | --ignore_paused) + (--swarm | --service SERVICE [SERVICE ...] | --ignore-paused) [-V] Check docker swarm. @@ -151,7 +151,7 @@ check_swarm Usage --service SERVICE [SERVICE ...] One or more RegEx that match the names of the services(s) to check. - --ignore_paused Don't require global services to be running on paused nodes + --ignore-paused Don't require global services to be running on paused nodes -V show program's version number and exit Gotchas diff --git a/check_docker/check_swarm.py b/check_docker/check_swarm.py index 347c774..96fa9cf 100755 --- a/check_docker/check_swarm.py +++ b/check_docker/check_swarm.py @@ -202,15 +202,16 @@ def check_swarm(): def process_global_service(name, ignore_paused=False): - bad_node_states = {'drain'} + ignore_node_states = {'drain'} if ignore_paused: - bad_node_states.add('paused') + ignore_node_states.add('pause') # Get all the nodes we care about based on their state node_list, _ = get_nodes() node_index = set() for node in node_list: - if node['Spec']['Availability'] in bad_node_states: + # we can ignore these nodes + if node['Spec']['Availability'] in ignore_node_states: continue node_index.add(node['ID']) @@ -219,13 +220,12 @@ def process_global_service(name, ignore_paused=False): # Also note, this ignores conditions where services state they are running on a node not in the index. service_tasks = get_service_tasks(name) for task in service_tasks: - if task['Status']['State'] != 'running': - critical('Global service {service} has one or more tasks not running'.format(service=name)) - return - node_index.discard(task['NodeID']) + if task['Status']['State'] == 'running' and task['NodeID'] in node_index: + node_index.discard(task['NodeID']) if len(node_index) > 0: - critical('Global service {service} has {count} tasks not running'.format(service=name, count=len(node_list))) + critical('Global service {service} has {count} tasks not running'.format(service=name, count=len(node_index))) + return ok('Global service {service} OK'.format(service=name)) @@ -310,10 +310,10 @@ def process_args(args): default=[], help='One or more RegEx that match the names of the services(s) to check.') - swarm_group.add_argument('--ignore_paused', - dest='ignore_paused', - action='store_true', - help="Don't require global services to be running on paused nodes") + parser.add_argument('--ignore-paused', + dest='ignore_paused', + action='store_true', + help="Don't require global services to be running on paused nodes") # Debug logging parser.add_argument('--debug', diff --git a/tests/test_check_swarm.py b/tests/test_check_swarm.py index b106b47..4df8d22 100644 --- a/tests/test_check_swarm.py +++ b/tests/test_check_swarm.py @@ -31,7 +31,7 @@ def active_node(): @pytest.fixture def paused_node(): - return {"ID": 43, 'Spec': {'Availability': 'paused'}} + return {"ID": 43, 'Spec': {'Availability': 'pause'}} @pytest.fixture @@ -313,7 +313,7 @@ def test_check_services_global_ignore_paused(check_swarm, fs): @pytest.mark.parametrize("service_list, ignore_paused, expected_rc", ( ([active_node_task, paused_node_task, drain_node_task], False, cs.OK_RC), ([active_node_task, drain_node_task], False, cs.CRITICAL_RC), - ([active_node_task, paused_node_task], False, cs.OK_RC), + ([active_node_task, paused_node_task], True, cs.OK_RC), ([active_node_task], False, cs.CRITICAL_RC), ([paused_node_task], False, cs.CRITICAL_RC), ([], False, cs.CRITICAL_RC),