-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlambda_function.py
More file actions
95 lines (82 loc) · 2.78 KB
/
lambda_function.py
File metadata and controls
95 lines (82 loc) · 2.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
import boto3
import json
import requests
from datetime import datetime
def send_pagerduty_alert(message):
    """Create a PagerDuty incident titled "ECS Task Count Alert".

    Configuration is read from environment variables:

    * ``PAGERDUTY_TOKEN`` (required): token placed in the ``Authorization``
      header.
    * ``PAGERDUTY_SERVICE_ID`` (required): ID of the service the incident is
      filed against.
    * ``PAGERDUTY_FROM_EMAIL`` (optional): when set, sent as the ``From``
      header.

    Args:
        message: Text stored in the incident body's ``details`` field.

    Returns:
        True only if PagerDuty answered with HTTP 201; False on any other
        status code or on a network-level failure.

    Raises:
        KeyError: If a required environment variable is missing.
    """
    headers = {
        'Authorization': f"Token token={os.environ['PAGERDUTY_TOKEN']}",
        'Content-Type': 'application/json',
        'Accept': 'application/vnd.pagerduty+json;version=2'
    }

    # NOTE(review): PagerDuty's "Create an Incident" reference lists a `From`
    # header (email of a valid account user). It is only sent when configured,
    # so existing deployments behave exactly as before. Confirm whether the
    # token type in use requires it.
    from_email = os.environ.get('PAGERDUTY_FROM_EMAIL')
    if from_email:
        headers['From'] = from_email

    payload = {
        "incident": {
            "type": "incident",
            "title": "ECS Task Count Alert",
            "service": {
                "id": os.environ['PAGERDUTY_SERVICE_ID'],
                "type": "service_reference"
            },
            "body": {
                "type": "incident_body",
                "details": message
            }
        }
    }

    try:
        response = requests.post(
            'https://api.pagerduty.com/incidents',
            headers=headers,
            json=payload,
            # Without a timeout, requests waits indefinitely on a stalled
            # connection.
            timeout=10
        )
    except requests.RequestException as exc:
        # Report the failure through the return value instead of raising, so
        # a PagerDuty outage does not abort whatever the caller does next.
        print(f"Error sending PagerDuty alert: {exc}")
        return False

    if response.status_code != 201:
        print(f"PagerDuty alert rejected with HTTP status {response.status_code}")
        return False
    return True
def update_ecs_service(ecs_client, cluster, service):
    """Set the service's desired count to the ``DESIRED_COUNT`` environment value.

    Args:
        ecs_client: Object exposing ``update_service(cluster=..., service=...,
            desiredCount=...)``.
        cluster: Cluster identifier passed through to ``update_service``.
        service: Service identifier passed through to ``update_service``.

    Returns:
        True if the update call completed without raising. False if anything
        failed, including a missing or non-integer ``DESIRED_COUNT``; the
        error is printed rather than raised.
    """
    succeeded = False
    try:
        # Parsed inside the try so a bad or missing value is reported as a
        # failed update rather than an exception.
        target_count = int(os.environ['DESIRED_COUNT'])
        ecs_client.update_service(
            cluster=cluster,
            service=service,
            desiredCount=target_count,
        )
    except Exception as exc:
        print(f"Error updating ECS service: {str(exc)}")
    else:
        succeeded = True
    return succeeded
def handler(event, context):
    """Lambda entry point: check an ECS service's task count and react to a shortfall.

    Reads ``CLUSTER_NAME``, ``SERVICE_NAME`` and ``DESIRED_COUNT`` from the
    environment and counts the tasks ``list_tasks`` reports for the service.
    If fewer than ``DESIRED_COUNT`` are found, it sends a PagerDuty alert and
    asks ECS to set the service's desired count back to ``DESIRED_COUNT``.

    Args:
        event: Lambda event payload (not used).
        context: Lambda context object (not used).

    Returns:
        A dict with ``statusCode`` and a JSON-encoded ``body``:

        * 200 with ``message`` when the task count is normal.
        * 200 with ``message``, ``alert_sent`` and ``service_updated`` when a
          shortfall was detected.
        * 500 with ``error`` when counting tasks or remediation raised.

    Raises:
        KeyError: If a required environment variable is missing.
        ValueError: If ``DESIRED_COUNT`` is not an integer.
    """
    ecs_client = boto3.client('ecs')
    cluster_name = os.environ['CLUSTER_NAME']
    service_name = os.environ['SERVICE_NAME']
    desired_count = int(os.environ['DESIRED_COUNT'])

    try:
        # list_tasks returns its results in pages. Counting only the first
        # page under-reports large services and would trigger false alerts,
        # so sum across every page.
        paginator = ecs_client.get_paginator('list_tasks')
        pages = paginator.paginate(
            cluster=cluster_name,
            serviceName=service_name
        )
        task_count = sum(len(page['taskArns']) for page in pages)

        if task_count >= desired_count:
            return {
                'statusCode': 200,
                'body': json.dumps({
                    'message': f"Task count is normal: {task_count} tasks running"
                })
            }

        message = f"Alert: Only {task_count} tasks running in cluster {cluster_name}. Expected {desired_count} tasks."
        print(message)

        # Send PagerDuty alert. A failure here must not stop the remediation
        # step below, so it is downgraded to alert_sent=False.
        try:
            alert_sent = send_pagerduty_alert(message)
        except Exception as e:
            print(f"Error sending PagerDuty alert: {str(e)}")
            alert_sent = False

        # Attempt to restore desired count
        service_updated = update_ecs_service(ecs_client, cluster_name, service_name)

        return {
            'statusCode': 200,
            'body': json.dumps({
                'message': message,
                'alert_sent': alert_sent,
                'service_updated': service_updated
            })
        }
    except Exception as e:
        error_message = f"Error monitoring ECS tasks: {str(e)}"
        print(error_message)
        return {
            'statusCode': 500,
            'body': json.dumps({'error': error_message})
        }