# terraform.aws-baseline-infra/modules/ManagementGovernance/Monitoring.EC2/main.tf
# Terraform: CloudWatch metric alarms for the EC2 instance given by var.ec2-instance-id.

# Alarm on the AWS/EC2 "StatusCheckFailed_System" metric of the instance
# identified by var.ec2-instance-id. Evaluation parameters and the notification
# target are read from var.settings.StatusCheckFailed_System.
resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
  # Identity
  alarm_name        = "${var.cw-alarm-prefix}:EC2:StatusCheckFailed_System:${var.ec2-instance-id}"
  alarm_description = "EC2:StatusCheckFailed_System"

  # Metric under observation
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed_System"
  dimensions = {
    InstanceId = var.ec2-instance-id
  }

  # Evaluation rule
  statistic           = var.settings.StatusCheckFailed_System.statistic
  period              = var.settings.StatusCheckFailed_System.period
  evaluation_periods  = var.settings.StatusCheckFailed_System.evaluation_periods
  comparison_operator = var.settings.StatusCheckFailed_System.comparison_operator
  threshold           = var.settings.StatusCheckFailed_System.threshold

  # Notifications: the same target is used for ALARM and OK transitions;
  # nothing is triggered on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.StatusCheckFailed_System.action]
  ok_actions                = [var.settings.StatusCheckFailed_System.action]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# Alarm on the AWS/EC2 "StatusCheckFailed_Instance" metric of the instance
# identified by var.ec2-instance-id. Evaluation parameters and the notification
# target are read from var.settings.StatusCheckFailed_Instance.
resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" {
  # Identity
  alarm_name        = "${var.cw-alarm-prefix}:EC2:StatusCheckFailed_Instance:${var.ec2-instance-id}"
  alarm_description = "EC2:StatusCheckFailed_Instance"

  # Metric under observation
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed_Instance"
  dimensions = {
    InstanceId = var.ec2-instance-id
  }

  # Evaluation rule
  statistic           = var.settings.StatusCheckFailed_Instance.statistic
  period              = var.settings.StatusCheckFailed_Instance.period
  evaluation_periods  = var.settings.StatusCheckFailed_Instance.evaluation_periods
  comparison_operator = var.settings.StatusCheckFailed_Instance.comparison_operator
  threshold           = var.settings.StatusCheckFailed_Instance.threshold

  # Notifications: the same target is used for ALARM and OK transitions;
  # nothing is triggered on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.StatusCheckFailed_Instance.action]
  ok_actions                = [var.settings.StatusCheckFailed_Instance.action]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# Alarm on the AWS/EC2 "CPUUtilization" metric of the instance identified by
# var.ec2-instance-id. Evaluation parameters and the notification target are
# read from var.settings.CPUUtilization.
resource "aws_cloudwatch_metric_alarm" "ec2-CPUUtilization" {
  # Identity
  alarm_name        = "${var.cw-alarm-prefix}:EC2:CPUUtilization:${var.ec2-instance-id}"
  alarm_description = "EC2:CPUUtilization"

  # Metric under observation
  namespace   = "AWS/EC2"
  metric_name = "CPUUtilization"
  dimensions = {
    InstanceId = var.ec2-instance-id
  }

  # Evaluation rule
  statistic           = var.settings.CPUUtilization.statistic
  period              = var.settings.CPUUtilization.period
  evaluation_periods  = var.settings.CPUUtilization.evaluation_periods
  comparison_operator = var.settings.CPUUtilization.comparison_operator
  threshold           = var.settings.CPUUtilization.threshold
  # Missing datapoints are treated as within the threshold.
  treat_missing_data = "notBreaching"

  # Notifications: the same target is used for ALARM and OK transitions;
  # nothing is triggered on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.CPUUtilization.action]
  ok_actions                = [var.settings.CPUUtilization.action]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# cwagent metrics
# Looks up the instance by id; alarms in this file read its `ami` and
# `instance_type` attributes to build CWAgent metric dimensions.
data "aws_instance" "ec2-instance" {
  instance_id = var.ec2-instance-id
}
# get instance OS
# Runs get-os-platform.sh from this module directory through bash, passing the
# instance id as the "input" query value. Alarms in this file read the "os" key
# of the result and compare it with "Linux" / "Windows".
data "external" "ec2-os" {
  program = ["bash", "${path.module}/get-os-platform.sh"]

  query = {
    input = var.ec2-instance-id
  }
}
# Linux specific checks
# The default CloudWatch agent configuration publishes mem_used_percent, so the
# mem_free alarm below is left disabled.
/*
resource "aws_cloudwatch_metric_alarm" "ec2-mem_free" {
  count = data.external.ec2-os.result.os == "Linux" ? 1 : 0
  alarm_name = "${var.cw-alarm-prefix}:EC2:mem_free:${var.ec2-instance-id}"
  comparison_operator = "LessThanThreshold"
  evaluation_periods = "2"
  metric_name = "mem_free"
  period = "900"
  statistic = "Average"
  threshold = var.threshold-mem_free
  alarm_description = "EC2:mem_free"
  namespace = "CWAgent"
  insufficient_data_actions = []
  actions_enabled = var.actions-enabled
  alarm_actions = [var.sns-targets.alarm-actions-standard]
  ok_actions = [var.sns-targets.alarm-actions-standard]
  dimensions = {
    InstanceId = var.ec2-instance-id
    ImageId = data.aws_instance.ec2-instance.ami
    InstanceType = data.aws_instance.ec2-instance.instance_type
  }
  tags = var.default-tags
  lifecycle {
    ignore_changes = [tags]
  }
}
*/
# Alarm on the CWAgent "mem_used_percent" metric of the instance identified by
# var.ec2-instance-id. Only created when the ec2-os lookup reports "Linux".
# Evaluation parameters and the notification target are read from
# var.settings.mem_used_percent.
resource "aws_cloudwatch_metric_alarm" "ec2-mem_used_percent" {
  count = data.external.ec2-os.result.os == "Linux" ? 1 : 0

  # Identity
  alarm_name        = "${var.cw-alarm-prefix}:EC2:mem_used_percent:${var.ec2-instance-id}"
  alarm_description = "EC2:mem_used_percent"

  # Metric under observation
  namespace   = "CWAgent"
  metric_name = "mem_used_percent"
  dimensions = {
    InstanceId   = var.ec2-instance-id
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceType = data.aws_instance.ec2-instance.instance_type
  }

  # Evaluation rule
  statistic           = var.settings.mem_used_percent.statistic
  period              = var.settings.mem_used_percent.period
  evaluation_periods  = var.settings.mem_used_percent.evaluation_periods
  comparison_operator = var.settings.mem_used_percent.comparison_operator
  threshold           = var.settings.mem_used_percent.threshold

  # Notifications: the same target is used for ALARM and OK transitions;
  # nothing is triggered on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.mem_used_percent.action]
  ok_actions                = [var.settings.mem_used_percent.action]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# The default CloudWatch agent configuration publishes swap_used_percent, so the
# swap_free alarm below is left disabled.
/*
resource "aws_cloudwatch_metric_alarm" "ec2-swap_free" {
  count = data.external.ec2-os.result.os == "Linux" ? 1 : 0
  # zero is fine as most ec2 instances are deployed without any swap
  alarm_name = "${var.cw-alarm-prefix}:EC2:swap_free:${var.ec2-instance-id}"
  comparison_operator = "LessThanThreshold"
  evaluation_periods = "2"
  threshold = var.threshold-swap_free
  alarm_description = "EC2:swap_free"
  insufficient_data_actions = []
  actions_enabled = var.actions-enabled
  alarm_actions = [var.sns-targets.alarm-actions-standard]
  ok_actions = [var.sns-targets.alarm-actions-standard]
  metric_query {
    id = "m1"
    metric {
      metric_name = "swap_free"
      namespace = "CWAgent"
      period = 900
      stat = "Average"
      dimensions = {
        InstanceId = var.ec2-instance-id
        ImageId = data.aws_instance.ec2-instance.ami
        InstanceType = data.aws_instance.ec2-instance.instance_type
      }
    }
  }
  metric_query {
    id = "e1"
    expression = "IF(m1==0, ${var.threshold-swap_free}, m1)"
    label = "swap_free_if_not_zero"
    return_data = "true"
  }
  tags = var.default-tags
  lifecycle {
    ignore_changes = [tags]
  }
}
*/
# Runs get-cwagent-dimensions.sh from this module directory through bash,
# passing the instance id as the "input" query value. The disk alarms in this
# file use the result map directly as their dimensions and are not created when
# the map is empty.
data "external" "cw-dimensions" {
  program = ["bash", "${path.module}/get-cwagent-dimensions.sh"]

  query = {
    input = var.ec2-instance-id
  }
}
# Alarm on the CWAgent "swap_used_percent" metric of the instance identified by
# var.ec2-instance-id. Only created when the ec2-os lookup reports "Linux".
# Evaluation parameters and the notification target are read from
# var.settings.swap_used_percent.
resource "aws_cloudwatch_metric_alarm" "ec2-swap_used_percent" {
  count = data.external.ec2-os.result.os == "Linux" ? 1 : 0

  # Identity
  alarm_name        = "${var.cw-alarm-prefix}:EC2:swap_used_percent:${var.ec2-instance-id}"
  alarm_description = "EC2:swap_used_percent"

  # Metric under observation
  namespace   = "CWAgent"
  metric_name = "swap_used_percent"
  dimensions = {
    InstanceId   = var.ec2-instance-id
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceType = data.aws_instance.ec2-instance.instance_type
  }

  # Evaluation rule
  statistic           = var.settings.swap_used_percent.statistic
  period              = var.settings.swap_used_percent.period
  evaluation_periods  = var.settings.swap_used_percent.evaluation_periods
  comparison_operator = var.settings.swap_used_percent.comparison_operator
  threshold           = var.settings.swap_used_percent.threshold

  # Notifications: the same target is used for ALARM and OK transitions;
  # nothing is triggered on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.swap_used_percent.action]
  ok_actions                = [var.settings.swap_used_percent.action]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# The default CloudWatch agent configuration publishes disk_used_percent, so the
# disk_free alarm below is left disabled.
/*
resource "aws_cloudwatch_metric_alarm" "ec2-disk_free" {
  count = data.external.ec2-os.result.os == "Linux" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0
  alarm_name = "${var.cw-alarm-prefix}:EC2:disk_free:${var.ec2-instance-id}"
  comparison_operator = "LessThanThreshold"
  evaluation_periods = "2"
  metric_name = "disk_free"
  period = "900"
  statistic = "Average"
  threshold = var.threshold-disk_free
  alarm_description = "EC2:disk_free"
  namespace = "CWAgent"
  insufficient_data_actions = []
  actions_enabled = var.actions-enabled
  alarm_actions = [var.sns-targets.alarm-actions-urgent]
  ok_actions = [var.sns-targets.alarm-actions-urgent]
  dimensions = data.external.cw-dimensions.result
  tags = var.default-tags
  lifecycle {
    ignore_changes = [tags]
  }
}
*/
# Alarm on the CWAgent "disk_used_percent" metric. Only created when the ec2-os
# lookup reports "Linux" and the cw-dimensions lookup returned at least one
# entry. Evaluation parameters and the notification target are read from
# var.settings.disk_used_percent.
resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent" {
  count = data.external.ec2-os.result.os == "Linux" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0

  # Identity
  alarm_name        = "${var.cw-alarm-prefix}:EC2:disk_used_percent:${var.ec2-instance-id}"
  alarm_description = "EC2:disk_used_percent"

  # Metric under observation; the dimension map comes straight from the
  # get-cwagent-dimensions.sh lookup.
  namespace   = "CWAgent"
  metric_name = "disk_used_percent"
  dimensions  = data.external.cw-dimensions.result

  # Evaluation rule
  statistic           = var.settings.disk_used_percent.statistic
  period              = var.settings.disk_used_percent.period
  evaluation_periods  = var.settings.disk_used_percent.evaluation_periods
  comparison_operator = var.settings.disk_used_percent.comparison_operator
  threshold           = var.settings.disk_used_percent.threshold

  # Notifications: the same target is used for ALARM and OK transitions;
  # nothing is triggered on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.disk_used_percent.action]
  ok_actions                = [var.settings.disk_used_percent.action]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# Alarm on the CWAgent "disk_inodes_free" metric. Only created when the ec2-os
# lookup reports "Linux" and the cw-dimensions lookup returned at least one
# entry. Evaluation parameters and the notification target are read from
# var.settings.disk_inodes_free.
resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" {
  count = data.external.ec2-os.result.os == "Linux" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0

  # Identity
  alarm_name        = "${var.cw-alarm-prefix}:EC2:disk_inodes_free:${var.ec2-instance-id}"
  alarm_description = "EC2:disk_inodes_free"

  # Metric under observation; the dimension map comes straight from the
  # get-cwagent-dimensions.sh lookup.
  namespace   = "CWAgent"
  metric_name = "disk_inodes_free"
  dimensions  = data.external.cw-dimensions.result
  /*
  Earlier hand-built dimension map, kept for reference:
  dimensions = {
    InstanceId = var.ec2-instance-id
    ImageId = data.aws_instance.ec2-instance.ami
    InstanceType = data.aws_instance.ec2-instance.instance_type
    device = data.external.disk-device.result.device
    fstype = data.external.disk-device.result.fstype
    path = "/"
  }
  */

  # Evaluation rule
  statistic           = var.settings.disk_inodes_free.statistic
  period              = var.settings.disk_inodes_free.period
  evaluation_periods  = var.settings.disk_inodes_free.evaluation_periods
  comparison_operator = var.settings.disk_inodes_free.comparison_operator
  threshold           = var.settings.disk_inodes_free.threshold

  # Notifications: the same target is used for ALARM and OK transitions;
  # nothing is triggered on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.disk_inodes_free.action]
  ok_actions                = [var.settings.disk_inodes_free.action]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# Alarm on the CWAgent "processes_total" metric of the instance identified by
# var.ec2-instance-id. Only created when the ec2-os lookup reports "Linux".
# Note: the process metric is not published by the default CloudWatch agent
# config. Evaluation parameters and the notification target are read from
# var.settings.processes_total.
resource "aws_cloudwatch_metric_alarm" "ec2-processes_total" {
  count = data.external.ec2-os.result.os == "Linux" ? 1 : 0

  # Identity
  alarm_name        = "${var.cw-alarm-prefix}:EC2:processes_total:${var.ec2-instance-id}"
  alarm_description = "EC2:processes_total"

  # Metric under observation
  namespace   = "CWAgent"
  metric_name = "processes_total"
  dimensions = {
    InstanceId   = var.ec2-instance-id
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceType = data.aws_instance.ec2-instance.instance_type
  }

  # Evaluation rule
  statistic           = var.settings.processes_total.statistic
  period              = var.settings.processes_total.period
  evaluation_periods  = var.settings.processes_total.evaluation_periods
  comparison_operator = var.settings.processes_total.comparison_operator
  threshold           = var.settings.processes_total.threshold

  # Notifications: the same target is used for ALARM and OK transitions;
  # nothing is triggered on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.processes_total.action]
  ok_actions                = [var.settings.processes_total.action]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# Windows specific checks
# Alarm on the CWAgent "Memory % Committed Bytes In Use" metric of the instance
# identified by var.ec2-instance-id. Only created when the ec2-os lookup
# reports "Windows". Evaluation parameters and the notification target are read
# from var.settings.MemoryCommittedPct.
resource "aws_cloudwatch_metric_alarm" "ec2-MemoryCommittedPct" {
  count = data.external.ec2-os.result.os == "Windows" ? 1 : 0

  # Identity
  # NOTE(review): the description says "MemoryCommittedBytes" while the alarm
  # name says "MemoryCommittedPct" - confirm whether the mismatch is intended.
  alarm_name        = "${var.cw-alarm-prefix}:EC2:MemoryCommittedPct:${var.ec2-instance-id}"
  alarm_description = "EC2:MemoryCommittedBytes"

  # Metric under observation
  namespace   = "CWAgent"
  metric_name = "Memory % Committed Bytes In Use"
  dimensions = {
    objectname   = "Memory"
    InstanceId   = var.ec2-instance-id
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceType = data.aws_instance.ec2-instance.instance_type
  }

  # Evaluation rule
  statistic           = var.settings.MemoryCommittedPct.statistic
  period              = var.settings.MemoryCommittedPct.period
  evaluation_periods  = var.settings.MemoryCommittedPct.evaluation_periods
  comparison_operator = var.settings.MemoryCommittedPct.comparison_operator
  threshold           = var.settings.MemoryCommittedPct.threshold

  # Notifications: the same target is used for ALARM and OK transitions;
  # nothing is triggered on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.MemoryCommittedPct.action]
  ok_actions                = [var.settings.MemoryCommittedPct.action]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# Alarm on the CWAgent "LogicalDisk % Free Space" metric for the "C:" instance
# of the LogicalDisk object on the EC2 instance identified by
# var.ec2-instance-id. Only created when the ec2-os lookup reports "Windows".
# Evaluation parameters and the notification target are read from
# var.settings.LogicalDiskFreePct.
resource "aws_cloudwatch_metric_alarm" "ec2-LogicalDiskFreePct" {
  count = data.external.ec2-os.result.os == "Windows" ? 1 : 0

  # Identity
  # NOTE(review): the description says "OsDiskFreePct" while the alarm name
  # says "LogicalDiskFreePct" - confirm whether the mismatch is intended.
  alarm_name        = "${var.cw-alarm-prefix}:EC2:LogicalDiskFreePct:${var.ec2-instance-id}"
  alarm_description = "EC2:OsDiskFreePct"

  # Metric under observation
  namespace   = "CWAgent"
  metric_name = "LogicalDisk % Free Space"
  dimensions = {
    instance     = "C:"
    objectname   = "LogicalDisk"
    InstanceId   = var.ec2-instance-id
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceType = data.aws_instance.ec2-instance.instance_type
  }

  # Evaluation rule
  statistic           = var.settings.LogicalDiskFreePct.statistic
  period              = var.settings.LogicalDiskFreePct.period
  evaluation_periods  = var.settings.LogicalDiskFreePct.evaluation_periods
  comparison_operator = var.settings.LogicalDiskFreePct.comparison_operator
  threshold           = var.settings.LogicalDiskFreePct.threshold

  # Notifications: the same target is used for ALARM and OK transitions;
  # nothing is triggered on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.LogicalDiskFreePct.action]
  ok_actions                = [var.settings.LogicalDiskFreePct.action]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}