diff --git a/modules/ManagementGovernance/Monitoring.EC2/main.tf b/modules/ManagementGovernance/Monitoring.EC2/main.tf index f453332..43daec7 100644 --- a/modules/ManagementGovernance/Monitoring.EC2/main.tf +++ b/modules/ManagementGovernance/Monitoring.EC2/main.tf @@ -82,6 +82,7 @@ data "external" "ec2-os" { } # Linux specific checks +# default cw agent uses mem_used_percent metric resource "aws_cloudwatch_metric_alarm" "ec2-mem_free" { count = data.external.ec2-os.result.os == "Linux" ? 1 : 0 alarm_name = "${var.cw-alarm-prefix}:EC2:mem_free:${var.ec2-instance-id}" @@ -108,6 +109,33 @@ resource "aws_cloudwatch_metric_alarm" "ec2-mem_free" { } } +resource "aws_cloudwatch_metric_alarm" "ec2-mem_used_percent" { + count = data.external.ec2-os.result.os == "Linux" ? 1 : 0 + alarm_name = "${var.cw-alarm-prefix}:EC2:mem_used_percent:${var.ec2-instance-id}" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "2" + metric_name = "mem_used_percent" + period = "900" + statistic = "Average" + threshold = var.threshold-mem_used_percent + alarm_description = "EC2:mem_used_percent" + namespace = "CWAgent" + insufficient_data_actions = [] + actions_enabled = var.actions-enabled + alarm_actions = [var.sns-targets.alarm-actions-standard] + ok_actions = [var.sns-targets.alarm-actions-standard] + dimensions = { + InstanceId = var.ec2-instance-id + ImageId = data.aws_instance.ec2-instance.ami + InstanceType = data.aws_instance.ec2-instance.instance_type + } + tags = var.default-tags + lifecycle { + ignore_changes = [tags] + } +} + +# default cw agent uses swap_used_percent metric resource "aws_cloudwatch_metric_alarm" "ec2-swap_free" { count = data.external.ec2-os.result.os == "Linux" ? 1 : 0 # zero is fine as most ec2 instances are deployed without any swap @@ -147,17 +175,6 @@ resource "aws_cloudwatch_metric_alarm" "ec2-swap_free" { } } - -# get device dimension from cw metrics -/* -data "external" "disk-device" { - program = ["bash", "${path.module}/get-cwagent-device.sh"] - query = { - input = var.ec2-instance-id - } -} -*/ - data "external" "cw-dimensions" { program = ["bash", "${path.module}/get-cwagent-dimensions.sh"] query = { @@ -165,8 +182,36 @@ data "external" "cw-dimensions" { } } -resource "aws_cloudwatch_metric_alarm" "ec2-disk_free" { + +resource "aws_cloudwatch_metric_alarm" "ec2-swap_used_percent" { count = data.external.ec2-os.result.os == "Linux" ? 1 : 0 + alarm_name = "${var.cw-alarm-prefix}:EC2:swap_used_percent:${var.ec2-instance-id}" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "2" + metric_name = "swap_used_percent" + period = "900" + statistic = "Average" + threshold = var.threshold-swap_used_percent + alarm_description = "EC2:swap_used_percent" + namespace = "CWAgent" + insufficient_data_actions = [] + actions_enabled = var.actions-enabled + alarm_actions = [var.sns-targets.alarm-actions-standard] + ok_actions = [var.sns-targets.alarm-actions-standard] + dimensions = { + InstanceId = var.ec2-instance-id + ImageId = data.aws_instance.ec2-instance.ami + InstanceType = data.aws_instance.ec2-instance.instance_type + } + tags = var.default-tags + lifecycle { + ignore_changes = [tags] + } +} + +# default cw agent uses disk_used_percent metric +resource "aws_cloudwatch_metric_alarm" "ec2-disk_free" { + count = data.external.ec2-os.result.os == "Linux" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0 alarm_name = "${var.cw-alarm-prefix}:EC2:disk_free:${var.ec2-instance-id}" comparison_operator = "LessThanThreshold" evaluation_periods = "2" @@ -181,16 +226,30 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_free" { alarm_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [var.sns-targets.alarm-actions-urgent] dimensions = data.external.cw-dimensions.result - /* - dimensions = { - InstanceId = var.ec2-instance-id - ImageId = data.aws_instance.ec2-instance.ami - InstanceType = data.aws_instance.ec2-instance.instance_type - device = data.external.disk-device.result.device - fstype = data.external.disk-device.result.fstype - path = "/" + + tags = var.default-tags + lifecycle { + ignore_changes = [tags] } - */ +} + +resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent" { + count = data.external.ec2-os.result.os == "Linux" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0 + alarm_name = "${var.cw-alarm-prefix}:EC2:disk_used_percent:${var.ec2-instance-id}" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "2" + metric_name = "disk_used_percent" + period = "900" + statistic = "Average" + threshold = var.threshold-disk_used_percentage + alarm_description = "EC2:disk_used_percent" + namespace = "CWAgent" + insufficient_data_actions = [] + actions_enabled = var.actions-enabled + alarm_actions = [var.sns-targets.alarm-actions-urgent] + ok_actions = [var.sns-targets.alarm-actions-urgent] + dimensions = data.external.cw-dimensions.result + tags = var.default-tags lifecycle { ignore_changes = [tags] @@ -199,7 +258,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_free" { resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" { - count = data.external.ec2-os.result.os == "Linux" ? 1 : 0 + count = data.external.ec2-os.result.os == "Linux" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0 alarm_name = "${var.cw-alarm-prefix}:EC2:disk_inodes_free:${var.ec2-instance-id}" comparison_operator = "LessThanThreshold" evaluation_periods = "2" @@ -230,6 +289,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" { } } +# process metric not published by default cw agent config resource "aws_cloudwatch_metric_alarm" "ec2-processes_total" { count = data.external.ec2-os.result.os == "Linux" ? 1 : 0 alarm_name = "${var.cw-alarm-prefix}:EC2:processes_total:${var.ec2-instance-id}" diff --git a/modules/ManagementGovernance/Monitoring.EC2/variables.tf b/modules/ManagementGovernance/Monitoring.EC2/variables.tf index 41aae1c..84780c1 100644 --- a/modules/ManagementGovernance/Monitoring.EC2/variables.tf +++ b/modules/ManagementGovernance/Monitoring.EC2/variables.tf @@ -7,8 +7,11 @@ variable "default-tags" {} variable "threshold-CPUUtilization" {} variable "threshold-mem_free" {} +variable "threshold-mem_used_percent" {} variable "threshold-swap_free" {} +variable "threshold-swap_used_percent" {} variable "threshold-disk_free" {} +variable "threshold-disk_used_percentage" {} variable "threshold-disk_inodes_free" {} variable "threshold-processes_total" {} variable threshold-MemoryCommittedPct {}