From 282df3c08ba4c13ca15cf625bd409e9510a01cb3 Mon Sep 17 00:00:00 2001
From: KF
Date: Wed, 2 Nov 2022 18:05:26 +0800
Subject: [PATCH] UPD: added EC2 monitoring with cwagent

---
Review notes (this section is ignored by git am):
 * The dimension map repeated in every CWAgent alarm is now a local.
 * Disk path / fstype / device are variables; the defaults reproduce the
   previously hard-coded "/" + "xfs" with no device dimension.
 * Commented-out arguments were removed from ec2-swap_free.

 .../Monitoring.EC2/main.tf      | 140 ++++++++++++++++++
 .../Monitoring.EC2/variables.tf |  49 ++++--
 .../Monitoring.NLB/main.tf      |   2 +-
 .../Monitoring.RDS/main.tf      |   2 +-
 4 files changed, 178 insertions(+), 15 deletions(-)

diff --git a/modules/ManagementGovernance/Monitoring.EC2/main.tf b/modules/ManagementGovernance/Monitoring.EC2/main.tf
index ca5bd57..c6bead4 100644
--- a/modules/ManagementGovernance/Monitoring.EC2/main.tf
+++ b/modules/ManagementGovernance/Monitoring.EC2/main.tf
@@ -66,4 +66,144 @@ resource "aws_cloudwatch_metric_alarm" "ec2-CPUUtilization" {
   lifecycle {
     ignore_changes = [tags]
   }
+}
+
+# CloudWatch agent (CWAgent namespace) alarms.
+# An alarm only sees datapoints whose dimension set matches the published
+# metric exactly; combined with treat_missing_data = "notBreaching" a mismatch
+# leaves the alarm silently in OK, so keep these in sync with the agent config.
+data "aws_instance" "ec2-instance" {
+  instance_id = var.ec2-instance-id
+}
+
+locals {
+  # Dimensions shared by every CWAgent alarm in this module.
+  cwagent-base-dimensions = {
+    InstanceId   = var.ec2-instance-id
+    ImageId      = data.aws_instance.ec2-instance.ami
+    InstanceType = data.aws_instance.ec2-instance.instance_type
+  }
+
+  # Disk alarms add path/fstype and, when var.disk-device is set, the device
+  # name as seen by the OS. The EC2 API's root_block_device.device_name is the
+  # AWS-side name and does not match, which is why it is not derived here.
+  cwagent-disk-dimensions = merge(
+    local.cwagent-base-dimensions,
+    {
+      fstype = var.disk-fstype
+      path   = var.disk-path
+    },
+    # Drops the device key entirely while var.disk-device is null.
+    { for k, v in { device = var.disk-device } : k => v if v != null }
+  )
+}
+
+# mem_free: average below threshold for 2 consecutive 15-minute periods.
+resource "aws_cloudwatch_metric_alarm" "ec2-mem_free" {
+  alarm_name                = "${var.cw-alarm-prefix}:EC2:mem_free:${var.ec2-instance-id}"
+  comparison_operator       = "LessThanThreshold"
+  evaluation_periods        = "2"
+  metric_name               = "mem_free"
+  period                    = "900"
+  statistic                 = "Average"
+  threshold                 = var.threshold-mem_free
+  alarm_description         = "EC2:mem_free"
+  namespace                 = "CWAgent"
+  insufficient_data_actions = []
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-standard]
+  ok_actions                = [var.alarm-actions-standard]
+  treat_missing_data        = "notBreaching"
+  dimensions                = local.cwagent-base-dimensions
+  tags                      = var.default-tags
+  lifecycle {
+    ignore_changes = [tags]
+  }
+}
+
+# swap_free: evaluated through metric math because instances without swap
+# report 0. A reading of exactly 0 is replaced by the threshold itself, which
+# is never "less than" the threshold, so swapless hosts do not alarm.
+resource "aws_cloudwatch_metric_alarm" "ec2-swap_free" {
+  alarm_name                = "${var.cw-alarm-prefix}:EC2:swap_free:${var.ec2-instance-id}"
+  comparison_operator       = "LessThanThreshold"
+  evaluation_periods        = "2"
+  threshold                 = var.threshold-swap_free
+  alarm_description         = "EC2:swap_free"
+  insufficient_data_actions = []
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-standard]
+  ok_actions                = [var.alarm-actions-standard]
+  treat_missing_data        = "notBreaching"
+
+  # Raw agent metric; only used as input to e1.
+  metric_query {
+    id = "m1"
+    metric {
+      metric_name = "swap_free"
+      namespace   = "CWAgent"
+      period      = 900
+      stat        = "Average"
+      dimensions  = local.cwagent-base-dimensions
+    }
+  }
+
+  # The series the alarm actually evaluates.
+  metric_query {
+    id          = "e1"
+    expression  = "IF(m1==0, ${var.threshold-swap_free}, m1)"
+    label       = "swap_free_if_not_zero"
+    return_data = true
+  }
+
+  tags = var.default-tags
+  lifecycle {
+    ignore_changes = [tags]
+  }
+}
+
+# disk_free: average below threshold for 2 consecutive 15-minute periods.
+resource "aws_cloudwatch_metric_alarm" "ec2-disk_free" {
+  alarm_name                = "${var.cw-alarm-prefix}:EC2:disk_free:${var.ec2-instance-id}"
+  comparison_operator       = "LessThanThreshold"
+  evaluation_periods        = "2"
+  metric_name               = "disk_free"
+  period                    = "900"
+  statistic                 = "Average"
+  threshold                 = var.threshold-disk_free
+  alarm_description         = "EC2:disk_free"
+  namespace                 = "CWAgent"
+  insufficient_data_actions = []
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-urgent]
+  ok_actions                = [var.alarm-actions-urgent]
+  treat_missing_data        = "notBreaching"
+  dimensions                = local.cwagent-disk-dimensions
+  tags                      = var.default-tags
+  lifecycle {
+    ignore_changes = [tags]
+  }
+}
+
+# disk_inodes_free: average below threshold for 2 consecutive 5-minute periods.
+resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" {
+  alarm_name                = "${var.cw-alarm-prefix}:EC2:disk_inodes_free:${var.ec2-instance-id}"
+  comparison_operator       = "LessThanThreshold"
+  evaluation_periods        = "2"
+  metric_name               = "disk_inodes_free"
+  period                    = "300"
+  statistic                 = "Average"
+  threshold                 = var.threshold-disk_inodes_free
+  alarm_description         = "EC2:disk_inodes_free"
+  namespace                 = "CWAgent"
+  insufficient_data_actions = []
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-urgent]
+  ok_actions                = [var.alarm-actions-urgent]
+  treat_missing_data        = "notBreaching"
+  dimensions                = local.cwagent-disk-dimensions
+  tags                      = var.default-tags
+  lifecycle {
+    ignore_changes = [tags]
+  }
 }
\ No newline at end of file
diff --git a/modules/ManagementGovernance/Monitoring.EC2/variables.tf b/modules/ManagementGovernance/Monitoring.EC2/variables.tf
index f789798..9c1878a 100644
--- a/modules/ManagementGovernance/Monitoring.EC2/variables.tf
+++ b/modules/ManagementGovernance/Monitoring.EC2/variables.tf
@@ -1,22 +1,45 @@
-variable cw-alarm-prefix {}
-variable actions-enabled {}
-variable ec2-instance-id {}
-variable alarm-actions-urgent {
-  type = string
+variable "cw-alarm-prefix" {}
+variable "actions-enabled" {}
+variable "ec2-instance-id" {}
+variable "alarm-actions-urgent" {
+  type    = string
   default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-urgent"
 }
-variable alarm-actions-emergency {
-  type = string
+variable "alarm-actions-emergency" {
+  type    = string
   default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-emergency"
 }
-variable alarm-actions-standard {
-  type = string
+variable "alarm-actions-standard" {
+  type    = string
   default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-standard"
 }
-variable alarm-actions-general {
-  type = string
+variable "alarm-actions-general" {
+  type    = string
   default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support"
 }
-variable default-tags {}
+variable "default-tags" {}
 
-variable threshold-CPUUtilization {}
\ No newline at end of file
+variable "threshold-CPUUtilization" {}
+# CWAgent thresholds. Alarms fire when the 15-minute (inodes: 5-minute)
+# average drops below the value; use the unit the agent reports for the metric.
+variable "threshold-mem_free" {}
+variable "threshold-swap_free" {}
+variable "threshold-disk_free" {}
+variable "threshold-disk_inodes_free" {}
+
+# Filesystem watched by the disk_free / disk_inodes_free alarms. The values
+# must match the dimensions the CloudWatch agent publishes for that mount.
+variable "disk-path" {
+  type    = string
+  default = "/"
+}
+variable "disk-fstype" {
+  type    = string
+  default = "xfs"
+}
+# OS-level device name (as published by the agent). Leave null when the agent
+# is configured with drop_device and publishes no device dimension.
+variable "disk-device" {
+  type    = string
+  default = null
+}
\ No newline at end of file
diff --git a/modules/ManagementGovernance/Monitoring.NLB/main.tf b/modules/ManagementGovernance/Monitoring.NLB/main.tf
index be3aa8c..86e0c5d 100644
--- a/modules/ManagementGovernance/Monitoring.NLB/main.tf
+++ b/modules/ManagementGovernance/Monitoring.NLB/main.tf
@@ -17,7 +17,7 @@ module "nlb-targetgroups" {
 resource "aws_cloudwatch_metric_alarm" "nlb-HealthyHostCount" {
   for_each                  = module.nlb-targetgroups.result-set
   alarm_name                = "${var.cw-alarm-prefix}:NLBTG:HealthyHostCount:${split(":", each.value)[5]}"
-  comparison_operator       = "LessThanThreshold"
+  comparison_operator       = "LessThanOrEqualToThreshold"
   evaluation_periods        = "1"
   metric_name               = "HealthyHostCount"
   period                    = "300"
diff --git a/modules/ManagementGovernance/Monitoring.RDS/main.tf b/modules/ManagementGovernance/Monitoring.RDS/main.tf
index 63d4e5c..d2c1254 100644
--- a/modules/ManagementGovernance/Monitoring.RDS/main.tf
+++ b/modules/ManagementGovernance/Monitoring.RDS/main.tf
@@ -70,7 +70,7 @@ resource "aws_cloudwatch_metric_alarm" "rds-memory" {
 resource "aws_cloudwatch_metric_alarm" "rds-DiskQueueDepth" {
   alarm_name                = "${var.cw-alarm-prefix}:RDS:DiskQueueDepth:${var.rds-instance-name}"
   comparison_operator       = "GreaterThanThreshold"
-  evaluation_periods        = "1"
+  evaluation_periods        = "2"
   metric_name               = "DiskQueueDepth"
   period                    = "300"
   statistic                 = "Average"