UPD: added EC2 monitoring with cwagent

This commit is contained in:
KF 2022-11-02 18:05:26 +08:00
parent b3ba6f2441
commit 282df3c08b
Signed by: xpk
GPG Key ID: CD4FF6793F09AB86
4 changed files with 161 additions and 15 deletions

View File

@ -67,3 +67,145 @@ resource "aws_cloudwatch_metric_alarm" "ec2-CPUUtilization" {
ignore_changes = [tags] ignore_changes = [tags]
} }
} }
# cwagent metrics
# Look up the monitored instance by ID so that its AMI ID and instance type
# can be used as CWAgent metric dimensions by the alarms in this file.
data "aws_instance" "ec2-instance" {
  instance_id = var.ec2-instance-id
}
# Low free memory alarm, based on the mem_free metric in the CWAgent namespace.
# Goes into ALARM when the 900-second average is below var.threshold-mem_free
# across 2 evaluation periods. Notifies the "standard" action target on both
# ALARM and OK transitions.
resource "aws_cloudwatch_metric_alarm" "ec2-mem_free" {
  # --- identity ---
  alarm_name        = "${var.cw-alarm-prefix}:EC2:mem_free:${var.ec2-instance-id}"
  alarm_description = "EC2:mem_free"

  # --- metric being watched ---
  namespace   = "CWAgent"
  metric_name = "mem_free"
  statistic   = "Average"
  period      = "900"
  dimensions = {
    InstanceId   = var.ec2-instance-id
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceType = data.aws_instance.ec2-instance.instance_type
  }

  # --- evaluation ---
  comparison_operator = "LessThanThreshold"
  threshold           = var.threshold-mem_free
  evaluation_periods  = "2"
  # Missing datapoints are treated as being within the threshold.
  treat_missing_data = "notBreaching"

  # --- notifications ---
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.alarm-actions-standard]
  ok_actions                = [var.alarm-actions-standard]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Tag changes made after creation are not reconciled by Terraform.
    ignore_changes = [tags]
  }
}
# Low free swap alarm, based on the swap_free metric in the CWAgent namespace.
#
# Most EC2 instances are deployed without any swap, so a reading of exactly
# zero is fine. The math expression e1 replaces a zero reading with the
# threshold value itself, which cannot satisfy LessThanThreshold; any non-zero
# reading is passed through unchanged and compared against the threshold.
resource "aws_cloudwatch_metric_alarm" "ec2-swap_free" {
  # --- identity ---
  alarm_name        = "${var.cw-alarm-prefix}:EC2:swap_free:${var.ec2-instance-id}"
  alarm_description = "EC2:swap_free"

  # --- metric being watched ---
  # metric_name, namespace, period, statistic and dimensions are not set at
  # the top level of this alarm; the metric is described inside metric_query.

  # Raw metric: 900-second average of swap_free for this instance.
  metric_query {
    id = "m1"

    metric {
      namespace   = "CWAgent"
      metric_name = "swap_free"
      stat        = "Average"
      period      = 900
      dimensions = {
        InstanceId   = var.ec2-instance-id
        ImageId      = data.aws_instance.ec2-instance.ami
        InstanceType = data.aws_instance.ec2-instance.instance_type
      }
    }
  }

  # Series the alarm actually evaluates (return_data is set on this query).
  metric_query {
    id          = "e1"
    label       = "swap_free_if_not_zero"
    expression  = "IF(m1==0, ${var.threshold-swap_free}, m1)"
    return_data = "true"
  }

  # --- evaluation ---
  comparison_operator = "LessThanThreshold"
  threshold           = var.threshold-swap_free
  evaluation_periods  = "2"
  # Missing datapoints are treated as being within the threshold.
  treat_missing_data = "notBreaching"

  # --- notifications ---
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.alarm-actions-standard]
  ok_actions                = [var.alarm-actions-standard]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Tag changes made after creation are not reconciled by Terraform.
    ignore_changes = [tags]
  }
}
# Low free disk space alarm for the root filesystem, based on the disk_free
# metric in the CWAgent namespace. Goes into ALARM when the 900-second average
# is below var.threshold-disk_free across 2 evaluation periods. Notifies the
# "urgent" action target on both ALARM and OK transitions.
resource "aws_cloudwatch_metric_alarm" "ec2-disk_free" {
  # --- identity ---
  alarm_name        = "${var.cw-alarm-prefix}:EC2:disk_free:${var.ec2-instance-id}"
  alarm_description = "EC2:disk_free"

  # --- metric being watched ---
  namespace   = "CWAgent"
  metric_name = "disk_free"
  statistic   = "Average"
  period      = "900"
  dimensions = {
    InstanceId   = var.ec2-instance-id
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceType = data.aws_instance.ec2-instance.instance_type
    # The "device" dimension is left out on purpose. AWS expects the device
    # name as seen by the OS, not the name from the AWS side, so neither
    #   device = one(data.aws_instance.ec2-instance.root_block_device[*].device_name)
    # nor an empty string works here.
    # NOTE(review): this presumably requires the agent to publish disk metrics
    # without a device dimension - confirm against the agent configuration.
    #
    # Filesystem type and mount point are fixed to an xfs root filesystem.
    fstype = "xfs"
    path   = "/"
  }

  # --- evaluation ---
  comparison_operator = "LessThanThreshold"
  threshold           = var.threshold-disk_free
  evaluation_periods  = "2"
  # Missing datapoints are treated as being within the threshold.
  treat_missing_data = "notBreaching"

  # --- notifications ---
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.alarm-actions-urgent]
  ok_actions                = [var.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Tag changes made after creation are not reconciled by Terraform.
    ignore_changes = [tags]
  }
}
# Low free inode alarm for the root filesystem, based on the disk_inodes_free
# metric in the CWAgent namespace. Goes into ALARM when the 300-second average
# is below var.threshold-disk_inodes_free across 2 evaluation periods.
# Notifies the "urgent" action target on both ALARM and OK transitions.
resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" {
  # --- identity ---
  alarm_name        = "${var.cw-alarm-prefix}:EC2:disk_inodes_free:${var.ec2-instance-id}"
  alarm_description = "EC2:disk_inodes_free"

  # --- metric being watched ---
  namespace   = "CWAgent"
  metric_name = "disk_inodes_free"
  statistic   = "Average"
  period      = "300"
  dimensions = {
    InstanceId   = var.ec2-instance-id
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceType = data.aws_instance.ec2-instance.instance_type
    # The "device" dimension is left out on purpose. AWS expects the device
    # name as seen by the OS, not the name from the AWS side, so neither
    #   device = one(data.aws_instance.ec2-instance.root_block_device[*].device_name)
    # nor an empty string works here.
    # NOTE(review): this presumably requires the agent to publish disk metrics
    # without a device dimension - confirm against the agent configuration.
    #
    # Filesystem type and mount point are fixed to an xfs root filesystem.
    fstype = "xfs"
    path   = "/"
  }

  # --- evaluation ---
  comparison_operator = "LessThanThreshold"
  threshold           = var.threshold-disk_inodes_free
  evaluation_periods  = "2"
  # Missing datapoints are treated as being within the threshold.
  treat_missing_data = "notBreaching"

  # --- notifications ---
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.alarm-actions-urgent]
  ok_actions                = [var.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Tag changes made after creation are not reconciled by Terraform.
    ignore_changes = [tags]
  }
}

View File

@ -1,22 +1,26 @@
variable cw-alarm-prefix {} variable "cw-alarm-prefix" {}
variable actions-enabled {} variable "actions-enabled" {}
variable ec2-instance-id {} variable "ec2-instance-id" {}
variable alarm-actions-urgent { variable "alarm-actions-urgent" {
type = string type = string
default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-urgent" default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-urgent"
} }
variable alarm-actions-emergency { variable "alarm-actions-emergency" {
type = string type = string
default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-emergency" default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-emergency"
} }
variable alarm-actions-standard { variable "alarm-actions-standard" {
type = string type = string
default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-standard" default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-standard"
} }
variable alarm-actions-general { variable "alarm-actions-general" {
type = string type = string
default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support" default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support"
} }
variable default-tags {} variable "default-tags" {}
variable threshold-CPUUtilization {} variable "threshold-CPUUtilization" {}
# Thresholds for the CWAgent-based EC2 alarms (mem_free, swap_free, disk_free,
# disk_inodes_free). None has a default, so the caller must supply each one.
# Each alarm uses LessThanThreshold, i.e. it fires when the metric drops below
# the value given here.
# NOTE(review): units are whatever the agent publishes for each metric
# (presumably bytes for mem/swap/disk and a count for inodes) - confirm.
variable "threshold-mem_free" {}
# Also interpolated into the swap_free metric math expression, where it
# replaces a zero reading.
variable "threshold-swap_free" {}
variable "threshold-disk_free" {}
variable "threshold-disk_inodes_free" {}

View File

@ -17,7 +17,7 @@ module "nlb-targetgroups" {
resource "aws_cloudwatch_metric_alarm" "nlb-HealthyHostCount" { resource "aws_cloudwatch_metric_alarm" "nlb-HealthyHostCount" {
for_each = module.nlb-targetgroups.result-set for_each = module.nlb-targetgroups.result-set
alarm_name = "${var.cw-alarm-prefix}:NLBTG:HealthyHostCount:${split(":", each.value)[5]}" alarm_name = "${var.cw-alarm-prefix}:NLBTG:HealthyHostCount:${split(":", each.value)[5]}"
comparison_operator = "LessThanThreshold" comparison_operator = "LessThanOrEqualToThreshold"
evaluation_periods = "1" evaluation_periods = "1"
metric_name = "HealthyHostCount" metric_name = "HealthyHostCount"
period = "300" period = "300"

View File

@ -70,7 +70,7 @@ resource "aws_cloudwatch_metric_alarm" "rds-memory" {
resource "aws_cloudwatch_metric_alarm" "rds-DiskQueueDepth" { resource "aws_cloudwatch_metric_alarm" "rds-DiskQueueDepth" {
alarm_name = "${var.cw-alarm-prefix}:RDS:DiskQueueDepth:${var.rds-instance-name}" alarm_name = "${var.cw-alarm-prefix}:RDS:DiskQueueDepth:${var.rds-instance-name}"
comparison_operator = "GreaterThanThreshold" comparison_operator = "GreaterThanThreshold"
evaluation_periods = "1" evaluation_periods = "2"
metric_name = "DiskQueueDepth" metric_name = "DiskQueueDepth"
period = "300" period = "300"
statistic = "Average" statistic = "Average"