UPD: added swap_used_percent, mem_used_percent and disk_used_percent alarms for Linux

This commit is contained in:
xpk 2022-11-16 23:21:48 +08:00
parent d30103f29e
commit a2800fafd3
Signed by: xpk
GPG Key ID: CD4FF6793F09AB86
2 changed files with 85 additions and 22 deletions

View File

@ -82,6 +82,7 @@ data "external" "ec2-os" {
}
# Linux specific checks
# The default CloudWatch agent config publishes the mem_used_percent metric.
resource "aws_cloudwatch_metric_alarm" "ec2-mem_free" {
count = data.external.ec2-os.result.os == "Linux" ? 1 : 0
alarm_name = "${var.cw-alarm-prefix}:EC2:mem_free:${var.ec2-instance-id}"
@ -108,6 +109,33 @@ resource "aws_cloudwatch_metric_alarm" "ec2-mem_free" {
}
}
# Linux-only alarm on the CWAgent "mem_used_percent" metric.
# Compares the 900-second Average against var.threshold-mem_used_percent
# (GreaterThanThreshold) over 2 evaluation periods.
resource "aws_cloudwatch_metric_alarm" "ec2-mem_used_percent" {
  # Created only when the OS lookup reports "Linux"; otherwise no alarm exists.
  count = data.external.ec2-os.result.os == "Linux" ? 1 : 0

  # Identity
  alarm_name        = "${var.cw-alarm-prefix}:EC2:mem_used_percent:${var.ec2-instance-id}"
  alarm_description = "EC2:mem_used_percent"

  # Metric selection
  namespace   = "CWAgent"
  metric_name = "mem_used_percent"
  # NOTE(review): presumably these must match the dimension set the agent
  # publishes the metric with — confirm against the agent configuration.
  dimensions = {
    InstanceId   = var.ec2-instance-id
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceType = data.aws_instance.ec2-instance.instance_type
  }

  # Evaluation
  statistic           = "Average"
  period              = "900"
  evaluation_periods  = "2"
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-mem_used_percent

  # Notifications: ALARM and OK transitions both go to the standard target;
  # nothing is sent for INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-standard]
  ok_actions                = [var.sns-targets.alarm-actions-standard]
  insufficient_data_actions = []

  # Tags are applied from var.default-tags; later differences in tags are
  # ignored by Terraform.
  tags = var.default-tags
  lifecycle {
    ignore_changes = [tags]
  }
}
# The default CloudWatch agent config publishes the swap_used_percent metric.
resource "aws_cloudwatch_metric_alarm" "ec2-swap_free" {
count = data.external.ec2-os.result.os == "Linux" ? 1 : 0
# zero is fine as most ec2 instances are deployed without any swap
@ -147,17 +175,6 @@ resource "aws_cloudwatch_metric_alarm" "ec2-swap_free" {
}
}
# get device dimension from cw metrics
/*
data "external" "disk-device" {
program = ["bash", "${path.module}/get-cwagent-device.sh"]
query = {
input = var.ec2-instance-id
}
}
*/
data "external" "cw-dimensions" {
program = ["bash", "${path.module}/get-cwagent-dimensions.sh"]
query = {
@ -165,8 +182,36 @@ data "external" "cw-dimensions" {
}
}
resource "aws_cloudwatch_metric_alarm" "ec2-disk_free" {
# Linux-only alarm on the CWAgent "swap_used_percent" metric.
# Compares the 900-second Average against var.threshold-swap_used_percent
# (GreaterThanThreshold) over 2 evaluation periods.
resource "aws_cloudwatch_metric_alarm" "ec2-swap_used_percent" {
  # Created only when the OS lookup reports "Linux"; otherwise no alarm exists.
  count = data.external.ec2-os.result.os == "Linux" ? 1 : 0

  # Identity
  alarm_name        = "${var.cw-alarm-prefix}:EC2:swap_used_percent:${var.ec2-instance-id}"
  alarm_description = "EC2:swap_used_percent"

  # Metric selection
  namespace   = "CWAgent"
  metric_name = "swap_used_percent"
  # NOTE(review): presumably these must match the dimension set the agent
  # publishes the metric with — confirm against the agent configuration.
  dimensions = {
    InstanceId   = var.ec2-instance-id
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceType = data.aws_instance.ec2-instance.instance_type
  }

  # Evaluation
  statistic           = "Average"
  period              = "900"
  evaluation_periods  = "2"
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-swap_used_percent

  # Notifications: ALARM and OK transitions both go to the standard target;
  # nothing is sent for INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-standard]
  ok_actions                = [var.sns-targets.alarm-actions-standard]
  insufficient_data_actions = []

  # Tags are applied from var.default-tags; later differences in tags are
  # ignored by Terraform.
  tags = var.default-tags
  lifecycle {
    ignore_changes = [tags]
  }
}
# The default CloudWatch agent config publishes the disk_used_percent metric.
resource "aws_cloudwatch_metric_alarm" "ec2-disk_free" {
count = data.external.ec2-os.result.os == "Linux" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0
alarm_name = "${var.cw-alarm-prefix}:EC2:disk_free:${var.ec2-instance-id}"
comparison_operator = "LessThanThreshold"
evaluation_periods = "2"
@ -181,16 +226,30 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_free" {
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = data.external.cw-dimensions.result
/*
dimensions = {
InstanceId = var.ec2-instance-id
ImageId = data.aws_instance.ec2-instance.ami
InstanceType = data.aws_instance.ec2-instance.instance_type
device = data.external.disk-device.result.device
fstype = data.external.disk-device.result.fstype
path = "/"
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
*/
}
resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent" {
count = data.external.ec2-os.result.os == "Linux" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0
alarm_name = "${var.cw-alarm-prefix}:EC2:disk_used_percent:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "disk_used_percent"
period = "900"
statistic = "Average"
threshold = var.threshold-disk_used_percentage
alarm_description = "EC2:disk_used_percent"
namespace = "CWAgent"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = data.external.cw-dimensions.result
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
@ -199,7 +258,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_free" {
resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" {
count = data.external.ec2-os.result.os == "Linux" ? 1 : 0
count = data.external.ec2-os.result.os == "Linux" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0
alarm_name = "${var.cw-alarm-prefix}:EC2:disk_inodes_free:${var.ec2-instance-id}"
comparison_operator = "LessThanThreshold"
evaluation_periods = "2"
@ -230,6 +289,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" {
}
}
# process metric not published by default cw agent config
resource "aws_cloudwatch_metric_alarm" "ec2-processes_total" {
count = data.external.ec2-os.result.os == "Linux" ? 1 : 0
alarm_name = "${var.cw-alarm-prefix}:EC2:processes_total:${var.ec2-instance-id}"

View File

@ -7,8 +7,11 @@ variable "default-tags" {}
# Alarm thresholds. None of these declares a default, so every one must be
# supplied by the caller of this module.
variable "threshold-CPUUtilization" {}
variable "threshold-mem_free" {}
# Threshold for the EC2:mem_used_percent alarm.
variable "threshold-mem_used_percent" {}
variable "threshold-swap_free" {}
# Threshold for the EC2:swap_used_percent alarm.
variable "threshold-swap_used_percent" {}
variable "threshold-disk_free" {}
# Threshold for the EC2:disk_used_percent alarm.
# NOTE(review): this name ends in "_percentage" while the sibling
# mem/swap thresholds end in "_percent"; it is kept as-is because callers
# set the variable by this exact name.
variable "threshold-disk_used_percentage" {}
variable "threshold-disk_inodes_free" {}
variable "threshold-processes_total" {}
# Block label quoted to match every other variable declaration in this file.
variable "threshold-MemoryCommittedPct" {}