UPD: various bug fixes and enhancements

This commit is contained in:
xpk 2023-01-05 23:35:31 +08:00
parent bcdbb23221
commit 48191b808f
Signed by: xpk
GPG Key ID: CD4FF6793F09AB86
24 changed files with 272 additions and 768 deletions

View File

@@ -1,17 +1,17 @@
resource "aws_cloudwatch_metric_alarm" "asg-CPUUtilization" { resource "aws_cloudwatch_metric_alarm" "asg-CPUUtilization" {
alarm_name = "${var.cw-alarm-prefix}:ASG:CPUUtilization:${var.asg-name}" alarm_name = "${var.cw-alarm-prefix}:ASG:CPUUtilization:${var.asg-name}"
comparison_operator = "GreaterThanThreshold" comparison_operator = var.settings.CPUUtilization.comparison_operator
evaluation_periods = "3" evaluation_periods = var.settings.CPUUtilization.evaluation_periods
metric_name = "CPUUtilization" metric_name = "CPUUtilization"
period = "1800" period = var.settings.CPUUtilization.period
statistic = "Average" statistic = var.settings.CPUUtilization.statistic
threshold = var.threshold-CPUUtilization threshold = var.settings.CPUUtilization.threshold
alarm_description = "ASG:CPUUtilization" alarm_description = "ASG:CPUUtilization"
namespace = "AWS/EC2" namespace = "AWS/EC2"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [var.settings.CPUUtilization.action]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [var.settings.CPUUtilization.action]
dimensions = { dimensions = {
AutoScalingGroupName = var.asg-name AutoScalingGroupName = var.asg-name
} }

View File

@@ -1,7 +1,5 @@
variable cw-alarm-prefix {} variable cw-alarm-prefix {}
variable actions-enabled {} variable actions-enabled {}
variable asg-name {} variable asg-name {}
variable sns-targets {} variable settings {}
variable default-tags {} variable default-tags {}
variable threshold-CPUUtilization {}

View File

@@ -1,17 +1,17 @@
resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" { resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
alarm_name = "${var.cw-alarm-prefix}:EC2:StatusCheckFailed_System:${var.ec2-instance-id}" alarm_name = "${var.cw-alarm-prefix}:EC2:StatusCheckFailed_System:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold" comparison_operator = var.settings.StatusCheckFailed_System.comparison_operator
evaluation_periods = "1" evaluation_periods = var.settings.StatusCheckFailed_System.evaluation_periods
metric_name = "StatusCheckFailed_System" metric_name = "StatusCheckFailed_System"
period = "300" period = var.settings.StatusCheckFailed_System.period
statistic = "Maximum" statistic = var.settings.StatusCheckFailed_System.statistic
threshold = 0 threshold = var.settings.StatusCheckFailed_System.threshold
alarm_description = "EC2:StatusCheckFailed_System" alarm_description = "EC2:StatusCheckFailed_System"
namespace = "AWS/EC2" namespace = "AWS/EC2"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-emergency] alarm_actions = [var.settings.StatusCheckFailed_System.action]
ok_actions = [var.sns-targets.alarm-actions-emergency] ok_actions = [var.settings.StatusCheckFailed_System.action]
dimensions = { dimensions = {
InstanceId = var.ec2-instance-id InstanceId = var.ec2-instance-id
} }
@@ -23,18 +23,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" { resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" {
alarm_name = "${var.cw-alarm-prefix}:EC2:StatusCheckFailed_Instance:${var.ec2-instance-id}" alarm_name = "${var.cw-alarm-prefix}:EC2:StatusCheckFailed_Instance:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold" comparison_operator = var.settings.StatusCheckFailed_Instance.comparison_operator
evaluation_periods = "1" evaluation_periods = var.settings.StatusCheckFailed_Instance.evaluation_periods
metric_name = "StatusCheckFailed_Instance" metric_name = "StatusCheckFailed_Instance"
period = "300" period = var.settings.StatusCheckFailed_Instance.period
statistic = "Maximum" statistic = var.settings.StatusCheckFailed_Instance.statistic
threshold = 0 threshold = var.settings.StatusCheckFailed_Instance.threshold
alarm_description = "EC2:StatusCheckFailed_Instance" alarm_description = "EC2:StatusCheckFailed_Instance"
namespace = "AWS/EC2" namespace = "AWS/EC2"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-emergency] alarm_actions = [var.settings.StatusCheckFailed_Instance.action]
ok_actions = [var.sns-targets.alarm-actions-emergency] ok_actions = [var.settings.StatusCheckFailed_Instance.action]
dimensions = { dimensions = {
InstanceId = var.ec2-instance-id InstanceId = var.ec2-instance-id
} }
@@ -46,18 +46,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" {
resource "aws_cloudwatch_metric_alarm" "ec2-CPUUtilization" { resource "aws_cloudwatch_metric_alarm" "ec2-CPUUtilization" {
alarm_name = "${var.cw-alarm-prefix}:EC2:CPUUtilization:${var.ec2-instance-id}" alarm_name = "${var.cw-alarm-prefix}:EC2:CPUUtilization:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold" comparison_operator = var.settings.CPUUtilization.comparison_operator
evaluation_periods = "6" evaluation_periods = var.settings.CPUUtilization.evaluation_periods
metric_name = "CPUUtilization" metric_name = "CPUUtilization"
period = "300" period = var.settings.CPUUtilization.period
statistic = "Average" statistic = var.settings.CPUUtilization.statistic
threshold = var.threshold-CPUUtilization threshold = var.settings.CPUUtilization.threshold
alarm_description = "EC2:CPUUtilization" alarm_description = "EC2:CPUUtilization"
namespace = "AWS/EC2" namespace = "AWS/EC2"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [var.settings.CPUUtilization.action]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [var.settings.CPUUtilization.action]
treat_missing_data = "notBreaching" treat_missing_data = "notBreaching"
dimensions = { dimensions = {
InstanceId = var.ec2-instance-id InstanceId = var.ec2-instance-id
@@ -114,18 +114,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-mem_free" {
resource "aws_cloudwatch_metric_alarm" "ec2-mem_used_percent" { resource "aws_cloudwatch_metric_alarm" "ec2-mem_used_percent" {
count = data.external.ec2-os.result.os == "Linux" ? 1 : 0 count = data.external.ec2-os.result.os == "Linux" ? 1 : 0
alarm_name = "${var.cw-alarm-prefix}:EC2:mem_used_percent:${var.ec2-instance-id}" alarm_name = "${var.cw-alarm-prefix}:EC2:mem_used_percent:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold" comparison_operator = var.settings.mem_used_percent.comparison_operator
evaluation_periods = "2" evaluation_periods = var.settings.mem_used_percent.evaluation_periods
metric_name = "mem_used_percent" metric_name = "mem_used_percent"
period = "900" period = var.settings.mem_used_percent.period
statistic = "Average" statistic = var.settings.mem_used_percent.statistic
threshold = var.threshold-mem_used_percent threshold = var.settings.mem_used_percent.threshold
alarm_description = "EC2:mem_used_percent" alarm_description = "EC2:mem_used_percent"
namespace = "CWAgent" namespace = "CWAgent"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-standard] alarm_actions = [var.settings.mem_used_percent.action]
ok_actions = [var.sns-targets.alarm-actions-standard] ok_actions = [var.settings.mem_used_percent.action]
dimensions = { dimensions = {
InstanceId = var.ec2-instance-id InstanceId = var.ec2-instance-id
ImageId = data.aws_instance.ec2-instance.ami ImageId = data.aws_instance.ec2-instance.ami
@@ -190,18 +190,18 @@ data "external" "cw-dimensions" {
resource "aws_cloudwatch_metric_alarm" "ec2-swap_used_percent" { resource "aws_cloudwatch_metric_alarm" "ec2-swap_used_percent" {
count = data.external.ec2-os.result.os == "Linux" ? 1 : 0 count = data.external.ec2-os.result.os == "Linux" ? 1 : 0
alarm_name = "${var.cw-alarm-prefix}:EC2:swap_used_percent:${var.ec2-instance-id}" alarm_name = "${var.cw-alarm-prefix}:EC2:swap_used_percent:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold" comparison_operator = var.settings.swap_used_percent.comparison_operator
evaluation_periods = "2" evaluation_periods = var.settings.swap_used_percent.evaluation_periods
metric_name = "swap_used_percent" metric_name = "swap_used_percent"
period = "900" period = var.settings.swap_used_percent.period
statistic = "Average" statistic = var.settings.swap_used_percent.statistic
threshold = var.threshold-swap_used_percent threshold = var.settings.swap_used_percent.threshold
alarm_description = "EC2:swap_used_percent" alarm_description = "EC2:swap_used_percent"
namespace = "CWAgent" namespace = "CWAgent"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [var.settings.swap_used_percent.action]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [var.settings.swap_used_percent.action]
dimensions = { dimensions = {
InstanceId = var.ec2-instance-id InstanceId = var.ec2-instance-id
ImageId = data.aws_instance.ec2-instance.ami ImageId = data.aws_instance.ec2-instance.ami
@@ -242,18 +242,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_free" {
resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent" { resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent" {
count = data.external.ec2-os.result.os == "Linux" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0 count = data.external.ec2-os.result.os == "Linux" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0
alarm_name = "${var.cw-alarm-prefix}:EC2:disk_used_percent:${var.ec2-instance-id}" alarm_name = "${var.cw-alarm-prefix}:EC2:disk_used_percent:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold" comparison_operator = var.settings.disk_used_percent.comparison_operator
evaluation_periods = "2" evaluation_periods = var.settings.disk_used_percent.evaluation_periods
metric_name = "disk_used_percent" metric_name = "disk_used_percent"
period = "900" period = var.settings.disk_used_percent.period
statistic = "Average" statistic = var.settings.disk_used_percent.statistic
threshold = var.threshold-disk_used_percentage threshold = var.settings.disk_used_percent.threshold
alarm_description = "EC2:disk_used_percent" alarm_description = "EC2:disk_used_percent"
namespace = "CWAgent" namespace = "CWAgent"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [var.settings.disk_used_percent.action]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [var.settings.disk_used_percent.action]
dimensions = data.external.cw-dimensions.result dimensions = data.external.cw-dimensions.result
tags = var.default-tags tags = var.default-tags
@@ -266,18 +266,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent" {
resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" { resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" {
count = data.external.ec2-os.result.os == "Linux" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0 count = data.external.ec2-os.result.os == "Linux" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0
alarm_name = "${var.cw-alarm-prefix}:EC2:disk_inodes_free:${var.ec2-instance-id}" alarm_name = "${var.cw-alarm-prefix}:EC2:disk_inodes_free:${var.ec2-instance-id}"
comparison_operator = "LessThanThreshold" comparison_operator = var.settings.disk_inodes_free.comparison_operator
evaluation_periods = "2" evaluation_periods = var.settings.disk_inodes_free.evaluation_periods
metric_name = "disk_inodes_free" metric_name = "disk_inodes_free"
period = "300" period = var.settings.disk_inodes_free.period
statistic = "Average" statistic = var.settings.disk_inodes_free.statistic
threshold = var.threshold-disk_inodes_free threshold = var.settings.disk_inodes_free.threshold
alarm_description = "EC2:disk_inodes_free" alarm_description = "EC2:disk_inodes_free"
namespace = "CWAgent" namespace = "CWAgent"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [var.settings.disk_inodes_free.action]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [var.settings.disk_inodes_free.action]
dimensions = data.external.cw-dimensions.result dimensions = data.external.cw-dimensions.result
/* /*
dimensions = { dimensions = {
@@ -299,18 +299,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" {
resource "aws_cloudwatch_metric_alarm" "ec2-processes_total" { resource "aws_cloudwatch_metric_alarm" "ec2-processes_total" {
count = data.external.ec2-os.result.os == "Linux" ? 1 : 0 count = data.external.ec2-os.result.os == "Linux" ? 1 : 0
alarm_name = "${var.cw-alarm-prefix}:EC2:processes_total:${var.ec2-instance-id}" alarm_name = "${var.cw-alarm-prefix}:EC2:processes_total:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold" comparison_operator = var.settings.processes_total.comparison_operator
evaluation_periods = "2" evaluation_periods = var.settings.processes_total.evaluation_periods
metric_name = "processes_total" metric_name = "processes_total"
period = "900" period = var.settings.processes_total.period
statistic = "Average" statistic = var.settings.processes_total.statistic
threshold = var.threshold-processes_total threshold = var.settings.processes_total.threshold
alarm_description = "EC2:processes_total" alarm_description = "EC2:processes_total"
namespace = "CWAgent" namespace = "CWAgent"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [var.settings.processes_total.action]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [var.settings.processes_total.action]
dimensions = { dimensions = {
InstanceId = var.ec2-instance-id InstanceId = var.ec2-instance-id
ImageId = data.aws_instance.ec2-instance.ami ImageId = data.aws_instance.ec2-instance.ami
@@ -327,18 +327,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-processes_total" {
resource "aws_cloudwatch_metric_alarm" "ec2-MemoryCommittedPct" { resource "aws_cloudwatch_metric_alarm" "ec2-MemoryCommittedPct" {
count = data.external.ec2-os.result.os == "Windows" ? 1 : 0 count = data.external.ec2-os.result.os == "Windows" ? 1 : 0
alarm_name = "${var.cw-alarm-prefix}:EC2:MemoryCommittedPct:${var.ec2-instance-id}" alarm_name = "${var.cw-alarm-prefix}:EC2:MemoryCommittedPct:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold" comparison_operator = var.settings.MemoryCommittedPct.comparison_operator
evaluation_periods = "2" evaluation_periods = var.settings.MemoryCommittedPct.evaluation_periods
metric_name = "Memory % Committed Bytes In Use" metric_name = "Memory % Committed Bytes In Use"
period = "900" period = var.settings.MemoryCommittedPct.period
statistic = "Average" statistic = var.settings.MemoryCommittedPct.statistic
threshold = var.threshold-MemoryCommittedPct threshold = var.settings.MemoryCommittedPct.threshold
alarm_description = "EC2:MemoryCommittedBytes" alarm_description = "EC2:MemoryCommittedBytes"
namespace = "CWAgent" namespace = "CWAgent"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [var.settings.MemoryCommittedPct.action]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [var.settings.MemoryCommittedPct.action]
dimensions = { dimensions = {
objectname = "Memory" objectname = "Memory"
InstanceId = var.ec2-instance-id InstanceId = var.ec2-instance-id
@@ -351,21 +351,21 @@ resource "aws_cloudwatch_metric_alarm" "ec2-MemoryCommittedPct" {
} }
} }
resource "aws_cloudwatch_metric_alarm" "ec2-OsDiskFreePct" { resource "aws_cloudwatch_metric_alarm" "ec2-LogicalDiskFreePct" {
count = data.external.ec2-os.result.os == "Windows" ? 1 : 0 count = data.external.ec2-os.result.os == "Windows" ? 1 : 0
alarm_name = "${var.cw-alarm-prefix}:EC2:OsDiskFreePct:${var.ec2-instance-id}" alarm_name = "${var.cw-alarm-prefix}:EC2:LogicalDiskFreePct:${var.ec2-instance-id}"
comparison_operator = "LessThanThreshold" comparison_operator = var.settings.LogicalDiskFreePct.comparison_operator
evaluation_periods = "2" evaluation_periods = var.settings.LogicalDiskFreePct.evaluation_periods
metric_name = "LogicalDisk % Free Space" metric_name = "LogicalDisk % Free Space"
period = "300" period = var.settings.LogicalDiskFreePct.period
statistic = "Average" statistic = var.settings.LogicalDiskFreePct.statistic
threshold = var.threshold-LogicalDiskFreePct threshold = var.settings.LogicalDiskFreePct.threshold
alarm_description = "EC2:OsDiskFreePct" alarm_description = "EC2:OsDiskFreePct"
namespace = "CWAgent" namespace = "CWAgent"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [var.settings.LogicalDiskFreePct.action]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [var.settings.LogicalDiskFreePct.action]
dimensions = { dimensions = {
instance = "C:" instance = "C:"
objectname = "LogicalDisk" objectname = "LogicalDisk"

View File

@@ -1,18 +1,6 @@
variable "cw-alarm-prefix" {} variable "cw-alarm-prefix" {}
variable "actions-enabled" {} variable "actions-enabled" {}
variable "ec2-instance-id" {} variable "ec2-instance-id" {}
variable sns-targets {} variable "settings" {}
variable "default-tags" {} variable "default-tags" {}
variable "threshold-CPUUtilization" {}
# variable "threshold-mem_free" {}
variable "threshold-mem_used_percent" {}
# variable "threshold-swap_free" {}
variable "threshold-swap_used_percent" {}
# variable "threshold-disk_free" {}
variable "threshold-disk_used_percentage" {}
variable "threshold-disk_inodes_free" {}
variable "threshold-processes_total" {}
variable threshold-MemoryCommittedPct {}
variable threshold-LogicalDiskFreePct {}

View File

@@ -2,19 +2,19 @@
resource "aws_cloudwatch_metric_alarm" "eks-pod_cpu_utilization" { resource "aws_cloudwatch_metric_alarm" "eks-pod_cpu_utilization" {
for_each = toset(var.pod-names) for_each = toset(var.pod-names)
alarm_name = "${var.cw-alarm-prefix}:EKS:pod_cpu_utilization:${each.value}" alarm_name = "${var.cw-alarm-prefix}:EKS:${var.cluster-name}:${each.value}:${var.settings.alarm1.metric}"
comparison_operator = "GreaterThanThreshold" comparison_operator = var.settings.alarm1.comparison_operator
evaluation_periods = "3" evaluation_periods = var.settings.alarm1.evaluation_periods
metric_name = "pod_cpu_utilization" metric_name = var.settings.alarm1.metric
period = "300" period = var.settings.alarm1.period
statistic = "Average" statistic = var.settings.alarm1.statistic
threshold = var.threshold-pod_cpu_utilization threshold = var.settings.alarm1.threshold
alarm_description = "EKS:pod_cpu_utilization" alarm_description = "EKS:${var.settings.alarm1.metric}"
namespace = "ContainerInsights" namespace = "ContainerInsights"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [var.settings.alarm1.action]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [var.settings.alarm1.action]
dimensions = { dimensions = {
"PodName" = each.value "PodName" = each.value
"ClusterName" = var.cluster-name "ClusterName" = var.cluster-name
@@ -29,19 +29,19 @@ resource "aws_cloudwatch_metric_alarm" "eks-pod_cpu_utilization" {
resource "aws_cloudwatch_metric_alarm" "eks-pod_memory_utilization" { resource "aws_cloudwatch_metric_alarm" "eks-pod_memory_utilization" {
for_each = toset(var.pod-names) for_each = toset(var.pod-names)
alarm_name = "${var.cw-alarm-prefix}:EKS:pod_memory_utilization:${each.value}" alarm_name = "${var.cw-alarm-prefix}:EKS:${var.cluster-name}:${each.value}:${var.settings.alarm2.metric}"
comparison_operator = "GreaterThanThreshold" comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3" evaluation_periods = "3"
metric_name = "pod_memory_utilization" metric_name = var.settings.alarm2.metric
period = "300" period = var.settings.alarm2.period
statistic = "Average" statistic = var.settings.alarm2.statistic
threshold = var.threshold-pod_memory_utilization threshold = var.settings.alarm2.threshold
alarm_description = "EKS:pod_memory_utilization" alarm_description = "EKS:${var.settings.alarm2.metric}"
namespace = "ContainerInsights" namespace = "ContainerInsights"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [var.settings.alarm2.action]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [var.settings.alarm2.action]
dimensions = { dimensions = {
"PodName" = each.value "PodName" = each.value
"ClusterName" = var.cluster-name "ClusterName" = var.cluster-name
@@ -56,19 +56,19 @@ resource "aws_cloudwatch_metric_alarm" "eks-pod_memory_utilization" {
resource "aws_cloudwatch_metric_alarm" "eks-pod_number_of_container_restarts" { resource "aws_cloudwatch_metric_alarm" "eks-pod_number_of_container_restarts" {
for_each = toset(var.pod-names) for_each = toset(var.pod-names)
alarm_name = "${var.cw-alarm-prefix}:EKS:pod_number_of_container_restarts:${each.value}" alarm_name = "${var.cw-alarm-prefix}:EKS:${var.cluster-name}:${each.value}:${var.settings.alarm3.metric}"
comparison_operator = "GreaterThanThreshold" comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3" evaluation_periods = "3"
metric_name = "pod_number_of_container_restarts" metric_name = var.settings.alarm3.metric
period = "300" period = var.settings.alarm3.period
statistic = "Average" statistic = var.settings.alarm3.statistic
threshold = var.threshold-pod_number_of_container_restarts threshold = var.settings.alarm3.threshold
alarm_description = "EKS:pod_number_of_container_restarts" alarm_description = "EKS:${var.settings.alarm3.metric}"
namespace = "ContainerInsights" namespace = "ContainerInsights"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [var.settings.alarm3.action]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [var.settings.alarm3.action]
dimensions = { dimensions = {
"PodName" = each.value "PodName" = each.value
"ClusterName" = var.cluster-name "ClusterName" = var.cluster-name

View File

@@ -1,6 +1,5 @@
variable cw-alarm-prefix {} variable cw-alarm-prefix {}
variable actions-enabled {} variable actions-enabled {}
variable sns-targets {}
variable default-tags {} variable default-tags {}
variable cluster-name {} variable cluster-name {}
@@ -8,6 +7,4 @@ variable eks-namespace {}
variable pod-names { variable pod-names {
type = list type = list
} }
variable threshold-pod_cpu_utilization {} variable settings {}
variable threshold-pod_memory_utilization {}
variable threshold-pod_number_of_container_restarts {}

View File

@@ -1,17 +1,18 @@
resource "aws_cloudwatch_metric_alarm" "emr-AppsPending" { resource "aws_cloudwatch_metric_alarm" "emr-alarms" {
alarm_name = "${var.cw-alarm-prefix}:EMR:AppsPending:${var.job-flow-id}" for_each = var.settings
comparison_operator = "GreaterThanThreshold" alarm_name = "${var.cw-alarm-prefix}:EMR:${each.value["metric"]}:${var.job-flow-id}"
evaluation_periods = "1" comparison_operator = each.value["comparison_operator"]
metric_name = "AppsPending" evaluation_periods = each.value["evaluation_periods"]
period = "1800" metric_name = each.value["metric"]
statistic = "Average" period = each.value["period"]
threshold = var.threshold-AppsPending statistic = each.value["statistic"]
alarm_description = "EMR:AppsPending" threshold = each.value["threshold"]
alarm_description = "EMR:${each.value["metric"]}"
namespace = "AWS/ElasticMapReduce" namespace = "AWS/ElasticMapReduce"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-standard] alarm_actions = [each.value["action"]]
ok_actions = [var.sns-targets.alarm-actions-standard] ok_actions = [each.value["action"]]
dimensions = { dimensions = {
JobFlowId = var.job-flow-id JobFlowId = var.job-flow-id
} }
@@ -21,6 +22,7 @@ resource "aws_cloudwatch_metric_alarm" "emr-AppsPending" {
} }
} }
/*
resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" { resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" {
alarm_name = "${var.cw-alarm-prefix}:EMR:CapacityRemainingGB:${var.job-flow-id}" alarm_name = "${var.cw-alarm-prefix}:EMR:CapacityRemainingGB:${var.job-flow-id}"
comparison_operator = "LessThanThreshold" comparison_operator = "LessThanThreshold"
@@ -28,13 +30,13 @@ resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" {
metric_name = "CapacityRemainingGB" metric_name = "CapacityRemainingGB"
period = "3600" period = "3600"
statistic = "Average" statistic = "Average"
threshold = var.threshold-CapacityRemainingGB threshold = var.settings.CapacityRemainingGB.threshold
alarm_description = "EMR:CapacityRemainingGB" alarm_description = "EMR:CapacityRemainingGB"
namespace = "AWS/ElasticMapReduce" namespace = "AWS/ElasticMapReduce"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [var.settings.CapacityRemainingGB.action]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [var.settings.CapacityRemainingGB.action]
dimensions = { dimensions = {
JobFlowId = var.job-flow-id JobFlowId = var.job-flow-id
} }
@@ -42,4 +44,5 @@ resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" {
lifecycle { lifecycle {
ignore_changes = [tags] ignore_changes = [tags]
} }
} }
*/

View File

@@ -1,8 +1,5 @@
variable cw-alarm-prefix {} variable cw-alarm-prefix {}
variable actions-enabled {} variable actions-enabled {}
variable job-flow-id {} variable job-flow-id {}
variable threshold-AppsPending {} variable settings {}
variable threshold-CapacityRemainingGB {}
variable sns-targets {}
variable default-tags {} variable default-tags {}

View File

@@ -19,7 +19,7 @@ PATTERN
} }
resource "aws_cloudwatch_event_target" "TargetForEventRule" { resource "aws_cloudwatch_event_target" "TargetForEventRule" {
rule = aws_cloudwatch_event_rule.EventRule.name rule = aws_cloudwatch_event_rule.EventRule.name
target_id = "rackspace-standard-sns" target_id = "health-event-notification-sns"
arn = var.sns-targets.alarm-actions-standard arn = var.settings.healthEvents.action
} }

View File

@@ -1,5 +1,5 @@
variable cw-alarm-prefix {} variable cw-alarm-prefix {}
variable actions-enabled {} variable actions-enabled {}
variable sns-targets {} variable settings {}
variable default-tags {} variable default-tags {}

View File

@@ -1,17 +1,17 @@
resource "aws_cloudwatch_metric_alarm" "Kafka-ZooKeeperRequestLatencyMsMean" { resource "aws_cloudwatch_metric_alarm" "Kafka-ZooKeeperRequestLatencyMsMean" {
alarm_name = "${var.cw-alarm-prefix}:Kafka:ZooKeeperRequestLatencyMsMean:${var.cluster-name}" alarm_name = "${var.cw-alarm-prefix}:Kafka:ZooKeeperRequestLatencyMsMean:${var.cluster-name}"
comparison_operator = "GreaterThanThreshold" comparison_operator = var.settings.ZooKeeperRequestLatencyMsMean.comparison_operator
evaluation_periods = "3" evaluation_periods = var.settings.ZooKeeperRequestLatencyMsMean.evaluation_periods
metric_name = "ZooKeeperRequestLatencyMsMean" metric_name = "ZooKeeperRequestLatencyMsMean"
period = "1800" period = var.settings.ZooKeeperRequestLatencyMsMean.period
statistic = "Average" statistic = var.settings.ZooKeeperRequestLatencyMsMean.statistic
threshold = var.threshold-ZooKeeperRequestLatencyMsMean threshold = var.settings.ZooKeeperRequestLatencyMsMean.threshold
alarm_description = "Kafka:ZooKeeperRequestLatencyMsMean" alarm_description = "Kafka:ZooKeeperRequestLatencyMsMean"
namespace = "AWS/Kafka" namespace = "AWS/Kafka"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [var.settings.ZooKeeperRequestLatencyMsMean.action]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [var.settings.ZooKeeperRequestLatencyMsMean.action]
dimensions = { dimensions = {
"Cluster Name" = var.cluster-name "Cluster Name" = var.cluster-name
} }
@@ -45,21 +45,21 @@ module "msk-brokers" {
resource "aws_cloudwatch_metric_alarm" "Kafka-CpuUserSystem" { resource "aws_cloudwatch_metric_alarm" "Kafka-CpuUserSystem" {
for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)]) for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])
alarm_name = "${var.cw-alarm-prefix}:Kafka:CpuUsage:${var.cluster-name}-${each.value}" alarm_name = "${var.cw-alarm-prefix}:Kafka:CpuUsage:${var.cluster-name}-${each.value}"
comparison_operator = "GreaterThanThreshold" comparison_operator = var.settings.CpuUserSystem.comparison_operator
evaluation_periods = "3" evaluation_periods = var.settings.CpuUserSystem.evaluation_periods
threshold = var.threshold-CpuUserSystem threshold = var.settings.CpuUserSystem.threshold
alarm_description = "Kafka:ZooKeeperRequestLatencyMsMean" alarm_description = "Kafka:ZooKeeperRequestLatencyMsMean"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [var.settings.CpuUserSystem.action]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [var.settings.CpuUserSystem.action]
metric_query { metric_query {
id = "m1" id = "m1"
metric { metric {
metric_name = "CpuUser" metric_name = "CpuUser"
namespace = "AWS/Kafka" namespace = "AWS/Kafka"
period = 300 period = var.settings.CpuUserSystem.period
stat = "Average" stat = var.settings.CpuUserSystem.statistic
dimensions = { dimensions = {
"Cluster Name" = var.cluster-name "Cluster Name" = var.cluster-name
"Broker ID" = each.value "Broker ID" = each.value
@@ -72,8 +72,8 @@ resource "aws_cloudwatch_metric_alarm" "Kafka-CpuUserSystem" {
metric { metric {
metric_name = "CpuSystem" metric_name = "CpuSystem"
namespace = "AWS/Kafka" namespace = "AWS/Kafka"
period = 300 period = var.settings.CpuUserSystem.period
stat = "Average" stat = var.settings.CpuUserSystem.statistic
dimensions = { dimensions = {
"Cluster Name" = var.cluster-name "Cluster Name" = var.cluster-name
"Broker ID" = each.value "Broker ID" = each.value
@ -97,18 +97,18 @@ resource "aws_cloudwatch_metric_alarm" "Kafka-CpuUserSystem" {
resource "aws_cloudwatch_metric_alarm" "Kafka-KafkaDataLogsDiskUsed" { resource "aws_cloudwatch_metric_alarm" "Kafka-KafkaDataLogsDiskUsed" {
for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)]) for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])
alarm_name = "${var.cw-alarm-prefix}:Kafka:KafkaDataLogsDiskUsed:${var.cluster-name}-${each.value}" alarm_name = "${var.cw-alarm-prefix}:Kafka:KafkaDataLogsDiskUsed:${var.cluster-name}-${each.value}"
comparison_operator = "GreaterThanThreshold" comparison_operator = var.settings.KafkaDataLogsDiskUsed.comparison_operator
evaluation_periods = "3" evaluation_periods = var.settings.KafkaDataLogsDiskUsed.evaluation_periods
metric_name = "KafkaDataLogsDiskUsed" metric_name = "KafkaDataLogsDiskUsed"
period = "300" period = var.settings.KafkaDataLogsDiskUsed.period
statistic = "Average" statistic = var.settings.KafkaDataLogsDiskUsed.statistic
threshold = var.threshold-KafkaDataLogsDiskUsed threshold = var.settings.KafkaDataLogsDiskUsed.threshold
alarm_description = "Kafka:KafkaDataLogsDiskUsed" alarm_description = "Kafka:KafkaDataLogsDiskUsed"
namespace = "AWS/Kafka" namespace = "AWS/Kafka"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [var.settings.KafkaDataLogsDiskUsed.action]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [var.settings.KafkaDataLogsDiskUsed.action]
dimensions = { dimensions = {
"Cluster Name" = var.cluster-name "Cluster Name" = var.cluster-name
"Broker ID" = each.value "Broker ID" = each.value
@ -122,18 +122,18 @@ resource "aws_cloudwatch_metric_alarm" "Kafka-KafkaDataLogsDiskUsed" {
resource "aws_cloudwatch_metric_alarm" "Kafka-HeapMemoryAfterGC" { resource "aws_cloudwatch_metric_alarm" "Kafka-HeapMemoryAfterGC" {
for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)]) for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])
alarm_name = "${var.cw-alarm-prefix}:Kafka:HeapMemoryAfterGC:${var.cluster-name}-${each.value}" alarm_name = "${var.cw-alarm-prefix}:Kafka:HeapMemoryAfterGC:${var.cluster-name}-${each.value}"
comparison_operator = "GreaterThanThreshold" comparison_operator = var.settings.HeapMemoryAfterGC.comparison_operator
evaluation_periods = "3" evaluation_periods = var.settings.HeapMemoryAfterGC.evaluation_periods
metric_name = "HeapMemoryAfterGC" metric_name = "HeapMemoryAfterGC"
period = "300" period = var.settings.HeapMemoryAfterGC.period
statistic = "Average" statistic = var.settings.HeapMemoryAfterGC.statistic
threshold = var.threshold-HeapMemoryAfterGC threshold = var.settings.HeapMemoryAfterGC.threshold
alarm_description = "Kafka:HeapMemoryAfterGC" alarm_description = "Kafka:HeapMemoryAfterGC"
namespace = "AWS/Kafka" namespace = "AWS/Kafka"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [var.settings.HeapMemoryAfterGC.action]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [var.settings.HeapMemoryAfterGC.action]
dimensions = { dimensions = {
"Cluster Name" = var.cluster-name "Cluster Name" = var.cluster-name
"Broker ID" = each.value "Broker ID" = each.value

View File

@ -1,10 +1,6 @@
variable cw-alarm-prefix {} variable cw-alarm-prefix {}
variable actions-enabled {} variable actions-enabled {}
variable cluster-name {} variable cluster-name {}
variable sns-targets {} variable settings {}
variable default-tags {} variable default-tags {}
variable threshold-ZooKeeperRequestLatencyMsMean {}
variable threshold-CpuUserSystem {}
variable threshold-KafkaDataLogsDiskUsed {}
variable threshold-HeapMemoryAfterGC {}

View File

@ -1,17 +1,43 @@
resource "aws_cloudwatch_metric_alarm" "ngw-ErrorPortAllocation" { resource "aws_cloudwatch_metric_alarm" "ngw-alarms" {
alarm_name = "${var.cw-alarm-prefix}:NGW:ErrorPortAllocation:${var.res-id}" for_each = var.settings
comparison_operator = "GreaterThanThreshold" alarm_name = "${var.cw-alarm-prefix}:NGW:${each.value["metric"]}:${var.res-id}"
evaluation_periods = "2" comparison_operator = each.value["comparison_operator"]
metric_name = "ErrorPortAllocation" evaluation_periods = each.value["evaluation_periods"]
period = "300" metric_name = each.value["metric"]
statistic = "Average" period = each.value["period"]
threshold = var.threshold-ErrorPortAllocation statistic = each.value["statistic"]
alarm_description = "NGW:ErrorPortAllocation" threshold = each.value["threshold"]
alarm_description = "NGW:${each.value["metric"]}"
namespace = "AWS/NATGateway" namespace = "AWS/NATGateway"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [each.value["action"]]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [each.value["action"]]
dimensions = {
NatGatewayId = var.res-id
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
/*
resource "aws_cloudwatch_metric_alarm" "ngw-ErrorPortAllocation" {
alarm_name = "${var.cw-alarm-prefix}:NGW:${var.settings.alarm1.metric}:${var.res-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = var.settings.alarm1.metric
period = "300"
statistic = "Average"
threshold = var.settings.alarm1.threshold
alarm_description = "NGW:${var.settings.alarm1.metric}"
namespace = "AWS/NATGateway"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.settings.alarm1.action]
ok_actions = [var.settings.alarm1.action]
dimensions = { dimensions = {
NatGatewayId = var.res-id NatGatewayId = var.res-id
} }
@ -22,19 +48,19 @@ resource "aws_cloudwatch_metric_alarm" "ngw-ErrorPortAllocation" {
} }
resource "aws_cloudwatch_metric_alarm" "ngw-ConnectionEstablishedCount" { resource "aws_cloudwatch_metric_alarm" "ngw-ConnectionEstablishedCount" {
alarm_name = "${var.cw-alarm-prefix}:NGW:ConnectionEstablishedCount:${var.res-id}" alarm_name = "${var.cw-alarm-prefix}:NGW:${var.settings.alarm2.metric}:${var.res-id}"
comparison_operator = "GreaterThanThreshold" comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2" evaluation_periods = "2"
metric_name = "ConnectionEstablishedCount" metric_name = var.settings.alarm2.metric
period = "300" period = "300"
statistic = "Average" statistic = "Average"
threshold = var.threshold-ConnectionEstablishedCount threshold = var.settings.alarm2.threshold
alarm_description = "NGW:ConnectionEstablishedCount" alarm_description = "NGW:${var.settings.alarm2.metric}"
namespace = "AWS/NATGateway" namespace = "AWS/NATGateway"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-standard] alarm_actions = [var.settings.alarm2.action]
ok_actions = [var.sns-targets.alarm-actions-standard] ok_actions = [var.settings.alarm2.action]
dimensions = { dimensions = {
NatGatewayId = var.res-id NatGatewayId = var.res-id
} }
@ -45,19 +71,19 @@ resource "aws_cloudwatch_metric_alarm" "ngw-ConnectionEstablishedCount" {
} }
resource "aws_cloudwatch_metric_alarm" "ngw-PacketsDropCount" { resource "aws_cloudwatch_metric_alarm" "ngw-PacketsDropCount" {
alarm_name = "${var.cw-alarm-prefix}:NGW:PacketsDropCount:${var.res-id}" alarm_name = "${var.cw-alarm-prefix}:NGW:${var.settings.alarm3.metric}:${var.res-id}"
comparison_operator = "GreaterThanThreshold" comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2" evaluation_periods = "2"
metric_name = "PacketsDropCount" metric_name = var.settings.alarm3.metric
period = "300" period = "300"
statistic = "Average" statistic = "Average"
threshold = var.threshold-PacketsDropCount threshold = var.settings.alarm3.threshold
alarm_description = "NGW:PacketsDropCount" alarm_description = "NGW:${var.settings.alarm3.metric}"
namespace = "AWS/NATGateway" namespace = "AWS/NATGateway"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-standard] alarm_actions = [var.settings.alarm3.action]
ok_actions = [var.sns-targets.alarm-actions-standard] ok_actions = [var.settings.alarm3.action]
dimensions = { dimensions = {
NatGatewayId = var.res-id NatGatewayId = var.res-id
} }
@ -65,4 +91,5 @@ resource "aws_cloudwatch_metric_alarm" "ngw-PacketsDropCount" {
lifecycle { lifecycle {
ignore_changes = [tags] ignore_changes = [tags]
} }
} }
*/

View File

@ -1,8 +1,5 @@
variable cw-alarm-prefix {} variable cw-alarm-prefix {}
variable actions-enabled {} variable actions-enabled {}
variable res-id {} variable res-id {}
variable threshold-ErrorPortAllocation { }
variable threshold-ConnectionEstablishedCount {}
variable threshold-PacketsDropCount {}
variable sns-targets {}
variable default-tags {} variable default-tags {}
variable settings {}

View File

@ -17,18 +17,18 @@ module "nlb-targetgroups" {
resource "aws_cloudwatch_metric_alarm" "nlb-HealthyHostCount" { resource "aws_cloudwatch_metric_alarm" "nlb-HealthyHostCount" {
for_each = module.nlb-targetgroups.result-set for_each = module.nlb-targetgroups.result-set
alarm_name = "${var.cw-alarm-prefix}:NLBTG:HealthyHostCount:${split(":", each.value)[5]}" alarm_name = "${var.cw-alarm-prefix}:NLBTG:HealthyHostCount:${split(":", each.value)[5]}"
comparison_operator = "LessThanThreshold" comparison_operator = var.settings.HealthHostCountMin.comparison_operator
evaluation_periods = "1" evaluation_periods = var.settings.HealthHostCountMin.evaluation_periods
metric_name = "HealthyHostCount" metric_name = "HealthyHostCount"
period = "300" period = var.settings.HealthHostCountMin.period
statistic = "Minimum" statistic = var.settings.HealthHostCountMin.statistic
threshold = var.threshold-HealthHostCountMin threshold = var.settings.HealthHostCountMin.threshold
alarm_description = "NLBTG:HealthyHostCount" alarm_description = "NLBTG:HealthyHostCount"
namespace = "AWS/NetworkELB" namespace = "AWS/NetworkELB"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-emergency] alarm_actions = [var.settings.HealthHostCountMin.action]
ok_actions = [var.sns-targets.alarm-actions-emergency] ok_actions = [var.settings.HealthHostCountMin.action]
dimensions = { dimensions = {
TargetGroup = split(":", each.value)[5] TargetGroup = split(":", each.value)[5]
LoadBalancer = "net/${split("/", var.load-balancer)[2]}/${split("/", var.load-balancer)[3]}" LoadBalancer = "net/${split("/", var.load-balancer)[2]}/${split("/", var.load-balancer)[3]}"

View File

@ -1,6 +1,5 @@
variable cw-alarm-prefix {} variable cw-alarm-prefix {}
variable actions-enabled {} variable actions-enabled {}
variable load-balancer {} variable load-balancer {}
variable threshold-HealthHostCountMin {} variable settings {}
variable sns-targets {}
variable default-tags {} variable default-tags {}

View File

@ -1,307 +1,20 @@
data "aws_caller_identity" "this" {} data "aws_caller_identity" "this" {}
resource "aws_cloudwatch_metric_alarm" "ES-CPUUtilization" { resource "aws_cloudwatch_metric_alarm" "ES-alarms" {
alarm_name = "${var.cw-alarm-prefix}:ES:CPUUtilization:${var.domain-name}" for_each = var.settings
comparison_operator = "GreaterThanThreshold" alarm_name = "${var.cw-alarm-prefix}:ES:${each.value["metric"]}:${var.domain-name}"
evaluation_periods = "3" comparison_operator = each.value["comparison_operator"]
metric_name = "CPUUtilization" evaluation_periods = each.value["evaluation_periods"]
period = "1800" metric_name = each.value["metric"]
statistic = "Average" period = each.value["period"]
threshold = var.threshold-CPUUtilization statistic = each.value["statistic"]
alarm_description = "ES:CPUUtilization" threshold = each.value["threshold"]
alarm_description = "ES:${each.value["metric"]}"
namespace = "AWS/ES" namespace = "AWS/ES"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [each.value["action"]]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [each.value["action"]]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "ES-SearchLatency" {
alarm_name = "${var.cw-alarm-prefix}:ES:SearchLatency:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3"
metric_name = "SearchLatency"
period = "1800"
statistic = "Average"
threshold = var.threshold-SearchLatency
alarm_description = "ES:SearchLatency"
namespace = "AWS/ES"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "ES-IndexingLatency" {
alarm_name = "${var.cw-alarm-prefix}:ES:IndexingLatency:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3"
metric_name = "IndexingLatency"
period = "1800"
statistic = "Average"
threshold = var.threshold-IndexingLatency
alarm_description = "ES:IndexingLatency"
namespace = "AWS/ES"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "ES-ClusterStatusRed" {
alarm_name = "${var.cw-alarm-prefix}:ES:ClusterStatusRed:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "ClusterStatus.red"
period = "900"
statistic = "Maximum"
threshold = 0
alarm_description = "At least one primary shard and its replicas aren't allocated to a node."
namespace = "AWS/ES"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolWriteQueue" {
alarm_name = "${var.cw-alarm-prefix}:ES:ThreadpoolWriteQueue:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "ThreadpoolWriteQueue"
period = "60"
statistic = "Average"
threshold = var.threshold-ThreadpoolWriteQueue
alarm_description = "The cluster is experiencing high indexing concurrency. Review and control indexing requests, or increase cluster resources."
namespace = "AWS/ES"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-standard]
ok_actions = [var.sns-targets.alarm-actions-standard]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolSearchQueue" {
alarm_name = "${var.cw-alarm-prefix}:ES:ThreadpoolSearchQueue:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "ThreadpoolSearchQueue"
period = "60"
statistic = "Average"
threshold = var.threshold-ThreadpoolSearchQueue
alarm_description = "The cluster is experiencing high search concurrency. Consider scaling your cluster. You can also increase the search queue size, but increasing it excessively can cause out of memory errors."
namespace = "AWS/ES"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-standard]
ok_actions = [var.sns-targets.alarm-actions-standard]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolSearchRejected" {
alarm_name = "${var.cw-alarm-prefix}:ES:ThreadpoolSearchRejected:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "ThreadpoolSearchRejected"
period = "60"
statistic = "Average"
threshold = var.threshold-ThreadpoolSearchRejected
alarm_description = "These alarms notify you of domain issues that might impact performance and stability."
namespace = "AWS/ES"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolWriteRejected" {
alarm_name = "${var.cw-alarm-prefix}:ES:ThreadpoolWriteRejected:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "ThreadpoolWriteRejected"
period = "60"
statistic = "Average"
threshold = var.threshold-ThreadpoolWriteRejected
alarm_description = "These alarms notify you of domain issues that might impact performance and stability."
namespace = "AWS/ES"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "ES-MasterCPUUtilization" {
alarm_name = "${var.cw-alarm-prefix}:ES:MasterCPUUtilization:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "MasterCPUUtilization"
period = "300"
statistic = "Average"
threshold = var.threshold-MasterCPUUtilization
alarm_description = "MasterCPUUtilization"
namespace = "AWS/ES"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "ES-MasterJVMMemoryPressure" {
alarm_name = "${var.cw-alarm-prefix}:ES:MasterJVMMemoryPressure:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "MasterJVMMemoryPressure"
period = "60"
statistic = "Average"
threshold = var.threshold-MasterJVMMemoryPressure
alarm_description = "MasterJVMMemoryPressure"
namespace = "AWS/ES"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "ES-JVMMemoryPressure" {
alarm_name = "${var.cw-alarm-prefix}:ES:JVMMemoryPressure:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "JVMMemoryPressure"
period = "60"
statistic = "Average"
threshold = var.threshold-JVMMemoryPressure
alarm_description = "The cluster could encounter out of memory errors if usage increases. Consider scaling vertically. OpenSearch Service uses half of an instance's RAM for the Java heap, up to a heap size of 32 GiB. You can scale instances vertically up to 64 GiB of RAM, at which point you can scale horizontally by adding instances."
namespace = "AWS/ES"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "ES-ClusterIndexWritesBlocked" {
alarm_name = "${var.cw-alarm-prefix}:ES:ClusterIndexWritesBlocked:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "ClusterIndexWritesBlocked"
period = "60"
statistic = "Average"
threshold = var.threshold-ClusterIndexWritesBlocked
alarm_description = "Your cluster is blocking write requests. See ClusterBlockException."
namespace = "AWS/ES"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "ES-FreeStorageSpace" {
alarm_name = "${var.cw-alarm-prefix}:ES:FreeStorageSpace:${var.domain-name}"
comparison_operator = "LessThanThreshold"
evaluation_periods = "2"
metric_name = "FreeStorageSpace"
period = "300"
statistic = "Average"
threshold = var.threshold-FreeStorageSpace
alarm_description = "A node in your cluster is low on free storage space."
namespace = "AWS/ES"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = { dimensions = {
DomainName = var.domain-name DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id ClientId = data.aws_caller_identity.this.id

View File

@ -1,19 +1,6 @@
variable "cw-alarm-prefix" {} variable "cw-alarm-prefix" {}
variable "actions-enabled" {} variable "actions-enabled" {}
variable "domain-name" {} variable "domain-name" {}
variable "sns-targets" {} variable "settings" {}
variable "default-tags" {} variable "default-tags" {}
variable "threshold-CPUUtilization" {}
variable "threshold-SearchLatency" {}
variable "threshold-IndexingLatency" {}
variable "threshold-ThreadpoolWriteQueue" {}
variable "threshold-ThreadpoolSearchQueue" {}
variable "threshold-ThreadpoolSearchRejected" {}
variable "threshold-ThreadpoolWriteRejected" {}
variable "threshold-MasterCPUUtilization" {}
variable "threshold-MasterJVMMemoryPressure" {}
variable "threshold-JVMMemoryPressure" {}
variable "threshold-ClusterIndexWritesBlocked" {}
variable "threshold-FreeStorageSpace" {}
# variable threshold-KibanaHealthyNodes {}

View File

@ -1,17 +1,18 @@
resource "aws_cloudwatch_metric_alarm" "rds-cpu" { resource "aws_cloudwatch_metric_alarm" "rds-alarms" {
alarm_name = "${var.cw-alarm-prefix}:RDS:CpuUtilization:${var.rds-instance-name}" for_each = var.settings
comparison_operator = "GreaterThanThreshold" alarm_name = "${var.cw-alarm-prefix}:RDS:${each.value["metric"]}:${var.rds-instance-name}"
evaluation_periods = "1" comparison_operator = each.value["comparison_operator"]
metric_name = "CPUUtilization" evaluation_periods = each.value["evaluation_periods"]
period = "3600" metric_name = each.value["metric"]
statistic = "Average" period = each.value["period"]
threshold = var.threshold-CpuUtilization statistic = each.value["statistic"]
alarm_description = "RDS:CpuUtilization" threshold = each.value["threshold"]
alarm_description = "RDS:${each.value["metric"]}"
namespace = "AWS/RDS" namespace = "AWS/RDS"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [each.value["action"]]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [each.value["action"]]
dimensions = { dimensions = {
DBInstanceIdentifier = var.rds-instance-name DBInstanceIdentifier = var.rds-instance-name
} }
@ -20,118 +21,3 @@ resource "aws_cloudwatch_metric_alarm" "rds-cpu" {
ignore_changes = [tags] ignore_changes = [tags]
} }
} }
resource "aws_cloudwatch_metric_alarm" "rds-storage" {
alarm_name = "${var.cw-alarm-prefix}:RDS:FreeStorageSpace:${var.rds-instance-name}"
comparison_operator = "LessThanThreshold"
evaluation_periods = "1"
metric_name = "FreeStorageSpace"
period = "3600"
statistic = "Average"
threshold = var.threshold-FreeStorageSpace
alarm_description = "RDS:FreeStorageSpace"
namespace = "AWS/RDS"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DBInstanceIdentifier = var.rds-instance-name
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "rds-memory" {
alarm_name = "${var.cw-alarm-prefix}:RDS:FreeableMemory:${var.rds-instance-name}"
comparison_operator = "LessThanThreshold"
evaluation_periods = "1"
metric_name = "FreeableMemory"
period = "3600"
statistic = "Average"
threshold = var.threshold-FreeableMemory
alarm_description = "RDS:FreeableMemory"
namespace = "AWS/RDS"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DBInstanceIdentifier = var.rds-instance-name
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "rds-DiskQueueDepth" {
alarm_name = "${var.cw-alarm-prefix}:RDS:DiskQueueDepth:${var.rds-instance-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "DiskQueueDepth"
period = "300"
statistic = "Average"
threshold = var.threshold-DiskQueueDepth
alarm_description = "RDS:DiskQueueDepth"
namespace = "AWS/RDS"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DBInstanceIdentifier = var.rds-instance-name
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "rds-ReadLatency" {
alarm_name = "${var.cw-alarm-prefix}:RDS:ReadLatency:${var.rds-instance-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "ReadLatency"
period = "900"
statistic = "Average"
threshold = var.threshold-ReadLatency
alarm_description = "RDS:ReadLatency"
namespace = "AWS/RDS"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DBInstanceIdentifier = var.rds-instance-name
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "rds-WriteLatency" {
alarm_name = "${var.cw-alarm-prefix}:RDS:WriteLatency:${var.rds-instance-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "WriteLatency"
period = "900"
statistic = "Average"
threshold = var.threshold-WriteLatency
alarm_description = "RDS:WriteLatency"
namespace = "AWS/RDS"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DBInstanceIdentifier = var.rds-instance-name
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}

View File

@ -1,12 +1,5 @@
variable cw-alarm-prefix {} variable cw-alarm-prefix {}
variable actions-enabled {} variable actions-enabled {}
variable rds-instance-name {} variable rds-instance-name {}
variable sns-targets {} variable settings {}
variable default-tags {} variable default-tags {}
variable threshold-FreeableMemory {}
variable threshold-CpuUtilization {}
variable threshold-FreeStorageSpace {}
variable threshold-DiskQueueDepth {}
variable threshold-ReadLatency {}
variable threshold-WriteLatency {}

View File

@ -1,17 +1,18 @@
resource "aws_cloudwatch_metric_alarm" "redis-EngineCPUUtilization" { resource "aws_cloudwatch_metric_alarm" "redis-alarms" {
alarm_name = "${var.cw-alarm-prefix}:Redis:EngineCPUUtilization:${var.redis-cluster-id}" for_each = var.settings
comparison_operator = "GreaterThanThreshold" alarm_name = "${var.cw-alarm-prefix}:Redis:${each.value["metric"]}:${var.redis-cluster-id}"
evaluation_periods = "1" comparison_operator = each.value["comparison_operator"]
metric_name = "EngineCPUUtilization" evaluation_periods = each.value["evaluation_periods"]
period = "3600" metric_name = each.value["metric"]
statistic = "Average" period = each.value["period"]
threshold = var.threshold-EngineCPUUtilization statistic = each.value["statistic"]
alarm_description = "Redis:EngineCPUUtilization" threshold = each.value["threshold"]
namespace = "AWS/ElastiCache" alarm_description = "NGW:${each.value["metric"]}"
namespace = "AWS/NATGateway"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent] alarm_actions = [each.value["action"]]
ok_actions = [var.sns-targets.alarm-actions-urgent] ok_actions = [each.value["action"]]
dimensions = { dimensions = {
CacheClusterId = var.redis-cluster-id CacheClusterId = var.redis-cluster-id
} }
@ -19,75 +20,4 @@ resource "aws_cloudwatch_metric_alarm" "redis-EngineCPUUtilization" {
lifecycle { lifecycle {
ignore_changes = [tags] ignore_changes = [tags]
} }
}
resource "aws_cloudwatch_metric_alarm" "redis-DatabaseMemoryUsagePercentage" {
alarm_name = "${var.cw-alarm-prefix}:Redis:DatabaseMemoryUsagePercentage:${var.redis-cluster-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "1"
metric_name = "DatabaseMemoryUsagePercentage"
period = "3600"
statistic = "Average"
threshold = var.threshold-DatabaseMemoryUsagePercentage
alarm_description = "Redis:DatabaseMemoryUsagePercentage"
namespace = "AWS/ElastiCache"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
CacheClusterId = var.redis-cluster-id
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "redis-CacheHitRate" {
alarm_name = "${var.cw-alarm-prefix}:Redis:CacheHitRate:${var.redis-cluster-id}"
comparison_operator = "LessThanThreshold"
evaluation_periods = "4"
metric_name = "CacheHitRate"
period = "900"
statistic = "Average"
threshold = var.threshold-CacheHitRate
alarm_description = "Redis:CacheHitRate"
namespace = "AWS/ElastiCache"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-standard]
ok_actions = [var.sns-targets.alarm-actions-standard]
dimensions = {
CacheClusterId = var.redis-cluster-id
# CacheNodeId = each.value
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
# CloudWatch alarm on the ElastiCache metric StringBasedCmdsLatency for the
# cache cluster identified by var.redis-cluster-id. Per the alarm description
# below, the metric is the average latency (in microseconds) of string-based
# commands. The alarm compares the Average statistic against the caller-supplied
# threshold using 3 evaluation periods of 60 seconds each.
resource "aws_cloudwatch_metric_alarm" "redis-StringBasedCmdsLatency" {
alarm_name = "${var.cw-alarm-prefix}:Redis:StringBasedCmdsLatency:${var.redis-cluster-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3"
metric_name = "StringBasedCmdsLatency"
period = "60"
statistic = "Average"
threshold = var.threshold-StringBasedCmdsLatency
alarm_description = "The average latency, in microseconds, of the string-based commands run during a selected time range"
namespace = "AWS/ElastiCache"
# No notification is sent when the alarm enters INSUFFICIENT_DATA.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
# ALARM and OK transitions both notify the same "standard" target.
alarm_actions = [var.sns-targets.alarm-actions-standard]
ok_actions = [var.sns-targets.alarm-actions-standard]
dimensions = {
CacheClusterId = var.redis-cluster-id
# CacheNodeId = each.value
}
tags = var.default-tags
lifecycle {
# Changes to tags after creation are ignored by Terraform.
ignore_changes = [tags]
}
} }

View File

@ -1,10 +1,5 @@
variable "cw-alarm-prefix" {} variable "cw-alarm-prefix" {}
variable "actions-enabled" {} variable "actions-enabled" {}
variable "redis-cluster-id" {} variable "redis-cluster-id" {}
variable "sns-targets" {} variable "settings" {}
variable "default-tags" {} variable "default-tags" {}
variable "threshold-EngineCPUUtilization" {}
variable "threshold-DatabaseMemoryUsagePercentage" {}
variable "threshold-CacheHitRate" {}
variable "threshold-StringBasedCmdsLatency" {}

View File

@ -1,17 +1,17 @@
resource "aws_cloudwatch_metric_alarm" "tgw-PacketDropCountNoRoute" { resource "aws_cloudwatch_metric_alarm" "tgw-PacketDropCountNoRoute" {
alarm_name = "${var.cw-alarm-prefix}:TGW:PacketDropCountNoRoute:${var.tgw-id}" alarm_name = "${var.cw-alarm-prefix}:TGW:PacketDropCountNoRoute:${var.tgw-id}"
comparison_operator = "GreaterThanThreshold" comparison_operator = var.settings.PacketDropCountNoRoute.comparison_operator
evaluation_periods = "2" evaluation_periods = var.settings.PacketDropCountNoRoute.evaluation_periods
metric_name = "PacketDropCountNoRoute" metric_name = "PacketDropCountNoRoute"
period = "300" period = var.settings.PacketDropCountNoRoute.period
statistic = "Average" statistic = var.settings.PacketDropCountNoRoute.statistic
threshold = var.threshold-PacketDropCountNoRoute threshold = var.settings.PacketDropCountNoRoute.threshold
alarm_description = "TGW:PacketDropCountNoRoute" alarm_description = "TGW:PacketDropCountNoRoute"
namespace = "AWS/TransitGateway" namespace = "AWS/TransitGateway"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = var.actions-enabled actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-standard] alarm_actions = [var.settings.PacketDropCountNoRoute.action]
ok_actions = [var.sns-targets.alarm-actions-standard] ok_actions = [var.settings.PacketDropCountNoRoute.action]
dimensions = { dimensions = {
TransitGateway = var.tgw-id TransitGateway = var.tgw-id
} }

View File

@ -1,7 +1,5 @@
variable cw-alarm-prefix {} variable cw-alarm-prefix {}
variable actions-enabled {} variable actions-enabled {}
variable tgw-id {} variable tgw-id {}
variable threshold-PacketDropCountNoRoute {} variable settings {}
variable sns-targets {}
variable default-tags {} variable default-tags {}