UPD: various bug fixes and enhancements

This commit is contained in:
xpk 2023-01-05 23:35:31 +08:00
parent bcdbb23221
commit 48191b808f
Signed by: xpk
GPG Key ID: CD4FF6793F09AB86
24 changed files with 272 additions and 768 deletions

View File

@ -1,17 +1,17 @@
resource "aws_cloudwatch_metric_alarm" "asg-CPUUtilization" {
alarm_name = "${var.cw-alarm-prefix}:ASG:CPUUtilization:${var.asg-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3"
comparison_operator = var.settings.CPUUtilization.comparison_operator
evaluation_periods = var.settings.CPUUtilization.evaluation_periods
metric_name = "CPUUtilization"
period = "1800"
statistic = "Average"
threshold = var.threshold-CPUUtilization
period = var.settings.CPUUtilization.period
statistic = var.settings.CPUUtilization.statistic
threshold = var.settings.CPUUtilization.threshold
alarm_description = "ASG:CPUUtilization"
namespace = "AWS/EC2"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [var.settings.CPUUtilization.action]
ok_actions = [var.settings.CPUUtilization.action]
dimensions = {
AutoScalingGroupName = var.asg-name
}

View File

@ -1,7 +1,5 @@
variable cw-alarm-prefix {}
variable actions-enabled {}
variable asg-name {}
variable sns-targets {}
variable settings {}
variable default-tags {}
variable threshold-CPUUtilization {}

View File

@ -1,17 +1,17 @@
resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
alarm_name = "${var.cw-alarm-prefix}:EC2:StatusCheckFailed_System:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "1"
comparison_operator = var.settings.StatusCheckFailed_System.comparison_operator
evaluation_periods = var.settings.StatusCheckFailed_System.evaluation_periods
metric_name = "StatusCheckFailed_System"
period = "300"
statistic = "Maximum"
threshold = 0
period = var.settings.StatusCheckFailed_System.period
statistic = var.settings.StatusCheckFailed_System.statistic
threshold = var.settings.StatusCheckFailed_System.threshold
alarm_description = "EC2:StatusCheckFailed_System"
namespace = "AWS/EC2"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-emergency]
ok_actions = [var.sns-targets.alarm-actions-emergency]
alarm_actions = [var.settings.StatusCheckFailed_System.action]
ok_actions = [var.settings.StatusCheckFailed_System.action]
dimensions = {
InstanceId = var.ec2-instance-id
}
@ -23,18 +23,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" {
alarm_name = "${var.cw-alarm-prefix}:EC2:StatusCheckFailed_Instance:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "1"
comparison_operator = var.settings.StatusCheckFailed_Instance.comparison_operator
evaluation_periods = var.settings.StatusCheckFailed_Instance.evaluation_periods
metric_name = "StatusCheckFailed_Instance"
period = "300"
statistic = "Maximum"
threshold = 0
period = var.settings.StatusCheckFailed_Instance.period
statistic = var.settings.StatusCheckFailed_Instance.statistic
threshold = var.settings.StatusCheckFailed_Instance.threshold
alarm_description = "EC2:StatusCheckFailed_Instance"
namespace = "AWS/EC2"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-emergency]
ok_actions = [var.sns-targets.alarm-actions-emergency]
alarm_actions = [var.settings.StatusCheckFailed_Instance.action]
ok_actions = [var.settings.StatusCheckFailed_Instance.action]
dimensions = {
InstanceId = var.ec2-instance-id
}
@ -46,18 +46,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" {
resource "aws_cloudwatch_metric_alarm" "ec2-CPUUtilization" {
alarm_name = "${var.cw-alarm-prefix}:EC2:CPUUtilization:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "6"
comparison_operator = var.settings.CPUUtilization.comparison_operator
evaluation_periods = var.settings.CPUUtilization.evaluation_periods
metric_name = "CPUUtilization"
period = "300"
statistic = "Average"
threshold = var.threshold-CPUUtilization
period = var.settings.CPUUtilization.period
statistic = var.settings.CPUUtilization.statistic
threshold = var.settings.CPUUtilization.threshold
alarm_description = "EC2:CPUUtilization"
namespace = "AWS/EC2"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [var.settings.CPUUtilization.action]
ok_actions = [var.settings.CPUUtilization.action]
treat_missing_data = "notBreaching"
dimensions = {
InstanceId = var.ec2-instance-id
@ -114,18 +114,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-mem_free" {
resource "aws_cloudwatch_metric_alarm" "ec2-mem_used_percent" {
count = data.external.ec2-os.result.os == "Linux" ? 1 : 0
alarm_name = "${var.cw-alarm-prefix}:EC2:mem_used_percent:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
comparison_operator = var.settings.mem_used_percent.comparison_operator
evaluation_periods = var.settings.mem_used_percent.evaluation_periods
metric_name = "mem_used_percent"
period = "900"
statistic = "Average"
threshold = var.threshold-mem_used_percent
period = var.settings.mem_used_percent.period
statistic = var.settings.mem_used_percent.statistic
threshold = var.settings.mem_used_percent.threshold
alarm_description = "EC2:mem_used_percent"
namespace = "CWAgent"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-standard]
ok_actions = [var.sns-targets.alarm-actions-standard]
alarm_actions = [var.settings.mem_used_percent.action]
ok_actions = [var.settings.mem_used_percent.action]
dimensions = {
InstanceId = var.ec2-instance-id
ImageId = data.aws_instance.ec2-instance.ami
@ -190,18 +190,18 @@ data "external" "cw-dimensions" {
resource "aws_cloudwatch_metric_alarm" "ec2-swap_used_percent" {
count = data.external.ec2-os.result.os == "Linux" ? 1 : 0
alarm_name = "${var.cw-alarm-prefix}:EC2:swap_used_percent:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
comparison_operator = var.settings.swap_used_percent.comparison_operator
evaluation_periods = var.settings.swap_used_percent.evaluation_periods
metric_name = "swap_used_percent"
period = "900"
statistic = "Average"
threshold = var.threshold-swap_used_percent
period = var.settings.swap_used_percent.period
statistic = var.settings.swap_used_percent.statistic
threshold = var.settings.swap_used_percent.threshold
alarm_description = "EC2:swap_used_percent"
namespace = "CWAgent"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [var.settings.swap_used_percent.action]
ok_actions = [var.settings.swap_used_percent.action]
dimensions = {
InstanceId = var.ec2-instance-id
ImageId = data.aws_instance.ec2-instance.ami
@ -242,18 +242,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_free" {
resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent" {
count = data.external.ec2-os.result.os == "Linux" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0
alarm_name = "${var.cw-alarm-prefix}:EC2:disk_used_percent:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
comparison_operator = var.settings.disk_used_percent.comparison_operator
evaluation_periods = var.settings.disk_used_percent.evaluation_periods
metric_name = "disk_used_percent"
period = "900"
statistic = "Average"
threshold = var.threshold-disk_used_percentage
period = var.settings.disk_used_percent.period
statistic = var.settings.disk_used_percent.statistic
threshold = var.settings.disk_used_percent.threshold
alarm_description = "EC2:disk_used_percent"
namespace = "CWAgent"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [var.settings.disk_used_percent.action]
ok_actions = [var.settings.disk_used_percent.action]
dimensions = data.external.cw-dimensions.result
tags = var.default-tags
@ -266,18 +266,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent" {
resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" {
count = data.external.ec2-os.result.os == "Linux" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0
alarm_name = "${var.cw-alarm-prefix}:EC2:disk_inodes_free:${var.ec2-instance-id}"
comparison_operator = "LessThanThreshold"
evaluation_periods = "2"
comparison_operator = var.settings.disk_inodes_free.comparison_operator
evaluation_periods = var.settings.disk_inodes_free.evaluation_periods
metric_name = "disk_inodes_free"
period = "300"
statistic = "Average"
threshold = var.threshold-disk_inodes_free
period = var.settings.disk_inodes_free.period
statistic = var.settings.disk_inodes_free.statistic
threshold = var.settings.disk_inodes_free.threshold
alarm_description = "EC2:disk_inodes_free"
namespace = "CWAgent"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [var.settings.disk_inodes_free.action]
ok_actions = [var.settings.disk_inodes_free.action]
dimensions = data.external.cw-dimensions.result
/*
dimensions = {
@ -299,18 +299,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" {
resource "aws_cloudwatch_metric_alarm" "ec2-processes_total" {
count = data.external.ec2-os.result.os == "Linux" ? 1 : 0
alarm_name = "${var.cw-alarm-prefix}:EC2:processes_total:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
comparison_operator = var.settings.processes_total.comparison_operator
evaluation_periods = var.settings.processes_total.evaluation_periods
metric_name = "processes_total"
period = "900"
statistic = "Average"
threshold = var.threshold-processes_total
period = var.settings.processes_total.period
statistic = var.settings.processes_total.statistic
threshold = var.settings.processes_total.threshold
alarm_description = "EC2:processes_total"
namespace = "CWAgent"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [var.settings.processes_total.action]
ok_actions = [var.settings.processes_total.action]
dimensions = {
InstanceId = var.ec2-instance-id
ImageId = data.aws_instance.ec2-instance.ami
@ -327,18 +327,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-processes_total" {
resource "aws_cloudwatch_metric_alarm" "ec2-MemoryCommittedPct" {
count = data.external.ec2-os.result.os == "Windows" ? 1 : 0
alarm_name = "${var.cw-alarm-prefix}:EC2:MemoryCommittedPct:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
comparison_operator = var.settings.MemoryCommittedPct.comparison_operator
evaluation_periods = var.settings.MemoryCommittedPct.evaluation_periods
metric_name = "Memory % Committed Bytes In Use"
period = "900"
statistic = "Average"
threshold = var.threshold-MemoryCommittedPct
period = var.settings.MemoryCommittedPct.period
statistic = var.settings.MemoryCommittedPct.statistic
threshold = var.settings.MemoryCommittedPct.threshold
alarm_description = "EC2:MemoryCommittedBytes"
namespace = "CWAgent"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [var.settings.MemoryCommittedPct.action]
ok_actions = [var.settings.MemoryCommittedPct.action]
dimensions = {
objectname = "Memory"
InstanceId = var.ec2-instance-id
@ -351,21 +351,21 @@ resource "aws_cloudwatch_metric_alarm" "ec2-MemoryCommittedPct" {
}
}
resource "aws_cloudwatch_metric_alarm" "ec2-OsDiskFreePct" {
resource "aws_cloudwatch_metric_alarm" "ec2-LogicalDiskFreePct" {
count = data.external.ec2-os.result.os == "Windows" ? 1 : 0
alarm_name = "${var.cw-alarm-prefix}:EC2:OsDiskFreePct:${var.ec2-instance-id}"
comparison_operator = "LessThanThreshold"
evaluation_periods = "2"
alarm_name = "${var.cw-alarm-prefix}:EC2:LogicalDiskFreePct:${var.ec2-instance-id}"
comparison_operator = var.settings.LogicalDiskFreePct.comparison_operator
evaluation_periods = var.settings.LogicalDiskFreePct.evaluation_periods
metric_name = "LogicalDisk % Free Space"
period = "300"
statistic = "Average"
threshold = var.threshold-LogicalDiskFreePct
period = var.settings.LogicalDiskFreePct.period
statistic = var.settings.LogicalDiskFreePct.statistic
threshold = var.settings.LogicalDiskFreePct.threshold
alarm_description = "EC2:OsDiskFreePct"
namespace = "CWAgent"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [var.settings.LogicalDiskFreePct.action]
ok_actions = [var.settings.LogicalDiskFreePct.action]
dimensions = {
instance = "C:"
objectname = "LogicalDisk"

View File

@ -1,18 +1,6 @@
variable "cw-alarm-prefix" {}
variable "actions-enabled" {}
variable "ec2-instance-id" {}
variable sns-targets {}
variable "settings" {}
variable "default-tags" {}
variable "threshold-CPUUtilization" {}
# variable "threshold-mem_free" {}
variable "threshold-mem_used_percent" {}
# variable "threshold-swap_free" {}
variable "threshold-swap_used_percent" {}
# variable "threshold-disk_free" {}
variable "threshold-disk_used_percentage" {}
variable "threshold-disk_inodes_free" {}
variable "threshold-processes_total" {}
variable threshold-MemoryCommittedPct {}
variable threshold-LogicalDiskFreePct {}
variable "default-tags" {}

View File

@ -2,19 +2,19 @@
resource "aws_cloudwatch_metric_alarm" "eks-pod_cpu_utilization" {
for_each = toset(var.pod-names)
alarm_name = "${var.cw-alarm-prefix}:EKS:pod_cpu_utilization:${each.value}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3"
metric_name = "pod_cpu_utilization"
period = "300"
statistic = "Average"
threshold = var.threshold-pod_cpu_utilization
alarm_description = "EKS:pod_cpu_utilization"
alarm_name = "${var.cw-alarm-prefix}:EKS:${var.cluster-name}:${each.value}:${var.settings.alarm1.metric}"
comparison_operator = var.settings.alarm1.comparison_operator
evaluation_periods = var.settings.alarm1.evaluation_periods
metric_name = var.settings.alarm1.metric
period = var.settings.alarm1.period
statistic = var.settings.alarm1.statistic
threshold = var.settings.alarm1.threshold
alarm_description = "EKS:${var.settings.alarm1.metric}"
namespace = "ContainerInsights"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [var.settings.alarm1.action]
ok_actions = [var.settings.alarm1.action]
dimensions = {
"PodName" = each.value
"ClusterName" = var.cluster-name
@ -29,19 +29,19 @@ resource "aws_cloudwatch_metric_alarm" "eks-pod_cpu_utilization" {
resource "aws_cloudwatch_metric_alarm" "eks-pod_memory_utilization" {
for_each = toset(var.pod-names)
alarm_name = "${var.cw-alarm-prefix}:EKS:pod_memory_utilization:${each.value}"
alarm_name = "${var.cw-alarm-prefix}:EKS:${var.cluster-name}:${each.value}:${var.settings.alarm2.metric}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3"
metric_name = "pod_memory_utilization"
period = "300"
statistic = "Average"
threshold = var.threshold-pod_memory_utilization
alarm_description = "EKS:pod_memory_utilization"
metric_name = var.settings.alarm2.metric
period = var.settings.alarm2.period
statistic = var.settings.alarm2.statistic
threshold = var.settings.alarm2.threshold
alarm_description = "EKS:${var.settings.alarm2.metric}"
namespace = "ContainerInsights"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [var.settings.alarm2.action]
ok_actions = [var.settings.alarm2.action]
dimensions = {
"PodName" = each.value
"ClusterName" = var.cluster-name
@ -56,19 +56,19 @@ resource "aws_cloudwatch_metric_alarm" "eks-pod_memory_utilization" {
resource "aws_cloudwatch_metric_alarm" "eks-pod_number_of_container_restarts" {
for_each = toset(var.pod-names)
alarm_name = "${var.cw-alarm-prefix}:EKS:pod_number_of_container_restarts:${each.value}"
alarm_name = "${var.cw-alarm-prefix}:EKS:${var.cluster-name}:${each.value}:${var.settings.alarm3.metric}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3"
metric_name = "pod_number_of_container_restarts"
period = "300"
statistic = "Average"
threshold = var.threshold-pod_number_of_container_restarts
alarm_description = "EKS:pod_number_of_container_restarts"
metric_name = var.settings.alarm3.metric
period = var.settings.alarm3.period
statistic = var.settings.alarm3.statistic
threshold = var.settings.alarm3.threshold
alarm_description = "EKS:${var.settings.alarm3.metric}"
namespace = "ContainerInsights"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [var.settings.alarm3.action]
ok_actions = [var.settings.alarm3.action]
dimensions = {
"PodName" = each.value
"ClusterName" = var.cluster-name

View File

@ -1,6 +1,5 @@
variable cw-alarm-prefix {}
variable actions-enabled {}
variable sns-targets {}
variable default-tags {}
variable cluster-name {}
@ -8,6 +7,4 @@ variable eks-namespace {}
variable pod-names {
type = list
}
variable threshold-pod_cpu_utilization {}
variable threshold-pod_memory_utilization {}
variable threshold-pod_number_of_container_restarts {}
variable settings {}

View File

@ -1,17 +1,18 @@
resource "aws_cloudwatch_metric_alarm" "emr-AppsPending" {
alarm_name = "${var.cw-alarm-prefix}:EMR:AppsPending:${var.job-flow-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "1"
metric_name = "AppsPending"
period = "1800"
statistic = "Average"
threshold = var.threshold-AppsPending
alarm_description = "EMR:AppsPending"
resource "aws_cloudwatch_metric_alarm" "emr-alarms" {
for_each = var.settings
alarm_name = "${var.cw-alarm-prefix}:EMR:${each.value["metric"]}:${var.job-flow-id}"
comparison_operator = each.value["comparison_operator"]
evaluation_periods = each.value["evaluation_periods"]
metric_name = each.value["metric"]
period = each.value["period"]
statistic = each.value["statistic"]
threshold = each.value["threshold"]
alarm_description = "EMR:${each.value["metric"]}"
namespace = "AWS/ElasticMapReduce"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-standard]
ok_actions = [var.sns-targets.alarm-actions-standard]
alarm_actions = [each.value["action"]]
ok_actions = [each.value["action"]]
dimensions = {
JobFlowId = var.job-flow-id
}
@ -21,6 +22,7 @@ resource "aws_cloudwatch_metric_alarm" "emr-AppsPending" {
}
}
/*
resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" {
alarm_name = "${var.cw-alarm-prefix}:EMR:CapacityRemainingGB:${var.job-flow-id}"
comparison_operator = "LessThanThreshold"
@ -28,13 +30,13 @@ resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" {
metric_name = "CapacityRemainingGB"
period = "3600"
statistic = "Average"
threshold = var.threshold-CapacityRemainingGB
threshold = var.settings.CapacityRemainingGB.threshold
alarm_description = "EMR:CapacityRemainingGB"
namespace = "AWS/ElasticMapReduce"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [var.settings.CapacityRemainingGB.action]
ok_actions = [var.settings.CapacityRemainingGB.action]
dimensions = {
JobFlowId = var.job-flow-id
}
@ -42,4 +44,5 @@ resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" {
lifecycle {
ignore_changes = [tags]
}
}
}
*/

View File

@ -1,8 +1,5 @@
variable cw-alarm-prefix {}
variable actions-enabled {}
variable job-flow-id {}
variable threshold-AppsPending {}
variable threshold-CapacityRemainingGB {}
variable sns-targets {}
variable settings {}
variable default-tags {}

View File

@ -19,7 +19,7 @@ PATTERN
}
resource "aws_cloudwatch_event_target" "TargetForEventRule" {
rule = aws_cloudwatch_event_rule.EventRule.name
target_id = "rackspace-standard-sns"
arn = var.sns-targets.alarm-actions-standard
}
rule = aws_cloudwatch_event_rule.EventRule.name
target_id = "health-event-notification-sns"
arn = var.settings.healthEvents.action
}

View File

@ -1,5 +1,5 @@
variable cw-alarm-prefix {}
variable actions-enabled {}
variable sns-targets {}
variable settings {}
variable default-tags {}

View File

@ -1,17 +1,17 @@
resource "aws_cloudwatch_metric_alarm" "Kafka-ZooKeeperRequestLatencyMsMean" {
alarm_name = "${var.cw-alarm-prefix}:Kafka:ZooKeeperRequestLatencyMsMean:${var.cluster-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3"
comparison_operator = var.settings.ZooKeeperRequestLatencyMsMean.comparison_operator
evaluation_periods = var.settings.ZooKeeperRequestLatencyMsMean.evaluation_periods
metric_name = "ZooKeeperRequestLatencyMsMean"
period = "1800"
statistic = "Average"
threshold = var.threshold-ZooKeeperRequestLatencyMsMean
period = var.settings.ZooKeeperRequestLatencyMsMean.period
statistic = var.settings.ZooKeeperRequestLatencyMsMean.statistic
threshold = var.settings.ZooKeeperRequestLatencyMsMean.threshold
alarm_description = "Kafka:ZooKeeperRequestLatencyMsMean"
namespace = "AWS/Kafka"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [var.settings.ZooKeeperRequestLatencyMsMean.action]
ok_actions = [var.settings.ZooKeeperRequestLatencyMsMean.action]
dimensions = {
"Cluster Name" = var.cluster-name
}
@ -45,21 +45,21 @@ module "msk-brokers" {
resource "aws_cloudwatch_metric_alarm" "Kafka-CpuUserSystem" {
for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])
alarm_name = "${var.cw-alarm-prefix}:Kafka:CpuUsage:${var.cluster-name}-${each.value}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3"
threshold = var.threshold-CpuUserSystem
comparison_operator = var.settings.CpuUserSystem.comparison_operator
evaluation_periods = var.settings.CpuUserSystem.evaluation_periods
threshold = var.settings.CpuUserSystem.threshold
alarm_description = "Kafka:ZooKeeperRequestLatencyMsMean"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [var.settings.CpuUserSystem.action]
ok_actions = [var.settings.CpuUserSystem.action]
metric_query {
id = "m1"
metric {
metric_name = "CpuUser"
namespace = "AWS/Kafka"
period = 300
stat = "Average"
period = var.settings.CpuUserSystem.period
stat = var.settings.CpuUserSystem.statistic
dimensions = {
"Cluster Name" = var.cluster-name
"Broker ID" = each.value
@ -72,8 +72,8 @@ resource "aws_cloudwatch_metric_alarm" "Kafka-CpuUserSystem" {
metric {
metric_name = "CpuSystem"
namespace = "AWS/Kafka"
period = 300
stat = "Average"
period = var.settings.CpuUserSystem.period
stat = var.settings.CpuUserSystem.statistic
dimensions = {
"Cluster Name" = var.cluster-name
"Broker ID" = each.value
@ -97,18 +97,18 @@ resource "aws_cloudwatch_metric_alarm" "Kafka-CpuUserSystem" {
resource "aws_cloudwatch_metric_alarm" "Kafka-KafkaDataLogsDiskUsed" {
for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])
alarm_name = "${var.cw-alarm-prefix}:Kafka:KafkaDataLogsDiskUsed:${var.cluster-name}-${each.value}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3"
comparison_operator = var.settings.KafkaDataLogsDiskUsed.comparison_operator
evaluation_periods = var.settings.KafkaDataLogsDiskUsed.evaluation_periods
metric_name = "KafkaDataLogsDiskUsed"
period = "300"
statistic = "Average"
threshold = var.threshold-KafkaDataLogsDiskUsed
period = var.settings.KafkaDataLogsDiskUsed.period
statistic = var.settings.KafkaDataLogsDiskUsed.statistic
threshold = var.settings.KafkaDataLogsDiskUsed.threshold
alarm_description = "Kafka:KafkaDataLogsDiskUsed"
namespace = "AWS/Kafka"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [var.settings.KafkaDataLogsDiskUsed.action]
ok_actions = [var.settings.KafkaDataLogsDiskUsed.action]
dimensions = {
"Cluster Name" = var.cluster-name
"Broker ID" = each.value
@ -122,18 +122,18 @@ resource "aws_cloudwatch_metric_alarm" "Kafka-KafkaDataLogsDiskUsed" {
resource "aws_cloudwatch_metric_alarm" "Kafka-HeapMemoryAfterGC" {
for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])
alarm_name = "${var.cw-alarm-prefix}:Kafka:HeapMemoryAfterGC:${var.cluster-name}-${each.value}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3"
comparison_operator = var.settings.HeapMemoryAfterGC.comparison_operator
evaluation_periods = var.settings.HeapMemoryAfterGC.evaluation_periods
metric_name = "HeapMemoryAfterGC"
period = "300"
statistic = "Average"
threshold = var.threshold-HeapMemoryAfterGC
period = var.settings.HeapMemoryAfterGC.period
statistic = var.settings.HeapMemoryAfterGC.statistic
threshold = var.settings.HeapMemoryAfterGC.threshold
alarm_description = "Kafka:HeapMemoryAfterGC"
namespace = "AWS/Kafka"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [var.settings.HeapMemoryAfterGC.action]
ok_actions = [var.settings.HeapMemoryAfterGC.action]
dimensions = {
"Cluster Name" = var.cluster-name
"Broker ID" = each.value

View File

@ -1,10 +1,6 @@
variable cw-alarm-prefix {}
variable actions-enabled {}
variable cluster-name {}
variable sns-targets {}
variable settings {}
variable default-tags {}
variable threshold-ZooKeeperRequestLatencyMsMean {}
variable threshold-CpuUserSystem {}
variable threshold-KafkaDataLogsDiskUsed {}
variable threshold-HeapMemoryAfterGC {}

View File

@ -1,17 +1,43 @@
resource "aws_cloudwatch_metric_alarm" "ngw-ErrorPortAllocation" {
alarm_name = "${var.cw-alarm-prefix}:NGW:ErrorPortAllocation:${var.res-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "ErrorPortAllocation"
period = "300"
statistic = "Average"
threshold = var.threshold-ErrorPortAllocation
alarm_description = "NGW:ErrorPortAllocation"
resource "aws_cloudwatch_metric_alarm" "ngw-alarms" {
for_each = var.settings
alarm_name = "${var.cw-alarm-prefix}:NGW:${each.value["metric"]}:${var.res-id}"
comparison_operator = each.value["comparison_operator"]
evaluation_periods = each.value["evaluation_periods"]
metric_name = each.value["metric"]
period = each.value["period"]
statistic = each.value["statistic"]
threshold = each.value["threshold"]
alarm_description = "NGW:${each.value["metric"]}"
namespace = "AWS/NATGateway"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [each.value["action"]]
ok_actions = [each.value["action"]]
dimensions = {
NatGatewayId = var.res-id
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
/*
resource "aws_cloudwatch_metric_alarm" "ngw-ErrorPortAllocation" {
alarm_name = "${var.cw-alarm-prefix}:NGW:${var.settings.alarm1.metric}:${var.res-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = var.settings.alarm1.metric
period = "300"
statistic = "Average"
threshold = var.settings.alarm1.threshold
alarm_description = "NGW:${var.settings.alarm1.metric}"
namespace = "AWS/NATGateway"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.settings.alarm1.action]
ok_actions = [var.settings.alarm1.action]
dimensions = {
NatGatewayId = var.res-id
}
@ -22,19 +48,19 @@ resource "aws_cloudwatch_metric_alarm" "ngw-ErrorPortAllocation" {
}
resource "aws_cloudwatch_metric_alarm" "ngw-ConnectionEstablishedCount" {
alarm_name = "${var.cw-alarm-prefix}:NGW:ConnectionEstablishedCount:${var.res-id}"
alarm_name = "${var.cw-alarm-prefix}:NGW:${var.settings.alarm2.metric}:${var.res-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "ConnectionEstablishedCount"
metric_name = var.settings.alarm2.metric
period = "300"
statistic = "Average"
threshold = var.threshold-ConnectionEstablishedCount
alarm_description = "NGW:ConnectionEstablishedCount"
threshold = var.settings.alarm2.threshold
alarm_description = "NGW:${var.settings.alarm2.metric}"
namespace = "AWS/NATGateway"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-standard]
ok_actions = [var.sns-targets.alarm-actions-standard]
alarm_actions = [var.settings.alarm2.action]
ok_actions = [var.settings.alarm2.action]
dimensions = {
NatGatewayId = var.res-id
}
@ -45,19 +71,19 @@ resource "aws_cloudwatch_metric_alarm" "ngw-ConnectionEstablishedCount" {
}
resource "aws_cloudwatch_metric_alarm" "ngw-PacketsDropCount" {
alarm_name = "${var.cw-alarm-prefix}:NGW:PacketsDropCount:${var.res-id}"
alarm_name = "${var.cw-alarm-prefix}:NGW:${var.settings.alarm3.metric}:${var.res-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "PacketsDropCount"
metric_name = var.settings.alarm3.metric
period = "300"
statistic = "Average"
threshold = var.threshold-PacketsDropCount
alarm_description = "NGW:PacketsDropCount"
threshold = var.settings.alarm3.threshold
alarm_description = "NGW:${var.settings.alarm3.metric}"
namespace = "AWS/NATGateway"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-standard]
ok_actions = [var.sns-targets.alarm-actions-standard]
alarm_actions = [var.settings.alarm3.action]
ok_actions = [var.settings.alarm3.action]
dimensions = {
NatGatewayId = var.res-id
}
@ -65,4 +91,5 @@ resource "aws_cloudwatch_metric_alarm" "ngw-PacketsDropCount" {
lifecycle {
ignore_changes = [tags]
}
}
}
*/

View File

@ -1,8 +1,5 @@
variable cw-alarm-prefix {}
variable actions-enabled {}
variable res-id {}
variable threshold-ErrorPortAllocation { }
variable threshold-ConnectionEstablishedCount {}
variable threshold-PacketsDropCount {}
variable sns-targets {}
variable default-tags {}
variable settings {}

View File

@ -17,18 +17,18 @@ module "nlb-targetgroups" {
resource "aws_cloudwatch_metric_alarm" "nlb-HealthyHostCount" {
for_each = module.nlb-targetgroups.result-set
alarm_name = "${var.cw-alarm-prefix}:NLBTG:HealthyHostCount:${split(":", each.value)[5]}"
comparison_operator = "LessThanThreshold"
evaluation_periods = "1"
comparison_operator = var.settings.HealthHostCountMin.comparison_operator
evaluation_periods = var.settings.HealthHostCountMin.evaluation_periods
metric_name = "HealthyHostCount"
period = "300"
statistic = "Minimum"
threshold = var.threshold-HealthHostCountMin
period = var.settings.HealthHostCountMin.period
statistic = var.settings.HealthHostCountMin.statistic
threshold = var.settings.HealthHostCountMin.threshold
alarm_description = "NLBTG:HealthyHostCount"
namespace = "AWS/NetworkELB"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-emergency]
ok_actions = [var.sns-targets.alarm-actions-emergency]
alarm_actions = [var.settings.HealthHostCountMin.action]
ok_actions = [var.settings.HealthHostCountMin.action]
dimensions = {
TargetGroup = split(":", each.value)[5]
LoadBalancer = "net/${split("/", var.load-balancer)[2]}/${split("/", var.load-balancer)[3]}"

View File

@ -1,6 +1,5 @@
variable cw-alarm-prefix {}
variable actions-enabled {}
variable load-balancer {}
variable threshold-HealthHostCountMin {}
variable sns-targets {}
variable settings {}
variable default-tags {}

View File

@ -1,307 +1,20 @@
data "aws_caller_identity" "this" {}
resource "aws_cloudwatch_metric_alarm" "ES-CPUUtilization" {
alarm_name = "${var.cw-alarm-prefix}:ES:CPUUtilization:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3"
metric_name = "CPUUtilization"
period = "1800"
statistic = "Average"
threshold = var.threshold-CPUUtilization
alarm_description = "ES:CPUUtilization"
resource "aws_cloudwatch_metric_alarm" "ES-alarms" {
for_each = var.settings
alarm_name = "${var.cw-alarm-prefix}:ES:${each.value["metric"]}:${var.domain-name}"
comparison_operator = each.value["comparison_operator"]
evaluation_periods = each.value["evaluation_periods"]
metric_name = each.value["metric"]
period = each.value["period"]
statistic = each.value["statistic"]
threshold = each.value["threshold"]
alarm_description = "ES:${each.value["metric"]}"
namespace = "AWS/ES"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
# Alarm on the AWS/ES "SearchLatency" metric for the domain var.domain-name.
# Compares the Average statistic (period "1800", 3 evaluation periods) against
# var.threshold-SearchLatency using GreaterThanThreshold.
# Both alarm_actions and ok_actions point at var.sns-targets.alarm-actions-urgent.
resource "aws_cloudwatch_metric_alarm" "ES-SearchLatency" {
alarm_name = "${var.cw-alarm-prefix}:ES:SearchLatency:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3"
metric_name = "SearchLatency"
period = "1800"
statistic = "Average"
threshold = var.threshold-SearchLatency
alarm_description = "ES:SearchLatency"
namespace = "AWS/ES"
# No action is configured for the insufficient-data state.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
# Tags are set from var.default-tags; later tag changes are ignored.
lifecycle {
ignore_changes = [tags]
}
}
# Alarm on the AWS/ES "IndexingLatency" metric for the domain var.domain-name.
# Compares the Average statistic (period "1800", 3 evaluation periods) against
# var.threshold-IndexingLatency using GreaterThanThreshold.
# Both alarm_actions and ok_actions point at var.sns-targets.alarm-actions-urgent.
resource "aws_cloudwatch_metric_alarm" "ES-IndexingLatency" {
alarm_name = "${var.cw-alarm-prefix}:ES:IndexingLatency:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3"
metric_name = "IndexingLatency"
period = "1800"
statistic = "Average"
threshold = var.threshold-IndexingLatency
alarm_description = "ES:IndexingLatency"
namespace = "AWS/ES"
# No action is configured for the insufficient-data state.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
# Tags are set from var.default-tags; later tag changes are ignored.
lifecycle {
ignore_changes = [tags]
}
}
# Alarm on the AWS/ES "ClusterStatus.red" metric for the domain var.domain-name.
# Uses the Maximum statistic (period "900", 2 evaluation periods) and fires when
# the value is greater than the fixed threshold 0.
# Both alarm_actions and ok_actions point at var.sns-targets.alarm-actions-urgent.
resource "aws_cloudwatch_metric_alarm" "ES-ClusterStatusRed" {
alarm_name = "${var.cw-alarm-prefix}:ES:ClusterStatusRed:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "ClusterStatus.red"
period = "900"
statistic = "Maximum"
# Hard-coded threshold: unlike the sibling alarms, this one takes no threshold variable.
threshold = 0
alarm_description = "At least one primary shard and its replicas aren't allocated to a node."
namespace = "AWS/ES"
# No action is configured for the insufficient-data state.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
# Tags are set from var.default-tags; later tag changes are ignored.
lifecycle {
ignore_changes = [tags]
}
}
# Alarm on the AWS/ES "ThreadpoolWriteQueue" metric for the domain var.domain-name.
# Compares the Average statistic (period "60", 2 evaluation periods) against
# var.threshold-ThreadpoolWriteQueue using GreaterThanThreshold.
# Both alarm_actions and ok_actions point at var.sns-targets.alarm-actions-standard.
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolWriteQueue" {
alarm_name = "${var.cw-alarm-prefix}:ES:ThreadpoolWriteQueue:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "ThreadpoolWriteQueue"
period = "60"
statistic = "Average"
threshold = var.threshold-ThreadpoolWriteQueue
alarm_description = "The cluster is experiencing high indexing concurrency. Review and control indexing requests, or increase cluster resources."
namespace = "AWS/ES"
# No action is configured for the insufficient-data state.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-standard]
ok_actions = [var.sns-targets.alarm-actions-standard]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
# Tags are set from var.default-tags; later tag changes are ignored.
lifecycle {
ignore_changes = [tags]
}
}
# Alarm on the AWS/ES "ThreadpoolSearchQueue" metric for the domain var.domain-name.
# Compares the Average statistic (period "60", 2 evaluation periods) against
# var.threshold-ThreadpoolSearchQueue using GreaterThanThreshold.
# Both alarm_actions and ok_actions point at var.sns-targets.alarm-actions-standard.
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolSearchQueue" {
alarm_name = "${var.cw-alarm-prefix}:ES:ThreadpoolSearchQueue:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "ThreadpoolSearchQueue"
period = "60"
statistic = "Average"
threshold = var.threshold-ThreadpoolSearchQueue
alarm_description = "The cluster is experiencing high search concurrency. Consider scaling your cluster. You can also increase the search queue size, but increasing it excessively can cause out of memory errors."
namespace = "AWS/ES"
# No action is configured for the insufficient-data state.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-standard]
ok_actions = [var.sns-targets.alarm-actions-standard]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
# Tags are set from var.default-tags; later tag changes are ignored.
lifecycle {
ignore_changes = [tags]
}
}
# Alarm on the AWS/ES "ThreadpoolSearchRejected" metric for the domain var.domain-name.
# Compares the Average statistic (period "60", 2 evaluation periods) against
# var.threshold-ThreadpoolSearchRejected using GreaterThanThreshold.
# Both alarm_actions and ok_actions point at var.sns-targets.alarm-actions-urgent.
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolSearchRejected" {
alarm_name = "${var.cw-alarm-prefix}:ES:ThreadpoolSearchRejected:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "ThreadpoolSearchRejected"
period = "60"
statistic = "Average"
threshold = var.threshold-ThreadpoolSearchRejected
alarm_description = "These alarms notify you of domain issues that might impact performance and stability."
namespace = "AWS/ES"
# No action is configured for the insufficient-data state.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
# Tags are set from var.default-tags; later tag changes are ignored.
lifecycle {
ignore_changes = [tags]
}
}
# Alarm on the AWS/ES "ThreadpoolWriteRejected" metric for the domain var.domain-name.
# Compares the Average statistic (period "60", 2 evaluation periods) against
# var.threshold-ThreadpoolWriteRejected using GreaterThanThreshold.
# Both alarm_actions and ok_actions point at var.sns-targets.alarm-actions-urgent.
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolWriteRejected" {
alarm_name = "${var.cw-alarm-prefix}:ES:ThreadpoolWriteRejected:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "ThreadpoolWriteRejected"
period = "60"
statistic = "Average"
threshold = var.threshold-ThreadpoolWriteRejected
alarm_description = "These alarms notify you of domain issues that might impact performance and stability."
namespace = "AWS/ES"
# No action is configured for the insufficient-data state.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
# Tags are set from var.default-tags; later tag changes are ignored.
lifecycle {
ignore_changes = [tags]
}
}
# Alarm on the AWS/ES "MasterCPUUtilization" metric for the domain var.domain-name.
# Compares the Average statistic (period "300", 2 evaluation periods) against
# var.threshold-MasterCPUUtilization using GreaterThanThreshold.
# Both alarm_actions and ok_actions point at var.sns-targets.alarm-actions-urgent.
resource "aws_cloudwatch_metric_alarm" "ES-MasterCPUUtilization" {
alarm_name = "${var.cw-alarm-prefix}:ES:MasterCPUUtilization:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "MasterCPUUtilization"
period = "300"
statistic = "Average"
threshold = var.threshold-MasterCPUUtilization
alarm_description = "MasterCPUUtilization"
namespace = "AWS/ES"
# No action is configured for the insufficient-data state.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
# Tags are set from var.default-tags; later tag changes are ignored.
lifecycle {
ignore_changes = [tags]
}
}
# Alarm on the AWS/ES "MasterJVMMemoryPressure" metric for the domain var.domain-name.
# Compares the Average statistic (period "60", 2 evaluation periods) against
# var.threshold-MasterJVMMemoryPressure using GreaterThanThreshold.
# Both alarm_actions and ok_actions point at var.sns-targets.alarm-actions-urgent.
resource "aws_cloudwatch_metric_alarm" "ES-MasterJVMMemoryPressure" {
alarm_name = "${var.cw-alarm-prefix}:ES:MasterJVMMemoryPressure:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "MasterJVMMemoryPressure"
period = "60"
statistic = "Average"
threshold = var.threshold-MasterJVMMemoryPressure
alarm_description = "MasterJVMMemoryPressure"
namespace = "AWS/ES"
# No action is configured for the insufficient-data state.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
# Tags are set from var.default-tags; later tag changes are ignored.
lifecycle {
ignore_changes = [tags]
}
}
# Alarm on the AWS/ES "JVMMemoryPressure" metric for the domain var.domain-name.
# Compares the Average statistic (period "60", 2 evaluation periods) against
# var.threshold-JVMMemoryPressure using GreaterThanThreshold.
# Both alarm_actions and ok_actions point at var.sns-targets.alarm-actions-urgent.
resource "aws_cloudwatch_metric_alarm" "ES-JVMMemoryPressure" {
alarm_name = "${var.cw-alarm-prefix}:ES:JVMMemoryPressure:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "JVMMemoryPressure"
period = "60"
statistic = "Average"
threshold = var.threshold-JVMMemoryPressure
alarm_description = "The cluster could encounter out of memory errors if usage increases. Consider scaling vertically. OpenSearch Service uses half of an instance's RAM for the Java heap, up to a heap size of 32 GiB. You can scale instances vertically up to 64 GiB of RAM, at which point you can scale horizontally by adding instances."
namespace = "AWS/ES"
# No action is configured for the insufficient-data state.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
# Tags are set from var.default-tags; later tag changes are ignored.
lifecycle {
ignore_changes = [tags]
}
}
# Alarm on the AWS/ES "ClusterIndexWritesBlocked" metric for the domain var.domain-name.
# Compares the Average statistic (period "60", 2 evaluation periods) against
# var.threshold-ClusterIndexWritesBlocked using GreaterThanThreshold.
# Both alarm_actions and ok_actions point at var.sns-targets.alarm-actions-urgent.
resource "aws_cloudwatch_metric_alarm" "ES-ClusterIndexWritesBlocked" {
alarm_name = "${var.cw-alarm-prefix}:ES:ClusterIndexWritesBlocked:${var.domain-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "ClusterIndexWritesBlocked"
period = "60"
statistic = "Average"
threshold = var.threshold-ClusterIndexWritesBlocked
alarm_description = "Your cluster is blocking write requests. See ClusterBlockException."
namespace = "AWS/ES"
# No action is configured for the insufficient-data state.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id
}
tags = var.default-tags
# Tags are set from var.default-tags; later tag changes are ignored.
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "ES-FreeStorageSpace" {
alarm_name = "${var.cw-alarm-prefix}:ES:FreeStorageSpace:${var.domain-name}"
comparison_operator = "LessThanThreshold"
evaluation_periods = "2"
metric_name = "FreeStorageSpace"
period = "300"
statistic = "Average"
threshold = var.threshold-FreeStorageSpace
alarm_description = "A node in your cluster is low on free storage space."
namespace = "AWS/ES"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [each.value["action"]]
ok_actions = [each.value["action"]]
dimensions = {
DomainName = var.domain-name
ClientId = data.aws_caller_identity.this.id

View File

@ -1,19 +1,6 @@
variable "cw-alarm-prefix" {}
variable "actions-enabled" {}
variable "domain-name" {}
variable "sns-targets" {}
variable "settings" {}
variable "default-tags" {}
variable "threshold-CPUUtilization" {}
variable "threshold-SearchLatency" {}
variable "threshold-IndexingLatency" {}
variable "threshold-ThreadpoolWriteQueue" {}
variable "threshold-ThreadpoolSearchQueue" {}
variable "threshold-ThreadpoolSearchRejected" {}
variable "threshold-ThreadpoolWriteRejected" {}
variable "threshold-MasterCPUUtilization" {}
variable "threshold-MasterJVMMemoryPressure" {}
variable "threshold-JVMMemoryPressure" {}
variable "threshold-ClusterIndexWritesBlocked" {}
variable "threshold-FreeStorageSpace" {}
# variable threshold-KibanaHealthyNodes {}

View File

@ -1,17 +1,18 @@
resource "aws_cloudwatch_metric_alarm" "rds-cpu" {
alarm_name = "${var.cw-alarm-prefix}:RDS:CpuUtilization:${var.rds-instance-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "1"
metric_name = "CPUUtilization"
period = "3600"
statistic = "Average"
threshold = var.threshold-CpuUtilization
alarm_description = "RDS:CpuUtilization"
resource "aws_cloudwatch_metric_alarm" "rds-alarms" {
for_each = var.settings
alarm_name = "${var.cw-alarm-prefix}:RDS:${each.value["metric"]}:${var.rds-instance-name}"
comparison_operator = each.value["comparison_operator"]
evaluation_periods = each.value["evaluation_periods"]
metric_name = each.value["metric"]
period = each.value["period"]
statistic = each.value["statistic"]
threshold = each.value["threshold"]
alarm_description = "RDS:${each.value["metric"]}"
namespace = "AWS/RDS"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [each.value["action"]]
ok_actions = [each.value["action"]]
dimensions = {
DBInstanceIdentifier = var.rds-instance-name
}
@ -20,118 +21,3 @@ resource "aws_cloudwatch_metric_alarm" "rds-cpu" {
ignore_changes = [tags]
}
}
# Alarm on the AWS/RDS "FreeStorageSpace" metric for the instance var.rds-instance-name.
# Compares the Average statistic (period "3600", 1 evaluation period) against
# var.threshold-FreeStorageSpace using LessThanThreshold.
# Both alarm_actions and ok_actions point at var.sns-targets.alarm-actions-urgent.
resource "aws_cloudwatch_metric_alarm" "rds-storage" {
alarm_name = "${var.cw-alarm-prefix}:RDS:FreeStorageSpace:${var.rds-instance-name}"
comparison_operator = "LessThanThreshold"
evaluation_periods = "1"
metric_name = "FreeStorageSpace"
period = "3600"
statistic = "Average"
threshold = var.threshold-FreeStorageSpace
alarm_description = "RDS:FreeStorageSpace"
namespace = "AWS/RDS"
# No action is configured for the insufficient-data state.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DBInstanceIdentifier = var.rds-instance-name
}
tags = var.default-tags
# Tags are set from var.default-tags; later tag changes are ignored.
lifecycle {
ignore_changes = [tags]
}
}
# Alarm on the AWS/RDS "FreeableMemory" metric for the instance var.rds-instance-name.
# Compares the Average statistic (period "3600", 1 evaluation period) against
# var.threshold-FreeableMemory using LessThanThreshold.
# Both alarm_actions and ok_actions point at var.sns-targets.alarm-actions-urgent.
resource "aws_cloudwatch_metric_alarm" "rds-memory" {
alarm_name = "${var.cw-alarm-prefix}:RDS:FreeableMemory:${var.rds-instance-name}"
comparison_operator = "LessThanThreshold"
evaluation_periods = "1"
metric_name = "FreeableMemory"
period = "3600"
statistic = "Average"
threshold = var.threshold-FreeableMemory
alarm_description = "RDS:FreeableMemory"
namespace = "AWS/RDS"
# No action is configured for the insufficient-data state.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DBInstanceIdentifier = var.rds-instance-name
}
tags = var.default-tags
# Tags are set from var.default-tags; later tag changes are ignored.
lifecycle {
ignore_changes = [tags]
}
}
# Alarm on the AWS/RDS "DiskQueueDepth" metric for the instance var.rds-instance-name.
# Compares the Average statistic (period "300", 2 evaluation periods) against
# var.threshold-DiskQueueDepth using GreaterThanThreshold.
# Both alarm_actions and ok_actions point at var.sns-targets.alarm-actions-urgent.
resource "aws_cloudwatch_metric_alarm" "rds-DiskQueueDepth" {
alarm_name = "${var.cw-alarm-prefix}:RDS:DiskQueueDepth:${var.rds-instance-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "DiskQueueDepth"
period = "300"
statistic = "Average"
threshold = var.threshold-DiskQueueDepth
alarm_description = "RDS:DiskQueueDepth"
namespace = "AWS/RDS"
# No action is configured for the insufficient-data state.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DBInstanceIdentifier = var.rds-instance-name
}
tags = var.default-tags
# Tags are set from var.default-tags; later tag changes are ignored.
lifecycle {
ignore_changes = [tags]
}
}
# Alarm on the AWS/RDS "ReadLatency" metric for the instance var.rds-instance-name.
# Compares the Average statistic (period "900", 2 evaluation periods) against
# var.threshold-ReadLatency using GreaterThanThreshold.
# Both alarm_actions and ok_actions point at var.sns-targets.alarm-actions-urgent.
resource "aws_cloudwatch_metric_alarm" "rds-ReadLatency" {
alarm_name = "${var.cw-alarm-prefix}:RDS:ReadLatency:${var.rds-instance-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "ReadLatency"
period = "900"
statistic = "Average"
threshold = var.threshold-ReadLatency
alarm_description = "RDS:ReadLatency"
namespace = "AWS/RDS"
# No action is configured for the insufficient-data state.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DBInstanceIdentifier = var.rds-instance-name
}
tags = var.default-tags
# Tags are set from var.default-tags; later tag changes are ignored.
lifecycle {
ignore_changes = [tags]
}
}
# Alarm on the AWS/RDS "WriteLatency" metric for the instance var.rds-instance-name.
# Compares the Average statistic (period "900", 2 evaluation periods) against
# var.threshold-WriteLatency using GreaterThanThreshold.
# Both alarm_actions and ok_actions point at var.sns-targets.alarm-actions-urgent.
resource "aws_cloudwatch_metric_alarm" "rds-WriteLatency" {
alarm_name = "${var.cw-alarm-prefix}:RDS:WriteLatency:${var.rds-instance-name}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "WriteLatency"
period = "900"
statistic = "Average"
threshold = var.threshold-WriteLatency
alarm_description = "RDS:WriteLatency"
namespace = "AWS/RDS"
# No action is configured for the insufficient-data state.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
DBInstanceIdentifier = var.rds-instance-name
}
tags = var.default-tags
# Tags are set from var.default-tags; later tag changes are ignored.
lifecycle {
ignore_changes = [tags]
}
}

View File

@ -1,12 +1,5 @@
variable cw-alarm-prefix {}
variable actions-enabled {}
variable rds-instance-name {}
variable sns-targets {}
variable settings {}
variable default-tags {}
variable threshold-FreeableMemory {}
variable threshold-CpuUtilization {}
variable threshold-FreeStorageSpace {}
variable threshold-DiskQueueDepth {}
variable threshold-ReadLatency {}
variable threshold-WriteLatency {}

View File

@ -1,17 +1,18 @@
resource "aws_cloudwatch_metric_alarm" "redis-EngineCPUUtilization" {
alarm_name = "${var.cw-alarm-prefix}:Redis:EngineCPUUtilization:${var.redis-cluster-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "1"
metric_name = "EngineCPUUtilization"
period = "3600"
statistic = "Average"
threshold = var.threshold-EngineCPUUtilization
alarm_description = "Redis:EngineCPUUtilization"
namespace = "AWS/ElastiCache"
resource "aws_cloudwatch_metric_alarm" "redis-alarms" {
for_each = var.settings
alarm_name = "${var.cw-alarm-prefix}:Redis:${each.value["metric"]}:${var.redis-cluster-id}"
comparison_operator = each.value["comparison_operator"]
evaluation_periods = each.value["evaluation_periods"]
metric_name = each.value["metric"]
period = each.value["period"]
statistic = each.value["statistic"]
threshold = each.value["threshold"]
# These alarms use the CacheClusterId dimension, so they must read from the
# ElastiCache namespace (as the per-metric Redis alarms did). The NAT Gateway
# prefix and namespace were a copy-paste leftover that pointed the alarm at a
# metric that does not exist, so it could never fire.
alarm_description = "Redis:${each.value["metric"]}"
namespace = "AWS/ElastiCache"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
alarm_actions = [each.value["action"]]
ok_actions = [each.value["action"]]
dimensions = {
CacheClusterId = var.redis-cluster-id
}
@ -19,75 +20,4 @@ resource "aws_cloudwatch_metric_alarm" "redis-EngineCPUUtilization" {
lifecycle {
ignore_changes = [tags]
}
}
# Alarm on the AWS/ElastiCache "DatabaseMemoryUsagePercentage" metric for the
# cluster var.redis-cluster-id.
# Compares the Average statistic (period "3600", 1 evaluation period) against
# var.threshold-DatabaseMemoryUsagePercentage using GreaterThanThreshold.
# Both alarm_actions and ok_actions point at var.sns-targets.alarm-actions-urgent.
resource "aws_cloudwatch_metric_alarm" "redis-DatabaseMemoryUsagePercentage" {
alarm_name = "${var.cw-alarm-prefix}:Redis:DatabaseMemoryUsagePercentage:${var.redis-cluster-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "1"
metric_name = "DatabaseMemoryUsagePercentage"
period = "3600"
statistic = "Average"
threshold = var.threshold-DatabaseMemoryUsagePercentage
alarm_description = "Redis:DatabaseMemoryUsagePercentage"
namespace = "AWS/ElastiCache"
# No action is configured for the insufficient-data state.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
CacheClusterId = var.redis-cluster-id
}
tags = var.default-tags
# Tags are set from var.default-tags; later tag changes are ignored.
lifecycle {
ignore_changes = [tags]
}
}
# Alarm on the AWS/ElastiCache "CacheHitRate" metric for the cluster
# var.redis-cluster-id.
# Compares the Average statistic (period "900", 4 evaluation periods) against
# var.threshold-CacheHitRate using LessThanThreshold.
# Both alarm_actions and ok_actions point at var.sns-targets.alarm-actions-standard.
resource "aws_cloudwatch_metric_alarm" "redis-CacheHitRate" {
alarm_name = "${var.cw-alarm-prefix}:Redis:CacheHitRate:${var.redis-cluster-id}"
comparison_operator = "LessThanThreshold"
evaluation_periods = "4"
metric_name = "CacheHitRate"
period = "900"
statistic = "Average"
threshold = var.threshold-CacheHitRate
alarm_description = "Redis:CacheHitRate"
namespace = "AWS/ElastiCache"
# No action is configured for the insufficient-data state.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-standard]
ok_actions = [var.sns-targets.alarm-actions-standard]
dimensions = {
CacheClusterId = var.redis-cluster-id
# CacheNodeId = each.value
}
tags = var.default-tags
# Tags are set from var.default-tags; later tag changes are ignored.
lifecycle {
ignore_changes = [tags]
}
}
# Alarm on the AWS/ElastiCache "StringBasedCmdsLatency" metric for the cluster
# var.redis-cluster-id.
# Compares the Average statistic (period "60", 3 evaluation periods) against
# var.threshold-StringBasedCmdsLatency using GreaterThanThreshold.
# Both alarm_actions and ok_actions point at var.sns-targets.alarm-actions-standard.
resource "aws_cloudwatch_metric_alarm" "redis-StringBasedCmdsLatency" {
alarm_name = "${var.cw-alarm-prefix}:Redis:StringBasedCmdsLatency:${var.redis-cluster-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3"
metric_name = "StringBasedCmdsLatency"
period = "60"
statistic = "Average"
threshold = var.threshold-StringBasedCmdsLatency
alarm_description = "The average latency, in microseconds, of the string-based commands run during a selected time range"
namespace = "AWS/ElastiCache"
# No action is configured for the insufficient-data state.
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-standard]
ok_actions = [var.sns-targets.alarm-actions-standard]
dimensions = {
CacheClusterId = var.redis-cluster-id
# CacheNodeId = each.value
}
tags = var.default-tags
# Tags are set from var.default-tags; later tag changes are ignored.
lifecycle {
ignore_changes = [tags]
}
}

View File

@ -1,10 +1,5 @@
variable "cw-alarm-prefix" {}
variable "actions-enabled" {}
variable "redis-cluster-id" {}
variable "sns-targets" {}
variable "settings" {}
variable "default-tags" {}
variable "threshold-EngineCPUUtilization" {}
variable "threshold-DatabaseMemoryUsagePercentage" {}
variable "threshold-CacheHitRate" {}
variable "threshold-StringBasedCmdsLatency" {}

View File

@ -1,17 +1,17 @@
resource "aws_cloudwatch_metric_alarm" "tgw-PacketDropCountNoRoute" {
alarm_name = "${var.cw-alarm-prefix}:TGW:PacketDropCountNoRoute:${var.tgw-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
comparison_operator = var.settings.PacketDropCountNoRoute.comparison_operator
evaluation_periods = var.settings.PacketDropCountNoRoute.evaluation_periods
metric_name = "PacketDropCountNoRoute"
period = "300"
statistic = "Average"
threshold = var.threshold-PacketDropCountNoRoute
period = var.settings.PacketDropCountNoRoute.period
statistic = var.settings.PacketDropCountNoRoute.statistic
threshold = var.settings.PacketDropCountNoRoute.threshold
alarm_description = "TGW:PacketDropCountNoRoute"
namespace = "AWS/TransitGateway"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-standard]
ok_actions = [var.sns-targets.alarm-actions-standard]
alarm_actions = [var.settings.PacketDropCountNoRoute.action]
ok_actions = [var.settings.PacketDropCountNoRoute.action]
dimensions = {
TransitGateway = var.tgw-id
}

View File

@ -1,7 +1,5 @@
variable cw-alarm-prefix {}
variable actions-enabled {}
variable tgw-id {}
variable threshold-PacketDropCountNoRoute {}
variable sns-targets {}
variable settings {}
variable default-tags {}