UPD: various bug fixes and enhancements
This commit is contained in:
parent
bcdbb23221
commit
48191b808f
@ -1,17 +1,17 @@
|
||||
resource "aws_cloudwatch_metric_alarm" "asg-CPUUtilization" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:ASG:CPUUtilization:${var.asg-name}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "3"
|
||||
comparison_operator = var.settings.CPUUtilization.comparison_operator
|
||||
evaluation_periods = var.settings.CPUUtilization.evaluation_periods
|
||||
metric_name = "CPUUtilization"
|
||||
period = "1800"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-CPUUtilization
|
||||
period = var.settings.CPUUtilization.period
|
||||
statistic = var.settings.CPUUtilization.statistic
|
||||
threshold = var.settings.CPUUtilization.threshold
|
||||
alarm_description = "ASG:CPUUtilization"
|
||||
namespace = "AWS/EC2"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
alarm_actions = [var.settings.CPUUtilization.action]
|
||||
ok_actions = [var.settings.CPUUtilization.action]
|
||||
dimensions = {
|
||||
AutoScalingGroupName = var.asg-name
|
||||
}
|
||||
|
@ -1,7 +1,5 @@
|
||||
variable cw-alarm-prefix {}
|
||||
variable actions-enabled {}
|
||||
variable asg-name {}
|
||||
variable sns-targets {}
|
||||
variable settings {}
|
||||
variable default-tags {}
|
||||
|
||||
variable threshold-CPUUtilization {}
|
@ -1,17 +1,17 @@
|
||||
resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:EC2:StatusCheckFailed_System:${var.ec2-instance-id}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "1"
|
||||
comparison_operator = var.settings.StatusCheckFailed_System.comparison_operator
|
||||
evaluation_periods = var.settings.StatusCheckFailed_System.evaluation_periods
|
||||
metric_name = "StatusCheckFailed_System"
|
||||
period = "300"
|
||||
statistic = "Maximum"
|
||||
threshold = 0
|
||||
period = var.settings.StatusCheckFailed_System.period
|
||||
statistic = var.settings.StatusCheckFailed_System.statistic
|
||||
threshold = var.settings.StatusCheckFailed_System.threshold
|
||||
alarm_description = "EC2:StatusCheckFailed_System"
|
||||
namespace = "AWS/EC2"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-emergency]
|
||||
ok_actions = [var.sns-targets.alarm-actions-emergency]
|
||||
alarm_actions = [var.settings.StatusCheckFailed_System.action]
|
||||
ok_actions = [var.settings.StatusCheckFailed_System.action]
|
||||
dimensions = {
|
||||
InstanceId = var.ec2-instance-id
|
||||
}
|
||||
@ -23,18 +23,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:EC2:StatusCheckFailed_Instance:${var.ec2-instance-id}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "1"
|
||||
comparison_operator = var.settings.StatusCheckFailed_Instance.comparison_operator
|
||||
evaluation_periods = var.settings.StatusCheckFailed_Instance.evaluation_periods
|
||||
metric_name = "StatusCheckFailed_Instance"
|
||||
period = "300"
|
||||
statistic = "Maximum"
|
||||
threshold = 0
|
||||
period = var.settings.StatusCheckFailed_Instance.period
|
||||
statistic = var.settings.StatusCheckFailed_Instance.statistic
|
||||
threshold = var.settings.StatusCheckFailed_Instance.threshold
|
||||
alarm_description = "EC2:StatusCheckFailed_Instance"
|
||||
namespace = "AWS/EC2"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-emergency]
|
||||
ok_actions = [var.sns-targets.alarm-actions-emergency]
|
||||
alarm_actions = [var.settings.StatusCheckFailed_Instance.action]
|
||||
ok_actions = [var.settings.StatusCheckFailed_Instance.action]
|
||||
dimensions = {
|
||||
InstanceId = var.ec2-instance-id
|
||||
}
|
||||
@ -46,18 +46,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" {
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "ec2-CPUUtilization" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:EC2:CPUUtilization:${var.ec2-instance-id}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "6"
|
||||
comparison_operator = var.settings.CPUUtilization.comparison_operator
|
||||
evaluation_periods = var.settings.CPUUtilization.evaluation_periods
|
||||
metric_name = "CPUUtilization"
|
||||
period = "300"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-CPUUtilization
|
||||
period = var.settings.CPUUtilization.period
|
||||
statistic = var.settings.CPUUtilization.statistic
|
||||
threshold = var.settings.CPUUtilization.threshold
|
||||
alarm_description = "EC2:CPUUtilization"
|
||||
namespace = "AWS/EC2"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
alarm_actions = [var.settings.CPUUtilization.action]
|
||||
ok_actions = [var.settings.CPUUtilization.action]
|
||||
treat_missing_data = "notBreaching"
|
||||
dimensions = {
|
||||
InstanceId = var.ec2-instance-id
|
||||
@ -114,18 +114,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-mem_free" {
|
||||
resource "aws_cloudwatch_metric_alarm" "ec2-mem_used_percent" {
|
||||
count = data.external.ec2-os.result.os == "Linux" ? 1 : 0
|
||||
alarm_name = "${var.cw-alarm-prefix}:EC2:mem_used_percent:${var.ec2-instance-id}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
comparison_operator = var.settings.mem_used_percent.comparison_operator
|
||||
evaluation_periods = var.settings.mem_used_percent.evaluation_periods
|
||||
metric_name = "mem_used_percent"
|
||||
period = "900"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-mem_used_percent
|
||||
period = var.settings.mem_used_percent.period
|
||||
statistic = var.settings.mem_used_percent.statistic
|
||||
threshold = var.settings.mem_used_percent.threshold
|
||||
alarm_description = "EC2:mem_used_percent"
|
||||
namespace = "CWAgent"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-standard]
|
||||
ok_actions = [var.sns-targets.alarm-actions-standard]
|
||||
alarm_actions = [var.settings.mem_used_percent.action]
|
||||
ok_actions = [var.settings.mem_used_percent.action]
|
||||
dimensions = {
|
||||
InstanceId = var.ec2-instance-id
|
||||
ImageId = data.aws_instance.ec2-instance.ami
|
||||
@ -190,18 +190,18 @@ data "external" "cw-dimensions" {
|
||||
resource "aws_cloudwatch_metric_alarm" "ec2-swap_used_percent" {
|
||||
count = data.external.ec2-os.result.os == "Linux" ? 1 : 0
|
||||
alarm_name = "${var.cw-alarm-prefix}:EC2:swap_used_percent:${var.ec2-instance-id}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
comparison_operator = var.settings.swap_used_percent.comparison_operator
|
||||
evaluation_periods = var.settings.swap_used_percent.evaluation_periods
|
||||
metric_name = "swap_used_percent"
|
||||
period = "900"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-swap_used_percent
|
||||
period = var.settings.swap_used_percent.period
|
||||
statistic = var.settings.swap_used_percent.statistic
|
||||
threshold = var.settings.swap_used_percent.threshold
|
||||
alarm_description = "EC2:swap_used_percent"
|
||||
namespace = "CWAgent"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
alarm_actions = [var.settings.swap_used_percent.action]
|
||||
ok_actions = [var.settings.swap_used_percent.action]
|
||||
dimensions = {
|
||||
InstanceId = var.ec2-instance-id
|
||||
ImageId = data.aws_instance.ec2-instance.ami
|
||||
@ -242,18 +242,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_free" {
|
||||
resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent" {
|
||||
count = data.external.ec2-os.result.os == "Linux" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0
|
||||
alarm_name = "${var.cw-alarm-prefix}:EC2:disk_used_percent:${var.ec2-instance-id}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
comparison_operator = var.settings.disk_used_percent.comparison_operator
|
||||
evaluation_periods = var.settings.disk_used_percent.evaluation_periods
|
||||
metric_name = "disk_used_percent"
|
||||
period = "900"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-disk_used_percentage
|
||||
period = var.settings.disk_used_percent.period
|
||||
statistic = var.settings.disk_used_percent.statistic
|
||||
threshold = var.settings.disk_used_percent.threshold
|
||||
alarm_description = "EC2:disk_used_percent"
|
||||
namespace = "CWAgent"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
alarm_actions = [var.settings.disk_used_percent.action]
|
||||
ok_actions = [var.settings.disk_used_percent.action]
|
||||
dimensions = data.external.cw-dimensions.result
|
||||
|
||||
tags = var.default-tags
|
||||
@ -266,18 +266,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent" {
|
||||
resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" {
|
||||
count = data.external.ec2-os.result.os == "Linux" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0
|
||||
alarm_name = "${var.cw-alarm-prefix}:EC2:disk_inodes_free:${var.ec2-instance-id}"
|
||||
comparison_operator = "LessThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
comparison_operator = var.settings.disk_inodes_free.comparison_operator
|
||||
evaluation_periods = var.settings.disk_inodes_free.evaluation_periods
|
||||
metric_name = "disk_inodes_free"
|
||||
period = "300"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-disk_inodes_free
|
||||
period = var.settings.disk_inodes_free.period
|
||||
statistic = var.settings.disk_inodes_free.statistic
|
||||
threshold = var.settings.disk_inodes_free.threshold
|
||||
alarm_description = "EC2:disk_inodes_free"
|
||||
namespace = "CWAgent"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
alarm_actions = [var.settings.disk_inodes_free.action]
|
||||
ok_actions = [var.settings.disk_inodes_free.action]
|
||||
dimensions = data.external.cw-dimensions.result
|
||||
/*
|
||||
dimensions = {
|
||||
@ -299,18 +299,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" {
|
||||
resource "aws_cloudwatch_metric_alarm" "ec2-processes_total" {
|
||||
count = data.external.ec2-os.result.os == "Linux" ? 1 : 0
|
||||
alarm_name = "${var.cw-alarm-prefix}:EC2:processes_total:${var.ec2-instance-id}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
comparison_operator = var.settings.processes_total.comparison_operator
|
||||
evaluation_periods = var.settings.processes_total.evaluation_periods
|
||||
metric_name = "processes_total"
|
||||
period = "900"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-processes_total
|
||||
period = var.settings.processes_total.period
|
||||
statistic = var.settings.processes_total.statistic
|
||||
threshold = var.settings.processes_total.threshold
|
||||
alarm_description = "EC2:processes_total"
|
||||
namespace = "CWAgent"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
alarm_actions = [var.settings.processes_total.action]
|
||||
ok_actions = [var.settings.processes_total.action]
|
||||
dimensions = {
|
||||
InstanceId = var.ec2-instance-id
|
||||
ImageId = data.aws_instance.ec2-instance.ami
|
||||
@ -327,18 +327,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-processes_total" {
|
||||
resource "aws_cloudwatch_metric_alarm" "ec2-MemoryCommittedPct" {
|
||||
count = data.external.ec2-os.result.os == "Windows" ? 1 : 0
|
||||
alarm_name = "${var.cw-alarm-prefix}:EC2:MemoryCommittedPct:${var.ec2-instance-id}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
comparison_operator = var.settings.MemoryCommittedPct.comparison_operator
|
||||
evaluation_periods = var.settings.MemoryCommittedPct.evaluation_periods
|
||||
metric_name = "Memory % Committed Bytes In Use"
|
||||
period = "900"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-MemoryCommittedPct
|
||||
period = var.settings.MemoryCommittedPct.period
|
||||
statistic = var.settings.MemoryCommittedPct.statistic
|
||||
threshold = var.settings.MemoryCommittedPct.threshold
|
||||
alarm_description = "EC2:MemoryCommittedBytes"
|
||||
namespace = "CWAgent"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
alarm_actions = [var.settings.MemoryCommittedPct.action]
|
||||
ok_actions = [var.settings.MemoryCommittedPct.action]
|
||||
dimensions = {
|
||||
objectname = "Memory"
|
||||
InstanceId = var.ec2-instance-id
|
||||
@ -351,21 +351,21 @@ resource "aws_cloudwatch_metric_alarm" "ec2-MemoryCommittedPct" {
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "ec2-OsDiskFreePct" {
|
||||
resource "aws_cloudwatch_metric_alarm" "ec2-LogicalDiskFreePct" {
|
||||
count = data.external.ec2-os.result.os == "Windows" ? 1 : 0
|
||||
alarm_name = "${var.cw-alarm-prefix}:EC2:OsDiskFreePct:${var.ec2-instance-id}"
|
||||
comparison_operator = "LessThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
alarm_name = "${var.cw-alarm-prefix}:EC2:LogicalDiskFreePct:${var.ec2-instance-id}"
|
||||
comparison_operator = var.settings.LogicalDiskFreePct.comparison_operator
|
||||
evaluation_periods = var.settings.LogicalDiskFreePct.evaluation_periods
|
||||
metric_name = "LogicalDisk % Free Space"
|
||||
period = "300"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-LogicalDiskFreePct
|
||||
period = var.settings.LogicalDiskFreePct.period
|
||||
statistic = var.settings.LogicalDiskFreePct.statistic
|
||||
threshold = var.settings.LogicalDiskFreePct.threshold
|
||||
alarm_description = "EC2:OsDiskFreePct"
|
||||
namespace = "CWAgent"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
alarm_actions = [var.settings.LogicalDiskFreePct.action]
|
||||
ok_actions = [var.settings.LogicalDiskFreePct.action]
|
||||
dimensions = {
|
||||
instance = "C:"
|
||||
objectname = "LogicalDisk"
|
||||
|
@ -1,18 +1,6 @@
|
||||
variable "cw-alarm-prefix" {}
|
||||
variable "actions-enabled" {}
|
||||
variable "ec2-instance-id" {}
|
||||
variable sns-targets {}
|
||||
variable "settings" {}
|
||||
|
||||
variable "default-tags" {}
|
||||
|
||||
variable "threshold-CPUUtilization" {}
|
||||
# variable "threshold-mem_free" {}
|
||||
variable "threshold-mem_used_percent" {}
|
||||
# variable "threshold-swap_free" {}
|
||||
variable "threshold-swap_used_percent" {}
|
||||
# variable "threshold-disk_free" {}
|
||||
variable "threshold-disk_used_percentage" {}
|
||||
variable "threshold-disk_inodes_free" {}
|
||||
variable "threshold-processes_total" {}
|
||||
variable threshold-MemoryCommittedPct {}
|
||||
variable threshold-LogicalDiskFreePct {}
|
@ -2,19 +2,19 @@
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "eks-pod_cpu_utilization" {
|
||||
for_each = toset(var.pod-names)
|
||||
alarm_name = "${var.cw-alarm-prefix}:EKS:pod_cpu_utilization:${each.value}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "3"
|
||||
metric_name = "pod_cpu_utilization"
|
||||
period = "300"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-pod_cpu_utilization
|
||||
alarm_description = "EKS:pod_cpu_utilization"
|
||||
alarm_name = "${var.cw-alarm-prefix}:EKS:${var.cluster-name}:${each.value}:${var.settings.alarm1.metric}"
|
||||
comparison_operator = var.settings.alarm1.comparison_operator
|
||||
evaluation_periods = var.settings.alarm1.evaluation_periods
|
||||
metric_name = var.settings.alarm1.metric
|
||||
period = var.settings.alarm1.period
|
||||
statistic = var.settings.alarm1.statistic
|
||||
threshold = var.settings.alarm1.threshold
|
||||
alarm_description = "EKS:${var.settings.alarm1.metric}"
|
||||
namespace = "ContainerInsights"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
alarm_actions = [var.settings.alarm1.action]
|
||||
ok_actions = [var.settings.alarm1.action]
|
||||
dimensions = {
|
||||
"PodName" = each.value
|
||||
"ClusterName" = var.cluster-name
|
||||
@ -29,19 +29,19 @@ resource "aws_cloudwatch_metric_alarm" "eks-pod_cpu_utilization" {
|
||||
resource "aws_cloudwatch_metric_alarm" "eks-pod_memory_utilization" {
|
||||
for_each = toset(var.pod-names)
|
||||
|
||||
alarm_name = "${var.cw-alarm-prefix}:EKS:pod_memory_utilization:${each.value}"
|
||||
alarm_name = "${var.cw-alarm-prefix}:EKS:${var.cluster-name}:${each.value}:${var.settings.alarm2.metric}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "3"
|
||||
metric_name = "pod_memory_utilization"
|
||||
period = "300"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-pod_memory_utilization
|
||||
alarm_description = "EKS:pod_memory_utilization"
|
||||
metric_name = var.settings.alarm2.metric
|
||||
period = var.settings.alarm2.period
|
||||
statistic = var.settings.alarm2.statistic
|
||||
threshold = var.settings.alarm2.threshold
|
||||
alarm_description = "EKS:${var.settings.alarm2.metric}"
|
||||
namespace = "ContainerInsights"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
alarm_actions = [var.settings.alarm2.action]
|
||||
ok_actions = [var.settings.alarm2.action]
|
||||
dimensions = {
|
||||
"PodName" = each.value
|
||||
"ClusterName" = var.cluster-name
|
||||
@ -56,19 +56,19 @@ resource "aws_cloudwatch_metric_alarm" "eks-pod_memory_utilization" {
|
||||
resource "aws_cloudwatch_metric_alarm" "eks-pod_number_of_container_restarts" {
|
||||
for_each = toset(var.pod-names)
|
||||
|
||||
alarm_name = "${var.cw-alarm-prefix}:EKS:pod_number_of_container_restarts:${each.value}"
|
||||
alarm_name = "${var.cw-alarm-prefix}:EKS:${var.cluster-name}:${each.value}:${var.settings.alarm3.metric}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "3"
|
||||
metric_name = "pod_number_of_container_restarts"
|
||||
period = "300"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-pod_number_of_container_restarts
|
||||
alarm_description = "EKS:pod_number_of_container_restarts"
|
||||
metric_name = var.settings.alarm3.metric
|
||||
period = var.settings.alarm3.period
|
||||
statistic = var.settings.alarm3.statistic
|
||||
threshold = var.settings.alarm3.threshold
|
||||
alarm_description = "EKS:${var.settings.alarm3.metric}"
|
||||
namespace = "ContainerInsights"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
alarm_actions = [var.settings.alarm3.action]
|
||||
ok_actions = [var.settings.alarm3.action]
|
||||
dimensions = {
|
||||
"PodName" = each.value
|
||||
"ClusterName" = var.cluster-name
|
||||
|
@ -1,6 +1,5 @@
|
||||
variable cw-alarm-prefix {}
|
||||
variable actions-enabled {}
|
||||
variable sns-targets {}
|
||||
variable default-tags {}
|
||||
|
||||
variable cluster-name {}
|
||||
@ -8,6 +7,4 @@ variable eks-namespace {}
|
||||
variable pod-names {
|
||||
type = list
|
||||
}
|
||||
variable threshold-pod_cpu_utilization {}
|
||||
variable threshold-pod_memory_utilization {}
|
||||
variable threshold-pod_number_of_container_restarts {}
|
||||
variable settings {}
|
@ -1,17 +1,18 @@
|
||||
resource "aws_cloudwatch_metric_alarm" "emr-AppsPending" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:EMR:AppsPending:${var.job-flow-id}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "1"
|
||||
metric_name = "AppsPending"
|
||||
period = "1800"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-AppsPending
|
||||
alarm_description = "EMR:AppsPending"
|
||||
resource "aws_cloudwatch_metric_alarm" "emr-alarms" {
|
||||
for_each = var.settings
|
||||
alarm_name = "${var.cw-alarm-prefix}:EMR:${each.value["metric"]}:${var.job-flow-id}"
|
||||
comparison_operator = each.value["comparison_operator"]
|
||||
evaluation_periods = each.value["evaluation_periods"]
|
||||
metric_name = each.value["metric"]
|
||||
period = each.value["period"]
|
||||
statistic = each.value["statistic"]
|
||||
threshold = each.value["threshold"]
|
||||
alarm_description = "EMR:${each.value["metric"]}"
|
||||
namespace = "AWS/ElasticMapReduce"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-standard]
|
||||
ok_actions = [var.sns-targets.alarm-actions-standard]
|
||||
alarm_actions = [each.value["action"]]
|
||||
ok_actions = [each.value["action"]]
|
||||
dimensions = {
|
||||
JobFlowId = var.job-flow-id
|
||||
}
|
||||
@ -21,6 +22,7 @@ resource "aws_cloudwatch_metric_alarm" "emr-AppsPending" {
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:EMR:CapacityRemainingGB:${var.job-flow-id}"
|
||||
comparison_operator = "LessThanThreshold"
|
||||
@ -28,13 +30,13 @@ resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" {
|
||||
metric_name = "CapacityRemainingGB"
|
||||
period = "3600"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-CapacityRemainingGB
|
||||
threshold = var.settings.CapacityRemainingGB.threshold
|
||||
alarm_description = "EMR:CapacityRemainingGB"
|
||||
namespace = "AWS/ElasticMapReduce"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
alarm_actions = [var.settings.CapacityRemainingGB.action]
|
||||
ok_actions = [var.settings.CapacityRemainingGB.action]
|
||||
dimensions = {
|
||||
JobFlowId = var.job-flow-id
|
||||
}
|
||||
@ -43,3 +45,4 @@ resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
*/
|
@ -1,8 +1,5 @@
|
||||
variable cw-alarm-prefix {}
|
||||
variable actions-enabled {}
|
||||
variable job-flow-id {}
|
||||
variable threshold-AppsPending {}
|
||||
variable threshold-CapacityRemainingGB {}
|
||||
|
||||
variable sns-targets {}
|
||||
variable settings {}
|
||||
variable default-tags {}
|
||||
|
@ -20,6 +20,6 @@ PATTERN
|
||||
|
||||
resource "aws_cloudwatch_event_target" "TargetForEventRule" {
|
||||
rule = aws_cloudwatch_event_rule.EventRule.name
|
||||
target_id = "rackspace-standard-sns"
|
||||
arn = var.sns-targets.alarm-actions-standard
|
||||
target_id = "health-event-notification-sns"
|
||||
arn = var.settings.healthEvents.action
|
||||
}
|
@ -1,5 +1,5 @@
|
||||
variable cw-alarm-prefix {}
|
||||
variable actions-enabled {}
|
||||
|
||||
variable sns-targets {}
|
||||
variable settings {}
|
||||
variable default-tags {}
|
||||
|
@ -1,17 +1,17 @@
|
||||
resource "aws_cloudwatch_metric_alarm" "Kafka-ZooKeeperRequestLatencyMsMean" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:Kafka:ZooKeeperRequestLatencyMsMean:${var.cluster-name}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "3"
|
||||
comparison_operator = var.settings.ZooKeeperRequestLatencyMsMean.comparison_operator
|
||||
evaluation_periods = var.settings.ZooKeeperRequestLatencyMsMean.evaluation_periods
|
||||
metric_name = "ZooKeeperRequestLatencyMsMean"
|
||||
period = "1800"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-ZooKeeperRequestLatencyMsMean
|
||||
period = var.settings.ZooKeeperRequestLatencyMsMean.period
|
||||
statistic = var.settings.ZooKeeperRequestLatencyMsMean.statistic
|
||||
threshold = var.settings.ZooKeeperRequestLatencyMsMean.threshold
|
||||
alarm_description = "Kafka:ZooKeeperRequestLatencyMsMean"
|
||||
namespace = "AWS/Kafka"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
alarm_actions = [var.settings.ZooKeeperRequestLatencyMsMean.action]
|
||||
ok_actions = [var.settings.ZooKeeperRequestLatencyMsMean.action]
|
||||
dimensions = {
|
||||
"Cluster Name" = var.cluster-name
|
||||
}
|
||||
@ -45,21 +45,21 @@ module "msk-brokers" {
|
||||
resource "aws_cloudwatch_metric_alarm" "Kafka-CpuUserSystem" {
|
||||
for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])
|
||||
alarm_name = "${var.cw-alarm-prefix}:Kafka:CpuUsage:${var.cluster-name}-${each.value}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "3"
|
||||
threshold = var.threshold-CpuUserSystem
|
||||
comparison_operator = var.settings.CpuUserSystem.comparison_operator
|
||||
evaluation_periods = var.settings.CpuUserSystem.evaluation_periods
|
||||
threshold = var.settings.CpuUserSystem.threshold
|
||||
alarm_description = "Kafka:ZooKeeperRequestLatencyMsMean"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
alarm_actions = [var.settings.CpuUserSystem.action]
|
||||
ok_actions = [var.settings.CpuUserSystem.action]
|
||||
metric_query {
|
||||
id = "m1"
|
||||
metric {
|
||||
metric_name = "CpuUser"
|
||||
namespace = "AWS/Kafka"
|
||||
period = 300
|
||||
stat = "Average"
|
||||
period = var.settings.CpuUserSystem.period
|
||||
stat = var.settings.CpuUserSystem.statistic
|
||||
dimensions = {
|
||||
"Cluster Name" = var.cluster-name
|
||||
"Broker ID" = each.value
|
||||
@ -72,8 +72,8 @@ resource "aws_cloudwatch_metric_alarm" "Kafka-CpuUserSystem" {
|
||||
metric {
|
||||
metric_name = "CpuSystem"
|
||||
namespace = "AWS/Kafka"
|
||||
period = 300
|
||||
stat = "Average"
|
||||
period = var.settings.CpuUserSystem.period
|
||||
stat = var.settings.CpuUserSystem.statistic
|
||||
dimensions = {
|
||||
"Cluster Name" = var.cluster-name
|
||||
"Broker ID" = each.value
|
||||
@ -97,18 +97,18 @@ resource "aws_cloudwatch_metric_alarm" "Kafka-CpuUserSystem" {
|
||||
resource "aws_cloudwatch_metric_alarm" "Kafka-KafkaDataLogsDiskUsed" {
|
||||
for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])
|
||||
alarm_name = "${var.cw-alarm-prefix}:Kafka:KafkaDataLogsDiskUsed:${var.cluster-name}-${each.value}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "3"
|
||||
comparison_operator = var.settings.KafkaDataLogsDiskUsed.comparison_operator
|
||||
evaluation_periods = var.settings.KafkaDataLogsDiskUsed.evaluation_periods
|
||||
metric_name = "KafkaDataLogsDiskUsed"
|
||||
period = "300"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-KafkaDataLogsDiskUsed
|
||||
period = var.settings.KafkaDataLogsDiskUsed.period
|
||||
statistic = var.settings.KafkaDataLogsDiskUsed.statistic
|
||||
threshold = var.settings.KafkaDataLogsDiskUsed.threshold
|
||||
alarm_description = "Kafka:KafkaDataLogsDiskUsed"
|
||||
namespace = "AWS/Kafka"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
alarm_actions = [var.settings.KafkaDataLogsDiskUsed.action]
|
||||
ok_actions = [var.settings.KafkaDataLogsDiskUsed.action]
|
||||
dimensions = {
|
||||
"Cluster Name" = var.cluster-name
|
||||
"Broker ID" = each.value
|
||||
@ -122,18 +122,18 @@ resource "aws_cloudwatch_metric_alarm" "Kafka-KafkaDataLogsDiskUsed" {
|
||||
resource "aws_cloudwatch_metric_alarm" "Kafka-HeapMemoryAfterGC" {
|
||||
for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])
|
||||
alarm_name = "${var.cw-alarm-prefix}:Kafka:HeapMemoryAfterGC:${var.cluster-name}-${each.value}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "3"
|
||||
comparison_operator = var.settings.HeapMemoryAfterGC.comparison_operator
|
||||
evaluation_periods = var.settings.HeapMemoryAfterGC.evaluation_periods
|
||||
metric_name = "HeapMemoryAfterGC"
|
||||
period = "300"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-HeapMemoryAfterGC
|
||||
period = var.settings.HeapMemoryAfterGC.period
|
||||
statistic = var.settings.HeapMemoryAfterGC.statistic
|
||||
threshold = var.settings.HeapMemoryAfterGC.threshold
|
||||
alarm_description = "Kafka:HeapMemoryAfterGC"
|
||||
namespace = "AWS/Kafka"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
alarm_actions = [var.settings.HeapMemoryAfterGC.action]
|
||||
ok_actions = [var.settings.HeapMemoryAfterGC.action]
|
||||
dimensions = {
|
||||
"Cluster Name" = var.cluster-name
|
||||
"Broker ID" = each.value
|
||||
|
@ -1,10 +1,6 @@
|
||||
variable cw-alarm-prefix {}
|
||||
variable actions-enabled {}
|
||||
variable cluster-name {}
|
||||
variable sns-targets {}
|
||||
variable settings {}
|
||||
variable default-tags {}
|
||||
|
||||
variable threshold-ZooKeeperRequestLatencyMsMean {}
|
||||
variable threshold-CpuUserSystem {}
|
||||
variable threshold-KafkaDataLogsDiskUsed {}
|
||||
variable threshold-HeapMemoryAfterGC {}
|
@ -1,17 +1,43 @@
|
||||
resource "aws_cloudwatch_metric_alarm" "ngw-ErrorPortAllocation" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:NGW:ErrorPortAllocation:${var.res-id}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
metric_name = "ErrorPortAllocation"
|
||||
period = "300"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-ErrorPortAllocation
|
||||
alarm_description = "NGW:ErrorPortAllocation"
|
||||
# NAT gateway CloudWatch alarms, one per entry in var.settings.
# Each entry is read for the keys: metric, comparison_operator,
# evaluation_periods, period, statistic, threshold and action.
# The alarm name is "<cw-alarm-prefix>:NGW:<metric>:<res-id>".
resource "aws_cloudwatch_metric_alarm" "ngw-alarms" {
|
||||
for_each = var.settings
|
||||
alarm_name = "${var.cw-alarm-prefix}:NGW:${each.value["metric"]}:${var.res-id}"
|
||||
comparison_operator = each.value["comparison_operator"]
|
||||
evaluation_periods = each.value["evaluation_periods"]
|
||||
metric_name = each.value["metric"]
|
||||
period = each.value["period"]
|
||||
statistic = each.value["statistic"]
|
||||
threshold = each.value["threshold"]
|
||||
alarm_description = "NGW:${each.value["metric"]}"
|
||||
namespace = "AWS/NATGateway"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
# NOTE(review): the two var.sns-targets lines below appear to be the pre-change
# side of this diff (the module's variables hunk drops sns-targets); the
# each.value["action"] pair that follows looks like the replacement - confirm.
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
# The same target from the settings entry is used for both ALARM and OK transitions.
alarm_actions = [each.value["action"]]
|
||||
ok_actions = [each.value["action"]]
|
||||
# Every alarm in this resource is scoped to the single NAT gateway var.res-id.
dimensions = {
|
||||
NatGatewayId = var.res-id
|
||||
}
|
||||
tags = var.default-tags
|
||||
# Tags are set from var.default-tags; later tag changes are ignored by Terraform.
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
resource "aws_cloudwatch_metric_alarm" "ngw-ErrorPortAllocation" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:NGW:${var.settings.alarm1.metric}:${var.res-id}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
metric_name = var.settings.alarm1.metric
|
||||
period = "300"
|
||||
statistic = "Average"
|
||||
threshold = var.settings.alarm1.threshold
|
||||
alarm_description = "NGW:${var.settings.alarm1.metric}"
|
||||
namespace = "AWS/NATGateway"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.settings.alarm1.action]
|
||||
ok_actions = [var.settings.alarm1.action]
|
||||
dimensions = {
|
||||
NatGatewayId = var.res-id
|
||||
}
|
||||
@ -22,19 +48,19 @@ resource "aws_cloudwatch_metric_alarm" "ngw-ErrorPortAllocation" {
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "ngw-ConnectionEstablishedCount" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:NGW:ConnectionEstablishedCount:${var.res-id}"
|
||||
alarm_name = "${var.cw-alarm-prefix}:NGW:${var.settings.alarm2.metric}:${var.res-id}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
metric_name = "ConnectionEstablishedCount"
|
||||
metric_name = var.settings.alarm2.metric
|
||||
period = "300"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-ConnectionEstablishedCount
|
||||
alarm_description = "NGW:ConnectionEstablishedCount"
|
||||
threshold = var.settings.alarm2.threshold
|
||||
alarm_description = "NGW:${var.settings.alarm2.metric}"
|
||||
namespace = "AWS/NATGateway"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-standard]
|
||||
ok_actions = [var.sns-targets.alarm-actions-standard]
|
||||
alarm_actions = [var.settings.alarm2.action]
|
||||
ok_actions = [var.settings.alarm2.action]
|
||||
dimensions = {
|
||||
NatGatewayId = var.res-id
|
||||
}
|
||||
@ -45,19 +71,19 @@ resource "aws_cloudwatch_metric_alarm" "ngw-ConnectionEstablishedCount" {
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "ngw-PacketsDropCount" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:NGW:PacketsDropCount:${var.res-id}"
|
||||
alarm_name = "${var.cw-alarm-prefix}:NGW:${var.settings.alarm3.metric}:${var.res-id}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
metric_name = "PacketsDropCount"
|
||||
metric_name = var.settings.alarm3.metric
|
||||
period = "300"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-PacketsDropCount
|
||||
alarm_description = "NGW:PacketsDropCount"
|
||||
threshold = var.settings.alarm3.threshold
|
||||
alarm_description = "NGW:${var.settings.alarm3.metric}"
|
||||
namespace = "AWS/NATGateway"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-standard]
|
||||
ok_actions = [var.sns-targets.alarm-actions-standard]
|
||||
alarm_actions = [var.settings.alarm3.action]
|
||||
ok_actions = [var.settings.alarm3.action]
|
||||
dimensions = {
|
||||
NatGatewayId = var.res-id
|
||||
}
|
||||
@ -66,3 +92,4 @@ resource "aws_cloudwatch_metric_alarm" "ngw-PacketsDropCount" {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
*/
|
@ -1,8 +1,5 @@
|
||||
variable cw-alarm-prefix {}
|
||||
variable actions-enabled {}
|
||||
variable res-id {}
|
||||
variable threshold-ErrorPortAllocation { }
|
||||
variable threshold-ConnectionEstablishedCount {}
|
||||
variable threshold-PacketsDropCount {}
|
||||
variable sns-targets {}
|
||||
variable default-tags {}
|
||||
variable settings {}
|
@ -17,18 +17,18 @@ module "nlb-targetgroups" {
|
||||
resource "aws_cloudwatch_metric_alarm" "nlb-HealthyHostCount" {
|
||||
for_each = module.nlb-targetgroups.result-set
|
||||
alarm_name = "${var.cw-alarm-prefix}:NLBTG:HealthyHostCount:${split(":", each.value)[5]}"
|
||||
comparison_operator = "LessThanThreshold"
|
||||
evaluation_periods = "1"
|
||||
comparison_operator = var.settings.HealthHostCountMin.comparison_operator
|
||||
evaluation_periods = var.settings.HealthHostCountMin.evaluation_periods
|
||||
metric_name = "HealthyHostCount"
|
||||
period = "300"
|
||||
statistic = "Minimum"
|
||||
threshold = var.threshold-HealthHostCountMin
|
||||
period = var.settings.HealthHostCountMin.period
|
||||
statistic = var.settings.HealthHostCountMin.statistic
|
||||
threshold = var.settings.HealthHostCountMin.threshold
|
||||
alarm_description = "NLBTG:HealthyHostCount"
|
||||
namespace = "AWS/NetworkELB"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-emergency]
|
||||
ok_actions = [var.sns-targets.alarm-actions-emergency]
|
||||
alarm_actions = [var.settings.HealthHostCountMin.action]
|
||||
ok_actions = [var.settings.HealthHostCountMin.action]
|
||||
dimensions = {
|
||||
TargetGroup = split(":", each.value)[5]
|
||||
LoadBalancer = "net/${split("/", var.load-balancer)[2]}/${split("/", var.load-balancer)[3]}"
|
||||
|
@ -1,6 +1,5 @@
|
||||
variable cw-alarm-prefix {}
|
||||
variable actions-enabled {}
|
||||
variable load-balancer {}
|
||||
variable threshold-HealthHostCountMin {}
|
||||
variable sns-targets {}
|
||||
variable settings {}
|
||||
variable default-tags {}
|
@ -1,307 +1,20 @@
|
||||
data "aws_caller_identity" "this" {}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "ES-CPUUtilization" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:ES:CPUUtilization:${var.domain-name}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "3"
|
||||
metric_name = "CPUUtilization"
|
||||
period = "1800"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-CPUUtilization
|
||||
alarm_description = "ES:CPUUtilization"
|
||||
resource "aws_cloudwatch_metric_alarm" "ES-alarms" {
|
||||
for_each = var.settings
|
||||
alarm_name = "${var.cw-alarm-prefix}:ES:${each.value["metric"]}:${var.domain-name}"
|
||||
comparison_operator = each.value["comparison_operator"]
|
||||
evaluation_periods = each.value["evaluation_periods"]
|
||||
metric_name = each.value["metric"]
|
||||
period = each.value["period"]
|
||||
statistic = each.value["statistic"]
|
||||
threshold = each.value["threshold"]
|
||||
alarm_description = "ES:${each.value["metric"]}"
|
||||
namespace = "AWS/ES"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
dimensions = {
|
||||
DomainName = var.domain-name
|
||||
ClientId = data.aws_caller_identity.this.id
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "ES-SearchLatency" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:ES:SearchLatency:${var.domain-name}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "3"
|
||||
metric_name = "SearchLatency"
|
||||
period = "1800"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-SearchLatency
|
||||
alarm_description = "ES:SearchLatency"
|
||||
namespace = "AWS/ES"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
dimensions = {
|
||||
DomainName = var.domain-name
|
||||
ClientId = data.aws_caller_identity.this.id
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "ES-IndexingLatency" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:ES:IndexingLatency:${var.domain-name}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "3"
|
||||
metric_name = "IndexingLatency"
|
||||
period = "1800"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-IndexingLatency
|
||||
alarm_description = "ES:IndexingLatency"
|
||||
namespace = "AWS/ES"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
dimensions = {
|
||||
DomainName = var.domain-name
|
||||
ClientId = data.aws_caller_identity.this.id
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "ES-ClusterStatusRed" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:ES:ClusterStatusRed:${var.domain-name}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
metric_name = "ClusterStatus.red"
|
||||
period = "900"
|
||||
statistic = "Maximum"
|
||||
threshold = 0
|
||||
alarm_description = "At least one primary shard and its replicas aren't allocated to a node."
|
||||
namespace = "AWS/ES"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
dimensions = {
|
||||
DomainName = var.domain-name
|
||||
ClientId = data.aws_caller_identity.this.id
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolWriteQueue" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:ES:ThreadpoolWriteQueue:${var.domain-name}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
metric_name = "ThreadpoolWriteQueue"
|
||||
period = "60"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-ThreadpoolWriteQueue
|
||||
alarm_description = "The cluster is experiencing high indexing concurrency. Review and control indexing requests, or increase cluster resources."
|
||||
namespace = "AWS/ES"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-standard]
|
||||
ok_actions = [var.sns-targets.alarm-actions-standard]
|
||||
dimensions = {
|
||||
DomainName = var.domain-name
|
||||
ClientId = data.aws_caller_identity.this.id
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolSearchQueue" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:ES:ThreadpoolSearchQueue:${var.domain-name}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
metric_name = "ThreadpoolSearchQueue"
|
||||
period = "60"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-ThreadpoolSearchQueue
|
||||
alarm_description = "The cluster is experiencing high search concurrency. Consider scaling your cluster. You can also increase the search queue size, but increasing it excessively can cause out of memory errors."
|
||||
namespace = "AWS/ES"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-standard]
|
||||
ok_actions = [var.sns-targets.alarm-actions-standard]
|
||||
dimensions = {
|
||||
DomainName = var.domain-name
|
||||
ClientId = data.aws_caller_identity.this.id
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolSearchRejected" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:ES:ThreadpoolSearchRejected:${var.domain-name}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
metric_name = "ThreadpoolSearchRejected"
|
||||
period = "60"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-ThreadpoolSearchRejected
|
||||
alarm_description = "These alarms notify you of domain issues that might impact performance and stability."
|
||||
namespace = "AWS/ES"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
dimensions = {
|
||||
DomainName = var.domain-name
|
||||
ClientId = data.aws_caller_identity.this.id
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolWriteRejected" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:ES:ThreadpoolWriteRejected:${var.domain-name}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
metric_name = "ThreadpoolWriteRejected"
|
||||
period = "60"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-ThreadpoolWriteRejected
|
||||
alarm_description = "These alarms notify you of domain issues that might impact performance and stability."
|
||||
namespace = "AWS/ES"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
dimensions = {
|
||||
DomainName = var.domain-name
|
||||
ClientId = data.aws_caller_identity.this.id
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "ES-MasterCPUUtilization" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:ES:MasterCPUUtilization:${var.domain-name}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
metric_name = "MasterCPUUtilization"
|
||||
period = "300"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-MasterCPUUtilization
|
||||
alarm_description = "MasterCPUUtilization"
|
||||
namespace = "AWS/ES"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
dimensions = {
|
||||
DomainName = var.domain-name
|
||||
ClientId = data.aws_caller_identity.this.id
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "ES-MasterJVMMemoryPressure" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:ES:MasterJVMMemoryPressure:${var.domain-name}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
metric_name = "MasterJVMMemoryPressure"
|
||||
period = "60"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-MasterJVMMemoryPressure
|
||||
alarm_description = "MasterJVMMemoryPressure"
|
||||
namespace = "AWS/ES"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
dimensions = {
|
||||
DomainName = var.domain-name
|
||||
ClientId = data.aws_caller_identity.this.id
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "ES-JVMMemoryPressure" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:ES:JVMMemoryPressure:${var.domain-name}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
metric_name = "JVMMemoryPressure"
|
||||
period = "60"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-JVMMemoryPressure
|
||||
alarm_description = "The cluster could encounter out of memory errors if usage increases. Consider scaling vertically. OpenSearch Service uses half of an instance's RAM for the Java heap, up to a heap size of 32 GiB. You can scale instances vertically up to 64 GiB of RAM, at which point you can scale horizontally by adding instances."
|
||||
namespace = "AWS/ES"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
dimensions = {
|
||||
DomainName = var.domain-name
|
||||
ClientId = data.aws_caller_identity.this.id
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "ES-ClusterIndexWritesBlocked" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:ES:ClusterIndexWritesBlocked:${var.domain-name}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
metric_name = "ClusterIndexWritesBlocked"
|
||||
period = "60"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-ClusterIndexWritesBlocked
|
||||
alarm_description = "Your cluster is blocking write requests. See ClusterBlockException."
|
||||
namespace = "AWS/ES"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
dimensions = {
|
||||
DomainName = var.domain-name
|
||||
ClientId = data.aws_caller_identity.this.id
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "ES-FreeStorageSpace" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:ES:FreeStorageSpace:${var.domain-name}"
|
||||
comparison_operator = "LessThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
metric_name = "FreeStorageSpace"
|
||||
period = "300"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-FreeStorageSpace
|
||||
alarm_description = "A node in your cluster is low on free storage space."
|
||||
namespace = "AWS/ES"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
alarm_actions = [each.value["action"]]
|
||||
ok_actions = [each.value["action"]]
|
||||
dimensions = {
|
||||
DomainName = var.domain-name
|
||||
ClientId = data.aws_caller_identity.this.id
|
||||
|
@ -1,19 +1,6 @@
|
||||
variable "cw-alarm-prefix" {}
|
||||
variable "actions-enabled" {}
|
||||
variable "domain-name" {}
|
||||
variable "sns-targets" {}
|
||||
variable "settings" {}
|
||||
variable "default-tags" {}
|
||||
|
||||
variable "threshold-CPUUtilization" {}
|
||||
variable "threshold-SearchLatency" {}
|
||||
variable "threshold-IndexingLatency" {}
|
||||
variable "threshold-ThreadpoolWriteQueue" {}
|
||||
variable "threshold-ThreadpoolSearchQueue" {}
|
||||
variable "threshold-ThreadpoolSearchRejected" {}
|
||||
variable "threshold-ThreadpoolWriteRejected" {}
|
||||
variable "threshold-MasterCPUUtilization" {}
|
||||
variable "threshold-MasterJVMMemoryPressure" {}
|
||||
variable "threshold-JVMMemoryPressure" {}
|
||||
variable "threshold-ClusterIndexWritesBlocked" {}
|
||||
variable "threshold-FreeStorageSpace" {}
|
||||
# variable threshold-KibanaHealthyNodes {}
|
@ -1,132 +1,18 @@
|
||||
resource "aws_cloudwatch_metric_alarm" "rds-cpu" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:RDS:CpuUtilization:${var.rds-instance-name}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "1"
|
||||
metric_name = "CPUUtilization"
|
||||
period = "3600"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-CpuUtilization
|
||||
alarm_description = "RDS:CpuUtilization"
|
||||
resource "aws_cloudwatch_metric_alarm" "rds-alarms" {
|
||||
for_each = var.settings
|
||||
alarm_name = "${var.cw-alarm-prefix}:RDS:${each.value["metric"]}:${var.rds-instance-name}"
|
||||
comparison_operator = each.value["comparison_operator"]
|
||||
evaluation_periods = each.value["evaluation_periods"]
|
||||
metric_name = each.value["metric"]
|
||||
period = each.value["period"]
|
||||
statistic = each.value["statistic"]
|
||||
threshold = each.value["threshold"]
|
||||
alarm_description = "RDS:${each.value["metric"]}"
|
||||
namespace = "AWS/RDS"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
dimensions = {
|
||||
DBInstanceIdentifier = var.rds-instance-name
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "rds-storage" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:RDS:FreeStorageSpace:${var.rds-instance-name}"
|
||||
comparison_operator = "LessThanThreshold"
|
||||
evaluation_periods = "1"
|
||||
metric_name = "FreeStorageSpace"
|
||||
period = "3600"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-FreeStorageSpace
|
||||
alarm_description = "RDS:FreeStorageSpace"
|
||||
namespace = "AWS/RDS"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
dimensions = {
|
||||
DBInstanceIdentifier = var.rds-instance-name
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "rds-memory" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:RDS:FreeableMemory:${var.rds-instance-name}"
|
||||
comparison_operator = "LessThanThreshold"
|
||||
evaluation_periods = "1"
|
||||
metric_name = "FreeableMemory"
|
||||
period = "3600"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-FreeableMemory
|
||||
alarm_description = "RDS:FreeableMemory"
|
||||
namespace = "AWS/RDS"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
dimensions = {
|
||||
DBInstanceIdentifier = var.rds-instance-name
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "rds-DiskQueueDepth" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:RDS:DiskQueueDepth:${var.rds-instance-name}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
metric_name = "DiskQueueDepth"
|
||||
period = "300"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-DiskQueueDepth
|
||||
alarm_description = "RDS:DiskQueueDepth"
|
||||
namespace = "AWS/RDS"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
dimensions = {
|
||||
DBInstanceIdentifier = var.rds-instance-name
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "rds-ReadLatency" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:RDS:ReadLatency:${var.rds-instance-name}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
metric_name = "ReadLatency"
|
||||
period = "900"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-ReadLatency
|
||||
alarm_description = "RDS:ReadLatency"
|
||||
namespace = "AWS/RDS"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
dimensions = {
|
||||
DBInstanceIdentifier = var.rds-instance-name
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "rds-WriteLatency" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:RDS:WriteLatency:${var.rds-instance-name}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
metric_name = "WriteLatency"
|
||||
period = "900"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-WriteLatency
|
||||
alarm_description = "RDS:WriteLatency"
|
||||
namespace = "AWS/RDS"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
alarm_actions = [each.value["action"]]
|
||||
ok_actions = [each.value["action"]]
|
||||
dimensions = {
|
||||
DBInstanceIdentifier = var.rds-instance-name
|
||||
}
|
||||
|
@ -1,12 +1,5 @@
|
||||
variable cw-alarm-prefix {}
|
||||
variable actions-enabled {}
|
||||
variable rds-instance-name {}
|
||||
variable sns-targets {}
|
||||
variable settings {}
|
||||
variable default-tags {}
|
||||
|
||||
variable threshold-FreeableMemory {}
|
||||
variable threshold-CpuUtilization {}
|
||||
variable threshold-FreeStorageSpace {}
|
||||
variable threshold-DiskQueueDepth {}
|
||||
variable threshold-ReadLatency {}
|
||||
variable threshold-WriteLatency {}
|
@ -1,17 +1,18 @@
|
||||
resource "aws_cloudwatch_metric_alarm" "redis-EngineCPUUtilization" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:Redis:EngineCPUUtilization:${var.redis-cluster-id}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "1"
|
||||
metric_name = "EngineCPUUtilization"
|
||||
period = "3600"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-EngineCPUUtilization
|
||||
alarm_description = "Redis:EngineCPUUtilization"
|
||||
namespace = "AWS/ElastiCache"
|
||||
resource "aws_cloudwatch_metric_alarm" "redis-alarms" {
|
||||
for_each = var.settings
|
||||
alarm_name = "${var.cw-alarm-prefix}:Redis:${each.value["metric"]}:${var.redis-cluster-id}"
|
||||
comparison_operator = each.value["comparison_operator"]
|
||||
evaluation_periods = each.value["evaluation_periods"]
|
||||
metric_name = each.value["metric"]
|
||||
period = each.value["period"]
|
||||
statistic = each.value["statistic"]
|
||||
threshold = each.value["threshold"]
|
||||
alarm_description = "Redis:${each.value["metric"]}"
|
||||
# Redis alarms use the CacheClusterId dimension, so they must query the ElastiCache namespace (not NAT gateway).
namespace = "AWS/ElastiCache"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
alarm_actions = [each.value["action"]]
|
||||
ok_actions = [each.value["action"]]
|
||||
dimensions = {
|
||||
CacheClusterId = var.redis-cluster-id
|
||||
}
|
||||
@ -20,74 +21,3 @@ resource "aws_cloudwatch_metric_alarm" "redis-EngineCPUUtilization" {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "redis-DatabaseMemoryUsagePercentage" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:Redis:DatabaseMemoryUsagePercentage:${var.redis-cluster-id}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "1"
|
||||
metric_name = "DatabaseMemoryUsagePercentage"
|
||||
period = "3600"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-DatabaseMemoryUsagePercentage
|
||||
alarm_description = "Redis:DatabaseMemoryUsagePercentage"
|
||||
namespace = "AWS/ElastiCache"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
ok_actions = [var.sns-targets.alarm-actions-urgent]
|
||||
dimensions = {
|
||||
CacheClusterId = var.redis-cluster-id
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "redis-CacheHitRate" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:Redis:CacheHitRate:${var.redis-cluster-id}"
|
||||
comparison_operator = "LessThanThreshold"
|
||||
evaluation_periods = "4"
|
||||
metric_name = "CacheHitRate"
|
||||
period = "900"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-CacheHitRate
|
||||
alarm_description = "Redis:CacheHitRate"
|
||||
namespace = "AWS/ElastiCache"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-standard]
|
||||
ok_actions = [var.sns-targets.alarm-actions-standard]
|
||||
dimensions = {
|
||||
CacheClusterId = var.redis-cluster-id
|
||||
# CacheNodeId = each.value
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_metric_alarm" "redis-StringBasedCmdsLatency" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:Redis:StringBasedCmdsLatency:${var.redis-cluster-id}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "3"
|
||||
metric_name = "StringBasedCmdsLatency"
|
||||
period = "60"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-StringBasedCmdsLatency
|
||||
alarm_description = "The average latency, in microseconds, of the string-based commands run during a selected time range"
|
||||
namespace = "AWS/ElastiCache"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-standard]
|
||||
ok_actions = [var.sns-targets.alarm-actions-standard]
|
||||
dimensions = {
|
||||
CacheClusterId = var.redis-cluster-id
|
||||
# CacheNodeId = each.value
|
||||
}
|
||||
tags = var.default-tags
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
@ -1,10 +1,5 @@
|
||||
variable "cw-alarm-prefix" {}
|
||||
variable "actions-enabled" {}
|
||||
variable "redis-cluster-id" {}
|
||||
variable "sns-targets" {}
|
||||
variable "settings" {}
|
||||
variable "default-tags" {}
|
||||
|
||||
variable "threshold-EngineCPUUtilization" {}
|
||||
variable "threshold-DatabaseMemoryUsagePercentage" {}
|
||||
variable "threshold-CacheHitRate" {}
|
||||
variable "threshold-StringBasedCmdsLatency" {}
|
@ -1,17 +1,17 @@
|
||||
resource "aws_cloudwatch_metric_alarm" "tgw-PacketDropCountNoRoute" {
|
||||
alarm_name = "${var.cw-alarm-prefix}:TGW:PacketDropCountNoRoute:${var.tgw-id}"
|
||||
comparison_operator = "GreaterThanThreshold"
|
||||
evaluation_periods = "2"
|
||||
comparison_operator = var.settings.PacketDropCountNoRoute.comparison_operator
|
||||
evaluation_periods = var.settings.PacketDropCountNoRoute.evaluation_periods
|
||||
metric_name = "PacketDropCountNoRoute"
|
||||
period = "300"
|
||||
statistic = "Average"
|
||||
threshold = var.threshold-PacketDropCountNoRoute
|
||||
period = var.settings.PacketDropCountNoRoute.period
|
||||
statistic = var.settings.PacketDropCountNoRoute.statistic
|
||||
threshold = var.settings.PacketDropCountNoRoute.threshold
|
||||
alarm_description = "TGW:PacketDropCountNoRoute"
|
||||
namespace = "AWS/TransitGateway"
|
||||
insufficient_data_actions = []
|
||||
actions_enabled = var.actions-enabled
|
||||
alarm_actions = [var.sns-targets.alarm-actions-standard]
|
||||
ok_actions = [var.sns-targets.alarm-actions-standard]
|
||||
alarm_actions = [var.settings.PacketDropCountNoRoute.action]
|
||||
ok_actions = [var.settings.PacketDropCountNoRoute.action]
|
||||
dimensions = {
|
||||
TransitGateway = var.tgw-id
|
||||
}
|
||||
|
@ -1,7 +1,5 @@
|
||||
variable cw-alarm-prefix {}
|
||||
variable actions-enabled {}
|
||||
variable tgw-id {}
|
||||
variable threshold-PacketDropCountNoRoute {}
|
||||
|
||||
variable sns-targets {}
|
||||
variable settings {}
|
||||
variable default-tags {}
|
||||
|
Loading…
Reference in New Issue
Block a user