From 48191b808f03054ac50b37952e9d11d73f1e8298 Mon Sep 17 00:00:00 2001 From: xpk Date: Thu, 5 Jan 2023 23:35:31 +0800 Subject: [PATCH] UPD: various bug fixes and enhancements --- .../Monitoring.ASG/main.tf | 14 +- .../Monitoring.ASG/variables.tf | 4 +- .../Monitoring.EC2/main.tf | 144 ++++---- .../Monitoring.EC2/variables.tf | 16 +- .../Monitoring.EKS/main.tf | 52 +-- .../Monitoring.EKS/variables.tf | 5 +- .../Monitoring.EMR/main.tf | 33 +- .../Monitoring.EMR/variables.tf | 5 +- .../Monitoring.EventBridge/main.tf | 8 +- .../Monitoring.EventBridge/variables.tf | 2 +- .../Monitoring.Kafka/main.tf | 60 ++-- .../Monitoring.Kafka/variables.tf | 6 +- .../Monitoring.NGW/main.tf | 75 +++-- .../Monitoring.NGW/variables.tf | 5 +- .../Monitoring.NLB/main.tf | 14 +- .../Monitoring.NLB/variables.tf | 3 +- .../Monitoring.OpenSearch/main.tf | 311 +----------------- .../Monitoring.OpenSearch/variables.tf | 15 +- .../Monitoring.RDS/main.tf | 138 +------- .../Monitoring.RDS/variables.tf | 9 +- .../Monitoring.Redis/main.tf | 96 +----- .../Monitoring.Redis/variables.tf | 7 +- .../Monitoring.TGW/main.tf | 14 +- .../Monitoring.TGW/variables.tf | 4 +- 24 files changed, 272 insertions(+), 768 deletions(-) diff --git a/modules/ManagementGovernance/Monitoring.ASG/main.tf b/modules/ManagementGovernance/Monitoring.ASG/main.tf index 74ed3cf..39d1a9a 100644 --- a/modules/ManagementGovernance/Monitoring.ASG/main.tf +++ b/modules/ManagementGovernance/Monitoring.ASG/main.tf @@ -1,17 +1,17 @@ resource "aws_cloudwatch_metric_alarm" "asg-CPUUtilization" { alarm_name = "${var.cw-alarm-prefix}:ASG:CPUUtilization:${var.asg-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "3" + comparison_operator = var.settings.CPUUtilization.comparison_operator + evaluation_periods = var.settings.CPUUtilization.evaluation_periods metric_name = "CPUUtilization" - period = "1800" - statistic = "Average" - threshold = var.threshold-CPUUtilization + period = var.settings.CPUUtilization.period + 
statistic = var.settings.CPUUtilization.statistic + threshold = var.settings.CPUUtilization.threshold alarm_description = "ASG:CPUUtilization" namespace = "AWS/EC2" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [var.settings.CPUUtilization.action] + ok_actions = [var.settings.CPUUtilization.action] dimensions = { AutoScalingGroupName = var.asg-name } diff --git a/modules/ManagementGovernance/Monitoring.ASG/variables.tf b/modules/ManagementGovernance/Monitoring.ASG/variables.tf index f5b70c9..2f916c3 100644 --- a/modules/ManagementGovernance/Monitoring.ASG/variables.tf +++ b/modules/ManagementGovernance/Monitoring.ASG/variables.tf @@ -1,7 +1,5 @@ variable cw-alarm-prefix {} variable actions-enabled {} variable asg-name {} -variable sns-targets {} +variable settings {} variable default-tags {} - -variable threshold-CPUUtilization {} \ No newline at end of file diff --git a/modules/ManagementGovernance/Monitoring.EC2/main.tf b/modules/ManagementGovernance/Monitoring.EC2/main.tf index 808b02d..d880979 100644 --- a/modules/ManagementGovernance/Monitoring.EC2/main.tf +++ b/modules/ManagementGovernance/Monitoring.EC2/main.tf @@ -1,17 +1,17 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" { alarm_name = "${var.cw-alarm-prefix}:EC2:StatusCheckFailed_System:${var.ec2-instance-id}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "1" + comparison_operator = var.settings.StatusCheckFailed_System.comparison_operator + evaluation_periods = var.settings.StatusCheckFailed_System.evaluation_periods metric_name = "StatusCheckFailed_System" - period = "300" - statistic = "Maximum" - threshold = 0 + period = var.settings.StatusCheckFailed_System.period + statistic = var.settings.StatusCheckFailed_System.statistic + threshold = var.settings.StatusCheckFailed_System.threshold alarm_description 
= "EC2:StatusCheckFailed_System" namespace = "AWS/EC2" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-emergency] - ok_actions = [var.sns-targets.alarm-actions-emergency] + alarm_actions = [var.settings.StatusCheckFailed_System.action] + ok_actions = [var.settings.StatusCheckFailed_System.action] dimensions = { InstanceId = var.ec2-instance-id } @@ -23,18 +23,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" { resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" { alarm_name = "${var.cw-alarm-prefix}:EC2:StatusCheckFailed_Instance:${var.ec2-instance-id}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "1" + comparison_operator = var.settings.StatusCheckFailed_Instance.comparison_operator + evaluation_periods = var.settings.StatusCheckFailed_Instance.evaluation_periods metric_name = "StatusCheckFailed_Instance" - period = "300" - statistic = "Maximum" - threshold = 0 + period = var.settings.StatusCheckFailed_Instance.period + statistic = var.settings.StatusCheckFailed_Instance.statistic + threshold = var.settings.StatusCheckFailed_Instance.threshold alarm_description = "EC2:StatusCheckFailed_Instance" namespace = "AWS/EC2" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-emergency] - ok_actions = [var.sns-targets.alarm-actions-emergency] + alarm_actions = [var.settings.StatusCheckFailed_Instance.action] + ok_actions = [var.settings.StatusCheckFailed_Instance.action] dimensions = { InstanceId = var.ec2-instance-id } @@ -46,18 +46,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" { resource "aws_cloudwatch_metric_alarm" "ec2-CPUUtilization" { alarm_name = "${var.cw-alarm-prefix}:EC2:CPUUtilization:${var.ec2-instance-id}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "6" + comparison_operator = 
var.settings.CPUUtilization.comparison_operator + evaluation_periods = var.settings.CPUUtilization.evaluation_periods metric_name = "CPUUtilization" - period = "300" - statistic = "Average" - threshold = var.threshold-CPUUtilization + period = var.settings.CPUUtilization.period + statistic = var.settings.CPUUtilization.statistic + threshold = var.settings.CPUUtilization.threshold alarm_description = "EC2:CPUUtilization" namespace = "AWS/EC2" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [var.settings.CPUUtilization.action] + ok_actions = [var.settings.CPUUtilization.action] treat_missing_data = "notBreaching" dimensions = { InstanceId = var.ec2-instance-id @@ -114,18 +114,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-mem_free" { resource "aws_cloudwatch_metric_alarm" "ec2-mem_used_percent" { count = data.external.ec2-os.result.os == "Linux" ? 
1 : 0 alarm_name = "${var.cw-alarm-prefix}:EC2:mem_used_percent:${var.ec2-instance-id}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" + comparison_operator = var.settings.mem_used_percent.comparison_operator + evaluation_periods = var.settings.mem_used_percent.evaluation_periods metric_name = "mem_used_percent" - period = "900" - statistic = "Average" - threshold = var.threshold-mem_used_percent + period = var.settings.mem_used_percent.period + statistic = var.settings.mem_used_percent.statistic + threshold = var.settings.mem_used_percent.threshold alarm_description = "EC2:mem_used_percent" namespace = "CWAgent" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-standard] - ok_actions = [var.sns-targets.alarm-actions-standard] + alarm_actions = [var.settings.mem_used_percent.action] + ok_actions = [var.settings.mem_used_percent.action] dimensions = { InstanceId = var.ec2-instance-id ImageId = data.aws_instance.ec2-instance.ami @@ -190,18 +190,18 @@ data "external" "cw-dimensions" { resource "aws_cloudwatch_metric_alarm" "ec2-swap_used_percent" { count = data.external.ec2-os.result.os == "Linux" ? 
1 : 0 alarm_name = "${var.cw-alarm-prefix}:EC2:swap_used_percent:${var.ec2-instance-id}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" + comparison_operator = var.settings.swap_used_percent.comparison_operator + evaluation_periods = var.settings.swap_used_percent.evaluation_periods metric_name = "swap_used_percent" - period = "900" - statistic = "Average" - threshold = var.threshold-swap_used_percent + period = var.settings.swap_used_percent.period + statistic = var.settings.swap_used_percent.statistic + threshold = var.settings.swap_used_percent.threshold alarm_description = "EC2:swap_used_percent" namespace = "CWAgent" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [var.settings.swap_used_percent.action] + ok_actions = [var.settings.swap_used_percent.action] dimensions = { InstanceId = var.ec2-instance-id ImageId = data.aws_instance.ec2-instance.ami @@ -242,18 +242,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_free" { resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent" { count = data.external.ec2-os.result.os == "Linux" && length(data.external.cw-dimensions.result) > 0 ? 
1 : 0 alarm_name = "${var.cw-alarm-prefix}:EC2:disk_used_percent:${var.ec2-instance-id}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" + comparison_operator = var.settings.disk_used_percent.comparison_operator + evaluation_periods = var.settings.disk_used_percent.evaluation_periods metric_name = "disk_used_percent" - period = "900" - statistic = "Average" - threshold = var.threshold-disk_used_percentage + period = var.settings.disk_used_percent.period + statistic = var.settings.disk_used_percent.statistic + threshold = var.settings.disk_used_percent.threshold alarm_description = "EC2:disk_used_percent" namespace = "CWAgent" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [var.settings.disk_used_percent.action] + ok_actions = [var.settings.disk_used_percent.action] dimensions = data.external.cw-dimensions.result tags = var.default-tags @@ -266,18 +266,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent" { resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" { count = data.external.ec2-os.result.os == "Linux" && length(data.external.cw-dimensions.result) > 0 ? 
1 : 0 alarm_name = "${var.cw-alarm-prefix}:EC2:disk_inodes_free:${var.ec2-instance-id}" - comparison_operator = "LessThanThreshold" - evaluation_periods = "2" + comparison_operator = var.settings.disk_inodes_free.comparison_operator + evaluation_periods = var.settings.disk_inodes_free.evaluation_periods metric_name = "disk_inodes_free" - period = "300" - statistic = "Average" - threshold = var.threshold-disk_inodes_free + period = var.settings.disk_inodes_free.period + statistic = var.settings.disk_inodes_free.statistic + threshold = var.settings.disk_inodes_free.threshold alarm_description = "EC2:disk_inodes_free" namespace = "CWAgent" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [var.settings.disk_inodes_free.action] + ok_actions = [var.settings.disk_inodes_free.action] dimensions = data.external.cw-dimensions.result /* dimensions = { @@ -299,18 +299,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" { resource "aws_cloudwatch_metric_alarm" "ec2-processes_total" { count = data.external.ec2-os.result.os == "Linux" ? 
1 : 0 alarm_name = "${var.cw-alarm-prefix}:EC2:processes_total:${var.ec2-instance-id}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" + comparison_operator = var.settings.processes_total.comparison_operator + evaluation_periods = var.settings.processes_total.evaluation_periods metric_name = "processes_total" - period = "900" - statistic = "Average" - threshold = var.threshold-processes_total + period = var.settings.processes_total.period + statistic = var.settings.processes_total.statistic + threshold = var.settings.processes_total.threshold alarm_description = "EC2:processes_total" namespace = "CWAgent" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [var.settings.processes_total.action] + ok_actions = [var.settings.processes_total.action] dimensions = { InstanceId = var.ec2-instance-id ImageId = data.aws_instance.ec2-instance.ami @@ -327,18 +327,18 @@ resource "aws_cloudwatch_metric_alarm" "ec2-processes_total" { resource "aws_cloudwatch_metric_alarm" "ec2-MemoryCommittedPct" { count = data.external.ec2-os.result.os == "Windows" ? 
1 : 0 alarm_name = "${var.cw-alarm-prefix}:EC2:MemoryCommittedPct:${var.ec2-instance-id}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" + comparison_operator = var.settings.MemoryCommittedPct.comparison_operator + evaluation_periods = var.settings.MemoryCommittedPct.evaluation_periods metric_name = "Memory % Committed Bytes In Use" - period = "900" - statistic = "Average" - threshold = var.threshold-MemoryCommittedPct + period = var.settings.MemoryCommittedPct.period + statistic = var.settings.MemoryCommittedPct.statistic + threshold = var.settings.MemoryCommittedPct.threshold alarm_description = "EC2:MemoryCommittedBytes" namespace = "CWAgent" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [var.settings.MemoryCommittedPct.action] + ok_actions = [var.settings.MemoryCommittedPct.action] dimensions = { objectname = "Memory" InstanceId = var.ec2-instance-id @@ -351,21 +351,21 @@ resource "aws_cloudwatch_metric_alarm" "ec2-MemoryCommittedPct" { } } -resource "aws_cloudwatch_metric_alarm" "ec2-OsDiskFreePct" { +resource "aws_cloudwatch_metric_alarm" "ec2-LogicalDiskFreePct" { count = data.external.ec2-os.result.os == "Windows" ? 
1 : 0 - alarm_name = "${var.cw-alarm-prefix}:EC2:OsDiskFreePct:${var.ec2-instance-id}" - comparison_operator = "LessThanThreshold" - evaluation_periods = "2" + alarm_name = "${var.cw-alarm-prefix}:EC2:LogicalDiskFreePct:${var.ec2-instance-id}" + comparison_operator = var.settings.LogicalDiskFreePct.comparison_operator + evaluation_periods = var.settings.LogicalDiskFreePct.evaluation_periods metric_name = "LogicalDisk % Free Space" - period = "300" - statistic = "Average" - threshold = var.threshold-LogicalDiskFreePct + period = var.settings.LogicalDiskFreePct.period + statistic = var.settings.LogicalDiskFreePct.statistic + threshold = var.settings.LogicalDiskFreePct.threshold alarm_description = "EC2:OsDiskFreePct" namespace = "CWAgent" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [var.settings.LogicalDiskFreePct.action] + ok_actions = [var.settings.LogicalDiskFreePct.action] dimensions = { instance = "C:" objectname = "LogicalDisk" diff --git a/modules/ManagementGovernance/Monitoring.EC2/variables.tf b/modules/ManagementGovernance/Monitoring.EC2/variables.tf index c8ddc97..39caea2 100644 --- a/modules/ManagementGovernance/Monitoring.EC2/variables.tf +++ b/modules/ManagementGovernance/Monitoring.EC2/variables.tf @@ -1,18 +1,6 @@ variable "cw-alarm-prefix" {} variable "actions-enabled" {} variable "ec2-instance-id" {} -variable sns-targets {} +variable "settings" {} -variable "default-tags" {} - -variable "threshold-CPUUtilization" {} -# variable "threshold-mem_free" {} -variable "threshold-mem_used_percent" {} -# variable "threshold-swap_free" {} -variable "threshold-swap_used_percent" {} -# variable "threshold-disk_free" {} -variable "threshold-disk_used_percentage" {} -variable "threshold-disk_inodes_free" {} -variable "threshold-processes_total" {} -variable threshold-MemoryCommittedPct {} -variable 
threshold-LogicalDiskFreePct {} \ No newline at end of file +variable "default-tags" {} \ No newline at end of file diff --git a/modules/ManagementGovernance/Monitoring.EKS/main.tf b/modules/ManagementGovernance/Monitoring.EKS/main.tf index a1de6fc..8e1487f 100644 --- a/modules/ManagementGovernance/Monitoring.EKS/main.tf +++ b/modules/ManagementGovernance/Monitoring.EKS/main.tf @@ -2,19 +2,19 @@ resource "aws_cloudwatch_metric_alarm" "eks-pod_cpu_utilization" { for_each = toset(var.pod-names) - alarm_name = "${var.cw-alarm-prefix}:EKS:pod_cpu_utilization:${each.value}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "3" - metric_name = "pod_cpu_utilization" - period = "300" - statistic = "Average" - threshold = var.threshold-pod_cpu_utilization - alarm_description = "EKS:pod_cpu_utilization" + alarm_name = "${var.cw-alarm-prefix}:EKS:${var.cluster-name}:${each.value}:${var.settings.alarm1.metric}" + comparison_operator = var.settings.alarm1.comparison_operator + evaluation_periods = var.settings.alarm1.evaluation_periods + metric_name = var.settings.alarm1.metric + period = var.settings.alarm1.period + statistic = var.settings.alarm1.statistic + threshold = var.settings.alarm1.threshold + alarm_description = "EKS:${var.settings.alarm1.metric}" namespace = "ContainerInsights" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [var.settings.alarm1.action] + ok_actions = [var.settings.alarm1.action] dimensions = { "PodName" = each.value "ClusterName" = var.cluster-name @@ -29,19 +29,19 @@ resource "aws_cloudwatch_metric_alarm" "eks-pod_cpu_utilization" { resource "aws_cloudwatch_metric_alarm" "eks-pod_memory_utilization" { for_each = toset(var.pod-names) - alarm_name = "${var.cw-alarm-prefix}:EKS:pod_memory_utilization:${each.value}" + alarm_name = 
"${var.cw-alarm-prefix}:EKS:${var.cluster-name}:${each.value}:${var.settings.alarm2.metric}" comparison_operator = "GreaterThanThreshold" evaluation_periods = "3" - metric_name = "pod_memory_utilization" - period = "300" - statistic = "Average" - threshold = var.threshold-pod_memory_utilization - alarm_description = "EKS:pod_memory_utilization" + metric_name = var.settings.alarm2.metric + period = var.settings.alarm2.period + statistic = var.settings.alarm2.statistic + threshold = var.settings.alarm2.threshold + alarm_description = "EKS:${var.settings.alarm2.metric}" namespace = "ContainerInsights" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [var.settings.alarm2.action] + ok_actions = [var.settings.alarm2.action] dimensions = { "PodName" = each.value "ClusterName" = var.cluster-name @@ -56,19 +56,19 @@ resource "aws_cloudwatch_metric_alarm" "eks-pod_memory_utilization" { resource "aws_cloudwatch_metric_alarm" "eks-pod_number_of_container_restarts" { for_each = toset(var.pod-names) - alarm_name = "${var.cw-alarm-prefix}:EKS:pod_number_of_container_restarts:${each.value}" + alarm_name = "${var.cw-alarm-prefix}:EKS:${var.cluster-name}:${each.value}:${var.settings.alarm3.metric}" comparison_operator = "GreaterThanThreshold" evaluation_periods = "3" - metric_name = "pod_number_of_container_restarts" - period = "300" - statistic = "Average" - threshold = var.threshold-pod_number_of_container_restarts - alarm_description = "EKS:pod_number_of_container_restarts" + metric_name = var.settings.alarm3.metric + period = var.settings.alarm3.period + statistic = var.settings.alarm3.statistic + threshold = var.settings.alarm3.threshold + alarm_description = "EKS:${var.settings.alarm3.metric}" namespace = "ContainerInsights" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = 
[var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [var.settings.alarm3.action] + ok_actions = [var.settings.alarm3.action] dimensions = { "PodName" = each.value "ClusterName" = var.cluster-name diff --git a/modules/ManagementGovernance/Monitoring.EKS/variables.tf b/modules/ManagementGovernance/Monitoring.EKS/variables.tf index f5befc6..9a4e946 100644 --- a/modules/ManagementGovernance/Monitoring.EKS/variables.tf +++ b/modules/ManagementGovernance/Monitoring.EKS/variables.tf @@ -1,6 +1,5 @@ variable cw-alarm-prefix {} variable actions-enabled {} -variable sns-targets {} variable default-tags {} variable cluster-name {} @@ -8,6 +7,4 @@ variable eks-namespace {} variable pod-names { type = list } -variable threshold-pod_cpu_utilization {} -variable threshold-pod_memory_utilization {} -variable threshold-pod_number_of_container_restarts {} \ No newline at end of file +variable settings {} \ No newline at end of file diff --git a/modules/ManagementGovernance/Monitoring.EMR/main.tf b/modules/ManagementGovernance/Monitoring.EMR/main.tf index ca668fa..2beb0b0 100644 --- a/modules/ManagementGovernance/Monitoring.EMR/main.tf +++ b/modules/ManagementGovernance/Monitoring.EMR/main.tf @@ -1,17 +1,18 @@ -resource "aws_cloudwatch_metric_alarm" "emr-AppsPending" { - alarm_name = "${var.cw-alarm-prefix}:EMR:AppsPending:${var.job-flow-id}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "1" - metric_name = "AppsPending" - period = "1800" - statistic = "Average" - threshold = var.threshold-AppsPending - alarm_description = "EMR:AppsPending" +resource "aws_cloudwatch_metric_alarm" "emr-alarms" { + for_each = var.settings + alarm_name = "${var.cw-alarm-prefix}:EMR:${each.value["metric"]}:${var.job-flow-id}" + comparison_operator = each.value["comparison_operator"] + evaluation_periods = each.value["evaluation_periods"] + metric_name = each.value["metric"] + period = each.value["period"] + statistic = 
each.value["statistic"] + threshold = each.value["threshold"] + alarm_description = "EMR:${each.value["metric"]}" namespace = "AWS/ElasticMapReduce" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-standard] - ok_actions = [var.sns-targets.alarm-actions-standard] + alarm_actions = [each.value["action"]] + ok_actions = [each.value["action"]] dimensions = { JobFlowId = var.job-flow-id } @@ -21,6 +22,7 @@ resource "aws_cloudwatch_metric_alarm" "emr-AppsPending" { } } +/* resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" { alarm_name = "${var.cw-alarm-prefix}:EMR:CapacityRemainingGB:${var.job-flow-id}" comparison_operator = "LessThanThreshold" @@ -28,13 +30,13 @@ resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" { metric_name = "CapacityRemainingGB" period = "3600" statistic = "Average" - threshold = var.threshold-CapacityRemainingGB + threshold = var.settings.CapacityRemainingGB.threshold alarm_description = "EMR:CapacityRemainingGB" namespace = "AWS/ElasticMapReduce" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [var.settings.CapacityRemainingGB.action] + ok_actions = [var.settings.CapacityRemainingGB.action] dimensions = { JobFlowId = var.job-flow-id } @@ -42,4 +44,5 @@ resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" { lifecycle { ignore_changes = [tags] } -} \ No newline at end of file +} +*/ \ No newline at end of file diff --git a/modules/ManagementGovernance/Monitoring.EMR/variables.tf b/modules/ManagementGovernance/Monitoring.EMR/variables.tf index 0c5959d..ca18c24 100644 --- a/modules/ManagementGovernance/Monitoring.EMR/variables.tf +++ b/modules/ManagementGovernance/Monitoring.EMR/variables.tf @@ -1,8 +1,5 @@ variable cw-alarm-prefix {} variable actions-enabled {} variable job-flow-id {} 
-variable threshold-AppsPending {} -variable threshold-CapacityRemainingGB {} - -variable sns-targets {} +variable settings {} variable default-tags {} diff --git a/modules/ManagementGovernance/Monitoring.EventBridge/main.tf b/modules/ManagementGovernance/Monitoring.EventBridge/main.tf index 4224dea..fbfe20c 100644 --- a/modules/ManagementGovernance/Monitoring.EventBridge/main.tf +++ b/modules/ManagementGovernance/Monitoring.EventBridge/main.tf @@ -19,7 +19,7 @@ PATTERN } resource "aws_cloudwatch_event_target" "TargetForEventRule" { - rule = aws_cloudwatch_event_rule.EventRule.name - target_id = "rackspace-standard-sns" - arn = var.sns-targets.alarm-actions-standard -} + rule = aws_cloudwatch_event_rule.EventRule.name + target_id = "health-event-notification-sns" + arn = var.settings.healthEvents.action +} \ No newline at end of file diff --git a/modules/ManagementGovernance/Monitoring.EventBridge/variables.tf b/modules/ManagementGovernance/Monitoring.EventBridge/variables.tf index ccc9c21..273359f 100644 --- a/modules/ManagementGovernance/Monitoring.EventBridge/variables.tf +++ b/modules/ManagementGovernance/Monitoring.EventBridge/variables.tf @@ -1,5 +1,5 @@ variable cw-alarm-prefix {} variable actions-enabled {} -variable sns-targets {} +variable settings {} variable default-tags {} diff --git a/modules/ManagementGovernance/Monitoring.Kafka/main.tf b/modules/ManagementGovernance/Monitoring.Kafka/main.tf index ca50ac6..d455816 100644 --- a/modules/ManagementGovernance/Monitoring.Kafka/main.tf +++ b/modules/ManagementGovernance/Monitoring.Kafka/main.tf @@ -1,17 +1,17 @@ resource "aws_cloudwatch_metric_alarm" "Kafka-ZooKeeperRequestLatencyMsMean" { alarm_name = "${var.cw-alarm-prefix}:Kafka:ZooKeeperRequestLatencyMsMean:${var.cluster-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "3" + comparison_operator = var.settings.ZooKeeperRequestLatencyMsMean.comparison_operator + evaluation_periods = 
var.settings.ZooKeeperRequestLatencyMsMean.evaluation_periods metric_name = "ZooKeeperRequestLatencyMsMean" - period = "1800" - statistic = "Average" - threshold = var.threshold-ZooKeeperRequestLatencyMsMean + period = var.settings.ZooKeeperRequestLatencyMsMean.period + statistic = var.settings.ZooKeeperRequestLatencyMsMean.statistic + threshold = var.settings.ZooKeeperRequestLatencyMsMean.threshold alarm_description = "Kafka:ZooKeeperRequestLatencyMsMean" namespace = "AWS/Kafka" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [var.settings.ZooKeeperRequestLatencyMsMean.action] + ok_actions = [var.settings.ZooKeeperRequestLatencyMsMean.action] dimensions = { "Cluster Name" = var.cluster-name } @@ -45,21 +45,21 @@ module "msk-brokers" { resource "aws_cloudwatch_metric_alarm" "Kafka-CpuUserSystem" { for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)]) alarm_name = "${var.cw-alarm-prefix}:Kafka:CpuUsage:${var.cluster-name}-${each.value}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "3" - threshold = var.threshold-CpuUserSystem + comparison_operator = var.settings.CpuUserSystem.comparison_operator + evaluation_periods = var.settings.CpuUserSystem.evaluation_periods + threshold = var.settings.CpuUserSystem.threshold alarm_description = "Kafka:ZooKeeperRequestLatencyMsMean" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [var.settings.CpuUserSystem.action] + ok_actions = [var.settings.CpuUserSystem.action] metric_query { id = "m1" metric { metric_name = "CpuUser" namespace = "AWS/Kafka" - period = 300 - stat = "Average" + period = var.settings.CpuUserSystem.period + stat = 
var.settings.CpuUserSystem.statistic dimensions = { "Cluster Name" = var.cluster-name "Broker ID" = each.value @@ -72,8 +72,8 @@ resource "aws_cloudwatch_metric_alarm" "Kafka-CpuUserSystem" { metric { metric_name = "CpuSystem" namespace = "AWS/Kafka" - period = 300 - stat = "Average" + period = var.settings.CpuUserSystem.period + stat = var.settings.CpuUserSystem.statistic dimensions = { "Cluster Name" = var.cluster-name "Broker ID" = each.value @@ -97,18 +97,18 @@ resource "aws_cloudwatch_metric_alarm" "Kafka-CpuUserSystem" { resource "aws_cloudwatch_metric_alarm" "Kafka-KafkaDataLogsDiskUsed" { for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)]) alarm_name = "${var.cw-alarm-prefix}:Kafka:KafkaDataLogsDiskUsed:${var.cluster-name}-${each.value}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "3" + comparison_operator = var.settings.KafkaDataLogsDiskUsed.comparison_operator + evaluation_periods = var.settings.KafkaDataLogsDiskUsed.evaluation_periods metric_name = "KafkaDataLogsDiskUsed" - period = "300" - statistic = "Average" - threshold = var.threshold-KafkaDataLogsDiskUsed + period = var.settings.KafkaDataLogsDiskUsed.period + statistic = var.settings.KafkaDataLogsDiskUsed.statistic + threshold = var.settings.KafkaDataLogsDiskUsed.threshold alarm_description = "Kafka:KafkaDataLogsDiskUsed" namespace = "AWS/Kafka" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [var.settings.KafkaDataLogsDiskUsed.action] + ok_actions = [var.settings.KafkaDataLogsDiskUsed.action] dimensions = { "Cluster Name" = var.cluster-name "Broker ID" = each.value @@ -122,18 +122,18 @@ resource "aws_cloudwatch_metric_alarm" "Kafka-KafkaDataLogsDiskUsed" { resource "aws_cloudwatch_metric_alarm" "Kafka-HeapMemoryAfterGC" { for_each = toset([for i in 
data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)]) alarm_name = "${var.cw-alarm-prefix}:Kafka:HeapMemoryAfterGC:${var.cluster-name}-${each.value}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "3" + comparison_operator = var.settings.HeapMemoryAfterGC.comparison_operator + evaluation_periods = var.settings.HeapMemoryAfterGC.evaluation_periods metric_name = "HeapMemoryAfterGC" - period = "300" - statistic = "Average" - threshold = var.threshold-HeapMemoryAfterGC + period = var.settings.HeapMemoryAfterGC.period + statistic = var.settings.HeapMemoryAfterGC.statistic + threshold = var.settings.HeapMemoryAfterGC.threshold alarm_description = "Kafka:HeapMemoryAfterGC" namespace = "AWS/Kafka" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [var.settings.HeapMemoryAfterGC.action] + ok_actions = [var.settings.HeapMemoryAfterGC.action] dimensions = { "Cluster Name" = var.cluster-name "Broker ID" = each.value diff --git a/modules/ManagementGovernance/Monitoring.Kafka/variables.tf b/modules/ManagementGovernance/Monitoring.Kafka/variables.tf index 445bfba..4fa12ed 100644 --- a/modules/ManagementGovernance/Monitoring.Kafka/variables.tf +++ b/modules/ManagementGovernance/Monitoring.Kafka/variables.tf @@ -1,10 +1,6 @@ variable cw-alarm-prefix {} variable actions-enabled {} variable cluster-name {} -variable sns-targets {} +variable settings {} variable default-tags {} -variable threshold-ZooKeeperRequestLatencyMsMean {} -variable threshold-CpuUserSystem {} -variable threshold-KafkaDataLogsDiskUsed {} -variable threshold-HeapMemoryAfterGC {} \ No newline at end of file diff --git a/modules/ManagementGovernance/Monitoring.NGW/main.tf b/modules/ManagementGovernance/Monitoring.NGW/main.tf index 68ffd2c..636303d 100644 --- a/modules/ManagementGovernance/Monitoring.NGW/main.tf +++ 
b/modules/ManagementGovernance/Monitoring.NGW/main.tf @@ -1,17 +1,43 @@ -resource "aws_cloudwatch_metric_alarm" "ngw-ErrorPortAllocation" { - alarm_name = "${var.cw-alarm-prefix}:NGW:ErrorPortAllocation:${var.res-id}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" - metric_name = "ErrorPortAllocation" - period = "300" - statistic = "Average" - threshold = var.threshold-ErrorPortAllocation - alarm_description = "NGW:ErrorPortAllocation" +resource "aws_cloudwatch_metric_alarm" "ngw-alarms" { + for_each = var.settings + alarm_name = "${var.cw-alarm-prefix}:NGW:${each.value["metric"]}:${var.res-id}" + comparison_operator = each.value["comparison_operator"] + evaluation_periods = each.value["evaluation_periods"] + metric_name = each.value["metric"] + period = each.value["period"] + statistic = each.value["statistic"] + threshold = each.value["threshold"] + alarm_description = "NGW:${each.value["metric"]}" namespace = "AWS/NATGateway" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [each.value["action"]] + ok_actions = [each.value["action"]] + dimensions = { + NatGatewayId = var.res-id + } + tags = var.default-tags + lifecycle { + ignore_changes = [tags] + } +} + + +/* +resource "aws_cloudwatch_metric_alarm" "ngw-ErrorPortAllocation" { + alarm_name = "${var.cw-alarm-prefix}:NGW:${var.settings.alarm1.metric}:${var.res-id}" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "2" + metric_name = var.settings.alarm1.metric + period = "300" + statistic = "Average" + threshold = var.settings.alarm1.threshold + alarm_description = "NGW:${var.settings.alarm1.metric}" + namespace = "AWS/NATGateway" + insufficient_data_actions = [] + actions_enabled = var.actions-enabled + alarm_actions = [var.settings.alarm1.action] + ok_actions = [var.settings.alarm1.action] dimensions = { NatGatewayId = 
var.res-id } @@ -22,19 +48,19 @@ resource "aws_cloudwatch_metric_alarm" "ngw-ErrorPortAllocation" { } resource "aws_cloudwatch_metric_alarm" "ngw-ConnectionEstablishedCount" { - alarm_name = "${var.cw-alarm-prefix}:NGW:ConnectionEstablishedCount:${var.res-id}" + alarm_name = "${var.cw-alarm-prefix}:NGW:${var.settings.alarm2.metric}:${var.res-id}" comparison_operator = "GreaterThanThreshold" evaluation_periods = "2" - metric_name = "ConnectionEstablishedCount" + metric_name = var.settings.alarm2.metric period = "300" statistic = "Average" - threshold = var.threshold-ConnectionEstablishedCount - alarm_description = "NGW:ConnectionEstablishedCount" + threshold = var.settings.alarm2.threshold + alarm_description = "NGW:${var.settings.alarm2.metric}" namespace = "AWS/NATGateway" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-standard] - ok_actions = [var.sns-targets.alarm-actions-standard] + alarm_actions = [var.settings.alarm2.action] + ok_actions = [var.settings.alarm2.action] dimensions = { NatGatewayId = var.res-id } @@ -45,19 +71,19 @@ resource "aws_cloudwatch_metric_alarm" "ngw-ConnectionEstablishedCount" { } resource "aws_cloudwatch_metric_alarm" "ngw-PacketsDropCount" { - alarm_name = "${var.cw-alarm-prefix}:NGW:PacketsDropCount:${var.res-id}" + alarm_name = "${var.cw-alarm-prefix}:NGW:${var.settings.alarm3.metric}:${var.res-id}" comparison_operator = "GreaterThanThreshold" evaluation_periods = "2" - metric_name = "PacketsDropCount" + metric_name = var.settings.alarm3.metric period = "300" statistic = "Average" - threshold = var.threshold-PacketsDropCount - alarm_description = "NGW:PacketsDropCount" + threshold = var.settings.alarm3.threshold + alarm_description = "NGW:${var.settings.alarm3.metric}" namespace = "AWS/NATGateway" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-standard] - ok_actions = 
[var.sns-targets.alarm-actions-standard] + alarm_actions = [var.settings.alarm3.action] + ok_actions = [var.settings.alarm3.action] dimensions = { NatGatewayId = var.res-id } @@ -65,4 +91,5 @@ resource "aws_cloudwatch_metric_alarm" "ngw-PacketsDropCount" { lifecycle { ignore_changes = [tags] } -} \ No newline at end of file +} +*/ \ No newline at end of file diff --git a/modules/ManagementGovernance/Monitoring.NGW/variables.tf b/modules/ManagementGovernance/Monitoring.NGW/variables.tf index c10c01c..319e215 100644 --- a/modules/ManagementGovernance/Monitoring.NGW/variables.tf +++ b/modules/ManagementGovernance/Monitoring.NGW/variables.tf @@ -1,8 +1,5 @@ variable cw-alarm-prefix {} variable actions-enabled {} variable res-id {} -variable threshold-ErrorPortAllocation { } -variable threshold-ConnectionEstablishedCount {} -variable threshold-PacketsDropCount {} -variable sns-targets {} variable default-tags {} +variable settings {} \ No newline at end of file diff --git a/modules/ManagementGovernance/Monitoring.NLB/main.tf b/modules/ManagementGovernance/Monitoring.NLB/main.tf index b49bc0b..90c1de6 100644 --- a/modules/ManagementGovernance/Monitoring.NLB/main.tf +++ b/modules/ManagementGovernance/Monitoring.NLB/main.tf @@ -17,18 +17,18 @@ module "nlb-targetgroups" { resource "aws_cloudwatch_metric_alarm" "nlb-HealthyHostCount" { for_each = module.nlb-targetgroups.result-set alarm_name = "${var.cw-alarm-prefix}:NLBTG:HealthyHostCount:${split(":", each.value)[5]}" - comparison_operator = "LessThanThreshold" - evaluation_periods = "1" + comparison_operator = var.settings.HealthHostCountMin.comparison_operator + evaluation_periods = var.settings.HealthHostCountMin.evaluation_periods metric_name = "HealthyHostCount" - period = "300" - statistic = "Minimum" - threshold = var.threshold-HealthHostCountMin + period = var.settings.HealthHostCountMin.period + statistic = var.settings.HealthHostCountMin.statistic + threshold = var.settings.HealthHostCountMin.threshold 
alarm_description = "NLBTG:HealthyHostCount" namespace = "AWS/NetworkELB" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-emergency] - ok_actions = [var.sns-targets.alarm-actions-emergency] + alarm_actions = [var.settings.HealthHostCountMin.action] + ok_actions = [var.settings.HealthHostCountMin.action] dimensions = { TargetGroup = split(":", each.value)[5] LoadBalancer = "net/${split("/", var.load-balancer)[2]}/${split("/", var.load-balancer)[3]}" diff --git a/modules/ManagementGovernance/Monitoring.NLB/variables.tf b/modules/ManagementGovernance/Monitoring.NLB/variables.tf index 35c1740..13dac0a 100644 --- a/modules/ManagementGovernance/Monitoring.NLB/variables.tf +++ b/modules/ManagementGovernance/Monitoring.NLB/variables.tf @@ -1,6 +1,5 @@ variable cw-alarm-prefix {} variable actions-enabled {} variable load-balancer {} -variable threshold-HealthHostCountMin {} -variable sns-targets {} +variable settings {} variable default-tags {} \ No newline at end of file diff --git a/modules/ManagementGovernance/Monitoring.OpenSearch/main.tf b/modules/ManagementGovernance/Monitoring.OpenSearch/main.tf index 6e70eb6..ed152f0 100644 --- a/modules/ManagementGovernance/Monitoring.OpenSearch/main.tf +++ b/modules/ManagementGovernance/Monitoring.OpenSearch/main.tf @@ -1,307 +1,20 @@ data "aws_caller_identity" "this" {} -resource "aws_cloudwatch_metric_alarm" "ES-CPUUtilization" { - alarm_name = "${var.cw-alarm-prefix}:ES:CPUUtilization:${var.domain-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "3" - metric_name = "CPUUtilization" - period = "1800" - statistic = "Average" - threshold = var.threshold-CPUUtilization - alarm_description = "ES:CPUUtilization" +resource "aws_cloudwatch_metric_alarm" "ES-alarms" { + for_each = var.settings + alarm_name = "${var.cw-alarm-prefix}:ES:${each.value["metric"]}:${var.domain-name}" + comparison_operator = each.value["comparison_operator"] + 
evaluation_periods = each.value["evaluation_periods"] + metric_name = each.value["metric"] + period = each.value["period"] + statistic = each.value["statistic"] + threshold = each.value["threshold"] + alarm_description = "ES:${each.value["metric"]}" namespace = "AWS/ES" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] - dimensions = { - DomainName = var.domain-name - ClientId = data.aws_caller_identity.this.id - } - tags = var.default-tags - lifecycle { - ignore_changes = [tags] - } -} - -resource "aws_cloudwatch_metric_alarm" "ES-SearchLatency" { - alarm_name = "${var.cw-alarm-prefix}:ES:SearchLatency:${var.domain-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "3" - metric_name = "SearchLatency" - period = "1800" - statistic = "Average" - threshold = var.threshold-SearchLatency - alarm_description = "ES:SearchLatency" - namespace = "AWS/ES" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] - dimensions = { - DomainName = var.domain-name - ClientId = data.aws_caller_identity.this.id - } - tags = var.default-tags - lifecycle { - ignore_changes = [tags] - } -} - -resource "aws_cloudwatch_metric_alarm" "ES-IndexingLatency" { - alarm_name = "${var.cw-alarm-prefix}:ES:IndexingLatency:${var.domain-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "3" - metric_name = "IndexingLatency" - period = "1800" - statistic = "Average" - threshold = var.threshold-IndexingLatency - alarm_description = "ES:IndexingLatency" - namespace = "AWS/ES" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] - dimensions = { - DomainName = 
var.domain-name - ClientId = data.aws_caller_identity.this.id - } - tags = var.default-tags - lifecycle { - ignore_changes = [tags] - } -} - -resource "aws_cloudwatch_metric_alarm" "ES-ClusterStatusRed" { - alarm_name = "${var.cw-alarm-prefix}:ES:ClusterStatusRed:${var.domain-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" - metric_name = "ClusterStatus.red" - period = "900" - statistic = "Maximum" - threshold = 0 - alarm_description = "At least one primary shard and its replicas aren't allocated to a node." - namespace = "AWS/ES" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] - dimensions = { - DomainName = var.domain-name - ClientId = data.aws_caller_identity.this.id - } - tags = var.default-tags - lifecycle { - ignore_changes = [tags] - } -} - -resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolWriteQueue" { - alarm_name = "${var.cw-alarm-prefix}:ES:ThreadpoolWriteQueue:${var.domain-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" - metric_name = "ThreadpoolWriteQueue" - period = "60" - statistic = "Average" - threshold = var.threshold-ThreadpoolWriteQueue - alarm_description = "The cluster is experiencing high indexing concurrency. Review and control indexing requests, or increase cluster resources." 
- namespace = "AWS/ES" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-standard] - ok_actions = [var.sns-targets.alarm-actions-standard] - dimensions = { - DomainName = var.domain-name - ClientId = data.aws_caller_identity.this.id - } - tags = var.default-tags - lifecycle { - ignore_changes = [tags] - } -} - -resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolSearchQueue" { - alarm_name = "${var.cw-alarm-prefix}:ES:ThreadpoolSearchQueue:${var.domain-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" - metric_name = "ThreadpoolSearchQueue" - period = "60" - statistic = "Average" - threshold = var.threshold-ThreadpoolSearchQueue - alarm_description = "The cluster is experiencing high search concurrency. Consider scaling your cluster. You can also increase the search queue size, but increasing it excessively can cause out of memory errors." - namespace = "AWS/ES" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-standard] - ok_actions = [var.sns-targets.alarm-actions-standard] - dimensions = { - DomainName = var.domain-name - ClientId = data.aws_caller_identity.this.id - } - tags = var.default-tags - lifecycle { - ignore_changes = [tags] - } -} - -resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolSearchRejected" { - alarm_name = "${var.cw-alarm-prefix}:ES:ThreadpoolSearchRejected:${var.domain-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" - metric_name = "ThreadpoolSearchRejected" - period = "60" - statistic = "Average" - threshold = var.threshold-ThreadpoolSearchRejected - alarm_description = "These alarms notify you of domain issues that might impact performance and stability." 
- namespace = "AWS/ES" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] - dimensions = { - DomainName = var.domain-name - ClientId = data.aws_caller_identity.this.id - } - tags = var.default-tags - lifecycle { - ignore_changes = [tags] - } -} - -resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolWriteRejected" { - alarm_name = "${var.cw-alarm-prefix}:ES:ThreadpoolWriteRejected:${var.domain-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" - metric_name = "ThreadpoolWriteRejected" - period = "60" - statistic = "Average" - threshold = var.threshold-ThreadpoolWriteRejected - alarm_description = "These alarms notify you of domain issues that might impact performance and stability." - namespace = "AWS/ES" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] - dimensions = { - DomainName = var.domain-name - ClientId = data.aws_caller_identity.this.id - } - tags = var.default-tags - lifecycle { - ignore_changes = [tags] - } -} - -resource "aws_cloudwatch_metric_alarm" "ES-MasterCPUUtilization" { - alarm_name = "${var.cw-alarm-prefix}:ES:MasterCPUUtilization:${var.domain-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" - metric_name = "MasterCPUUtilization" - period = "300" - statistic = "Average" - threshold = var.threshold-MasterCPUUtilization - alarm_description = "MasterCPUUtilization" - namespace = "AWS/ES" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] - dimensions = { - DomainName = var.domain-name - ClientId = data.aws_caller_identity.this.id - } - tags = var.default-tags - lifecycle { - ignore_changes = 
[tags] - } -} - -resource "aws_cloudwatch_metric_alarm" "ES-MasterJVMMemoryPressure" { - alarm_name = "${var.cw-alarm-prefix}:ES:MasterJVMMemoryPressure:${var.domain-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" - metric_name = "MasterJVMMemoryPressure" - period = "60" - statistic = "Average" - threshold = var.threshold-MasterJVMMemoryPressure - alarm_description = "MasterJVMMemoryPressure" - namespace = "AWS/ES" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] - dimensions = { - DomainName = var.domain-name - ClientId = data.aws_caller_identity.this.id - } - tags = var.default-tags - lifecycle { - ignore_changes = [tags] - } -} - -resource "aws_cloudwatch_metric_alarm" "ES-JVMMemoryPressure" { - alarm_name = "${var.cw-alarm-prefix}:ES:JVMMemoryPressure:${var.domain-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" - metric_name = "JVMMemoryPressure" - period = "60" - statistic = "Average" - threshold = var.threshold-JVMMemoryPressure - alarm_description = "The cluster could encounter out of memory errors if usage increases. Consider scaling vertically. OpenSearch Service uses half of an instance's RAM for the Java heap, up to a heap size of 32 GiB. You can scale instances vertically up to 64 GiB of RAM, at which point you can scale horizontally by adding instances." 
- namespace = "AWS/ES" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] - dimensions = { - DomainName = var.domain-name - ClientId = data.aws_caller_identity.this.id - } - tags = var.default-tags - lifecycle { - ignore_changes = [tags] - } -} - -resource "aws_cloudwatch_metric_alarm" "ES-ClusterIndexWritesBlocked" { - alarm_name = "${var.cw-alarm-prefix}:ES:ClusterIndexWritesBlocked:${var.domain-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" - metric_name = "ClusterIndexWritesBlocked" - period = "60" - statistic = "Average" - threshold = var.threshold-ClusterIndexWritesBlocked - alarm_description = "Your cluster is blocking write requests. See ClusterBlockException." - namespace = "AWS/ES" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] - dimensions = { - DomainName = var.domain-name - ClientId = data.aws_caller_identity.this.id - } - tags = var.default-tags - lifecycle { - ignore_changes = [tags] - } -} - -resource "aws_cloudwatch_metric_alarm" "ES-FreeStorageSpace" { - alarm_name = "${var.cw-alarm-prefix}:ES:FreeStorageSpace:${var.domain-name}" - comparison_operator = "LessThanThreshold" - evaluation_periods = "2" - metric_name = "FreeStorageSpace" - period = "300" - statistic = "Average" - threshold = var.threshold-FreeStorageSpace - alarm_description = "A node in your cluster is low on free storage space." 
- namespace = "AWS/ES" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [each.value["action"]] + ok_actions = [each.value["action"]] dimensions = { DomainName = var.domain-name ClientId = data.aws_caller_identity.this.id diff --git a/modules/ManagementGovernance/Monitoring.OpenSearch/variables.tf b/modules/ManagementGovernance/Monitoring.OpenSearch/variables.tf index 923cd2e..75735f3 100644 --- a/modules/ManagementGovernance/Monitoring.OpenSearch/variables.tf +++ b/modules/ManagementGovernance/Monitoring.OpenSearch/variables.tf @@ -1,19 +1,6 @@ variable "cw-alarm-prefix" {} variable "actions-enabled" {} variable "domain-name" {} -variable "sns-targets" {} +variable "settings" {} variable "default-tags" {} -variable "threshold-CPUUtilization" {} -variable "threshold-SearchLatency" {} -variable "threshold-IndexingLatency" {} -variable "threshold-ThreadpoolWriteQueue" {} -variable "threshold-ThreadpoolSearchQueue" {} -variable "threshold-ThreadpoolSearchRejected" {} -variable "threshold-ThreadpoolWriteRejected" {} -variable "threshold-MasterCPUUtilization" {} -variable "threshold-MasterJVMMemoryPressure" {} -variable "threshold-JVMMemoryPressure" {} -variable "threshold-ClusterIndexWritesBlocked" {} -variable "threshold-FreeStorageSpace" {} -# variable threshold-KibanaHealthyNodes {} \ No newline at end of file diff --git a/modules/ManagementGovernance/Monitoring.RDS/main.tf b/modules/ManagementGovernance/Monitoring.RDS/main.tf index 1030cb3..4acb125 100644 --- a/modules/ManagementGovernance/Monitoring.RDS/main.tf +++ b/modules/ManagementGovernance/Monitoring.RDS/main.tf @@ -1,17 +1,18 @@ -resource "aws_cloudwatch_metric_alarm" "rds-cpu" { - alarm_name = "${var.cw-alarm-prefix}:RDS:CpuUtilization:${var.rds-instance-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "1" - metric_name = 
"CPUUtilization" - period = "3600" - statistic = "Average" - threshold = var.threshold-CpuUtilization - alarm_description = "RDS:CpuUtilization" +resource "aws_cloudwatch_metric_alarm" "rds-alarms" { + for_each = var.settings + alarm_name = "${var.cw-alarm-prefix}:RDS:${each.value["metric"]}:${var.rds-instance-name}" + comparison_operator = each.value["comparison_operator"] + evaluation_periods = each.value["evaluation_periods"] + metric_name = each.value["metric"] + period = each.value["period"] + statistic = each.value["statistic"] + threshold = each.value["threshold"] + alarm_description = "RDS:${each.value["metric"]}" namespace = "AWS/RDS" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [each.value["action"]] + ok_actions = [each.value["action"]] dimensions = { DBInstanceIdentifier = var.rds-instance-name } @@ -20,118 +21,3 @@ resource "aws_cloudwatch_metric_alarm" "rds-cpu" { ignore_changes = [tags] } } - -resource "aws_cloudwatch_metric_alarm" "rds-storage" { - alarm_name = "${var.cw-alarm-prefix}:RDS:FreeStorageSpace:${var.rds-instance-name}" - comparison_operator = "LessThanThreshold" - evaluation_periods = "1" - metric_name = "FreeStorageSpace" - period = "3600" - statistic = "Average" - threshold = var.threshold-FreeStorageSpace - alarm_description = "RDS:FreeStorageSpace" - namespace = "AWS/RDS" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] - dimensions = { - DBInstanceIdentifier = var.rds-instance-name - } - tags = var.default-tags - lifecycle { - ignore_changes = [tags] - } -} - -resource "aws_cloudwatch_metric_alarm" "rds-memory" { - alarm_name = "${var.cw-alarm-prefix}:RDS:FreeableMemory:${var.rds-instance-name}" - comparison_operator = "LessThanThreshold" - 
evaluation_periods = "1" - metric_name = "FreeableMemory" - period = "3600" - statistic = "Average" - threshold = var.threshold-FreeableMemory - alarm_description = "RDS:FreeableMemory" - namespace = "AWS/RDS" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] - dimensions = { - DBInstanceIdentifier = var.rds-instance-name - } - tags = var.default-tags - lifecycle { - ignore_changes = [tags] - } -} - -resource "aws_cloudwatch_metric_alarm" "rds-DiskQueueDepth" { - alarm_name = "${var.cw-alarm-prefix}:RDS:DiskQueueDepth:${var.rds-instance-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" - metric_name = "DiskQueueDepth" - period = "300" - statistic = "Average" - threshold = var.threshold-DiskQueueDepth - alarm_description = "RDS:DiskQueueDepth" - namespace = "AWS/RDS" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] - dimensions = { - DBInstanceIdentifier = var.rds-instance-name - } - tags = var.default-tags - lifecycle { - ignore_changes = [tags] - } -} - -resource "aws_cloudwatch_metric_alarm" "rds-ReadLatency" { - alarm_name = "${var.cw-alarm-prefix}:RDS:ReadLatency:${var.rds-instance-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" - metric_name = "ReadLatency" - period = "900" - statistic = "Average" - threshold = var.threshold-ReadLatency - alarm_description = "RDS:ReadLatency" - namespace = "AWS/RDS" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] - dimensions = { - DBInstanceIdentifier = var.rds-instance-name - } - tags = var.default-tags - lifecycle { - ignore_changes = [tags] - } -} - -resource 
"aws_cloudwatch_metric_alarm" "rds-WriteLatency" { - alarm_name = "${var.cw-alarm-prefix}:RDS:WriteLatency:${var.rds-instance-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" - metric_name = "WriteLatency" - period = "900" - statistic = "Average" - threshold = var.threshold-WriteLatency - alarm_description = "RDS:WriteLatency" - namespace = "AWS/RDS" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] - dimensions = { - DBInstanceIdentifier = var.rds-instance-name - } - tags = var.default-tags - lifecycle { - ignore_changes = [tags] - } -} \ No newline at end of file diff --git a/modules/ManagementGovernance/Monitoring.RDS/variables.tf b/modules/ManagementGovernance/Monitoring.RDS/variables.tf index a6e9c9e..7bcc13a 100644 --- a/modules/ManagementGovernance/Monitoring.RDS/variables.tf +++ b/modules/ManagementGovernance/Monitoring.RDS/variables.tf @@ -1,12 +1,5 @@ variable cw-alarm-prefix {} variable actions-enabled {} variable rds-instance-name {} -variable sns-targets {} +variable settings {} variable default-tags {} - -variable threshold-FreeableMemory {} -variable threshold-CpuUtilization {} -variable threshold-FreeStorageSpace {} -variable threshold-DiskQueueDepth {} -variable threshold-ReadLatency {} -variable threshold-WriteLatency {} \ No newline at end of file diff --git a/modules/ManagementGovernance/Monitoring.Redis/main.tf b/modules/ManagementGovernance/Monitoring.Redis/main.tf index 5549a5a..d48ef40 100644 --- a/modules/ManagementGovernance/Monitoring.Redis/main.tf +++ b/modules/ManagementGovernance/Monitoring.Redis/main.tf @@ -1,17 +1,18 @@ -resource "aws_cloudwatch_metric_alarm" "redis-EngineCPUUtilization" { - alarm_name = "${var.cw-alarm-prefix}:Redis:EngineCPUUtilization:${var.redis-cluster-id}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "1" - metric_name = 
"EngineCPUUtilization" - period = "3600" - statistic = "Average" - threshold = var.threshold-EngineCPUUtilization - alarm_description = "Redis:EngineCPUUtilization" - namespace = "AWS/ElastiCache" +resource "aws_cloudwatch_metric_alarm" "redis-alarms" { + for_each = var.settings + alarm_name = "${var.cw-alarm-prefix}:Redis:${each.value["metric"]}:${var.redis-cluster-id}" + comparison_operator = each.value["comparison_operator"] + evaluation_periods = each.value["evaluation_periods"] + metric_name = each.value["metric"] + period = each.value["period"] + statistic = each.value["statistic"] + threshold = each.value["threshold"] + alarm_description = "Redis:${each.value["metric"]}" + namespace = "AWS/ElastiCache" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] + alarm_actions = [each.value["action"]] + ok_actions = [each.value["action"]] dimensions = { CacheClusterId = var.redis-cluster-id } @@ -19,75 +20,4 @@ resource "aws_cloudwatch_metric_alarm" "redis-EngineCPUUtilization" { lifecycle { ignore_changes = [tags] } -} - -resource "aws_cloudwatch_metric_alarm" "redis-DatabaseMemoryUsagePercentage" { - alarm_name = "${var.cw-alarm-prefix}:Redis:DatabaseMemoryUsagePercentage:${var.redis-cluster-id}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "1" - metric_name = "DatabaseMemoryUsagePercentage" - period = "3600" - statistic = "Average" - threshold = var.threshold-DatabaseMemoryUsagePercentage - alarm_description = "Redis:DatabaseMemoryUsagePercentage" - namespace = "AWS/ElastiCache" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-urgent] - ok_actions = [var.sns-targets.alarm-actions-urgent] - dimensions = { - CacheClusterId = var.redis-cluster-id - } - tags = var.default-tags - lifecycle { - ignore_changes = [tags] - } -} - -resource
"aws_cloudwatch_metric_alarm" "redis-CacheHitRate" { - alarm_name = "${var.cw-alarm-prefix}:Redis:CacheHitRate:${var.redis-cluster-id}" - comparison_operator = "LessThanThreshold" - evaluation_periods = "4" - metric_name = "CacheHitRate" - period = "900" - statistic = "Average" - threshold = var.threshold-CacheHitRate - alarm_description = "Redis:CacheHitRate" - namespace = "AWS/ElastiCache" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-standard] - ok_actions = [var.sns-targets.alarm-actions-standard] - dimensions = { - CacheClusterId = var.redis-cluster-id - # CacheNodeId = each.value - } - tags = var.default-tags - lifecycle { - ignore_changes = [tags] - } -} - -resource "aws_cloudwatch_metric_alarm" "redis-StringBasedCmdsLatency" { - alarm_name = "${var.cw-alarm-prefix}:Redis:StringBasedCmdsLatency:${var.redis-cluster-id}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "3" - metric_name = "StringBasedCmdsLatency" - period = "60" - statistic = "Average" - threshold = var.threshold-StringBasedCmdsLatency - alarm_description = "The average latency, in microseconds, of the string-based commands run during a selected time range" - namespace = "AWS/ElastiCache" - insufficient_data_actions = [] - actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-standard] - ok_actions = [var.sns-targets.alarm-actions-standard] - dimensions = { - CacheClusterId = var.redis-cluster-id - # CacheNodeId = each.value - } - tags = var.default-tags - lifecycle { - ignore_changes = [tags] - } } \ No newline at end of file diff --git a/modules/ManagementGovernance/Monitoring.Redis/variables.tf b/modules/ManagementGovernance/Monitoring.Redis/variables.tf index fb1264c..7d03e8d 100644 --- a/modules/ManagementGovernance/Monitoring.Redis/variables.tf +++ b/modules/ManagementGovernance/Monitoring.Redis/variables.tf @@ -1,10 +1,5 @@ variable "cw-alarm-prefix" {} 
variable "actions-enabled" {} variable "redis-cluster-id" {} -variable "sns-targets" {} +variable "settings" {} variable "default-tags" {} - -variable "threshold-EngineCPUUtilization" {} -variable "threshold-DatabaseMemoryUsagePercentage" {} -variable "threshold-CacheHitRate" {} -variable "threshold-StringBasedCmdsLatency" {} \ No newline at end of file diff --git a/modules/ManagementGovernance/Monitoring.TGW/main.tf b/modules/ManagementGovernance/Monitoring.TGW/main.tf index 7d259ab..5c21323 100644 --- a/modules/ManagementGovernance/Monitoring.TGW/main.tf +++ b/modules/ManagementGovernance/Monitoring.TGW/main.tf @@ -1,17 +1,17 @@ resource "aws_cloudwatch_metric_alarm" "tgw-PacketDropCountNoRoute" { alarm_name = "${var.cw-alarm-prefix}:TGW:PacketDropCountNoRoute:${var.tgw-id}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" + comparison_operator = var.settings.PacketDropCountNoRoute.comparison_operator + evaluation_periods = var.settings.PacketDropCountNoRoute.evaluation_periods metric_name = "PacketDropCountNoRoute" - period = "300" - statistic = "Average" - threshold = var.threshold-PacketDropCountNoRoute + period = var.settings.PacketDropCountNoRoute.period + statistic = var.settings.PacketDropCountNoRoute.statistic + threshold = var.settings.PacketDropCountNoRoute.threshold alarm_description = "TGW:PacketDropCountNoRoute" namespace = "AWS/TransitGateway" insufficient_data_actions = [] actions_enabled = var.actions-enabled - alarm_actions = [var.sns-targets.alarm-actions-standard] - ok_actions = [var.sns-targets.alarm-actions-standard] + alarm_actions = [var.settings.PacketDropCountNoRoute.action] + ok_actions = [var.settings.PacketDropCountNoRoute.action] dimensions = { TransitGateway = var.tgw-id } diff --git a/modules/ManagementGovernance/Monitoring.TGW/variables.tf b/modules/ManagementGovernance/Monitoring.TGW/variables.tf index 0161b66..d73c42a 100644 --- a/modules/ManagementGovernance/Monitoring.TGW/variables.tf +++ 
b/modules/ManagementGovernance/Monitoring.TGW/variables.tf @@ -1,7 +1,5 @@ variable cw-alarm-prefix {} variable actions-enabled {} variable tgw-id {} -variable threshold-PacketDropCountNoRoute {} - -variable sns-targets {} +variable settings {} variable default-tags {}