UPD: Added more monitoring modules and various enhancements

2022-10-26 11:13:56 +08:00 · 2022-10-26 11:13:56 +08:00 · b3ba6f2441
commit b3ba6f2441
parent 2af0ff1b1a
28 changed files with 443 additions and 167 deletions
--- a/modules/ManagementGovernance/Monitoring.ALB/main.tf
+++ b/modules/ManagementGovernance/Monitoring.ALB/main.tf
@ -1,28 +1,28 @@
-data external alb-targetgroups {
-  program = ["bash", "../../modules/ManagementGovernance/Monitoring.ALB/list-alb-targetgroups.sh"]
+# Runs this module's helper script for var.load-balancer. The script's "result"
+# value is split on spaces by the HealthyHostCount alarm below.
+data "external" "alb-targetgroups" {
+  # path.module keeps the script path valid regardless of where the root module lives.
+  program = ["bash", "${path.module}/list-alb-targetgroups.sh"]
  query = {
    lb = var.load-balancer
  }
 }

-resource aws_cloudwatch_metric_alarm alb-HealthyHostCount {
-  for_each = toset(split(" ", data.external.alb-targetgroups.result.result))
-  alarm_name = "ALBTG:HealthyHostCount:${split("/", each.value)[1]}/${split("/", each.value)[2]}"
-  comparison_operator = "LessThanThreshold"
-  evaluation_periods = "1"
-  metric_name = "HealthyHostCount"
-  period = "300"
-  statistic = "Minimum"
-  threshold = var.threshold-HealthHostCountMin
-  alarm_description = "ALBTG:HealthyHostCount"
-  namespace = "AWS/ApplicationELB"
+resource "aws_cloudwatch_metric_alarm" "alb-HealthyHostCount" {
+  for_each                  = toset(split(" ", data.external.alb-targetgroups.result.result))
+  alarm_name                = "${var.cw-alarm-prefix}:ALBTG:HealthyHostCount:${split("/", each.value)[1]}/${split("/", each.value)[2]}"
+  comparison_operator       = "LessThanThreshold"
+  evaluation_periods        = "1"
+  metric_name               = "HealthyHostCount"
+  period                    = "300"
+  statistic                 = "Minimum"
+  threshold                 = var.threshold-HealthHostCountMin
+  alarm_description         = "ALBTG:HealthyHostCount"
+  namespace                 = "AWS/ApplicationELB"
  insufficient_data_actions = []
-  actions_enabled = "true"
-  alarm_actions = [var.alarm-actions-emergency]
-  ok_actions = [var.alarm-actions-emergency]
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-emergency]
+  ok_actions                = [var.alarm-actions-emergency]
  dimensions = {
-    TargetGroup = "targetgroup/${split("/", each.value)[1]}/${split("/", each.value)[2]}"
-    LoadBalancer = "app/${split("/",var.load-balancer)[2]}/${split("/",var.load-balancer)[3]}"
+    TargetGroup  = "targetgroup/${split("/", each.value)[1]}/${split("/", each.value)[2]}"
+    LoadBalancer = "app/${split("/", var.load-balancer)[2]}/${split("/", var.load-balancer)[3]}"
  }
  tags = var.default-tags
  lifecycle {
--- a/modules/ManagementGovernance/Monitoring.ALB/variables.tf
+++ b/modules/ManagementGovernance/Monitoring.ALB/variables.tf
@ -1,4 +1,5 @@
-# variable target-group {}
+variable "cw-alarm-prefix" {
+  description = "Prefix prepended to every CloudWatch alarm name created by this module."
+  type        = string
+}
+variable "actions-enabled" {
+  description = "Whether the alarms execute their alarm/ok actions."
+  type        = bool
+  # Matches the value that was hard-coded in main.tf before this variable existed.
+  default = true
+}
 variable load-balancer {}
 variable threshold-HealthHostCountMin {}
 variable alarm-actions-urgent {
--- a/modules/ManagementGovernance/Monitoring.ASG/README.md
+++ b/modules/ManagementGovernance/Monitoring.ASG/README.md
@ -0,0 +1,5 @@
+# Monitoring module for BEA
+This module deploys the default CloudWatch metric alarms for an Auto Scaling group.
+
+## Notes
+The alarm resources use a Terraform `lifecycle` block that ignores tag changes, which speeds up subsequent Terraform updates. CloudWatch alarm tags cannot be viewed in the AWS console anyway.
--- a/modules/ManagementGovernance/Monitoring.ASG/main.tf
+++ b/modules/ManagementGovernance/Monitoring.ASG/main.tf
@ -0,0 +1,22 @@
+# CloudWatch alarm on the CPUUtilization metric of the Auto Scaling group
+# named by var.asg-name.
+resource "aws_cloudwatch_metric_alarm" "asg-CPUUtilization" {
+  # Identity
+  alarm_name        = "${var.cw-alarm-prefix}:ASG:CPUUtilization:${var.asg-name}"
+  alarm_description = "ASG:CPUUtilization"
+
+  # Metric being watched
+  namespace   = "AWS/EC2"
+  metric_name = "CPUUtilization"
+  dimensions = {
+    AutoScalingGroupName = var.asg-name
+  }
+
+  # Evaluation: Average over 1800-second periods, 3 evaluation periods,
+  # compared against the caller-supplied threshold.
+  statistic           = "Average"
+  period              = "1800"
+  evaluation_periods  = "3"
+  comparison_operator = "GreaterThanThreshold"
+  threshold           = var.threshold-CPUUtilization
+
+  # Notifications: both ALARM and OK transitions go to the "urgent" target.
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-urgent]
+  ok_actions                = [var.alarm-actions-urgent]
+  insufficient_data_actions = []
+
+  tags = var.default-tags
+  lifecycle {
+    # Tag drift is ignored on purpose (see the module README).
+    ignore_changes = [tags]
+  }
+}
--- a/modules/ManagementGovernance/Monitoring.ASG/provider.tf
+++ b/modules/ManagementGovernance/Monitoring.ASG/provider.tf
@ -0,0 +1,9 @@
+# Version requirements for this reusable module. Only minimum versions are
+# declared so that the root module decides the exact Terraform and provider
+# versions to use.
+terraform {
+  required_version = ">= 1.3.0"
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = ">= 4.36.1"
+    }
+  }
+}
--- a/modules/ManagementGovernance/Monitoring.ASG/variables.tf
+++ b/modules/ManagementGovernance/Monitoring.ASG/variables.tf
@ -0,0 +1,22 @@
+variable "cw-alarm-prefix" {
+  description = "Prefix prepended to every CloudWatch alarm name created by this module."
+  type        = string
+}
+
+variable "actions-enabled" {
+  description = "Whether the alarms execute their alarm/ok actions."
+  type        = bool
+  default     = true
+}
+
+variable "asg-name" {
+  description = "Name of the Auto Scaling group to monitor (used as the AutoScalingGroupName dimension)."
+  type        = string
+}
+
+# NOTE(review): the SNS topic defaults below hard-code a single region and
+# account ID; override them when deploying anywhere else.
+variable "alarm-actions-urgent" {
+  description = "SNS topic ARN notified by the CPUUtilization alarm."
+  type        = string
+  default     = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-urgent"
+}
+
+variable "alarm-actions-emergency" {
+  type    = string
+  default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-emergency"
+}
+
+variable "alarm-actions-standard" {
+  type    = string
+  default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-standard"
+}
+
+variable "alarm-actions-general" {
+  type    = string
+  default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support"
+}
+
+variable "default-tags" {
+  description = "Tags applied to the alarm."
+}
+
+variable "threshold-CPUUtilization" {
+  description = "CPUUtilization value above which the alarm fires."
+  type        = number
+}
--- a/modules/ManagementGovernance/Monitoring.EC2/main.tf
+++ b/modules/ManagementGovernance/Monitoring.EC2/main.tf
@ -1,5 +1,5 @@
 resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
-  alarm_name                = "EC2:StatusCheckFailed_System:${var.ec2-instance-id}"
+  alarm_name                = "${var.cw-alarm-prefix}:EC2:StatusCheckFailed_System:${var.ec2-instance-id}"
  comparison_operator       = "GreaterThanThreshold"
  evaluation_periods        = "1"
  metric_name               = "StatusCheckFailed_System"
@ -9,7 +9,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
  alarm_description         = "EC2:StatusCheckFailed_System"
  namespace                 = "AWS/EC2"
  insufficient_data_actions = []
-  actions_enabled           = "true"
+  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.alarm-actions-emergency]
  ok_actions                = [var.alarm-actions-emergency]
  dimensions = {
@ -22,7 +22,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
 }

 resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" {
-  alarm_name                = "EC2:StatusCheckFailed_Instance:${var.ec2-instance-id}"
+  alarm_name                = "${var.cw-alarm-prefix}:EC2:StatusCheckFailed_Instance:${var.ec2-instance-id}"
  comparison_operator       = "GreaterThanThreshold"
  evaluation_periods        = "1"
  metric_name               = "StatusCheckFailed_Instance"
@ -32,7 +32,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" {
  alarm_description         = "EC2:StatusCheckFailed_Instance"
  namespace                 = "AWS/EC2"
  insufficient_data_actions = []
-  actions_enabled           = "true"
+  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.alarm-actions-emergency]
  ok_actions                = [var.alarm-actions-emergency]
  dimensions = {
@ -45,7 +45,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" {
 }

 resource "aws_cloudwatch_metric_alarm" "ec2-CPUUtilization" {
-  alarm_name                = "EC2:CPUUtilization:${var.ec2-instance-id}"
+  alarm_name                = "${var.cw-alarm-prefix}:EC2:CPUUtilization:${var.ec2-instance-id}"
  comparison_operator       = "GreaterThanThreshold"
  evaluation_periods        = "6"
  metric_name               = "CPUUtilization"
@ -55,7 +55,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-CPUUtilization" {
  alarm_description         = "EC2:CPUUtilization"
  namespace                 = "AWS/EC2"
  insufficient_data_actions = []
-  actions_enabled           = "true"
+  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.alarm-actions-urgent]
  ok_actions                = [var.alarm-actions-urgent]
  treat_missing_data        = "notBreaching"
--- a/modules/ManagementGovernance/Monitoring.EC2/variables.tf
+++ b/modules/ManagementGovernance/Monitoring.EC2/variables.tf
@ -1,3 +1,5 @@
+variable "cw-alarm-prefix" {
+  description = "Prefix prepended to every CloudWatch alarm name created by this module."
+  type        = string
+}
+variable "actions-enabled" {
+  description = "Whether the alarms execute their alarm/ok actions."
+  type        = bool
+  # Matches the value that was hard-coded in main.tf before this variable existed.
+  default = true
+}
 variable ec2-instance-id {}
 variable alarm-actions-urgent {
  type = string
--- a/modules/ManagementGovernance/Monitoring.EMR/main.tf
+++ b/modules/ManagementGovernance/Monitoring.EMR/main.tf
@ -1,5 +1,5 @@
 resource "aws_cloudwatch_metric_alarm" "emr-AppsPending" {
-  alarm_name                = "EMR:AppsPending:${var.job-flow-id}"
+  alarm_name                = "${var.cw-alarm-prefix}:EMR:AppsPending:${var.job-flow-id}"
  comparison_operator       = "GreaterThanThreshold"
  evaluation_periods        = "1"
  metric_name               = "AppsPending"
@ -9,7 +9,7 @@ resource "aws_cloudwatch_metric_alarm" "emr-AppsPending" {
  alarm_description         = "EMR:AppsPending"
  namespace                 = "AWS/ElasticMapReduce"
  insufficient_data_actions = []
-  actions_enabled           = "true"
+  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.alarm-actions-standard]
  ok_actions                = [var.alarm-actions-standard]
  dimensions = {
@ -22,7 +22,7 @@ resource "aws_cloudwatch_metric_alarm" "emr-AppsPending" {
 }

 resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" {
-  alarm_name                = "EMR:CapacityRemainingGB:${var.job-flow-id}"
+  alarm_name                = "${var.cw-alarm-prefix}:EMR:CapacityRemainingGB:${var.job-flow-id}"
  comparison_operator       = "LessThanThreshold"
  evaluation_periods        = "1"
  metric_name               = "CapacityRemainingGB"
@ -32,7 +32,7 @@ resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" {
  alarm_description         = "EMR:CapacityRemainingGB"
  namespace                 = "AWS/ElasticMapReduce"
  insufficient_data_actions = []
-  actions_enabled           = "true"
+  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.alarm-actions-urgent]
  ok_actions                = [var.alarm-actions-urgent]
  dimensions = {
--- a/modules/ManagementGovernance/Monitoring.EMR/variables.tf
+++ b/modules/ManagementGovernance/Monitoring.EMR/variables.tf
@ -1,3 +1,5 @@
+variable "cw-alarm-prefix" {
+  description = "Prefix prepended to every CloudWatch alarm name created by this module."
+  type        = string
+}
+variable "actions-enabled" {
+  description = "Whether the alarms execute their alarm/ok actions."
+  type        = bool
+  # Matches the value that was hard-coded in main.tf before this variable existed.
+  default = true
+}
 variable job-flow-id {}
 variable threshold-AppsPending {}
 variable threshold-CapacityRemainingGB {}
--- a/modules/ManagementGovernance/Monitoring.Kafka/README.md
+++ b/modules/ManagementGovernance/Monitoring.Kafka/README.md
@ -0,0 +1,5 @@
+# Monitoring module for BEA
+This module deploys the default CloudWatch metric alarms for a Kafka cluster (`AWS/Kafka` namespace).
+
+## Notes
+The alarm resources use a Terraform `lifecycle` block that ignores tag changes, which speeds up subsequent Terraform updates. CloudWatch alarm tags cannot be viewed in the AWS console anyway.
--- a/modules/ManagementGovernance/Monitoring.Kafka/main.tf
+++ b/modules/ManagementGovernance/Monitoring.Kafka/main.tf
@ -0,0 +1,22 @@
+# CloudWatch alarm on the ZooKeeperRequestLatencyMsMean metric of the Kafka
+# cluster named by var.cluster-name.
+resource "aws_cloudwatch_metric_alarm" "Kafka-ZooKeeperRequestLatencyMsMean" {
+  # Identity
+  alarm_name        = "${var.cw-alarm-prefix}:Kafka:ZooKeeperRequestLatencyMsMean:${var.cluster-name}"
+  alarm_description = "Kafka:ZooKeeperRequestLatencyMsMean"
+
+  # Metric being watched
+  namespace   = "AWS/Kafka"
+  metric_name = "ZooKeeperRequestLatencyMsMean"
+  dimensions = {
+    # The dimension key contains a space, so it has to be quoted.
+    "Cluster Name" = var.cluster-name
+  }
+
+  # Evaluation: Average over 1800-second periods, 3 evaluation periods,
+  # compared against the caller-supplied threshold.
+  statistic           = "Average"
+  period              = "1800"
+  evaluation_periods  = "3"
+  comparison_operator = "GreaterThanThreshold"
+  threshold           = var.threshold-ZooKeeperRequestLatencyMsMean
+
+  # Notifications: both ALARM and OK transitions go to the "urgent" target.
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-urgent]
+  ok_actions                = [var.alarm-actions-urgent]
+  insufficient_data_actions = []
+
+  tags = var.default-tags
+  lifecycle {
+    # Tag drift is ignored on purpose (see the module README).
+    ignore_changes = [tags]
+  }
+}
--- a/modules/ManagementGovernance/Monitoring.Kafka/provider.tf
+++ b/modules/ManagementGovernance/Monitoring.Kafka/provider.tf
@ -0,0 +1,9 @@
+# Version requirements for this reusable module. Only minimum versions are
+# declared so that the root module decides the exact Terraform and provider
+# versions to use.
+terraform {
+  required_version = ">= 1.3.0"
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = ">= 4.36.1"
+    }
+  }
+}
--- a/modules/ManagementGovernance/Monitoring.Kafka/variables.tf
+++ b/modules/ManagementGovernance/Monitoring.Kafka/variables.tf
@ -0,0 +1,22 @@
+variable "cw-alarm-prefix" {
+  description = "Prefix prepended to every CloudWatch alarm name created by this module."
+  type        = string
+}
+
+variable "actions-enabled" {
+  description = "Whether the alarms execute their alarm/ok actions."
+  type        = bool
+  default     = true
+}
+
+variable "cluster-name" {
+  description = "Name of the Kafka cluster to monitor (used as the \"Cluster Name\" dimension)."
+  type        = string
+}
+
+# NOTE(review): the SNS topic defaults below hard-code a single region and
+# account ID; override them when deploying anywhere else.
+variable "alarm-actions-urgent" {
+  description = "SNS topic ARN notified by the ZooKeeperRequestLatencyMsMean alarm."
+  type        = string
+  default     = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-urgent"
+}
+
+variable "alarm-actions-emergency" {
+  type    = string
+  default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-emergency"
+}
+
+variable "alarm-actions-standard" {
+  type    = string
+  default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-standard"
+}
+
+variable "alarm-actions-general" {
+  type    = string
+  default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support"
+}
+
+variable "default-tags" {
+  description = "Tags applied to the alarm."
+}
+
+variable "threshold-ZooKeeperRequestLatencyMsMean" {
+  description = "ZooKeeperRequestLatencyMsMean value above which the alarm fires."
+  type        = number
+}
--- a/modules/ManagementGovernance/Monitoring.NLB/main.tf
+++ b/modules/ManagementGovernance/Monitoring.NLB/main.tf
@ -16,7 +16,7 @@ module "nlb-targetgroups" {

 resource "aws_cloudwatch_metric_alarm" "nlb-HealthyHostCount" {
  for_each                  = module.nlb-targetgroups.result-set
-  alarm_name                = "NLBTG:HealthyHostCount:${split(":", each.value)[5]}"
+  alarm_name                = "${var.cw-alarm-prefix}:NLBTG:HealthyHostCount:${split(":", each.value)[5]}"
  comparison_operator       = "LessThanThreshold"
  evaluation_periods        = "1"
  metric_name               = "HealthyHostCount"
@ -26,7 +26,7 @@ resource "aws_cloudwatch_metric_alarm" "nlb-HealthyHostCount" {
  alarm_description         = "NLBTG:HealthyHostCount"
  namespace                 = "AWS/NetworkELB"
  insufficient_data_actions = []
-  actions_enabled           = "true"
+  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.alarm-actions-emergency]
  ok_actions                = [var.alarm-actions-emergency]
  dimensions = {
--- a/modules/ManagementGovernance/Monitoring.NLB/variables.tf
+++ b/modules/ManagementGovernance/Monitoring.NLB/variables.tf
@ -1,4 +1,5 @@
-# variable target-group {}
+variable "cw-alarm-prefix" {
+  description = "Prefix prepended to every CloudWatch alarm name created by this module."
+  type        = string
+}
+variable "actions-enabled" {
+  description = "Whether the alarms execute their alarm/ok actions."
+  type        = bool
+  # Matches the value that was hard-coded in main.tf before this variable existed.
+  default = true
+}
 variable load-balancer {}
 variable threshold-HealthHostCountMin {}
 variable alarm-actions-urgent {
--- a/modules/ManagementGovernance/Monitoring.OpenSearch/README.md
+++ b/modules/ManagementGovernance/Monitoring.OpenSearch/README.md
@ -0,0 +1,5 @@
+# Monitoring module for BEA
+This module deploys the default CloudWatch metric alarms for an OpenSearch domain.
+
+## Notes
+The alarm resources use a Terraform `lifecycle` block that ignores tag changes, which speeds up subsequent Terraform updates. CloudWatch alarm tags cannot be viewed in the AWS console anyway.
--- a/modules/ManagementGovernance/Monitoring.OpenSearch/main.tf
+++ b/modules/ManagementGovernance/Monitoring.OpenSearch/main.tf
@ -0,0 +1,98 @@
+# Identity of the AWS caller running Terraform; the alarms in this file read it
+# to fill in the ClientId dimension.
+data "aws_caller_identity" "this" {}
+
+# Alarm on the average CPUUtilization of the domain named by var.domain-name:
+# 1800-second periods, 3 evaluation periods, greater than the supplied threshold.
+resource "aws_cloudwatch_metric_alarm" "ES-CPUUtilization" {
+  alarm_name                = "${var.cw-alarm-prefix}:ES:CPUUtilization:${var.domain-name}"
+  comparison_operator       = "GreaterThanThreshold"
+  evaluation_periods        = "3"
+  metric_name               = "CPUUtilization"
+  period                    = "1800"
+  statistic                 = "Average"
+  threshold                 = var.threshold-CPUUtilization
+  alarm_description         = "ES:CPUUtilization"
+  namespace                 = "AWS/ES"
+  insufficient_data_actions = []
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-urgent]
+  ok_actions                = [var.alarm-actions-urgent]
+  dimensions = {
+    DomainName = var.domain-name
+    # account_id is the documented attribute for the caller's AWS account ID.
+    ClientId = data.aws_caller_identity.this.account_id
+  }
+  tags = var.default-tags
+  lifecycle {
+    # Tag drift is ignored on purpose (see the module README).
+    ignore_changes = [tags]
+  }
+}
+
+# Alarm on the average SearchLatency of the domain named by var.domain-name:
+# 1800-second periods, 3 evaluation periods, greater than the supplied threshold.
+resource "aws_cloudwatch_metric_alarm" "ES-SearchLatency" {
+  alarm_name                = "${var.cw-alarm-prefix}:ES:SearchLatency:${var.domain-name}"
+  comparison_operator       = "GreaterThanThreshold"
+  evaluation_periods        = "3"
+  metric_name               = "SearchLatency"
+  period                    = "1800"
+  statistic                 = "Average"
+  threshold                 = var.threshold-SearchLatency
+  alarm_description         = "ES:SearchLatency"
+  namespace                 = "AWS/ES"
+  insufficient_data_actions = []
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-urgent]
+  ok_actions                = [var.alarm-actions-urgent]
+  dimensions = {
+    DomainName = var.domain-name
+    # account_id is the documented attribute for the caller's AWS account ID.
+    ClientId = data.aws_caller_identity.this.account_id
+  }
+  tags = var.default-tags
+  lifecycle {
+    # Tag drift is ignored on purpose (see the module README).
+    ignore_changes = [tags]
+  }
+}
+
+# Alarm on the average IndexingLatency of the domain named by var.domain-name:
+# 1800-second periods, 3 evaluation periods, greater than the supplied threshold.
+resource "aws_cloudwatch_metric_alarm" "ES-IndexingLatency" {
+  alarm_name                = "${var.cw-alarm-prefix}:ES:IndexingLatency:${var.domain-name}"
+  comparison_operator       = "GreaterThanThreshold"
+  evaluation_periods        = "3"
+  metric_name               = "IndexingLatency"
+  period                    = "1800"
+  statistic                 = "Average"
+  threshold                 = var.threshold-IndexingLatency
+  alarm_description         = "ES:IndexingLatency"
+  namespace                 = "AWS/ES"
+  insufficient_data_actions = []
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-urgent]
+  ok_actions                = [var.alarm-actions-urgent]
+  dimensions = {
+    DomainName = var.domain-name
+    # account_id is the documented attribute for the caller's AWS account ID.
+    ClientId = data.aws_caller_identity.this.account_id
+  }
+  tags = var.default-tags
+  lifecycle {
+    # Tag drift is ignored on purpose (see the module README).
+    ignore_changes = [tags]
+  }
+}
+
+# Alarm when the Maximum of ClusterStatus.red is greater than 0 for the domain
+# named by var.domain-name: 900-second periods, 2 evaluation periods.
+resource "aws_cloudwatch_metric_alarm" "ES-ClusterStatusRed" {
+  alarm_name                = "${var.cw-alarm-prefix}:ES:ClusterStatusRed:${var.domain-name}"
+  comparison_operator       = "GreaterThanThreshold"
+  evaluation_periods        = "2"
+  metric_name               = "ClusterStatus.red"
+  period                    = "900"
+  statistic                 = "Maximum"
+  threshold                 = 0
+  alarm_description         = "At least one primary shard and its replicas aren't allocated to a node."
+  namespace                 = "AWS/ES"
+  insufficient_data_actions = []
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-urgent]
+  ok_actions                = [var.alarm-actions-urgent]
+  dimensions = {
+    DomainName = var.domain-name
+    # account_id is the documented attribute for the caller's AWS account ID.
+    ClientId = data.aws_caller_identity.this.account_id
+  }
+  tags = var.default-tags
+  lifecycle {
+    # Tag drift is ignored on purpose (see the module README).
+    ignore_changes = [tags]
+  }
+}
+
--- a/modules/ManagementGovernance/Monitoring.OpenSearch/provider.tf
+++ b/modules/ManagementGovernance/Monitoring.OpenSearch/provider.tf
@ -0,0 +1,9 @@
+# Version requirements for this reusable module. Only minimum versions are
+# declared so that the root module decides the exact Terraform and provider
+# versions to use.
+terraform {
+  required_version = ">= 1.3.0"
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = ">= 4.36.1"
+    }
+  }
+}
--- a/modules/ManagementGovernance/Monitoring.OpenSearch/variables.tf
+++ b/modules/ManagementGovernance/Monitoring.OpenSearch/variables.tf
@ -0,0 +1,25 @@
+variable "cw-alarm-prefix" {
+  description = "Prefix prepended to every CloudWatch alarm name created by this module."
+  type        = string
+}
+
+variable "actions-enabled" {
+  description = "Whether the alarms execute their alarm/ok actions."
+  type        = bool
+  default     = true
+}
+
+variable "domain-name" {
+  description = "Name of the domain to monitor (used as the DomainName dimension)."
+  type        = string
+}
+
+# NOTE(review): the SNS topic defaults below hard-code a single region and
+# account ID; override them when deploying anywhere else.
+variable "alarm-actions-urgent" {
+  description = "SNS topic ARN notified by the alarms in this module."
+  type        = string
+  default     = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-urgent"
+}
+
+variable "alarm-actions-emergency" {
+  type    = string
+  default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-emergency"
+}
+
+variable "alarm-actions-standard" {
+  type    = string
+  default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-standard"
+}
+
+variable "alarm-actions-general" {
+  type    = string
+  default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support"
+}
+
+variable "default-tags" {
+  description = "Tags applied to the alarms."
+}
+
+variable "threshold-CPUUtilization" {
+  description = "CPUUtilization value above which the alarm fires."
+  type        = number
+}
+
+variable "threshold-SearchLatency" {
+  description = "SearchLatency value above which the alarm fires."
+  type        = number
+}
+
+variable "threshold-IndexingLatency" {
+  description = "IndexingLatency value above which the alarm fires."
+  type        = number
+}
+# variable threshold-KibanaHealthyNodes {}
--- a/modules/ManagementGovernance/Monitoring.RDS/main.tf
+++ b/modules/ManagementGovernance/Monitoring.RDS/main.tf
@ -1,17 +1,17 @@
-resource aws_cloudwatch_metric_alarm rds-cpu {
-  alarm_name = "RDS:CpuUtilization:${var.rds-instance-name}"
-  comparison_operator = "GreaterThanThreshold"
-  evaluation_periods = "1"
-  metric_name = "CPUUtilization"
-  period = "3600"
-  statistic = "Average"
-  threshold = var.threshold-CpuUtilization
-  alarm_description = "RDS:CpuUtilization"
-  namespace = "AWS/RDS"
+resource "aws_cloudwatch_metric_alarm" "rds-cpu" {
+  alarm_name                = "${var.cw-alarm-prefix}:RDS:CpuUtilization:${var.rds-instance-name}"
+  comparison_operator       = "GreaterThanThreshold"
+  evaluation_periods        = "1"
+  metric_name               = "CPUUtilization"
+  period                    = "3600"
+  statistic                 = "Average"
+  threshold                 = var.threshold-CpuUtilization
+  alarm_description         = "RDS:CpuUtilization"
+  namespace                 = "AWS/RDS"
  insufficient_data_actions = []
-  actions_enabled = "true"
-  alarm_actions = [var.alarm-actions-urgent]
-  ok_actions = [var.alarm-actions-urgent]
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-urgent]
+  ok_actions                = [var.alarm-actions-urgent]
  dimensions = {
    DBInstanceIdentifier = var.rds-instance-name
  }
@ -21,20 +21,20 @@ resource aws_cloudwatch_metric_alarm rds-cpu {
  }
 }

-resource aws_cloudwatch_metric_alarm rds-storage {
-  alarm_name = "RDS:FreeStorageSpace:${var.rds-instance-name}"
-  comparison_operator = "LessThanThreshold"
-  evaluation_periods = "1"
-  metric_name = "FreeStorageSpace"
-  period = "3600"
-  statistic = "Average"
-  threshold = var.threshold-FreeStorageSpace
-  alarm_description = "RDS:FreeStorageSpace"
-  namespace = "AWS/RDS"
+resource "aws_cloudwatch_metric_alarm" "rds-storage" {
+  alarm_name                = "${var.cw-alarm-prefix}:RDS:FreeStorageSpace:${var.rds-instance-name}"
+  comparison_operator       = "LessThanThreshold"
+  evaluation_periods        = "1"
+  metric_name               = "FreeStorageSpace"
+  period                    = "3600"
+  statistic                 = "Average"
+  threshold                 = var.threshold-FreeStorageSpace
+  alarm_description         = "RDS:FreeStorageSpace"
+  namespace                 = "AWS/RDS"
  insufficient_data_actions = []
-  actions_enabled = "true"
-  alarm_actions = [var.alarm-actions-urgent]
-  ok_actions = [var.alarm-actions-urgent]
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-urgent]
+  ok_actions                = [var.alarm-actions-urgent]
  dimensions = {
    DBInstanceIdentifier = var.rds-instance-name
  }
@ -44,20 +44,20 @@ resource aws_cloudwatch_metric_alarm rds-storage {
  }
 }

-resource aws_cloudwatch_metric_alarm rds-memory {
-  alarm_name = "RDS:FreeableMemory:${var.rds-instance-name}"
-  comparison_operator = "LessThanThreshold"
-  evaluation_periods = "1"
-  metric_name = "FreeableMemory"
-  period = "3600"
-  statistic = "Average"
-  threshold = var.threshold-FreeableMemory
-  alarm_description = "RDS:FreeableMemory"
-  namespace = "AWS/RDS"
+resource "aws_cloudwatch_metric_alarm" "rds-memory" {
+  alarm_name                = "${var.cw-alarm-prefix}:RDS:FreeableMemory:${var.rds-instance-name}"
+  comparison_operator       = "LessThanThreshold"
+  evaluation_periods        = "1"
+  metric_name               = "FreeableMemory"
+  period                    = "3600"
+  statistic                 = "Average"
+  threshold                 = var.threshold-FreeableMemory
+  alarm_description         = "RDS:FreeableMemory"
+  namespace                 = "AWS/RDS"
  insufficient_data_actions = []
-  actions_enabled = "true"
-  alarm_actions = [var.alarm-actions-urgent]
-  ok_actions = [var.alarm-actions-urgent]
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-urgent]
+  ok_actions                = [var.alarm-actions-urgent]
  dimensions = {
    DBInstanceIdentifier = var.rds-instance-name
  }
@ -67,20 +67,20 @@ resource aws_cloudwatch_metric_alarm rds-memory {
  }
 }

-resource aws_cloudwatch_metric_alarm rds-DiskQueueDepth {
-  alarm_name = "RDS:DiskQueueDepth:${var.rds-instance-name}"
-  comparison_operator = "GreaterThanThreshold"
-  evaluation_periods = "1"
-  metric_name = "DiskQueueDepth"
-  period = "300"
-  statistic = "Average"
-  threshold = var.threshold-DiskQueueDepth
-  alarm_description = "RDS:DiskQueueDepth"
-  namespace = "AWS/RDS"
+resource "aws_cloudwatch_metric_alarm" "rds-DiskQueueDepth" {
+  alarm_name                = "${var.cw-alarm-prefix}:RDS:DiskQueueDepth:${var.rds-instance-name}"
+  comparison_operator       = "GreaterThanThreshold"
+  evaluation_periods        = "1"
+  metric_name               = "DiskQueueDepth"
+  period                    = "300"
+  statistic                 = "Average"
+  threshold                 = var.threshold-DiskQueueDepth
+  alarm_description         = "RDS:DiskQueueDepth"
+  namespace                 = "AWS/RDS"
  insufficient_data_actions = []
-  actions_enabled = "true"
-  alarm_actions = [var.alarm-actions-urgent]
-  ok_actions = [var.alarm-actions-urgent]
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-urgent]
+  ok_actions                = [var.alarm-actions-urgent]
  dimensions = {
    DBInstanceIdentifier = var.rds-instance-name
  }
@ -90,20 +90,20 @@ resource aws_cloudwatch_metric_alarm rds-DiskQueueDepth {
  }
 }

-resource aws_cloudwatch_metric_alarm rds-ReadLatency {
-  alarm_name = "RDS:ReadLatency:${var.rds-instance-name}"
-  comparison_operator = "GreaterThanThreshold"
-  evaluation_periods = "2"
-  metric_name = "ReadLatency"
-  period = "900"
-  statistic = "Average"
-  threshold = var.threshold-ReadLatency
-  alarm_description = "RDS:ReadLatency"
-  namespace = "AWS/RDS"
+resource "aws_cloudwatch_metric_alarm" "rds-ReadLatency" {
+  alarm_name                = "${var.cw-alarm-prefix}:RDS:ReadLatency:${var.rds-instance-name}"
+  comparison_operator       = "GreaterThanThreshold"
+  evaluation_periods        = "2"
+  metric_name               = "ReadLatency"
+  period                    = "900"
+  statistic                 = "Average"
+  threshold                 = var.threshold-ReadLatency
+  alarm_description         = "RDS:ReadLatency"
+  namespace                 = "AWS/RDS"
  insufficient_data_actions = []
-  actions_enabled = "true"
-  alarm_actions = [var.alarm-actions-urgent]
-  ok_actions = [var.alarm-actions-urgent]
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-urgent]
+  ok_actions                = [var.alarm-actions-urgent]
  dimensions = {
    DBInstanceIdentifier = var.rds-instance-name
  }
@ -113,20 +113,20 @@ resource aws_cloudwatch_metric_alarm rds-ReadLatency {
  }
 }

-resource aws_cloudwatch_metric_alarm rds-WriteLatency {
-  alarm_name = "RDS:WriteLatency:${var.rds-instance-name}"
-  comparison_operator = "GreaterThanThreshold"
-  evaluation_periods = "2"
-  metric_name = "WriteLatency"
-  period = "900"
-  statistic = "Average"
-  threshold = var.threshold-WriteLatency
-  alarm_description = "RDS:WriteLatency"
-  namespace = "AWS/RDS"
+resource "aws_cloudwatch_metric_alarm" "rds-WriteLatency" {
+  alarm_name                = "${var.cw-alarm-prefix}:RDS:WriteLatency:${var.rds-instance-name}"
+  comparison_operator       = "GreaterThanThreshold"
+  evaluation_periods        = "2"
+  metric_name               = "WriteLatency"
+  period                    = "900"
+  statistic                 = "Average"
+  threshold                 = var.threshold-WriteLatency
+  alarm_description         = "RDS:WriteLatency"
+  namespace                 = "AWS/RDS"
  insufficient_data_actions = []
-  actions_enabled = "true"
-  alarm_actions = [var.alarm-actions-urgent]
-  ok_actions = [var.alarm-actions-urgent]
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-urgent]
+  ok_actions                = [var.alarm-actions-urgent]
  dimensions = {
    DBInstanceIdentifier = var.rds-instance-name
  }
--- a/modules/ManagementGovernance/Monitoring.RDS/variables.tf
+++ b/modules/ManagementGovernance/Monitoring.RDS/variables.tf
@ -1,3 +1,5 @@
+variable "cw-alarm-prefix" {
+  description = "Prefix prepended to every CloudWatch alarm name created by this module."
+  type        = string
+}
+variable "actions-enabled" {
+  description = "Whether the alarms execute their alarm/ok actions."
+  type        = bool
+  # Matches the value that was hard-coded in main.tf before this variable existed.
+  default = true
+}
 variable rds-instance-name {}
 variable alarm-actions-urgent {
  type = string
--- a/modules/ManagementGovernance/Monitoring.Redis/main.tf
+++ b/modules/ManagementGovernance/Monitoring.Redis/main.tf
@ -1,17 +1,17 @@
-resource aws_cloudwatch_metric_alarm redis-EngineCPUUtilization {
-  alarm_name = "Redis:EngineCPUUtilization:${var.redis-cluster-id}"
-  comparison_operator = "GreaterThanThreshold"
-  evaluation_periods = "1"
-  metric_name = "EngineCPUUtilization"
-  period = "3600"
-  statistic = "Average"
-  threshold = var.threshold-EngineCPUUtilization
-  alarm_description = "Redis:EngineCPUUtilization"
-  namespace = "AWS/ElastiCache"
+resource "aws_cloudwatch_metric_alarm" "redis-EngineCPUUtilization" {
+  alarm_name                = "${var.cw-alarm-prefix}:Redis:EngineCPUUtilization:${var.redis-cluster-id}"
+  comparison_operator       = "GreaterThanThreshold"
+  evaluation_periods        = "1"
+  metric_name               = "EngineCPUUtilization"
+  period                    = "3600"
+  statistic                 = "Average"
+  threshold                 = var.threshold-EngineCPUUtilization
+  alarm_description         = "Redis:EngineCPUUtilization"
+  namespace                 = "AWS/ElastiCache"
  insufficient_data_actions = []
-  actions_enabled = "true"
-  alarm_actions = [var.alarm-actions-urgent]
-  ok_actions = [var.alarm-actions-urgent]
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-urgent]
+  ok_actions                = [var.alarm-actions-urgent]
  dimensions = {
    CacheClusterId = var.redis-cluster-id
  }
@ -21,20 +21,20 @@ resource aws_cloudwatch_metric_alarm redis-EngineCPUUtilization {
  }
 }

-resource aws_cloudwatch_metric_alarm redis-DatabaseMemoryUsagePercentage {
-  alarm_name = "Redis:DatabaseMemoryUsagePercentage:${var.redis-cluster-id}"
-  comparison_operator = "GreaterThanThreshold"
-  evaluation_periods = "1"
-  metric_name = "DatabaseMemoryUsagePercentage"
-  period = "3600"
-  statistic = "Average"
-  threshold = var.threshold-DatabaseMemoryUsagePercentage
-  alarm_description = "Redis:DatabaseMemoryUsagePercentage"
-  namespace = "AWS/ElastiCache"
+resource "aws_cloudwatch_metric_alarm" "redis-DatabaseMemoryUsagePercentage" {
+  alarm_name                = "${var.cw-alarm-prefix}:Redis:DatabaseMemoryUsagePercentage:${var.redis-cluster-id}"
+  comparison_operator       = "GreaterThanThreshold"
+  evaluation_periods        = "1"
+  metric_name               = "DatabaseMemoryUsagePercentage"
+  period                    = "3600"
+  statistic                 = "Average"
+  threshold                 = var.threshold-DatabaseMemoryUsagePercentage
+  alarm_description         = "Redis:DatabaseMemoryUsagePercentage"
+  namespace                 = "AWS/ElastiCache"
  insufficient_data_actions = []
-  actions_enabled = "true"
-  alarm_actions = [var.alarm-actions-urgent]
-  ok_actions = [var.alarm-actions-urgent]
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-urgent]
+  ok_actions                = [var.alarm-actions-urgent]
  dimensions = {
    CacheClusterId = var.redis-cluster-id
  }
@ -44,26 +44,20 @@ resource aws_cloudwatch_metric_alarm redis-DatabaseMemoryUsagePercentage {
  }
 }

-/*
-data aws_elasticache_cluster redis-cluster {
-  cluster_id = var.redis-cluster-id
-}
-*/
-resource aws_cloudwatch_metric_alarm redis-CacheHitRate {
-  # for_each = toset(data.aws_elasticache_cluster.redis-cluster.cache_nodes.*.id)
-  alarm_name = "Redis:CacheHitRate:${var.redis-cluster-id}"
-  comparison_operator = "LessThanThreshold"
-  evaluation_periods = "4"
-  metric_name = "CacheHitRate"
-  period = "900"
-  statistic = "Average"
-  threshold = var.threshold-CacheHitRate
-  alarm_description = "Redis:CacheHitRate"
-  namespace = "AWS/ElastiCache"
+resource "aws_cloudwatch_metric_alarm" "redis-CacheHitRate" {
+  alarm_name                = "${var.cw-alarm-prefix}:Redis:CacheHitRate:${var.redis-cluster-id}"
+  comparison_operator       = "LessThanThreshold"
+  evaluation_periods        = "4"
+  metric_name               = "CacheHitRate"
+  period                    = "900"
+  statistic                 = "Average"
+  threshold                 = var.threshold-CacheHitRate
+  alarm_description         = "Redis:CacheHitRate"
+  namespace                 = "AWS/ElastiCache"
  insufficient_data_actions = []
-  actions_enabled = "true"
-  alarm_actions = [var.alarm-actions-standard]
-  ok_actions = [var.alarm-actions-standard]
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.alarm-actions-standard]
+  ok_actions                = [var.alarm-actions-standard]
  dimensions = {
    CacheClusterId = var.redis-cluster-id
    # CacheNodeId = each.value
--- a/modules/ManagementGovernance/Monitoring.Redis/variables.tf
+++ b/modules/ManagementGovernance/Monitoring.Redis/variables.tf
@ -1,22 +1,24 @@
-variable redis-cluster-id {}
-variable alarm-actions-urgent {
-  type = string
+variable "cw-alarm-prefix" {} # prefix prepended to the CloudWatch alarm names built in this module
+variable "actions-enabled" {}
+variable "redis-cluster-id" {}
+variable "alarm-actions-urgent" {
+  type    = string
  default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-urgent"
 }
-variable alarm-actions-emergency {
-  type = string
+variable "alarm-actions-emergency" {
+  type    = string
  default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-emergency"
 }
-variable alarm-actions-standard {
-  type = string
+variable "alarm-actions-standard" {
+  type    = string
  default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-standard"
 }
-variable alarm-actions-general {
-  type = string
+variable "alarm-actions-general" {
+  type    = string
  default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support"
 }
-variable default-tags {}
+variable "default-tags" {}

-variable threshold-EngineCPUUtilization {}
-variable threshold-DatabaseMemoryUsagePercentage {}
-variable threshold-CacheHitRate {}
+variable "threshold-EngineCPUUtilization" {}
+variable "threshold-DatabaseMemoryUsagePercentage" {}
+variable "threshold-CacheHitRate" {}
--- a/modules/util/resource-list/list-alb.sh
+++ b/modules/util/resource-list/list-alb.sh
@ -1,3 +1,4 @@
 #!/bin/bash
 RESULTS=$(aws elbv2 describe-load-balancers --query 'LoadBalancers[?Type==`application`].LoadBalancerArn' --output text --no-cli-pager | sed 's/\t/\n/g' | sort | xargs)
 jq -n --arg result "$RESULTS" '{"result":$result}'
+
--- a/modules/util/resource-list/list-asg.sh
+++ b/modules/util/resource-list/list-asg.sh
@ -0,0 +1,6 @@
+#!/bin/bash
+# Print {"result": "<names>"}: Auto Scaling group names from describe-auto-scaling-groups, sorted and joined by single spaces.
+RESULTS=$(aws autoscaling describe-auto-scaling-groups --query 'AutoScalingGroups[*].AutoScalingGroupName' --output text --no-cli-pager | sed 's/\t/\n/g' | sort | xargs)
+jq -n --arg result "$RESULTS" '{"result":$result}'
+
+
--- a/modules/util/resource-list/list-kafka.sh
+++ b/modules/util/resource-list/list-kafka.sh
@ -0,0 +1,6 @@
+#!/bin/bash
+# Print {"result": "<names>"}: cluster names from `aws kafka list-clusters`, sorted and joined by single spaces.
+RESULTS=$(aws kafka list-clusters --query 'ClusterInfoList[*].ClusterName' --output text --no-cli-pager | sed 's/\t/\n/g' | sort | xargs)
+jq -n --arg result "$RESULTS" '{"result":$result}'
+
+
--- a/modules/util/resource-list/list-opensearch.sh
+++ b/modules/util/resource-list/list-opensearch.sh
@ -0,0 +1,6 @@
+#!/bin/bash
+# Print {"result": "<names>"}: domain names from `aws opensearch list-domain-names`, sorted and joined by single spaces.
+RESULTS=$(aws opensearch list-domain-names --query 'DomainNames[*].DomainName' --output text --no-cli-pager | sed 's/\t/\n/g' | sort | xargs)
+jq -n --arg result "$RESULTS" '{"result":$result}'
+
+