From b3ba6f2441f0effbc797862c24cf7d83d4f45b6c Mon Sep 17 00:00:00 2001 From: KF Date: Wed, 26 Oct 2022 11:13:56 +0800 Subject: [PATCH] UPD: Added more monitoring modules and various enhancements --- .../Monitoring.ALB/main.tf | 34 ++-- .../Monitoring.ALB/variables.tf | 3 +- .../Monitoring.ASG/README.md | 5 + .../Monitoring.ASG/main.tf | 22 +++ .../Monitoring.ASG/provider.tf | 9 + .../Monitoring.ASG/variables.tf | 22 +++ .../Monitoring.EC2/main.tf | 12 +- .../Monitoring.EC2/variables.tf | 2 + .../Monitoring.EMR/main.tf | 8 +- .../Monitoring.EMR/variables.tf | 2 + .../Monitoring.Kafka/README.md | 5 + .../Monitoring.Kafka/main.tf | 22 +++ .../Monitoring.Kafka/provider.tf | 9 + .../Monitoring.Kafka/variables.tf | 22 +++ .../Monitoring.NLB/main.tf | 4 +- .../Monitoring.NLB/variables.tf | 3 +- .../Monitoring.OpenSearch/README.md | 5 + .../Monitoring.OpenSearch/main.tf | 98 +++++++++++ .../Monitoring.OpenSearch/provider.tf | 9 + .../Monitoring.OpenSearch/variables.tf | 25 +++ .../Monitoring.RDS/main.tf | 156 +++++++++--------- .../Monitoring.RDS/variables.tf | 2 + .../Monitoring.Redis/main.tf | 84 +++++----- .../Monitoring.Redis/variables.tf | 28 ++-- modules/util/resource-list/list-alb.sh | 1 + modules/util/resource-list/list-asg.sh | 6 + modules/util/resource-list/list-kafka.sh | 6 + modules/util/resource-list/list-opensearch.sh | 6 + 28 files changed, 443 insertions(+), 167 deletions(-) create mode 100644 modules/ManagementGovernance/Monitoring.ASG/README.md create mode 100644 modules/ManagementGovernance/Monitoring.ASG/main.tf create mode 100644 modules/ManagementGovernance/Monitoring.ASG/provider.tf create mode 100644 modules/ManagementGovernance/Monitoring.ASG/variables.tf create mode 100644 modules/ManagementGovernance/Monitoring.Kafka/README.md create mode 100644 modules/ManagementGovernance/Monitoring.Kafka/main.tf create mode 100644 modules/ManagementGovernance/Monitoring.Kafka/provider.tf create mode 100644 
modules/ManagementGovernance/Monitoring.Kafka/variables.tf create mode 100644 modules/ManagementGovernance/Monitoring.OpenSearch/README.md create mode 100644 modules/ManagementGovernance/Monitoring.OpenSearch/main.tf create mode 100644 modules/ManagementGovernance/Monitoring.OpenSearch/provider.tf create mode 100644 modules/ManagementGovernance/Monitoring.OpenSearch/variables.tf create mode 100755 modules/util/resource-list/list-asg.sh create mode 100755 modules/util/resource-list/list-kafka.sh create mode 100755 modules/util/resource-list/list-opensearch.sh diff --git a/modules/ManagementGovernance/Monitoring.ALB/main.tf b/modules/ManagementGovernance/Monitoring.ALB/main.tf index 6f91401..7e3f243 100644 --- a/modules/ManagementGovernance/Monitoring.ALB/main.tf +++ b/modules/ManagementGovernance/Monitoring.ALB/main.tf @@ -1,28 +1,28 @@ -data external alb-targetgroups { +data "external" "alb-targetgroups" { program = ["bash", "../../modules/ManagementGovernance/Monitoring.ALB/list-alb-targetgroups.sh"] query = { lb = var.load-balancer } } -resource aws_cloudwatch_metric_alarm alb-HealthyHostCount { - for_each = toset(split(" ", data.external.alb-targetgroups.result.result)) - alarm_name = "ALBTG:HealthyHostCount:${split("/", each.value)[1]}/${split("/", each.value)[2]}" - comparison_operator = "LessThanThreshold" - evaluation_periods = "1" - metric_name = "HealthyHostCount" - period = "300" - statistic = "Minimum" - threshold = var.threshold-HealthHostCountMin - alarm_description = "ALBTG:HealthyHostCount" - namespace = "AWS/ApplicationELB" +resource "aws_cloudwatch_metric_alarm" "alb-HealthyHostCount" { + for_each = toset(split(" ", data.external.alb-targetgroups.result.result)) + alarm_name = "${var.cw-alarm-prefix}:ALBTG:HealthyHostCount:${split("/", each.value)[1]}/${split("/", each.value)[2]}" + comparison_operator = "LessThanThreshold" + evaluation_periods = "1" + metric_name = "HealthyHostCount" + period = "300" + statistic = "Minimum" + threshold = 
var.threshold-HealthHostCountMin + alarm_description = "ALBTG:HealthyHostCount" + namespace = "AWS/ApplicationELB" insufficient_data_actions = [] - actions_enabled = "true" - alarm_actions = [var.alarm-actions-emergency] - ok_actions = [var.alarm-actions-emergency] + actions_enabled = var.actions-enabled + alarm_actions = [var.alarm-actions-emergency] + ok_actions = [var.alarm-actions-emergency] dimensions = { - TargetGroup = "targetgroup/${split("/", each.value)[1]}/${split("/", each.value)[2]}" - LoadBalancer = "app/${split("/",var.load-balancer)[2]}/${split("/",var.load-balancer)[3]}" + TargetGroup = "targetgroup/${split("/", each.value)[1]}/${split("/", each.value)[2]}" + LoadBalancer = "app/${split("/", var.load-balancer)[2]}/${split("/", var.load-balancer)[3]}" } tags = var.default-tags lifecycle { diff --git a/modules/ManagementGovernance/Monitoring.ALB/variables.tf b/modules/ManagementGovernance/Monitoring.ALB/variables.tf index 70e74cc..54fa842 100644 --- a/modules/ManagementGovernance/Monitoring.ALB/variables.tf +++ b/modules/ManagementGovernance/Monitoring.ALB/variables.tf @@ -1,4 +1,5 @@ -# variable target-group {} +variable cw-alarm-prefix {} +variable actions-enabled {} variable load-balancer {} variable threshold-HealthHostCountMin {} variable alarm-actions-urgent { diff --git a/modules/ManagementGovernance/Monitoring.ASG/README.md b/modules/ManagementGovernance/Monitoring.ASG/README.md new file mode 100644 index 0000000..744b521 --- /dev/null +++ b/modules/ManagementGovernance/Monitoring.ASG/README.md @@ -0,0 +1,5 @@ +# Monitoring module for BEA +This module deploys the default cloudwatch metric monitoring + +## Notes +Terraform lifecycle ignores tags to speed up terraform subsequent update. Cloudwatch alarm tags cannot be read on aws console anyway. 
\ No newline at end of file diff --git a/modules/ManagementGovernance/Monitoring.ASG/main.tf b/modules/ManagementGovernance/Monitoring.ASG/main.tf new file mode 100644 index 0000000..16e2cb2 --- /dev/null +++ b/modules/ManagementGovernance/Monitoring.ASG/main.tf @@ -0,0 +1,22 @@ +resource "aws_cloudwatch_metric_alarm" "asg-CPUUtilization" { + alarm_name = "${var.cw-alarm-prefix}:ASG:CPUUtilization:${var.asg-name}" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "3" + metric_name = "CPUUtilization" + period = "1800" + statistic = "Average" + threshold = var.threshold-CPUUtilization + alarm_description = "ASG:CPUUtilization" + namespace = "AWS/EC2" + insufficient_data_actions = [] + actions_enabled = var.actions-enabled + alarm_actions = [var.alarm-actions-urgent] + ok_actions = [var.alarm-actions-urgent] + dimensions = { + AutoScalingGroupName = var.asg-name + } + tags = var.default-tags + lifecycle { + ignore_changes = [tags] + } +} diff --git a/modules/ManagementGovernance/Monitoring.ASG/provider.tf b/modules/ManagementGovernance/Monitoring.ASG/provider.tf new file mode 100644 index 0000000..7b64cf5 --- /dev/null +++ b/modules/ManagementGovernance/Monitoring.ASG/provider.tf @@ -0,0 +1,9 @@ +terraform { + required_version = "~> 1.3.0" + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 4.36.1" + } + } +} diff --git a/modules/ManagementGovernance/Monitoring.ASG/variables.tf b/modules/ManagementGovernance/Monitoring.ASG/variables.tf new file mode 100644 index 0000000..68e5516 --- /dev/null +++ b/modules/ManagementGovernance/Monitoring.ASG/variables.tf @@ -0,0 +1,22 @@ +variable cw-alarm-prefix {} +variable actions-enabled {} +variable asg-name {} +variable alarm-actions-urgent { + type = string + default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-urgent" +} +variable alarm-actions-emergency { + type = string + default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-emergency" +} +variable 
alarm-actions-standard { + type = string + default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-standard" +} +variable alarm-actions-general { + type = string + default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support" +} +variable default-tags {} + +variable threshold-CPUUtilization {} \ No newline at end of file diff --git a/modules/ManagementGovernance/Monitoring.EC2/main.tf b/modules/ManagementGovernance/Monitoring.EC2/main.tf index 6035276..ca5bd57 100644 --- a/modules/ManagementGovernance/Monitoring.EC2/main.tf +++ b/modules/ManagementGovernance/Monitoring.EC2/main.tf @@ -1,5 +1,5 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" { - alarm_name = "EC2:StatusCheckFailed_System:${var.ec2-instance-id}" + alarm_name = "${var.cw-alarm-prefix}:EC2:StatusCheckFailed_System:${var.ec2-instance-id}" comparison_operator = "GreaterThanThreshold" evaluation_periods = "1" metric_name = "StatusCheckFailed_System" @@ -9,7 +9,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" { alarm_description = "EC2:StatusCheckFailed_System" namespace = "AWS/EC2" insufficient_data_actions = [] - actions_enabled = "true" + actions_enabled = var.actions-enabled alarm_actions = [var.alarm-actions-emergency] ok_actions = [var.alarm-actions-emergency] dimensions = { @@ -22,7 +22,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" { } resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" { - alarm_name = "EC2:StatusCheckFailed_Instance:${var.ec2-instance-id}" + alarm_name = "${var.cw-alarm-prefix}:EC2:StatusCheckFailed_Instance:${var.ec2-instance-id}" comparison_operator = "GreaterThanThreshold" evaluation_periods = "1" metric_name = "StatusCheckFailed_Instance" @@ -32,7 +32,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" { alarm_description = "EC2:StatusCheckFailed_Instance" namespace = "AWS/EC2" insufficient_data_actions = [] - actions_enabled = "true" + 
actions_enabled = var.actions-enabled alarm_actions = [var.alarm-actions-emergency] ok_actions = [var.alarm-actions-emergency] dimensions = { @@ -45,7 +45,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" { } resource "aws_cloudwatch_metric_alarm" "ec2-CPUUtilization" { - alarm_name = "EC2:CPUUtilization:${var.ec2-instance-id}" + alarm_name = "${var.cw-alarm-prefix}:EC2:CPUUtilization:${var.ec2-instance-id}" comparison_operator = "GreaterThanThreshold" evaluation_periods = "6" metric_name = "CPUUtilization" @@ -55,7 +55,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-CPUUtilization" { alarm_description = "EC2:CPUUtilization" namespace = "AWS/EC2" insufficient_data_actions = [] - actions_enabled = "true" + actions_enabled = var.actions-enabled alarm_actions = [var.alarm-actions-urgent] ok_actions = [var.alarm-actions-urgent] treat_missing_data = "notBreaching" diff --git a/modules/ManagementGovernance/Monitoring.EC2/variables.tf b/modules/ManagementGovernance/Monitoring.EC2/variables.tf index c83fc2b..f789798 100644 --- a/modules/ManagementGovernance/Monitoring.EC2/variables.tf +++ b/modules/ManagementGovernance/Monitoring.EC2/variables.tf @@ -1,3 +1,5 @@ +variable cw-alarm-prefix {} +variable actions-enabled {} variable ec2-instance-id {} variable alarm-actions-urgent { type = string diff --git a/modules/ManagementGovernance/Monitoring.EMR/main.tf b/modules/ManagementGovernance/Monitoring.EMR/main.tf index c6cdc59..f973ca3 100644 --- a/modules/ManagementGovernance/Monitoring.EMR/main.tf +++ b/modules/ManagementGovernance/Monitoring.EMR/main.tf @@ -1,5 +1,5 @@ resource "aws_cloudwatch_metric_alarm" "emr-AppsPending" { - alarm_name = "EMR:AppsPending:${var.job-flow-id}" + alarm_name = "${var.cw-alarm-prefix}:EMR:AppsPending:${var.job-flow-id}" comparison_operator = "GreaterThanThreshold" evaluation_periods = "1" metric_name = "AppsPending" @@ -9,7 +9,7 @@ resource "aws_cloudwatch_metric_alarm" "emr-AppsPending" { alarm_description = 
"EMR:AppsPending" namespace = "AWS/ElasticMapReduce" insufficient_data_actions = [] - actions_enabled = "true" + actions_enabled = var.actions-enabled alarm_actions = [var.alarm-actions-standard] ok_actions = [var.alarm-actions-standard] dimensions = { @@ -22,7 +22,7 @@ resource "aws_cloudwatch_metric_alarm" "emr-AppsPending" { } resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" { - alarm_name = "EMR:CapacityRemainingGB:${var.job-flow-id}" + alarm_name = "${var.cw-alarm-prefix}:EMR:CapacityRemainingGB:${var.job-flow-id}" comparison_operator = "LessThanThreshold" evaluation_periods = "1" metric_name = "CapacityRemainingGB" @@ -32,7 +32,7 @@ resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" { alarm_description = "EMR:CapacityRemainingGB" namespace = "AWS/ElasticMapReduce" insufficient_data_actions = [] - actions_enabled = "true" + actions_enabled = var.actions-enabled alarm_actions = [var.alarm-actions-urgent] ok_actions = [var.alarm-actions-urgent] dimensions = { diff --git a/modules/ManagementGovernance/Monitoring.EMR/variables.tf b/modules/ManagementGovernance/Monitoring.EMR/variables.tf index 84e9efa..a7941bc 100644 --- a/modules/ManagementGovernance/Monitoring.EMR/variables.tf +++ b/modules/ManagementGovernance/Monitoring.EMR/variables.tf @@ -1,3 +1,5 @@ +variable cw-alarm-prefix {} +variable actions-enabled {} variable job-flow-id {} variable threshold-AppsPending {} variable threshold-CapacityRemainingGB {} diff --git a/modules/ManagementGovernance/Monitoring.Kafka/README.md b/modules/ManagementGovernance/Monitoring.Kafka/README.md new file mode 100644 index 0000000..744b521 --- /dev/null +++ b/modules/ManagementGovernance/Monitoring.Kafka/README.md @@ -0,0 +1,5 @@ +# Monitoring module for BEA +This module deploys the default cloudwatch metric monitoring + +## Notes +Terraform lifecycle ignores tags to speed up terraform subsequent update. Cloudwatch alarm tags cannot be read on aws console anyway. 
\ No newline at end of file diff --git a/modules/ManagementGovernance/Monitoring.Kafka/main.tf b/modules/ManagementGovernance/Monitoring.Kafka/main.tf new file mode 100644 index 0000000..ac8303b --- /dev/null +++ b/modules/ManagementGovernance/Monitoring.Kafka/main.tf @@ -0,0 +1,22 @@ +resource "aws_cloudwatch_metric_alarm" "Kafka-ZooKeeperRequestLatencyMsMean" { + alarm_name = "${var.cw-alarm-prefix}:Kafka:ZooKeeperRequestLatencyMsMean:${var.cluster-name}" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "3" + metric_name = "ZooKeeperRequestLatencyMsMean" + period = "1800" + statistic = "Average" + threshold = var.threshold-ZooKeeperRequestLatencyMsMean + alarm_description = "Kafka:ZooKeeperRequestLatencyMsMean" + namespace = "AWS/Kafka" + insufficient_data_actions = [] + actions_enabled = var.actions-enabled + alarm_actions = [var.alarm-actions-urgent] + ok_actions = [var.alarm-actions-urgent] + dimensions = { + "Cluster Name" = var.cluster-name + } + tags = var.default-tags + lifecycle { + ignore_changes = [tags] + } +} diff --git a/modules/ManagementGovernance/Monitoring.Kafka/provider.tf b/modules/ManagementGovernance/Monitoring.Kafka/provider.tf new file mode 100644 index 0000000..7b64cf5 --- /dev/null +++ b/modules/ManagementGovernance/Monitoring.Kafka/provider.tf @@ -0,0 +1,9 @@ +terraform { + required_version = "~> 1.3.0" + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 4.36.1" + } + } +} diff --git a/modules/ManagementGovernance/Monitoring.Kafka/variables.tf b/modules/ManagementGovernance/Monitoring.Kafka/variables.tf new file mode 100644 index 0000000..705b0c2 --- /dev/null +++ b/modules/ManagementGovernance/Monitoring.Kafka/variables.tf @@ -0,0 +1,22 @@ +variable cw-alarm-prefix {} +variable actions-enabled {} +variable cluster-name {} +variable alarm-actions-urgent { + type = string + default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-urgent" +} +variable alarm-actions-emergency { + type = 
string + default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-emergency" +} +variable alarm-actions-standard { + type = string + default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-standard" +} +variable alarm-actions-general { + type = string + default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support" +} +variable default-tags {} + +variable threshold-ZooKeeperRequestLatencyMsMean {} diff --git a/modules/ManagementGovernance/Monitoring.NLB/main.tf b/modules/ManagementGovernance/Monitoring.NLB/main.tf index e8786c5..be3aa8c 100644 --- a/modules/ManagementGovernance/Monitoring.NLB/main.tf +++ b/modules/ManagementGovernance/Monitoring.NLB/main.tf @@ -16,7 +16,7 @@ module "nlb-targetgroups" { resource "aws_cloudwatch_metric_alarm" "nlb-HealthyHostCount" { for_each = module.nlb-targetgroups.result-set - alarm_name = "NLBTG:HealthyHostCount:${split(":", each.value)[5]}" + alarm_name = "${var.cw-alarm-prefix}:NLBTG:HealthyHostCount:${split(":", each.value)[5]}" comparison_operator = "LessThanThreshold" evaluation_periods = "1" metric_name = "HealthyHostCount" @@ -26,7 +26,7 @@ resource "aws_cloudwatch_metric_alarm" "nlb-HealthyHostCount" { alarm_description = "NLBTG:HealthyHostCount" namespace = "AWS/NetworkELB" insufficient_data_actions = [] - actions_enabled = "true" + actions_enabled = var.actions-enabled alarm_actions = [var.alarm-actions-emergency] ok_actions = [var.alarm-actions-emergency] dimensions = { diff --git a/modules/ManagementGovernance/Monitoring.NLB/variables.tf b/modules/ManagementGovernance/Monitoring.NLB/variables.tf index 70e74cc..54fa842 100644 --- a/modules/ManagementGovernance/Monitoring.NLB/variables.tf +++ b/modules/ManagementGovernance/Monitoring.NLB/variables.tf @@ -1,4 +1,5 @@ -# variable target-group {} +variable cw-alarm-prefix {} +variable actions-enabled {} variable load-balancer {} variable threshold-HealthHostCountMin {} variable alarm-actions-urgent { diff --git 
a/modules/ManagementGovernance/Monitoring.OpenSearch/README.md b/modules/ManagementGovernance/Monitoring.OpenSearch/README.md new file mode 100644 index 0000000..744b521 --- /dev/null +++ b/modules/ManagementGovernance/Monitoring.OpenSearch/README.md @@ -0,0 +1,5 @@ +# Monitoring module for BEA +This module deploys the default cloudwatch metric monitoring + +## Notes +Terraform lifecycle ignores tags to speed up terraform subsequent update. Cloudwatch alarm tags cannot be read on aws console anyway. \ No newline at end of file diff --git a/modules/ManagementGovernance/Monitoring.OpenSearch/main.tf b/modules/ManagementGovernance/Monitoring.OpenSearch/main.tf new file mode 100644 index 0000000..57dff7b --- /dev/null +++ b/modules/ManagementGovernance/Monitoring.OpenSearch/main.tf @@ -0,0 +1,98 @@ +data "aws_caller_identity" "this" {} + +resource "aws_cloudwatch_metric_alarm" "ES-CPUUtilization" { + alarm_name = "${var.cw-alarm-prefix}:ES:CPUUtilization:${var.domain-name}" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "3" + metric_name = "CPUUtilization" + period = "1800" + statistic = "Average" + threshold = var.threshold-CPUUtilization + alarm_description = "ES:CPUUtilization" + namespace = "AWS/ES" + insufficient_data_actions = [] + actions_enabled = var.actions-enabled + alarm_actions = [var.alarm-actions-urgent] + ok_actions = [var.alarm-actions-urgent] + dimensions = { + DomainName = var.domain-name + ClientId = data.aws_caller_identity.this.id + } + tags = var.default-tags + lifecycle { + ignore_changes = [tags] + } +} + +resource "aws_cloudwatch_metric_alarm" "ES-SearchLatency" { + alarm_name = "${var.cw-alarm-prefix}:ES:SearchLatency:${var.domain-name}" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "3" + metric_name = "SearchLatency" + period = "1800" + statistic = "Average" + threshold = var.threshold-SearchLatency + alarm_description = "ES:SearchLatency" + namespace = "AWS/ES" + insufficient_data_actions = [] 
+ actions_enabled = var.actions-enabled + alarm_actions = [var.alarm-actions-urgent] + ok_actions = [var.alarm-actions-urgent] + dimensions = { + DomainName = var.domain-name + ClientId = data.aws_caller_identity.this.id + } + tags = var.default-tags + lifecycle { + ignore_changes = [tags] + } +} + +resource "aws_cloudwatch_metric_alarm" "ES-IndexingLatency" { + alarm_name = "${var.cw-alarm-prefix}:ES:IndexingLatency:${var.domain-name}" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "3" + metric_name = "IndexingLatency" + period = "1800" + statistic = "Average" + threshold = var.threshold-IndexingLatency + alarm_description = "ES:IndexingLatency" + namespace = "AWS/ES" + insufficient_data_actions = [] + actions_enabled = var.actions-enabled + alarm_actions = [var.alarm-actions-urgent] + ok_actions = [var.alarm-actions-urgent] + dimensions = { + DomainName = var.domain-name + ClientId = data.aws_caller_identity.this.id + } + tags = var.default-tags + lifecycle { + ignore_changes = [tags] + } +} + +resource "aws_cloudwatch_metric_alarm" "ES-ClusterStatusRed" { + alarm_name = "${var.cw-alarm-prefix}:ES:ClusterStatusRed:${var.domain-name}" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "2" + metric_name = "ClusterStatus.red" + period = "900" + statistic = "Maximum" + threshold = 0 + alarm_description = "At least one primary shard and its replicas aren't allocated to a node." 
+ namespace = "AWS/ES" + insufficient_data_actions = [] + actions_enabled = var.actions-enabled + alarm_actions = [var.alarm-actions-urgent] + ok_actions = [var.alarm-actions-urgent] + dimensions = { + DomainName = var.domain-name + ClientId = data.aws_caller_identity.this.id + } + tags = var.default-tags + lifecycle { + ignore_changes = [tags] + } +} + diff --git a/modules/ManagementGovernance/Monitoring.OpenSearch/provider.tf b/modules/ManagementGovernance/Monitoring.OpenSearch/provider.tf new file mode 100644 index 0000000..7b64cf5 --- /dev/null +++ b/modules/ManagementGovernance/Monitoring.OpenSearch/provider.tf @@ -0,0 +1,9 @@ +terraform { + required_version = "~> 1.3.0" + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 4.36.1" + } + } +} diff --git a/modules/ManagementGovernance/Monitoring.OpenSearch/variables.tf b/modules/ManagementGovernance/Monitoring.OpenSearch/variables.tf new file mode 100644 index 0000000..28100c4 --- /dev/null +++ b/modules/ManagementGovernance/Monitoring.OpenSearch/variables.tf @@ -0,0 +1,25 @@ +variable cw-alarm-prefix {} +variable actions-enabled {} +variable domain-name {} +variable alarm-actions-urgent { + type = string + default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-urgent" +} +variable alarm-actions-emergency { + type = string + default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-emergency" +} +variable alarm-actions-standard { + type = string + default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-standard" +} +variable alarm-actions-general { + type = string + default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support" +} +variable default-tags {} + +variable threshold-CPUUtilization {} +variable threshold-SearchLatency {} +variable threshold-IndexingLatency {} +# variable threshold-KibanaHealthyNodes {} \ No newline at end of file diff --git a/modules/ManagementGovernance/Monitoring.RDS/main.tf b/modules/ManagementGovernance/Monitoring.RDS/main.tf index 
3af6201..63d4e5c 100644 --- a/modules/ManagementGovernance/Monitoring.RDS/main.tf +++ b/modules/ManagementGovernance/Monitoring.RDS/main.tf @@ -1,17 +1,17 @@ -resource aws_cloudwatch_metric_alarm rds-cpu { - alarm_name = "RDS:CpuUtilization:${var.rds-instance-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "1" - metric_name = "CPUUtilization" - period = "3600" - statistic = "Average" - threshold = var.threshold-CpuUtilization - alarm_description = "RDS:CpuUtilization" - namespace = "AWS/RDS" +resource "aws_cloudwatch_metric_alarm" "rds-cpu" { + alarm_name = "${var.cw-alarm-prefix}:RDS:CpuUtilization:${var.rds-instance-name}" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "1" + metric_name = "CPUUtilization" + period = "3600" + statistic = "Average" + threshold = var.threshold-CpuUtilization + alarm_description = "RDS:CpuUtilization" + namespace = "AWS/RDS" insufficient_data_actions = [] - actions_enabled = "true" - alarm_actions = [var.alarm-actions-urgent] - ok_actions = [var.alarm-actions-urgent] + actions_enabled = var.actions-enabled + alarm_actions = [var.alarm-actions-urgent] + ok_actions = [var.alarm-actions-urgent] dimensions = { DBInstanceIdentifier = var.rds-instance-name } @@ -21,20 +21,20 @@ resource aws_cloudwatch_metric_alarm rds-cpu { } } -resource aws_cloudwatch_metric_alarm rds-storage { - alarm_name = "RDS:FreeStorageSpace:${var.rds-instance-name}" - comparison_operator = "LessThanThreshold" - evaluation_periods = "1" - metric_name = "FreeStorageSpace" - period = "3600" - statistic = "Average" - threshold = var.threshold-FreeStorageSpace - alarm_description = "RDS:FreeStorageSpace" - namespace = "AWS/RDS" +resource "aws_cloudwatch_metric_alarm" "rds-storage" { + alarm_name = "${var.cw-alarm-prefix}:RDS:FreeStorageSpace:${var.rds-instance-name}" + comparison_operator = "LessThanThreshold" + evaluation_periods = "1" + metric_name = "FreeStorageSpace" + period = "3600" + statistic = "Average" + 
threshold = var.threshold-FreeStorageSpace + alarm_description = "RDS:FreeStorageSpace" + namespace = "AWS/RDS" insufficient_data_actions = [] - actions_enabled = "true" - alarm_actions = [var.alarm-actions-urgent] - ok_actions = [var.alarm-actions-urgent] + actions_enabled = var.actions-enabled + alarm_actions = [var.alarm-actions-urgent] + ok_actions = [var.alarm-actions-urgent] dimensions = { DBInstanceIdentifier = var.rds-instance-name } @@ -44,20 +44,20 @@ resource aws_cloudwatch_metric_alarm rds-storage { } } -resource aws_cloudwatch_metric_alarm rds-memory { - alarm_name = "RDS:FreeableMemory:${var.rds-instance-name}" - comparison_operator = "LessThanThreshold" - evaluation_periods = "1" - metric_name = "FreeableMemory" - period = "3600" - statistic = "Average" - threshold = var.threshold-FreeableMemory - alarm_description = "RDS:FreeableMemory" - namespace = "AWS/RDS" +resource "aws_cloudwatch_metric_alarm" "rds-memory" { + alarm_name = "${var.cw-alarm-prefix}:RDS:FreeableMemory:${var.rds-instance-name}" + comparison_operator = "LessThanThreshold" + evaluation_periods = "1" + metric_name = "FreeableMemory" + period = "3600" + statistic = "Average" + threshold = var.threshold-FreeableMemory + alarm_description = "RDS:FreeableMemory" + namespace = "AWS/RDS" insufficient_data_actions = [] - actions_enabled = "true" - alarm_actions = [var.alarm-actions-urgent] - ok_actions = [var.alarm-actions-urgent] + actions_enabled = var.actions-enabled + alarm_actions = [var.alarm-actions-urgent] + ok_actions = [var.alarm-actions-urgent] dimensions = { DBInstanceIdentifier = var.rds-instance-name } @@ -67,20 +67,20 @@ resource aws_cloudwatch_metric_alarm rds-memory { } } -resource aws_cloudwatch_metric_alarm rds-DiskQueueDepth { - alarm_name = "RDS:DiskQueueDepth:${var.rds-instance-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "1" - metric_name = "DiskQueueDepth" - period = "300" - statistic = "Average" - threshold = 
var.threshold-DiskQueueDepth - alarm_description = "RDS:DiskQueueDepth" - namespace = "AWS/RDS" +resource "aws_cloudwatch_metric_alarm" "rds-DiskQueueDepth" { + alarm_name = "${var.cw-alarm-prefix}:RDS:DiskQueueDepth:${var.rds-instance-name}" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "1" + metric_name = "DiskQueueDepth" + period = "300" + statistic = "Average" + threshold = var.threshold-DiskQueueDepth + alarm_description = "RDS:DiskQueueDepth" + namespace = "AWS/RDS" insufficient_data_actions = [] - actions_enabled = "true" - alarm_actions = [var.alarm-actions-urgent] - ok_actions = [var.alarm-actions-urgent] + actions_enabled = var.actions-enabled + alarm_actions = [var.alarm-actions-urgent] + ok_actions = [var.alarm-actions-urgent] dimensions = { DBInstanceIdentifier = var.rds-instance-name } @@ -90,20 +90,20 @@ resource aws_cloudwatch_metric_alarm rds-DiskQueueDepth { } } -resource aws_cloudwatch_metric_alarm rds-ReadLatency { - alarm_name = "RDS:ReadLatency:${var.rds-instance-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" - metric_name = "ReadLatency" - period = "900" - statistic = "Average" - threshold = var.threshold-ReadLatency - alarm_description = "RDS:ReadLatency" - namespace = "AWS/RDS" +resource "aws_cloudwatch_metric_alarm" "rds-ReadLatency" { + alarm_name = "${var.cw-alarm-prefix}:RDS:ReadLatency:${var.rds-instance-name}" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "2" + metric_name = "ReadLatency" + period = "900" + statistic = "Average" + threshold = var.threshold-ReadLatency + alarm_description = "RDS:ReadLatency" + namespace = "AWS/RDS" insufficient_data_actions = [] - actions_enabled = "true" - alarm_actions = [var.alarm-actions-urgent] - ok_actions = [var.alarm-actions-urgent] + actions_enabled = var.actions-enabled + alarm_actions = [var.alarm-actions-urgent] + ok_actions = [var.alarm-actions-urgent] dimensions = { DBInstanceIdentifier = 
var.rds-instance-name } @@ -113,20 +113,20 @@ resource aws_cloudwatch_metric_alarm rds-ReadLatency { } } -resource aws_cloudwatch_metric_alarm rds-WriteLatency { - alarm_name = "RDS:WriteLatency:${var.rds-instance-name}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "2" - metric_name = "WriteLatency" - period = "900" - statistic = "Average" - threshold = var.threshold-WriteLatency - alarm_description = "RDS:WriteLatency" - namespace = "AWS/RDS" +resource "aws_cloudwatch_metric_alarm" "rds-WriteLatency" { + alarm_name = "${var.cw-alarm-prefix}:RDS:WriteLatency:${var.rds-instance-name}" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "2" + metric_name = "WriteLatency" + period = "900" + statistic = "Average" + threshold = var.threshold-WriteLatency + alarm_description = "RDS:WriteLatency" + namespace = "AWS/RDS" insufficient_data_actions = [] - actions_enabled = "true" - alarm_actions = [var.alarm-actions-urgent] - ok_actions = [var.alarm-actions-urgent] + actions_enabled = var.actions-enabled + alarm_actions = [var.alarm-actions-urgent] + ok_actions = [var.alarm-actions-urgent] dimensions = { DBInstanceIdentifier = var.rds-instance-name } diff --git a/modules/ManagementGovernance/Monitoring.RDS/variables.tf b/modules/ManagementGovernance/Monitoring.RDS/variables.tf index fb690cd..a919c48 100644 --- a/modules/ManagementGovernance/Monitoring.RDS/variables.tf +++ b/modules/ManagementGovernance/Monitoring.RDS/variables.tf @@ -1,3 +1,5 @@ +variable cw-alarm-prefix {} +variable actions-enabled {} variable rds-instance-name {} variable alarm-actions-urgent { type = string diff --git a/modules/ManagementGovernance/Monitoring.Redis/main.tf b/modules/ManagementGovernance/Monitoring.Redis/main.tf index 80747e7..ab0bbcf 100644 --- a/modules/ManagementGovernance/Monitoring.Redis/main.tf +++ b/modules/ManagementGovernance/Monitoring.Redis/main.tf @@ -1,17 +1,17 @@ -resource aws_cloudwatch_metric_alarm redis-EngineCPUUtilization { - 
alarm_name = "Redis:EngineCPUUtilization:${var.redis-cluster-id}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "1" - metric_name = "EngineCPUUtilization" - period = "3600" - statistic = "Average" - threshold = var.threshold-EngineCPUUtilization - alarm_description = "Redis:EngineCPUUtilization" - namespace = "AWS/ElastiCache" +resource "aws_cloudwatch_metric_alarm" "redis-EngineCPUUtilization" { + alarm_name = "${var.cw-alarm-prefix}:Redis:EngineCPUUtilization:${var.redis-cluster-id}" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "1" + metric_name = "EngineCPUUtilization" + period = "3600" + statistic = "Average" + threshold = var.threshold-EngineCPUUtilization + alarm_description = "Redis:EngineCPUUtilization" + namespace = "AWS/ElastiCache" insufficient_data_actions = [] - actions_enabled = "true" - alarm_actions = [var.alarm-actions-urgent] - ok_actions = [var.alarm-actions-urgent] + actions_enabled = var.actions-enabled + alarm_actions = [var.alarm-actions-urgent] + ok_actions = [var.alarm-actions-urgent] dimensions = { CacheClusterId = var.redis-cluster-id } @@ -21,20 +21,20 @@ resource aws_cloudwatch_metric_alarm redis-EngineCPUUtilization { } } -resource aws_cloudwatch_metric_alarm redis-DatabaseMemoryUsagePercentage { - alarm_name = "Redis:DatabaseMemoryUsagePercentage:${var.redis-cluster-id}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "1" - metric_name = "DatabaseMemoryUsagePercentage" - period = "3600" - statistic = "Average" - threshold = var.threshold-DatabaseMemoryUsagePercentage - alarm_description = "Redis:DatabaseMemoryUsagePercentage" - namespace = "AWS/ElastiCache" +resource "aws_cloudwatch_metric_alarm" "redis-DatabaseMemoryUsagePercentage" { + alarm_name = "${var.cw-alarm-prefix}:Redis:DatabaseMemoryUsagePercentage:${var.redis-cluster-id}" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "1" + metric_name = "DatabaseMemoryUsagePercentage" + period = 
"3600" + statistic = "Average" + threshold = var.threshold-DatabaseMemoryUsagePercentage + alarm_description = "Redis:DatabaseMemoryUsagePercentage" + namespace = "AWS/ElastiCache" insufficient_data_actions = [] - actions_enabled = "true" - alarm_actions = [var.alarm-actions-urgent] - ok_actions = [var.alarm-actions-urgent] + actions_enabled = var.actions-enabled + alarm_actions = [var.alarm-actions-urgent] + ok_actions = [var.alarm-actions-urgent] dimensions = { CacheClusterId = var.redis-cluster-id } @@ -44,26 +44,20 @@ resource aws_cloudwatch_metric_alarm redis-DatabaseMemoryUsagePercentage { } } -/* -data aws_elasticache_cluster redis-cluster { - cluster_id = var.redis-cluster-id -} -*/ -resource aws_cloudwatch_metric_alarm redis-CacheHitRate { - # for_each = toset(data.aws_elasticache_cluster.redis-cluster.cache_nodes.*.id) - alarm_name = "Redis:CacheHitRate:${var.redis-cluster-id}" - comparison_operator = "LessThanThreshold" - evaluation_periods = "4" - metric_name = "CacheHitRate" - period = "900" - statistic = "Average" - threshold = var.threshold-CacheHitRate - alarm_description = "Redis:CacheHitRate" - namespace = "AWS/ElastiCache" +resource "aws_cloudwatch_metric_alarm" "redis-CacheHitRate" { + alarm_name = "${var.cw-alarm-prefix}:Redis:CacheHitRate:${var.redis-cluster-id}" + comparison_operator = "LessThanThreshold" + evaluation_periods = "4" + metric_name = "CacheHitRate" + period = "900" + statistic = "Average" + threshold = var.threshold-CacheHitRate + alarm_description = "Redis:CacheHitRate" + namespace = "AWS/ElastiCache" insufficient_data_actions = [] - actions_enabled = "true" - alarm_actions = [var.alarm-actions-standard] - ok_actions = [var.alarm-actions-standard] + actions_enabled = var.actions-enabled + alarm_actions = [var.alarm-actions-standard] + ok_actions = [var.alarm-actions-standard] dimensions = { CacheClusterId = var.redis-cluster-id # CacheNodeId = each.value diff --git a/modules/ManagementGovernance/Monitoring.Redis/variables.tf 
b/modules/ManagementGovernance/Monitoring.Redis/variables.tf index ad2d9b5..13e69d2 100644 --- a/modules/ManagementGovernance/Monitoring.Redis/variables.tf +++ b/modules/ManagementGovernance/Monitoring.Redis/variables.tf @@ -1,22 +1,24 @@ -variable redis-cluster-id {} -variable alarm-actions-urgent { - type = string +variable "cw-alarm-prefix" {} +variable "actions-enabled" {} +variable "redis-cluster-id" {} +variable "alarm-actions-urgent" { + type = string default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-urgent" } -variable alarm-actions-emergency { - type = string +variable "alarm-actions-emergency" { + type = string default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-emergency" } -variable alarm-actions-standard { - type = string +variable "alarm-actions-standard" { + type = string default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-standard" } -variable alarm-actions-general { - type = string +variable "alarm-actions-general" { + type = string default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support" } -variable default-tags {} +variable "default-tags" {} -variable threshold-EngineCPUUtilization {} -variable threshold-DatabaseMemoryUsagePercentage {} -variable threshold-CacheHitRate {} \ No newline at end of file +variable "threshold-EngineCPUUtilization" {} +variable "threshold-DatabaseMemoryUsagePercentage" {} +variable "threshold-CacheHitRate" {} \ No newline at end of file diff --git a/modules/util/resource-list/list-alb.sh b/modules/util/resource-list/list-alb.sh index a6f5920..73dd90b 100755 --- a/modules/util/resource-list/list-alb.sh +++ b/modules/util/resource-list/list-alb.sh @@ -1,3 +1,4 @@ #!/bin/bash RESULTS=$(aws elbv2 describe-load-balancers --query 'LoadBalancers[?Type==`application`].LoadBalancerArn' --output text --no-cli-pager | sed 's/\t/\n/g' | sort | xargs) jq -n --arg result "$RESULTS" '{"result":$result}' + diff --git a/modules/util/resource-list/list-asg.sh 
b/modules/util/resource-list/list-asg.sh new file mode 100755 index 0000000..9026862 --- /dev/null +++ b/modules/util/resource-list/list-asg.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Emit the sorted Auto Scaling group names as JSON: {"result": "<space-separated names>"} +RESULTS=$(aws autoscaling describe-auto-scaling-groups --query 'AutoScalingGroups[*].AutoScalingGroupName' --output text --no-cli-pager | sed 's/\t/\n/g' | sort | xargs) +jq -n --arg result "$RESULTS" '{"result":$result}' + + diff --git a/modules/util/resource-list/list-kafka.sh b/modules/util/resource-list/list-kafka.sh new file mode 100755 index 0000000..5e92761 --- /dev/null +++ b/modules/util/resource-list/list-kafka.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Emit the sorted Kafka cluster names as JSON: {"result": "<space-separated names>"} +RESULTS=$(aws kafka list-clusters --query 'ClusterInfoList[*].ClusterName' --output text --no-cli-pager | sed 's/\t/\n/g' | sort | xargs) +jq -n --arg result "$RESULTS" '{"result":$result}' + + diff --git a/modules/util/resource-list/list-opensearch.sh b/modules/util/resource-list/list-opensearch.sh new file mode 100755 index 0000000..65882c1 --- /dev/null +++ b/modules/util/resource-list/list-opensearch.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Emit the sorted OpenSearch domain names as JSON: {"result": "<space-separated names>"} +RESULTS=$(aws opensearch list-domain-names --query 'DomainNames[*].DomainName' --output text --no-cli-pager | sed 's/\t/\n/g' | sort | xargs) +jq -n --arg result "$RESULTS" '{"result":$result}' + +