UPD: Added more monitoring modules and various enhancements

This commit is contained in:
KF 2022-10-26 11:13:56 +08:00
parent 2af0ff1b1a
commit b3ba6f2441
Signed by: xpk
GPG Key ID: CD4FF6793F09AB86
28 changed files with 443 additions and 167 deletions

View File

@ -1,13 +1,13 @@
data external alb-targetgroups { data "external" "alb-targetgroups" {
program = ["bash", "../../modules/ManagementGovernance/Monitoring.ALB/list-alb-targetgroups.sh"] program = ["bash", "../../modules/ManagementGovernance/Monitoring.ALB/list-alb-targetgroups.sh"]
query = { query = {
lb = var.load-balancer lb = var.load-balancer
} }
} }
resource aws_cloudwatch_metric_alarm alb-HealthyHostCount { resource "aws_cloudwatch_metric_alarm" "alb-HealthyHostCount" {
for_each = toset(split(" ", data.external.alb-targetgroups.result.result)) for_each = toset(split(" ", data.external.alb-targetgroups.result.result))
alarm_name = "ALBTG:HealthyHostCount:${split("/", each.value)[1]}/${split("/", each.value)[2]}" alarm_name = "${var.cw-alarm-prefix}:ALBTG:HealthyHostCount:${split("/", each.value)[1]}/${split("/", each.value)[2]}"
comparison_operator = "LessThanThreshold" comparison_operator = "LessThanThreshold"
evaluation_periods = "1" evaluation_periods = "1"
metric_name = "HealthyHostCount" metric_name = "HealthyHostCount"
@ -17,12 +17,12 @@ resource aws_cloudwatch_metric_alarm alb-HealthyHostCount {
alarm_description = "ALBTG:HealthyHostCount" alarm_description = "ALBTG:HealthyHostCount"
namespace = "AWS/ApplicationELB" namespace = "AWS/ApplicationELB"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = "true" actions_enabled = var.actions-enabled
alarm_actions = [var.alarm-actions-emergency] alarm_actions = [var.alarm-actions-emergency]
ok_actions = [var.alarm-actions-emergency] ok_actions = [var.alarm-actions-emergency]
dimensions = { dimensions = {
TargetGroup = "targetgroup/${split("/", each.value)[1]}/${split("/", each.value)[2]}" TargetGroup = "targetgroup/${split("/", each.value)[1]}/${split("/", each.value)[2]}"
LoadBalancer = "app/${split("/",var.load-balancer)[2]}/${split("/",var.load-balancer)[3]}" LoadBalancer = "app/${split("/", var.load-balancer)[2]}/${split("/", var.load-balancer)[3]}"
} }
tags = var.default-tags tags = var.default-tags
lifecycle { lifecycle {

View File

@ -1,4 +1,5 @@
# variable target-group {} variable cw-alarm-prefix {}
variable actions-enabled {}
variable load-balancer {} variable load-balancer {}
variable threshold-HealthHostCountMin {} variable threshold-HealthHostCountMin {}
variable alarm-actions-urgent { variable alarm-actions-urgent {

View File

@ -0,0 +1,5 @@
# Monitoring module for BEA
This module deploys the default CloudWatch metric monitoring.
## Notes
The Terraform `lifecycle` block ignores changes to tags to speed up subsequent Terraform updates. CloudWatch alarm tags cannot be viewed in the AWS console anyway.

View File

@ -0,0 +1,22 @@
# CloudWatch alarm on the average CPU utilisation of one Auto Scaling group.
# Goes into ALARM when the 1800-second average of CPUUtilization is above
# var.threshold-CPUUtilization for 3 consecutive periods. The same action
# (var.alarm-actions-urgent) is invoked on both ALARM and OK transitions.
resource "aws_cloudwatch_metric_alarm" "asg-CPUUtilization" {
  alarm_name        = "${var.cw-alarm-prefix}:ASG:CPUUtilization:${var.asg-name}"
  alarm_description = "ASG:CPUUtilization"

  # Metric being watched.
  namespace   = "AWS/EC2"
  metric_name = "CPUUtilization"
  dimensions = {
    AutoScalingGroupName = var.asg-name
  }

  # Evaluation rule.
  statistic           = "Average"
  period              = "1800"
  evaluation_periods  = "3"
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-CPUUtilization

  # Notifications; nothing is sent on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.alarm-actions-urgent]
  ok_actions                = [var.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Tags are applied at creation only; later tag drift is not reconciled.
    ignore_changes = [tags]
  }
}

View File

@ -0,0 +1,9 @@
# Version constraints for this module.
# "~> 1.3.0" allows Terraform 1.3.x only; "~> 4.36.1" allows AWS provider
# releases >= 4.36.1 and < 4.37.0.
terraform {
  required_version = "~> 1.3.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 4.36.1"
    }
  }
}

View File

@ -0,0 +1,22 @@
# Input variables for the ASG monitoring module.
# Block labels are quoted to match the convention the rest of the repository
# is being migrated to.

variable "cw-alarm-prefix" {
  description = "Prefix placed at the start of every alarm name created by this module."
}

variable "actions-enabled" {
  description = "Passed to actions_enabled on each alarm; controls whether alarm actions are executed."
}

variable "asg-name" {
  description = "Auto Scaling group name; used as the AutoScalingGroupName dimension and as the alarm-name suffix."
}

# NOTE(review): the four defaults below embed an SNS topic ARN for one specific
# account and region (ap-east-1); override them when deploying elsewhere.
variable "alarm-actions-urgent" {
  description = "Action ARN for urgent-severity alarms."
  type        = string
  default     = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-urgent"
}

variable "alarm-actions-emergency" {
  description = "Action ARN for emergency-severity alarms."
  type        = string
  default     = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-emergency"
}

variable "alarm-actions-standard" {
  description = "Action ARN for standard-severity alarms."
  type        = string
  default     = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-standard"
}

variable "alarm-actions-general" {
  description = "Action ARN for general-severity alarms."
  type        = string
  default     = "arn:aws:sns:ap-east-1:843733946244:rackspace-support"
}

variable "default-tags" {
  description = "Tags assigned to the alarms."
}

variable "threshold-CPUUtilization" {
  description = "Value the average CPUUtilization must exceed for the alarm to fire."
}

View File

@ -1,5 +1,5 @@
resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" { resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
alarm_name = "EC2:StatusCheckFailed_System:${var.ec2-instance-id}" alarm_name = "${var.cw-alarm-prefix}:EC2:StatusCheckFailed_System:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold" comparison_operator = "GreaterThanThreshold"
evaluation_periods = "1" evaluation_periods = "1"
metric_name = "StatusCheckFailed_System" metric_name = "StatusCheckFailed_System"
@ -9,7 +9,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
alarm_description = "EC2:StatusCheckFailed_System" alarm_description = "EC2:StatusCheckFailed_System"
namespace = "AWS/EC2" namespace = "AWS/EC2"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = "true" actions_enabled = var.actions-enabled
alarm_actions = [var.alarm-actions-emergency] alarm_actions = [var.alarm-actions-emergency]
ok_actions = [var.alarm-actions-emergency] ok_actions = [var.alarm-actions-emergency]
dimensions = { dimensions = {
@ -22,7 +22,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
} }
resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" { resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" {
alarm_name = "EC2:StatusCheckFailed_Instance:${var.ec2-instance-id}" alarm_name = "${var.cw-alarm-prefix}:EC2:StatusCheckFailed_Instance:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold" comparison_operator = "GreaterThanThreshold"
evaluation_periods = "1" evaluation_periods = "1"
metric_name = "StatusCheckFailed_Instance" metric_name = "StatusCheckFailed_Instance"
@ -32,7 +32,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" {
alarm_description = "EC2:StatusCheckFailed_Instance" alarm_description = "EC2:StatusCheckFailed_Instance"
namespace = "AWS/EC2" namespace = "AWS/EC2"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = "true" actions_enabled = var.actions-enabled
alarm_actions = [var.alarm-actions-emergency] alarm_actions = [var.alarm-actions-emergency]
ok_actions = [var.alarm-actions-emergency] ok_actions = [var.alarm-actions-emergency]
dimensions = { dimensions = {
@ -45,7 +45,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" {
} }
resource "aws_cloudwatch_metric_alarm" "ec2-CPUUtilization" { resource "aws_cloudwatch_metric_alarm" "ec2-CPUUtilization" {
alarm_name = "EC2:CPUUtilization:${var.ec2-instance-id}" alarm_name = "${var.cw-alarm-prefix}:EC2:CPUUtilization:${var.ec2-instance-id}"
comparison_operator = "GreaterThanThreshold" comparison_operator = "GreaterThanThreshold"
evaluation_periods = "6" evaluation_periods = "6"
metric_name = "CPUUtilization" metric_name = "CPUUtilization"
@ -55,7 +55,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-CPUUtilization" {
alarm_description = "EC2:CPUUtilization" alarm_description = "EC2:CPUUtilization"
namespace = "AWS/EC2" namespace = "AWS/EC2"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = "true" actions_enabled = var.actions-enabled
alarm_actions = [var.alarm-actions-urgent] alarm_actions = [var.alarm-actions-urgent]
ok_actions = [var.alarm-actions-urgent] ok_actions = [var.alarm-actions-urgent]
treat_missing_data = "notBreaching" treat_missing_data = "notBreaching"

View File

@ -1,3 +1,5 @@
variable cw-alarm-prefix {}
variable actions-enabled {}
variable ec2-instance-id {} variable ec2-instance-id {}
variable alarm-actions-urgent { variable alarm-actions-urgent {
type = string type = string

View File

@ -1,5 +1,5 @@
resource "aws_cloudwatch_metric_alarm" "emr-AppsPending" { resource "aws_cloudwatch_metric_alarm" "emr-AppsPending" {
alarm_name = "EMR:AppsPending:${var.job-flow-id}" alarm_name = "${var.cw-alarm-prefix}:EMR:AppsPending:${var.job-flow-id}"
comparison_operator = "GreaterThanThreshold" comparison_operator = "GreaterThanThreshold"
evaluation_periods = "1" evaluation_periods = "1"
metric_name = "AppsPending" metric_name = "AppsPending"
@ -9,7 +9,7 @@ resource "aws_cloudwatch_metric_alarm" "emr-AppsPending" {
alarm_description = "EMR:AppsPending" alarm_description = "EMR:AppsPending"
namespace = "AWS/ElasticMapReduce" namespace = "AWS/ElasticMapReduce"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = "true" actions_enabled = var.actions-enabled
alarm_actions = [var.alarm-actions-standard] alarm_actions = [var.alarm-actions-standard]
ok_actions = [var.alarm-actions-standard] ok_actions = [var.alarm-actions-standard]
dimensions = { dimensions = {
@ -22,7 +22,7 @@ resource "aws_cloudwatch_metric_alarm" "emr-AppsPending" {
} }
resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" { resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" {
alarm_name = "EMR:CapacityRemainingGB:${var.job-flow-id}" alarm_name = "${var.cw-alarm-prefix}:EMR:CapacityRemainingGB:${var.job-flow-id}"
comparison_operator = "LessThanThreshold" comparison_operator = "LessThanThreshold"
evaluation_periods = "1" evaluation_periods = "1"
metric_name = "CapacityRemainingGB" metric_name = "CapacityRemainingGB"
@ -32,7 +32,7 @@ resource "aws_cloudwatch_metric_alarm" "emr-CapacityRemainingGB" {
alarm_description = "EMR:CapacityRemainingGB" alarm_description = "EMR:CapacityRemainingGB"
namespace = "AWS/ElasticMapReduce" namespace = "AWS/ElasticMapReduce"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = "true" actions_enabled = var.actions-enabled
alarm_actions = [var.alarm-actions-urgent] alarm_actions = [var.alarm-actions-urgent]
ok_actions = [var.alarm-actions-urgent] ok_actions = [var.alarm-actions-urgent]
dimensions = { dimensions = {

View File

@ -1,3 +1,5 @@
variable cw-alarm-prefix {}
variable actions-enabled {}
variable job-flow-id {} variable job-flow-id {}
variable threshold-AppsPending {} variable threshold-AppsPending {}
variable threshold-CapacityRemainingGB {} variable threshold-CapacityRemainingGB {}

View File

@ -0,0 +1,5 @@
# Monitoring module for BEA
This module deploys the default CloudWatch metric monitoring.
## Notes
The Terraform `lifecycle` block ignores changes to tags to speed up subsequent Terraform updates. CloudWatch alarm tags cannot be viewed in the AWS console anyway.

View File

@ -0,0 +1,22 @@
# CloudWatch alarm on ZooKeeper request latency for one Kafka cluster.
# Goes into ALARM when the 1800-second average of
# ZooKeeperRequestLatencyMsMean is above
# var.threshold-ZooKeeperRequestLatencyMsMean for 3 consecutive periods.
# The same action (var.alarm-actions-urgent) is invoked on ALARM and OK.
resource "aws_cloudwatch_metric_alarm" "Kafka-ZooKeeperRequestLatencyMsMean" {
  alarm_name        = "${var.cw-alarm-prefix}:Kafka:ZooKeeperRequestLatencyMsMean:${var.cluster-name}"
  alarm_description = "Kafka:ZooKeeperRequestLatencyMsMean"

  # Metric being watched. The dimension key contains a space, so it is quoted.
  namespace   = "AWS/Kafka"
  metric_name = "ZooKeeperRequestLatencyMsMean"
  dimensions = {
    "Cluster Name" = var.cluster-name
  }

  # Evaluation rule.
  statistic           = "Average"
  period              = "1800"
  evaluation_periods  = "3"
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-ZooKeeperRequestLatencyMsMean

  # Notifications; nothing is sent on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.alarm-actions-urgent]
  ok_actions                = [var.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Tags are applied at creation only; later tag drift is not reconciled.
    ignore_changes = [tags]
  }
}

View File

@ -0,0 +1,9 @@
# Version constraints for this module.
# "~> 1.3.0" allows Terraform 1.3.x only; "~> 4.36.1" allows AWS provider
# releases >= 4.36.1 and < 4.37.0.
terraform {
  required_version = "~> 1.3.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 4.36.1"
    }
  }
}

View File

@ -0,0 +1,22 @@
# Input variables for the Kafka monitoring module.
# Block labels are quoted to match the convention the rest of the repository
# is being migrated to.

variable "cw-alarm-prefix" {
  description = "Prefix placed at the start of every alarm name created by this module."
}

variable "actions-enabled" {
  description = "Passed to actions_enabled on each alarm; controls whether alarm actions are executed."
}

variable "cluster-name" {
  description = "Kafka cluster name; used as the Cluster Name dimension and as the alarm-name suffix."
}

# NOTE(review): the four defaults below embed an SNS topic ARN for one specific
# account and region (ap-east-1); override them when deploying elsewhere.
variable "alarm-actions-urgent" {
  description = "Action ARN for urgent-severity alarms."
  type        = string
  default     = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-urgent"
}

variable "alarm-actions-emergency" {
  description = "Action ARN for emergency-severity alarms."
  type        = string
  default     = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-emergency"
}

variable "alarm-actions-standard" {
  description = "Action ARN for standard-severity alarms."
  type        = string
  default     = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-standard"
}

variable "alarm-actions-general" {
  description = "Action ARN for general-severity alarms."
  type        = string
  default     = "arn:aws:sns:ap-east-1:843733946244:rackspace-support"
}

variable "default-tags" {
  description = "Tags assigned to the alarms."
}

variable "threshold-ZooKeeperRequestLatencyMsMean" {
  description = "Value the average ZooKeeperRequestLatencyMsMean must exceed for the alarm to fire."
}

View File

@ -16,7 +16,7 @@ module "nlb-targetgroups" {
resource "aws_cloudwatch_metric_alarm" "nlb-HealthyHostCount" { resource "aws_cloudwatch_metric_alarm" "nlb-HealthyHostCount" {
for_each = module.nlb-targetgroups.result-set for_each = module.nlb-targetgroups.result-set
alarm_name = "NLBTG:HealthyHostCount:${split(":", each.value)[5]}" alarm_name = "${var.cw-alarm-prefix}:NLBTG:HealthyHostCount:${split(":", each.value)[5]}"
comparison_operator = "LessThanThreshold" comparison_operator = "LessThanThreshold"
evaluation_periods = "1" evaluation_periods = "1"
metric_name = "HealthyHostCount" metric_name = "HealthyHostCount"
@ -26,7 +26,7 @@ resource "aws_cloudwatch_metric_alarm" "nlb-HealthyHostCount" {
alarm_description = "NLBTG:HealthyHostCount" alarm_description = "NLBTG:HealthyHostCount"
namespace = "AWS/NetworkELB" namespace = "AWS/NetworkELB"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = "true" actions_enabled = var.actions-enabled
alarm_actions = [var.alarm-actions-emergency] alarm_actions = [var.alarm-actions-emergency]
ok_actions = [var.alarm-actions-emergency] ok_actions = [var.alarm-actions-emergency]
dimensions = { dimensions = {

View File

@ -1,4 +1,5 @@
# variable target-group {} variable cw-alarm-prefix {}
variable actions-enabled {}
variable load-balancer {} variable load-balancer {}
variable threshold-HealthHostCountMin {} variable threshold-HealthHostCountMin {}
variable alarm-actions-urgent { variable alarm-actions-urgent {

View File

@ -0,0 +1,5 @@
# Monitoring module for BEA
This module deploys the default CloudWatch metric monitoring.
## Notes
The Terraform `lifecycle` block ignores changes to tags to speed up subsequent Terraform updates. CloudWatch alarm tags cannot be viewed in the AWS console anyway.

View File

@ -0,0 +1,98 @@
# Identity of the credentials Terraform runs with; the alarms in this file read
# it to populate the ClientId dimension of the AWS/ES metrics.
data "aws_caller_identity" "this" {}
# CloudWatch alarm on the average CPU utilisation of one ES domain.
# Goes into ALARM when the 1800-second average of CPUUtilization is above
# var.threshold-CPUUtilization for 3 consecutive periods. The same action
# (var.alarm-actions-urgent) is invoked on both ALARM and OK transitions.
resource "aws_cloudwatch_metric_alarm" "ES-CPUUtilization" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:CPUUtilization:${var.domain-name}"
  alarm_description = "ES:CPUUtilization"

  # Metric being watched.
  namespace   = "AWS/ES"
  metric_name = "CPUUtilization"
  dimensions = {
    DomainName = var.domain-name
    # Use the explicit account_id attribute rather than the generic resource id.
    ClientId = data.aws_caller_identity.this.account_id
  }

  # Evaluation rule.
  statistic           = "Average"
  period              = "1800"
  evaluation_periods  = "3"
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-CPUUtilization

  # Notifications; nothing is sent on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.alarm-actions-urgent]
  ok_actions                = [var.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Tags are applied at creation only; later tag drift is not reconciled.
    ignore_changes = [tags]
  }
}
# CloudWatch alarm on the average search latency of one ES domain.
# Goes into ALARM when the 1800-second average of SearchLatency is above
# var.threshold-SearchLatency for 3 consecutive periods. The same action
# (var.alarm-actions-urgent) is invoked on both ALARM and OK transitions.
resource "aws_cloudwatch_metric_alarm" "ES-SearchLatency" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:SearchLatency:${var.domain-name}"
  alarm_description = "ES:SearchLatency"

  # Metric being watched.
  namespace   = "AWS/ES"
  metric_name = "SearchLatency"
  dimensions = {
    DomainName = var.domain-name
    # Use the explicit account_id attribute rather than the generic resource id.
    ClientId = data.aws_caller_identity.this.account_id
  }

  # Evaluation rule.
  statistic           = "Average"
  period              = "1800"
  evaluation_periods  = "3"
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-SearchLatency

  # Notifications; nothing is sent on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.alarm-actions-urgent]
  ok_actions                = [var.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Tags are applied at creation only; later tag drift is not reconciled.
    ignore_changes = [tags]
  }
}
# CloudWatch alarm on the average indexing latency of one ES domain.
# Goes into ALARM when the 1800-second average of IndexingLatency is above
# var.threshold-IndexingLatency for 3 consecutive periods. The same action
# (var.alarm-actions-urgent) is invoked on both ALARM and OK transitions.
resource "aws_cloudwatch_metric_alarm" "ES-IndexingLatency" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:IndexingLatency:${var.domain-name}"
  alarm_description = "ES:IndexingLatency"

  # Metric being watched.
  namespace   = "AWS/ES"
  metric_name = "IndexingLatency"
  dimensions = {
    DomainName = var.domain-name
    # Use the explicit account_id attribute rather than the generic resource id.
    ClientId = data.aws_caller_identity.this.account_id
  }

  # Evaluation rule.
  statistic           = "Average"
  period              = "1800"
  evaluation_periods  = "3"
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-IndexingLatency

  # Notifications; nothing is sent on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.alarm-actions-urgent]
  ok_actions                = [var.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Tags are applied at creation only; later tag drift is not reconciled.
    ignore_changes = [tags]
  }
}
# CloudWatch alarm on the red cluster status of one ES domain.
# Goes into ALARM when the 900-second maximum of ClusterStatus.red is above 0
# for 2 consecutive periods. The same action (var.alarm-actions-urgent) is
# invoked on both ALARM and OK transitions.
resource "aws_cloudwatch_metric_alarm" "ES-ClusterStatusRed" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:ClusterStatusRed:${var.domain-name}"
  alarm_description = "At least one primary shard and its replicas aren't allocated to a node."

  # Metric being watched.
  namespace   = "AWS/ES"
  metric_name = "ClusterStatus.red"
  dimensions = {
    DomainName = var.domain-name
    # Use the explicit account_id attribute rather than the generic resource id.
    ClientId = data.aws_caller_identity.this.account_id
  }

  # Evaluation rule: fixed threshold, not configurable through a variable.
  statistic           = "Maximum"
  period              = "900"
  evaluation_periods  = "2"
  comparison_operator = "GreaterThanThreshold"
  threshold           = 0

  # Notifications; nothing is sent on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.alarm-actions-urgent]
  ok_actions                = [var.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Tags are applied at creation only; later tag drift is not reconciled.
    ignore_changes = [tags]
  }
}

View File

@ -0,0 +1,9 @@
# Version constraints for this module.
# "~> 1.3.0" allows Terraform 1.3.x only; "~> 4.36.1" allows AWS provider
# releases >= 4.36.1 and < 4.37.0.
terraform {
  required_version = "~> 1.3.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 4.36.1"
    }
  }
}

View File

@ -0,0 +1,25 @@
# Input variables for the ES (OpenSearch/Elasticsearch) monitoring module.
# Block labels are quoted to match the convention the rest of the repository
# is being migrated to.

variable "cw-alarm-prefix" {
  description = "Prefix placed at the start of every alarm name created by this module."
}

variable "actions-enabled" {
  description = "Passed to actions_enabled on each alarm; controls whether alarm actions are executed."
}

variable "domain-name" {
  description = "ES domain name; used as the DomainName dimension and as the alarm-name suffix."
}

# NOTE(review): the four defaults below embed an SNS topic ARN for one specific
# account and region (ap-east-1); override them when deploying elsewhere.
variable "alarm-actions-urgent" {
  description = "Action ARN for urgent-severity alarms."
  type        = string
  default     = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-urgent"
}

variable "alarm-actions-emergency" {
  description = "Action ARN for emergency-severity alarms."
  type        = string
  default     = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-emergency"
}

variable "alarm-actions-standard" {
  description = "Action ARN for standard-severity alarms."
  type        = string
  default     = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-standard"
}

variable "alarm-actions-general" {
  description = "Action ARN for general-severity alarms."
  type        = string
  default     = "arn:aws:sns:ap-east-1:843733946244:rackspace-support"
}

variable "default-tags" {
  description = "Tags assigned to the alarms."
}

variable "threshold-CPUUtilization" {
  description = "Value the average CPUUtilization must exceed for the alarm to fire."
}

variable "threshold-SearchLatency" {
  description = "Value the average SearchLatency must exceed for the alarm to fire."
}

variable "threshold-IndexingLatency" {
  description = "Value the average IndexingLatency must exceed for the alarm to fire."
}

# variable threshold-KibanaHealthyNodes {}

View File

@ -1,5 +1,5 @@
resource aws_cloudwatch_metric_alarm rds-cpu { resource "aws_cloudwatch_metric_alarm" "rds-cpu" {
alarm_name = "RDS:CpuUtilization:${var.rds-instance-name}" alarm_name = "${var.cw-alarm-prefix}:RDS:CpuUtilization:${var.rds-instance-name}"
comparison_operator = "GreaterThanThreshold" comparison_operator = "GreaterThanThreshold"
evaluation_periods = "1" evaluation_periods = "1"
metric_name = "CPUUtilization" metric_name = "CPUUtilization"
@ -9,7 +9,7 @@ resource aws_cloudwatch_metric_alarm rds-cpu {
alarm_description = "RDS:CpuUtilization" alarm_description = "RDS:CpuUtilization"
namespace = "AWS/RDS" namespace = "AWS/RDS"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = "true" actions_enabled = var.actions-enabled
alarm_actions = [var.alarm-actions-urgent] alarm_actions = [var.alarm-actions-urgent]
ok_actions = [var.alarm-actions-urgent] ok_actions = [var.alarm-actions-urgent]
dimensions = { dimensions = {
@ -21,8 +21,8 @@ resource aws_cloudwatch_metric_alarm rds-cpu {
} }
} }
resource aws_cloudwatch_metric_alarm rds-storage { resource "aws_cloudwatch_metric_alarm" "rds-storage" {
alarm_name = "RDS:FreeStorageSpace:${var.rds-instance-name}" alarm_name = "${var.cw-alarm-prefix}:RDS:FreeStorageSpace:${var.rds-instance-name}"
comparison_operator = "LessThanThreshold" comparison_operator = "LessThanThreshold"
evaluation_periods = "1" evaluation_periods = "1"
metric_name = "FreeStorageSpace" metric_name = "FreeStorageSpace"
@ -32,7 +32,7 @@ resource aws_cloudwatch_metric_alarm rds-storage {
alarm_description = "RDS:FreeStorageSpace" alarm_description = "RDS:FreeStorageSpace"
namespace = "AWS/RDS" namespace = "AWS/RDS"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = "true" actions_enabled = var.actions-enabled
alarm_actions = [var.alarm-actions-urgent] alarm_actions = [var.alarm-actions-urgent]
ok_actions = [var.alarm-actions-urgent] ok_actions = [var.alarm-actions-urgent]
dimensions = { dimensions = {
@ -44,8 +44,8 @@ resource aws_cloudwatch_metric_alarm rds-storage {
} }
} }
resource aws_cloudwatch_metric_alarm rds-memory { resource "aws_cloudwatch_metric_alarm" "rds-memory" {
alarm_name = "RDS:FreeableMemory:${var.rds-instance-name}" alarm_name = "${var.cw-alarm-prefix}:RDS:FreeableMemory:${var.rds-instance-name}"
comparison_operator = "LessThanThreshold" comparison_operator = "LessThanThreshold"
evaluation_periods = "1" evaluation_periods = "1"
metric_name = "FreeableMemory" metric_name = "FreeableMemory"
@ -55,7 +55,7 @@ resource aws_cloudwatch_metric_alarm rds-memory {
alarm_description = "RDS:FreeableMemory" alarm_description = "RDS:FreeableMemory"
namespace = "AWS/RDS" namespace = "AWS/RDS"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = "true" actions_enabled = var.actions-enabled
alarm_actions = [var.alarm-actions-urgent] alarm_actions = [var.alarm-actions-urgent]
ok_actions = [var.alarm-actions-urgent] ok_actions = [var.alarm-actions-urgent]
dimensions = { dimensions = {
@ -67,8 +67,8 @@ resource aws_cloudwatch_metric_alarm rds-memory {
} }
} }
resource aws_cloudwatch_metric_alarm rds-DiskQueueDepth { resource "aws_cloudwatch_metric_alarm" "rds-DiskQueueDepth" {
alarm_name = "RDS:DiskQueueDepth:${var.rds-instance-name}" alarm_name = "${var.cw-alarm-prefix}:RDS:DiskQueueDepth:${var.rds-instance-name}"
comparison_operator = "GreaterThanThreshold" comparison_operator = "GreaterThanThreshold"
evaluation_periods = "1" evaluation_periods = "1"
metric_name = "DiskQueueDepth" metric_name = "DiskQueueDepth"
@ -78,7 +78,7 @@ resource aws_cloudwatch_metric_alarm rds-DiskQueueDepth {
alarm_description = "RDS:DiskQueueDepth" alarm_description = "RDS:DiskQueueDepth"
namespace = "AWS/RDS" namespace = "AWS/RDS"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = "true" actions_enabled = var.actions-enabled
alarm_actions = [var.alarm-actions-urgent] alarm_actions = [var.alarm-actions-urgent]
ok_actions = [var.alarm-actions-urgent] ok_actions = [var.alarm-actions-urgent]
dimensions = { dimensions = {
@ -90,8 +90,8 @@ resource aws_cloudwatch_metric_alarm rds-DiskQueueDepth {
} }
} }
resource aws_cloudwatch_metric_alarm rds-ReadLatency { resource "aws_cloudwatch_metric_alarm" "rds-ReadLatency" {
alarm_name = "RDS:ReadLatency:${var.rds-instance-name}" alarm_name = "${var.cw-alarm-prefix}:RDS:ReadLatency:${var.rds-instance-name}"
comparison_operator = "GreaterThanThreshold" comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2" evaluation_periods = "2"
metric_name = "ReadLatency" metric_name = "ReadLatency"
@ -101,7 +101,7 @@ resource aws_cloudwatch_metric_alarm rds-ReadLatency {
alarm_description = "RDS:ReadLatency" alarm_description = "RDS:ReadLatency"
namespace = "AWS/RDS" namespace = "AWS/RDS"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = "true" actions_enabled = var.actions-enabled
alarm_actions = [var.alarm-actions-urgent] alarm_actions = [var.alarm-actions-urgent]
ok_actions = [var.alarm-actions-urgent] ok_actions = [var.alarm-actions-urgent]
dimensions = { dimensions = {
@ -113,8 +113,8 @@ resource aws_cloudwatch_metric_alarm rds-ReadLatency {
} }
} }
resource aws_cloudwatch_metric_alarm rds-WriteLatency { resource "aws_cloudwatch_metric_alarm" "rds-WriteLatency" {
alarm_name = "RDS:WriteLatency:${var.rds-instance-name}" alarm_name = "${var.cw-alarm-prefix}:RDS:WriteLatency:${var.rds-instance-name}"
comparison_operator = "GreaterThanThreshold" comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2" evaluation_periods = "2"
metric_name = "WriteLatency" metric_name = "WriteLatency"
@ -124,7 +124,7 @@ resource aws_cloudwatch_metric_alarm rds-WriteLatency {
alarm_description = "RDS:WriteLatency" alarm_description = "RDS:WriteLatency"
namespace = "AWS/RDS" namespace = "AWS/RDS"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = "true" actions_enabled = var.actions-enabled
alarm_actions = [var.alarm-actions-urgent] alarm_actions = [var.alarm-actions-urgent]
ok_actions = [var.alarm-actions-urgent] ok_actions = [var.alarm-actions-urgent]
dimensions = { dimensions = {

View File

@ -1,3 +1,5 @@
variable cw-alarm-prefix {}
variable actions-enabled {}
variable rds-instance-name {} variable rds-instance-name {}
variable alarm-actions-urgent { variable alarm-actions-urgent {
type = string type = string

View File

@ -1,5 +1,5 @@
resource aws_cloudwatch_metric_alarm redis-EngineCPUUtilization { resource "aws_cloudwatch_metric_alarm" "redis-EngineCPUUtilization" {
alarm_name = "Redis:EngineCPUUtilization:${var.redis-cluster-id}" alarm_name = "${var.cw-alarm-prefix}:Redis:EngineCPUUtilization:${var.redis-cluster-id}"
comparison_operator = "GreaterThanThreshold" comparison_operator = "GreaterThanThreshold"
evaluation_periods = "1" evaluation_periods = "1"
metric_name = "EngineCPUUtilization" metric_name = "EngineCPUUtilization"
@ -9,7 +9,7 @@ resource aws_cloudwatch_metric_alarm redis-EngineCPUUtilization {
alarm_description = "Redis:EngineCPUUtilization" alarm_description = "Redis:EngineCPUUtilization"
namespace = "AWS/ElastiCache" namespace = "AWS/ElastiCache"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = "true" actions_enabled = var.actions-enabled
alarm_actions = [var.alarm-actions-urgent] alarm_actions = [var.alarm-actions-urgent]
ok_actions = [var.alarm-actions-urgent] ok_actions = [var.alarm-actions-urgent]
dimensions = { dimensions = {
@ -21,8 +21,8 @@ resource aws_cloudwatch_metric_alarm redis-EngineCPUUtilization {
} }
} }
resource aws_cloudwatch_metric_alarm redis-DatabaseMemoryUsagePercentage { resource "aws_cloudwatch_metric_alarm" "redis-DatabaseMemoryUsagePercentage" {
alarm_name = "Redis:DatabaseMemoryUsagePercentage:${var.redis-cluster-id}" alarm_name = "${var.cw-alarm-prefix}:Redis:DatabaseMemoryUsagePercentage:${var.redis-cluster-id}"
comparison_operator = "GreaterThanThreshold" comparison_operator = "GreaterThanThreshold"
evaluation_periods = "1" evaluation_periods = "1"
metric_name = "DatabaseMemoryUsagePercentage" metric_name = "DatabaseMemoryUsagePercentage"
@ -32,7 +32,7 @@ resource aws_cloudwatch_metric_alarm redis-DatabaseMemoryUsagePercentage {
alarm_description = "Redis:DatabaseMemoryUsagePercentage" alarm_description = "Redis:DatabaseMemoryUsagePercentage"
namespace = "AWS/ElastiCache" namespace = "AWS/ElastiCache"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = "true" actions_enabled = var.actions-enabled
alarm_actions = [var.alarm-actions-urgent] alarm_actions = [var.alarm-actions-urgent]
ok_actions = [var.alarm-actions-urgent] ok_actions = [var.alarm-actions-urgent]
dimensions = { dimensions = {
@ -44,14 +44,8 @@ resource aws_cloudwatch_metric_alarm redis-DatabaseMemoryUsagePercentage {
} }
} }
/* resource "aws_cloudwatch_metric_alarm" "redis-CacheHitRate" {
data aws_elasticache_cluster redis-cluster { alarm_name = "${var.cw-alarm-prefix}:Redis:CacheHitRate:${var.redis-cluster-id}"
cluster_id = var.redis-cluster-id
}
*/
resource aws_cloudwatch_metric_alarm redis-CacheHitRate {
# for_each = toset(data.aws_elasticache_cluster.redis-cluster.cache_nodes.*.id)
alarm_name = "Redis:CacheHitRate:${var.redis-cluster-id}"
comparison_operator = "LessThanThreshold" comparison_operator = "LessThanThreshold"
evaluation_periods = "4" evaluation_periods = "4"
metric_name = "CacheHitRate" metric_name = "CacheHitRate"
@ -61,7 +55,7 @@ resource aws_cloudwatch_metric_alarm redis-CacheHitRate {
alarm_description = "Redis:CacheHitRate" alarm_description = "Redis:CacheHitRate"
namespace = "AWS/ElastiCache" namespace = "AWS/ElastiCache"
insufficient_data_actions = [] insufficient_data_actions = []
actions_enabled = "true" actions_enabled = var.actions-enabled
alarm_actions = [var.alarm-actions-standard] alarm_actions = [var.alarm-actions-standard]
ok_actions = [var.alarm-actions-standard] ok_actions = [var.alarm-actions-standard]
dimensions = { dimensions = {

View File

@ -1,22 +1,24 @@
variable redis-cluster-id {} variable cw-alarm-prefix {}
variable alarm-actions-urgent { variable "actions-enabled" {}
variable "redis-cluster-id" {}
variable "alarm-actions-urgent" {
type = string type = string
default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-urgent" default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-urgent"
} }
variable alarm-actions-emergency { variable "alarm-actions-emergency" {
type = string type = string
default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-emergency" default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-emergency"
} }
variable alarm-actions-standard { variable "alarm-actions-standard" {
type = string type = string
default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-standard" default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support-standard"
} }
variable alarm-actions-general { variable "alarm-actions-general" {
type = string type = string
default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support" default = "arn:aws:sns:ap-east-1:843733946244:rackspace-support"
} }
variable default-tags {} variable "default-tags" {}
variable threshold-EngineCPUUtilization {} variable "threshold-EngineCPUUtilization" {}
variable threshold-DatabaseMemoryUsagePercentage {} variable "threshold-DatabaseMemoryUsagePercentage" {}
variable threshold-CacheHitRate {} variable "threshold-CacheHitRate" {}

View File

@ -1,3 +1,4 @@
#!/bin/bash #!/bin/bash
RESULTS=$(aws elbv2 describe-load-balancers --query 'LoadBalancers[?Type==`application`].LoadBalancerArn' --output text --no-cli-pager | sed 's/\t/\n/g' | sort | xargs) RESULTS=$(aws elbv2 describe-load-balancers --query 'LoadBalancers[?Type==`application`].LoadBalancerArn' --output text --no-cli-pager | sed 's/\t/\n/g' | sort | xargs)
jq -n --arg result "$RESULTS" '{"result":$result}' jq -n --arg result "$RESULTS" '{"result":$result}'

View File

@ -0,0 +1,6 @@
#!/bin/bash
# Prints a JSON object {"result": "<names>"} where <names> is the sorted,
# space-separated list of Auto Scaling group names returned by the AWS CLI.
# NOTE(review): presumably consumed by a Terraform "external" data source,
# like the other list-*.sh helpers — confirm against the caller.

# Abort with a non-zero exit status if any stage of the pipeline fails, so an
# AWS CLI error is reported instead of being emitted as an empty result.
set -euo pipefail

# Tabs in the text output become newlines so that sort works per name; xargs
# then joins the sorted names back onto a single space-separated line.
RESULTS=$(aws autoscaling describe-auto-scaling-groups --query 'AutoScalingGroups[*].AutoScalingGroupName' --output text --no-cli-pager | sed 's/\t/\n/g' | sort | xargs)
jq -n --arg result "$RESULTS" '{"result":$result}'

View File

@ -0,0 +1,6 @@
#!/bin/bash
# Prints a JSON object {"result": "<names>"} where <names> is the sorted,
# space-separated list of cluster names returned by `aws kafka list-clusters`.
# NOTE(review): presumably consumed by a Terraform "external" data source,
# like the other list-*.sh helpers — confirm against the caller.

# Abort with a non-zero exit status if any stage of the pipeline fails, so an
# AWS CLI error is reported instead of being emitted as an empty result.
set -euo pipefail

# The JMESPath query is quoted so the shell never treats `[*]` as a glob.
# Tabs in the text output become newlines so that sort works per name; xargs
# then joins the sorted names back onto a single space-separated line.
RESULTS=$(aws kafka list-clusters --query 'ClusterInfoList[*].ClusterName' --output text --no-cli-pager | sed 's/\t/\n/g' | sort | xargs)
jq -n --arg result "$RESULTS" '{"result":$result}'

View File

@ -0,0 +1,6 @@
#!/bin/bash
# Prints a JSON object {"result": "<names>"} where <names> is the sorted,
# space-separated list of domain names returned by
# `aws opensearch list-domain-names`.
# NOTE(review): presumably consumed by a Terraform "external" data source,
# like the other list-*.sh helpers — confirm against the caller.

# Abort with a non-zero exit status if any stage of the pipeline fails, so an
# AWS CLI error is reported instead of being emitted as an empty result.
set -euo pipefail

# The JMESPath query is quoted so the shell never treats `[*]` as a glob.
# Tabs in the text output become newlines so that sort works per name; xargs
# then joins the sorted names back onto a single space-separated line.
RESULTS=$(aws opensearch list-domain-names --query 'DomainNames[*].DomainName' --output text --no-cli-pager | sed 's/\t/\n/g' | sort | xargs)
jq -n --arg result "$RESULTS" '{"result":$result}'