terraform.aws-baseline-infra/examples/bea-mbk-monitoring-moved-to-bea-working-dir/main.tf
2023-02-28 16:38:16 +08:00

258 lines
9.5 KiB
HCL

// Map each alarm-action key to the ARN of the matching SNS topic.
// The aws_sns_topic data sources read here are not declared in this section.
locals {
  sns-targets = {
    alarm-actions-general   = data.aws_sns_topic.rackspace-support.arn
    alarm-actions-standard  = data.aws_sns_topic.rackspace-support-standard.arn
    alarm-actions-urgent    = data.aws_sns_topic.rackspace-support-urgent.arn
    alarm-actions-emergency = data.aws_sns_topic.rackspace-support-emergency.arn
  }
}
// Health event monitoring
// Instantiates the EventBridge monitoring module once, passing the shared
// alarm prefix, default tags and SNS target map.
module "health-events" {
  source = "../../modules/ManagementGovernance/Monitoring.EventBridge"

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  sns-targets     = local.sns-targets

  # Follow the same var.actions-enabled switch as every other monitoring
  # module in this file. This was previously hard-coded to true, which meant
  # the switch did not cover health-event notifications.
  actions-enabled = var.actions-enabled
}
// RDS monitoring
// Discovery step: asks the resource-list helper for "rds" resources. Its
// result-set output is what the rds-monitoring module iterates over.
module "rds-instances" {
  source        = "../../modules/util/resource-list"
  resource-type = "rds"
}
// One Monitoring.RDS module instance per entry of the discovered result-set.
module "rds-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.RDS"
  for_each = module.rds-instances.result-set
  # Earlier variant, kept for reference:
  # for_each = toset(var.rds-instance-ids)

  rds-instance-name = each.value
  cw-alarm-prefix   = local.cw-alarm-prefix
  default-tags      = local.default-tags

  # Alarm thresholds. Units are whatever the module expects; the products
  # below are 512 * 2^20 and 5 * 2^30 (512 MiB / 5 GiB if the module takes
  # bytes — TODO confirm).
  threshold-CpuUtilization   = 90
  threshold-FreeableMemory   = 512 * 1024 * 1024
  threshold-FreeStorageSpace = 5 * 1024 * 1024 * 1024
  threshold-DiskQueueDepth   = 30
  threshold-ReadLatency      = 0.03
  threshold-WriteLatency     = 0.03

  # Notification wiring shared with the other monitoring modules.
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
// Redis monitoring
// Discovery step: asks the resource-list helper for "redis" resources. Its
// result-set output is what the redis-monitoring module iterates over.
module "redis-instances" {
  source        = "../../modules/util/resource-list"
  resource-type = "redis"
}
// One Monitoring.Redis module instance per entry of the discovered result-set.
module "redis-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.Redis"
  for_each = module.redis-instances.result-set

  redis-cluster-id = each.value
  cw-alarm-prefix  = local.cw-alarm-prefix
  default-tags     = local.default-tags

  # Alarm thresholds.
  threshold-EngineCPUUtilization          = 90
  threshold-DatabaseMemoryUsagePercentage = 90
  threshold-CacheHitRate                  = 3
  # NOTE(review): the original inline note on this value read "1 second".
  # If the module passes it straight to the CloudWatch metric (reported in
  # microseconds), 100000 is 0.1 s, not 1 s — confirm which was intended.
  threshold-StringBasedCmdsLatency = 100000

  # Notification wiring shared with the other monitoring modules.
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
// EC2 monitoring
// Discovery step: asks the resource-list helper for "ec2" resources. Its
// result-set output is what the ec2-monitoring module iterates over.
module "ec2-instances" {
  source        = "../../modules/util/resource-list"
  resource-type = "ec2"
}
// One Monitoring.EC2 module instance per entry of the discovered result-set.
module "ec2-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.EC2"
  for_each = module.ec2-instances.result-set

  ec2-instance-id = each.value
  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags

  # Alarm thresholds. The commented-out entries are absolute "free" limits
  # that are currently not passed; the percentage-based ones are used instead.
  threshold-CPUUtilization = 90
  #threshold-mem_free = 100000
  threshold-mem_used_percent = 95
  #threshold-swap_free = 100000
  threshold-swap_used_percent = 70
  #threshold-disk_free = 1 * 1000 * 1000 * 1000
  threshold-disk_used_percentage = 90
  threshold-disk_inodes_free     = 10000
  threshold-processes_total      = 500
  threshold-LogicalDiskFreePct   = 10
  threshold-MemoryCommittedPct   = 90

  # Notification wiring shared with the other monitoring modules.
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
// NLB monitoring
// Discovery step: asks the resource-list helper for "nlb" resources. Its
// result-set output is what the nlb-monitoring module iterates over.
module "nlb-arns" {
  source        = "../../modules/util/resource-list"
  resource-type = "nlb"
}
// One Monitoring.NLB module instance per entry of the discovered result-set.
module "nlb-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.NLB"
  for_each = module.nlb-arns.result-set

  load-balancer   = each.value
  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags

  # Alarm threshold.
  threshold-HealthHostCountMin = 1

  # Notification wiring shared with the other monitoring modules.
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
/* Disabled for now: all of their ALBs return a static HTTP response and have no target group attached.
data "external" "alb-arns" {
program = ["bash", "./list-alb.sh"]
}
module "alb-monitoring" {
for_each = toset(split(" ", data.external.alb-arns.result.result))
source = "../../modules/ManagementGovernance/Monitoring.ALB"
default-tags = local.default-tags
load-balancer = each.value
threshold-HealthHostCountMin = 1
}
*/
// EMR monitoring
// Discovery step: asks the resource-list helper for "emr" resources. Its
// result-set output is what the emr-monitoring module iterates over.
module "emr-clusters" {
  source        = "../../modules/util/resource-list"
  resource-type = "emr"
}
// One Monitoring.EMR module instance per entry of the discovered result-set.
module "emr-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.EMR"
  for_each = module.emr-clusters.result-set

  # The job flow id is the second "/"-separated segment of each entry.
  # Assumes every entry contains at least one "/" — TODO confirm against the
  # resource-list output format.
  job-flow-id     = split("/", each.value)[1]
  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags

  # Alarm thresholds.
  threshold-AppsPending         = 2
  threshold-CapacityRemainingGB = 100

  # Notification wiring shared with the other monitoring modules.
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
// ASG monitoring
// Discovery step: asks the resource-list helper for "asg" resources. Its
// result-set output is what the asg-monitoring module iterates over.
module "asg" {
  source        = "../../modules/util/resource-list"
  resource-type = "asg"
}
// One Monitoring.ASG module instance per entry of the discovered result-set.
module "asg-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.ASG"
  for_each = module.asg.result-set

  asg-name        = each.value
  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags

  # Alarm threshold.
  threshold-CPUUtilization = 90

  # Notification wiring shared with the other monitoring modules.
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
// OpenSearch monitoring
// Discovery step: asks the resource-list helper for "opensearch" resources.
// Its result-set output is what the es-monitoring module iterates over.
module "es-domains" {
  source        = "../../modules/util/resource-list"
  resource-type = "opensearch"
}
// One Monitoring.OpenSearch module instance per entry of the discovered result-set.
module "es-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.OpenSearch"
  for_each = module.es-domains.result-set

  domain-name     = each.value
  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags

  # Data-node and cluster thresholds.
  threshold-CPUUtilization            = 90
  threshold-JVMMemoryPressure         = 95
  threshold-IndexingLatency           = 3
  threshold-SearchLatency             = 3
  threshold-ClusterIndexWritesBlocked = 1
  # presumably megabytes (about 5 GiB) — verify against the module
  threshold-FreeStorageSpace = 5120

  # Master-node thresholds.
  threshold-MasterCPUUtilization    = 50
  threshold-MasterJVMMemoryPressure = 95

  # Thread-pool thresholds.
  threshold-ThreadpoolSearchQueue    = 500
  threshold-ThreadpoolSearchRejected = 1
  threshold-ThreadpoolWriteQueue     = 100
  threshold-ThreadpoolWriteRejected  = 1

  # Notification wiring shared with the other monitoring modules.
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
// MSK monitoring
// Discovery step: asks the resource-list helper for "kafka" resources. Its
// result-set output is what the kafka-monitoring module iterates over.
module "kafka-clusters" {
  source        = "../../modules/util/resource-list"
  resource-type = "kafka"
}
// One Monitoring.Kafka module instance per entry of the discovered result-set.
module "kafka-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.Kafka"
  for_each = module.kafka-clusters.result-set

  cluster-name    = each.value
  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags

  # Alarm thresholds.
  threshold-ZooKeeperRequestLatencyMsMean = 30
  threshold-CpuUserSystem                 = 60
  threshold-HeapMemoryAfterGC             = 60
  threshold-KafkaDataLogsDiskUsed         = 85

  # Notification wiring shared with the other monitoring modules.
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
// Transit Gateway monitoring
// Discovery step: asks the resource-list helper for "tgw" resources. Its
// result-set output is what the tgw-monitoring module iterates over.
module "tgw" {
  source        = "../../modules/util/resource-list"
  resource-type = "tgw"
}
// One Monitoring.TGW module instance per entry of the discovered result-set.
module "tgw-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.TGW"
  for_each = module.tgw.result-set

  tgw-id          = each.value
  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags

  # Alarm threshold.
  threshold-PacketDropCountNoRoute = 100

  # Notification wiring shared with the other monitoring modules.
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
// NAT Gateway monitoring
// Discovery step: asks the resource-list helper for "ngw" resources. Its
// result-set output is what the ngw-monitoring module iterates over.
module "ngw" {
  source        = "../../modules/util/resource-list"
  resource-type = "ngw"
}
// One Monitoring.NGW module instance per entry of the discovered result-set.
module "ngw-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.NGW"
  for_each = module.ngw.result-set

  res-id          = each.value
  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags

  # Alarm thresholds.
  threshold-ErrorPortAllocation        = 0
  threshold-ConnectionEstablishedCount = 1000
  threshold-PacketsDropCount           = 10

  # Notification wiring shared with the other monitoring modules.
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
// EKS monitoring
// Discovery step: unlike the other sections, EKS clusters are read through
// the provider's aws_eks_clusters data source rather than the resource-list
// helper. Its names attribute is what the eks-monitoring module iterates over.
data "aws_eks_clusters" "eks-clusters" {
}
// One Monitoring.EKS module instance per cluster name returned by the data source.
module "eks-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.EKS"
  for_each = data.aws_eks_clusters.eks-clusters.names

  cluster-name    = each.value
  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags

  # NOTE(review): the namespace and pod names are hard-coded and applied to
  # every cluster — confirm they match the workloads that should be watched.
  eks-namespace = "default"
  pod-names     = ["depl-nginx", "depl-alpine"]

  # Alarm thresholds.
  threshold-pod_cpu_utilization              = 85
  threshold-pod_memory_utilization           = 85
  threshold-pod_number_of_container_restarts = 5

  # Notification wiring shared with the other monitoring modules.
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}