// ---------------------------------------------------------------------------
// Alarm notification targets
// ---------------------------------------------------------------------------
// Resolves the SNS topic ARN used for each alarm-action severity key.
// The aws_sns_topic data sources referenced here are not declared in this
// section and are expected to exist elsewhere in the configuration.
locals {
  sns-targets = {
    alarm-actions-general   = data.aws_sns_topic.rackspace-support.arn
    alarm-actions-standard  = data.aws_sns_topic.rackspace-support-standard.arn
    alarm-actions-urgent    = data.aws_sns_topic.rackspace-support-urgent.arn
    alarm-actions-emergency = data.aws_sns_topic.rackspace-support-emergency.arn
  }
}

// ---------------------------------------------------------------------------
// Health event monitoring
// ---------------------------------------------------------------------------
module "health-events" {
  source = "../../modules/ManagementGovernance/Monitoring.EventBridge"

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  sns-targets     = local.sns-targets

  # NOTE(review): hard-coded to true, whereas every other monitoring module in
  # this file follows var.actions-enabled -- confirm this is intentional.
  actions-enabled = true
}

// ---------------------------------------------------------------------------
// RDS monitoring
// ---------------------------------------------------------------------------
// Discovery: the resource-list helper is asked for resources of type "rds".
module "rds-instances" {
  source        = "../../modules/util/resource-list"
  resource-type = "rds"
}

// One Monitoring.RDS instance per entry in the discovered result set.
module "rds-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.RDS"
  for_each = module.rds-instances.result-set
  # Disabled alternative that iterates over an explicit variable instead:
  # for_each = toset(var.rds-instance-ids)

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets

  rds-instance-name = each.value

  threshold-CpuUtilization   = 90
  threshold-FreeableMemory   = 512 * 1024 * 1024      # 512 MiB if the module expects bytes -- TODO confirm unit
  threshold-FreeStorageSpace = 5 * 1024 * 1024 * 1024 # 5 GiB if the module expects bytes -- TODO confirm unit
  threshold-DiskQueueDepth   = 30
  threshold-ReadLatency      = 0.03
  threshold-WriteLatency     = 0.03
}

// ---------------------------------------------------------------------------
// Redis monitoring
// ---------------------------------------------------------------------------
module "redis-instances" {
  source        = "../../modules/util/resource-list"
  resource-type = "redis"
}

module "redis-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.Redis"
  for_each = module.redis-instances.result-set

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets

  redis-cluster-id = each.value

  threshold-EngineCPUUtilization          = 90
  threshold-DatabaseMemoryUsagePercentage = 90
  threshold-CacheHitRate                  = 3
  # NOTE(review): the previous inline note read "1 second". The ElastiCache
  # StringBasedCmdsLatency metric is published in microseconds, which would
  # make 100000 equal to 0.1 s rather than 1 s -- confirm the unit the module
  # expects and the intended limit before relying on this value.
  threshold-StringBasedCmdsLatency = 100000
}

// ---------------------------------------------------------------------------
// EC2 monitoring
// ---------------------------------------------------------------------------
module "ec2-instances" {
  source        = "../../modules/util/resource-list"
  resource-type = "ec2"
}

module "ec2-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.EC2"
  for_each = module.ec2-instances.result-set

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets

  ec2-instance-id = each.value

  threshold-CPUUtilization       = 90
  threshold-mem_used_percent     = 95
  threshold-swap_used_percent    = 70
  threshold-disk_used_percentage = 90
  threshold-disk_inodes_free     = 10000
  threshold-processes_total      = 500
  threshold-LogicalDiskFreePct   = 10
  threshold-MemoryCommittedPct   = 90

  # Absolute "free" thresholds are currently disabled; the percentage-based
  # thresholds above are the ones passed to the module.
  #threshold-mem_free  = 100000
  #threshold-swap_free = 100000
  #threshold-disk_free = 1 * 1000 * 1000 * 1000
}

// ---------------------------------------------------------------------------
// NLB monitoring
// ---------------------------------------------------------------------------
module "nlb-arns" {
  source        = "../../modules/util/resource-list"
  resource-type = "nlb"
}

module "nlb-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.NLB"
  for_each = module.nlb-arns.result-set

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets

  load-balancer = each.value

  threshold-HealthHostCountMin = 1
}

/*
ALB monitoring -- disabled for now.
All of their ALBs return a static HTTP response and have no target group
attached.

data "external" "alb-arns" {
  program = ["bash", "./list-alb.sh"]
}

module "alb-monitoring" {
  for_each     = toset(split(" ", data.external.alb-arns.result.result))
  source       = "../../modules/ManagementGovernance/Monitoring.ALB"
  default-tags = local.default-tags

  load-balancer                = each.value
  threshold-HealthHostCountMin = 1
}
*/

// ---------------------------------------------------------------------------
// EMR monitoring
// ---------------------------------------------------------------------------
module "emr-clusters" {
  source        = "../../modules/util/resource-list"
  resource-type = "emr"
}

module "emr-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.EMR"
  for_each = module.emr-clusters.result-set

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets

  # Uses the second "/"-separated segment of each discovered value.
  # Presumably the value is an ARN ending in ".../<cluster-id>" -- verify
  # against the resource-list output.
  job-flow-id = split("/", each.value)[1]

  threshold-AppsPending         = 2
  threshold-CapacityRemainingGB = 100
}

// ---------------------------------------------------------------------------
// ASG monitoring
// ---------------------------------------------------------------------------
module "asg" {
  source        = "../../modules/util/resource-list"
  resource-type = "asg"
}

module "asg-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.ASG"
  for_each = module.asg.result-set

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets

  asg-name = each.value

  threshold-CPUUtilization = 90
}

// ---------------------------------------------------------------------------
// OpenSearch monitoring
// ---------------------------------------------------------------------------
module "es-domains" {
  source        = "../../modules/util/resource-list"
  resource-type = "opensearch"
}

module "es-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.OpenSearch"
  for_each = module.es-domains.result-set

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets

  domain-name = each.value

  # Data-node and cluster-level thresholds
  threshold-CPUUtilization            = 90
  threshold-JVMMemoryPressure         = 95
  threshold-FreeStorageSpace          = 5120
  threshold-IndexingLatency           = 3
  threshold-SearchLatency             = 3
  threshold-ClusterIndexWritesBlocked = 1

  # Master-node thresholds
  threshold-MasterCPUUtilization    = 50
  threshold-MasterJVMMemoryPressure = 95

  # Thread-pool thresholds
  threshold-ThreadpoolSearchQueue    = 500
  threshold-ThreadpoolSearchRejected = 1
  threshold-ThreadpoolWriteQueue     = 100
  threshold-ThreadpoolWriteRejected  = 1
}

// ---------------------------------------------------------------------------
// MSK monitoring
// ---------------------------------------------------------------------------
module "kafka-clusters" {
  source        = "../../modules/util/resource-list"
  resource-type = "kafka"
}

module "kafka-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.Kafka"
  for_each = module.kafka-clusters.result-set

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets

  cluster-name = each.value

  threshold-ZooKeeperRequestLatencyMsMean = 30
  threshold-CpuUserSystem                 = 60
  threshold-HeapMemoryAfterGC             = 60
  threshold-KafkaDataLogsDiskUsed         = 85
}

// ---------------------------------------------------------------------------
// Transit Gateway monitoring
// ---------------------------------------------------------------------------
module "tgw" {
  source        = "../../modules/util/resource-list"
  resource-type = "tgw"
}

module "tgw-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.TGW"
  for_each = module.tgw.result-set

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets

  tgw-id = each.value

  threshold-PacketDropCountNoRoute = 100
}

// ---------------------------------------------------------------------------
// NAT Gateway monitoring
// ---------------------------------------------------------------------------
module "ngw" {
  source        = "../../modules/util/resource-list"
  resource-type = "ngw"
}

module "ngw-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.NGW"
  for_each = module.ngw.result-set

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets

  res-id = each.value

  threshold-ErrorPortAllocation         = 0
  threshold-ConnectionEstablishedCount = 1000
  threshold-PacketsDropCount            = 10
}

// ---------------------------------------------------------------------------
// EKS monitoring
// ---------------------------------------------------------------------------
// Unlike the sections above, clusters are enumerated with the AWS provider's
// aws_eks_clusters data source rather than the resource-list helper.
data "aws_eks_clusters" "eks-clusters" {}

module "eks-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.EKS"
  for_each = data.aws_eks_clusters.eks-clusters.names

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets

  cluster-name = each.value

  # The same namespace and pod names are passed for every cluster.
  eks-namespace = "default"
  pod-names     = ["depl-nginx", "depl-alpine"]

  threshold-pod_cpu_utilization              = 85
  threshold-pod_memory_utilization           = 85
  threshold-pod_number_of_container_restarts = 5
}