# Listing metadata from the code viewer: 258 lines, 9.5 KiB, Terraform.
# Detect SNS topic ARNs.
# local.sns-targets maps each alarm-action key to the ARN of an SNS topic read
# from a data.aws_sns_topic source (those data sources are declared outside this
# view). The map is passed as `sns-targets` to every monitoring module below.
locals {
  sns-targets = {
    alarm-actions-urgent    = data.aws_sns_topic.rackspace-support-urgent.arn
    alarm-actions-emergency = data.aws_sns_topic.rackspace-support-emergency.arn
    alarm-actions-standard  = data.aws_sns_topic.rackspace-support-standard.arn
    alarm-actions-general   = data.aws_sns_topic.rackspace-support.arn
  }
}
# Health event monitoring.
# Single (non-iterated) instance of the Monitoring.EventBridge module.
module "health-events" {
  source = "../../modules/ManagementGovernance/Monitoring.EventBridge"

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags

  # NOTE(review): hard-coded to true, whereas every other monitoring module in
  # this file uses var.actions-enabled - confirm this is intentional.
  actions-enabled = true
  sns-targets     = local.sns-targets
}
# RDS monitoring.
# Resource-list helper invoked with resource-type "rds"; its `result-set`
# output is used as the for_each collection of module.rds-monitoring.
module "rds-instances" {
  source = "../../modules/util/resource-list"

  resource-type = "rds"
}
# One Monitoring.RDS module instance per element of
# module.rds-instances.result-set; each element is passed as rds-instance-name.
module "rds-monitoring" {
  source = "../../modules/ManagementGovernance/Monitoring.RDS"

  # An earlier revision iterated over toset(var.rds-instance-ids) instead.
  for_each = module.rds-instances.result-set

  cw-alarm-prefix   = local.cw-alarm-prefix
  default-tags      = local.default-tags
  rds-instance-name = each.value

  threshold-CpuUtilization   = 90
  threshold-FreeableMemory   = 512 * 1024 * 1024      # 512 * 2^20
  threshold-FreeStorageSpace = 5 * 1024 * 1024 * 1024 # 5 * 2^30
  threshold-DiskQueueDepth   = 30
  threshold-ReadLatency      = 0.03
  threshold-WriteLatency     = 0.03

  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
# Redis monitoring.
# Resource-list helper invoked with resource-type "redis"; its `result-set`
# output is used as the for_each collection of module.redis-monitoring.
module "redis-instances" {
  source = "../../modules/util/resource-list"

  resource-type = "redis"
}
# One Monitoring.Redis module instance per element of
# module.redis-instances.result-set; each element is passed as redis-cluster-id.
module "redis-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.Redis"
  for_each = module.redis-instances.result-set

  cw-alarm-prefix  = local.cw-alarm-prefix
  default-tags     = local.default-tags
  redis-cluster-id = each.value

  threshold-EngineCPUUtilization          = 90
  threshold-DatabaseMemoryUsagePercentage = 90
  threshold-CacheHitRate                  = 3

  # ElastiCache reports StringBasedCmdsLatency in microseconds, so 100000 is
  # 100 ms (assuming the module passes the value to the alarm unchanged).
  # NOTE(review): the previous comment said "1 second", which would be
  # 1000000 - confirm which threshold is intended.
  threshold-StringBasedCmdsLatency = 100000

  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
# EC2 monitoring.
# Resource-list helper invoked with resource-type "ec2"; its `result-set`
# output is used as the for_each collection of module.ec2-monitoring.
module "ec2-instances" {
  source = "../../modules/util/resource-list"

  resource-type = "ec2"
}
# One Monitoring.EC2 module instance per element of
# module.ec2-instances.result-set; each element is passed as ec2-instance-id.
module "ec2-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.EC2"
  for_each = module.ec2-instances.result-set

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  ec2-instance-id = each.value

  threshold-CPUUtilization = 90

  # The absolute "free" thresholds are disabled; the percentage-based ones
  # next to them are the ones currently set.
  #threshold-mem_free = 100000
  threshold-mem_used_percent = 95
  #threshold-swap_free = 100000
  threshold-swap_used_percent = 70
  #threshold-disk_free = 1 * 1000 * 1000 * 1000
  threshold-disk_used_percentage = 90
  threshold-disk_inodes_free     = 10000
  threshold-processes_total      = 500

  threshold-LogicalDiskFreePct = 10
  threshold-MemoryCommittedPct = 90

  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
# NLB monitoring.
# Resource-list helper invoked with resource-type "nlb"; its `result-set`
# output is used as the for_each collection of module.nlb-monitoring.
module "nlb-arns" {
  source = "../../modules/util/resource-list"

  resource-type = "nlb"
}
# One Monitoring.NLB module instance per element of
# module.nlb-arns.result-set; each element is passed as load-balancer.
module "nlb-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.NLB"
  for_each = module.nlb-arns.result-set

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  load-balancer   = each.value

  threshold-HealthHostCountMin = 1

  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
/* Disabled for now: all of their ALBs return a static HTTP response and have no target group attached.

data "external" "alb-arns" {
  program = ["bash", "./list-alb.sh"]
}

module "alb-monitoring" {
  for_each = toset(split(" ", data.external.alb-arns.result.result))
  source = "../../modules/ManagementGovernance/Monitoring.ALB"
  default-tags = local.default-tags
  load-balancer = each.value
  threshold-HealthHostCountMin = 1
}

*/
# EMR monitoring.
# Resource-list helper invoked with resource-type "emr"; its `result-set`
# output is used as the for_each collection of module.emr-monitoring.
module "emr-clusters" {
  source = "../../modules/util/resource-list"

  resource-type = "emr"
}
# One Monitoring.EMR module instance per element of
# module.emr-clusters.result-set.
module "emr-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.EMR"
  for_each = module.emr-clusters.result-set

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags

  # The job flow id is the second "/"-separated segment of each value.
  # NOTE(review): indexing [1] fails at plan time if a value contains no "/" -
  # confirm the format produced by module.emr-clusters.
  job-flow-id = split("/", each.value)[1]

  threshold-AppsPending         = 2
  threshold-CapacityRemainingGB = 100

  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
# ASG monitoring.
# Resource-list helper invoked with resource-type "asg"; its `result-set`
# output is used as the for_each collection of module.asg-monitoring.
module "asg" {
  source = "../../modules/util/resource-list"

  resource-type = "asg"
}
# One Monitoring.ASG module instance per element of module.asg.result-set;
# each element is passed as asg-name.
module "asg-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.ASG"
  for_each = module.asg.result-set

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  asg-name        = each.value

  threshold-CPUUtilization = 90

  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
# OpenSearch monitoring.
# Resource-list helper invoked with resource-type "opensearch"; its
# `result-set` output is used as the for_each collection of module.es-monitoring.
module "es-domains" {
  source = "../../modules/util/resource-list"

  resource-type = "opensearch"
}
# One Monitoring.OpenSearch module instance per element of
# module.es-domains.result-set; each element is passed as domain-name.
module "es-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.OpenSearch"
  for_each = module.es-domains.result-set

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  domain-name     = each.value

  threshold-CPUUtilization            = 90
  threshold-IndexingLatency           = 3
  threshold-SearchLatency             = 3
  threshold-ClusterIndexWritesBlocked = 1
  threshold-FreeStorageSpace          = 5120 # presumably MiB (about 5 GiB) - TODO confirm unit
  threshold-JVMMemoryPressure         = 95
  threshold-MasterCPUUtilization      = 50
  threshold-MasterJVMMemoryPressure   = 95
  threshold-ThreadpoolSearchQueue     = 500
  threshold-ThreadpoolSearchRejected  = 1
  threshold-ThreadpoolWriteQueue      = 100
  threshold-ThreadpoolWriteRejected   = 1

  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
# MSK monitoring.
# Resource-list helper invoked with resource-type "kafka"; its `result-set`
# output is used as the for_each collection of module.kafka-monitoring.
module "kafka-clusters" {
  source = "../../modules/util/resource-list"

  resource-type = "kafka"
}
# One Monitoring.Kafka module instance per element of
# module.kafka-clusters.result-set; each element is passed as cluster-name.
module "kafka-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.Kafka"
  for_each = module.kafka-clusters.result-set

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  cluster-name    = each.value

  threshold-ZooKeeperRequestLatencyMsMean = 30
  threshold-CpuUserSystem                 = 60
  threshold-HeapMemoryAfterGC             = 60
  threshold-KafkaDataLogsDiskUsed         = 85

  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
# Transit gateway monitoring.
# Resource-list helper invoked with resource-type "tgw"; its `result-set`
# output is used as the for_each collection of module.tgw-monitoring.
module "tgw" {
  source = "../../modules/util/resource-list"

  resource-type = "tgw"
}
# One Monitoring.TGW module instance per element of module.tgw.result-set;
# each element is passed as tgw-id.
module "tgw-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.TGW"
  for_each = module.tgw.result-set

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  tgw-id          = each.value

  threshold-PacketDropCountNoRoute = 100

  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
# NAT Gateway monitoring.
# Resource-list helper invoked with resource-type "ngw"; its `result-set`
# output is used as the for_each collection of module.ngw-monitoring.
module "ngw" {
  source = "../../modules/util/resource-list"

  resource-type = "ngw"
}
# One Monitoring.NGW module instance per element of module.ngw.result-set;
# each element is passed as res-id.
module "ngw-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.NGW"
  for_each = module.ngw.result-set

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  res-id          = each.value

  threshold-ErrorPortAllocation        = 0
  threshold-ConnectionEstablishedCount = 1000
  threshold-PacketsDropCount           = 10

  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}
# EKS monitoring.
# Data source with no arguments; its `names` attribute is used as the
# for_each collection of module.eks-monitoring.
data "aws_eks_clusters" "eks-clusters" {}
# One Monitoring.EKS module instance per name in
# data.aws_eks_clusters.eks-clusters.names; each name is passed as cluster-name.
module "eks-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.EKS"
  for_each = data.aws_eks_clusters.eks-clusters.names

  cw-alarm-prefix = local.cw-alarm-prefix
  default-tags    = local.default-tags
  cluster-name    = each.value

  # NOTE(review): namespace and pod names are hard-coded and applied to every
  # cluster - confirm they match the real workloads.
  eks-namespace = "default"
  pod-names     = ["depl-nginx", "depl-alpine"]

  threshold-pod_cpu_utilization                = 85
  threshold-pod_memory_utilization             = 85
  threshold-pod_number_of_container_restarts   = 5

  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
}