UPD: added monitoring for MSK/Redis

xpk 2022-12-19 16:13:45 +08:00
parent 7023e71fb5
commit 9a3ec387a9
Signed by: xpk
GPG Key ID: CD4FF6793F09AB86
6 changed files with 165 additions and 13 deletions

View File

@@ -20,3 +20,127 @@ resource "aws_cloudwatch_metric_alarm" "Kafka-ZooKeeperRequestLatencyMsMean" {
    ignore_changes = [tags]
  }
}
data "aws_msk_cluster" "msk-cluster" {
cluster_name = var.cluster-name
}
data "aws_msk_broker_nodes" "msk-broker" {
cluster_arn = data.aws_msk_cluster.msk-cluster.arn
}
/*
output debug {
value = data.aws_msk_broker_nodes.msk-broker.node_info_list
}
*/
/*
module "msk-brokers" {
source = "../../util/resource-list"
resource-type = "kafka-brokers"
query-input = data.aws_msk_cluster.msk-cluster.arn
}
*/
resource "aws_cloudwatch_metric_alarm" "Kafka-CpuUserSystem" {
for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])
alarm_name = "${var.cw-alarm-prefix}:Kafka:CpuUsage:${var.cluster-name}-${each.value}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3"
threshold = var.threshold-CpuUserSystem
alarm_description = "Kafka:ZooKeeperRequestLatencyMsMean"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
metric_query {
id = "m1"
metric {
metric_name = "CpuUser"
namespace = "AWS/Kafka"
period = 300
stat = "Average"
dimensions = {
"Cluster Name" = var.cluster-name
"Broker ID" = each.value
}
}
}
metric_query {
id = "m2"
metric {
metric_name = "CpuSystem"
namespace = "AWS/Kafka"
period = 300
stat = "Average"
dimensions = {
"Cluster Name" = var.cluster-name
"Broker ID" = each.value
}
}
}
metric_query {
id = "e1"
expression = "m1 + m2"
label = "CpuUserSystem"
return_data = "true"
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "Kafka-KafkaDataLogsDiskUsed" {
for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])
alarm_name = "${var.cw-alarm-prefix}:Kafka:KafkaDataLogsDiskUsed:${var.cluster-name}-${each.value}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3"
metric_name = "KafkaDataLogsDiskUsed"
period = "300"
statistic = "Average"
threshold = var.threshold-KafkaDataLogsDiskUsed
alarm_description = "Kafka:KafkaDataLogsDiskUsed"
namespace = "AWS/Kafka"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
"Cluster Name" = var.cluster-name
"Broker ID" = each.value
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}
resource "aws_cloudwatch_metric_alarm" "Kafka-HeapMemoryAfterGC" {
for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])
alarm_name = "${var.cw-alarm-prefix}:Kafka:HeapMemoryAfterGC:${var.cluster-name}-${each.value}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3"
metric_name = "HeapMemoryAfterGC"
period = "300"
statistic = "Average"
threshold = var.threshold-HeapMemoryAfterGC
alarm_description = "Kafka:HeapMemoryAfterGC"
namespace = "AWS/Kafka"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-urgent]
ok_actions = [var.sns-targets.alarm-actions-urgent]
dimensions = {
"Cluster Name" = var.cluster-name
"Broker ID" = each.value
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
}

View File

@@ -5,3 +5,6 @@ variable sns-targets {}
variable default-tags {}
variable threshold-ZooKeeperRequestLatencyMsMean {}
variable threshold-CpuUserSystem {}
variable threshold-KafkaDataLogsDiskUsed {}
variable threshold-HeapMemoryAfterGC {}
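
For context, these three thresholds become additional required inputs of the MSK monitoring module. A minimal sketch of a caller follows; the module source path, cluster name, SNS topic ARN, and threshold values are illustrative assumptions, not taken from this repository:

module "msk-monitoring" {
  source = "../modules/monitoring-msk" # illustrative path, not the repo's actual layout

  cluster-name    = "example-msk-cluster" # illustrative value
  cw-alarm-prefix = "prod"
  actions-enabled = true
  default-tags    = { Environment = "example" }
  sns-targets     = { alarm-actions-urgent = "arn:aws:sns:<region>:<account-id>:example-topic" }

  threshold-ZooKeeperRequestLatencyMsMean = 200 # ms, example only
  threshold-CpuUserSystem                 = 60  # percent (CpuUser + CpuSystem), example only
  threshold-KafkaDataLogsDiskUsed         = 85  # percent, example only
  threshold-HeapMemoryAfterGC             = 60  # percent, example only
}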

View File

@@ -66,4 +66,28 @@ resource "aws_cloudwatch_metric_alarm" "redis-CacheHitRate" {
  lifecycle {
    ignore_changes = [tags]
  }
}
resource "aws_cloudwatch_metric_alarm" "redis-StringBasedCmdsLatency" {
alarm_name = "${var.cw-alarm-prefix}:Redis:StringBasedCmdsLatency:${var.redis-cluster-id}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "3"
metric_name = "StringBasedCmdsLatency"
period = "60"
statistic = "Average"
threshold = var.threshold-StringBasedCmdsLatency
alarm_description = "The average latency, in microseconds, of the string-based commands run during a selected time range"
namespace = "AWS/ElastiCache"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
alarm_actions = [var.sns-targets.alarm-actions-standard]
ok_actions = [var.sns-targets.alarm-actions-standard]
dimensions = {
CacheClusterId = var.redis-cluster-id
# CacheNodeId = each.value
}
tags = var.default-tags
lifecycle {
ignore_changes = [tags]
}
} }

View File

@@ -1,9 +1,10 @@
variable "cw-alarm-prefix" {}
variable "actions-enabled" {}
variable "redis-cluster-id" {}
variable "sns-targets" {}
variable "default-tags" {}
variable "threshold-EngineCPUUtilization" {}
variable "threshold-DatabaseMemoryUsagePercentage" {}
variable "threshold-CacheHitRate" {}
variable "threshold-StringBasedCmdsLatency" {}

View File

@@ -10,19 +10,18 @@ This module performs the following tasks:
Subnet cidrs are calculated automatically. Due to the design of terraform's cidrsubnets, this module has limitations:
* supports 2, 4, 6, 8, or 12 subnets in total.
* hard-coded to work with 2 AZs, regardless of number of AZs available in the region.
Based on the input variables, it will create subnet cidrs using the following function:

| Private Subnets per az | Public Subnets per az | Function                                             | Example if a /24 is used on VPC |
|------------------------|-----------------------|------------------------------------------------------|---------------------------------|
| 1                      | 0                     | cidrsubnets(local.vpc-cidr, 1,1)                     | 2 * /25                         |
| 1                      | 1                     | cidrsubnets(local.vpc-cidr, 2,2,2,2)                 | 4 * /26                         |
| 2                      | 1                     | cidrsubnets(local.vpc-cidr, 3,3,3,3,3,3)             | 6 * /27                         |
| 2                      | 2                     | cidrsubnets(local.vpc-cidr, 3,3,3,3,3,3,3,3)         | 8 * /27                         |
| 3                      | 3                     | cidrsubnets(local.vpc-cidr, 4,4,4,4,4,4,4,4,4,4,4,4) | 12 * /28                        |
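
As a worked example: with 2 AZs and 2 public + 2 private subnets per AZ (8 subnets in total), a VPC cidr is split into eight /27 blocks. The 10.0.0.0/24 cidr below is illustrative only, and the terraform console output is shown approximately:

> cidrsubnets("10.0.0.0/24", 3, 3, 3, 3, 3, 3, 3, 3)
tolist([
  "10.0.0.0/27",
  "10.0.0.32/27",
  "10.0.0.64/27",
  "10.0.0.96/27",
  "10.0.0.128/27",
  "10.0.0.160/27",
  "10.0.0.192/27",
  "10.0.0.224/27",
])

The first four entries are then sliced off as the public subnets and the remaining four as the private subnets, matching the slice() calls in the module's locals.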
## Inputs:

View File

@@ -9,7 +9,8 @@ locals {
  vpc-cidr         = var.vpc-cidr
  total-no-subnets = local.no-az * (var.number-of-private-subnets-per-az + var.number-of-public-subnets-per-az)
  # simple-divide = local.total-no-subnets >=8 ? cidrsubnets(local.vpc-cidr, 4,4,4,4,4,4,4,4) : local.total-no-subnets >=6 ? cidrsubnets(local.vpc-cidr, 3,3,3,3,3,3) : local.total-no-subnets >=4 ? cidrsubnets(local.vpc-cidr, 2,2,2,2) : local.total-no-subnets >=2 ? cidrsubnets(local.vpc-cidr, 1,1) : null
  simple-divide    = local.total-no-subnets >= 12 ? cidrsubnets(local.vpc-cidr, 4,4,4,4, 4,4,4,4, 4,4,4,4) : local.total-no-subnets >= 8 ? cidrsubnets(local.vpc-cidr, 3,3,3,3, 3,3,3,3) : local.total-no-subnets >= 6 ? cidrsubnets(local.vpc-cidr, 3,3,3, 3,3,3) : local.total-no-subnets >= 4 ? cidrsubnets(local.vpc-cidr, 2,2,2,2) : local.total-no-subnets >= 2 ? cidrsubnets(local.vpc-cidr, 1, 1) : null
  public-subnets   = slice(local.simple-divide, 0, var.number-of-public-subnets-per-az * local.no-az)
  private-subnets  = slice(local.simple-divide, var.number-of-public-subnets-per-az * local.no-az, local.total-no-subnets)
}