UPD: added monitoring for MSK/Redis

This commit is contained in:
xpk 2022-12-19 16:13:45 +08:00
parent 7023e71fb5
commit 9a3ec387a9
Signed by: xpk
GPG Key ID: CD4FF6793F09AB86
6 changed files with 165 additions and 13 deletions

View File

@ -20,3 +20,127 @@ resource "aws_cloudwatch_metric_alarm" "Kafka-ZooKeeperRequestLatencyMsMean" {
ignore_changes = [tags]
}
}
# Look up the existing MSK cluster by name; its ARN is used to enumerate
# the cluster's broker nodes.
data "aws_msk_cluster" "msk-cluster" {
  cluster_name = var.cluster-name
}
# Enumerate the broker nodes of the cluster resolved above. The per-broker
# alarms iterate over node_info_list[*].broker_id from this data source.
data "aws_msk_broker_nodes" "msk-broker" {
  cluster_arn = data.aws_msk_cluster.msk-cluster.arn
}
/*
output debug {
value = data.aws_msk_broker_nodes.msk-broker.node_info_list
}
*/
/*
module "msk-brokers" {
source = "../../util/resource-list"
resource-type = "kafka-brokers"
query-input = data.aws_msk_cluster.msk-cluster.arn
}
*/
# Per-broker CPU alarm for the MSK cluster.
#
# One alarm is created for each broker ID returned by
# data.aws_msk_broker_nodes.msk-broker. The alarm evaluates the sum of the
# CpuUser and CpuSystem metrics (5-minute averages) and fires when that sum
# is greater than var.threshold-CpuUserSystem over 3 evaluation periods.
# Both the ALARM and OK transitions notify the "urgent" SNS target.
resource "aws_cloudwatch_metric_alarm" "Kafka-CpuUserSystem" {
  # Broker IDs are converted to strings so they can be used as for_each keys.
  for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])

  alarm_name = "${var.cw-alarm-prefix}:Kafka:CpuUsage:${var.cluster-name}-${each.value}"
  # Describes the metric this alarm actually watches (previously carried the
  # ZooKeeper latency description by copy-paste).
  alarm_description = "Kafka:CpuUserSystem"

  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "3"
  threshold           = var.threshold-CpuUserSystem

  insufficient_data_actions = []
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]

  # m1: user-space CPU for this broker.
  metric_query {
    id = "m1"
    metric {
      metric_name = "CpuUser"
      namespace   = "AWS/Kafka"
      period      = 300
      stat        = "Average"
      dimensions = {
        "Cluster Name" = var.cluster-name
        "Broker ID"    = each.value
      }
    }
  }

  # m2: kernel-space CPU for this broker.
  metric_query {
    id = "m2"
    metric {
      metric_name = "CpuSystem"
      namespace   = "AWS/Kafka"
      period      = 300
      stat        = "Average"
      dimensions = {
        "Cluster Name" = var.cluster-name
        "Broker ID"    = each.value
      }
    }
  }

  # e1: the only query that returns data, i.e. the value the alarm compares
  # against the threshold.
  metric_query {
    id          = "e1"
    expression  = "m1 + m2"
    label       = "CpuUserSystem"
    return_data = "true"
  }

  tags = var.default-tags
  lifecycle {
    # Tag changes made after creation are not reconciled by Terraform.
    ignore_changes = [tags]
  }
}
# Per-broker data-log disk usage alarm for the MSK cluster.
#
# One alarm per broker ID reported by data.aws_msk_broker_nodes.msk-broker.
# Fires when the 5-minute average of KafkaDataLogsDiskUsed is greater than
# var.threshold-KafkaDataLogsDiskUsed over 3 evaluation periods.
resource "aws_cloudwatch_metric_alarm" "Kafka-KafkaDataLogsDiskUsed" {
  for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])

  # Identity
  alarm_name        = "${var.cw-alarm-prefix}:Kafka:KafkaDataLogsDiskUsed:${var.cluster-name}-${each.value}"
  alarm_description = "Kafka:KafkaDataLogsDiskUsed"

  # Metric selection
  namespace   = "AWS/Kafka"
  metric_name = "KafkaDataLogsDiskUsed"
  dimensions = {
    "Cluster Name" = var.cluster-name
    "Broker ID"    = each.value
  }

  # Evaluation
  statistic           = "Average"
  period              = "300"
  evaluation_periods  = "3"
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-KafkaDataLogsDiskUsed

  # Notifications: ALARM and OK transitions both go to the urgent target.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags
  lifecycle {
    # Tag changes made after creation are not reconciled by Terraform.
    ignore_changes = [tags]
  }
}
# Per-broker heap-after-GC alarm for the MSK cluster.
#
# One alarm per broker ID reported by data.aws_msk_broker_nodes.msk-broker.
# Fires when the 5-minute average of HeapMemoryAfterGC is greater than
# var.threshold-HeapMemoryAfterGC over 3 evaluation periods.
resource "aws_cloudwatch_metric_alarm" "Kafka-HeapMemoryAfterGC" {
  for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])

  # Identity
  alarm_name        = "${var.cw-alarm-prefix}:Kafka:HeapMemoryAfterGC:${var.cluster-name}-${each.value}"
  alarm_description = "Kafka:HeapMemoryAfterGC"

  # Metric selection
  namespace   = "AWS/Kafka"
  metric_name = "HeapMemoryAfterGC"
  dimensions = {
    "Cluster Name" = var.cluster-name
    "Broker ID"    = each.value
  }

  # Evaluation
  statistic           = "Average"
  period              = "300"
  evaluation_periods  = "3"
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-HeapMemoryAfterGC

  # Notifications: ALARM and OK transitions both go to the urgent target.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags
  lifecycle {
    # Tag changes made after creation are not reconciled by Terraform.
    ignore_changes = [tags]
  }
}

View File

@ -5,3 +5,6 @@ variable sns-targets {}
# Tags applied to resources at creation time.
variable "default-tags" {}

# Alarm thresholds. None declares a type or default, so the caller must
# supply a value for each one.
variable "threshold-ZooKeeperRequestLatencyMsMean" {}
variable "threshold-CpuUserSystem" {}
variable "threshold-KafkaDataLogsDiskUsed" {}
variable "threshold-HeapMemoryAfterGC" {}

View File

@ -66,4 +66,28 @@ resource "aws_cloudwatch_metric_alarm" "redis-CacheHitRate" {
lifecycle {
ignore_changes = [tags]
}
}
# Latency alarm for string-based Redis commands on the ElastiCache cluster
# identified by var.redis-cluster-id.
#
# Fires when the 60-second average of StringBasedCmdsLatency is greater than
# var.threshold-StringBasedCmdsLatency over 3 evaluation periods. ALARM and
# OK transitions both notify the "standard" SNS target.
resource "aws_cloudwatch_metric_alarm" "redis-StringBasedCmdsLatency" {
  # Identity
  alarm_name        = "${var.cw-alarm-prefix}:Redis:StringBasedCmdsLatency:${var.redis-cluster-id}"
  alarm_description = "The average latency, in microseconds, of the string-based commands run during a selected time range"

  # Metric selection
  namespace   = "AWS/ElastiCache"
  metric_name = "StringBasedCmdsLatency"
  dimensions = {
    CacheClusterId = var.redis-cluster-id
    # CacheNodeId = each.value
  }

  # Evaluation
  statistic           = "Average"
  period              = "60"
  evaluation_periods  = "3"
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-StringBasedCmdsLatency

  # Notifications
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-standard]
  ok_actions                = [var.sns-targets.alarm-actions-standard]
  insufficient_data_actions = []

  tags = var.default-tags
  lifecycle {
    # Tag changes made after creation are not reconciled by Terraform.
    ignore_changes = [tags]
  }
}

View File

@ -1,9 +1,10 @@
# Inputs for the Redis alarm module. No variable declares a type or default,
# so the caller must supply every value. Each variable is declared exactly
# once; Terraform rejects duplicate declarations of the same variable name.

# Prefix placed at the start of every alarm name.
variable "cw-alarm-prefix" {}
# Passed to each alarm's actions_enabled.
variable "actions-enabled" {}
# ElastiCache cluster ID, used in alarm names and as the CacheClusterId dimension.
variable "redis-cluster-id" {}
# Object holding the SNS targets referenced by alarm_actions / ok_actions.
variable "sns-targets" {}
# Tags applied to resources at creation time.
variable "default-tags" {}

# Alarm thresholds, one per monitored metric.
variable "threshold-EngineCPUUtilization" {}
variable "threshold-DatabaseMemoryUsagePercentage" {}
variable "threshold-CacheHitRate" {}
variable "threshold-StringBasedCmdsLatency" {}

View File

@ -10,19 +10,18 @@ This module performs the following tasks:
Subnet cidrs are calculated automatically. Due to the design of terraform's cidrsubnets, this module has limitations:
* supports 2, 4, 6, or 8 subnets in total.
* supports 2, 4, 6, 8, or 12 subnets in total.
* hard-coded to work with 2 AZs, regardless of number of AZs available in the region.
Based on the input variables, it will create subnet cidrs using the following function
| Private Subnets per az | Public Subnets per az | Function | Example if a /24 is used on VPC |
| ---------------------- | --------------------- | -------------------------------------------- | ------------------------------- |
| 1 | 0 | cidrsubnets(local.vpc-cidr, 1,1) | 2 * /25 |
| 1 | 1 | cidrsubnets(local.vpc-cidr, 2,2,2,2) | 4 * /26 |
| 2 | 1 | cidrsubnets(local.vpc-cidr, 3,3,3,3,3,3) | 6 * /27 |
| 2 | 2 | cidrsubnets(local.vpc-cidr, 4,4,4,4,4,4,4,4) | 8 * /28 |
simple-divide = local.total-no-subnets >=8 ? cidrsubnets(local.vpc-cidr, 4,4,4,4,4,4,4,4) : local.total-no-subnets >=6 ? cidrsubnets(local.vpc-cidr, 3,3,3,3,3,3) : local.total-no-subnets >=4 ? cidrsubnets(local.vpc-cidr, 2,2,2,2) : local.total-no-subnets >=2 ? cidrsubnets(local.vpc-cidr, 1,1) : null
| Private Subnets per az | Public Subnets per az | Function | Example if a /24 is used on VPC |
|------------------------|-----------------------|------------------------------------------------------|---------------------------------|
| 1 | 0 | cidrsubnets(local.vpc-cidr, 1,1) | 2 * /25 |
| 1 | 1 | cidrsubnets(local.vpc-cidr, 2,2,2,2) | 4 * /26 |
| 2 | 1 | cidrsubnets(local.vpc-cidr, 3,3,3,3,3,3) | 6 * /27 |
| 2 | 2 | cidrsubnets(local.vpc-cidr, 3,3,3,3,3,3,3,3) | 8 * /27 |
| 3 | 3 | cidrsubnets(local.vpc-cidr, 4,4,4,4,4,4,4,4,4,4,4,4) | 12 * /28 |
## Inputs:

View File

@ -9,7 +9,8 @@ locals {
vpc-cidr = var.vpc-cidr
total-no-subnets = local.no-az * (var.number-of-private-subnets-per-az + var.number-of-public-subnets-per-az)
simple-divide = local.total-no-subnets >=8 ? cidrsubnets(local.vpc-cidr, 4,4,4,4,4,4,4,4) : local.total-no-subnets >=6 ? cidrsubnets(local.vpc-cidr, 3,3,3,3,3,3) : local.total-no-subnets >=4 ? cidrsubnets(local.vpc-cidr, 2,2,2,2) : local.total-no-subnets >=2 ? cidrsubnets(local.vpc-cidr, 1,1) : null
# simple-divide = local.total-no-subnets >=8 ? cidrsubnets(local.vpc-cidr, 4,4,4,4,4,4,4,4) : local.total-no-subnets >=6 ? cidrsubnets(local.vpc-cidr, 3,3,3,3,3,3) : local.total-no-subnets >=4 ? cidrsubnets(local.vpc-cidr, 2,2,2,2) : local.total-no-subnets >=2 ? cidrsubnets(local.vpc-cidr, 1,1) : null
simple-divide = local.total-no-subnets >= 12 ? cidrsubnets(local.vpc-cidr, 4,4,4,4, 4,4,4,4, 4,4,4,4) : local.total-no-subnets >= 8 ? cidrsubnets(local.vpc-cidr, 3,3,3,3, 3,3,3,3) : local.total-no-subnets >= 6 ? cidrsubnets(local.vpc-cidr, 3,3,3, 3,3,3) : local.total-no-subnets >= 4 ? cidrsubnets(local.vpc-cidr, 2,2,2,2) : local.total-no-subnets >= 2 ? cidrsubnets(local.vpc-cidr, 1, 1) : null
public-subnets = slice(local.simple-divide, 0, var.number-of-public-subnets-per-az * local.no-az)
private-subnets = slice(local.simple-divide, var.number-of-public-subnets-per-az * local.no-az , local.total-no-subnets)
}