UPD: added monitoring for MSK/Redis
This commit is contained in:
parent
7023e71fb5
commit
9a3ec387a9
@ -20,3 +20,127 @@ resource "aws_cloudwatch_metric_alarm" "Kafka-ZooKeeperRequestLatencyMsMean" {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
# Resolve the MSK cluster by name; downstream data sources and alarms
# reference its ARN.
data "aws_msk_cluster" "msk-cluster" {
  cluster_name = var.cluster-name
}
|
||||
|
||||
# Enumerate the broker nodes of the cluster so per-broker alarms can be
# created with for_each over the broker IDs.
data "aws_msk_broker_nodes" "msk-broker" {
  cluster_arn = data.aws_msk_cluster.msk-cluster.arn
}
|
||||
|
||||
/*
|
||||
output debug {
|
||||
value = data.aws_msk_broker_nodes.msk-broker.node_info_list
|
||||
}
|
||||
*/
|
||||
/*
|
||||
module "msk-brokers" {
|
||||
source = "../../util/resource-list"
|
||||
resource-type = "kafka-brokers"
|
||||
query-input = data.aws_msk_cluster.msk-cluster.arn
|
||||
}
|
||||
*/
|
||||
|
||||
# Combined user+system CPU usage alarm, one per MSK broker.
# Uses metric math (m1 + m2) because AWS/Kafka exposes CpuUser and
# CpuSystem as separate metrics.
resource "aws_cloudwatch_metric_alarm" "Kafka-CpuUserSystem" {
  for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])

  alarm_name          = "${var.cw-alarm-prefix}:Kafka:CpuUsage:${var.cluster-name}-${each.value}"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "3"
  threshold           = var.threshold-CpuUserSystem
  # FIX: description previously read "Kafka:ZooKeeperRequestLatencyMsMean",
  # copy-pasted from another alarm; it now names the metric actually alarmed on.
  alarm_description         = "Kafka:CpuUserSystem"
  insufficient_data_actions = []

  actions_enabled = var.actions-enabled
  alarm_actions   = [var.sns-targets.alarm-actions-urgent]
  ok_actions      = [var.sns-targets.alarm-actions-urgent]

  # m1: user-space CPU for this broker.
  metric_query {
    id = "m1"
    metric {
      metric_name = "CpuUser"
      namespace   = "AWS/Kafka"
      period      = 300
      stat        = "Average"
      dimensions = {
        "Cluster Name" = var.cluster-name
        "Broker ID"    = each.value
      }
    }
  }

  # m2: kernel-space CPU for this broker.
  metric_query {
    id = "m2"
    metric {
      metric_name = "CpuSystem"
      namespace   = "AWS/Kafka"
      period      = 300
      stat        = "Average"
      dimensions = {
        "Cluster Name" = var.cluster-name
        "Broker ID"    = each.value
      }
    }
  }

  # e1: the alarmed value is the sum of user and system CPU.
  metric_query {
    id          = "e1"
    expression  = "m1 + m2"
    label       = "CpuUserSystem"
    return_data = "true"
  }

  tags = var.default-tags

  lifecycle {
    ignore_changes = [tags]
  }
}
|
||||
|
||||
# Disk usage alarm for the Kafka data-log volume, one per broker.
resource "aws_cloudwatch_metric_alarm" "Kafka-KafkaDataLogsDiskUsed" {
  for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])

  alarm_name          = "${var.cw-alarm-prefix}:Kafka:KafkaDataLogsDiskUsed:${var.cluster-name}-${each.value}"
  alarm_description   = "Kafka:KafkaDataLogsDiskUsed"
  namespace           = "AWS/Kafka"
  metric_name         = "KafkaDataLogsDiskUsed"
  statistic           = "Average"
  period              = "300"
  evaluation_periods  = "3"
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-KafkaDataLogsDiskUsed

  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  dimensions = {
    "Cluster Name" = var.cluster-name
    "Broker ID"    = each.value
  }

  tags = var.default-tags

  lifecycle {
    ignore_changes = [tags]
  }
}
|
||||
|
||||
# JVM heap pressure alarm (heap still used after garbage collection),
# one per broker.
resource "aws_cloudwatch_metric_alarm" "Kafka-HeapMemoryAfterGC" {
  for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])

  alarm_name          = "${var.cw-alarm-prefix}:Kafka:HeapMemoryAfterGC:${var.cluster-name}-${each.value}"
  alarm_description   = "Kafka:HeapMemoryAfterGC"
  namespace           = "AWS/Kafka"
  metric_name         = "HeapMemoryAfterGC"
  statistic           = "Average"
  period              = "300"
  evaluation_periods  = "3"
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-HeapMemoryAfterGC

  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  dimensions = {
    "Cluster Name" = var.cluster-name
    "Broker ID"    = each.value
  }

  tags = var.default-tags

  lifecycle {
    ignore_changes = [tags]
  }
}
|
||||
|
||||
|
@ -5,3 +5,6 @@ variable sns-targets {}
|
||||
# MSK monitoring module inputs.
# Labels are quoted: Terraform 0.12+ HCL requires quoted variable names,
# and the Redis variables file in this repository already uses that style.
variable "default-tags" {}

# Alarm thresholds, one per monitored metric.
variable "threshold-ZooKeeperRequestLatencyMsMean" {}
variable "threshold-CpuUserSystem" {}
variable "threshold-KafkaDataLogsDiskUsed" {}
variable "threshold-HeapMemoryAfterGC" {}
|
@ -66,4 +66,28 @@ resource "aws_cloudwatch_metric_alarm" "redis-CacheHitRate" {
|
||||
lifecycle {
|
||||
ignore_changes = [tags]
|
||||
}
|
||||
}
|
||||
|
||||
# Average latency of string-based Redis commands for the whole cluster.
resource "aws_cloudwatch_metric_alarm" "redis-StringBasedCmdsLatency" {
  alarm_name          = "${var.cw-alarm-prefix}:Redis:StringBasedCmdsLatency:${var.redis-cluster-id}"
  alarm_description   = "The average latency, in microseconds, of the string-based commands run during a selected time range"
  namespace           = "AWS/ElastiCache"
  metric_name         = "StringBasedCmdsLatency"
  statistic           = "Average"
  period              = "60"
  evaluation_periods  = "3"
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-StringBasedCmdsLatency

  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-standard]
  ok_actions                = [var.sns-targets.alarm-actions-standard]
  insufficient_data_actions = []

  dimensions = {
    CacheClusterId = var.redis-cluster-id
    # CacheNodeId = each.value  # intentionally unset: alarm is per cluster, not per node
  }

  tags = var.default-tags

  lifecycle {
    ignore_changes = [tags]
  }
}
|
@ -1,9 +1,10 @@
|
||||
# Redis monitoring module inputs.
# FIX: the previous version declared "cw-alarm-prefix" and "sns-targets"
# twice (once unquoted, once quoted) and "threshold-CacheHitRate" twice —
# duplicate variable declarations are a hard error in Terraform.
variable "cw-alarm-prefix" {}
variable "actions-enabled" {}
variable "redis-cluster-id" {}
variable "sns-targets" {}
variable "default-tags" {}

# Alarm thresholds, one per monitored metric.
variable "threshold-EngineCPUUtilization" {}
variable "threshold-DatabaseMemoryUsagePercentage" {}
variable "threshold-CacheHitRate" {}
variable "threshold-StringBasedCmdsLatency" {}
|
@ -10,19 +10,18 @@ This module performs the following tasks:
|
||||
|
||||
Subnet cidrs are calculated automatically. Due to the design of terraform's cidrsubnets, this module has limitations:
|
||||
|
||||
* supports 2, 4, 6, 8, or 12 subnets in total.
|
||||
* hard-coded to work with 2 AZs, regardless of number of AZs available in the region.
|
||||
|
||||
Based on the input variables, it will create subnet cidrs using the following function
|
||||
|
||||
|
||||
| Private Subnets per az | Public Subnets per az | Function | Example if a /24 is used on VPC |
|
||||
|------------------------|-----------------------|------------------------------------------------------|---------------------------------|
|
||||
| 1 | 0 | cidrsubnets(local.vpc-cidr, 1,1) | 2 * /25 |
|
||||
| 1 | 1 | cidrsubnets(local.vpc-cidr, 2,2,2,2) | 4 * /26 |
|
||||
| 2 | 1 | cidrsubnets(local.vpc-cidr, 3,3,3,3,3,3) | 6 * /27 |
|
||||
| 2 | 2 | cidrsubnets(local.vpc-cidr, 3,3,3,3,3,3,3,3) | 8 * /27 |
|
||||
| 6 | 6 | cidrsubnets(local.vpc-cidr, 4,4,4,4,4,4,4,4,4,4,4,4) | 12 * /28 |
|
||||
|
||||
## Inputs:
|
||||
|
||||
|
locals {
  vpc-cidr         = var.vpc-cidr
  total-no-subnets = local.no-az * (var.number-of-private-subnets-per-az + var.number-of-public-subnets-per-az)

  # Pick subnet sizes from the total number of subnets requested.
  # Supports 2, 4, 6, 8, or 12 subnets in total (see module README).
  # FIX: the previous version assigned simple-divide twice (the old
  # expression was left in place alongside the new one); a duplicate
  # attribute in a locals block is a hard error in Terraform.
  simple-divide = local.total-no-subnets >= 12 ? cidrsubnets(local.vpc-cidr, 4,4,4,4, 4,4,4,4, 4,4,4,4) : local.total-no-subnets >= 8 ? cidrsubnets(local.vpc-cidr, 3,3,3,3, 3,3,3,3) : local.total-no-subnets >= 6 ? cidrsubnets(local.vpc-cidr, 3,3,3, 3,3,3) : local.total-no-subnets >= 4 ? cidrsubnets(local.vpc-cidr, 2,2,2,2) : local.total-no-subnets >= 2 ? cidrsubnets(local.vpc-cidr, 1, 1) : null

  # Public subnets occupy the front of the generated list; private ones the rest.
  public-subnets  = slice(local.simple-divide, 0, var.number-of-public-subnets-per-az * local.no-az)
  private-subnets = slice(local.simple-divide, var.number-of-public-subnets-per-az * local.no-az, local.total-no-subnets)
}
|
||||
|
Loading…
Reference in New Issue
Block a user