UPD: added monitoring for MSK/Redis
This commit is contained in:
parent
7023e71fb5
commit
9a3ec387a9
@ -20,3 +20,127 @@ resource "aws_cloudwatch_metric_alarm" "Kafka-ZooKeeperRequestLatencyMsMean" {
|
|||||||
ignore_changes = [tags]
|
ignore_changes = [tags]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Looks up the MSK cluster named by var.cluster-name; its ARN is consumed by
# the broker-node lookup in this file.
data "aws_msk_cluster" "msk-cluster" {
  cluster_name = var.cluster-name
}
|
||||||
|
|
||||||
|
# Enumerates the broker nodes of the MSK cluster resolved above. The per-broker
# alarms in this file iterate over node_info_list[*].broker_id from this data source.
data "aws_msk_broker_nodes" "msk-broker" {
  cluster_arn = data.aws_msk_cluster.msk-cluster.arn
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
output debug {
|
||||||
|
value = data.aws_msk_broker_nodes.msk-broker.node_info_list
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
module "msk-brokers" {
|
||||||
|
source = "../../util/resource-list"
|
||||||
|
resource-type = "kafka-brokers"
|
||||||
|
query-input = data.aws_msk_cluster.msk-cluster.arn
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
# Per-broker CPU alarm for the MSK cluster.
#
# One alarm is created for every broker ID returned by
# data.aws_msk_broker_nodes.msk-broker. Each alarm evaluates the metric-math
# expression "m1 + m2" (CpuUser + CpuSystem, 300-second averages) and compares
# it against var.threshold-CpuUserSystem over 3 evaluation periods.
resource "aws_cloudwatch_metric_alarm" "Kafka-CpuUserSystem" {
  # for_each needs a set of strings, so each broker_id is converted with tostring().
  for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])

  alarm_name          = "${var.cw-alarm-prefix}:Kafka:CpuUsage:${var.cluster-name}-${each.value}"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "3"
  threshold           = var.threshold-CpuUserSystem

  # Previously read "Kafka:ZooKeeperRequestLatencyMsMean" (copy-paste from the
  # ZooKeeper alarm), which mislabelled every CPU notification.
  alarm_description = "Kafka:CpuUserSystem"

  insufficient_data_actions = []
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]

  # m1: user-space CPU of this broker (input to e1 only; return_data is not set here).
  metric_query {
    id = "m1"
    metric {
      metric_name = "CpuUser"
      namespace   = "AWS/Kafka"
      period      = 300
      stat        = "Average"
      dimensions = {
        "Cluster Name" = var.cluster-name
        "Broker ID"    = each.value
      }
    }
  }

  # m2: kernel-space CPU of this broker (input to e1 only; return_data is not set here).
  metric_query {
    id = "m2"
    metric {
      metric_name = "CpuSystem"
      namespace   = "AWS/Kafka"
      period      = 300
      stat        = "Average"
      dimensions = {
        "Cluster Name" = var.cluster-name
        "Broker ID"    = each.value
      }
    }
  }

  # e1: the sum the alarm actually evaluates; the only query with return_data set.
  metric_query {
    id          = "e1"
    expression  = "m1 + m2"
    label       = "CpuUserSystem"
    return_data = "true"
  }

  tags = var.default-tags

  # Tag differences are ignored when planning changes to this alarm.
  lifecycle {
    ignore_changes = [tags]
  }
}
|
||||||
|
|
||||||
|
# Per-broker data-log disk usage alarm for the MSK cluster.
#
# One alarm per broker ID; compares the 300-second average of
# KafkaDataLogsDiskUsed against var.threshold-KafkaDataLogsDiskUsed over
# 3 evaluation periods.
resource "aws_cloudwatch_metric_alarm" "Kafka-KafkaDataLogsDiskUsed" {
  # for_each needs a set of strings, hence tostring() on every broker_id.
  for_each = toset([
    for broker_id in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(broker_id)
  ])

  # Identification
  alarm_name        = "${var.cw-alarm-prefix}:Kafka:KafkaDataLogsDiskUsed:${var.cluster-name}-${each.value}"
  alarm_description = "Kafka:KafkaDataLogsDiskUsed"

  # Metric under observation
  namespace   = "AWS/Kafka"
  metric_name = "KafkaDataLogsDiskUsed"
  statistic   = "Average"
  period      = "300"
  dimensions = {
    "Cluster Name" = var.cluster-name
    "Broker ID"    = each.value
  }

  # Evaluation rule
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-KafkaDataLogsDiskUsed
  evaluation_periods  = "3"

  # Notifications: the same SNS target is used for ALARM and OK transitions;
  # nothing is sent for INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  # Tag differences are ignored when planning changes to this alarm.
  lifecycle {
    ignore_changes = [tags]
  }
}
|
||||||
|
|
||||||
|
# Per-broker heap-after-GC alarm for the MSK cluster.
#
# One alarm per broker ID; compares the 300-second average of
# HeapMemoryAfterGC against var.threshold-HeapMemoryAfterGC over
# 3 evaluation periods.
resource "aws_cloudwatch_metric_alarm" "Kafka-HeapMemoryAfterGC" {
  # for_each needs a set of strings, hence tostring() on every broker_id.
  for_each = toset([
    for broker_id in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(broker_id)
  ])

  # Identification
  alarm_name        = "${var.cw-alarm-prefix}:Kafka:HeapMemoryAfterGC:${var.cluster-name}-${each.value}"
  alarm_description = "Kafka:HeapMemoryAfterGC"

  # Metric under observation
  namespace   = "AWS/Kafka"
  metric_name = "HeapMemoryAfterGC"
  statistic   = "Average"
  period      = "300"
  dimensions = {
    "Cluster Name" = var.cluster-name
    "Broker ID"    = each.value
  }

  # Evaluation rule
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-HeapMemoryAfterGC
  evaluation_periods  = "3"

  # Notifications: the same SNS target is used for ALARM and OK transitions;
  # nothing is sent for INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  # Tag differences are ignored when planning changes to this alarm.
  lifecycle {
    ignore_changes = [tags]
  }
}
|
||||||
|
|
||||||
|
@ -5,3 +5,6 @@ variable sns-targets {}
|
|||||||
variable default-tags {}
|
variable default-tags {}
|
||||||
|
|
||||||
variable threshold-ZooKeeperRequestLatencyMsMean {}
|
variable threshold-ZooKeeperRequestLatencyMsMean {}
|
||||||
|
# Threshold for the Kafka-CpuUserSystem alarm (compared against CpuUser + CpuSystem).
variable "threshold-CpuUserSystem" {}

# Threshold for the Kafka-KafkaDataLogsDiskUsed alarm.
variable "threshold-KafkaDataLogsDiskUsed" {}

# Threshold for the Kafka-HeapMemoryAfterGC alarm.
variable "threshold-HeapMemoryAfterGC" {}
|
@ -67,3 +67,27 @@ resource "aws_cloudwatch_metric_alarm" "redis-CacheHitRate" {
|
|||||||
ignore_changes = [tags]
|
ignore_changes = [tags]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Latency alarm for string-based Redis commands on the ElastiCache cluster
# identified by var.redis-cluster-id.
#
# Compares the 60-second average of StringBasedCmdsLatency against
# var.threshold-StringBasedCmdsLatency over 3 evaluation periods.
resource "aws_cloudwatch_metric_alarm" "redis-StringBasedCmdsLatency" {
  # Identification
  alarm_name        = "${var.cw-alarm-prefix}:Redis:StringBasedCmdsLatency:${var.redis-cluster-id}"
  alarm_description = "The average latency, in microseconds, of the string-based commands run during a selected time range"

  # Metric under observation
  namespace   = "AWS/ElastiCache"
  metric_name = "StringBasedCmdsLatency"
  statistic   = "Average"
  period      = "60"
  dimensions = {
    CacheClusterId = var.redis-cluster-id
    # Per-node dimension is left disabled, as in the original:
    # CacheNodeId = each.value
  }

  # Evaluation rule
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-StringBasedCmdsLatency
  evaluation_periods  = "3"

  # Notifications: the "standard" SNS target is used for ALARM and OK
  # transitions; nothing is sent for INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-standard]
  ok_actions                = [var.sns-targets.alarm-actions-standard]
  insufficient_data_actions = []

  tags = var.default-tags

  # Tag differences are ignored when planning changes to this alarm.
  lifecycle {
    ignore_changes = [tags]
  }
}
|
@ -1,9 +1,10 @@
|
|||||||
variable cw-alarm-prefix {}
|
variable "cw-alarm-prefix" {}
|
||||||
variable "actions-enabled" {}
|
variable "actions-enabled" {}
|
||||||
variable "redis-cluster-id" {}
|
variable "redis-cluster-id" {}
|
||||||
variable sns-targets {}
|
variable "sns-targets" {}
|
||||||
variable "default-tags" {}
|
variable "default-tags" {}
|
||||||
|
|
||||||
variable "threshold-EngineCPUUtilization" {}
|
variable "threshold-EngineCPUUtilization" {}
|
||||||
variable "threshold-DatabaseMemoryUsagePercentage" {}
|
variable "threshold-DatabaseMemoryUsagePercentage" {}
|
||||||
variable "threshold-CacheHitRate" {}
|
variable "threshold-CacheHitRate" {}
|
||||||
|
# Threshold for the redis-StringBasedCmdsLatency alarm; that alarm's description
# states the metric is an average latency in microseconds.
variable "threshold-StringBasedCmdsLatency" {}
|
@ -10,19 +10,18 @@ This module performs the following tasks:
|
|||||||
|
|
||||||
Subnet cidrs are calculated automatically. Due to the design of terraform's cidrsubnets, this module has limitations:
|
Subnet cidrs are calculated automatically. Due to the design of terraform's cidrsubnets, this module has limitations:
|
||||||
|
|
||||||
* supports 2, 4, 6, or 8 subnets in total.
|
* supports 2, 4, 6, 8, or 12 subnets in total.
|
||||||
* hard-coded to work with 2 AZs, regardless of number of AZs available in the region.
|
* hard-coded to work with 2 AZs, regardless of number of AZs available in the region.
|
||||||
|
|
||||||
Based on the input variables, it will create subnet cidrs using the following function
|
Based on the input variables, it will create subnet cidrs using the following function
|
||||||
|
|
||||||
| Private Subnets per az | Public Subnets per az | Function | Example if a /24 is used on VPC |
|
| Private Subnets per az | Public Subnets per az | Function | Example if a /24 is used on VPC |
|
||||||
| ---------------------- | --------------------- | -------------------------------------------- | ------------------------------- |
|
|------------------------|-----------------------|------------------------------------------------------|---------------------------------|
|
||||||
| 1 | 0 | cidrsubnets(local.vpc-cidr, 1,1) | 2 * /25 |
|
| 1 | 0 | cidrsubnets(local.vpc-cidr, 1,1) | 2 * /25 |
|
||||||
| 1 | 1 | cidrsubnets(local.vpc-cidr, 2,2,2,2) | 4 * /26 |
|
| 1 | 1 | cidrsubnets(local.vpc-cidr, 2,2,2,2) | 4 * /26 |
|
||||||
| 2 | 1 | cidrsubnets(local.vpc-cidr, 3,3,3,3,3,3) | 6 * /27 |
|
| 2 | 1 | cidrsubnets(local.vpc-cidr, 3,3,3,3,3,3) | 6 * /27 |
|
||||||
| 2 | 2 | cidrsubnets(local.vpc-cidr, 4,4,4,4,4,4,4,4) | 8 * /28 |
|
| 2 | 2 | cidrsubnets(local.vpc-cidr, 3,3,3,3,3,3,3,3) | 8 * /27 |
|
||||||
|
| 3                      | 3                     | cidrsubnets(local.vpc-cidr, 4,4,4,4,4,4,4,4,4,4,4,4) | 12 * /28                        |
|
||||||
simple-divide = local.total-no-subnets >=8 ? cidrsubnets(local.vpc-cidr, 4,4,4,4,4,4,4,4) : local.total-no-subnets >=6 ? cidrsubnets(local.vpc-cidr, 3,3,3,3,3,3) : local.total-no-subnets >=4 ? cidrsubnets(local.vpc-cidr, 2,2,2,2) : local.total-no-subnets >=2 ? cidrsubnets(local.vpc-cidr, 1,1) : null
|
|
||||||
|
|
||||||
## Inputs:
|
## Inputs:
|
||||||
|
|
||||||
|
@ -9,7 +9,8 @@ locals {
|
|||||||
vpc-cidr = var.vpc-cidr
|
vpc-cidr = var.vpc-cidr
|
||||||
total-no-subnets = local.no-az * (var.number-of-private-subnets-per-az + var.number-of-public-subnets-per-az)
|
total-no-subnets = local.no-az * (var.number-of-private-subnets-per-az + var.number-of-public-subnets-per-az)
|
||||||
|
|
||||||
simple-divide = local.total-no-subnets >=8 ? cidrsubnets(local.vpc-cidr, 4,4,4,4,4,4,4,4) : local.total-no-subnets >=6 ? cidrsubnets(local.vpc-cidr, 3,3,3,3,3,3) : local.total-no-subnets >=4 ? cidrsubnets(local.vpc-cidr, 2,2,2,2) : local.total-no-subnets >=2 ? cidrsubnets(local.vpc-cidr, 1,1) : null
|
# simple-divide = local.total-no-subnets >=8 ? cidrsubnets(local.vpc-cidr, 4,4,4,4,4,4,4,4) : local.total-no-subnets >=6 ? cidrsubnets(local.vpc-cidr, 3,3,3,3,3,3) : local.total-no-subnets >=4 ? cidrsubnets(local.vpc-cidr, 2,2,2,2) : local.total-no-subnets >=2 ? cidrsubnets(local.vpc-cidr, 1,1) : null
|
||||||
|
simple-divide = local.total-no-subnets >= 12 ? cidrsubnets(local.vpc-cidr, 4,4,4,4, 4,4,4,4, 4,4,4,4) : local.total-no-subnets >= 8 ? cidrsubnets(local.vpc-cidr, 3,3,3,3, 3,3,3,3) : local.total-no-subnets >= 6 ? cidrsubnets(local.vpc-cidr, 3,3,3, 3,3,3) : local.total-no-subnets >= 4 ? cidrsubnets(local.vpc-cidr, 2,2,2,2) : local.total-no-subnets >= 2 ? cidrsubnets(local.vpc-cidr, 1, 1) : null
|
||||||
public-subnets = slice(local.simple-divide, 0, var.number-of-public-subnets-per-az * local.no-az)
|
public-subnets = slice(local.simple-divide, 0, var.number-of-public-subnets-per-az * local.no-az)
|
||||||
private-subnets = slice(local.simple-divide, var.number-of-public-subnets-per-az * local.no-az , local.total-no-subnets)
|
private-subnets = slice(local.simple-divide, var.number-of-public-subnets-per-az * local.no-az , local.total-no-subnets)
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user