UPD: added more OpenSearch alarms

This commit is contained in:
xpk 2022-12-19 14:45:23 +08:00
parent bfbca075aa
commit 7023e71fb5
Signed by: xpk
GPG Key ID: CD4FF6793F09AB86
2 changed files with 232 additions and 8 deletions

View File

@ -96,3 +96,218 @@ resource "aws_cloudwatch_metric_alarm" "ES-ClusterStatusRed" {
} }
} }
# CloudWatch alarm on the AWS/ES "ThreadpoolWriteQueue" metric of the domain.
# Goes to ALARM when the 60-second average is above
# var.threshold-ThreadpoolWriteQueue, evaluated over 2 periods.
# ALARM and OK transitions are both sent to the "standard" SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolWriteQueue" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:ThreadpoolWriteQueue:${var.domain-name}"
  alarm_description = "The cluster is experiencing high indexing concurrency. Review and control indexing requests, or increase cluster resources."

  # Metric selection.
  namespace   = "AWS/ES"
  metric_name = "ThreadpoolWriteQueue"
  dimensions = {
    DomainName = var.domain-name
    # account_id is the attribute documented as the AWS account number;
    # the data source's generic `id` is not relied upon.
    ClientId = data.aws_caller_identity.this.account_id
  }

  # Evaluation rule (numeric arguments are written as numbers, not strings).
  comparison_operator = "GreaterThanThreshold"
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 2
  threshold           = var.threshold-ThreadpoolWriteQueue

  # Notifications; nothing is sent for INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-standard]
  ok_actions                = [var.sns-targets.alarm-actions-standard]
  insufficient_data_actions = []

  tags = var.default-tags
  lifecycle {
    # Tag differences are ignored when planning updates to this alarm.
    ignore_changes = [tags]
  }
}
# CloudWatch alarm on the AWS/ES "ThreadpoolSearchQueue" metric of the domain.
# Goes to ALARM when the 60-second average is above
# var.threshold-ThreadpoolSearchQueue, evaluated over 2 periods.
# ALARM and OK transitions are both sent to the "standard" SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolSearchQueue" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:ThreadpoolSearchQueue:${var.domain-name}"
  alarm_description = "The cluster is experiencing high search concurrency. Consider scaling your cluster. You can also increase the search queue size, but increasing it excessively can cause out of memory errors."

  # Metric selection.
  namespace   = "AWS/ES"
  metric_name = "ThreadpoolSearchQueue"
  dimensions = {
    DomainName = var.domain-name
    # account_id is the attribute documented as the AWS account number;
    # the data source's generic `id` is not relied upon.
    ClientId = data.aws_caller_identity.this.account_id
  }

  # Evaluation rule (numeric arguments are written as numbers, not strings).
  comparison_operator = "GreaterThanThreshold"
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 2
  threshold           = var.threshold-ThreadpoolSearchQueue

  # Notifications; nothing is sent for INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-standard]
  ok_actions                = [var.sns-targets.alarm-actions-standard]
  insufficient_data_actions = []

  tags = var.default-tags
  lifecycle {
    # Tag differences are ignored when planning updates to this alarm.
    ignore_changes = [tags]
  }
}
# CloudWatch alarm on the AWS/ES "ThreadpoolSearchRejected" metric of the domain.
# Goes to ALARM when the 60-second average is above
# var.threshold-ThreadpoolSearchRejected, evaluated over 2 periods.
# ALARM and OK transitions are both sent to the "urgent" SNS target.
#
# NOTE(review): AWS's recommended alarm for this metric appears to be based on
# the Sum statistic rather than Average - confirm Average is intended here.
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolSearchRejected" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:ThreadpoolSearchRejected:${var.domain-name}"
  alarm_description = "These alarms notify you of domain issues that might impact performance and stability."

  # Metric selection.
  namespace   = "AWS/ES"
  metric_name = "ThreadpoolSearchRejected"
  dimensions = {
    DomainName = var.domain-name
    # account_id is the attribute documented as the AWS account number;
    # the data source's generic `id` is not relied upon.
    ClientId = data.aws_caller_identity.this.account_id
  }

  # Evaluation rule (numeric arguments are written as numbers, not strings).
  comparison_operator = "GreaterThanThreshold"
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 2
  threshold           = var.threshold-ThreadpoolSearchRejected

  # Notifications; nothing is sent for INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags
  lifecycle {
    # Tag differences are ignored when planning updates to this alarm.
    ignore_changes = [tags]
  }
}
# CloudWatch alarm on the AWS/ES "ThreadpoolWriteRejected" metric of the domain.
# Goes to ALARM when the 60-second average is above
# var.threshold-ThreadpoolWriteRejected, evaluated over 2 periods.
# ALARM and OK transitions are both sent to the "urgent" SNS target.
#
# NOTE(review): AWS's recommended alarm for this metric appears to be based on
# the Sum statistic rather than Average - confirm Average is intended here.
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolWriteRejected" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:ThreadpoolWriteRejected:${var.domain-name}"
  alarm_description = "These alarms notify you of domain issues that might impact performance and stability."

  # Metric selection.
  namespace   = "AWS/ES"
  metric_name = "ThreadpoolWriteRejected"
  dimensions = {
    DomainName = var.domain-name
    # account_id is the attribute documented as the AWS account number;
    # the data source's generic `id` is not relied upon.
    ClientId = data.aws_caller_identity.this.account_id
  }

  # Evaluation rule (numeric arguments are written as numbers, not strings).
  comparison_operator = "GreaterThanThreshold"
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 2
  threshold           = var.threshold-ThreadpoolWriteRejected

  # Notifications; nothing is sent for INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags
  lifecycle {
    # Tag differences are ignored when planning updates to this alarm.
    ignore_changes = [tags]
  }
}
# CloudWatch alarm on the AWS/ES "MasterCPUUtilization" metric of the domain.
# Goes to ALARM when the 300-second average is above
# var.threshold-MasterCPUUtilization, evaluated over 2 periods.
# ALARM and OK transitions are both sent to the "urgent" SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-MasterCPUUtilization" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:MasterCPUUtilization:${var.domain-name}"
  alarm_description = "MasterCPUUtilization"

  # Metric selection.
  namespace   = "AWS/ES"
  metric_name = "MasterCPUUtilization"
  dimensions = {
    DomainName = var.domain-name
    # account_id is the attribute documented as the AWS account number;
    # the data source's generic `id` is not relied upon.
    ClientId = data.aws_caller_identity.this.account_id
  }

  # Evaluation rule (numeric arguments are written as numbers, not strings).
  comparison_operator = "GreaterThanThreshold"
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 2
  threshold           = var.threshold-MasterCPUUtilization

  # Notifications; nothing is sent for INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags
  lifecycle {
    # Tag differences are ignored when planning updates to this alarm.
    ignore_changes = [tags]
  }
}
# CloudWatch alarm on the AWS/ES "MasterJVMMemoryPressure" metric of the domain.
# Goes to ALARM when the 60-second average is above
# var.threshold-MasterJVMMemoryPressure, evaluated over 2 periods.
# ALARM and OK transitions are both sent to the "urgent" SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-MasterJVMMemoryPressure" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:MasterJVMMemoryPressure:${var.domain-name}"
  alarm_description = "MasterJVMMemoryPressure"

  # Metric selection.
  namespace   = "AWS/ES"
  metric_name = "MasterJVMMemoryPressure"
  dimensions = {
    DomainName = var.domain-name
    # account_id is the attribute documented as the AWS account number;
    # the data source's generic `id` is not relied upon.
    ClientId = data.aws_caller_identity.this.account_id
  }

  # Evaluation rule (numeric arguments are written as numbers, not strings).
  comparison_operator = "GreaterThanThreshold"
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 2
  threshold           = var.threshold-MasterJVMMemoryPressure

  # Notifications; nothing is sent for INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags
  lifecycle {
    # Tag differences are ignored when planning updates to this alarm.
    ignore_changes = [tags]
  }
}
# CloudWatch alarm on the AWS/ES "JVMMemoryPressure" metric of the domain.
# Goes to ALARM when the 60-second average is above
# var.threshold-JVMMemoryPressure, evaluated over 2 periods.
# ALARM and OK transitions are both sent to the "urgent" SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-JVMMemoryPressure" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:JVMMemoryPressure:${var.domain-name}"
  alarm_description = "The cluster could encounter out of memory errors if usage increases. Consider scaling vertically. OpenSearch Service uses half of an instance's RAM for the Java heap, up to a heap size of 32 GiB. You can scale instances vertically up to 64 GiB of RAM, at which point you can scale horizontally by adding instances."

  # Metric selection.
  namespace   = "AWS/ES"
  metric_name = "JVMMemoryPressure"
  dimensions = {
    DomainName = var.domain-name
    # account_id is the attribute documented as the AWS account number;
    # the data source's generic `id` is not relied upon.
    ClientId = data.aws_caller_identity.this.account_id
  }

  # Evaluation rule (numeric arguments are written as numbers, not strings).
  comparison_operator = "GreaterThanThreshold"
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 2
  threshold           = var.threshold-JVMMemoryPressure

  # Notifications; nothing is sent for INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags
  lifecycle {
    # Tag differences are ignored when planning updates to this alarm.
    ignore_changes = [tags]
  }
}
# CloudWatch alarm on the AWS/ES "ClusterIndexWritesBlocked" metric of the domain.
# Goes to ALARM when the 60-second average is above
# var.threshold-ClusterIndexWritesBlocked, evaluated over 2 periods.
# ALARM and OK transitions are both sent to the "urgent" SNS target.
#
# NOTE(review): the comparison is strictly "greater than". If this metric only
# reports 0 or 1 (presumably - verify against the AWS metric reference), the
# threshold passed in must be below 1 for the alarm to ever fire.
resource "aws_cloudwatch_metric_alarm" "ES-ClusterIndexWritesBlocked" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:ClusterIndexWritesBlocked:${var.domain-name}"
  alarm_description = "Your cluster is blocking write requests. See ClusterBlockException."

  # Metric selection.
  namespace   = "AWS/ES"
  metric_name = "ClusterIndexWritesBlocked"
  dimensions = {
    DomainName = var.domain-name
    # account_id is the attribute documented as the AWS account number;
    # the data source's generic `id` is not relied upon.
    ClientId = data.aws_caller_identity.this.account_id
  }

  # Evaluation rule (numeric arguments are written as numbers, not strings).
  comparison_operator = "GreaterThanThreshold"
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 2
  threshold           = var.threshold-ClusterIndexWritesBlocked

  # Notifications; nothing is sent for INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags
  lifecycle {
    # Tag differences are ignored when planning updates to this alarm.
    ignore_changes = [tags]
  }
}
# CloudWatch alarm on the AWS/ES "FreeStorageSpace" metric of the domain.
# Goes to ALARM when the 300-second minimum is below
# var.threshold-FreeStorageSpace, evaluated over 2 periods.
# ALARM and OK transitions are both sent to the "urgent" SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-FreeStorageSpace" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:FreeStorageSpace:${var.domain-name}"
  alarm_description = "A node in your cluster is low on free storage space."

  # Metric selection.
  namespace   = "AWS/ES"
  metric_name = "FreeStorageSpace"
  dimensions = {
    DomainName = var.domain-name
    # account_id is the attribute documented as the AWS account number;
    # the data source's generic `id` is not relied upon.
    ClientId = data.aws_caller_identity.this.account_id
  }

  # Evaluation rule (numeric arguments are written as numbers, not strings).
  comparison_operator = "LessThanThreshold"
  # Minimum rather than Average: the description is about a single node being
  # low on space, and an average can hide one nearly-full node behind others
  # that still have room. This matches AWS's recommended alarm for the metric.
  statistic          = "Minimum"
  period             = 300
  evaluation_periods = 2
  threshold          = var.threshold-FreeStorageSpace

  # Notifications; nothing is sent for INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags
  lifecycle {
    # Tag differences are ignored when planning updates to this alarm.
    ignore_changes = [tags]
  }
}

View File

@ -1,10 +1,19 @@
# Input variables of the OpenSearch (AWS/ES) CloudWatch alarm module.
# Each variable is declared once, with a quoted label.

# Prefix placed at the start of every alarm name.
variable "cw-alarm-prefix" {}
# Passed to actions_enabled on every alarm.
variable "actions-enabled" {}
# Domain name; used in alarm names and as the DomainName dimension.
variable "domain-name" {}
# Object whose alarm-actions-standard and alarm-actions-urgent attributes are
# used as the alarm/OK action targets.
variable "sns-targets" {}
# Tags applied to every alarm.
variable "default-tags" {}

# Thresholds for alarms declared outside this excerpt (presumably named after
# the metric they guard - verify against the alarm definitions).
variable "threshold-CPUUtilization" {}
variable "threshold-SearchLatency" {}
variable "threshold-IndexingLatency" {}

# Thresholds for the alarms of the same name (ES-<metric>).
variable "threshold-ThreadpoolWriteQueue" {}
variable "threshold-ThreadpoolSearchQueue" {}
variable "threshold-ThreadpoolSearchRejected" {}
variable "threshold-ThreadpoolWriteRejected" {}
variable "threshold-MasterCPUUtilization" {}
variable "threshold-MasterJVMMemoryPressure" {}
variable "threshold-JVMMemoryPressure" {}
variable "threshold-ClusterIndexWritesBlocked" {}
variable "threshold-FreeStorageSpace" {}
# variable threshold-KibanaHealthyNodes {}