UPD: added more OpenSearch alarms
This commit is contained in:
parent
bfbca075aa
commit
7023e71fb5
@ -96,3 +96,218 @@ resource "aws_cloudwatch_metric_alarm" "ES-ClusterStatusRed" {
|
||||
}
|
||||
}
|
||||
|
||||
# CloudWatch alarm on the AWS/ES "ThreadpoolWriteQueue" metric of the domain
# named by var.domain-name. It goes into ALARM when the 60-second Average is
# greater than var.threshold-ThreadpoolWriteQueue over 2 evaluation periods.
# Both ALARM and OK transitions notify the "standard" SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolWriteQueue" {
  # --- identity -------------------------------------------------------------
  alarm_name        = "${var.cw-alarm-prefix}:ES:ThreadpoolWriteQueue:${var.domain-name}"
  alarm_description = "The cluster is experiencing high indexing concurrency. Review and control indexing requests, or increase cluster resources."

  # --- metric selection -----------------------------------------------------
  namespace   = "AWS/ES"
  metric_name = "ThreadpoolWriteQueue"
  dimensions = {
    # ClientId comes from the caller-identity data source declared elsewhere
    # in this module.
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # --- evaluation -----------------------------------------------------------
  statistic           = "Average"
  period              = "60"
  evaluation_periods  = "2"
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-ThreadpoolWriteQueue

  # --- notifications --------------------------------------------------------
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-standard]
  ok_actions                = [var.sns-targets.alarm-actions-standard]
  insufficient_data_actions = [] # no notification for INSUFFICIENT_DATA

  tags = var.default-tags

  lifecycle {
    # Tags are set at creation; later differences in tags are not planned.
    ignore_changes = [tags]
  }
}
|
||||
|
||||
# CloudWatch alarm on the AWS/ES "ThreadpoolSearchQueue" metric of the domain
# named by var.domain-name. It goes into ALARM when the 60-second Average is
# greater than var.threshold-ThreadpoolSearchQueue over 2 evaluation periods.
# Both ALARM and OK transitions notify the "standard" SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolSearchQueue" {
  # --- identity -------------------------------------------------------------
  alarm_name        = "${var.cw-alarm-prefix}:ES:ThreadpoolSearchQueue:${var.domain-name}"
  alarm_description = "The cluster is experiencing high search concurrency. Consider scaling your cluster. You can also increase the search queue size, but increasing it excessively can cause out of memory errors."

  # --- metric selection -----------------------------------------------------
  namespace   = "AWS/ES"
  metric_name = "ThreadpoolSearchQueue"
  dimensions = {
    # ClientId comes from the caller-identity data source declared elsewhere
    # in this module.
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # --- evaluation -----------------------------------------------------------
  statistic           = "Average"
  period              = "60"
  evaluation_periods  = "2"
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-ThreadpoolSearchQueue

  # --- notifications --------------------------------------------------------
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-standard]
  ok_actions                = [var.sns-targets.alarm-actions-standard]
  insufficient_data_actions = [] # no notification for INSUFFICIENT_DATA

  tags = var.default-tags

  lifecycle {
    # Tags are set at creation; later differences in tags are not planned.
    ignore_changes = [tags]
  }
}
|
||||
|
||||
# CloudWatch alarm on the AWS/ES "ThreadpoolSearchRejected" metric of the
# domain named by var.domain-name. It goes into ALARM when the 60-second
# Average is greater than var.threshold-ThreadpoolSearchRejected over 2
# evaluation periods. Both ALARM and OK transitions notify the "urgent" SNS
# target.
#
# NOTE(review): AWS's recommended alarm for this metric is built on the Sum
# statistic with a DIFF() math expression rather than the Average of the raw
# metric. Confirm that alarming on the raw Average is intended here.
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolSearchRejected" {
  alarm_name = "${var.cw-alarm-prefix}:ES:ThreadpoolSearchRejected:${var.domain-name}"

  # The previous text was a generic sentence that did not identify the
  # condition; the description now names what this specific alarm reports.
  alarm_description = "The search thread pool is rejecting requests. Consider scaling your cluster or reducing search concurrency."

  namespace   = "AWS/ES"
  metric_name = "ThreadpoolSearchRejected"
  dimensions = {
    DomainName = var.domain-name
    # ClientId comes from the caller-identity data source declared elsewhere
    # in this module.
    ClientId = data.aws_caller_identity.this.id
  }

  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-ThreadpoolSearchRejected
  statistic           = "Average"
  period              = "60"
  evaluation_periods  = "2"

  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = [] # no notification for INSUFFICIENT_DATA

  tags = var.default-tags

  lifecycle {
    # Tags are set at creation; later differences in tags are not planned.
    ignore_changes = [tags]
  }
}
|
||||
|
||||
# CloudWatch alarm on the AWS/ES "ThreadpoolWriteRejected" metric of the
# domain named by var.domain-name. It goes into ALARM when the 60-second
# Average is greater than var.threshold-ThreadpoolWriteRejected over 2
# evaluation periods. Both ALARM and OK transitions notify the "urgent" SNS
# target.
#
# NOTE(review): AWS's recommended alarm for this metric is built on the Sum
# statistic with a DIFF() math expression rather than the Average of the raw
# metric. Confirm that alarming on the raw Average is intended here.
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolWriteRejected" {
  alarm_name = "${var.cw-alarm-prefix}:ES:ThreadpoolWriteRejected:${var.domain-name}"

  # The previous text was a generic sentence that did not identify the
  # condition; the description now names what this specific alarm reports.
  alarm_description = "The write thread pool is rejecting indexing requests. Review and control indexing requests, or increase cluster resources."

  namespace   = "AWS/ES"
  metric_name = "ThreadpoolWriteRejected"
  dimensions = {
    DomainName = var.domain-name
    # ClientId comes from the caller-identity data source declared elsewhere
    # in this module.
    ClientId = data.aws_caller_identity.this.id
  }

  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-ThreadpoolWriteRejected
  statistic           = "Average"
  period              = "60"
  evaluation_periods  = "2"

  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = [] # no notification for INSUFFICIENT_DATA

  tags = var.default-tags

  lifecycle {
    # Tags are set at creation; later differences in tags are not planned.
    ignore_changes = [tags]
  }
}
|
||||
|
||||
# CloudWatch alarm on the AWS/ES "MasterCPUUtilization" metric of the domain
# named by var.domain-name. It goes into ALARM when the 300-second Average is
# greater than var.threshold-MasterCPUUtilization over 2 evaluation periods.
# Both ALARM and OK transitions notify the "urgent" SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-MasterCPUUtilization" {
  # --- identity -------------------------------------------------------------
  alarm_name        = "${var.cw-alarm-prefix}:ES:MasterCPUUtilization:${var.domain-name}"
  alarm_description = "MasterCPUUtilization"

  # --- metric selection -----------------------------------------------------
  namespace   = "AWS/ES"
  metric_name = "MasterCPUUtilization"
  dimensions = {
    # ClientId comes from the caller-identity data source declared elsewhere
    # in this module.
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # --- evaluation -----------------------------------------------------------
  statistic           = "Average"
  period              = "300"
  evaluation_periods  = "2"
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-MasterCPUUtilization

  # --- notifications --------------------------------------------------------
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = [] # no notification for INSUFFICIENT_DATA

  tags = var.default-tags

  lifecycle {
    # Tags are set at creation; later differences in tags are not planned.
    ignore_changes = [tags]
  }
}
|
||||
|
||||
# CloudWatch alarm on the AWS/ES "MasterJVMMemoryPressure" metric of the
# domain named by var.domain-name. It goes into ALARM when the 60-second
# Average is greater than var.threshold-MasterJVMMemoryPressure over 2
# evaluation periods. Both ALARM and OK transitions notify the "urgent" SNS
# target.
resource "aws_cloudwatch_metric_alarm" "ES-MasterJVMMemoryPressure" {
  # --- identity -------------------------------------------------------------
  alarm_name        = "${var.cw-alarm-prefix}:ES:MasterJVMMemoryPressure:${var.domain-name}"
  alarm_description = "MasterJVMMemoryPressure"

  # --- metric selection -----------------------------------------------------
  namespace   = "AWS/ES"
  metric_name = "MasterJVMMemoryPressure"
  dimensions = {
    # ClientId comes from the caller-identity data source declared elsewhere
    # in this module.
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # --- evaluation -----------------------------------------------------------
  statistic           = "Average"
  period              = "60"
  evaluation_periods  = "2"
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-MasterJVMMemoryPressure

  # --- notifications --------------------------------------------------------
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = [] # no notification for INSUFFICIENT_DATA

  tags = var.default-tags

  lifecycle {
    # Tags are set at creation; later differences in tags are not planned.
    ignore_changes = [tags]
  }
}
|
||||
|
||||
# CloudWatch alarm on the AWS/ES "JVMMemoryPressure" metric of the domain
# named by var.domain-name. It goes into ALARM when the 60-second Average is
# greater than var.threshold-JVMMemoryPressure over 2 evaluation periods.
# Both ALARM and OK transitions notify the "urgent" SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-JVMMemoryPressure" {
  # --- identity -------------------------------------------------------------
  alarm_name        = "${var.cw-alarm-prefix}:ES:JVMMemoryPressure:${var.domain-name}"
  alarm_description = "The cluster could encounter out of memory errors if usage increases. Consider scaling vertically. OpenSearch Service uses half of an instance's RAM for the Java heap, up to a heap size of 32 GiB. You can scale instances vertically up to 64 GiB of RAM, at which point you can scale horizontally by adding instances."

  # --- metric selection -----------------------------------------------------
  namespace   = "AWS/ES"
  metric_name = "JVMMemoryPressure"
  dimensions = {
    # ClientId comes from the caller-identity data source declared elsewhere
    # in this module.
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # --- evaluation -----------------------------------------------------------
  statistic           = "Average"
  period              = "60"
  evaluation_periods  = "2"
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-JVMMemoryPressure

  # --- notifications --------------------------------------------------------
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = [] # no notification for INSUFFICIENT_DATA

  tags = var.default-tags

  lifecycle {
    # Tags are set at creation; later differences in tags are not planned.
    ignore_changes = [tags]
  }
}
|
||||
|
||||
# CloudWatch alarm on the AWS/ES "ClusterIndexWritesBlocked" metric of the
# domain named by var.domain-name. It goes into ALARM when the 60-second
# Average is greater than var.threshold-ClusterIndexWritesBlocked over 2
# evaluation periods. Both ALARM and OK transitions notify the "urgent" SNS
# target.
resource "aws_cloudwatch_metric_alarm" "ES-ClusterIndexWritesBlocked" {
  # --- identity -------------------------------------------------------------
  alarm_name        = "${var.cw-alarm-prefix}:ES:ClusterIndexWritesBlocked:${var.domain-name}"
  alarm_description = "Your cluster is blocking write requests. See ClusterBlockException."

  # --- metric selection -----------------------------------------------------
  namespace   = "AWS/ES"
  metric_name = "ClusterIndexWritesBlocked"
  dimensions = {
    # ClientId comes from the caller-identity data source declared elsewhere
    # in this module.
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # --- evaluation -----------------------------------------------------------
  statistic           = "Average"
  period              = "60"
  evaluation_periods  = "2"
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-ClusterIndexWritesBlocked

  # --- notifications --------------------------------------------------------
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = [] # no notification for INSUFFICIENT_DATA

  tags = var.default-tags

  lifecycle {
    # Tags are set at creation; later differences in tags are not planned.
    ignore_changes = [tags]
  }
}
|
||||
|
||||
# CloudWatch alarm on the AWS/ES "FreeStorageSpace" metric of the domain named
# by var.domain-name. It goes into ALARM when the 300-second Minimum is less
# than var.threshold-FreeStorageSpace over 2 evaluation periods. Both ALARM
# and OK transitions notify the "urgent" SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-FreeStorageSpace" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:FreeStorageSpace:${var.domain-name}"
  alarm_description = "A node in your cluster is low on free storage space."

  namespace   = "AWS/ES"
  metric_name = "FreeStorageSpace"
  dimensions = {
    DomainName = var.domain-name
    # ClientId comes from the caller-identity data source declared elsewhere
    # in this module.
    ClientId = data.aws_caller_identity.this.id
  }

  # The description promises a per-node condition ("A node ... is low").
  # Average evaluates the mean of the reported values, so one nearly-full
  # node can be hidden by emptier ones. Minimum evaluates the smallest
  # reported value instead, which is the statistic AWS documents for this
  # alarm.
  statistic = "Minimum"

  comparison_operator = "LessThanThreshold"
  # NOTE(review): AWS documents this metric in MiB (e.g. 20480 = 20 GiB);
  # confirm that callers supply the threshold in that unit.
  threshold          = var.threshold-FreeStorageSpace
  period             = "300"
  evaluation_periods = "2"

  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = [] # no notification for INSUFFICIENT_DATA

  tags = var.default-tags

  lifecycle {
    # Tags are set at creation; later differences in tags are not planned.
    ignore_changes = [tags]
  }
}
|
||||
|
@ -1,10 +1,19 @@
|
||||
# General module inputs. Each variable is declared exactly once, with a quoted
# label; a second declaration of the same name is rejected by Terraform.
# No `type` constraints are added so existing callers are unaffected.

variable "cw-alarm-prefix" {
  description = "Prefix placed at the start of every CloudWatch alarm name, in the form <prefix>:ES:<metric>:<domain-name>."
}

variable "actions-enabled" {
  description = "Passed to actions_enabled on every alarm; controls whether alarm state changes trigger the configured actions."
}

variable "domain-name" {
  description = "Name of the domain to monitor; used as the DomainName metric dimension and as the suffix of each alarm name."
}

variable "sns-targets" {
  description = "Notification targets. Must provide the attributes alarm-actions-standard and alarm-actions-urgent, which are used as both alarm_actions and ok_actions."
}

variable "default-tags" {
  description = "Tags assigned to each alarm. The alarms ignore later changes to tags."
}
|
||||
|
||||
# Per-alarm threshold inputs. Each variable is declared exactly once, with a
# quoted label; a second declaration of the same name is rejected by
# Terraform. No `type` constraints are added so existing callers are
# unaffected.

# NOTE(review): the alarms that consume the next three thresholds are not part
# of this excerpt; descriptions are limited to what the names state.
variable "threshold-CPUUtilization" {
  description = "Alarm threshold for the CPUUtilization metric."
}

variable "threshold-SearchLatency" {
  description = "Alarm threshold for the SearchLatency metric."
}

variable "threshold-IndexingLatency" {
  description = "Alarm threshold for the IndexingLatency metric."
}

variable "threshold-ThreadpoolWriteQueue" {
  description = "The ThreadpoolWriteQueue alarm fires when the metric is greater than this value."
}

variable "threshold-ThreadpoolSearchQueue" {
  description = "The ThreadpoolSearchQueue alarm fires when the metric is greater than this value."
}

variable "threshold-ThreadpoolSearchRejected" {
  description = "The ThreadpoolSearchRejected alarm fires when the metric is greater than this value."
}

variable "threshold-ThreadpoolWriteRejected" {
  description = "The ThreadpoolWriteRejected alarm fires when the metric is greater than this value."
}

variable "threshold-MasterCPUUtilization" {
  description = "The MasterCPUUtilization alarm fires when the metric is greater than this value."
}

variable "threshold-MasterJVMMemoryPressure" {
  description = "The MasterJVMMemoryPressure alarm fires when the metric is greater than this value."
}

variable "threshold-JVMMemoryPressure" {
  description = "The JVMMemoryPressure alarm fires when the metric is greater than this value."
}

variable "threshold-ClusterIndexWritesBlocked" {
  description = "The ClusterIndexWritesBlocked alarm fires when the metric is greater than this value."
}

variable "threshold-FreeStorageSpace" {
  description = "The FreeStorageSpace alarm fires when the metric is less than this value."
}

# variable threshold-KibanaHealthyNodes {}
|
Loading…
Reference in New Issue
Block a user