UPD: added more OpenSearch alarms

2022-12-19 14:45:23 +08:00 · 2022-12-19 14:45:23 +08:00 · 7023e71fb5
commit 7023e71fb5
parent bfbca075aa
2 changed files with 232 additions and 8 deletions
--- a/modules/ManagementGovernance/Monitoring.OpenSearch/main.tf
+++ b/modules/ManagementGovernance/Monitoring.OpenSearch/main.tf
@ -96,3 +96,218 @@ resource "aws_cloudwatch_metric_alarm" "ES-ClusterStatusRed" {
  }
 }
 # Alarm on indexing back-pressure: write thread-pool queue depth of the domain.
 # Goes to ALARM when the 60-second Average of ThreadpoolWriteQueue is above
 # var.threshold-ThreadpoolWriteQueue for 2 consecutive periods. Both ALARM and
 # OK transitions are sent to the "standard" SNS target.
 resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolWriteQueue" {
  # --- identity ---
  alarm_name        = "${var.cw-alarm-prefix}:ES:ThreadpoolWriteQueue:${var.domain-name}"
  alarm_description = "The cluster is experiencing high indexing concurrency. Review and control indexing requests, or increase cluster resources."
  tags              = var.default-tags
  # --- metric being watched (ClientId comes from the caller-identity data source) ---
  namespace   = "AWS/ES"
  metric_name = "ThreadpoolWriteQueue"
  statistic   = "Average"
  period      = 60
  dimensions = {
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }
  # --- evaluation rule ---
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-ThreadpoolWriteQueue
  evaluation_periods  = 2
  # --- notifications; nothing is sent for INSUFFICIENT_DATA ---
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-standard]
  ok_actions                = [var.sns-targets.alarm-actions-standard]
  insufficient_data_actions = []
  # Later changes to the tags attribute are ignored by Terraform.
  lifecycle {
    ignore_changes = [tags]
  }
 }
 # Alarm on search back-pressure: search thread-pool queue depth of the domain.
 # Goes to ALARM when the 60-second Average of ThreadpoolSearchQueue is above
 # var.threshold-ThreadpoolSearchQueue for 2 consecutive periods. Both ALARM and
 # OK transitions are sent to the "standard" SNS target.
 resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolSearchQueue" {
  # --- identity ---
  alarm_name        = "${var.cw-alarm-prefix}:ES:ThreadpoolSearchQueue:${var.domain-name}"
  alarm_description = "The cluster is experiencing high search concurrency. Consider scaling your cluster. You can also increase the search queue size, but increasing it excessively can cause out of memory errors."
  tags              = var.default-tags
  # --- metric being watched (ClientId comes from the caller-identity data source) ---
  namespace   = "AWS/ES"
  metric_name = "ThreadpoolSearchQueue"
  statistic   = "Average"
  period      = 60
  dimensions = {
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }
  # --- evaluation rule ---
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-ThreadpoolSearchQueue
  evaluation_periods  = 2
  # --- notifications; nothing is sent for INSUFFICIENT_DATA ---
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-standard]
  ok_actions                = [var.sns-targets.alarm-actions-standard]
  insufficient_data_actions = []
  # Later changes to the tags attribute are ignored by Terraform.
  lifecycle {
    ignore_changes = [tags]
  }
 }
 # Alarm on rejected search requests for the domain.
 #
 # ThreadpoolSearchRejected is reported as a running total, so comparing the raw
 # value with a static threshold leaves the alarm latched in ALARM after the
 # first burst of rejections. Following AWS's recommended alarm for this metric,
 # the alarm evaluates DIFF() of the per-period Sum, i.e. the number of NEW
 # rejections in each 60-second period.
 #
 # Goes to ALARM when that delta is above var.threshold-ThreadpoolSearchRejected
 # for 2 consecutive periods. Both ALARM and OK transitions are sent to the
 # "urgent" SNS target.
 resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolSearchRejected" {
  alarm_name                = "${var.cw-alarm-prefix}:ES:ThreadpoolSearchRejected:${var.domain-name}"
  alarm_description         = "These alarms notify you of domain issues that might impact performance and stability."
  comparison_operator       = "GreaterThanThreshold"
  evaluation_periods        = 2
  threshold                 = var.threshold-ThreadpoolSearchRejected
  insufficient_data_actions = []
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  # Expression the alarm actually evaluates: change since the previous datapoint.
  metric_query {
    id          = "rejected_delta"
    expression  = "DIFF(rejected_total)"
    label       = "ThreadpoolSearchRejected (new per period)"
    return_data = true
  }
  # Raw cumulative counter feeding the expression above.
  metric_query {
    id = "rejected_total"
    metric {
      namespace   = "AWS/ES"
      metric_name = "ThreadpoolSearchRejected"
      period      = 60
      stat        = "Sum"
      dimensions = {
        DomainName = var.domain-name
        ClientId   = data.aws_caller_identity.this.id
      }
    }
  }
  tags = var.default-tags
  # Later changes to the tags attribute are ignored by Terraform.
  lifecycle {
    ignore_changes = [tags]
  }
 }
 # Alarm on rejected write (indexing) requests for the domain.
 #
 # ThreadpoolWriteRejected is reported as a running total, so comparing the raw
 # value with a static threshold leaves the alarm latched in ALARM after the
 # first burst of rejections. Following AWS's recommended alarm for this metric,
 # the alarm evaluates DIFF() of the per-period Sum, i.e. the number of NEW
 # rejections in each 60-second period.
 #
 # Goes to ALARM when that delta is above var.threshold-ThreadpoolWriteRejected
 # for 2 consecutive periods. Both ALARM and OK transitions are sent to the
 # "urgent" SNS target.
 resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolWriteRejected" {
  alarm_name                = "${var.cw-alarm-prefix}:ES:ThreadpoolWriteRejected:${var.domain-name}"
  alarm_description         = "These alarms notify you of domain issues that might impact performance and stability."
  comparison_operator       = "GreaterThanThreshold"
  evaluation_periods        = 2
  threshold                 = var.threshold-ThreadpoolWriteRejected
  insufficient_data_actions = []
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  # Expression the alarm actually evaluates: change since the previous datapoint.
  metric_query {
    id          = "rejected_delta"
    expression  = "DIFF(rejected_total)"
    label       = "ThreadpoolWriteRejected (new per period)"
    return_data = true
  }
  # Raw cumulative counter feeding the expression above.
  metric_query {
    id = "rejected_total"
    metric {
      namespace   = "AWS/ES"
      metric_name = "ThreadpoolWriteRejected"
      period      = 60
      stat        = "Sum"
      dimensions = {
        DomainName = var.domain-name
        ClientId   = data.aws_caller_identity.this.id
      }
    }
  }
  tags = var.default-tags
  # Later changes to the tags attribute are ignored by Terraform.
  lifecycle {
    ignore_changes = [tags]
  }
 }
 # Alarm on CPU usage reported under the MasterCPUUtilization metric.
 # Goes to ALARM when the 300-second Average is above
 # var.threshold-MasterCPUUtilization for 2 consecutive periods. Both ALARM and
 # OK transitions are sent to the "urgent" SNS target.
 # NOTE(review): AWS's recommended alarm for this metric uses the Maximum
 # statistic - confirm Average is intended.
 resource "aws_cloudwatch_metric_alarm" "ES-MasterCPUUtilization" {
  # --- identity ---
  alarm_name        = "${var.cw-alarm-prefix}:ES:MasterCPUUtilization:${var.domain-name}"
  alarm_description = "MasterCPUUtilization"
  tags              = var.default-tags
  # --- metric being watched (ClientId comes from the caller-identity data source) ---
  namespace   = "AWS/ES"
  metric_name = "MasterCPUUtilization"
  statistic   = "Average"
  period      = 300
  dimensions = {
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }
  # --- evaluation rule ---
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-MasterCPUUtilization
  evaluation_periods  = 2
  # --- notifications; nothing is sent for INSUFFICIENT_DATA ---
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []
  # Later changes to the tags attribute are ignored by Terraform.
  lifecycle {
    ignore_changes = [tags]
  }
 }
 # Alarm on JVM heap pressure reported under the MasterJVMMemoryPressure metric.
 # Goes to ALARM when the 60-second Average is above
 # var.threshold-MasterJVMMemoryPressure for 2 consecutive periods. Both ALARM
 # and OK transitions are sent to the "urgent" SNS target.
 # NOTE(review): AWS's recommended alarm for this metric uses the Maximum
 # statistic - confirm Average is intended.
 resource "aws_cloudwatch_metric_alarm" "ES-MasterJVMMemoryPressure" {
  # --- identity ---
  alarm_name        = "${var.cw-alarm-prefix}:ES:MasterJVMMemoryPressure:${var.domain-name}"
  alarm_description = "MasterJVMMemoryPressure"
  tags              = var.default-tags
  # --- metric being watched (ClientId comes from the caller-identity data source) ---
  namespace   = "AWS/ES"
  metric_name = "MasterJVMMemoryPressure"
  statistic   = "Average"
  period      = 60
  dimensions = {
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }
  # --- evaluation rule ---
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-MasterJVMMemoryPressure
  evaluation_periods  = 2
  # --- notifications; nothing is sent for INSUFFICIENT_DATA ---
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []
  # Later changes to the tags attribute are ignored by Terraform.
  lifecycle {
    ignore_changes = [tags]
  }
 }
 # Alarm on JVM heap pressure reported under the JVMMemoryPressure metric.
 # Goes to ALARM when the 60-second Average is above
 # var.threshold-JVMMemoryPressure for 2 consecutive periods. Both ALARM and OK
 # transitions are sent to the "urgent" SNS target.
 # NOTE(review): AWS's recommended alarm for this metric uses the Maximum
 # statistic - confirm Average is intended.
 resource "aws_cloudwatch_metric_alarm" "ES-JVMMemoryPressure" {
  # --- identity ---
  alarm_name        = "${var.cw-alarm-prefix}:ES:JVMMemoryPressure:${var.domain-name}"
  alarm_description = "The cluster could encounter out of memory errors if usage increases. Consider scaling vertically. OpenSearch Service uses half of an instance's RAM for the Java heap, up to a heap size of 32 GiB. You can scale instances vertically up to 64 GiB of RAM, at which point you can scale horizontally by adding instances."
  tags              = var.default-tags
  # --- metric being watched (ClientId comes from the caller-identity data source) ---
  namespace   = "AWS/ES"
  metric_name = "JVMMemoryPressure"
  statistic   = "Average"
  period      = 60
  dimensions = {
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }
  # --- evaluation rule ---
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-JVMMemoryPressure
  evaluation_periods  = 2
  # --- notifications; nothing is sent for INSUFFICIENT_DATA ---
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []
  # Later changes to the tags attribute are ignored by Terraform.
  lifecycle {
    ignore_changes = [tags]
  }
 }
 # Alarm on the domain blocking incoming write requests.
 # Goes to ALARM when the 60-second Average of ClusterIndexWritesBlocked is above
 # var.threshold-ClusterIndexWritesBlocked for 2 consecutive periods. Both ALARM
 # and OK transitions are sent to the "urgent" SNS target.
 resource "aws_cloudwatch_metric_alarm" "ES-ClusterIndexWritesBlocked" {
  # --- identity ---
  alarm_name        = "${var.cw-alarm-prefix}:ES:ClusterIndexWritesBlocked:${var.domain-name}"
  alarm_description = "Your cluster is blocking write requests. See ClusterBlockException."
  tags              = var.default-tags
  # --- metric being watched (ClientId comes from the caller-identity data source) ---
  namespace   = "AWS/ES"
  metric_name = "ClusterIndexWritesBlocked"
  statistic   = "Average"
  period      = 60
  dimensions = {
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }
  # --- evaluation rule ---
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-ClusterIndexWritesBlocked
  evaluation_periods  = 2
  # --- notifications; nothing is sent for INSUFFICIENT_DATA ---
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []
  # Later changes to the tags attribute are ignored by Terraform.
  lifecycle {
    ignore_changes = [tags]
  }
 }
 # Alarm on a data node running low on free storage.
 #
 # The Minimum statistic is used on purpose: at the DomainName/ClientId level it
 # tracks the node with the LEAST free space, which is what the description
 # ("A node in your cluster is low...") promises. An Average across nodes can
 # stay healthy while a single node fills up and starts blocking writes.
 #
 # Goes to ALARM when the 300-second Minimum of FreeStorageSpace is below
 # var.threshold-FreeStorageSpace for 2 consecutive periods. Both ALARM and OK
 # transitions are sent to the "urgent" SNS target.
 # NOTE(review): CloudWatch is understood to report this metric in MiB - confirm
 # the threshold passed by callers uses the same unit.
 resource "aws_cloudwatch_metric_alarm" "ES-FreeStorageSpace" {
  alarm_name                = "${var.cw-alarm-prefix}:ES:FreeStorageSpace:${var.domain-name}"
  comparison_operator       = "LessThanThreshold"
  evaluation_periods        = 2
  metric_name               = "FreeStorageSpace"
  period                    = 300
  statistic                 = "Minimum"
  threshold                 = var.threshold-FreeStorageSpace
  alarm_description         = "A node in your cluster is low on free storage space."
  namespace                 = "AWS/ES"
  insufficient_data_actions = []
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  dimensions = {
    DomainName = var.domain-name
    ClientId   = data.aws_caller_identity.this.id
  }
  tags = var.default-tags
  # Later changes to the tags attribute are ignored by Terraform.
  lifecycle {
    ignore_changes = [tags]
  }
 }
--- a/modules/ManagementGovernance/Monitoring.OpenSearch/variables.tf
+++ b/modules/ManagementGovernance/Monitoring.OpenSearch/variables.tf
@ -1,10 +1,19 @@
# Module inputs shared by the alarms; this hunk only switches the declarations
# from the unquoted to the quoted variable-name form.
#   cw-alarm-prefix - leading segment of every alarm_name
#   actions-enabled - passed to actions_enabled on each alarm
#   domain-name     - DomainName dimension and trailing segment of alarm_name
#   sns-targets     - object read as .alarm-actions-standard / .alarm-actions-urgent
#   default-tags    - tags applied to each alarm
#   threshold-CPUUtilization / SearchLatency / IndexingLatency - presumably the
#     thresholds of alarms defined earlier in main.tf (not visible in this diff)
-variable cw-alarm-prefix {}
+variable "cw-alarm-prefix" {}
-variable actions-enabled {}
+variable "actions-enabled" {}
-variable domain-name {}
+variable "domain-name" {}
-variable sns-targets {}
+variable "sns-targets" {}
-variable default-tags {}
+variable "default-tags" {}
-variable threshold-CPUUtilization {}
+variable "threshold-CPUUtilization" {}
-variable threshold-SearchLatency {}
+variable "threshold-SearchLatency" {}
-variable threshold-IndexingLatency {}
+variable "threshold-IndexingLatency" {}
 # Thresholds for the OpenSearch alarms in main.tf. All are required (no
 # default) and typed as numbers so a malformed value is rejected at plan time
 # instead of by the CloudWatch API.
 variable "threshold-ThreadpoolWriteQueue" {
  description = "Threshold for the ES-ThreadpoolWriteQueue alarm; the alarm fires when the evaluated value is greater than this."
  type        = number
 }
 variable "threshold-ThreadpoolSearchQueue" {
  description = "Threshold for the ES-ThreadpoolSearchQueue alarm; the alarm fires when the evaluated value is greater than this."
  type        = number
 }
 variable "threshold-ThreadpoolSearchRejected" {
  description = "Threshold for the ES-ThreadpoolSearchRejected alarm; the alarm fires when the evaluated value is greater than this."
  type        = number
 }
 variable "threshold-ThreadpoolWriteRejected" {
  description = "Threshold for the ES-ThreadpoolWriteRejected alarm; the alarm fires when the evaluated value is greater than this."
  type        = number
 }
 variable "threshold-MasterCPUUtilization" {
  description = "Threshold for the ES-MasterCPUUtilization alarm; the alarm fires when the evaluated value is greater than this."
  type        = number
 }
 variable "threshold-MasterJVMMemoryPressure" {
  description = "Threshold for the ES-MasterJVMMemoryPressure alarm; the alarm fires when the evaluated value is greater than this."
  type        = number
 }
 variable "threshold-JVMMemoryPressure" {
  description = "Threshold for the ES-JVMMemoryPressure alarm; the alarm fires when the evaluated value is greater than this."
  type        = number
 }
 variable "threshold-ClusterIndexWritesBlocked" {
  description = "Threshold for the ES-ClusterIndexWritesBlocked alarm; the alarm fires when the evaluated value is greater than this."
  type        = number
 }
 variable "threshold-FreeStorageSpace" {
  description = "Threshold for the ES-FreeStorageSpace alarm; the alarm fires when free storage is less than this."
  type        = number
 }
 # variable threshold-KibanaHealthyNodes {}