diff --git a/modules/ManagementGovernance/Monitoring.OpenSearch/main.tf b/modules/ManagementGovernance/Monitoring.OpenSearch/main.tf
index 19e97d0..6e70eb6 100644
--- a/modules/ManagementGovernance/Monitoring.OpenSearch/main.tf
+++ b/modules/ManagementGovernance/Monitoring.OpenSearch/main.tf
@@ -96,3 +96,230 @@ resource "aws_cloudwatch_metric_alarm" "ES-ClusterStatusRed" {
   }
 }
 
+# Notifies the standard SNS target when the average ThreadpoolWriteQueue exceeds its threshold for two consecutive 60-second periods.
+resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolWriteQueue" {
+  alarm_name                = "${var.cw-alarm-prefix}:ES:ThreadpoolWriteQueue:${var.domain-name}"
+  comparison_operator       = "GreaterThanThreshold"
+  evaluation_periods        = "2"
+  metric_name               = "ThreadpoolWriteQueue"
+  period                    = "60"
+  statistic                 = "Average"
+  threshold                 = var.threshold-ThreadpoolWriteQueue
+  alarm_description         = "The cluster is experiencing high indexing concurrency. Review and control indexing requests, or increase cluster resources."
+  namespace                 = "AWS/ES"
+  insufficient_data_actions = []
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.sns-targets.alarm-actions-standard]
+  ok_actions                = [var.sns-targets.alarm-actions-standard]
+  dimensions = {
+    DomainName = var.domain-name
+    ClientId   = data.aws_caller_identity.this.id
+  }
+  tags = var.default-tags
+  lifecycle {
+    ignore_changes = [tags]
+  }
+}
+
+# Notifies the standard SNS target when the average ThreadpoolSearchQueue exceeds its threshold for two consecutive 60-second periods.
+resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolSearchQueue" {
+  alarm_name                = "${var.cw-alarm-prefix}:ES:ThreadpoolSearchQueue:${var.domain-name}"
+  comparison_operator       = "GreaterThanThreshold"
+  evaluation_periods        = "2"
+  metric_name               = "ThreadpoolSearchQueue"
+  period                    = "60"
+  statistic                 = "Average"
+  threshold                 = var.threshold-ThreadpoolSearchQueue
+  alarm_description         = "The cluster is experiencing high search concurrency. Consider scaling your cluster. You can also increase the search queue size, but increasing it excessively can cause out of memory errors."
+  namespace                 = "AWS/ES"
+  insufficient_data_actions = []
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.sns-targets.alarm-actions-standard]
+  ok_actions                = [var.sns-targets.alarm-actions-standard]
+  dimensions = {
+    DomainName = var.domain-name
+    ClientId   = data.aws_caller_identity.this.id
+  }
+  tags = var.default-tags
+  lifecycle {
+    ignore_changes = [tags]
+  }
+}
+
+# Notifies the urgent SNS target when the average ThreadpoolSearchRejected exceeds its threshold for two consecutive 60-second periods.
+# NOTE(review): AWS's recommended alarm for this metric uses SUM with a DIFF() math expression; confirm Average suits the chosen threshold.
+resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolSearchRejected" {
+  alarm_name                = "${var.cw-alarm-prefix}:ES:ThreadpoolSearchRejected:${var.domain-name}"
+  comparison_operator       = "GreaterThanThreshold"
+  evaluation_periods        = "2"
+  metric_name               = "ThreadpoolSearchRejected"
+  period                    = "60"
+  statistic                 = "Average"
+  threshold                 = var.threshold-ThreadpoolSearchRejected
+  alarm_description         = "These alarms notify you of domain issues that might impact performance and stability."
+  namespace                 = "AWS/ES"
+  insufficient_data_actions = []
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
+  ok_actions                = [var.sns-targets.alarm-actions-urgent]
+  dimensions = {
+    DomainName = var.domain-name
+    ClientId   = data.aws_caller_identity.this.id
+  }
+  tags = var.default-tags
+  lifecycle {
+    ignore_changes = [tags]
+  }
+}
+
+# Notifies the urgent SNS target when the average ThreadpoolWriteRejected exceeds its threshold for two consecutive 60-second periods.
+# NOTE(review): AWS's recommended alarm for this metric uses SUM with a DIFF() math expression; confirm Average suits the chosen threshold.
+resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolWriteRejected" {
+  alarm_name                = "${var.cw-alarm-prefix}:ES:ThreadpoolWriteRejected:${var.domain-name}"
+  comparison_operator       = "GreaterThanThreshold"
+  evaluation_periods        = "2"
+  metric_name               = "ThreadpoolWriteRejected"
+  period                    = "60"
+  statistic                 = "Average"
+  threshold                 = var.threshold-ThreadpoolWriteRejected
+  alarm_description         = "These alarms notify you of domain issues that might impact performance and stability."
+  namespace                 = "AWS/ES"
+  insufficient_data_actions = []
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
+  ok_actions                = [var.sns-targets.alarm-actions-urgent]
+  dimensions = {
+    DomainName = var.domain-name
+    ClientId   = data.aws_caller_identity.this.id
+  }
+  tags = var.default-tags
+  lifecycle {
+    ignore_changes = [tags]
+  }
+}
+
+# Notifies the urgent SNS target when the average MasterCPUUtilization exceeds its threshold for two consecutive 300-second periods.
+resource "aws_cloudwatch_metric_alarm" "ES-MasterCPUUtilization" {
+  alarm_name                = "${var.cw-alarm-prefix}:ES:MasterCPUUtilization:${var.domain-name}"
+  comparison_operator       = "GreaterThanThreshold"
+  evaluation_periods        = "2"
+  metric_name               = "MasterCPUUtilization"
+  period                    = "300"
+  statistic                 = "Average"
+  threshold                 = var.threshold-MasterCPUUtilization
+  alarm_description         = "MasterCPUUtilization"
+  namespace                 = "AWS/ES"
+  insufficient_data_actions = []
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
+  ok_actions                = [var.sns-targets.alarm-actions-urgent]
+  dimensions = {
+    DomainName = var.domain-name
+    ClientId   = data.aws_caller_identity.this.id
+  }
+  tags = var.default-tags
+  lifecycle {
+    ignore_changes = [tags]
+  }
+}
+
+# Notifies the urgent SNS target when the average MasterJVMMemoryPressure exceeds its threshold for two consecutive 60-second periods.
+resource "aws_cloudwatch_metric_alarm" "ES-MasterJVMMemoryPressure" {
+  alarm_name                = "${var.cw-alarm-prefix}:ES:MasterJVMMemoryPressure:${var.domain-name}"
+  comparison_operator       = "GreaterThanThreshold"
+  evaluation_periods        = "2"
+  metric_name               = "MasterJVMMemoryPressure"
+  period                    = "60"
+  statistic                 = "Average"
+  threshold                 = var.threshold-MasterJVMMemoryPressure
+  alarm_description         = "MasterJVMMemoryPressure"
+  namespace                 = "AWS/ES"
+  insufficient_data_actions = []
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
+  ok_actions                = [var.sns-targets.alarm-actions-urgent]
+  dimensions = {
+    DomainName = var.domain-name
+    ClientId   = data.aws_caller_identity.this.id
+  }
+  tags = var.default-tags
+  lifecycle {
+    ignore_changes = [tags]
+  }
+}
+
+# Notifies the urgent SNS target when the average JVMMemoryPressure exceeds its threshold for two consecutive 60-second periods.
+resource "aws_cloudwatch_metric_alarm" "ES-JVMMemoryPressure" {
+  alarm_name                = "${var.cw-alarm-prefix}:ES:JVMMemoryPressure:${var.domain-name}"
+  comparison_operator       = "GreaterThanThreshold"
+  evaluation_periods        = "2"
+  metric_name               = "JVMMemoryPressure"
+  period                    = "60"
+  statistic                 = "Average"
+  threshold                 = var.threshold-JVMMemoryPressure
+  alarm_description         = "The cluster could encounter out of memory errors if usage increases. Consider scaling vertically. OpenSearch Service uses half of an instance's RAM for the Java heap, up to a heap size of 32 GiB. You can scale instances vertically up to 64 GiB of RAM, at which point you can scale horizontally by adding instances."
+  namespace                 = "AWS/ES"
+  insufficient_data_actions = []
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
+  ok_actions                = [var.sns-targets.alarm-actions-urgent]
+  dimensions = {
+    DomainName = var.domain-name
+    ClientId   = data.aws_caller_identity.this.id
+  }
+  tags = var.default-tags
+  lifecycle {
+    ignore_changes = [tags]
+  }
+}
+
+# Notifies the urgent SNS target when the average ClusterIndexWritesBlocked exceeds its threshold for two consecutive 60-second periods.
+resource "aws_cloudwatch_metric_alarm" "ES-ClusterIndexWritesBlocked" {
+  alarm_name                = "${var.cw-alarm-prefix}:ES:ClusterIndexWritesBlocked:${var.domain-name}"
+  comparison_operator       = "GreaterThanThreshold"
+  evaluation_periods        = "2"
+  metric_name               = "ClusterIndexWritesBlocked"
+  period                    = "60"
+  statistic                 = "Average"
+  threshold                 = var.threshold-ClusterIndexWritesBlocked
+  alarm_description         = "Your cluster is blocking write requests. See ClusterBlockException."
+  namespace                 = "AWS/ES"
+  insufficient_data_actions = []
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
+  ok_actions                = [var.sns-targets.alarm-actions-urgent]
+  dimensions = {
+    DomainName = var.domain-name
+    ClientId   = data.aws_caller_identity.this.id
+  }
+  tags = var.default-tags
+  lifecycle {
+    ignore_changes = [tags]
+  }
+}
+
+# Notifies the urgent SNS target when FreeStorageSpace stays below its threshold for two consecutive 300-second periods.
+# Uses Minimum (the node with the least free space) because Average could hide a single node that is nearly full.
+resource "aws_cloudwatch_metric_alarm" "ES-FreeStorageSpace" {
+  alarm_name                = "${var.cw-alarm-prefix}:ES:FreeStorageSpace:${var.domain-name}"
+  comparison_operator       = "LessThanThreshold"
+  evaluation_periods        = "2"
+  metric_name               = "FreeStorageSpace"
+  period                    = "300"
+  statistic                 = "Minimum"
+  threshold                 = var.threshold-FreeStorageSpace
+  alarm_description         = "A node in your cluster is low on free storage space."
+  namespace                 = "AWS/ES"
+  insufficient_data_actions = []
+  actions_enabled           = var.actions-enabled
+  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
+  ok_actions                = [var.sns-targets.alarm-actions-urgent]
+  dimensions = {
+    DomainName = var.domain-name
+    ClientId   = data.aws_caller_identity.this.id
+  }
+  tags = var.default-tags
+  lifecycle {
+    ignore_changes = [tags]
+  }
+}
diff --git a/modules/ManagementGovernance/Monitoring.OpenSearch/variables.tf b/modules/ManagementGovernance/Monitoring.OpenSearch/variables.tf
index 28d9fdb..923cd2e 100644
--- a/modules/ManagementGovernance/Monitoring.OpenSearch/variables.tf
+++ b/modules/ManagementGovernance/Monitoring.OpenSearch/variables.tf
@@ -1,10 +1,20 @@
-variable cw-alarm-prefix {}
-variable actions-enabled {}
-variable domain-name {}
-variable sns-targets {}
-variable default-tags {}
+variable "cw-alarm-prefix" {}
+variable "actions-enabled" {}
+variable "domain-name" {}
+variable "sns-targets" {}
+variable "default-tags" {}
 
-variable threshold-CPUUtilization {}
-variable threshold-SearchLatency {}
-variable threshold-IndexingLatency {}
+variable "threshold-CPUUtilization" {}
+variable "threshold-SearchLatency" {}
+variable "threshold-IndexingLatency" {}
+# Thresholds consumed by the alarms on the same-named AWS/ES metrics in main.tf.
+variable "threshold-ThreadpoolWriteQueue" {}
+variable "threshold-ThreadpoolSearchQueue" {}
+variable "threshold-ThreadpoolSearchRejected" {}
+variable "threshold-ThreadpoolWriteRejected" {}
+variable "threshold-MasterCPUUtilization" {}
+variable "threshold-MasterJVMMemoryPressure" {}
+variable "threshold-JVMMemoryPressure" {}
+variable "threshold-ClusterIndexWritesBlocked" {}
+variable "threshold-FreeStorageSpace" {}
 # variable threshold-KibanaHealthyNodes {}
\ No newline at end of file