# terraform.aws-baseline-infra/modules/ManagementGovernance/Monitoring.OpenSearch/main.tf
#
# 314 lines
# 13 KiB
# Terraform
# Raw Normal View History

# Looks up the identity of the credentials Terraform runs with.
# The alarms in this file read its `id` attribute to fill the `ClientId`
# metric dimension.
data "aws_caller_identity" "this" {
}
# Alarm on the AWS/ES CPUUtilization metric of the domain.
# Fires when the Average exceeds var.threshold-CPUUtilization in 3 consecutive
# 1800-second periods; ALARM and OK transitions notify the urgent SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-CPUUtilization" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:CPUUtilization:${var.domain-name}"
  alarm_description = "ES:CPUUtilization"

  # Metric selection
  namespace   = "AWS/ES"
  metric_name = "CPUUtilization"
  dimensions = {
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # Evaluation rule
  statistic           = "Average"
  period              = 1800
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-CPUUtilization

  # Notification routing (nothing is sent for INSUFFICIENT_DATA)
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# Alarm on the AWS/ES SearchLatency metric of the domain.
# Fires when the Average exceeds var.threshold-SearchLatency in 3 consecutive
# 1800-second periods; ALARM and OK transitions notify the urgent SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-SearchLatency" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:SearchLatency:${var.domain-name}"
  alarm_description = "ES:SearchLatency"

  # Metric selection
  namespace   = "AWS/ES"
  metric_name = "SearchLatency"
  dimensions = {
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # Evaluation rule
  statistic           = "Average"
  period              = 1800
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-SearchLatency

  # Notification routing (nothing is sent for INSUFFICIENT_DATA)
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# Alarm on the AWS/ES IndexingLatency metric of the domain.
# Fires when the Average exceeds var.threshold-IndexingLatency in 3 consecutive
# 1800-second periods; ALARM and OK transitions notify the urgent SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-IndexingLatency" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:IndexingLatency:${var.domain-name}"
  alarm_description = "ES:IndexingLatency"

  # Metric selection
  namespace   = "AWS/ES"
  metric_name = "IndexingLatency"
  dimensions = {
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # Evaluation rule
  statistic           = "Average"
  period              = 1800
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-IndexingLatency

  # Notification routing (nothing is sent for INSUFFICIENT_DATA)
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# Alarm on the AWS/ES ClusterStatus.red metric of the domain.
# Fires when the Maximum is greater than 0 in 2 consecutive 900-second
# periods; ALARM and OK transitions notify the urgent SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-ClusterStatusRed" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:ClusterStatusRed:${var.domain-name}"
  alarm_description = "At least one primary shard and its replicas aren't allocated to a node."

  # Metric selection
  namespace   = "AWS/ES"
  metric_name = "ClusterStatus.red"
  dimensions = {
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # Evaluation rule: the threshold is a fixed 0, not a module variable.
  statistic           = "Maximum"
  period              = 900
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = 0

  # Notification routing (nothing is sent for INSUFFICIENT_DATA)
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# 2022-12-19 14:45:23 +08:00
# Alarm on the AWS/ES ThreadpoolWriteQueue metric of the domain.
# Fires when the Average exceeds var.threshold-ThreadpoolWriteQueue in 2
# consecutive 60-second periods. Unlike most alarms in this file, ALARM and OK
# transitions go to the standard (not urgent) SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolWriteQueue" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:ThreadpoolWriteQueue:${var.domain-name}"
  alarm_description = "The cluster is experiencing high indexing concurrency. Review and control indexing requests, or increase cluster resources."

  # Metric selection
  namespace   = "AWS/ES"
  metric_name = "ThreadpoolWriteQueue"
  dimensions = {
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # Evaluation rule
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-ThreadpoolWriteQueue

  # Notification routing (nothing is sent for INSUFFICIENT_DATA)
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-standard]
  ok_actions                = [var.sns-targets.alarm-actions-standard]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# Alarm on the AWS/ES ThreadpoolSearchQueue metric of the domain.
# Fires when the Average exceeds var.threshold-ThreadpoolSearchQueue in 2
# consecutive 60-second periods. Unlike most alarms in this file, ALARM and OK
# transitions go to the standard (not urgent) SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolSearchQueue" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:ThreadpoolSearchQueue:${var.domain-name}"
  alarm_description = "The cluster is experiencing high search concurrency. Consider scaling your cluster. You can also increase the search queue size, but increasing it excessively can cause out of memory errors."

  # Metric selection
  namespace   = "AWS/ES"
  metric_name = "ThreadpoolSearchQueue"
  dimensions = {
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # Evaluation rule
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-ThreadpoolSearchQueue

  # Notification routing (nothing is sent for INSUFFICIENT_DATA)
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-standard]
  ok_actions                = [var.sns-targets.alarm-actions-standard]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# Alarm on the AWS/ES ThreadpoolSearchRejected metric of the domain.
# Fires when the Average exceeds var.threshold-ThreadpoolSearchRejected in 2
# consecutive 60-second periods; ALARM and OK transitions notify the urgent
# SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolSearchRejected" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:ThreadpoolSearchRejected:${var.domain-name}"
  alarm_description = "These alarms notify you of domain issues that might impact performance and stability."

  # Metric selection
  namespace   = "AWS/ES"
  metric_name = "ThreadpoolSearchRejected"
  dimensions = {
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # Evaluation rule
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-ThreadpoolSearchRejected

  # Notification routing (nothing is sent for INSUFFICIENT_DATA)
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# Alarm on the AWS/ES ThreadpoolWriteRejected metric of the domain.
# Fires when the Average exceeds var.threshold-ThreadpoolWriteRejected in 2
# consecutive 60-second periods; ALARM and OK transitions notify the urgent
# SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-ThreadpoolWriteRejected" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:ThreadpoolWriteRejected:${var.domain-name}"
  alarm_description = "These alarms notify you of domain issues that might impact performance and stability."

  # Metric selection
  namespace   = "AWS/ES"
  metric_name = "ThreadpoolWriteRejected"
  dimensions = {
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # Evaluation rule
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-ThreadpoolWriteRejected

  # Notification routing (nothing is sent for INSUFFICIENT_DATA)
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# Alarm on the AWS/ES MasterCPUUtilization metric of the domain.
# Fires when the Average exceeds var.threshold-MasterCPUUtilization in 2
# consecutive 300-second periods; ALARM and OK transitions notify the urgent
# SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-MasterCPUUtilization" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:MasterCPUUtilization:${var.domain-name}"
  alarm_description = "MasterCPUUtilization"

  # Metric selection
  namespace   = "AWS/ES"
  metric_name = "MasterCPUUtilization"
  dimensions = {
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # Evaluation rule
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-MasterCPUUtilization

  # Notification routing (nothing is sent for INSUFFICIENT_DATA)
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# Alarm on the AWS/ES MasterJVMMemoryPressure metric of the domain.
# Fires when the Average exceeds var.threshold-MasterJVMMemoryPressure in 2
# consecutive 60-second periods; ALARM and OK transitions notify the urgent
# SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-MasterJVMMemoryPressure" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:MasterJVMMemoryPressure:${var.domain-name}"
  alarm_description = "MasterJVMMemoryPressure"

  # Metric selection
  namespace   = "AWS/ES"
  metric_name = "MasterJVMMemoryPressure"
  dimensions = {
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # Evaluation rule
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-MasterJVMMemoryPressure

  # Notification routing (nothing is sent for INSUFFICIENT_DATA)
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# Alarm on the AWS/ES JVMMemoryPressure metric of the domain.
# Fires when the Average exceeds var.threshold-JVMMemoryPressure in 2
# consecutive 60-second periods; ALARM and OK transitions notify the urgent
# SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-JVMMemoryPressure" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:JVMMemoryPressure:${var.domain-name}"
  alarm_description = "The cluster could encounter out of memory errors if usage increases. Consider scaling vertically. OpenSearch Service uses half of an instance's RAM for the Java heap, up to a heap size of 32 GiB. You can scale instances vertically up to 64 GiB of RAM, at which point you can scale horizontally by adding instances."

  # Metric selection
  namespace   = "AWS/ES"
  metric_name = "JVMMemoryPressure"
  dimensions = {
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # Evaluation rule
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-JVMMemoryPressure

  # Notification routing (nothing is sent for INSUFFICIENT_DATA)
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# Alarm on the AWS/ES ClusterIndexWritesBlocked metric of the domain.
# Fires when the Average exceeds var.threshold-ClusterIndexWritesBlocked in 2
# consecutive 60-second periods; ALARM and OK transitions notify the urgent
# SNS target.
resource "aws_cloudwatch_metric_alarm" "ES-ClusterIndexWritesBlocked" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:ClusterIndexWritesBlocked:${var.domain-name}"
  alarm_description = "Your cluster is blocking write requests. See ClusterBlockException."

  # Metric selection
  namespace   = "AWS/ES"
  metric_name = "ClusterIndexWritesBlocked"
  dimensions = {
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # Evaluation rule
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-ClusterIndexWritesBlocked

  # Notification routing (nothing is sent for INSUFFICIENT_DATA)
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}
# Alarm on the AWS/ES FreeStorageSpace metric of the domain.
# Fires when the Minimum drops below var.threshold-FreeStorageSpace in 2
# consecutive 300-second periods; ALARM and OK transitions notify the urgent
# SNS target. This is the only alarm in this file that uses LessThanThreshold.
resource "aws_cloudwatch_metric_alarm" "ES-FreeStorageSpace" {
  alarm_name        = "${var.cw-alarm-prefix}:ES:FreeStorageSpace:${var.domain-name}"
  alarm_description = "A node in your cluster is low on free storage space."

  # Metric selection
  namespace   = "AWS/ES"
  metric_name = "FreeStorageSpace"
  dimensions = {
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # Evaluation rule.
  # "Minimum" (previously "Average"): per the AWS OpenSearch Service metric
  # documentation, Minimum reflects the data node with the least free space.
  # An average across nodes can stay above the threshold while one node is
  # nearly full, which contradicts the alarm description above.
  # NOTE(review): AWS documents this metric in MiB; confirm that
  # var.threshold-FreeStorageSpace is expressed in the same unit.
  statistic           = "Minimum"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "LessThanThreshold"
  threshold           = var.threshold-FreeStorageSpace

  # Notification routing (nothing is sent for INSUFFICIENT_DATA)
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    # Later changes to tags are ignored by Terraform.
    ignore_changes = [tags]
  }
}