NEW: EKS monitoring

This commit is contained in:
xpk 2022-12-30 00:01:39 +08:00
parent 9a3ec387a9
commit bcdbb23221
Signed by: xpk
GPG Key ID: CD4FF6793F09AB86
4 changed files with 130 additions and 0 deletions

View File

@ -0,0 +1,27 @@
# Monitoring module
This module deploys the default CloudWatch metric alarms for EKS pods.
## Notes
The Terraform `lifecycle` block ignores tag changes to speed up subsequent Terraform updates. CloudWatch alarm tags cannot be viewed in the AWS console anyway.
Unlike other monitoring modules, which discover resource details automatically, this module needs the EKS pod names to be supplied explicitly.
The AWS CLI does not provide pod information.
## Example
```terraform
// Look up every EKS cluster in the current account/region.
data "aws_eks_clusters" "eks-clusters" {}

// One module instance per discovered cluster.
module "eks-monitoring" {
  source   = "../../modules/ManagementGovernance/Monitoring.EKS"
  for_each = data.aws_eks_clusters.eks-clusters.names

  // What to monitor
  cluster-name  = each.value
  eks-namespace = "default"
  pod-names     = ["depl-nginx", "depl-alpine"]

  // Alarm thresholds
  threshold-pod_cpu_utilization              = 85
  threshold-pod_memory_utilization           = 85
  threshold-pod_number_of_container_restarts = 5

  // Naming, notification and tagging
  cw-alarm-prefix = local.cw-alarm-prefix
  actions-enabled = var.actions-enabled
  sns-targets     = local.sns-targets
  default-tags    = local.default-tags
}
```

View File

@ -0,0 +1,81 @@
// The following checks require CloudWatch Container Insights.
// CPU utilization alarm: one alarm is created per entry in var.pod-names.
resource "aws_cloudwatch_metric_alarm" "eks-pod_cpu_utilization" {
  for_each = toset(var.pod-names)

  // Identity
  alarm_name        = format("%s:EKS:pod_cpu_utilization:%s", var.cw-alarm-prefix, each.value)
  alarm_description = "EKS:pod_cpu_utilization"

  // Metric selection
  namespace   = "ContainerInsights"
  metric_name = "pod_cpu_utilization"
  dimensions = {
    ClusterName = var.cluster-name
    Namespace   = var.eks-namespace
    PodName     = each.value
  }

  // Evaluation: the 300-second average must exceed the threshold
  // across 3 evaluation periods.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-pod_cpu_utilization

  // Notifications: the same target receives both ALARM and OK transitions.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    // Tag drift is deliberately ignored so later applies skip tag updates.
    ignore_changes = [tags]
  }
}
// Memory utilization alarm: one alarm is created per entry in var.pod-names.
resource "aws_cloudwatch_metric_alarm" "eks-pod_memory_utilization" {
  for_each = toset(var.pod-names)

  // Identity
  alarm_name        = format("%s:EKS:pod_memory_utilization:%s", var.cw-alarm-prefix, each.value)
  alarm_description = "EKS:pod_memory_utilization"

  // Metric selection
  namespace   = "ContainerInsights"
  metric_name = "pod_memory_utilization"
  dimensions = {
    ClusterName = var.cluster-name
    Namespace   = var.eks-namespace
    PodName     = each.value
  }

  // Evaluation: the 300-second average must exceed the threshold
  // across 3 evaluation periods.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-pod_memory_utilization

  // Notifications: the same target receives both ALARM and OK transitions.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    // Tag drift is deliberately ignored so later applies skip tag updates.
    ignore_changes = [tags]
  }
}
// Container restart alarm: one alarm is created per entry in var.pod-names.
resource "aws_cloudwatch_metric_alarm" "eks-pod_number_of_container_restarts" {
  for_each = toset(var.pod-names)

  // Identity
  alarm_name        = format("%s:EKS:pod_number_of_container_restarts:%s", var.cw-alarm-prefix, each.value)
  alarm_description = "EKS:pod_number_of_container_restarts"

  // Metric selection
  namespace   = "ContainerInsights"
  metric_name = "pod_number_of_container_restarts"
  dimensions = {
    ClusterName = var.cluster-name
    Namespace   = var.eks-namespace
    PodName     = each.value
  }

  // Evaluation: the 300-second average must exceed the threshold
  // across 3 evaluation periods.
  // NOTE(review): "Average" may under-report for a restart counter; confirm
  // whether "Sum" or "Maximum" matches how this metric is published.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.threshold-pod_number_of_container_restarts

  // Notifications: the same target receives both ALARM and OK transitions.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
  ok_actions                = [var.sns-targets.alarm-actions-urgent]
  insufficient_data_actions = []

  tags = var.default-tags

  lifecycle {
    // Tag drift is deliberately ignored so later applies skip tag updates.
    ignore_changes = [tags]
  }
}

View File

@ -0,0 +1,9 @@
// Version requirements for this module.
// A reusable module should only declare the minimum versions it needs and
// leave upper bounds to the root module; the previous "~>" pins rejected
// every Terraform release outside 1.3.x and every AWS provider outside 4.36.x.
terraform {
  required_version = ">= 1.3.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = ">= 4.36.1"
    }
  }
}

View File

@ -0,0 +1,13 @@
// Input variables for the EKS monitoring module.
// All variables are required (none declares a default).

variable "cw-alarm-prefix" {
  description = "Prefix placed at the start of every CloudWatch alarm name created by this module."
  type        = string
}

variable "actions-enabled" {
  description = "Passed to actions_enabled on every alarm created by this module."
  type        = bool
}

variable "sns-targets" {
  // Left untyped on purpose: the module only reads the `alarm-actions-urgent`
  // attribute, and callers may pass either a map or an object.
  description = "Notification targets. The `alarm-actions-urgent` attribute is used for both alarm_actions and ok_actions."
}

variable "default-tags" {
  description = "Tags applied to every alarm at creation (later tag changes are ignored by the alarms' lifecycle rules)."
  type        = map(string)
}

variable "cluster-name" {
  description = "Value of the ClusterName metric dimension."
  type        = string
}

variable "eks-namespace" {
  description = "Value of the Namespace metric dimension."
  type        = string
}

variable "pod-names" {
  description = "Values of the PodName metric dimension; one set of alarms is created per entry."
  // `list(string)` replaces the legacy bare `list` keyword (shorthand for list(any)).
  type = list(string)
}

variable "threshold-pod_cpu_utilization" {
  description = "Alarm threshold for the pod_cpu_utilization metric."
  type        = number
}

variable "threshold-pod_memory_utilization" {
  description = "Alarm threshold for the pod_memory_utilization metric."
  type        = number
}

variable "threshold-pod_number_of_container_restarts" {
  description = "Alarm threshold for the pod_number_of_container_restarts metric."
  type        = number
}