From bcdbb232212047b451ff5e06fb53d8bc6e741de8 Mon Sep 17 00:00:00 2001
From: xpk
Date: Fri, 30 Dec 2022 00:01:39 +0800
Subject: [PATCH] NEW: EKS monitoring

---
 .../Monitoring.EKS/README.md                  | 27 +++++++
 .../Monitoring.EKS/main.tf                    | 81 +++++++++++++++++++
 .../Monitoring.EKS/provider.tf                |  9 +++
 .../Monitoring.EKS/variables.tf               | 13 +++
 4 files changed, 130 insertions(+)
 create mode 100644 modules/ManagementGovernance/Monitoring.EKS/README.md
 create mode 100644 modules/ManagementGovernance/Monitoring.EKS/main.tf
 create mode 100644 modules/ManagementGovernance/Monitoring.EKS/provider.tf
 create mode 100644 modules/ManagementGovernance/Monitoring.EKS/variables.tf

diff --git a/modules/ManagementGovernance/Monitoring.EKS/README.md b/modules/ManagementGovernance/Monitoring.EKS/README.md
new file mode 100644
index 0000000..59cf483
--- /dev/null
+++ b/modules/ManagementGovernance/Monitoring.EKS/README.md
@@ -0,0 +1,27 @@
+# Monitoring module
+This module deploys the default CloudWatch metric monitoring for EKS pods.
+
+## Notes
+The Terraform lifecycle block ignores tag changes to speed up subsequent Terraform updates. CloudWatch alarm tags cannot be viewed in the AWS console anyway.
+Unlike other monitoring modules, which discover resource details automatically, this module needs the EKS pod names to be supplied.
+The AWS CLI does not provide pod information.
+
+## Example
+```terraform
+data "aws_eks_clusters" "eks-clusters" {}
+
+module "eks-monitoring" {
+  cw-alarm-prefix                            = local.cw-alarm-prefix
+  for_each                                   = data.aws_eks_clusters.eks-clusters.names
+  source                                     = "../../modules/ManagementGovernance/Monitoring.EKS"
+  default-tags                               = local.default-tags
+  cluster-name                               = each.value
+  eks-namespace                              = "default"
+  pod-names                                  = ["depl-nginx", "depl-alpine"]
+  threshold-pod_cpu_utilization              = 85
+  threshold-pod_memory_utilization           = 85
+  threshold-pod_number_of_container_restarts = 5
+  actions-enabled                            = var.actions-enabled
+  sns-targets                                = local.sns-targets
+}
+```
\ No newline at end of file
diff --git a/modules/ManagementGovernance/Monitoring.EKS/main.tf b/modules/ManagementGovernance/Monitoring.EKS/main.tf
new file mode 100644
index 0000000..a1de6fc
--- /dev/null
+++ b/modules/ManagementGovernance/Monitoring.EKS/main.tf
@@ -0,0 +1,114 @@
+# CloudWatch alarms on per-pod Container Insights metrics (CPU, memory and
+# container restarts) for the pods of one EKS cluster and namespace. One alarm
+# per metric is created for each entry in var.pod-names.
+#
+# The metrics live in the "ContainerInsights" namespace, so these checks require
+# Container Insights to be enabled on the cluster.
+#
+# The last alarm-name field is "<cluster>/<namespace>/<pod>" rather than the pod
+# name alone: CloudWatch alarm names must be unique within a region, and the
+# README example instantiates this module once per cluster, so identical pod
+# names in two clusters or namespaces would otherwise map to the same alarm.
+
+resource "aws_cloudwatch_metric_alarm" "eks-pod_cpu_utilization" {
+  for_each = toset(var.pod-names)
+
+  alarm_name          = "${var.cw-alarm-prefix}:EKS:pod_cpu_utilization:${var.cluster-name}/${var.eks-namespace}/${each.value}"
+  alarm_description   = "EKS:pod_cpu_utilization"
+  namespace           = "ContainerInsights"
+  metric_name         = "pod_cpu_utilization"
+  statistic           = "Average"
+  comparison_operator = "GreaterThanThreshold"
+  threshold           = var.threshold-pod_cpu_utilization
+  # The threshold must be breached in 3 consecutive 5-minute periods.
+  period              = 300
+  evaluation_periods  = 3
+
+  dimensions = {
+    "PodName"     = each.value
+    "ClusterName" = var.cluster-name
+    "Namespace"   = var.eks-namespace
+  }
+
+  actions_enabled           = var.actions-enabled
+  # The same target is notified when the alarm fires and when it recovers.
+  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
+  ok_actions                = [var.sns-targets.alarm-actions-urgent]
+  insufficient_data_actions = []
+
+  tags = var.default-tags
+
+  lifecycle {
+    # Tag-only changes are ignored to keep subsequent plans/applies fast.
+    ignore_changes = [tags]
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "eks-pod_memory_utilization" {
+  for_each = toset(var.pod-names)
+
+  alarm_name          = "${var.cw-alarm-prefix}:EKS:pod_memory_utilization:${var.cluster-name}/${var.eks-namespace}/${each.value}"
+  alarm_description   = "EKS:pod_memory_utilization"
+  namespace           = "ContainerInsights"
+  metric_name         = "pod_memory_utilization"
+  statistic           = "Average"
+  comparison_operator = "GreaterThanThreshold"
+  threshold           = var.threshold-pod_memory_utilization
+  # The threshold must be breached in 3 consecutive 5-minute periods.
+  period              = 300
+  evaluation_periods  = 3
+
+  dimensions = {
+    "PodName"     = each.value
+    "ClusterName" = var.cluster-name
+    "Namespace"   = var.eks-namespace
+  }
+
+  actions_enabled           = var.actions-enabled
+  # The same target is notified when the alarm fires and when it recovers.
+  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
+  ok_actions                = [var.sns-targets.alarm-actions-urgent]
+  insufficient_data_actions = []
+
+  tags = var.default-tags
+
+  lifecycle {
+    # Tag-only changes are ignored to keep subsequent plans/applies fast.
+    ignore_changes = [tags]
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "eks-pod_number_of_container_restarts" {
+  for_each = toset(var.pod-names)
+
+  alarm_name          = "${var.cw-alarm-prefix}:EKS:pod_number_of_container_restarts:${var.cluster-name}/${var.eks-namespace}/${each.value}"
+  alarm_description   = "EKS:pod_number_of_container_restarts"
+  namespace           = "ContainerInsights"
+  metric_name         = "pod_number_of_container_restarts"
+  # NOTE(review): "Average" is kept from the original; confirm it suits a restart count.
+  statistic           = "Average"
+  comparison_operator = "GreaterThanThreshold"
+  threshold           = var.threshold-pod_number_of_container_restarts
+  # The threshold must be breached in 3 consecutive 5-minute periods.
+  period              = 300
+  evaluation_periods  = 3
+
+  dimensions = {
+    "PodName"     = each.value
+    "ClusterName" = var.cluster-name
+    "Namespace"   = var.eks-namespace
+  }
+
+  actions_enabled           = var.actions-enabled
+  # The same target is notified when the alarm fires and when it recovers.
+  alarm_actions             = [var.sns-targets.alarm-actions-urgent]
+  ok_actions                = [var.sns-targets.alarm-actions-urgent]
+  insufficient_data_actions = []
+
+  tags = var.default-tags
+
+  lifecycle {
+    # Tag-only changes are ignored to keep subsequent plans/applies fast.
+    ignore_changes = [tags]
+  }
+}
diff --git a/modules/ManagementGovernance/Monitoring.EKS/provider.tf b/modules/ManagementGovernance/Monitoring.EKS/provider.tf
new file mode 100644
index 0000000..7b64cf5
--- /dev/null
+++ b/modules/ManagementGovernance/Monitoring.EKS/provider.tf
@@ -0,0 +1,11 @@
+# This is a reusable module, so only minimum versions are declared; the root
+# module is expected to pin the exact Terraform and provider versions.
+terraform {
+  required_version = ">= 1.3.0"
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = ">= 4.36.1"
+    }
+  }
+}
diff --git a/modules/ManagementGovernance/Monitoring.EKS/variables.tf b/modules/ManagementGovernance/Monitoring.EKS/variables.tf
new file mode 100644
index 0000000..f5befc6
--- /dev/null
+++ b/modules/ManagementGovernance/Monitoring.EKS/variables.tf
@@ -0,0 +1,49 @@
+variable "cw-alarm-prefix" {
+  description = "Prefix placed at the start of every CloudWatch alarm name created by this module."
+  type        = string
+}
+
+variable "actions-enabled" {
+  description = "Value passed to actions_enabled on every alarm."
+  type        = bool
+}
+
+variable "sns-targets" {
+  description = "Object or map of alarm action targets. Its alarm-actions-urgent attribute is used for both alarm_actions and ok_actions."
+  type        = any
+}
+
+variable "default-tags" {
+  description = "Tags set on every alarm. Later changes to tags are ignored by the alarms' lifecycle rules."
+  type        = map(string)
+}
+
+variable "cluster-name" {
+  description = "EKS cluster name, used as the ClusterName metric dimension and in alarm names."
+  type        = string
+}
+
+variable "eks-namespace" {
+  description = "Namespace of the monitored pods, used as the Namespace metric dimension and in alarm names."
+  type        = string
+}
+
+variable "pod-names" {
+  description = "Values for the PodName metric dimension. One alarm per metric is created for each entry."
+  type        = list(string)
+}
+
+variable "threshold-pod_cpu_utilization" {
+  description = "Alarm threshold for the pod_cpu_utilization metric."
+  type        = number
+}
+
+variable "threshold-pod_memory_utilization" {
+  description = "Alarm threshold for the pod_memory_utilization metric."
+  type        = number
+}
+
+variable "threshold-pod_number_of_container_restarts" {
+  description = "Alarm threshold for the pod_number_of_container_restarts metric."
+  type        = number
+}