# terraform.aws-baseline-infra/modules/ManagementGovernance/Monitoring.EC2/main.tf
# (395 lines, 19 KiB, HCL)

locals {
  # Human-readable description used as alarm_description by the alarms in this file.
  # CloudWatch limits an alarm description to 1024 characters, so keep it short.
  #
  # The Name tag is read through try(): indexing a missing map key is an error,
  # so an instance without a Name tag would otherwise break the whole plan.
  # Such instances are reported as "n/a".
  alarm-message = <<EOF
Cloudwatch alarm for the following resource
- Instance ID: ${var.ec2-instance-id}
- Instance Name: ${try(data.aws_instance.ec2-instance.tags["Name"], "n/a")}
- Instance IP: ${data.aws_instance.ec2-instance.private_ip}
- Instance Type: ${data.aws_instance.ec2-instance.instance_type}
EOF
}
# Alarm on the AWS/EC2 "StatusCheckFailed_System" metric of the monitored instance.
# Operator, periods, statistic, threshold and action all come from
# var.settings.StatusCheckFailed_System.
resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
  # Identity
  alarm_name        = "${var.settings.StatusCheckFailed_System.ecccode}-EC2_${var.ec2-instance-id}-StatusCheckFailed_System"
  alarm_description = local.alarm-message

  # Metric under observation
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed_System"
  dimensions = {
    InstanceId = var.ec2-instance-id
  }

  # Evaluation rule
  statistic           = var.settings.StatusCheckFailed_System.statistic
  period              = var.settings.StatusCheckFailed_System.period
  evaluation_periods  = var.settings.StatusCheckFailed_System.evaluation_periods
  comparison_operator = var.settings.StatusCheckFailed_System.comparison_operator
  threshold           = var.settings.StatusCheckFailed_System.threshold

  # Actions: the same action fires on ALARM and on OK; none on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.StatusCheckFailed_System.action]
  ok_actions                = [var.settings.StatusCheckFailed_System.action]
  insufficient_data_actions = []
}
# Alarm on the AWS/EC2 "StatusCheckFailed_Instance" metric of the monitored instance.
# Operator, periods, statistic, threshold and action all come from
# var.settings.StatusCheckFailed_Instance.
resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" {
  # Identity
  alarm_name        = "${var.settings.StatusCheckFailed_Instance.ecccode}-EC2_${var.ec2-instance-id}-StatusCheckFailed_Instance"
  alarm_description = local.alarm-message

  # Metric under observation
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed_Instance"
  dimensions = {
    InstanceId = var.ec2-instance-id
  }

  # Evaluation rule
  statistic           = var.settings.StatusCheckFailed_Instance.statistic
  period              = var.settings.StatusCheckFailed_Instance.period
  evaluation_periods  = var.settings.StatusCheckFailed_Instance.evaluation_periods
  comparison_operator = var.settings.StatusCheckFailed_Instance.comparison_operator
  threshold           = var.settings.StatusCheckFailed_Instance.threshold

  # Actions: the same action fires on ALARM and on OK; none on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.StatusCheckFailed_Instance.action]
  ok_actions                = [var.settings.StatusCheckFailed_Instance.action]
  insufficient_data_actions = []
}
# Alarm on the AWS/EC2 "CPUUtilization" metric of the monitored instance.
# Operator, periods, statistic, threshold and action all come from
# var.settings.CPUUtilization.
resource "aws_cloudwatch_metric_alarm" "ec2-CPUUtilization" {
  # Identity
  alarm_name        = "${var.settings.CPUUtilization.ecccode}-EC2_${var.ec2-instance-id}-CPUUtilization"
  alarm_description = local.alarm-message

  # Metric under observation
  namespace   = "AWS/EC2"
  metric_name = "CPUUtilization"
  dimensions = {
    InstanceId = var.ec2-instance-id
  }

  # Evaluation rule; periods without data points count as within threshold.
  statistic           = var.settings.CPUUtilization.statistic
  period              = var.settings.CPUUtilization.period
  evaluation_periods  = var.settings.CPUUtilization.evaluation_periods
  comparison_operator = var.settings.CPUUtilization.comparison_operator
  threshold           = var.settings.CPUUtilization.threshold
  treat_missing_data  = "notBreaching"

  # Actions: the same action fires on ALARM and on OK; none on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.CPUUtilization.action]
  ok_actions                = [var.settings.CPUUtilization.action]
  insufficient_data_actions = []
}
# cwagent metrics
#
# Looks up the monitored instance by id. Elsewhere in this file its tags,
# private_ip, instance_type and ami attributes feed the alarm description
# and the CWAgent alarm dimensions.
data "aws_instance" "ec2-instance" {
  instance_id = var.ec2-instance-id
}
# Convenience values for putting the instance name or IP into alarm names.
locals {
  instance-ip = data.aws_instance.ec2-instance.private_ip

  # try() keeps the plan working for instances that carry no Name tag:
  # indexing a missing map key is an error, so fall back to an empty string.
  instance-name = try(data.aws_instance.ec2-instance.tags["Name"], "")
}
# Runs an AWS CLI "ec2 describe-instances" call through the ../../util/awscli
# helper module to obtain the PlatformDetails of the monitored instance, using
# the target-account credentials. Alarm blocks in this file read
# module.ec2_os.awscliout[0] and compare it with the exact string "Windows".
# NOTE(review): EC2 can also report values such as "Windows with SQL Server
# Standard"; an exact match on "Windows" treats those as non-Windows - confirm
# whether that is intended.
module "ec2_os" {
  source = "../../util/awscli"

  # Credentials of the account that owns the instance
  access_key    = var.target-account-ak
  secret_key    = var.target-account-sk
  session_token = var.target-account-token

  # CLI arguments handed to the helper module
  aws_cli_commands = "ec2 describe-instances --instance-ids ${var.ec2-instance-id} --query Reservations[].Instances[].PlatformDetails"
}
# Linux specific checks
# The default CloudWatch agent configuration uses the mem_used_percent metric.
#
# Detect the presence of the CloudWatch agent: list at most one metric name in
# the CWAgent namespace that carries this instance's InstanceId dimension.
# Alarm blocks in this file treat a non-empty awscliout as "agent present".
module "detect_cloudwatch_agent" {
  source = "../../util/awscli"

  # Credentials of the account that owns the instance
  access_key    = var.target-account-ak
  secret_key    = var.target-account-sk
  session_token = var.target-account-token

  # CLI arguments handed to the helper module
  aws_cli_commands = "cloudwatch list-metrics --namespace CWAgent --dimensions Name=InstanceId,Value=${var.ec2-instance-id} --query Metrics[].MetricName --max-items 1"
}
# Alarm on the CWAgent "mem_used_percent" metric.
# Created only when the detected platform is not "Windows" and the agent
# detection returned at least one metric. Tunables come from
# var.settings.mem_used_percent.
resource "aws_cloudwatch_metric_alarm" "ec2-mem_used_percent" {
  count = (module.ec2_os.awscliout[0] != "Windows" && length(module.detect_cloudwatch_agent.awscliout) > 0) ? 1 : 0

  # Identity
  alarm_name        = "${var.settings.mem_used_percent.ecccode}-EC2_${var.ec2-instance-id}-mem_used_percent"
  alarm_description = local.alarm-message

  # Metric under observation
  namespace   = "CWAgent"
  metric_name = "mem_used_percent"
  dimensions = {
    InstanceId   = var.ec2-instance-id
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceType = data.aws_instance.ec2-instance.instance_type
  }

  # Evaluation rule
  statistic           = var.settings.mem_used_percent.statistic
  period              = var.settings.mem_used_percent.period
  evaluation_periods  = var.settings.mem_used_percent.evaluation_periods
  comparison_operator = var.settings.mem_used_percent.comparison_operator
  threshold           = var.settings.mem_used_percent.threshold

  # Actions: the same action fires on ALARM and on OK; none on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.mem_used_percent.action]
  ok_actions                = [var.settings.mem_used_percent.action]
  insufficient_data_actions = []
}
# Runs get-cwagent-dimensions.sh from this module's directory under bash,
# passing the instance id and the target-account credentials as the query.
# The resulting map is used verbatim as the `dimensions` of the disk alarms
# in this file.
# NOTE(review): the script is not visible here; presumably it returns the
# CWAgent dimension set of a disk metric for this instance - confirm.
data "external" "cw-dimensions" {
  program = ["bash", "${path.module}/get-cwagent-dimensions.sh"]

  query = {
    # Instance to inspect
    iid = var.ec2-instance-id

    # Credentials of the account that owns the instance
    access_key    = var.target-account-ak
    secret_key    = var.target-account-sk
    session_token = var.target-account-token
  }
}
/* Disabled: this module call returns blank output (presumably why data.external.cw-dimensions is used instead - confirm).
module "cw-dimensions" {
source = "../../util/awscli"
access_key = var.target-account-ak
aws_cli_commands = "cloudwatch list-metrics --namespace CWAgent --metric-name disk_inodes_free --dimensions Name=InstanceId,Value=${var.ec2-instance-id} Name=path,Value=/ --query Metrics[].Dimensions[] | jq '.[] | {(.Name):(.Value)}' | jq -s 'add'"
secret_key = var.target-account-sk
session_token = var.target-account-token
}
*/
# Alarm on the CWAgent "swap_used_percent" metric.
# Created only when the detected platform is not "Windows" and the agent
# detection returned at least one metric. Tunables come from
# var.settings.swap_used_percent.
resource "aws_cloudwatch_metric_alarm" "ec2-swap_used_percent" {
  count = (module.ec2_os.awscliout[0] != "Windows" && length(module.detect_cloudwatch_agent.awscliout) > 0) ? 1 : 0

  # Identity
  alarm_name        = "${var.settings.swap_used_percent.ecccode}-EC2_${var.ec2-instance-id}-swap_used_percent"
  alarm_description = local.alarm-message

  # Metric under observation
  namespace   = "CWAgent"
  metric_name = "swap_used_percent"
  dimensions = {
    InstanceId   = var.ec2-instance-id
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceType = data.aws_instance.ec2-instance.instance_type
  }

  # Evaluation rule
  statistic           = var.settings.swap_used_percent.statistic
  period              = var.settings.swap_used_percent.period
  evaluation_periods  = var.settings.swap_used_percent.evaluation_periods
  comparison_operator = var.settings.swap_used_percent.comparison_operator
  threshold           = var.settings.swap_used_percent.threshold

  # Actions: the same action fires on ALARM and on OK; none on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.swap_used_percent.action]
  ok_actions                = [var.settings.swap_used_percent.action]
  insufficient_data_actions = []
}
# Warning-level alarm on the CWAgent "disk_used_percent" metric.
# Tunables come from var.settings.disk_used_percent_warn; the metric dimensions
# are whatever data.external.cw-dimensions returned.
#
# Created only when the detected platform is not "Windows" and the dimension
# lookup returned at least one entry. The `result` of an external data source is
# a map and never null, so a `!= null` test is always true and would create an
# alarm with no dimensions when nothing was found; the map length is checked
# instead.
#
# NOTE(review): the critical-level alarm uses the same "-disk_used_percent" name
# suffix, so the two alarm names only differ when their ecccode values differ.
resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent_warn" {
  count = module.ec2_os.awscliout[0] != "Windows" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0

  # Identity
  alarm_name = "${var.settings.disk_used_percent_warn.ecccode}-EC2_${var.ec2-instance-id}-disk_used_percent"
  # alarm_description = "EC2:disk_used_percent"
  alarm_description = local.alarm-message

  # Metric under observation
  namespace   = "CWAgent"
  metric_name = "disk_used_percent"
  dimensions  = data.external.cw-dimensions.result

  # Evaluation rule
  comparison_operator = var.settings.disk_used_percent_warn.comparison_operator
  evaluation_periods  = var.settings.disk_used_percent_warn.evaluation_periods
  period              = var.settings.disk_used_percent_warn.period
  statistic           = var.settings.disk_used_percent_warn.statistic
  threshold           = var.settings.disk_used_percent_warn.threshold

  # Actions: the same action fires on ALARM and on OK; none on INSUFFICIENT_DATA.
  insufficient_data_actions = []
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.disk_used_percent_warn.action]
  ok_actions                = [var.settings.disk_used_percent_warn.action]
}
# Critical-level alarm on the CWAgent "disk_used_percent" metric.
# Tunables come from var.settings.disk_used_percent_crit; the metric dimensions
# are whatever data.external.cw-dimensions returned.
#
# Created only when the detected platform is not "Windows" and the dimension
# lookup returned at least one entry. The `result` of an external data source is
# a map and never null, so a `!= null` test is always true and would create an
# alarm with no dimensions when nothing was found; the map length is checked
# instead.
#
# NOTE(review): the warning-level alarm uses the same "-disk_used_percent" name
# suffix, so the two alarm names only differ when their ecccode values differ.
resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent_crit" {
  count = module.ec2_os.awscliout[0] != "Windows" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0

  # Identity
  alarm_name = "${var.settings.disk_used_percent_crit.ecccode}-EC2_${var.ec2-instance-id}-disk_used_percent"
  # alarm_description = "EC2:disk_used_percent"
  alarm_description = local.alarm-message

  # Metric under observation
  namespace   = "CWAgent"
  metric_name = "disk_used_percent"
  dimensions  = data.external.cw-dimensions.result

  # Evaluation rule
  comparison_operator = var.settings.disk_used_percent_crit.comparison_operator
  evaluation_periods  = var.settings.disk_used_percent_crit.evaluation_periods
  period              = var.settings.disk_used_percent_crit.period
  statistic           = var.settings.disk_used_percent_crit.statistic
  threshold           = var.settings.disk_used_percent_crit.threshold

  # Actions: the same action fires on ALARM and on OK; none on INSUFFICIENT_DATA.
  insufficient_data_actions = []
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.disk_used_percent_crit.action]
  ok_actions                = [var.settings.disk_used_percent_crit.action]
}
# Alarm on the CWAgent "disk_inodes_free" metric.
# Tunables come from var.settings.disk_inodes_free; the metric dimensions are
# whatever data.external.cw-dimensions returned.
#
# Created only when the detected platform is not "Windows" and the dimension
# lookup returned at least one entry. The `result` of an external data source is
# a map and never null, so a `!= null` test is always true and would create an
# alarm with no dimensions when nothing was found; the map length is checked
# instead.
resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" {
  count = module.ec2_os.awscliout[0] != "Windows" && length(data.external.cw-dimensions.result) > 0 ? 1 : 0

  # Identity
  alarm_name = "${var.settings.disk_inodes_free.ecccode}-EC2_${var.ec2-instance-id}-disk_inodes_free"
  # alarm_description = "EC2:disk_inodes_free"
  alarm_description = local.alarm-message

  # Metric under observation
  namespace   = "CWAgent"
  metric_name = "disk_inodes_free"
  dimensions  = data.external.cw-dimensions.result

  # Evaluation rule
  comparison_operator = var.settings.disk_inodes_free.comparison_operator
  evaluation_periods  = var.settings.disk_inodes_free.evaluation_periods
  period              = var.settings.disk_inodes_free.period
  statistic           = var.settings.disk_inodes_free.statistic
  threshold           = var.settings.disk_inodes_free.threshold

  # Actions: the same action fires on ALARM and on OK; none on INSUFFICIENT_DATA.
  insufficient_data_actions = []
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.disk_inodes_free.action]
  ok_actions                = [var.settings.disk_inodes_free.action]
}
# Alarm on the CWAgent "processes_total" metric.
# The process metric is not published by the default CloudWatch agent config.
# Created only when the detected platform is not "Windows" and the agent
# detection returned at least one metric. Tunables come from
# var.settings.processes_total.
resource "aws_cloudwatch_metric_alarm" "ec2-processes_total" {
  count = (module.ec2_os.awscliout[0] != "Windows" && length(module.detect_cloudwatch_agent.awscliout) > 0) ? 1 : 0

  # Identity
  alarm_name        = "${var.settings.processes_total.ecccode}-EC2_${var.ec2-instance-id}-processes_total"
  alarm_description = local.alarm-message

  # Metric under observation
  namespace   = "CWAgent"
  metric_name = "processes_total"
  dimensions = {
    InstanceId   = var.ec2-instance-id
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceType = data.aws_instance.ec2-instance.instance_type
  }

  # Evaluation rule
  statistic           = var.settings.processes_total.statistic
  period              = var.settings.processes_total.period
  evaluation_periods  = var.settings.processes_total.evaluation_periods
  comparison_operator = var.settings.processes_total.comparison_operator
  threshold           = var.settings.processes_total.threshold

  # Actions: the same action fires on ALARM and on OK; none on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.processes_total.action]
  ok_actions                = [var.settings.processes_total.action]
  insufficient_data_actions = []
}
# Metric-math alarm that fires when either CWAgent network error counter
# (net_err_in or net_err_out) exceeds its own configured threshold.
# Expression e1 yields 1 in that case and 0 otherwise, and the alarm triggers
# when e1 is greater than 0. Created only when the detected platform is not
# "Windows" and the agent detection returned at least one metric.
#
# NOTE(review): actions_enabled is hard-coded to false here, whereas the other
# alarms in this file use var.actions-enabled - confirm this is deliberate.
# NOTE(review): both metrics are pinned to interface "eth0"; instances whose
# primary interface has a different name would report no data - confirm.
resource "aws_cloudwatch_metric_alarm" "ec2-net_err" {
  count = (module.ec2_os.awscliout[0] != "Windows" && length(module.detect_cloudwatch_agent.awscliout) > 0) ? 1 : 0

  # Identity
  alarm_name        = "${var.settings.net_err_in.ecccode}-EC2_${var.ec2-instance-id}-net_err"
  alarm_description = local.alarm-message

  # Evaluation rule applied to expression e1; missing data counts as healthy.
  comparison_operator = "GreaterThanThreshold"
  threshold           = 0
  evaluation_periods  = var.settings.net_err_in.evaluation_periods
  treat_missing_data  = "notBreaching"

  # Actions (currently disabled for this alarm)
  actions_enabled           = false
  alarm_actions             = [var.settings.net_err_in.action]
  ok_actions                = [var.settings.net_err_in.action]
  insufficient_data_actions = []

  # e1: 1 when either error counter is above its threshold, else 0.
  metric_query {
    id          = "e1"
    label       = "net_err_exceeds_threshold"
    expression  = "IF(m1 > ${var.settings.net_err_in.threshold} OR m2 > ${var.settings.net_err_out.threshold}, 1, 0)"
    return_data = "true"
  }

  # m1: inbound network errors
  metric_query {
    id = "m1"

    metric {
      namespace   = "CWAgent"
      metric_name = "net_err_in"
      stat        = var.settings.net_err_in.statistic
      period      = var.settings.net_err_in.period

      dimensions = {
        InstanceId   = var.ec2-instance-id
        ImageId      = data.aws_instance.ec2-instance.ami
        InstanceType = data.aws_instance.ec2-instance.instance_type
        interface    = "eth0"
      }
    }
  }

  # m2: outbound network errors
  metric_query {
    id = "m2"

    metric {
      namespace   = "CWAgent"
      metric_name = "net_err_out"
      stat        = var.settings.net_err_out.statistic
      period      = var.settings.net_err_out.period

      dimensions = {
        InstanceId   = var.ec2-instance-id
        ImageId      = data.aws_instance.ec2-instance.ami
        InstanceType = data.aws_instance.ec2-instance.instance_type
        interface    = "eth0"
      }
    }
  }
}
# Optional alarm on the AWS/EC2 "NetworkIn" metric.
# Created only when var.settings.NetworkIn.monitor evaluates to true; a missing
# flag counts as false. Tunables come from var.settings.NetworkIn.
resource "aws_cloudwatch_metric_alarm" "ec2-NetworkIn" {
  count = try(var.settings.NetworkIn.monitor, false) ? 1 : 0

  # Identity
  alarm_name        = "${var.settings.NetworkIn.ecccode}-EC2_${var.ec2-instance-id}-NetworkIn"
  alarm_description = local.alarm-message

  # Metric under observation
  namespace   = "AWS/EC2"
  metric_name = "NetworkIn"
  dimensions = {
    InstanceId = var.ec2-instance-id
  }

  # Evaluation rule
  statistic           = var.settings.NetworkIn.statistic
  period              = var.settings.NetworkIn.period
  evaluation_periods  = var.settings.NetworkIn.evaluation_periods
  comparison_operator = var.settings.NetworkIn.comparison_operator
  threshold           = var.settings.NetworkIn.threshold

  # Actions: the same action fires on ALARM and on OK; none on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.NetworkIn.action]
  ok_actions                = [var.settings.NetworkIn.action]
  insufficient_data_actions = []
}
# Optional alarm on the AWS/EC2 "NetworkOut" metric.
# Tunables come from var.settings.NetworkOut.
#
# The switch is var.settings.NetworkOut.monitor. When that flag is absent the
# alarm falls back to var.settings.NetworkIn.monitor, which is the flag this
# block consulted before, so configurations that only set the NetworkIn flag
# keep working. With neither flag present the alarm is not created.
resource "aws_cloudwatch_metric_alarm" "ec2-NetworkOut" {
  count = try(var.settings.NetworkOut.monitor, var.settings.NetworkIn.monitor, false) ? 1 : 0

  # Identity
  alarm_name = "${var.settings.NetworkOut.ecccode}-EC2_${var.ec2-instance-id}-NetworkOut"
  # alarm_description = "EC2:NetworkOut"
  alarm_description = local.alarm-message

  # Metric under observation
  namespace   = "AWS/EC2"
  metric_name = "NetworkOut"
  dimensions = {
    InstanceId = var.ec2-instance-id
  }

  # Evaluation rule
  comparison_operator = var.settings.NetworkOut.comparison_operator
  evaluation_periods  = var.settings.NetworkOut.evaluation_periods
  period              = var.settings.NetworkOut.period
  statistic           = var.settings.NetworkOut.statistic
  threshold           = var.settings.NetworkOut.threshold

  # Actions: the same action fires on ALARM and on OK; none on INSUFFICIENT_DATA.
  insufficient_data_actions = []
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.NetworkOut.action]
  ok_actions                = [var.settings.NetworkOut.action]
}
# Windows specific checks
#
# Alarm on the CWAgent "Memory % Committed Bytes In Use" metric.
# Created only when the detected platform is exactly "Windows" and the agent
# detection returned at least one metric. Tunables come from
# var.settings.MemoryCommittedPct.
resource "aws_cloudwatch_metric_alarm" "ec2-MemoryCommittedPct" {
  count = (module.ec2_os.awscliout[0] == "Windows" && length(module.detect_cloudwatch_agent.awscliout) > 0) ? 1 : 0

  # Identity
  alarm_name        = "${var.settings.MemoryCommittedPct.ecccode}-EC2_${var.ec2-instance-id}-MemoryCommittedPct"
  alarm_description = local.alarm-message

  # Metric under observation
  namespace   = "CWAgent"
  metric_name = "Memory % Committed Bytes In Use"
  dimensions = {
    objectname   = "Memory"
    InstanceId   = var.ec2-instance-id
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceType = data.aws_instance.ec2-instance.instance_type
  }

  # Evaluation rule
  statistic           = var.settings.MemoryCommittedPct.statistic
  period              = var.settings.MemoryCommittedPct.period
  evaluation_periods  = var.settings.MemoryCommittedPct.evaluation_periods
  comparison_operator = var.settings.MemoryCommittedPct.comparison_operator
  threshold           = var.settings.MemoryCommittedPct.threshold

  # Actions: the same action fires on ALARM and on OK; none on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.MemoryCommittedPct.action]
  ok_actions                = [var.settings.MemoryCommittedPct.action]
  insufficient_data_actions = []
}
# Alarm on the CWAgent "LogicalDisk % Free Space" metric for the "C:" instance
# of the LogicalDisk object.
# Created only when the detected platform is exactly "Windows" and the agent
# detection returned at least one metric. Tunables come from
# var.settings.LogicalDiskFreePct.
resource "aws_cloudwatch_metric_alarm" "ec2-LogicalDiskFreePct" {
  count = (module.ec2_os.awscliout[0] == "Windows" && length(module.detect_cloudwatch_agent.awscliout) > 0) ? 1 : 0

  # Identity
  alarm_name        = "${var.settings.LogicalDiskFreePct.ecccode}-EC2_${var.ec2-instance-id}-LogicalDiskFreePct"
  alarm_description = local.alarm-message

  # Metric under observation
  namespace   = "CWAgent"
  metric_name = "LogicalDisk % Free Space"
  dimensions = {
    instance     = "C:"
    objectname   = "LogicalDisk"
    InstanceId   = var.ec2-instance-id
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceType = data.aws_instance.ec2-instance.instance_type
  }

  # Evaluation rule
  statistic           = var.settings.LogicalDiskFreePct.statistic
  period              = var.settings.LogicalDiskFreePct.period
  evaluation_periods  = var.settings.LogicalDiskFreePct.evaluation_periods
  comparison_operator = var.settings.LogicalDiskFreePct.comparison_operator
  threshold           = var.settings.LogicalDiskFreePct.threshold

  # Actions: the same action fires on ALARM and on OK; none on INSUFFICIENT_DATA.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.LogicalDiskFreePct.action]
  ok_actions                = [var.settings.LogicalDiskFreePct.action]
  insufficient_data_actions = []
}