UPD: various updates on cloudwatch monitoring from upstream

This commit is contained in:
xpk 2024-01-24 11:18:35 +08:00
parent 436b799ff1
commit 1fe92a3f78
Signed by: xpk
GPG Key ID: CD4FF6793F09AB86
4 changed files with 207 additions and 122 deletions

View File

@ -43,14 +43,18 @@ No modules.
| cwl-region | AWS region where Cloudwatch LogGroup resides. Needed for setting up cwlog-stream-role | `string` | n/a | yes |
| dest-bucket-arn | Destination S3 bucket ARN | `string` | n/a | yes |
| dest-bucket-kmskey-arn | KMS key ARN for destination bucket | `string` | n/a | yes |
| dest-bucket-prefix | S3 object prefix for this stream | `string` | n/a | yes |
| dest-bucket-prefix | S3 object prefix for this stream. The prefix must not start with a `/` and must end with a `/`. For example, `r53-log/acme.local/` | `string` | n/a | yes |
| enable-firehose-errorlog | Enable firehose errorlog | `bool` | `false` | no |
| firehose-kmskey-arn | KMS Key arn for Firehose | `string` | n/a | yes |
| source-cwlgroup-name | Name of source CloudwatchLog group | `string` | n/a | yes |
| stream-name | Name of Kinesis Data Firehose delivery stream | `string` | n/a | yes |
## Outputs
No outputs.
| Name | Description |
|------|-------------|
| cloudwatchstream-iam-role-arn | n/a |
| firehose-iam-role-arn | n/a |
---
## Authorship

View File

@ -0,0 +1,7 @@
# Exposes the ARN of the IAM role declared in this module as
# aws_iam_role.firehose-stream-iam-role so callers can reference it
# (for example in bucket or KMS key policies).
output "firehose-iam-role-arn" {
  description = "ARN of the IAM role created for the Firehose delivery stream (aws_iam_role.firehose-stream-iam-role)"
  value       = aws_iam_role.firehose-stream-iam-role.arn
}
# Exposes the ARN of the IAM role declared in this module as
# aws_iam_role.cwlog-stream-role so callers can reference it.
output "cloudwatchstream-iam-role-arn" {
  description = "ARN of the IAM role created for streaming the CloudWatch log group (aws_iam_role.cwlog-stream-role)"
  value       = aws_iam_role.cwlog-stream-role.arn
}

View File

@ -29,3 +29,46 @@ module "ec2-monitoring" {
sns-targets = var.sns-targets
}
```
## Sample cloudwatch alarm email notification
```
Subject: ALARM: "TestAlarmPleaseIgnore" in Asia Pacific (Hong Kong)
You are receiving this email because your Amazon CloudWatch Alarm "TestAlarmPleaseIgnore" in the
Asia Pacific (Hong Kong) region has entered the ALARM state, because "Threshold Crossed: 1 out of
the last 1 datapoints [864.0 (24/01/24 00:56:00)] was less than or equal to the threshold (900.0)
(minimum 1 datapoint for OK -> ALARM transition)." at "Wednesday 24 January, 2024 01:01:34 UTC".
View this alarm in the AWS Management Console:
https://ap-east-1.console.aws.amazon.com/cloudwatch/...
Alarm Details:
- Name: TestAlarmPleaseIgnore
- Description: Cloudwatch alarm for the following resource
- Instance ID: xxx
- Instance Name: yyy
- Instance IP: zz.zz.zz.zz
- State Change: OK -> ALARM
- Reason for State Change: Threshold Crossed: 1 out of the last 1 datapoints [864.0 (24/01/24 00:56:00)] was less than or equal to the threshold (900.0) (minimum 1 datapoint for OK -> ALARM transition).
- Timestamp: Wednesday 24 January, 2024 01:01:34 UTC
- AWS Account: 111122223333
- Alarm Arn: arn:aws:cloudwatch:ap-east-1:111122223333:alarm:TestAlarmPleaseIgnore
Threshold:
- The alarm is in the ALARM state when the metric is LessThanOrEqualToThreshold 900.0 for at least 1 of the last 1 period(s) of 300 seconds.
Monitored Metric:
- MetricNamespace: AWS/EC2
- MetricName: CPUCreditBalance
- Dimensions: [InstanceId = i-050d4adeafaa53cd0]
- Period: 300 seconds
- Statistic: Average
- Unit: not specified
- TreatMissingData: missing
State Change Actions:
- OK:
- ALARM: [arn:aws:sns:ap-east-1:111122223333:CWA-SNS-Email-KenFong]
- INSUFFICIENT_DATA:
```

View File

@ -1,3 +1,14 @@
locals {
  # Text used as alarm_description by the alarms in this file; it identifies
  # the monitored instance in the alarm notification.
  # alarm-message limited to 1024 characters
  #
  # The Name tag is optional on an EC2 instance. Indexing tags["Name"] directly
  # fails the plan with "Invalid index" when the tag is absent, so the lookup
  # is wrapped in try() with a placeholder fallback.
  alarm-message = <<EOF
Cloudwatch alarm for the following resource
- Instance ID: ${var.ec2-instance-id}
- Instance Name: ${try(data.aws_instance.ec2-instance.tags["Name"], "n/a")}
- Instance IP: ${data.aws_instance.ec2-instance.private_ip}
- Instance Type: ${data.aws_instance.ec2-instance.instance_type}
EOF
}
resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
alarm_name = "${var.settings.StatusCheckFailed_System.ecccode}-EC2_${var.ec2-instance-id}-StatusCheckFailed_System"
comparison_operator = var.settings.StatusCheckFailed_System.comparison_operator
@ -6,7 +17,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
period = var.settings.StatusCheckFailed_System.period
statistic = var.settings.StatusCheckFailed_System.statistic
threshold = var.settings.StatusCheckFailed_System.threshold
alarm_description = "EC2:StatusCheckFailed_System"
# alarm_description = "EC2:StatusCheckFailed_System"
alarm_description = local.alarm-message
namespace = "AWS/EC2"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
@ -25,7 +37,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" {
period = var.settings.StatusCheckFailed_Instance.period
statistic = var.settings.StatusCheckFailed_Instance.statistic
threshold = var.settings.StatusCheckFailed_Instance.threshold
alarm_description = "EC2:StatusCheckFailed_Instance"
# alarm_description = "EC2:StatusCheckFailed_Instance"
alarm_description = local.alarm-message
namespace = "AWS/EC2"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
@ -44,7 +57,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-CPUUtilization" {
period = var.settings.CPUUtilization.period
statistic = var.settings.CPUUtilization.statistic
threshold = var.settings.CPUUtilization.threshold
alarm_description = "EC2:CPUUtilization"
# alarm_description = "EC2:CPUUtilization"
alarm_description = local.alarm-message
namespace = "AWS/EC2"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
@ -61,6 +75,12 @@ data "aws_instance" "ec2-instance" {
instance_id = var.ec2-instance-id
}
# put instance name or ip in alarm name
locals {
  # Private IP address reported by the aws_instance data source.
  instance-ip = data.aws_instance.ec2-instance.private_ip

  # Value of the instance's Name tag. The tag is optional, and a direct
  # tags["Name"] index errors when it is missing, so fall back to the
  # instance ID to keep the value non-empty and unique per instance.
  instance-name = try(data.aws_instance.ec2-instance.tags["Name"], var.ec2-instance-id)
}
module "ec2_os" {
source = "../../util/awscli"
access_key = var.target-account-ak
@ -90,7 +110,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-mem_used_percent" {
period = var.settings.mem_used_percent.period
statistic = var.settings.mem_used_percent.statistic
threshold = var.settings.mem_used_percent.threshold
alarm_description = "EC2:mem_used_percent"
# alarm_description = "EC2:mem_used_percent"
alarm_description = local.alarm-message
namespace = "CWAgent"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
@ -132,7 +153,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-swap_used_percent" {
period = var.settings.swap_used_percent.period
statistic = var.settings.swap_used_percent.statistic
threshold = var.settings.swap_used_percent.threshold
alarm_description = "EC2:swap_used_percent"
# alarm_description = "EC2:swap_used_percent"
alarm_description = local.alarm-message
namespace = "CWAgent"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
@ -154,7 +176,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent_warn" {
period = var.settings.disk_used_percent_warn.period
statistic = var.settings.disk_used_percent_warn.statistic
threshold = var.settings.disk_used_percent_warn.threshold
alarm_description = "EC2:disk_used_percent"
# alarm_description = "EC2:disk_used_percent"
alarm_description = local.alarm-message
namespace = "CWAgent"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
@ -172,7 +195,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent_crit" {
period = var.settings.disk_used_percent_crit.period
statistic = var.settings.disk_used_percent_crit.statistic
threshold = var.settings.disk_used_percent_crit.threshold
alarm_description = "EC2:disk_used_percent"
# alarm_description = "EC2:disk_used_percent"
alarm_description = local.alarm-message
namespace = "CWAgent"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
@ -190,7 +214,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" {
period = var.settings.disk_inodes_free.period
statistic = var.settings.disk_inodes_free.statistic
threshold = var.settings.disk_inodes_free.threshold
alarm_description = "EC2:disk_inodes_free"
# alarm_description = "EC2:disk_inodes_free"
alarm_description = local.alarm-message
namespace = "CWAgent"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
@ -209,7 +234,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-processes_total" {
period = var.settings.processes_total.period
statistic = var.settings.processes_total.statistic
threshold = var.settings.processes_total.threshold
alarm_description = "EC2:processes_total"
# alarm_description = "EC2:processes_total"
alarm_description = local.alarm-message
namespace = "CWAgent"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
@ -228,7 +254,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-net_err" {
comparison_operator = "GreaterThanThreshold"
evaluation_periods = var.settings.net_err_in.evaluation_periods
threshold = 0
alarm_description = "EC2:net_err_in or EC2:net_err_out exceeds threshold"
# alarm_description = "EC2:net_err_in or EC2:net_err_out exceeds threshold"
alarm_description = local.alarm-message
insufficient_data_actions = []
actions_enabled = false
alarm_actions = [var.settings.net_err_in.action]
@ -276,7 +303,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-net_err" {
}
resource "aws_cloudwatch_metric_alarm" "ec2-NetworkIn" {
count = try(var.settings.NetworkIn.monitor,false) ? 1 : 0
count = try(var.settings.NetworkIn.monitor, false) ? 1 : 0
alarm_name = "${var.settings.NetworkIn.ecccode}-EC2_${var.ec2-instance-id}-NetworkIn"
comparison_operator = var.settings.NetworkIn.comparison_operator
evaluation_periods = var.settings.NetworkIn.evaluation_periods
@ -284,7 +311,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-NetworkIn" {
period = var.settings.NetworkIn.period
statistic = var.settings.NetworkIn.statistic
threshold = var.settings.NetworkIn.threshold
alarm_description = "EC2:NetworkIn"
# alarm_description = "EC2:NetworkIn"
alarm_description = local.alarm-message
namespace = "AWS/EC2"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
@ -296,7 +324,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2-NetworkIn" {
}
resource "aws_cloudwatch_metric_alarm" "ec2-NetworkOut" {
count = try(var.settings.NetworkIn.monitor,false) ? 1 : 0
count = try(var.settings.NetworkIn.monitor, false) ? 1 : 0
alarm_name = "${var.settings.NetworkOut.ecccode}-EC2_${var.ec2-instance-id}-NetworkOut"
comparison_operator = var.settings.NetworkOut.comparison_operator
evaluation_periods = var.settings.NetworkOut.evaluation_periods
@ -304,7 +332,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-NetworkOut" {
period = var.settings.NetworkOut.period
statistic = var.settings.NetworkOut.statistic
threshold = var.settings.NetworkOut.threshold
alarm_description = "EC2:NetworkOut"
# alarm_description = "EC2:NetworkOut"
alarm_description = local.alarm-message
namespace = "AWS/EC2"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
@ -325,7 +354,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-MemoryCommittedPct" {
period = var.settings.MemoryCommittedPct.period
statistic = var.settings.MemoryCommittedPct.statistic
threshold = var.settings.MemoryCommittedPct.threshold
alarm_description = "EC2:MemoryCommittedBytes"
# alarm_description = "EC2:MemoryCommittedBytes"
alarm_description = local.alarm-message
namespace = "CWAgent"
insufficient_data_actions = []
actions_enabled = var.actions-enabled
@ -348,7 +378,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-LogicalDiskFreePct" {
period = var.settings.LogicalDiskFreePct.period
statistic = var.settings.LogicalDiskFreePct.statistic
threshold = var.settings.LogicalDiskFreePct.threshold
alarm_description = "EC2:OsDiskFreePct"
# alarm_description = "EC2:OsDiskFreePct"
alarm_description = local.alarm-message
namespace = "CWAgent"
insufficient_data_actions = []
actions_enabled = var.actions-enabled