UPD: various updates on cloudwatch monitoring from upstream
This commit is contained in:
parent
436b799ff1
commit
1fe92a3f78
@ -43,14 +43,18 @@ No modules.
|
|||||||
| cwl-region | AWS region where Cloudwatch LogGroup resides. Needed for setting up cwlog-stream-role | `string` | n/a | yes |
|
| cwl-region | AWS region where Cloudwatch LogGroup resides. Needed for setting up cwlog-stream-role | `string` | n/a | yes |
|
||||||
| dest-bucket-arn | Destination S3 bucket ARN | `string` | n/a | yes |
|
| dest-bucket-arn | Destination S3 bucket ARN | `string` | n/a | yes |
|
||||||
| dest-bucket-kmskey-arn | KMS key ARN for destination bucket | `string` | n/a | yes |
|
| dest-bucket-kmskey-arn | KMS key ARN for destination bucket | `string` | n/a | yes |
|
||||||
| dest-bucket-prefix | S3 object prefix for this stream | `string` | n/a | yes |
|
| dest-bucket-prefix | S3 object prefix for this stream. The prefix must not start with a `/` and must end with a `/`. For example, `r53-log/acme.local/` | `string` | n/a | yes |
|
||||||
|
| enable-firehose-errorlog | Enable firehose errorlog | `bool` | `false` | no |
|
||||||
| firehose-kmskey-arn | KMS Key arn for Firehose | `string` | n/a | yes |
|
| firehose-kmskey-arn | KMS Key arn for Firehose | `string` | n/a | yes |
|
||||||
| source-cwlgroup-name | Name of source CloudwatchLog group | `string` | n/a | yes |
|
| source-cwlgroup-name | Name of source CloudwatchLog group | `string` | n/a | yes |
|
||||||
| stream-name | Name of Kinesis Data Firehose delivery stream | `string` | n/a | yes |
|
| stream-name | Name of Kinesis Data Firehose delivery stream | `string` | n/a | yes |
|
||||||
|
|
||||||
## Outputs
|
## Outputs
|
||||||
|
|
||||||
No outputs.
|
| Name | Description |
|
||||||
|
|------|-------------|
|
||||||
|
| cloudwatchstream-iam-role-arn | n/a |
|
||||||
|
| firehose-iam-role-arn | n/a |
|
||||||
|
|
||||||
---
|
---
|
||||||
## Authorship
|
## Authorship
|
||||||
|
7
modules/ManagementGovernance/Cwl-firehose-s3/outputs.tf
Normal file
7
modules/ManagementGovernance/Cwl-firehose-s3/outputs.tf
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
# ARN of the IAM role resource aws_iam_role.firehose-stream-iam-role,
# exported so callers of this module can reference it.
output "firehose-iam-role-arn" {
  value = aws_iam_role.firehose-stream-iam-role.arn
}
|
||||||
|
|
||||||
|
# ARN of the IAM role resource aws_iam_role.cwlog-stream-role,
# exported so callers of this module can reference it.
output "cloudwatchstream-iam-role-arn" {
  value = aws_iam_role.cwlog-stream-role.arn
}
|
@ -29,3 +29,46 @@ module "ec2-monitoring" {
|
|||||||
sns-targets = var.sns-targets
|
sns-targets = var.sns-targets
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Sample cloudwatch alarm email notification
|
||||||
|
```
|
||||||
|
Subject: ALARM: "TestAlarmPleaseIgnore" in Asia Pacific (Hong Kong)
|
||||||
|
|
||||||
|
You are receiving this email because your Amazon CloudWatch Alarm "TestAlarmPleaseIgnore" in the
|
||||||
|
Asia Pacific (Hong Kong) region has entered the ALARM state, because "Threshold Crossed: 1 out of
|
||||||
|
the last 1 datapoints [864.0 (24/01/24 00:56:00)] was less than or equal to the threshold (900.0)
|
||||||
|
(minimum 1 datapoint for OK -> ALARM transition)." at "Wednesday 24 January, 2024 01:01:34 UTC".
|
||||||
|
|
||||||
|
View this alarm in the AWS Management Console:
|
||||||
|
https://ap-east-1.console.aws.amazon.com/cloudwatch...
|
||||||
|
|
||||||
|
Alarm Details:
|
||||||
|
- Name: TestAlarmPleaseIgnore
|
||||||
|
- Description: Cloudwatch alarm for the following resource
|
||||||
|
- Instance ID: xxx
|
||||||
|
- Instance Name: yyy
|
||||||
|
- Instance IP: zz.zz.zz.zz
|
||||||
|
- State Change: OK -> ALARM
|
||||||
|
- Reason for State Change: Threshold Crossed: 1 out of the last 1 datapoints [864.0 (24/01/24 00:56:00)] was less than or equal to the threshold (900.0) (minimum 1 datapoint for OK -> ALARM transition).
|
||||||
|
- Timestamp: Wednesday 24 January, 2024 01:01:34 UTC
|
||||||
|
- AWS Account: 111122223333
|
||||||
|
- Alarm Arn: arn:aws:cloudwatch:ap-east-1:111122223333:alarm:TestAlarmPleaseIgnore
|
||||||
|
|
||||||
|
Threshold:
|
||||||
|
- The alarm is in the ALARM state when the metric is LessThanOrEqualToThreshold 900.0 for at least 1 of the last 1 period(s) of 300 seconds.
|
||||||
|
|
||||||
|
Monitored Metric:
|
||||||
|
- MetricNamespace: AWS/EC2
|
||||||
|
- MetricName: CPUCreditBalance
|
||||||
|
- Dimensions: [InstanceId = i-050d4adeafaa53cd0]
|
||||||
|
- Period: 300 seconds
|
||||||
|
- Statistic: Average
|
||||||
|
- Unit: not specified
|
||||||
|
- TreatMissingData: missing
|
||||||
|
|
||||||
|
|
||||||
|
State Change Actions:
|
||||||
|
- OK:
|
||||||
|
- ALARM: [arn:aws:sns:ap-east-1:111122223333:CWA-SNS-Email-KenFong]
|
||||||
|
- INSUFFICIENT_DATA:
|
||||||
|
```
|
@ -1,3 +1,14 @@
|
|||||||
|
locals {
  # Shared description text used as alarm_description by the alarms in this module.
  # alarm-message limited to 1024 characters
  #
  # The Name tag is read with lookup() and a fallback: indexing tags["Name"]
  # directly fails the plan with "Invalid index" when the instance has no
  # Name tag.
  alarm-message = <<EOF
Cloudwatch alarm for the following resource
- Instance ID: ${var.ec2-instance-id}
- Instance Name: ${lookup(data.aws_instance.ec2-instance.tags, "Name", "n/a")}
- Instance IP: ${data.aws_instance.ec2-instance.private_ip}
- Instance Type: ${data.aws_instance.ec2-instance.instance_type}
EOF
}
|
||||||
|
|
||||||
resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
|
resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
|
||||||
alarm_name = "${var.settings.StatusCheckFailed_System.ecccode}-EC2_${var.ec2-instance-id}-StatusCheckFailed_System"
|
alarm_name = "${var.settings.StatusCheckFailed_System.ecccode}-EC2_${var.ec2-instance-id}-StatusCheckFailed_System"
|
||||||
comparison_operator = var.settings.StatusCheckFailed_System.comparison_operator
|
comparison_operator = var.settings.StatusCheckFailed_System.comparison_operator
|
||||||
@ -6,7 +17,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
|
|||||||
period = var.settings.StatusCheckFailed_System.period
|
period = var.settings.StatusCheckFailed_System.period
|
||||||
statistic = var.settings.StatusCheckFailed_System.statistic
|
statistic = var.settings.StatusCheckFailed_System.statistic
|
||||||
threshold = var.settings.StatusCheckFailed_System.threshold
|
threshold = var.settings.StatusCheckFailed_System.threshold
|
||||||
alarm_description = "EC2:StatusCheckFailed_System"
|
# alarm_description = "EC2:StatusCheckFailed_System"
|
||||||
|
alarm_description = local.alarm-message
|
||||||
namespace = "AWS/EC2"
|
namespace = "AWS/EC2"
|
||||||
insufficient_data_actions = []
|
insufficient_data_actions = []
|
||||||
actions_enabled = var.actions-enabled
|
actions_enabled = var.actions-enabled
|
||||||
@ -25,7 +37,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" {
|
|||||||
period = var.settings.StatusCheckFailed_Instance.period
|
period = var.settings.StatusCheckFailed_Instance.period
|
||||||
statistic = var.settings.StatusCheckFailed_Instance.statistic
|
statistic = var.settings.StatusCheckFailed_Instance.statistic
|
||||||
threshold = var.settings.StatusCheckFailed_Instance.threshold
|
threshold = var.settings.StatusCheckFailed_Instance.threshold
|
||||||
alarm_description = "EC2:StatusCheckFailed_Instance"
|
# alarm_description = "EC2:StatusCheckFailed_Instance"
|
||||||
|
alarm_description = local.alarm-message
|
||||||
namespace = "AWS/EC2"
|
namespace = "AWS/EC2"
|
||||||
insufficient_data_actions = []
|
insufficient_data_actions = []
|
||||||
actions_enabled = var.actions-enabled
|
actions_enabled = var.actions-enabled
|
||||||
@ -44,7 +57,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-CPUUtilization" {
|
|||||||
period = var.settings.CPUUtilization.period
|
period = var.settings.CPUUtilization.period
|
||||||
statistic = var.settings.CPUUtilization.statistic
|
statistic = var.settings.CPUUtilization.statistic
|
||||||
threshold = var.settings.CPUUtilization.threshold
|
threshold = var.settings.CPUUtilization.threshold
|
||||||
alarm_description = "EC2:CPUUtilization"
|
# alarm_description = "EC2:CPUUtilization"
|
||||||
|
alarm_description = local.alarm-message
|
||||||
namespace = "AWS/EC2"
|
namespace = "AWS/EC2"
|
||||||
insufficient_data_actions = []
|
insufficient_data_actions = []
|
||||||
actions_enabled = var.actions-enabled
|
actions_enabled = var.actions-enabled
|
||||||
@ -61,6 +75,12 @@ data "aws_instance" "ec2-instance" {
|
|||||||
instance_id = var.ec2-instance-id
|
instance_id = var.ec2-instance-id
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# put instance name or ip in alarm name
locals {
  # Private IP address reported by the aws_instance data source.
  instance-ip = data.aws_instance.ec2-instance.private_ip

  # Value of the instance's Name tag. Falls back to the instance ID when the
  # tag is absent: indexing tags["Name"] directly fails the plan with
  # "Invalid index" for untagged instances.
  instance-name = lookup(data.aws_instance.ec2-instance.tags, "Name", var.ec2-instance-id)
}
|
||||||
|
|
||||||
module "ec2_os" {
|
module "ec2_os" {
|
||||||
source = "../../util/awscli"
|
source = "../../util/awscli"
|
||||||
access_key = var.target-account-ak
|
access_key = var.target-account-ak
|
||||||
@ -90,7 +110,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-mem_used_percent" {
|
|||||||
period = var.settings.mem_used_percent.period
|
period = var.settings.mem_used_percent.period
|
||||||
statistic = var.settings.mem_used_percent.statistic
|
statistic = var.settings.mem_used_percent.statistic
|
||||||
threshold = var.settings.mem_used_percent.threshold
|
threshold = var.settings.mem_used_percent.threshold
|
||||||
alarm_description = "EC2:mem_used_percent"
|
# alarm_description = "EC2:mem_used_percent"
|
||||||
|
alarm_description = local.alarm-message
|
||||||
namespace = "CWAgent"
|
namespace = "CWAgent"
|
||||||
insufficient_data_actions = []
|
insufficient_data_actions = []
|
||||||
actions_enabled = var.actions-enabled
|
actions_enabled = var.actions-enabled
|
||||||
@ -132,7 +153,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-swap_used_percent" {
|
|||||||
period = var.settings.swap_used_percent.period
|
period = var.settings.swap_used_percent.period
|
||||||
statistic = var.settings.swap_used_percent.statistic
|
statistic = var.settings.swap_used_percent.statistic
|
||||||
threshold = var.settings.swap_used_percent.threshold
|
threshold = var.settings.swap_used_percent.threshold
|
||||||
alarm_description = "EC2:swap_used_percent"
|
# alarm_description = "EC2:swap_used_percent"
|
||||||
|
alarm_description = local.alarm-message
|
||||||
namespace = "CWAgent"
|
namespace = "CWAgent"
|
||||||
insufficient_data_actions = []
|
insufficient_data_actions = []
|
||||||
actions_enabled = var.actions-enabled
|
actions_enabled = var.actions-enabled
|
||||||
@ -154,7 +176,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent_warn" {
|
|||||||
period = var.settings.disk_used_percent_warn.period
|
period = var.settings.disk_used_percent_warn.period
|
||||||
statistic = var.settings.disk_used_percent_warn.statistic
|
statistic = var.settings.disk_used_percent_warn.statistic
|
||||||
threshold = var.settings.disk_used_percent_warn.threshold
|
threshold = var.settings.disk_used_percent_warn.threshold
|
||||||
alarm_description = "EC2:disk_used_percent"
|
# alarm_description = "EC2:disk_used_percent"
|
||||||
|
alarm_description = local.alarm-message
|
||||||
namespace = "CWAgent"
|
namespace = "CWAgent"
|
||||||
insufficient_data_actions = []
|
insufficient_data_actions = []
|
||||||
actions_enabled = var.actions-enabled
|
actions_enabled = var.actions-enabled
|
||||||
@ -172,7 +195,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent_crit" {
|
|||||||
period = var.settings.disk_used_percent_crit.period
|
period = var.settings.disk_used_percent_crit.period
|
||||||
statistic = var.settings.disk_used_percent_crit.statistic
|
statistic = var.settings.disk_used_percent_crit.statistic
|
||||||
threshold = var.settings.disk_used_percent_crit.threshold
|
threshold = var.settings.disk_used_percent_crit.threshold
|
||||||
alarm_description = "EC2:disk_used_percent"
|
# alarm_description = "EC2:disk_used_percent"
|
||||||
|
alarm_description = local.alarm-message
|
||||||
namespace = "CWAgent"
|
namespace = "CWAgent"
|
||||||
insufficient_data_actions = []
|
insufficient_data_actions = []
|
||||||
actions_enabled = var.actions-enabled
|
actions_enabled = var.actions-enabled
|
||||||
@ -190,7 +214,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" {
|
|||||||
period = var.settings.disk_inodes_free.period
|
period = var.settings.disk_inodes_free.period
|
||||||
statistic = var.settings.disk_inodes_free.statistic
|
statistic = var.settings.disk_inodes_free.statistic
|
||||||
threshold = var.settings.disk_inodes_free.threshold
|
threshold = var.settings.disk_inodes_free.threshold
|
||||||
alarm_description = "EC2:disk_inodes_free"
|
# alarm_description = "EC2:disk_inodes_free"
|
||||||
|
alarm_description = local.alarm-message
|
||||||
namespace = "CWAgent"
|
namespace = "CWAgent"
|
||||||
insufficient_data_actions = []
|
insufficient_data_actions = []
|
||||||
actions_enabled = var.actions-enabled
|
actions_enabled = var.actions-enabled
|
||||||
@ -209,7 +234,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-processes_total" {
|
|||||||
period = var.settings.processes_total.period
|
period = var.settings.processes_total.period
|
||||||
statistic = var.settings.processes_total.statistic
|
statistic = var.settings.processes_total.statistic
|
||||||
threshold = var.settings.processes_total.threshold
|
threshold = var.settings.processes_total.threshold
|
||||||
alarm_description = "EC2:processes_total"
|
# alarm_description = "EC2:processes_total"
|
||||||
|
alarm_description = local.alarm-message
|
||||||
namespace = "CWAgent"
|
namespace = "CWAgent"
|
||||||
insufficient_data_actions = []
|
insufficient_data_actions = []
|
||||||
actions_enabled = var.actions-enabled
|
actions_enabled = var.actions-enabled
|
||||||
@ -228,7 +254,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-net_err" {
|
|||||||
comparison_operator = "GreaterThanThreshold"
|
comparison_operator = "GreaterThanThreshold"
|
||||||
evaluation_periods = var.settings.net_err_in.evaluation_periods
|
evaluation_periods = var.settings.net_err_in.evaluation_periods
|
||||||
threshold = 0
|
threshold = 0
|
||||||
alarm_description = "EC2:net_err_in or EC2:net_err_out exceeds threshold"
|
# alarm_description = "EC2:net_err_in or EC2:net_err_out exceeds threshold"
|
||||||
|
alarm_description = local.alarm-message
|
||||||
insufficient_data_actions = []
|
insufficient_data_actions = []
|
||||||
actions_enabled = false
|
actions_enabled = false
|
||||||
alarm_actions = [var.settings.net_err_in.action]
|
alarm_actions = [var.settings.net_err_in.action]
|
||||||
@ -284,7 +311,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-NetworkIn" {
|
|||||||
period = var.settings.NetworkIn.period
|
period = var.settings.NetworkIn.period
|
||||||
statistic = var.settings.NetworkIn.statistic
|
statistic = var.settings.NetworkIn.statistic
|
||||||
threshold = var.settings.NetworkIn.threshold
|
threshold = var.settings.NetworkIn.threshold
|
||||||
alarm_description = "EC2:NetworkIn"
|
# alarm_description = "EC2:NetworkIn"
|
||||||
|
alarm_description = local.alarm-message
|
||||||
namespace = "AWS/EC2"
|
namespace = "AWS/EC2"
|
||||||
insufficient_data_actions = []
|
insufficient_data_actions = []
|
||||||
actions_enabled = var.actions-enabled
|
actions_enabled = var.actions-enabled
|
||||||
@ -304,7 +332,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-NetworkOut" {
|
|||||||
period = var.settings.NetworkOut.period
|
period = var.settings.NetworkOut.period
|
||||||
statistic = var.settings.NetworkOut.statistic
|
statistic = var.settings.NetworkOut.statistic
|
||||||
threshold = var.settings.NetworkOut.threshold
|
threshold = var.settings.NetworkOut.threshold
|
||||||
alarm_description = "EC2:NetworkOut"
|
# alarm_description = "EC2:NetworkOut"
|
||||||
|
alarm_description = local.alarm-message
|
||||||
namespace = "AWS/EC2"
|
namespace = "AWS/EC2"
|
||||||
insufficient_data_actions = []
|
insufficient_data_actions = []
|
||||||
actions_enabled = var.actions-enabled
|
actions_enabled = var.actions-enabled
|
||||||
@ -325,7 +354,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-MemoryCommittedPct" {
|
|||||||
period = var.settings.MemoryCommittedPct.period
|
period = var.settings.MemoryCommittedPct.period
|
||||||
statistic = var.settings.MemoryCommittedPct.statistic
|
statistic = var.settings.MemoryCommittedPct.statistic
|
||||||
threshold = var.settings.MemoryCommittedPct.threshold
|
threshold = var.settings.MemoryCommittedPct.threshold
|
||||||
alarm_description = "EC2:MemoryCommittedBytes"
|
# alarm_description = "EC2:MemoryCommittedBytes"
|
||||||
|
alarm_description = local.alarm-message
|
||||||
namespace = "CWAgent"
|
namespace = "CWAgent"
|
||||||
insufficient_data_actions = []
|
insufficient_data_actions = []
|
||||||
actions_enabled = var.actions-enabled
|
actions_enabled = var.actions-enabled
|
||||||
@ -348,7 +378,8 @@ resource "aws_cloudwatch_metric_alarm" "ec2-LogicalDiskFreePct" {
|
|||||||
period = var.settings.LogicalDiskFreePct.period
|
period = var.settings.LogicalDiskFreePct.period
|
||||||
statistic = var.settings.LogicalDiskFreePct.statistic
|
statistic = var.settings.LogicalDiskFreePct.statistic
|
||||||
threshold = var.settings.LogicalDiskFreePct.threshold
|
threshold = var.settings.LogicalDiskFreePct.threshold
|
||||||
alarm_description = "EC2:OsDiskFreePct"
|
# alarm_description = "EC2:OsDiskFreePct"
|
||||||
|
alarm_description = local.alarm-message
|
||||||
namespace = "CWAgent"
|
namespace = "CWAgent"
|
||||||
insufficient_data_actions = []
|
insufficient_data_actions = []
|
||||||
actions_enabled = var.actions-enabled
|
actions_enabled = var.actions-enabled
|
||||||
|
Loading…
Reference in New Issue
Block a user