terraform.aws-baseline-infra/examples/emr/main.tf

# Shared naming prefix used by every resource in this example:
# "<environment>-<customer-name>".
locals {
  name = format("%s-%s", var.environment, var.customer-name)
}

# EMR cluster running HBase and Phoenix on instance fleets: a single
# On-Demand master node plus a Spot-only core fleet, with a managed scaling
# policy used to refill Spot capacity.
module "emr" {
  source  = "terraform-aws-modules/emr/aws"
  version = "1.2.0"

  # --- Identity and software ------------------------------------------------
  name          = "${local.name}-emr"
  release_label = "emr-7.0.0"
  applications  = ["hbase", "phoenix"]

  # Encryption / IMDS settings come from the security configuration resource
  # declared in this file.
  security_configuration_name = aws_emr_security_configuration.security_config.name

  # No bootstrap actions are configured.
  bootstrap_action = {}

  # HBase tuning: JVM heap options for master / region servers and the
  # region server handler count.
  configurations_json = jsonencode([
    {
      Classification = "hbase-env"
      Properties     = {}
      Configurations = [
        {
          Classification = "export"
          Properties = {
            HBASE_MASTER_OPTS       = "-Xmx4g"
            HBASE_REGIONSERVER_OPTS = "-Xmx8g"
          }
        }
      ]
    },
    {
      Classification = "hbase-site"
      Properties = {
        "hbase.regionserver.handler.count" = "300"
      }
    }
  ])

  # --- Cluster lifecycle and step handling ------------------------------------
  keep_job_flow_alive_when_no_steps = true
  termination_protection            = false
  visible_to_all_users              = true
  scale_down_behavior               = "TERMINATE_AT_TASK_COMPLETION"
  step_concurrency_level            = 3

  # Idle auto-termination threshold (presumably seconds, i.e. one hour;
  # confirm against the EMR auto-termination policy documentation).
  auto_termination_policy = {
    idle_timeout = 3600
  }

  list_steps_states = [
    "PENDING",
    "RUNNING",
    "CANCEL_PENDING",
    "CANCELLED",
    "FAILED",
    "INTERRUPTED",
    "COMPLETED",
  ]

  # Cluster logs are written to the bucket created by module.s3_bucket.
  log_uri = "s3n://${module.s3_bucket.s3_bucket_id}/"

  # --- Master fleet: one On-Demand instance -----------------------------------
  master_instance_fleet = {
    name                      = "master-fleet"
    target_on_demand_capacity = 1

    instance_type_configs = [
      {
        instance_type = "c6g.xlarge"
        ebs_config = {
          type                 = "gp3"
          size                 = 20
          volumes_per_instance = 1
        }
      }
    ]
  }

  # --- Core fleet: Spot only, two candidate instance types --------------------
  core_instance_fleet = {
    name                      = "core-fleet"
    target_on_demand_capacity = 0
    target_spot_capacity      = 1

    instance_type_configs = [
      {
        instance_type                              = "c6g.xlarge"
        weighted_capacity                          = 1
        bid_price_as_percentage_of_on_demand_price = 70
        ebs_config = {
          type                 = "gp3"
          size                 = 20
          volumes_per_instance = 1
        }
      },
      {
        instance_type                              = "m6g.xlarge"
        weighted_capacity                          = 1
        bid_price_as_percentage_of_on_demand_price = 70
        ebs_config = {
          type                 = "gp3"
          size                 = 20
          volumes_per_instance = 1
        }
      }
    ]

    # If Spot capacity is not provisioned within the timeout, the fleet is
    # configured to switch to On-Demand.
    launch_specifications = {
      spot_specification = {
        allocation_strategy      = "capacity-optimized"
        block_duration_minutes   = 0
        timeout_duration_minutes = 5
        timeout_action           = "SWITCH_TO_ON_DEMAND"
      }
    }
  }

  # Use managed scaling policy to refill spot instances
  managed_scaling_policy = {
    unit_type                       = "InstanceFleetUnits"
    minimum_capacity_units          = 1
    maximum_capacity_units          = 4
    maximum_core_capacity_units     = 4
    maximum_ondemand_capacity_units = 0
  }

  # --- Storage and networking -------------------------------------------------
  ebs_root_volume_size = 20

  # Required for creating public cluster
  is_private_cluster = false

  vpc_id = "vpc-01a10b033169f89a8"

  # Subnets should be tagged with
  # { "for-use-with-amazon-emr-managed-policies" = true }
  ec2_attributes = {
    key_name   = "kf-key"
    subnet_ids = ["subnet-08dec6787782ee087", "subnet-0551e96ffd016192a"]
  }

  # --- IAM ----------------------------------------------------------------------
  # NOTE(review): PowerUserAccess is a very broad managed policy; it is kept
  # here unchanged, but consider replacing it with narrowly scoped permissions
  # outside of example usage.
  service_iam_role_policies = {
    AmazonEMRServicePolicy_v2 = "arn:aws:iam::aws:policy/service-role/AmazonEMRServicePolicy_v2"
    PowerUser                 = "arn:aws:iam::aws:policy/PowerUserAccess"
  }

  # NOTE(review): same remark about PowerUserAccess applies to the EC2
  # instance profile.
  iam_instance_profile_policies = {
    AmazonElasticMapReduceforEC2Role = "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforEC2Role"
    PowerUser                        = "arn:aws:iam::aws:policy/PowerUserAccess"
  }
}

# Random suffix for the EMR log bucket name; its decimal form
# (random_id.this.dec) is appended to the bucket name in module.s3_bucket.
resource "random_id" "this" {
  # Two random bytes (16 bits).
  byte_length = 2
}

# S3 bucket that receives the EMR cluster logs (referenced by log_uri in
# module.emr).
module "s3_bucket" {
  source  = "terraform-aws-modules/s3-bucket/aws"
  version = "~> 3.0"

  # "<name>-emrlogs-<random decimal suffix>"
  bucket = format("%s-emrlogs-%s", local.name, random_id.this.dec)

  # Allow deletion of non-empty bucket
  # Example usage only - not recommended for production
  force_destroy = true

  # Public access is blocked on every axis.
  block_public_acls       = true
  ignore_public_acls      = true
  block_public_policy     = true
  restrict_public_buckets = true

  # Bucket policy additions: refuse non-TLS requests and outdated TLS versions.
  attach_deny_insecure_transport_policy = true
  attach_require_latest_tls_policy      = true

  # Default server-side encryption with S3-managed keys.
  server_side_encryption_configuration = {
    rule = {
      apply_server_side_encryption_by_default = {
        sse_algorithm = "AES256"
      }
    }
  }
}

# Customer-managed KMS key used for local-disk / EBS encryption by the EMR
# security configuration in this file (referenced via aws_kms_key.ebs.arn).
resource "aws_kms_key" "ebs" {
  description = "KMS key for EBS volumes"

  # Waiting period before a scheduled key deletion takes effect.
  deletion_window_in_days = 7

  # Rotate the key material automatically. Rotation keeps the key ID/ARN
  # stable, so existing references and already-encrypted volumes keep working.
  enable_key_rotation = true
}

# EMR security configuration attached to the cluster in module.emr:
# at-rest encryption (KMS for local disks/EBS, SSE-S3 for S3 data) and
# instance metadata service settings. In-transit encryption is disabled.
resource "aws_emr_security_configuration" "security_config" {
  name = "${local.name}-emr-security-config"

  configuration = jsonencode({
    InstanceMetadataServiceConfiguration = {
      MinimumInstanceMetadataServiceVersion = 2
      HttpPutResponseHopLimit               = 1
    }

    EncryptionConfiguration = {
      EnableInTransitEncryption = false
      EnableAtRestEncryption    = true

      AtRestEncryptionConfiguration = {
        S3EncryptionConfiguration = {
          EncryptionMode = "SSE-S3"
        }

        # Local disks and EBS volumes are encrypted with the KMS key
        # declared in this file.
        LocalDiskEncryptionConfiguration = {
          EncryptionKeyProviderType = "AwsKms"
          AwsKmsKey                 = aws_kms_key.ebs.arn
          EnableEbsEncryption       = true
        }
      }
    }
  })
}

# Tag EMR master and core instances.
# This layer needs to be applied twice for the instance tags to be set.
# Per the original author, adding depends_on here results in a dependency
# loop, so it is intentionally left out.
# NOTE(review): the lookup filters only on the EMR role tag and instance
# state; it is not restricted to the cluster created by module.emr.
data "aws_instances" "master_instances" {
  instance_state_names = ["running"]

  instance_tags = {
    "aws:elasticmapreduce:instance-group-role" = "MASTER"
  }
}

# Running instances carrying the EMR "CORE" role tag.
# depends_on on module.emr is intentionally omitted (reported by the original
# author to cause a dependency loop).
# NOTE(review): the lookup is not restricted to the cluster created by
# module.emr; it matches on the role tag and instance state only.
data "aws_instances" "core_instances" {
  instance_state_names = ["running"]

  instance_tags = {
    "aws:elasticmapreduce:instance-group-role" = "CORE"
  }
}

# Name tag for each core instance found by data.aws_instances.core_instances:
# "<name>-emr-core-<n>", numbered from 1 in sorted instance-ID order.
resource "aws_ec2_tag" "tag-emr-core-instances" {
  # One tag resource per discovered instance.
  count = length(data.aws_instances.core_instances.ids)

  # IDs are sorted so that a given index maps to the same instance for as
  # long as the set of IDs does not change.
  resource_id = sort(data.aws_instances.core_instances.ids)[count.index]

  key   = "Name"
  value = format("%s-emr-core-%d", local.name, count.index + 1)
}

# Name tag for each master instance found by
# data.aws_instances.master_instances: "<name>-emr-master-<n>", numbered
# from 1 in sorted instance-ID order.
resource "aws_ec2_tag" "tag-emr-master-instances" {
  # One tag resource per discovered instance.
  count = length(data.aws_instances.master_instances.ids)

  # IDs are sorted so that a given index maps to the same instance for as
  # long as the set of IDs does not change.
  resource_id = sort(data.aws_instances.master_instances.ids)[count.index]

  key   = "Name"
  value = format("%s-emr-master-%d", local.name, count.index + 1)
}
NEW: working example of EMR with spot instances 2024-01-06 00:36:00 +08:00			`locals {`
			`name = "${var.environment}-${var.customer-name}"`
			`}`

			`module "emr" {`
			`source = "terraform-aws-modules/emr/aws"`
			`version = "1.2.0"`

			`name = "${local.name}-emr"`
			`release_label = "emr-7.0.0"`
			`security_configuration_name = aws_emr_security_configuration.security_config.name`
			`applications = ["hbase", "phoenix"]`
			`auto_termination_policy = {`
			`idle_timeout = 3600`
			`}`

			`bootstrap_action = {`
			`}`

			`configurations_json = jsonencode([`
			`{`
			`Classification : "hbase-env",`
			`Configurations : [`
			`{`
			`"Classification" : "export",`
			`"Properties" : {`
			`"HBASE_MASTER_OPTS" : "-Xmx4g",`
			`"HBASE_REGIONSERVER_OPTS" : "-Xmx8g"`
			`}`
			`}`
			`],`
			`Properties : {}`
			`},`
			`{`
			`Classification : "hbase-site",`
			`Properties : {`
			`"hbase.regionserver.handler.count" : "300"`
			`}`
			`}`
			`])`

			`master_instance_fleet = {`
			`name = "master-fleet"`
			`target_on_demand_capacity = 1`
			`instance_type_configs = [`
			`{`
			`instance_type = "c6g.xlarge"`
UPD: Enabled EBS encryption on EMR. Added managed scaling policy 2024-01-06 10:25:56 +08:00			`ebs_config = {`
			`size = 20`
			`type = "gp3"`
			`volumes_per_instance = 1`
			`}`
NEW: working example of EMR with spot instances 2024-01-06 00:36:00 +08:00			`}`
			`]`
			`}`

			`core_instance_fleet = {`
			`name = "core-fleet"`
			`target_on_demand_capacity = 0`
UPD: changed initial core fleet size from 2 to 1 to prevent match with managed scaling policy 2024-01-06 10:28:48 +08:00			`target_spot_capacity = 1`
NEW: working example of EMR with spot instances 2024-01-06 00:36:00 +08:00			`instance_type_configs = [`
			`{`
			`bid_price_as_percentage_of_on_demand_price = 70`
			`instance_type = "c6g.xlarge"`
			`weighted_capacity = 1`
			`ebs_config = {`
			`size = 20`
			`type = "gp3"`
			`volumes_per_instance = 1`
			`}`
			`},`
			`{`
			`bid_price_as_percentage_of_on_demand_price = 70`
			`instance_type = "m6g.xlarge"`
			`weighted_capacity = 1`
			`ebs_config = {`
			`size = 20`
			`type = "gp3"`
			`volumes_per_instance = 1`
			`}`
			`}`
			`]`
			`launch_specifications = {`
			`spot_specification = {`
			`allocation_strategy = "capacity-optimized"`
			`block_duration_minutes = 0`
			`timeout_action = "SWITCH_TO_ON_DEMAND"`
			`timeout_duration_minutes = 5`
			`}`
			`}`
			`}`

			`ebs_root_volume_size = 20`
			`# Subnets should be tagged with`
			`# { "for-use-with-amazon-emr-managed-policies" = true }`
			`ec2_attributes = {`
			`subnet_ids = ["subnet-08dec6787782ee087", "subnet-0551e96ffd016192a"]`
			`key_name = "kf-key"`
			`}`
			`vpc_id = "vpc-01a10b033169f89a8"`

			`# Required for creating public cluster`
			`is_private_cluster = false`

			`keep_job_flow_alive_when_no_steps = true`
			`list_steps_states = ["PENDING", "RUNNING", "CANCEL_PENDING", "CANCELLED", "FAILED", "INTERRUPTED", "COMPLETED"]`
			`log_uri = "s3n://${module.s3_bucket.s3_bucket_id}/"`

			`scale_down_behavior = "TERMINATE_AT_TASK_COMPLETION"`
			`step_concurrency_level = 3`
			`termination_protection = false`
			`visible_to_all_users = true`
			`service_iam_role_policies = {`
			`AmazonEMRServicePolicy_v2 = "arn:aws:iam::aws:policy/service-role/AmazonEMRServicePolicy_v2"`
			`PowerUser = "arn:aws:iam::aws:policy/PowerUserAccess"`
			`}`
UPD: Enabled EBS encryption on EMR. Added managed scaling policy 2024-01-06 10:25:56 +08:00			`iam_instance_profile_policies = {`
			`AmazonElasticMapReduceforEC2Role = "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforEC2Role"`
			`PowerUser = "arn:aws:iam::aws:policy/PowerUserAccess"`
			`}`
			`# Use managed scaling policy to refill spot instances`
			`managed_scaling_policy = {`
			`unit_type = "InstanceFleetUnits"`
			`minimum_capacity_units = 1`
			`maximum_capacity_units = 4`
			`maximum_ondemand_capacity_units = 0`
			`maximum_core_capacity_units = 4`
			`}`
NEW: working example of EMR with spot instances 2024-01-06 00:36:00 +08:00			`}`

			`resource "random_id" "this" {`
			`byte_length = 2`
			`}`

			`module "s3_bucket" {`
			`source = "terraform-aws-modules/s3-bucket/aws"`
			`version = "~> 3.0"`

			`bucket = "${local.name}-emrlogs-${random_id.this.dec}"`

			`# Allow deletion of non-empty bucket`
			`# Example usage only - not recommended for production`
			`force_destroy = true`

			`attach_deny_insecure_transport_policy = true`
			`attach_require_latest_tls_policy = true`

			`block_public_acls = true`
			`block_public_policy = true`
			`ignore_public_acls = true`
			`restrict_public_buckets = true`

			`server_side_encryption_configuration = {`
			`rule = {`
			`apply_server_side_encryption_by_default = {`
			`sse_algorithm = "AES256"`
			`}`
			`}`
			`}`
			`}`

UPD: Enabled EBS encryption on EMR. Added managed scaling policy 2024-01-06 10:25:56 +08:00			`resource "aws_kms_key" "ebs" {`
			`description = "KMS key for EBS volumes"`
			`deletion_window_in_days = 7`
			`}`
NEW: working example of EMR with spot instances 2024-01-06 00:36:00 +08:00
			`resource "aws_emr_security_configuration" "security_config" {`
			`name = "${local.name}-emr-security-config"`

			`configuration = jsonencode(`
			`{`
UPD: Enabled EBS encryption on EMR. Added managed scaling policy 2024-01-06 10:25:56 +08:00			`EncryptionConfiguration = {`
			`AtRestEncryptionConfiguration = {`
			`LocalDiskEncryptionConfiguration = {`
			`AwsKmsKey = aws_kms_key.ebs.arn`
			`EnableEbsEncryption = true`
			`EncryptionKeyProviderType = "AwsKms"`
			`}`
			`S3EncryptionConfiguration = {`
			`EncryptionMode = "SSE-S3"`
			`}`
			`}`
			`EnableAtRestEncryption = true`
			`EnableInTransitEncryption = false`
			`}`
			`InstanceMetadataServiceConfiguration = {`
			`HttpPutResponseHopLimit = 1`
			`MinimumInstanceMetadataServiceVersion = 2`
NEW: working example of EMR with spot instances 2024-01-06 00:36:00 +08:00			`}`
			`}`
			`)`
			`}`

			`# Tag EMR master and core instances`
			`# Need to run this layer twice to set instance tags`
			`# Adding depends_on will results in dependency loop`
			`data "aws_instances" "master_instances" {`
			`# depends_on = [module.emr]`
			`instance_tags = {`
			`"aws:elasticmapreduce:instance-group-role" = "MASTER"`
			`}`
			`instance_state_names = ["running"]`
			`}`

			`data "aws_instances" "core_instances" {`
			`# depends_on = [module.emr]`
			`instance_tags = {`
			`"aws:elasticmapreduce:instance-group-role" = "CORE"`
			`}`
			`instance_state_names = ["running"]`
			`}`

			`resource "aws_ec2_tag" "tag-emr-core-instances" {`
			`# depends_on = [data.aws_instances.core_instances]`
			`count = length(data.aws_instances.core_instances.ids)`
			`resource_id = sort(data.aws_instances.core_instances.ids)[count.index]`
			`key = "Name"`
			`value = "${local.name}-emr-core-${count.index + 1}"`
			`}`

			`resource "aws_ec2_tag" "tag-emr-master-instances" {`
			`# depends_on = [data.aws_instances.master_instances]`
			`count = length(data.aws_instances.master_instances.ids)`
			`resource_id = sort(data.aws_instances.master_instances.ids)[count.index]`
			`key = "Name"`
			`value = "${local.name}-emr-master-${count.index + 1}"`
			`}`