# EMR (HBase + Phoenix) cluster on instance fleets, together with its S3 log
# bucket, the KMS key and EMR security configuration used for at-rest
# encryption, and Name tags for the master/core EC2 instances.

locals {
  # Shared prefix for every resource name created in this file.
  name = "${var.environment}-${var.customer-name}"
}

module "emr" {
  source  = "terraform-aws-modules/emr/aws"
  version = "1.2.0"

  name                        = "${local.name}-emr"
  release_label               = "emr-7.0.0"
  security_configuration_name = aws_emr_security_configuration.security_config.name
  applications                = ["hbase", "phoenix"]

  auto_termination_policy = {
    idle_timeout = 3600
  }

  bootstrap_action = {}

  # HBase tuning: JVM heap options for the master / region servers and the
  # region server handler count.
  configurations_json = jsonencode([
    {
      Classification = "hbase-env"
      Configurations = [
        {
          Classification = "export"
          Properties = {
            "HBASE_MASTER_OPTS"       = "-Xmx4g"
            "HBASE_REGIONSERVER_OPTS" = "-Xmx8g"
          }
        }
      ]
      Properties = {}
    },
    {
      Classification = "hbase-site"
      Properties = {
        "hbase.regionserver.handler.count" = "300"
      }
    }
  ])

  # Single on-demand master node.
  master_instance_fleet = {
    name                      = "master-fleet"
    target_on_demand_capacity = 1
    instance_type_configs = [
      {
        instance_type = "c6g.xlarge"
        ebs_config = {
          size                 = 20
          type                 = "gp3"
          volumes_per_instance = 1
        }
      }
    ]
  }

  # Core fleet requests spot capacity only, choosing between two instance
  # types of equal weight.
  core_instance_fleet = {
    name                      = "core-fleet"
    target_on_demand_capacity = 0
    target_spot_capacity      = 1
    instance_type_configs = [
      {
        bid_price_as_percentage_of_on_demand_price = 70
        instance_type                              = "c6g.xlarge"
        weighted_capacity                          = 1
        ebs_config = {
          size                 = 20
          type                 = "gp3"
          volumes_per_instance = 1
        }
      },
      {
        bid_price_as_percentage_of_on_demand_price = 70
        instance_type                              = "m6g.xlarge"
        weighted_capacity                          = 1
        ebs_config = {
          size                 = 20
          type                 = "gp3"
          volumes_per_instance = 1
        }
      }
    ]
    launch_specifications = {
      spot_specification = {
        allocation_strategy      = "capacity-optimized"
        block_duration_minutes   = 0
        timeout_action           = "SWITCH_TO_ON_DEMAND"
        timeout_duration_minutes = 5
      }
    }
  }

  ebs_root_volume_size = 20

  # Subnets should be tagged with
  # { "for-use-with-amazon-emr-managed-policies" = true }
  # NOTE(review): subnet IDs, key pair name and VPC ID are hard-coded here;
  # consider passing them in as variables if this layer is reused elsewhere.
  ec2_attributes = {
    subnet_ids = ["subnet-08dec6787782ee087", "subnet-0551e96ffd016192a"]
    key_name   = "kf-key"
  }

  vpc_id = "vpc-01a10b033169f89a8"
  # Required for creating public cluster
  is_private_cluster = false

  keep_job_flow_alive_when_no_steps = true
  list_steps_states = [
    "PENDING",
    "RUNNING",
    "CANCEL_PENDING",
    "CANCELLED",
    "FAILED",
    "INTERRUPTED",
    "COMPLETED",
  ]
  log_uri = "s3n://${module.s3_bucket.s3_bucket_id}/"

  scale_down_behavior    = "TERMINATE_AT_TASK_COMPLETION"
  step_concurrency_level = 3
  termination_protection = false
  visible_to_all_users   = true

  # NOTE(review): PowerUserAccess is attached to both the service role and the
  # instance profile in addition to the EMR-specific managed policies; confirm
  # that this breadth of access is intended.
  service_iam_role_policies = {
    AmazonEMRServicePolicy_v2 = "arn:aws:iam::aws:policy/service-role/AmazonEMRServicePolicy_v2"
    PowerUser                 = "arn:aws:iam::aws:policy/PowerUserAccess"
  }

  iam_instance_profile_policies = {
    AmazonElasticMapReduceforEC2Role = "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforEC2Role"
    PowerUser                        = "arn:aws:iam::aws:policy/PowerUserAccess"
  }

  # Use managed scaling policy to refill spot instances
  managed_scaling_policy = {
    unit_type                       = "InstanceFleetUnits"
    minimum_capacity_units          = 1
    maximum_capacity_units          = 4
    maximum_ondemand_capacity_units = 0
    maximum_core_capacity_units     = 4
  }
}

# Random suffix appended to the log bucket name.
resource "random_id" "this" {
  byte_length = 2
}

# Bucket that receives the EMR cluster logs (see log_uri on module.emr).
module "s3_bucket" {
  source  = "terraform-aws-modules/s3-bucket/aws"
  version = "~> 3.0"

  bucket = "${local.name}-emrlogs-${random_id.this.dec}"

  # Allow deletion of non-empty bucket
  # Example usage only - not recommended for production
  force_destroy = true

  attach_deny_insecure_transport_policy = true
  attach_require_latest_tls_policy      = true

  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true

  server_side_encryption_configuration = {
    rule = {
      apply_server_side_encryption_by_default = {
        sse_algorithm = "AES256"
      }
    }
  }
}

# Customer-managed key referenced by the EMR security configuration for local
# disk / EBS encryption.
resource "aws_kms_key" "ebs" {
  description             = "KMS key for EBS volumes"
  deletion_window_in_days = 7

  # Rotate the key material automatically instead of keeping the same
  # material for the lifetime of the key.
  enable_key_rotation = true
}

# EMR security configuration: at-rest encryption on (KMS for local disks and
# EBS, SSE-S3 for S3), in-transit encryption off, and instance metadata
# service settings.
resource "aws_emr_security_configuration" "security_config" {
  name = "${local.name}-emr-security-config"

  configuration = jsonencode(
    {
      EncryptionConfiguration = {
        AtRestEncryptionConfiguration = {
          LocalDiskEncryptionConfiguration = {
            AwsKmsKey                 = aws_kms_key.ebs.arn
            EnableEbsEncryption       = true
            EncryptionKeyProviderType = "AwsKms"
          }
          S3EncryptionConfiguration = {
            EncryptionMode = "SSE-S3"
          }
        }
        EnableAtRestEncryption    = true
        EnableInTransitEncryption = false
      }
      InstanceMetadataServiceConfiguration = {
        HttpPutResponseHopLimit               = 1
        MinimumInstanceMetadataServiceVersion = 2
      }
    }
  )
}

# Tag EMR master and core instances.
# This layer needs to be applied twice to set the instance tags: the lookups
# below only return instances that are already running, so the first apply
# (which creates the cluster) finds nothing to tag.
# Adding depends_on results in a dependency loop, hence the commented-out
# depends_on lines.
# NOTE(review): the lookups filter only on the EMR role tag and instance
# state, not on this cluster's ID, so they appear to match MASTER/CORE
# instances of any EMR cluster visible to the provider - confirm no other
# cluster shares this account/region.
data "aws_instances" "master_instances" {
  # depends_on = [module.emr]
  instance_tags = {
    "aws:elasticmapreduce:instance-group-role" = "MASTER"
  }
  instance_state_names = ["running"]
}

data "aws_instances" "core_instances" {
  # depends_on = [module.emr]
  instance_tags = {
    "aws:elasticmapreduce:instance-group-role" = "CORE"
  }
  instance_state_names = ["running"]
}

# One Name tag per core instance; IDs are sorted so that the index-to-instance
# mapping does not depend on the order returned by the data source.
resource "aws_ec2_tag" "tag-emr-core-instances" {
  # depends_on = [data.aws_instances.core_instances]
  count       = length(data.aws_instances.core_instances.ids)
  resource_id = sort(data.aws_instances.core_instances.ids)[count.index]
  key         = "Name"
  value       = "${local.name}-emr-core-${count.index + 1}"
}

# One Name tag per master instance, numbered the same way as the core tags.
resource "aws_ec2_tag" "tag-emr-master-instances" {
  # depends_on = [data.aws_instances.master_instances]
  count       = length(data.aws_instances.master_instances.ids)
  resource_id = sort(data.aws_instances.master_instances.ids)[count.index]
  key         = "Name"
  value       = "${local.name}-emr-master-${count.index + 1}"
}