# terraform.aws-baseline-infra/examples/emr/main.tf

# Common "<environment>-<customer-name>" prefix reused in the names of the
# resources declared in this file.
locals {
  name = format("%s-%s", var.environment, var.customer-name)
}

# EMR cluster running HBase and Phoenix on instance fleets: a single on-demand
# master instance plus a spot-only core fleet governed by a managed scaling
# policy.
module "emr" {
  source  = "terraform-aws-modules/emr/aws"
  version = "1.2.0"

  # ---------------------------------------------------------------------------
  # Identity and software
  # ---------------------------------------------------------------------------
  name                        = "${local.name}-emr"
  release_label               = "emr-7.0.0"
  applications                = ["hbase", "phoenix"]
  security_configuration_name = aws_emr_security_configuration.security_config.name

  # No bootstrap actions are defined.
  bootstrap_action = {}

  # HBase tuning: JVM heap options for the master / region server processes
  # (hbase-env -> export) and the region server handler count (hbase-site).
  configurations_json = jsonencode([
    {
      Classification = "hbase-env"
      Properties     = {}
      Configurations = [
        {
          Classification = "export"
          Properties = {
            HBASE_MASTER_OPTS       = "-Xmx4g"
            HBASE_REGIONSERVER_OPTS = "-Xmx8g"
          }
        }
      ]
    },
    {
      Classification = "hbase-site"
      Properties = {
        "hbase.regionserver.handler.count" = "300"
      }
    }
  ])

  # ---------------------------------------------------------------------------
  # Instance fleets
  # ---------------------------------------------------------------------------
  ebs_root_volume_size = 20

  # Master fleet: one on-demand c6g.xlarge with a single 20 GiB gp3 volume.
  master_instance_fleet = {
    name                      = "master-fleet"
    target_on_demand_capacity = 1

    instance_type_configs = [
      {
        instance_type = "c6g.xlarge"
        ebs_config = {
          type                 = "gp3"
          size                 = 20
          volumes_per_instance = 1
        }
      }
    ]
  }

  # Core fleet: spot capacity only, chosen from two equally weighted instance
  # types, each bid at 70% of the on-demand price.
  core_instance_fleet = {
    name                      = "core-fleet"
    target_spot_capacity      = 1
    target_on_demand_capacity = 0

    instance_type_configs = [
      {
        instance_type                              = "c6g.xlarge"
        weighted_capacity                          = 1
        bid_price_as_percentage_of_on_demand_price = 70
        ebs_config = {
          type                 = "gp3"
          size                 = 20
          volumes_per_instance = 1
        }
      },
      {
        instance_type                              = "m6g.xlarge"
        weighted_capacity                          = 1
        bid_price_as_percentage_of_on_demand_price = 70
        ebs_config = {
          type                 = "gp3"
          size                 = 20
          volumes_per_instance = 1
        }
      }
    ]

    # If spot capacity is not obtained within the timeout, switch to on-demand.
    launch_specifications = {
      spot_specification = {
        allocation_strategy      = "capacity-optimized"
        timeout_action           = "SWITCH_TO_ON_DEMAND"
        timeout_duration_minutes = 5
        block_duration_minutes   = 0
      }
    }
  }

  # Use managed scaling policy to refill spot instances
  managed_scaling_policy = {
    unit_type                       = "InstanceFleetUnits"
    minimum_capacity_units          = 1
    maximum_capacity_units          = 4
    maximum_core_capacity_units     = 4
    maximum_ondemand_capacity_units = 0
  }

  # ---------------------------------------------------------------------------
  # Networking
  # ---------------------------------------------------------------------------
  # NOTE(review): the VPC, subnet IDs and key pair name are hard-coded; consider
  # exposing them as input variables if this example is reused elsewhere.
  vpc_id = "vpc-01a10b033169f89a8"

  # Subnets should be tagged with
  # { "for-use-with-amazon-emr-managed-policies" = true }
  ec2_attributes = {
    key_name   = "kf-key"
    subnet_ids = ["subnet-08dec6787782ee087", "subnet-0551e96ffd016192a"]
  }

  # Required for creating public cluster
  is_private_cluster = false

  # ---------------------------------------------------------------------------
  # IAM
  # ---------------------------------------------------------------------------
  # NOTE(review): PowerUserAccess is attached to both the service role and the
  # instance profile. That is far broader than the EMR managed policies next to
  # it - confirm whether it is really needed before using this outside a demo.
  service_iam_role_policies = {
    AmazonEMRServicePolicy_v2 = "arn:aws:iam::aws:policy/service-role/AmazonEMRServicePolicy_v2"
    PowerUser                 = "arn:aws:iam::aws:policy/PowerUserAccess"
  }
  iam_instance_profile_policies = {
    AmazonElasticMapReduceforEC2Role = "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforEC2Role"
    PowerUser                        = "arn:aws:iam::aws:policy/PowerUserAccess"
  }

  # ---------------------------------------------------------------------------
  # Lifecycle, steps and logging
  # ---------------------------------------------------------------------------
  # Cluster stays up when it has no steps; the auto-termination policy below
  # sets an idle timeout of 3600 (presumably seconds - confirm against the
  # module / provider documentation).
  keep_job_flow_alive_when_no_steps = true
  auto_termination_policy = {
    idle_timeout = 3600
  }
  termination_protection = false
  scale_down_behavior    = "TERMINATE_AT_TASK_COMPLETION"

  step_concurrency_level = 3
  list_steps_states      = ["PENDING", "RUNNING", "CANCEL_PENDING", "CANCELLED", "FAILED", "INTERRUPTED", "COMPLETED"]

  visible_to_all_users = true

  # Cluster logs are written to the bucket created by module.s3_bucket.
  log_uri = "s3n://${module.s3_bucket.s3_bucket_id}/"
}

# Random suffix for the EMR log bucket name (module.s3_bucket appends its
# decimal form). Only 2 random bytes are generated, so the suffix space is small.
resource "random_id" "this" {
  byte_length = 2
}

# Private, encrypted S3 bucket that receives the EMR cluster logs
# (referenced by the cluster's log_uri).
module "s3_bucket" {
  source  = "terraform-aws-modules/s3-bucket/aws"
  version = "~> 3.0"

  bucket = "${local.name}-emrlogs-${random_id.this.dec}"

  # Block every form of public access.
  block_public_acls       = true
  ignore_public_acls      = true
  block_public_policy     = true
  restrict_public_buckets = true

  # Bucket policy statements: deny insecure transport and require the latest TLS.
  attach_deny_insecure_transport_policy = true
  attach_require_latest_tls_policy      = true

  # Default server-side encryption using the AES256 algorithm.
  server_side_encryption_configuration = {
    rule = {
      apply_server_side_encryption_by_default = {
        sse_algorithm = "AES256"
      }
    }
  }

  # Allow deletion of non-empty bucket
  # Example usage only - not recommended for production
  force_destroy = true
}

# Customer-managed KMS key used by the EMR security configuration
# (aws_emr_security_configuration.security_config) for local disk / EBS
# encryption.
resource "aws_kms_key" "ebs" {
  description = "KMS key for EBS volumes"

  # Waiting period, in days, before a scheduled key deletion takes effect.
  deletion_window_in_days = 7

  # Rotate the key material automatically. Rotation is off unless requested,
  # and enabling it is an in-place change that keeps the key ARN and existing
  # ciphertext usable.
  enable_key_rotation = true
}

# Security configuration attached to the EMR cluster. It turns on at-rest
# encryption (KMS key for local disks and EBS, SSE-S3 for S3 data), leaves
# in-transit encryption off, and sets the instance metadata service options.
resource "aws_emr_security_configuration" "security_config" {
  name = "${local.name}-emr-security-config"

  configuration = jsonencode({
    # Instance metadata service: minimum version 2, PUT response hop limit 1.
    InstanceMetadataServiceConfiguration = {
      MinimumInstanceMetadataServiceVersion = 2
      HttpPutResponseHopLimit               = 1
    }

    EncryptionConfiguration = {
      EnableInTransitEncryption = false
      EnableAtRestEncryption    = true

      AtRestEncryptionConfiguration = {
        S3EncryptionConfiguration = {
          EncryptionMode = "SSE-S3"
        }

        # Local disks and EBS volumes are encrypted with aws_kms_key.ebs.
        LocalDiskEncryptionConfiguration = {
          EncryptionKeyProviderType = "AwsKms"
          AwsKmsKey                 = aws_kms_key.ebs.arn
          EnableEbsEncryption       = true
        }
      }
    }
  })
}

# Tag the EMR master and core instances with Name tags.
# This layer needs to be applied twice for the instance tags to be set.
# Adding depends_on to the data sources below results in a dependency loop.
# Running EC2 instances that carry the EMR "MASTER" instance-group-role tag.
# NOTE(review): the lookup filters only on that role tag and the running state,
# so it is not restricted to the cluster created by module.emr - verify that no
# other EMR cluster shares the account/region this provider targets.
data "aws_instances" "master_instances" {
  instance_state_names = ["running"]

  instance_tags = {
    "aws:elasticmapreduce:instance-group-role" = "MASTER"
  }

  # Deliberately left disabled:
  # depends_on = [module.emr]
}

# Running EC2 instances that carry the EMR "CORE" instance-group-role tag.
# NOTE(review): the lookup filters only on that role tag and the running state,
# so it is not restricted to the cluster created by module.emr - verify that no
# other EMR cluster shares the account/region this provider targets.
data "aws_instances" "core_instances" {
  instance_state_names = ["running"]

  instance_tags = {
    "aws:elasticmapreduce:instance-group-role" = "CORE"
  }

  # Deliberately left disabled:
  # depends_on = [module.emr]
}

# Gives each discovered core instance a Name tag of the form
# "<local.name>-emr-core-<n>", where n is the 1-based position of the instance
# ID in the sorted ID list.
resource "aws_ec2_tag" "tag-emr-core-instances" {
  count = length(data.aws_instances.core_instances.ids)

  # IDs are sorted so that a given index maps to the same instance as long as
  # the set of IDs does not change.
  resource_id = element(sort(data.aws_instances.core_instances.ids), count.index)

  key   = "Name"
  value = format("%s-emr-core-%d", local.name, count.index + 1)
}

# Gives each discovered master instance a Name tag of the form
# "<local.name>-emr-master-<n>", where n is the 1-based position of the
# instance ID in the sorted ID list.
resource "aws_ec2_tag" "tag-emr-master-instances" {
  count = length(data.aws_instances.master_instances.ids)

  # IDs are sorted so that a given index maps to the same instance as long as
  # the set of IDs does not change.
  resource_id = element(sort(data.aws_instances.master_instances.ids), count.index)

  key   = "Name"
  value = format("%s-emr-master-%d", local.name, count.index + 1)
}