
Commit 4da0fee

Merge pull request #114 from awslabs/fluentbit-investigation
Spark Live UI added
2 parents c3b9ca7 + cca1e94 commit 4da0fee

File tree

12 files changed (+207, -24 lines)


analytics/terraform/emr-eks-karpenter/README.md

+2 -1

@@ -54,7 +54,7 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
 | Name | Description | Type | Default | Required |
 |------|-------------|------|---------|:--------:|
 | <a name="input_eks_cluster_version"></a> [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.24"` | no |
-| <a name="input_enable_yunikorn"></a> [enable\_yunikorn](#input\_enable\_yunikorn) | Enable YuniKorn Scheduler | `bool` | `true` | no |
+| <a name="input_enable_yunikorn"></a> [enable\_yunikorn](#input\_enable\_yunikorn) | Enable YuniKorn Scheduler | `bool` | `false` | no |
 | <a name="input_name"></a> [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"emr-eks-karpenter"` | no |
 | <a name="input_private_subnets"></a> [private\_subnets](#input\_private\_subnets) | Private Subnets CIDRs. 16382 IPs per Subnet | `list(string)` | <pre>[<br> "10.1.0.0/18",<br> "10.1.64.0/18",<br> "10.1.128.0/18"<br>]</pre> | no |
 | <a name="input_public_subnets"></a> [public\_subnets](#input\_public\_subnets) | Public Subnets CIDRs. 4094 IPs per Subnet | `list(string)` | <pre>[<br> "10.1.192.0/20",<br> "10.1.208.0/20",<br> "10.1.224.0/20"<br>]</pre> | no |
@@ -70,4 +70,5 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
 | <a name="output_emr_on_eks_role_arn"></a> [emr\_on\_eks\_role\_arn](#output\_emr\_on\_eks\_role\_arn) | IAM execution role arn for EMR on EKS |
 | <a name="output_emr_on_eks_role_id"></a> [emr\_on\_eks\_role\_id](#output\_emr\_on\_eks\_role\_id) | IAM execution role ID for EMR on EKS |
 | <a name="output_emrcontainers_virtual_cluster_id"></a> [emrcontainers\_virtual\_cluster\_id](#output\_emrcontainers\_virtual\_cluster\_id) | EMR Containers Virtual cluster ID |
+| <a name="output_emrcontainers_virtual_cluster_name"></a> [emrcontainers\_virtual\_cluster\_name](#output\_emrcontainers\_virtual\_cluster\_name) | EMR Containers Virtual cluster name |
 <!-- END OF PRE-COMMIT-TERRAFORM DOCS HOOK -->
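
Because the YuniKorn input now defaults to `false`, existing users who still want gang scheduling have to opt back in. A minimal sketch of doing that at apply time, assuming the usual plan/apply workflow for this blueprint:

    terraform apply -var="enable_yunikorn=true"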

analytics/terraform/emr-eks-karpenter/addons.tf

+31 -1

@@ -49,10 +49,15 @@ module "eks_blueprints_kubernetes_addons" {
   }
 
   #---------------------------------------
-  # Cluster Autoscaler
+  # Karpenter Autoscaler for EKS Cluster
   #---------------------------------------
   enable_karpenter = true
   karpenter_helm_config = {
+    name                = "karpenter"
+    chart               = "karpenter"
+    repository          = "oci://public.ecr.aws/karpenter"
+    version             = local.karpenter_helm_chart_version
+    namespace           = local.karpenter_namespace
     repository_username = data.aws_ecrpublic_authorization_token.token.user_name
     repository_password = data.aws_ecrpublic_authorization_token.token.password
   }
@@ -234,3 +239,28 @@ resource "kubectl_manifest" "karpenter_provisioner" {
 
   depends_on = [module.eks_blueprints_kubernetes_addons]
 }
+
+#------------------------------------------------------------------------------------------------------------
+# Karpenter-CRD Helm Chart for upgrades - Custom Resource Definition (CRD) Upgrades
+# https://gallery.ecr.aws/karpenter/karpenter-crd
+# Checkout the user guide https://karpenter.sh/preview/upgrade-guide/
+# https://github.com/aws/karpenter/tree/main/charts/karpenter-crd
+#------------------------------------------------------------------------------------------------------------
+# README:
+# Karpenter ships with a few Custom Resource Definitions (CRDs). These CRDs are published:
+# As an independent helm chart karpenter-crd - source that can be used by Helm to manage the lifecycle of these CRDs.
+# To upgrade or install karpenter-crd run:
+#   helm upgrade --install karpenter-crd oci://public.ecr.aws/karpenter/karpenter-crd --version vx.y.z --namespace karpenter --create-namespace
+#------------------------------------------------------------------------------------------------------------
+#resource "helm_release" "karpenter_crd" {
+#  namespace        = local.karpenter_namespace
+#  create_namespace = true
+#  name             = "karpenter"
+#  repository       = "oci://public.ecr.aws/karpenter/karpenter-crd"
+#  chart            = "karpenter-crd"
+#  version          = "v0.24.0"
+#  repository_username = data.aws_ecrpublic_authorization_token.token.user_name
+#  repository_password = data.aws_ecrpublic_authorization_token.token.password
+#
+#  depends_on = [module.eks_blueprints_kubernetes_addons.karpenter]
+#}
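
The CRD upgrade described in the comments can be pinned to the same chart version now set in locals.tf (v0.25.0). A sketch, assuming the karpenter namespace from locals and that the CRD chart is applied before bumping the main chart; the CRD names are the ones shipped with Karpenter v0.2x:

    helm upgrade --install karpenter-crd oci://public.ecr.aws/karpenter/karpenter-crd \
      --version v0.25.0 --namespace karpenter --create-namespace
    # confirm the CRDs exist after the upgrade
    kubectl get crd provisioners.karpenter.sh awsnodetemplates.karpenter.k8s.aws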

analytics/terraform/emr-eks-karpenter/examples/karpenter-memory-provisioner/execute_emr_eks_job.sh

+2 -2

@@ -1,6 +1,6 @@
 #!/bin/bash
 
-if [ $# -ne 4 ];
+if [ $# -ne 3 ];
 then
   echo "$0: Missing arguments EMR_VIRTUAL_CLUSTER_NAME, S3_BUCKET_NAME and EMR_JOB_EXECUTION_ROLE_ARN"
   echo "USAGE: ./execute_emr_eks_job.sh '<EMR_VIRTUAL_CLUSTER_NAME>' '<s3://ENTER_BUCKET_NAME>' '<EMR_JOB_EXECUTION_ROLE_ARN>'"
@@ -80,7 +80,7 @@ if [[ $EMR_VIRTUAL_CLUSTER_ID != "" ]]; then
       "entryPointArguments": ["'"$INPUT_DATA_S3_PATH"'",
         "'"$OUTPUT_DATA_S3_PATH"'"
       ],
-      "sparkSubmitParameters": "--conf spark.executor.instances=10"
+      "sparkSubmitParameters": "--conf spark.executor.instances=2"
     }
   }' \
   --configuration-overrides '{
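
With the argument check corrected to three, an invocation matches the USAGE line above. A hypothetical example (virtual cluster name, bucket, and role ARN are placeholders):

    ./execute_emr_eks_job.sh 'emr-eks-karpenter-emr-data-team-a' \
      's3://example-emr-eks-logs-bucket' \
      'arn:aws:iam::111122223333:role/emr-eks-karpenter-emr-eks-data-team-a'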

analytics/terraform/emr-eks-karpenter/examples/karpenter-yunikorn-gangscheduling/execute_emr_eks_job.sh

+1 -1

@@ -1,6 +1,6 @@
 #!/bin/bash
 
-if [ $# -ne 4 ];
+if [ $# -ne 3 ];
 then
   echo "$0: Missing arguments EMR_VIRTUAL_CLUSTER_NAME, S3_BUCKET_NAME and EMR_JOB_EXECUTION_ROLE_ARN"
   echo "USAGE: ./execute_emr_eks_job.sh '<EMR_VIRTUAL_CLUSTER_NAME>' '<s3://ENTER_BUCKET_NAME>' '<EMR_JOB_EXECUTION_ROLE_ARN>'"

analytics/terraform/emr-eks-karpenter/locals.tf

+6 -5

@@ -2,11 +2,12 @@ locals {
   name   = var.name
   region = var.region
 
-  vpc_cidr        = var.vpc_cidr
-  azs             = slice(data.aws_availability_zones.available.names, 0, 3)
-  core_node_group = "core-node-group"
-  vpc_endpoints   = ["autoscaling", "ecr.api", "ecr.dkr", "ec2", "ec2messages", "elasticloadbalancing", "sts", "kms", "logs", "ssm", "ssmmessages"]
-
+  vpc_cidr                     = var.vpc_cidr
+  azs                          = slice(data.aws_availability_zones.available.names, 0, 3)
+  core_node_group              = "core-node-group"
+  vpc_endpoints                = ["autoscaling", "ecr.api", "ecr.dkr", "ec2", "ec2messages", "elasticloadbalancing", "sts", "kms", "logs", "ssm", "ssmmessages"]
+  karpenter_helm_chart_version = "v0.25.0"
+  karpenter_namespace          = "karpenter"
   tags = merge(var.tags, {
     Blueprint  = local.name
     GithubRepo = "github.com/awslabs/data-on-eks"

analytics/terraform/emr-eks-karpenter/outputs.tf

+1 -1

@@ -9,7 +9,7 @@ output "emrcontainers_virtual_cluster_id" {
 }
 
 output "emrcontainers_virtual_cluster_name" {
-  description = "EMR Containers Virtual cluster NAME"
+  description = "EMR Containers Virtual cluster name"
   value       = aws_emrcontainers_virtual_cluster.this.name
 }
 
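
The renamed output is also the easiest way to feed the job scripts above; after an apply, the value can be read back with the standard Terraform CLI:

    terraform output emrcontainers_virtual_cluster_name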

analytics/terraform/emr-eks-karpenter/provisioners/spark-compute-optimized-provisioner.yaml

+1 -1

@@ -44,7 +44,7 @@ metadata:
   namespace: karpenter
 spec:
   subnetSelector:
-    Name: "${eks_cluster_id}-private*"        # required
+    Name: "${eks_cluster_id}-private*"        # or karpenter.sh/discovery/${eks_cluster_id}: '*'
   launchTemplate: "${launch_template_name}"   # optional, see Launch Template documentation
   tags:
     InstanceType: "spark-compute-optimized"   # optional, add tags for your own use
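
If you switch to the discovery-tag selector mentioned in the new comment, the private subnets must carry that tag key first; Karpenter's '*' value then matches any tag value. A hypothetical example with made-up subnet IDs and the blueprint's default cluster name:

    aws ec2 create-tags \
      --resources subnet-0aaaaaaaaaaaaaaaa subnet-0bbbbbbbbbbbbbbbb \
      --tags Key=karpenter.sh/discovery/emr-eks-karpenter,Value=owned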

analytics/terraform/emr-eks-karpenter/variables.tf

+1 -1

@@ -41,7 +41,7 @@ variable "private_subnets" {
 }
 
 variable "enable_yunikorn" {
-  default     = true
+  default     = false
   description = "Enable YuniKorn Scheduler"
   type        = bool
 }

analytics/terraform/spark-k8s-operator/addons.tf

+25 -3

@@ -15,6 +15,27 @@ module "eks_blueprints_kubernetes_addons" {
   enable_amazon_eks_kube_proxy         = true
   enable_amazon_eks_aws_ebs_csi_driver = true
 
+  enable_aws_load_balancer_controller = true
+  aws_load_balancer_controller_helm_config = {
+    name        = "aws-load-balancer-controller"
+    chart       = "aws-load-balancer-controller"
+    repository  = "https://aws.github.io/eks-charts"
+    version     = "1.4.7"
+    namespace   = "kube-system"
+    description = "aws-load-balancer-controller Helm Chart for ingress resources"
+  }
+
+  enable_ingress_nginx = true
+  ingress_nginx_helm_config = {
+    name        = "ingress-nginx"
+    chart       = "ingress-nginx"
+    repository  = "https://kubernetes.github.io/ingress-nginx"
+    version     = "4.5.2"
+    description = "The NGINX HelmChart Ingress Controller deployment configuration"
+    values      = [templatefile("${path.module}/helm-values/nginx-values.yaml", {})]
+  }
+
+
   #---------------------------------------------------------------
   # Metrics Server
   #---------------------------------------------------------------
@@ -138,7 +159,7 @@ module "eks_blueprints_kubernetes_addons" {
     name       = "aws-for-fluent-bit"
     chart      = "aws-for-fluent-bit"
     repository = "https://aws.github.io/eks-charts"
-    version    = "0.1.21"
+    version    = "0.1.22"
     namespace  = "logging"
     timeout    = "300"
     aws_for_fluent_bit_cw_log_group = "/${module.eks_blueprints.eks_cluster_id}/worker-fluentbit-logs" # Optional
@@ -147,6 +168,7 @@ module "eks_blueprints_kubernetes_addons" {
       region                    = data.aws_region.current.id
       aws_for_fluent_bit_cw_log = "/${module.eks_blueprints.eks_cluster_id}/worker-fluentbit-logs"
       s3_bucket_name            = aws_s3_bucket.this.id
+      cluster_name              = module.eks_blueprints.eks_cluster_id
     })]
     set = [
       {
@@ -302,11 +324,11 @@ resource "aws_s3_bucket_public_access_block" "this" {
   ignore_public_acls      = true
 }
 
-# Creating an s3 bucket prefix. Ensure you copy analytics event logs under this path to visualize the dags
+# Creating an s3 bucket prefix. Ensure you copy Spark History event logs under this path to visualize the dags
 resource "aws_s3_object" "this" {
   bucket       = aws_s3_bucket.this.id
   acl          = "private"
-  key          = "logs/"
+  key          = "${module.eks_blueprints.eks_cluster_id}/event-history-logs/"
   content_type = "application/x-directory"
 
   depends_on = [
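
With the key change, Spark History Server event logs are expected under a per-cluster prefix rather than the old flat logs/ path. A rough illustration of copying logs there (bucket name, cluster id, and local path are placeholders):

    aws s3 cp ./spark-event-logs/ \
      s3://<S3_BUCKET>/<EKS_CLUSTER_ID>/event-history-logs/ --recursive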

analytics/terraform/spark-k8s-operator/helm-values/aws-for-fluentbit-values.yaml

+89 -8

@@ -7,26 +7,77 @@ global:
 hostNetwork: true
 dnsPolicy: ClusterFirstWithHostNet
 
-# NOTE: extraFilters config for using Kubelet to get the Metadata instead of talking to API server for large clusters
+#----------------------------------------------------------#
+# PARSERS for k8s-custom-tag and crio
+# NOTE: Read this link for more details about WHY CRIO parser used -> https://docs.fluentbit.io/manual/installation/kubernetes#container-runtime-interface-cri-parser
+# e.g., k8s log line for crio ->
+#   2023-02-19T21:28:48.495311051Z stdout F Unsetting extraneous env vars (UTC): 21:28:48
+#   ^(?<time>[^ ]+) (?<stream>stdout|stderr) (?<logtag>P|F) (?<log>.*)$
+#----------------------------------------------------------#
+service:
+  parsersFiles:
+    - /fluent-bit/parsers/parsers.conf
+  extraParsers: |
+    [PARSER]
+        Name          k8s-custom-tag
+        Format        regex
+        Regex         ^(?<namespace_name>[^_]+)\.(?<container_name>.+)\.(?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)\.(?<docker_id>[a-z0-9]{64})-$
+
+    [PARSER]
+        Name          crio
+        Format        Regex
+        Regex         ^(?<time>[^ ]+) (?<stream>stdout|stderr) (?<logtag>P|F) (?<log>.*)$
+        Time_Key      time
+        Time_Format   %Y-%m-%dT%H:%M:%S.%L%z
+
+#----------------------------------------------------------#
+# INPUT: tail container logs and tag them for k8s-custom-tag
+# Tag_Regex -> Use this to verify the regex https://rubular.com/
+#----------------------------------------------------------#
+input:
+  enabled: true
+  tag: kube.<namespace_name>.<container_name>.<pod_name>.<docker_id>-
+  path: "/var/log/containers/*.log"
+  db: "/var/log/flb_kube.db"
+  parser: crio
+  memBufLimit: 5MB
+  skipLongLines: "On"
+  refreshInterval: 10
+  extraInputs: |
+    Tag_Regex    (?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?<namespace_name>[^_]+)_(?<container_name>.+)-(?<docker_id>[a-z0-9]{64})\.log$
+
+#----------------------------------------------------------#
+# FILTER logs with k8s-custom-tag parser
+#----------------------------------------------------------#
+# NOTE: The Kubernetes filter will enrich the logs with Kubernetes metadata, specifically labels and annotations.
+# The filter only goes to the API Server when it cannot find the cached info, otherwise it uses the cache.
+#----------------------------------------------------------#
 filter:
+  enabled: true
   name: "kubernetes"
   match: "kube.*"
   kubeURL: "https://kubernetes.default.svc.cluster.local:443"
   mergeLog: "On"
   mergeLogKey: "log_processed"
   keepLog: "On"
   k8sLoggingParser: "On"
-  k8sLoggingExclude: "Off"
+  k8sLoggingExclude: "On"
   bufferSize: "0"
   extraFilters: |
-    Kube_Tag_Prefix     application.var.log.containers.
-    Labels              Off
-    Annotations         Off
+    Kube_Tag_Prefix     kube.
+    Regex_Parser        k8s-custom-tag
     Use_Kubelet         true
     Kubelet_Port        10250
+    Annotations         Off
     Kube_CA_File        /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
     Kube_Token_File     /var/run/secrets/kubernetes.io/serviceaccount/token
 
+# extraFilters: |
+#   Labels Off
+
+#----------------------------------------------------------#
+# OUTPUT logs to CloudWatch
+#----------------------------------------------------------#
 cloudWatch:
   enabled: true
   match: "*"
@@ -40,6 +91,8 @@ cloudWatch:
   autoCreateGroup: false
   endpoint:
   credentialsEndpoint: {}
+  # extraOutputs: |
+  #   ...
 
 firehose:
   enabled: false
@@ -50,23 +103,36 @@ kinesis:
 elasticsearch:
   enabled: false
 
+
+#----------------------------------------------------------#
+# OUTPUT logs to S3
+#----------------------------------------------------------#
 # Use this config to write logs to an S3 bucket.
 # Pre-req
 # 1/ S3 bucket for logging
 # 2/ Additional IAM policy for FluentBit add-on IRSA config
 # 3/ Add this to Terraform to pass additional IAM policy "aws_for_fluentbit_irsa_policies = ["<ENTER_NEW_IAM_POLICY_FOR_S3>"]"
-
+#----------------------------------------------------------#
 additionalOutputs: |
   [OUTPUT]
     Name                          s3
     Match                         *
     region                        ${region}
     bucket                        ${s3_bucket_name}
     total_file_size               100M
-    s3_key_format                 /fluentbit-logs/$TAG[4]/year=%Y/month=%m/day=%d/hour=%H/
-    s3_key_format_tag_delimiters  ._
+    s3_key_format                 /${cluster_name}/application-logs/year=%Y/month=%m/day=%d/$TAG[1]/$TAG[2]/$TAG[3]/$TAG[3]_%H%M%S_$UUID.log
+    s3_key_format_tag_delimiters  ..
     store_dir                     /home/ec2-user/buffer
     upload_timeout                10m
+    workers                       2
+
+#----------------------------------------------------------#
+# Use below when compression is enabled for S3 logs with gzip. Multipart upload cannot be used with gzip compression
+#     use_put_object          On
+#     content_type            application/json
+#     compression             gzip
+#     preserve_data_ordering  On
+#----------------------------------------------------------#
 
 serviceAccount:
   create: true
@@ -88,3 +154,18 @@ updateStrategy:
 
 nodeSelector:
   kubernetes.io/os: linux
+
+volumes:
+  - name: varlog
+    hostPath:
+      path: /var/log
+  - name: varlibdockercontainers
+    hostPath:
+      path: /var/lib/docker/containers
+
+volumeMounts:
+  - name: varlog
+    mountPath: /var/log
+  - name: varlibdockercontainers
+    mountPath: /var/lib/docker/containers
+    readOnly: true
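
To make the new s3_key_format concrete: the input tags every record as kube.<namespace_name>.<container_name>.<pod_name>.<docker_id>-, and with '.' as the tag delimiter $TAG[1], $TAG[2], and $TAG[3] expand to the namespace, container, and pod. Using the crio example timestamp from the parser comments and illustrative Spark names, an object would land at a key shaped like:

    /<cluster_name>/application-logs/year=2023/month=02/day=19/spark-team-a/spark-kubernetes-executor/taxi-trip-exec-1/taxi-trip-exec-1_212848_<UUID>.log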

analytics/terraform/spark-k8s-operator/helm-values/nginx-values.yaml

+35

@@ -0,0 +1,35 @@
+controller:
+  service:
+    # For more annotations https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.4/guide/service/annotations/
+    annotations:
+      service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4
+      service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+      service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+      service.beta.kubernetes.io/aws-load-balancer-proxy-protocol: "*"
+      service.beta.kubernetes.io/aws-load-balancer-backend-protocol: tcp
+      service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true'
+      service.beta.kubernetes.io/aws-load-balancer-type: nlb
+
+#------------------------------------
+# FUTURE WORK TO ENABLE ROUTE53, ACM
+#------------------------------------
+# external-dns.alpha.kubernetes.io/hostname: kubernetes-example.com.
+# AWS route53-mapper
+#controller:
+#  service:
+#    labels:
+#      dns: "route53"
+#    annotations:
+#      domainName: "kubernetes-example.com"
+
+# AWS L7 ELB with SSL Termination
+#controller:
+#  service:
+#    targetPorts:
+#      http: http
+#      https: http
+#    annotations:
+#      service.beta.kubernetes.io/aws-load-balancer-ssl-cert: arn:aws:acm:XX-XXXX-X:XXXXXXXXX:certificate/XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXX
+#      service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "http"
+#      service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "https"
+#      service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: '3600'

analytics/terraform/spark-k8s-operator/helm-values/spark-k8s-operator-values.yaml

+13

@@ -26,3 +26,16 @@ resources:
 batchScheduler:
   # -- Enable batch scheduler for spark jobs scheduling. If enabled, users can specify batch scheduler name in spark application
   enable: true
+
+#------------------------------------
+# THIS WILL CREATE SERVICE AND INGRESS OBJECT FOR EACH SPARK APPLICATION
+#------------------------------------
+uiService:
+  # -- Enable UI service creation for Spark application
+  enable: true
+
+# -- Ingress URL format.
+# Requires the UI service to be enabled by setting `uiService.enable` to true.
+# 1/ Enable ingressUrlFormat to create an Ingress object for each Spark Job submitted using Spark Operator
+# 2/ This setup also requires ingress-nginx to be deployed with NLB as LB with IP based routing.
+# 3/ Enter the NLB DNS name or enter Custom Domain name from route53 below which points to the NLB
+#ingressUrlFormat: '<ENTER_NLB_DNS_NAME/CUSTOM_DOMAIN_NAME>/{{$appName}}'
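
To fill in ingressUrlFormat, you need the DNS name of the NLB fronting ingress-nginx. A sketch, assuming the add-on releases the controller service as ingress-nginx-controller in the ingress-nginx namespace (adjust to however it is actually deployed):

    kubectl get svc ingress-nginx-controller -n ingress-nginx \
      -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'

The returned hostname (or a Route53 record pointing at it) is what replaces the <ENTER_NLB_DNS_NAME/CUSTOM_DOMAIN_NAME> placeholder, giving each SparkApplication a live UI at <hostname>/<app-name>.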
