As Valohai Workers on Kubernetes are implemented using basic Kubernetes primitives, you can choose whichever autoscaling solution you prefer.
These instructions show how to set up autoscaling with Karpenter, which AWS recommends over Cluster Autoscaler, on an AWS EKS cluster.
The instructions below are based on the documentation provided by Karpenter. Note that you might need to modify them depending on your cluster setup.
Requirements
Setup
Define common variables for reuse:
# Check that an IAM OIDC provider is configured for the cluster
aws iam list-open-id-connect-providers
# The output should include an entry like:
# oidc.eks.eu-west-1.amazonaws.com/id/9CD39EC45022D8C399168B4427CA6209
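If no OIDC provider is listed, create one for the cluster before continuing. As one option (assuming you manage the cluster with eksctl), you can associate a provider like this:
eksctl utils associate-iam-oidc-provider --cluster <cluster-name> --approve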
export AWS_PROFILE=<aws-profile>
export AWS_REGION=<region>
export KUBECONFIG=~/.kube/<cluster-name>
CLUSTER=<cluster-name>
KARPENTER_NAMESPACE=kube-system
AWS_PARTITION="aws"
OIDC_ENDPOINT="$(aws eks describe-cluster --name ${CLUSTER} --query "cluster.identity.oidc.issuer" --output text)"
AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query 'Account' --output text)
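As a quick sanity check, make sure the variables resolved to sensible values before continuing:
echo "cluster: ${CLUSTER}"
echo "account: ${AWS_ACCOUNT_ID}"
echo "oidc:    ${OIDC_ENDPOINT}"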
Create two new IAM roles: one for the nodes provisioned by Karpenter and one for the Karpenter controller:
echo '{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "ec2.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
}' > node-trust-policy.json
aws iam create-role \
--role-name "KarpenterNodeRole-${CLUSTER}" \
--assume-role-policy-document file://node-trust-policy.json
aws iam attach-role-policy \
--role-name "KarpenterNodeRole-${CLUSTER}" \
--policy-arn arn:${AWS_PARTITION}:iam::aws:policy/AmazonEKSWorkerNodePolicy
aws iam attach-role-policy \
--role-name "KarpenterNodeRole-${CLUSTER}" \
--policy-arn arn:${AWS_PARTITION}:iam::aws:policy/AmazonEKS_CNI_Policy
aws iam attach-role-policy \
--role-name "KarpenterNodeRole-${CLUSTER}" \
--policy-arn arn:${AWS_PARTITION}:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
aws iam attach-role-policy \
--role-name "KarpenterNodeRole-${CLUSTER}" \
--policy-arn arn:${AWS_PARTITION}:iam::aws:policy/AmazonSSMManagedInstanceCore
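You can verify that the node role has all four managed policies attached:
aws iam list-attached-role-policies --role-name "KarpenterNodeRole-${CLUSTER}"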
cat << EOF > controller-trust-policy.json
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Federated": "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:oidc-provider/${OIDC_ENDPOINT#*//}"
            },
            "Action": "sts:AssumeRoleWithWebIdentity",
            "Condition": {
                "StringEquals": {
                    "${OIDC_ENDPOINT#*//}:aud": "sts.amazonaws.com",
                    "${OIDC_ENDPOINT#*//}:sub": "system:serviceaccount:${KARPENTER_NAMESPACE}:karpenter"
                }
            }
        }
    ]
}
EOF
aws iam create-role \
--role-name KarpenterControllerRole-${CLUSTER} \
--assume-role-policy-document file://controller-trust-policy.json
cat << EOF > controller-policy.json
{
    "Statement": [
        {
            "Action": [
                "ssm:GetParameter",
                "ec2:DescribeImages",
                "ec2:RunInstances",
                "ec2:DescribeSubnets",
                "ec2:DescribeSecurityGroups",
                "ec2:DescribeLaunchTemplates",
                "ec2:DescribeInstances",
                "ec2:DescribeInstanceTypes",
                "ec2:DescribeInstanceTypeOfferings",
                "ec2:DescribeAvailabilityZones",
                "ec2:DeleteLaunchTemplate",
                "ec2:CreateTags",
                "ec2:CreateLaunchTemplate",
                "ec2:CreateFleet",
                "ec2:DescribeSpotPriceHistory",
                "pricing:GetProducts"
            ],
            "Effect": "Allow",
            "Resource": "*",
            "Sid": "Karpenter"
        },
        {
            "Action": "ec2:TerminateInstances",
            "Condition": {
                "StringLike": {
                    "ec2:ResourceTag/karpenter.sh/nodepool": "*"
                }
            },
            "Effect": "Allow",
            "Resource": "*",
            "Sid": "ConditionalEC2Termination"
        },
        {
            "Effect": "Allow",
            "Action": "iam:PassRole",
            "Resource": "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER}",
            "Sid": "PassNodeIAMRole"
        },
        {
            "Effect": "Allow",
            "Action": "eks:DescribeCluster",
            "Resource": "arn:${AWS_PARTITION}:eks:${AWS_REGION}:${AWS_ACCOUNT_ID}:cluster/${CLUSTER}",
            "Sid": "EKSClusterEndpointLookup"
        },
        {
            "Sid": "AllowScopedInstanceProfileCreationActions",
            "Effect": "Allow",
            "Resource": "*",
            "Action": [
                "iam:CreateInstanceProfile"
            ],
            "Condition": {
                "StringEquals": {
                    "aws:RequestTag/kubernetes.io/cluster/${CLUSTER}": "owned",
                    "aws:RequestTag/topology.kubernetes.io/region": "${AWS_REGION}"
                },
                "StringLike": {
                    "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*"
                }
            }
        },
        {
            "Sid": "AllowScopedInstanceProfileTagActions",
            "Effect": "Allow",
            "Resource": "*",
            "Action": [
                "iam:TagInstanceProfile"
            ],
            "Condition": {
                "StringEquals": {
                    "aws:ResourceTag/kubernetes.io/cluster/${CLUSTER}": "owned",
                    "aws:ResourceTag/topology.kubernetes.io/region": "${AWS_REGION}",
                    "aws:RequestTag/kubernetes.io/cluster/${CLUSTER}": "owned",
                    "aws:RequestTag/topology.kubernetes.io/region": "${AWS_REGION}"
                },
                "StringLike": {
                    "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*",
                    "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*"
                }
            }
        },
        {
            "Sid": "AllowScopedInstanceProfileActions",
            "Effect": "Allow",
            "Resource": "*",
            "Action": [
                "iam:AddRoleToInstanceProfile",
                "iam:RemoveRoleFromInstanceProfile",
                "iam:DeleteInstanceProfile"
            ],
            "Condition": {
                "StringEquals": {
                    "aws:ResourceTag/kubernetes.io/cluster/${CLUSTER}": "owned",
                    "aws:ResourceTag/topology.kubernetes.io/region": "${AWS_REGION}"
                },
                "StringLike": {
                    "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*"
                }
            }
        },
        {
            "Sid": "AllowInstanceProfileReadActions",
            "Effect": "Allow",
            "Resource": "*",
            "Action": "iam:GetInstanceProfile"
        }
    ],
    "Version": "2012-10-17"
}
EOF
aws iam put-role-policy \
--role-name KarpenterControllerRole-${CLUSTER} \
--policy-name KarpenterControllerPolicy-${CLUSTER} \
--policy-document file://controller-policy.json
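To confirm the inline policy was attached, you can list the controller role's inline policies:
aws iam list-role-policies --role-name "KarpenterControllerRole-${CLUSTER}"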
Add the karpenter.sh/discovery tag to the node group subnets and to the cluster security group so that Karpenter knows which resources to use:
for NODEGROUP in $(aws eks list-nodegroups --cluster-name ${CLUSTER} \
--query 'nodegroups' --output text); do aws ec2 create-tags \
--tags "Key=karpenter.sh/discovery,Value=${CLUSTER}" \
--resources $(aws eks describe-nodegroup --cluster-name ${CLUSTER} \
--nodegroup-name $NODEGROUP --query 'nodegroup.subnets' --output text )
done
aws eks list-nodegroups --cluster-name ${CLUSTER}
# { "nodegroups": ["ng-m7i-large", "ng-p3-2xlarge"] }
NODEGROUP=$(aws eks list-nodegroups --cluster-name ${CLUSTER} --query 'nodegroups[0]' --output text)
# Launch template of the node group (only needed if your node groups take their
# security groups from a launch template instead of the cluster security group)
LAUNCH_TEMPLATE=$(aws eks describe-nodegroup --cluster-name ${CLUSTER} \
--nodegroup-name ${NODEGROUP} --query 'nodegroup.launchTemplate.{id:id,version:version}' \
--output text | tr -s "\t" ",")
# EKS > Cluster > Networking > Cluster Security Group
SECURITY_GROUPS=$(aws eks describe-cluster \
--name ${CLUSTER} \
--query "cluster.resourcesVpcConfig.clusterSecurityGroupId" \
--output text)
aws ec2 create-tags \
--tags "Key=karpenter.sh/discovery,Value=${CLUSTER}" \
--resources ${SECURITY_GROUPS}
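You can check which subnets and security groups Karpenter will discover through the tag (an optional sanity check):
aws ec2 describe-subnets \
    --filters "Name=tag:karpenter.sh/discovery,Values=${CLUSTER}" \
    --query 'Subnets[].SubnetId' --output text
aws ec2 describe-security-groups \
    --filters "Name=tag:karpenter.sh/discovery,Values=${CLUSTER}" \
    --query 'SecurityGroups[].GroupId' --output text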
Update the aws-auth config map so that nodes using the newly created node IAM role are allowed to join the cluster:
cat << EOF
- groups:
  - system:bootstrappers
  - system:nodes
  rolearn: arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER}
  username: system:node:{{EC2PrivateDNSName}}
EOF
# Add the result from the above section to the mapRoles
kubectl edit configmap aws-auth -n kube-system
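You can confirm the new entry is in place without re-opening the editor:
kubectl get configmap aws-auth -n kube-system -o yaml | grep -B 3 -A 1 "KarpenterNodeRole-${CLUSTER}"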
Now it is finally time to deploy Karpenter:
# This version is a suggestion; feel free to use another version if you prefer.
export KARPENTER_VERSION=v0.33.1
helm template karpenter oci://public.ecr.aws/karpenter/karpenter \
--version "${KARPENTER_VERSION}" \
--namespace "${KARPENTER_NAMESPACE}" \
--set "settings.clusterName=${CLUSTER}" \
--set "serviceAccount.annotations.eks\.amazonaws\.com/role-arn=arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/KarpenterControllerRole-${CLUSTER}" \
--set controller.resources.requests.cpu=1 \
--set controller.resources.requests.memory=1Gi \
--set controller.resources.limits.cpu=1 \
--set controller.resources.limits.memory=1Gi > karpenter.yaml
You will need to modify the affinity rules in karpenter.yaml so that the Karpenter controller runs on one of the existing node group nodes.
You can list several node groups under values, one group per line:
cat << EOF
affinity:
  nodeAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
        - matchExpressions:
            - key: karpenter.sh/nodepool
              operator: DoesNotExist
        - matchExpressions:
            - key: eks.amazonaws.com/nodegroup
              operator: In
              values:
                - ${NODEGROUP}
  podAntiAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      - topologyKey: kubernetes.io/hostname
EOF
# Replace the `affinity` section with the result from the command above
vim karpenter.yaml
Deploy the Karpenter CRDs and then the rest of the resources:
kubectl create -f \
https://raw.githubusercontent.com/aws/karpenter-provider-aws/${KARPENTER_VERSION}/pkg/apis/crds/karpenter.sh_nodepools.yaml
kubectl create -f \
https://raw.githubusercontent.com/aws/karpenter-provider-aws/${KARPENTER_VERSION}/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml
kubectl create -f \
https://raw.githubusercontent.com/aws/karpenter-provider-aws/${KARPENTER_VERSION}/pkg/apis/crds/karpenter.sh_nodeclaims.yaml
kubectl apply -f karpenter.yaml
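Check that the controller pods start up and the CRDs are registered before creating node pools:
kubectl get pods -n ${KARPENTER_NAMESPACE} -l app.kubernetes.io/name=karpenter
kubectl get crd nodepools.karpenter.sh nodeclaims.karpenter.sh ec2nodeclasses.karpenter.k8s.aws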
CPU Node Pool
One of the advantages of Karpenter is that it tries to pick the most cost-effective instance for your workload.
In order for it to do that, it needs to have information about available node pools.
We’ll first create a node pool with only CPU machines and look at adding instances with GPUs next.
Remember to modify the instance categories under the karpenter.k8s.aws/instance-category key depending on your needs. Moreover, the node pool examples below are just suggestions. You can customize them as you wish, including, for example, spot vs. on-demand capacity, allowed instance generations, resource limits, and when to scale down.
cat <<EOF | envsubst | kubectl apply -f -
apiVersion: karpenter.sh/v1beta1
kind: NodePool
metadata:
  name: default
spec:
  template:
    spec:
      requirements:
        - key: kubernetes.io/arch
          operator: In
          values: ["amd64"]
        - key: kubernetes.io/os
          operator: In
          values: ["linux"]
        - key: karpenter.sh/capacity-type
          operator: In
          values: ["spot", "on-demand"]
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values: ["c", "m", "r"]
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values: ["2"]
      nodeClassRef:
        name: default
  limits:
    cpu: 100
    memory: 1000Gi
  disruption:
    consolidationPolicy: WhenUnderutilized
    expireAfter: 720h
---
apiVersion: karpenter.k8s.aws/v1beta1
kind: EC2NodeClass
metadata:
  name: default
spec:
  amiFamily: AL2
  role: "KarpenterNodeRole-${CLUSTER}"
  subnetSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER}"
  securityGroupSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER}"
EOF
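To verify that provisioning works end to end, you can create a throwaway deployment that requests more CPU than the current nodes have free and watch Karpenter create a node claim. The deployment name inflate and the pause image are only examples:
kubectl create deployment inflate --image=public.ecr.aws/eks-distro/kubernetes/pause:3.7 --replicas=0
kubectl set resources deployment inflate --requests=cpu=1
kubectl scale deployment inflate --replicas=5
kubectl get nodeclaims -w
# Clean up when done
kubectl delete deployment inflate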
GPU Node Pool
If you want to use GPU instances in your cluster, you will need to add another node pool. In addition, make sure the NVIDIA device plugin is installed on your cluster.
You can install the device plugin however you prefer; below are instructions for doing it with Helm.
helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
helm repo update
# Check the available versions of the plugin and use one in the following command
helm search repo nvdp --devel
helm upgrade --install nvdp nvdp/nvidia-device-plugin \
--namespace nvidia-device-plugin \
--create-namespace \
--version <version>
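Check that the device plugin daemonset was created; its pods will only report GPUs once GPU nodes have joined the cluster:
kubectl get daemonset -n nvidia-device-plugin
kubectl get pods -n nvidia-device-plugin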
Add the GPU node pool:
cat <<EOF | envsubst | kubectl apply -f -
apiVersion: karpenter.sh/v1beta1
kind: NodePool
metadata:
  name: default-gpu
spec:
  template:
    spec:
      requirements:
        - key: kubernetes.io/arch
          operator: In
          values: ["amd64"]
        - key: kubernetes.io/os
          operator: In
          values: ["linux"]
        - key: karpenter.sh/capacity-type
          operator: In
          values: ["on-demand"]
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values: ["p"]
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values: ["2"]
      nodeClassRef:
        name: default
      taints:
        - key: nvidia.com/gpu
          value: "true"
          effect: "NoSchedule"
  limits:
    cpu: 100
    memory: 1000Gi
    nvidia.com/gpu: 5
  disruption:
    consolidationPolicy: WhenUnderutilized
    expireAfter: 720h
EOF
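To smoke-test the GPU pool, you can create a pod that tolerates the taint and requests one GPU. The pod name and pause image below are only examples; the pod stays Pending until the new node's device plugin advertises nvidia.com/gpu:
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test
spec:
  restartPolicy: Never
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
  containers:
    - name: pause
      image: public.ecr.aws/eks-distro/kubernetes/pause:3.7
      resources:
        limits:
          nvidia.com/gpu: 1
EOF
# Clean up when done
kubectl delete pod gpu-smoke-test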
Diagnosis
You can follow the Karpenter controller logs with:
kubectl logs -f -n ${KARPENTER_NAMESPACE} -c controller -l app.kubernetes.io/name=karpenter
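Other useful commands when investigating why a node was or was not provisioned:
# Current node claims and node pools
kubectl get nodeclaims
kubectl get nodepools
# Recent scheduling and provisioning events
kubectl get events -A --sort-by=.lastTimestamp | tail -n 20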