Curt-Park · August 14, 2024 04:55 · Curt-Park · Aug 6, 2024 · Curt-Park · Aug 6, 2024
diff --git a/gpu_sharing_job_template.yaml b/gpu_sharing_job_template.yaml
 # Template for the GPU-sharing Job.
 # The placeholders {UNIQUE_ID}, {GPU_UUID} and {NODE_NAME} are meant to be
 # replaced by text substitution (e.g. sed) before `kubectl create`.
 apiVersion: batch/v1
 kind: Job
 metadata:
  name: gpu-sharing-{UNIQUE_ID}-job
 spec:
  template:
    spec:
      restartPolicy: Never
      # Two identical containers that do NOT request nvidia.com/gpu; both are
      # pointed at the same device(s) through NVIDIA_VISIBLE_DEVICES.
      # NOTE(review): this presumably relies on the NVIDIA container runtime
      # honoring NVIDIA_VISIBLE_DEVICES on the node - confirm for your cluster.
      containers:
        - name: cuda-container-0
          image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
          command:
          - sh
          - -c
          - |
            export | grep NVIDIA_VISIBLE_DEVICES;  # just for logging.
            python3 -c "import torch; print('GPU Count: ', torch.cuda.device_count())";
          env:
          - name: NVIDIA_VISIBLE_DEVICES
            # Quoted so the placeholder is not parsed as a flow mapping and the
            # substituted value stays a string even if it is all digits.
            value: "{GPU_UUID}"
        - name: cuda-container-1
          image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
          command:
          - sh
          - -c
          - |
            export | grep NVIDIA_VISIBLE_DEVICES;  # just for logging.
            python3 -c "import torch; print('GPU Count: ', torch.cuda.device_count())";
          env:
          - name: NVIDIA_VISIBLE_DEVICES
            value: "{GPU_UUID}"
      # Pin the pod to the node named by the substituted {NODE_NAME}.
      nodeSelector:
        kubernetes.io/hostname: "{NODE_NAME}"
      # Allow scheduling onto nodes tainted with nvidia.com/gpu:NoSchedule.
      tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
diff --git a/trigger_gpu_sharing_job.yaml b/trigger_gpu_sharing_job.yaml
 # This job holds a GPU and triggers a GPU-sharing job on the same node.
 # The container installs kubectl, renders the job template with the node name
 # and the GPU(s) assigned to it, creates that job, and waits for it to finish.
 apiVersion: batch/v1
 kind: Job
 metadata:
  name: gpu-sharing-job-trigger-job
 spec:
  template:
    spec:
      # Service account used by kubectl inside the container to get/create Jobs.
      serviceAccountName: gpu-sharing-sa
      restartPolicy: Never
      containers:
        - name: gpu-sharing-job-trigger
          image: nvidia/cuda:12.1.0-base-ubuntu18.04
          command:
          - sh
          - -c
          - |
            # Stop at the first failing step instead of carrying on without
            # kubectl or without the job template.
            set -e;

            # Setup
            apt-get update && apt-get install -y curl wget openssl;
            curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl";
            chmod +x ./kubectl && mv ./kubectl /usr/local/bin/kubectl;

            # Get GPU UUID.
            GPU_UUID=$NVIDIA_VISIBLE_DEVICES;
            echo "Scheduled GPUs: ${GPU_UUID}";

            # Get the Job description.
            wget https://gist.githubusercontent.com/Curt-Park/bb20f76ba2b052b03b2e1ea9834517a6/raw/47c03e1d371c4ddc821d115be02cf3563b8b2c5b/gpu_sharing_job_template.yaml;
            UNIQUE_ID=$(openssl rand -hex 12);
            # Expressions are quoted and use '|' as the delimiter so values
            # containing spaces or '/' cannot break the substitution.
            sed -e "s|{NODE_NAME}|${NODE_NAME}|" \
                -e "s|{GPU_UUID}|${GPU_UUID}|" \
                -e "s|{UNIQUE_ID}|${UNIQUE_ID}|" \
                gpu_sharing_job_template.yaml > gpu_sharing_job.yaml;
            cat gpu_sharing_job.yaml;

            # Create the job.
            kubectl create -f gpu_sharing_job.yaml;

            # Wait for the job to terminate (Complete or Failed).
            JOB_NAME="gpu-sharing-${UNIQUE_ID}-job";
            while true; do
              # `|| true` keeps polling through a failed kubectl call despite `set -e`.
              status=$(kubectl get job "$JOB_NAME" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' || true);
              failed=$(kubectl get job "$JOB_NAME" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' || true);
              if [ "$status" = "True" ]; then
                echo "Job $JOB_NAME completed";
                break;
              fi;
              # Without this check a failed job would keep this pod (and its
              # GPU) waiting forever.
              if [ "$failed" = "True" ]; then
                echo "Job $JOB_NAME failed" >&2;
                exit 1;
              fi;
              echo "Waiting for the job $JOB_NAME to complete...";
              sleep 1;
            done;
          env:
          # Name of the node this pod was scheduled on (downward API).
          - name: NODE_NAME
            valueFrom:
              fieldRef:
                fieldPath: spec.nodeName
          resources:
            # Requesting 1 physical GPU to share.
            requests:
              nvidia.com/gpu: 1
            limits:
              nvidia.com/gpu: 1
      # Only schedule onto nodes labelled nodeType=gpu.
      nodeSelector:
        nodeType: gpu
      # Allow scheduling onto nodes tainted with nvidia.com/gpu:NoSchedule.
      tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
 ---
 # Service account referenced by the trigger job's `serviceAccountName`
 # and bound to gpu-sharing-role by the RoleBinding in this file.
 apiVersion: v1
 kind: ServiceAccount
 metadata:
  name: gpu-sharing-sa
 ---
 # Namespaced permissions for the trigger job: it creates the GPU-sharing Job
 # with `kubectl create` and polls it with `kubectl get job`, so only the
 # `create` and `get` verbs on batch/jobs are granted.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: Role
 metadata:
  name: gpu-sharing-role
 rules:
 # Keys of the list item are aligned with `apiGroups` so they belong to the
 # same rule.
 - apiGroups: ["batch"]
   resources: ["jobs"]
   verbs: ["get", "create"]
 ---
 # Grants gpu-sharing-role to the gpu-sharing-sa service account.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: RoleBinding
 metadata:
  name: gpu-sharing-rolebinding
 roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: gpu-sharing-role
 subjects:
 # NOTE(review): no namespace is given for the subject; presumably it resolves
 # to the namespace the RoleBinding is created in - confirm.
 - kind: ServiceAccount
   name: gpu-sharing-sa
 # Template for the GPU-sharing Job (re-indented copy; tabs and escaped pipes
 # removed so it parses as YAML).
 # The placeholders {UNIQUE_ID}, {GPU_UUID} and {NODE_NAME} are meant to be
 # replaced by text substitution (e.g. sed) before `kubectl create`.
 apiVersion: batch/v1
 kind: Job
 metadata:
  name: gpu-sharing-{UNIQUE_ID}-job
 spec:
  template:
    spec:
      restartPolicy: Never
      # Two identical containers that do NOT request nvidia.com/gpu; both are
      # pointed at the same device(s) through NVIDIA_VISIBLE_DEVICES.
      # NOTE(review): this presumably relies on the NVIDIA container runtime
      # honoring NVIDIA_VISIBLE_DEVICES on the node - confirm for your cluster.
      containers:
        - name: cuda-container-0
          image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
          command:
          - sh
          - -c
          - |
            export | grep NVIDIA_VISIBLE_DEVICES;  # just for logging.
            python3 -c "import torch; print('GPU Count: ', torch.cuda.device_count())";
          env:
          - name: NVIDIA_VISIBLE_DEVICES
            # Quoted so the placeholder is not parsed as a flow mapping and the
            # substituted value stays a string even if it is all digits.
            value: "{GPU_UUID}"
        - name: cuda-container-1
          image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
          command:
          - sh
          - -c
          - |
            export | grep NVIDIA_VISIBLE_DEVICES;  # just for logging.
            python3 -c "import torch; print('GPU Count: ', torch.cuda.device_count())";
          env:
          - name: NVIDIA_VISIBLE_DEVICES
            value: "{GPU_UUID}"
      # Pin the pod to the node named by the substituted {NODE_NAME}.
      nodeSelector:
        kubernetes.io/hostname: "{NODE_NAME}"
      # Allow scheduling onto nodes tainted with nvidia.com/gpu:NoSchedule.
      tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
 # This job holds a GPU and triggers a GPU-sharing job on the same node
 # (re-indented copy; tabs and escaped pipes removed so it parses as YAML).
 # The container installs kubectl, renders the job template with the node name
 # and the GPU(s) assigned to it, creates that job, and waits for it to finish.
 apiVersion: batch/v1
 kind: Job
 metadata:
  name: gpu-sharing-job-trigger-job
 spec:
  template:
    spec:
      # Service account used by kubectl inside the container to get/create Jobs.
      serviceAccountName: gpu-sharing-sa
      restartPolicy: Never
      containers:
        - name: gpu-sharing-job-trigger
          image: nvidia/cuda:12.1.0-base-ubuntu18.04
          command:
          - sh
          - -c
          - |
            # Stop at the first failing step instead of carrying on without
            # kubectl or without the job template.
            set -e;

            # Setup
            apt-get update && apt-get install -y curl wget openssl;
            curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl";
            chmod +x ./kubectl && mv ./kubectl /usr/local/bin/kubectl;

            # Get GPU UUID.
            GPU_UUID=$NVIDIA_VISIBLE_DEVICES;
            echo "Scheduled GPUs: ${GPU_UUID}";

            # Get the Job description.
            wget https://gist.githubusercontent.com/Curt-Park/bb20f76ba2b052b03b2e1ea9834517a6/raw/47c03e1d371c4ddc821d115be02cf3563b8b2c5b/gpu_sharing_job_template.yaml;
            UNIQUE_ID=$(openssl rand -hex 12);
            # Expressions are quoted and use '|' as the delimiter so values
            # containing spaces or '/' cannot break the substitution.
            sed -e "s|{NODE_NAME}|${NODE_NAME}|" \
                -e "s|{GPU_UUID}|${GPU_UUID}|" \
                -e "s|{UNIQUE_ID}|${UNIQUE_ID}|" \
                gpu_sharing_job_template.yaml > gpu_sharing_job.yaml;
            cat gpu_sharing_job.yaml;

            # Create the job.
            kubectl create -f gpu_sharing_job.yaml;

            # Wait for the job to terminate (Complete or Failed).
            JOB_NAME="gpu-sharing-${UNIQUE_ID}-job";
            while true; do
              # `|| true` keeps polling through a failed kubectl call despite `set -e`.
              status=$(kubectl get job "$JOB_NAME" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' || true);
              failed=$(kubectl get job "$JOB_NAME" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' || true);
              if [ "$status" = "True" ]; then
                echo "Job $JOB_NAME completed";
                break;
              fi;
              # Without this check a failed job would keep this pod (and its
              # GPU) waiting forever.
              if [ "$failed" = "True" ]; then
                echo "Job $JOB_NAME failed" >&2;
                exit 1;
              fi;
              echo "Waiting for the job $JOB_NAME to complete...";
              sleep 1;
            done;
          env:
          # Name of the node this pod was scheduled on (downward API).
          - name: NODE_NAME
            valueFrom:
              fieldRef:
                fieldPath: spec.nodeName
          resources:
            # Requesting 1 physical GPU to share.
            requests:
              nvidia.com/gpu: 1
            limits:
              nvidia.com/gpu: 1
      # Only schedule onto nodes labelled nodeType=gpu.
      nodeSelector:
        nodeType: gpu
      # Allow scheduling onto nodes tainted with nvidia.com/gpu:NoSchedule.
      tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
	---
 # Service account referenced by the trigger job's `serviceAccountName`
 # (re-indented copy; the tab indentation is replaced with spaces).
 apiVersion: v1
 kind: ServiceAccount
 metadata:
  name: gpu-sharing-sa
	---
 # Namespaced permissions for the trigger job: `create` and `get` on batch/jobs
 # (re-indented copy; the tab indentation is replaced with spaces).
 apiVersion: rbac.authorization.k8s.io/v1
 kind: Role
 metadata:
  name: gpu-sharing-role
 rules:
 - apiGroups: ["batch"]
   resources: ["jobs"]
   verbs: ["get", "create"]
	---
 # Grants gpu-sharing-role to the gpu-sharing-sa service account
 # (re-indented copy; the tab indentation is replaced with spaces).
 apiVersion: rbac.authorization.k8s.io/v1
 kind: RoleBinding
 metadata:
  name: gpu-sharing-rolebinding
 roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: gpu-sharing-role
 subjects:
 # NOTE(review): no namespace is given for the subject; presumably it resolves
 # to the namespace the RoleBinding is created in - confirm.
 - kind: ServiceAccount
   name: gpu-sharing-sa