@steren
Last active July 4, 2024 18:25
llamafile container image
cloudbuild.yaml:

steps:
- name: 'gcr.io/cloud-builders/docker'
args: ['build', '-t', 'us-central1-docker.pkg.dev/$PROJECT_ID/containers/llamafile:latest', '.']
- name: 'gcr.io/cloud-builders/docker'
args: ['push', 'us-central1-docker.pkg.dev/$PROJECT_ID/containers/llamafile:latest']
images:
- us-central1-docker.pkg.dev/$PROJECT_ID/containers/llamafile:latest
options:
machineType: 'N1_HIGHCPU_32'
diskSizeGb: '500'
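Assuming the config above is saved as cloudbuild.yaml alongside the Dockerfile, and that the `containers` Artifact Registry repository already exists in your project, the build can be submitted with something like:

```shell
# Run from the directory containing cloudbuild.yaml and the Dockerfile.
# Uses the currently configured gcloud project as $PROJECT_ID.
gcloud builds submit --config cloudbuild.yaml .
```

The large machine type and 500 GB disk in the options block are there because the llamafile download (several GB) and the CUDA install make the build heavy.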
Dockerfile:

FROM ubuntu:22.04
RUN apt-get update -q && apt-get install -y ca-certificates wget
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
RUN dpkg -i cuda-keyring_1.1-1_all.deb
RUN apt-get update && apt-get -y install cuda
# Update this to the URL pointing at the llamafile you want to run.
# Find other models at https://github.com/Mozilla-Ocho/llamafile?tab=readme-ov-file#other-example-llamafiles
ENV LLAMAFILE_DOWNLOAD_URL="https://huggingface.co/jartine/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q4_0.llamafile?download=true"
# Download the llamafile and make it executable
RUN wget -O ./model.llamafile $LLAMAFILE_DOWNLOAD_URL && chmod +x ./model.llamafile
# Use the llamafile executable as container start command
#ENTRYPOINT ["./model.llamafile"]
# Use the NVIDIA GPU, maximize the number of layers offloaded to the GPU, listen on 0.0.0.0, and do not attempt to open a browser
#CMD ["--gpu", "nvidia", "-ngl", "9999", "--host", "0.0.0.0", "--nobrowser"]
# TODO use proper ENTRYPOINT and CMD
ENTRYPOINT ./model.llamafile --gpu nvidia -ngl 9999 --host 0.0.0.0 --nobrowser
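Once the container is running, the llamafile server listens on port 8080 by default and exposes an OpenAI-compatible API. A request might look like the following sketch, where `SERVICE_URL` is a placeholder for your deployed endpoint (e.g. a Cloud Run service URL) and the `model` field is largely informational since the image bundles a single model:

```shell
curl https://SERVICE_URL/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "mistral-7b-instruct-v0.2",
    "messages": [{"role": "user", "content": "Say hello."}]
  }'
```

The shell-form ENTRYPOINT above works but, as the TODO notes, splitting it into an exec-form ENTRYPOINT plus a CMD holding the flags would let users override the arguments at `docker run` time.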