llama.cpp shell utils
#!/bin/zsh
# Collection of variables, aliases and functions for working with llama.cpp.
# Source this file to activate.
# HARDCODED VALUES - MAKE SURE TO TUNE THEM FOR YOUR SYSTEM!
# These settings are for an RX 7900 XT and up-to-date Arch Linux.
export ROCM_VERSION="6.0.2"
export USE_ROCM=1
export HIP_PLATFORM="amd"
export GPU_ARCHS="gfx1100"
export HSA_OVERRIDE_GFX_VERSION="11.0.0"
export ROCM_PATH="/opt/rocm"
export TF_PYTHON_VERSION="3.12"
export DEFAULT_ROCM_GPUS="0"
export USE_SYMENGINE=1
# llama.cpp-related variables (tweak if necessary)
export LLAMA_CPP_PATH="${HOME}/.llama.cpp"
export LLAMA_CPP_PYTHON_VENV_PATH="${HOME}/.llama.cpp.venv"
export LLAMA_CPP_CMAKE_ARGS_FOR_ROCM=(
  "-DLLAMA_BUILD_TESTS=OFF"
  "-DLLAMA_BUILD_EXAMPLES=ON"
  "-DLLAMA_BUILD_SERVER=ON"
  "-DLLAMA_STANDALONE=ON"
  "-DLLAMA_CURL=OFF"
  "-DGGML_CCACHE=OFF"
  "-DGGML_NATIVE=ON"
  # CPU acceleration
  "-DGGML_AVX=ON"
  "-DGGML_AVX2=ON"
  "-DGGML_FMA=ON"
  "-DGGML_OPENMP=ON"
  # GPU acceleration
  "-DAMDGPU_TARGETS=${GPU_ARCHS}"
  "-DGGML_HIPBLAS=ON"
  "-DGGML_CUDA_DMMV_X=1024"
  "-DGGML_CUDA_MMV_Y=64"
  "-DGGML_CUDA_GRAPHS=ON"
)
# llama.cpp server default settings
export LLAMA_CPP_SERVER_HOST="steelph0enix.pc"
export LLAMA_CPP_SERVER_PORT="51536"
export LLAMA_CPP_SERVER_URL="http://${LLAMA_CPP_SERVER_HOST}:${LLAMA_CPP_SERVER_PORT}/"
export LLAMA_CPP_SERVER_CONTEXT_SIZE=20480
export LLAMA_CPP_SERVER_BATCH_SIZE=4096
export LLAMA_CPP_SERVER_UBATCH_SIZE=512
export LLAMA_CPP_SERVER_GPU_LAYERS=999
export LLAMA_CPP_SERVER_PRIORITY=1
export LLAMA_CPP_SERVER_CUSTOM_FRONTEND_PATH="$HOME/LLMs/llama_server_frontend"
# System-related variables
export NUMBER_OF_CORES=$(/usr/bin/nproc)
export PATH="${PATH}:${LLAMA_CPP_PATH}/build/bin"
export PYTHONPATH="${PYTHONPATH}:${LLAMA_CPP_PATH}/gguf-py"
# ROCm-related variables
export GFX_ARCH=$GPU_ARCHS
export PYTORCH_ROCM_ARCH=$GPU_ARCHS
export TF_ROCM_AMDGPU_TARGETS=$GPU_ARCHS
export ROCM_INSTALL_DIR=$ROCM_PATH
export ROCM_TOOLKIT_PATH=$ROCM_PATH
export HIP_PATH=$ROCM_PATH
export HIPCXX="${ROCM_PATH}/llvm/bin/clang"
export PATH="${PATH}:${HIP_PATH}"
export ROCR_VISIBLE_DEVICES=$DEFAULT_ROCM_GPUS
export GPU_DEVICE_ORDINAL=$DEFAULT_ROCM_GPUS
export HIP_VISIBLE_DEVICES=$DEFAULT_ROCM_GPUS
export CUDA_VISIBLE_DEVICES=$DEFAULT_ROCM_GPUS
export OMP_DEFAULT_DEVICE=$DEFAULT_ROCM_GPUS
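# Optional sanity check (sketch; assumes the ROCm CLI tools are installed and on PATH):
# the reported architecture should match GPU_ARCHS and the device should be visible.
#   rocminfo | grep -i gfx
#   rocm-smi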
# generic llm-related functions
function llm-server() {
  local model_gguf_path=$1
  local model_name=${2:-${1:t:r}}
  llama-server \
    --threads ${NUMBER_OF_CORES} \
    --prio ${LLAMA_CPP_SERVER_PRIORITY} \
    --prio-batch ${LLAMA_CPP_SERVER_PRIORITY} \
    --ctx-size ${LLAMA_CPP_SERVER_CONTEXT_SIZE} \
    --batch-size ${LLAMA_CPP_SERVER_BATCH_SIZE} \
    --ubatch-size ${LLAMA_CPP_SERVER_UBATCH_SIZE} \
    --flash-attn \
    --mlock \
    --mirostat 2 \
    --gpu-layers ${LLAMA_CPP_SERVER_GPU_LAYERS} \
    --model ${model_gguf_path} \
    --alias ${model_name} \
    --host ${LLAMA_CPP_SERVER_HOST} \
    --port ${LLAMA_CPP_SERVER_PORT} \
    --threads-http ${NUMBER_OF_CORES}
}
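# Example usage (sketch; the model path and alias below are placeholders):
#   llm-server ~/LLMs/my-model.Q8_0.gguf my-model
# Once the server is up, it can be probed via its health endpoint:
#   curl "${LLAMA_CPP_SERVER_URL}health"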
function llm-server-custom-front() {
  local model_gguf_path=$1
  local model_name=${2:-${1:t:r}}
  llama-server \
    --threads ${NUMBER_OF_CORES} \
    --prio ${LLAMA_CPP_SERVER_PRIORITY} \
    --prio-batch ${LLAMA_CPP_SERVER_PRIORITY} \
    --ctx-size ${LLAMA_CPP_SERVER_CONTEXT_SIZE} \
    --batch-size ${LLAMA_CPP_SERVER_BATCH_SIZE} \
    --ubatch-size ${LLAMA_CPP_SERVER_UBATCH_SIZE} \
    --flash-attn \
    --mlock \
    --mirostat 2 \
    --gpu-layers ${LLAMA_CPP_SERVER_GPU_LAYERS} \
    --model ${model_gguf_path} \
    --alias ${model_name} \
    --host ${LLAMA_CPP_SERVER_HOST} \
    --port ${LLAMA_CPP_SERVER_PORT} \
    --path ${LLAMA_CPP_SERVER_CUSTOM_FRONTEND_PATH} \
    --threads-http ${NUMBER_OF_CORES}
}
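# Example usage (sketch; same placeholder paths as above, but the web UI is served
# from LLAMA_CPP_SERVER_CUSTOM_FRONTEND_PATH via --path):
#   llm-server-custom-front ~/LLMs/my-model.Q8_0.gguf my-model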
function llm-quantize-model() {
  local base_model_dir=$1
  local output_quantization=${2:-auto}
  local output_gguf_dir=${3:-.}
  # base_model_dir should point to the model's repository, so the directory name is used as the model name
  local model_name=$(basename $base_model_dir)
  if [ ! -d "$base_model_dir" ]; then
    echo "Error: Model directory '$base_model_dir' does not exist."
    return 1
  fi
  # Run the conversion command
  python $LLAMA_CPP_PATH/convert_hf_to_gguf.py \
    --outtype $output_quantization \
    --outfile $output_gguf_dir/$model_name.$output_quantization.gguf \
    $base_model_dir
  # Check if the conversion was successful
  if [ $? -eq 0 ]; then
    echo "Model '$model_name' successfully quantized to $output_quantization format and saved as $output_gguf_dir/$model_name.$output_quantization.gguf"
  else
    echo "Error: Failed to quantize model '$base_model_dir'."
  fi
}
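# Example usage (sketch; paths are placeholders - a HuggingFace-style model
# directory goes in, a GGUF file comes out):
#   llm-quantize-model ~/LLMs/Some-Model-7B q8_0 ~/LLMs/gguf
# This would produce ~/LLMs/gguf/Some-Model-7B.q8_0.gguf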
# llama.cpp python virtualenv management functions
function llm-llama-venv-initialize() {
  echo "Initializing llama.cpp python virtualenv @ ${LLAMA_CPP_PYTHON_VENV_PATH}"
  python -m venv $LLAMA_CPP_PYTHON_VENV_PATH
  llm-llama-venv-update
}
function llm-llama-venv-activate() {
  source $LLAMA_CPP_PYTHON_VENV_PATH/bin/activate
  echo "llama.cpp python virtualenv activated!"
}
function llm-llama-venv-update() {
  echo "Updating llama.cpp python virtualenv..."
  llm-llama-venv-activate
  # core utils
  python -m pip install --upgrade pip setuptools wheel
  # pytorch
  python -m pip install --pre --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.2
  # llama.cpp requirements
  python -m pip install --upgrade sentencepiece transformers protobuf
  # fine-tuning utils
  python -m pip install --upgrade torchtune trl
}
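# Typical virtualenv workflow (sketch): create the venv once, then activate it
# in new shells; llm-llama-venv-update can be re-run later to pull newer packages.
#   llm-llama-venv-initialize   # first time only
#   llm-llama-venv-activate     # in every new shell that needs the venv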
# llama.cpp management functions
function llm-llama-clone() {
  echo "Pulling llama.cpp repository to ${LLAMA_CPP_PATH}"
  git clone git@github.com:ggerganov/llama.cpp.git $LLAMA_CPP_PATH
  local og_pwd=$(pwd)
  cd $LLAMA_CPP_PATH
  git submodule update --init --recursive
  git lfs pull
  cd $og_pwd
}
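# Note: the clone above uses SSH and assumes a configured GitHub SSH key.
# Cloning over HTTPS works as well if that assumption does not hold:
#   git clone https://github.com/ggerganov/llama.cpp.git "${LLAMA_CPP_PATH}"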
function llm-llama-update() {
  echo "Pulling latest llama.cpp commit..."
  local og_pwd=$(pwd)
  cd $LLAMA_CPP_PATH
  git pull
  git lfs pull
  cd $og_pwd
}
function llm-llama-clean() {
  echo "Cleaning llama.cpp repository of build artifacts and junk..."
  local og_pwd=$(pwd)
  cd $LLAMA_CPP_PATH
  git clean -xdf
  cd $og_pwd
}
function llm-llama-build() {
  local og_pwd=$(pwd)
  local cmake_arguments=("${LLAMA_CPP_CMAKE_ARGS_FOR_ROCM[@]}")
  cd $LLAMA_CPP_PATH
  echo "Generating build files (CMake arguments: $cmake_arguments)"
  cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release $cmake_arguments
  echo "Building llama.cpp..."
  cmake --build build --config Release -j 24
  cd $og_pwd
}
function llm-llama-clean-update() {
  llm-llama-clean
  llm-llama-update
}
function llm-llama-clean-build() {
  llm-llama-clean
  llm-llama-build
}
function llm-llama-clean-update-build() {
  llm-llama-clean-update
  llm-llama-build
}
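# Typical first-time setup and rebuild flow (sketch, assuming the variables
# above are already tuned for the local system):
#   llm-llama-clone                 # fetch the repository
#   llm-llama-venv-initialize       # create the python venv for the conversion scripts
#   llm-llama-build                 # configure & build with the ROCm CMake args
#   llm-llama-clean-update-build    # later: clean, pull latest, rebuild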