#!/bin/zsh
# Collection of variables, aliases and functions to work w/ llama.cpp
# Source to activate.
# HARDCODED VALUES - MAKE SURE TO TUNE THEM FOR YOUR SYSTEM!
# These settings are for RX 7900 XT & latest Arch Linux
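# Example (hypothetical file location): source this file from your shell config, e.g. in ~/.zshrc:
#   source ~/llama.cpp.zsh
# and adjust GPU_ARCHS, ROCM_VERSION, the server host/port and the other values below to match your setup.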
export ROCM_VERSION="6.0.2"
export USE_ROCM=1
export HIP_PLATFORM="amd"
export GPU_ARCHS="gfx1100"
export HSA_OVERRIDE_GFX_VERSION="11.0.0"
export ROCM_PATH="/opt/rocm"
export TF_PYTHON_VERSION="3.12"
export DEFAULT_ROCM_GPUS="0"
export USE_SYMENGINE=1
# llama.cpp-related variables (tweak if necessary)
export LLAMA_CPP_PATH="${HOME}/.llama.cpp"
export LLAMA_CPP_PYTHON_VENV_PATH="${HOME}/.llama.cpp.venv"
export LLAMA_CPP_CMAKE_ARGS_FOR_ROCM=(
    "-DLLAMA_BUILD_TESTS=OFF"
    "-DLLAMA_BUILD_EXAMPLES=ON"
    "-DLLAMA_BUILD_SERVER=ON"
    "-DLLAMA_STANDALONE=ON"
    "-DLLAMA_CURL=OFF"
    "-DGGML_CCACHE=OFF"
    "-DGGML_NATIVE=ON"
    # CPU acceleration
    "-DGGML_AVX=ON"
    "-DGGML_AVX2=ON"
    "-DGGML_FMA=ON"
    "-DGGML_OPENMP=ON"
    # GPU acceleration
    "-DAMDGPU_TARGETS=${GPU_ARCHS}"
    "-DGGML_HIPBLAS=ON"
    "-DGGML_CUDA_DMMV_X=1024"
    "-DGGML_CUDA_MMV_Y=64"
    "-DGGML_CUDA_GRAPHS=ON"
)
# llama.cpp server default settings
export LLAMA_CPP_SERVER_HOST="steelph0enix.pc"
export LLAMA_CPP_SERVER_PORT="51536"
export LLAMA_CPP_SERVER_URL="http://${LLAMA_CPP_SERVER_HOST}:${LLAMA_CPP_SERVER_PORT}/"
export LLAMA_CPP_SERVER_CONTEXT_SIZE=20480
export LLAMA_CPP_SERVER_BATCH_SIZE=4096
export LLAMA_CPP_SERVER_UBATCH_SIZE=512
export LLAMA_CPP_SERVER_GPU_LAYERS=999
export LLAMA_CPP_SERVER_PRIORITY=1
export LLAMA_CPP_SERVER_CUSTOM_FRONTEND_PATH="$HOME/LLMs/llama_server_frontend"
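# Example: with a server running on the address above, a quick health check could look like
# this (llama-server exposes a /health endpoint):
#   curl "${LLAMA_CPP_SERVER_URL}health"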
# System-related variables
export NUMBER_OF_CORES=$(/usr/bin/nproc)
export PATH="${PATH}:${LLAMA_CPP_PATH}/build/bin"
export PYTHONPATH="${PYTHONPATH}:${LLAMA_CPP_PATH}/gguf-py"
# ROCm-related variables
export GFX_ARCH=$GPU_ARCHS
export PYTORCH_ROCM_ARCH=$GPU_ARCHS
export TF_ROCM_AMDGPU_TARGETS=$GPU_ARCHS
export ROCM_INSTALL_DIR=$ROCM_PATH
export ROCM_TOOLKIT_PATH=$ROCM_PATH
export HIP_PATH=$ROCM_PATH
export HIPCXX="${ROCM_PATH}/llvm/bin/clang"
export PATH="${PATH}:${HIP_PATH}"
export ROCR_VISIBLE_DEVICES=$DEFAULT_ROCM_GPUS
export GPU_DEVICE_ORDINAL=$DEFAULT_ROCM_GPUS
export HIP_VISIBLE_DEVICES=$DEFAULT_ROCM_GPUS
export CUDA_VISIBLE_DEVICES=$DEFAULT_ROCM_GPUS
export OMP_DEFAULT_DEVICE=$DEFAULT_ROCM_GPUS
# generic llm-related functions
function llm-server() {
    local model_gguf_path=$1
    local model_name=${2:-${1:t:r}}
    llama-server \
        --threads ${NUMBER_OF_CORES} \
        --prio ${LLAMA_CPP_SERVER_PRIORITY} \
        --prio-batch ${LLAMA_CPP_SERVER_PRIORITY} \
        --ctx-size ${LLAMA_CPP_SERVER_CONTEXT_SIZE} \
        --batch-size ${LLAMA_CPP_SERVER_BATCH_SIZE} \
        --ubatch-size ${LLAMA_CPP_SERVER_UBATCH_SIZE} \
        --flash-attn \
        --mlock \
        --mirostat 2 \
        --gpu-layers ${LLAMA_CPP_SERVER_GPU_LAYERS} \
        --model ${model_gguf_path} \
        --alias ${model_name} \
        --host ${LLAMA_CPP_SERVER_HOST} \
        --port ${LLAMA_CPP_SERVER_PORT} \
        --threads-http ${NUMBER_OF_CORES}
}
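# Example usage (hypothetical model path; the alias defaults to the file name without its extension):
#   llm-server ~/LLMs/Meta-Llama-3.1-8B-Instruct.Q8_0.gguf
#   llm-server ~/LLMs/Meta-Llama-3.1-8B-Instruct.Q8_0.gguf llama-3.1-8b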
function llm-server-custom-front() {
    local model_gguf_path=$1
    local model_name=${2:-${1:t:r}}
    llama-server \
        --threads ${NUMBER_OF_CORES} \
        --prio ${LLAMA_CPP_SERVER_PRIORITY} \
        --prio-batch ${LLAMA_CPP_SERVER_PRIORITY} \
        --ctx-size ${LLAMA_CPP_SERVER_CONTEXT_SIZE} \
        --batch-size ${LLAMA_CPP_SERVER_BATCH_SIZE} \
        --ubatch-size ${LLAMA_CPP_SERVER_UBATCH_SIZE} \
        --flash-attn \
        --mlock \
        --mirostat 2 \
        --gpu-layers ${LLAMA_CPP_SERVER_GPU_LAYERS} \
        --model ${model_gguf_path} \
        --alias ${model_name} \
        --host ${LLAMA_CPP_SERVER_HOST} \
        --port ${LLAMA_CPP_SERVER_PORT} \
        --path ${LLAMA_CPP_SERVER_CUSTOM_FRONTEND_PATH} \
        --threads-http ${NUMBER_OF_CORES}
}
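# Example usage - same as llm-server, but the web UI is served from
# LLAMA_CPP_SERVER_CUSTOM_FRONTEND_PATH via --path (hypothetical model path):
#   llm-server-custom-front ~/LLMs/Meta-Llama-3.1-8B-Instruct.Q8_0.gguf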
function llm-quantize-model() {
    local base_model_dir=$1
    local output_quantization=${2:-auto}
    local output_gguf_dir=${3:-.}
    # base_model_dir should point to a repository, so the dir's name should be the model's name
    local model_name=$(basename "$base_model_dir")
    if [ ! -d "$base_model_dir" ]; then
        echo "Error: Model directory '$base_model_dir' does not exist."
        return 1
    fi
    # Run the conversion command
    python "$LLAMA_CPP_PATH/convert_hf_to_gguf.py" \
        --outtype "$output_quantization" \
        --outfile "$output_gguf_dir/$model_name.$output_quantization.gguf" \
        "$base_model_dir"
    # Check if the conversion was successful
    if [ $? -eq 0 ]; then
        echo "Model '$model_name' successfully quantized to $output_quantization format and saved as $output_gguf_dir/$model_name.$output_quantization.gguf"
    else
        echo "Error: Failed to quantize model '$base_model_dir'."
    fi
}
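# Example usage (hypothetical paths; convert_hf_to_gguf.py accepts outtypes such as f16, bf16, q8_0 or auto):
#   llm-quantize-model ~/LLMs/Meta-Llama-3.1-8B-Instruct q8_0 ~/LLMs/gguf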
# llama.cpp python virtualenv management functions
function llm-llama-venv-initialize() {
    echo "Initializing llama.cpp python virtualenv @ ${LLAMA_CPP_PYTHON_VENV_PATH}"
    python -m venv $LLAMA_CPP_PYTHON_VENV_PATH
    llm-llama-venv-update
}
function llm-llama-venv-activate() {
    source $LLAMA_CPP_PYTHON_VENV_PATH/bin/activate
    echo "llama.cpp python virtualenv activated!"
}
function llm-llama-venv-update() {
    echo "Updating llama.cpp python virtualenv..."
    llm-llama-venv-activate
    # core utils
    python -m pip install --upgrade pip setuptools wheel
    # pytorch
    python -m pip install --pre --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.2
    # llama.cpp requirements
    python -m pip install --upgrade sentencepiece transformers protobuf
    # fine-tuning utils
    python -m pip install --upgrade torchtune trl
}
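# Example: on a fresh setup, run llm-llama-venv-initialize once (it also performs the first
# dependency install); in later shells just call llm-llama-venv-activate, and run
# llm-llama-venv-update whenever the python dependencies should be refreshed.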
# llama.cpp management functions
function llm-llama-clone() {
    echo "Pulling llama.cpp repository to ${LLAMA_CPP_PATH}"
    git clone git@github.com:ggerganov/llama.cpp.git $LLAMA_CPP_PATH
    local og_pwd=$(pwd)
    cd $LLAMA_CPP_PATH
    git submodule update --init --recursive
    git lfs pull
    cd $og_pwd
}
function llm-llama-update() {
    echo "Pulling latest llama.cpp commit..."
    local og_pwd=$(pwd)
    cd $LLAMA_CPP_PATH
    git pull
    git lfs pull
    cd $og_pwd
}
function llm-llama-clean() {
    echo "Clearing llama.cpp repository of any build artifacts and junk..."
    local og_pwd=$(pwd)
    cd $LLAMA_CPP_PATH
    git clean -xddf
    cd $og_pwd
}
function llm-llama-build() {
    local og_pwd=$(pwd)
    local cmake_arguments=("${LLAMA_CPP_CMAKE_ARGS_FOR_ROCM[@]}")
    cd $LLAMA_CPP_PATH
    echo "Generating build files (CMake arguments: ${cmake_arguments[*]})"
    cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release "${cmake_arguments[@]}"
    echo "Building llama.cpp..."
    cmake --build build --config Release -j ${NUMBER_OF_CORES}
    cd $og_pwd
}
function llm-llama-clean-update() {
    llm-llama-clean
    llm-llama-update
}
function llm-llama-clean-build() {
    llm-llama-clean
    llm-llama-build
}
function llm-llama-clean-update-build() {
    llm-llama-clean-update
    llm-llama-build
}
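# Example end-to-end workflow on a fresh machine (model path is hypothetical):
#   llm-llama-clone                # fetch the repository into LLAMA_CPP_PATH
#   llm-llama-venv-initialize      # set up the python virtualenv and install dependencies
#   llm-llama-build                # build llama.cpp with the ROCm CMake arguments above
#   llm-server ~/LLMs/Meta-Llama-3.1-8B-Instruct.Q8_0.gguf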