llama.cpp shell utils
#!/bin/zsh
# Collection of variables, aliases and functions for working with llama.cpp.
# Source this file to activate.
# HARDCODED VALUES - MAKE SURE TO TUNE THEM FOR YOUR SYSTEM!
# These settings are for an RX 7900 XT and up-to-date Arch Linux.
export ROCM_VERSION="6.0.2"
export USE_ROCM=1
export HIP_PLATFORM="amd"
export GPU_ARCHS="gfx1100"
export HSA_OVERRIDE_GFX_VERSION="11.0.0"
export ROCM_PATH="/opt/rocm"
export TF_PYTHON_VERSION="3.12"
export DEFAULT_ROCM_GPUS="0"
export USE_SYMENGINE=1
# llama.cpp-related variables (tweak if necessary)
export LLAMA_CPP_PATH="${HOME}/.llama.cpp"
export LLAMA_CPP_PYTHON_VENV_PATH="${HOME}/.llama.cpp.venv"
export LLAMA_CPP_CMAKE_ARGS_FOR_ROCM=(
  "-DLLAMA_BUILD_TESTS=OFF"
  "-DLLAMA_BUILD_EXAMPLES=ON"
  "-DLLAMA_BUILD_SERVER=ON"
  "-DLLAMA_STANDALONE=ON"
  "-DLLAMA_CURL=OFF"
  "-DGGML_CCACHE=OFF"
  "-DGGML_NATIVE=ON"
  # CPU acceleration
  "-DGGML_AVX=ON"
  "-DGGML_AVX2=ON"
  "-DGGML_FMA=ON"
  "-DGGML_OPENMP=ON"
  # GPU acceleration
  "-DAMDGPU_TARGETS=${GPU_ARCHS}"
  "-DGGML_HIPBLAS=ON"
  "-DGGML_CUDA_DMMV_X=1024"
  "-DGGML_CUDA_MMV_Y=64"
  "-DGGML_CUDA_GRAPHS=ON"
)
# llama.cpp server default settings
export LLAMA_CPP_SERVER_HOST="steelph0enix.pc"
export LLAMA_CPP_SERVER_PORT="51536"
export LLAMA_CPP_SERVER_URL="http://${LLAMA_CPP_SERVER_HOST}:${LLAMA_CPP_SERVER_PORT}/"
export LLAMA_CPP_SERVER_CONTEXT_SIZE=20480
export LLAMA_CPP_SERVER_BATCH_SIZE=4096
export LLAMA_CPP_SERVER_UBATCH_SIZE=512
export LLAMA_CPP_SERVER_GPU_LAYERS=999
export LLAMA_CPP_SERVER_PRIORITY=1
export LLAMA_CPP_SERVER_CUSTOM_FRONTEND_PATH="$HOME/LLMs/llama_server_frontend"
# System-related variables
export NUMBER_OF_CORES=$(/usr/bin/nproc)
export PATH="${PATH}:${LLAMA_CPP_PATH}/build/bin"
export PYTHONPATH="${PYTHONPATH}:${LLAMA_CPP_PATH}/gguf-py"
# ROCm-related variables
export GFX_ARCH=$GPU_ARCHS
export PYTORCH_ROCM_ARCH=$GPU_ARCHS
export TF_ROCM_AMDGPU_TARGETS=$GPU_ARCHS
export ROCM_INSTALL_DIR=$ROCM_PATH
export ROCM_TOOLKIT_PATH=$ROCM_PATH
export HIP_PATH=$ROCM_PATH
export HIPCXX="${ROCM_PATH}/llvm/bin/clang"
export PATH="${PATH}:${HIP_PATH}"
export ROCR_VISIBLE_DEVICES=$DEFAULT_ROCM_GPUS
export GPU_DEVICE_ORDINAL=$DEFAULT_ROCM_GPUS
export HIP_VISIBLE_DEVICES=$DEFAULT_ROCM_GPUS
export CUDA_VISIBLE_DEVICES=$DEFAULT_ROCM_GPUS
export OMP_DEFAULT_DEVICE=$DEFAULT_ROCM_GPUS
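# Optional sanity check (sketch; assumes the ROCm CLI tools are installed and on PATH):
# the reported architecture should match GPU_ARCHS and the device should be visible.
#   rocminfo | grep -i gfx
#   rocm-smi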
# generic llm-related functions
function llm-server() {
  local model_gguf_path=$1
  local model_name=${2:-${1:t:r}}
  llama-server \
    --threads ${NUMBER_OF_CORES} \
    --prio ${LLAMA_CPP_SERVER_PRIORITY} \
    --prio-batch ${LLAMA_CPP_SERVER_PRIORITY} \
    --ctx-size ${LLAMA_CPP_SERVER_CONTEXT_SIZE} \
    --batch-size ${LLAMA_CPP_SERVER_BATCH_SIZE} \
    --ubatch-size ${LLAMA_CPP_SERVER_UBATCH_SIZE} \
    --flash-attn \
    --mlock \
    --mirostat 2 \
    --gpu-layers ${LLAMA_CPP_SERVER_GPU_LAYERS} \
    --model ${model_gguf_path} \
    --alias ${model_name} \
    --host ${LLAMA_CPP_SERVER_HOST} \
    --port ${LLAMA_CPP_SERVER_PORT} \
    --threads-http ${NUMBER_OF_CORES}
}
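# Example usage (sketch; the model path and alias below are placeholders):
#   llm-server ~/LLMs/my-model.Q8_0.gguf my-model
# Once the server is up, it can be probed via its health endpoint:
#   curl "${LLAMA_CPP_SERVER_URL}health"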
function llm-server-custom-front() {
  local model_gguf_path=$1
  local model_name=${2:-${1:t:r}}
  llama-server \
    --threads ${NUMBER_OF_CORES} \
    --prio ${LLAMA_CPP_SERVER_PRIORITY} \
    --prio-batch ${LLAMA_CPP_SERVER_PRIORITY} \
    --ctx-size ${LLAMA_CPP_SERVER_CONTEXT_SIZE} \
    --batch-size ${LLAMA_CPP_SERVER_BATCH_SIZE} \
    --ubatch-size ${LLAMA_CPP_SERVER_UBATCH_SIZE} \
    --flash-attn \
    --mlock \
    --mirostat 2 \
    --gpu-layers ${LLAMA_CPP_SERVER_GPU_LAYERS} \
    --model ${model_gguf_path} \
    --alias ${model_name} \
    --host ${LLAMA_CPP_SERVER_HOST} \
    --port ${LLAMA_CPP_SERVER_PORT} \
    --path ${LLAMA_CPP_SERVER_CUSTOM_FRONTEND_PATH} \
    --threads-http ${NUMBER_OF_CORES}
}
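# Example usage (sketch; same placeholder paths as above, but the web UI is served
# from LLAMA_CPP_SERVER_CUSTOM_FRONTEND_PATH via --path):
#   llm-server-custom-front ~/LLMs/my-model.Q8_0.gguf my-model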
function llm-quantize-model() {
  local base_model_dir=$1
  local output_quantization=${2:-auto}
  local output_gguf_dir=${3:-.}
  # base_model_dir should point to the model's repository, so the directory name is used as the model name
  local model_name=$(basename $base_model_dir)
  if [ ! -d "$base_model_dir" ]; then
    echo "Error: Model directory '$base_model_dir' does not exist."
    return 1
  fi
  # Run the conversion command
  python $LLAMA_CPP_PATH/convert_hf_to_gguf.py \
    --outtype $output_quantization \
    --outfile $output_gguf_dir/$model_name.$output_quantization.gguf \
    $base_model_dir
  # Check if the conversion was successful
  if [ $? -eq 0 ]; then
    echo "Model '$model_name' successfully quantized to $output_quantization format and saved as $output_gguf_dir/$model_name.$output_quantization.gguf"
  else
    echo "Error: Failed to quantize model '$base_model_dir'."
  fi
}
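# Example usage (sketch; paths are placeholders - a HuggingFace-style model
# directory goes in, a GGUF file comes out):
#   llm-quantize-model ~/LLMs/Some-Model-7B q8_0 ~/LLMs/gguf
# This would produce ~/LLMs/gguf/Some-Model-7B.q8_0.gguf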
# llama.cpp python virtualenv management functions
function llm-llama-venv-initialize() {
  echo "Initializing llama.cpp python virtualenv @ ${LLAMA_CPP_PYTHON_VENV_PATH}"
  python -m venv $LLAMA_CPP_PYTHON_VENV_PATH
  llm-llama-venv-update
}
function llm-llama-venv-activate() {
  source $LLAMA_CPP_PYTHON_VENV_PATH/bin/activate
  echo "llama.cpp python virtualenv activated!"
}
function llm-llama-venv-update() {
  echo "Updating llama.cpp python virtualenv..."
  llm-llama-venv-activate
  # core utils
  python -m pip install --upgrade pip setuptools wheel
  # pytorch
  python -m pip install --pre --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.2
  # llama.cpp requirements
  python -m pip install --upgrade sentencepiece transformers protobuf
  # fine-tuning utils
  python -m pip install --upgrade torchtune trl
}
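# Typical virtualenv workflow (sketch): create the venv once, then activate it
# in new shells; llm-llama-venv-update can be re-run later to pull newer packages.
#   llm-llama-venv-initialize   # first time only
#   llm-llama-venv-activate     # in every new shell that needs the venv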
# llama.cpp management functions
function llm-llama-clone() {
  echo "Pulling llama.cpp repository to ${LLAMA_CPP_PATH}"
  git clone git@github.com:ggerganov/llama.cpp.git $LLAMA_CPP_PATH
  local og_pwd=$(pwd)
  cd $LLAMA_CPP_PATH
  git submodule update --init --recursive
  git lfs pull
  cd $og_pwd
}
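# Note: the clone above uses SSH and assumes a configured GitHub SSH key.
# Cloning over HTTPS works as well if that assumption does not hold:
#   git clone https://github.com/ggerganov/llama.cpp.git "${LLAMA_CPP_PATH}"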
function llm-llama-update() {
  echo "Pulling latest llama.cpp commit..."
  local og_pwd=$(pwd)
  cd $LLAMA_CPP_PATH
  git pull
  git lfs pull
  cd $og_pwd
}
function llm-llama-clean() {
  echo "Cleaning llama.cpp repository of build artifacts and junk..."
  local og_pwd=$(pwd)
  cd $LLAMA_CPP_PATH
  git clean -xdf
  cd $og_pwd
}
function llm-llama-build() {
  local og_pwd=$(pwd)
  local cmake_arguments=("${LLAMA_CPP_CMAKE_ARGS_FOR_ROCM[@]}")
  cd $LLAMA_CPP_PATH
  echo "Generating build files (CMake arguments: $cmake_arguments)"
  cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release $cmake_arguments
  echo "Building llama.cpp..."
  cmake --build build --config Release -j 24
  cd $og_pwd
}
function llm-llama-clean-update() {
  llm-llama-clean
  llm-llama-update
}
function llm-llama-clean-build() {
  llm-llama-clean
  llm-llama-build
}
function llm-llama-clean-update-build() {
  llm-llama-clean-update
  llm-llama-build
}
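# Typical first-time setup and rebuild flow (sketch, assuming the variables
# above are already tuned for the local system):
#   llm-llama-clone                 # fetch the repository
#   llm-llama-venv-initialize       # create the python venv for the conversion scripts
#   llm-llama-build                 # configure & build with the ROCm CMake args
#   llm-llama-clean-update-build    # later: clean, pull latest, rebuild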