Skip to content

Instantly share code, notes, and snippets.

@guruevi
Last active August 26, 2024 14:11
Show Gist options
  • Save guruevi/7d9673c6f44f49b1841eaf49bbd727f9 to your computer and use it in GitHub Desktop.
Save guruevi/7d9673c6f44f49b1841eaf49bbd727f9 to your computer and use it in GitHub Desktop.
Proxmox vGPU Hook Script
#!/usr/bin/env python3
import os
import re
import sys
def get_available_gpu(vgpu_type):
    """Find the first idle NVIDIA vGPU-capable PCI device that can create
    the requested vGPU type.

    Scans /sys/bus/pci/devices/ for devices exposing an ``nvidia`` sysfs
    directory, skips those already hosting a vGPU (current_vgpu_type is
    not ``0``), then substring-matches *vgpu_type* against each line of
    ``creatable_vgpu_types``.

    Returns:
        (device, vgpu_id): the PCI device name and the numeric NVIDIA
        vGPU type ID parsed from the matching line.

    Exits the process with status 404 when no suitable device is found.
    """
    for device in os.listdir('/sys/bus/pci/devices/'):
        # Only NVIDIA vGPU-capable functions expose an 'nvidia' directory
        if not os.path.isdir(f'/sys/bus/pci/devices/{device}/nvidia'):
            continue
        # current_vgpu_type reads '0\n' when the device is free
        with open(f'/sys/bus/pci/devices/{device}/nvidia/current_vgpu_type') as file:
            current_vgpu_type = file.read()
        # If it is in use, continue to the next device
        if current_vgpu_type != '0\n':
            continue
        with open(f'/sys/bus/pci/devices/{device}/nvidia/creatable_vgpu_types') as file:
            available_vgpu_type = file.read()
        # Each line looks like '<id> : <name>'; take the numeric id part
        for line in available_vgpu_type.splitlines():
            if vgpu_type in line:
                print(f'Found available: /sys/bus/pci/devices/{device}')
                print(f'nVIDIA ID, type: {line}')
                vgpu_id = line.split(" : ")[0].strip()
                return device, vgpu_id
    print("No available NVIDIA vGPU found, are virtual functions enabled? (systemctl start nvidia-sriov)")
    # was builtin exit(): that helper comes from the site module and is not
    # guaranteed outside interactive sessions; sys.exit is the correct call
    sys.exit(404)
def parse_vgpu_type_id(config):
    """Extract the NVIDIA vGPU type ID from the VM's tags.

    Looks for a tag of the form ``nvidia-<digits>`` in ``config['tags']``.

    Args:
        config: dict of VM config key -> value strings (see parse_vm_config).

    Returns:
        The numeric type ID as a string, or None when no nvidia tag is
        present — including when the VM has no 'tags' line at all.
    """
    # .get() so a VM without any tags returns None instead of raising KeyError
    match = re.search(r'(.*)nvidia-(\d+)', config.get('tags', ''))
    if match:
        return match.group(2)
    return None
def parse_vgpu_bus_id(config) -> list:
    """Return every vGPU sysfs device path referenced in the VM's args line.

    Args:
        config: dict of VM config key -> value strings (see parse_vm_config).

    Returns:
        A list of '/sys/bus/pci/devices/<addr>' paths; [] when the VM has
        no 'args' line or no vfio-pci devices in it.
    """
    pattern = r'-device vfio-pci,sysfsdev=(/sys/bus/pci/devices/[0-9a-fA-F:.]+)'
    # .get() so a VM without an 'args' line yields [] instead of KeyError;
    # re.findall already returns [] when nothing matches
    return re.findall(pattern, config.get('args', ''))
def parse_vm_config(vmid, from_node):
    """Read the Proxmox VM config file and return it as a key -> value dict.

    When the VM was just migrated (PVE_MIGRATED_FROM is set), the config
    still lives under the source node's directory, so read it from there.

    Args:
        vmid: the VM ID (string) naming the .conf file.
        from_node: source node name, or falsy for the local node.

    Returns:
        Dict of 'key' -> 'value' from each 'key: value' line of the file.
    """
    config_file = f'/etc/pve/qemu-server/{vmid}.conf'
    if from_node:
        config_file = f'/etc/pve/nodes/{from_node}/qemu-server/{vmid}.conf'
    with open(config_file) as file:
        config = file.read()
    config_dict = {}
    for line in config.splitlines():
        # Skip blank lines and non 'key: value' lines (e.g. '[snapshot]'
        # section headers), which previously crashed the unpack below
        if ': ' not in line:
            continue
        # maxsplit=1 keeps values that themselves contain ': ' intact
        key, value = line.split(': ', 1)
        config_dict[key] = value
    return config_dict
def parse_line_config(config_line, item):
    """Parse one comma-separated 'key=value[,key=value...]' config value and
    return the value for *item*.

    Args:
        config_line: e.g. the value of a 'smbios1' line: 'uuid=...,base64=1'.
        item: the key to look up.

    Returns:
        The value for *item*, or None when the key is absent.
    """
    line_dict = {}
    for part in config_line.split(','):
        # partition (not split) so values containing '=' — e.g. base64
        # payloads ending in '==' — don't blow up the unpack; entries
        # without '=' are simply skipped
        key, sep, value = part.partition('=')
        if sep:
            line_dict[key] = value
    return line_dict.get(item, None)
def main():
    """Proxmox qemu hookscript entry point.

    Invocation forms:
        script.py <vmid> <phase>                  (called by Proxmox)
        script.py <vmid> get_command <vgpu_name>  (prints setup commands)

    Phases handled: 'pre-start' allocates the tagged vGPU type on every
    device path found in the VM's 'args' line; 'post-stop' releases them.
    All other phases are no-ops.
    """
    if len(sys.argv) < 3:
        print("Usage: script.py <vmid> <phase>")
        print(" script.py <vmid> get_command <vgpu_name>")
        sys.exit(1)

    vmid, phase = sys.argv[1], sys.argv[2]

    if phase == "get_command":
        if len(sys.argv) < 4:
            print("Usage: script.py <vmid> get_command <vgpu_name>")
            sys.exit(1)
        vgpu_name = sys.argv[3]

    # On migration Proxmox exports PVE_MIGRATED_FROM; the config still
    # lives under the source node's tree at this point.
    from_node = os.environ.get("PVE_MIGRATED_FROM", None)
    config_dict = parse_vm_config(vmid, from_node)

    if phase == 'get_command':
        available_vgpu, gpu_id = get_available_gpu(vgpu_name)
        uuid = parse_line_config(config_dict['smbios1'], 'uuid')
        print(f"qm set {vmid} --hookscript local:snippets/nvidia_allocator.py")
        print(
            f"qm set {vmid} --args \"-device vfio-pci,sysfsdev=/sys/bus/pci/devices/{available_vgpu} -uuid {uuid}\"")
        # Merge the nvidia-<id> tag with whatever tags the VM already has
        tags = set(filter(None, config_dict.get('tags', '').strip().split(';')))
        tags.add(f"nvidia-{gpu_id}")
        print(f"qm set {vmid} --tags \"{';'.join(tags)}\"")
        sys.exit(0)

    # The vGPU type to create comes from the VM's tags
    vgpu_type_id = parse_vgpu_type_id(config_dict)
    if not vgpu_type_id:
        # VM doesn't seem to require a GPU
        sys.exit(0)

    vgpu_paths = parse_vgpu_bus_id(config_dict)
    if not vgpu_paths:
        # No vGPU location specified
        sys.exit(0)

    if phase == 'pre-start':
        # First pass validates every path and frees any stale allocation
        # (covers crashes/migrations where post-stop never ran) before the
        # second pass allocates anything — so a misconfigured later device
        # can't leave an earlier one half-configured.
        for vgpu_path in vgpu_paths:
            if not os.path.exists(vgpu_path):
                print(f"Specified vGPU not found, rerun the nvidia_allocator get_command or check the drivers: {vgpu_path}")
                sys.exit(1)
            stop(vgpu_path)
        for vgpu_path in vgpu_paths:
            # Allocate by writing the type id; any Exception here is left
            # to propagate so the hook aborts with full information
            with open(f'{vgpu_path}/nvidia/current_vgpu_type', 'w') as file:
                file.write(vgpu_type_id)

    if phase == 'post-stop':
        for vgpu_path in vgpu_paths:
            stop(vgpu_path)
def stop(vgpu_path):
    """Release the vGPU at *vgpu_path* by writing '0' to its
    current_vgpu_type sysfs node.

    A missing or unwritable node is treated as already released and only
    logged, so this is safe to call on a device in any state.
    """
    node = f'{vgpu_path}/nvidia/current_vgpu_type'
    try:
        with open(node, 'w') as handle:
            handle.write('0')
    except (FileNotFoundError, PermissionError):
        # Nothing left to release at this path
        print("vGPU already de-allocated")
if __name__ == "__main__":
    main()
    # Defensive: guarantee a zero exit status when main() returns normally
    # (every early-exit path inside main() calls sys.exit itself)
    sys.exit(0)
@ShoGinn
Copy link

ShoGinn commented Aug 1, 2024

Thanks for this! I updated it a bit to incorporate Python 3 features; also, at least on my machine, the extra stop call was causing a problem.

@guruevi
Copy link
Author

guruevi commented Aug 1, 2024

Thanks for this! I updated it a bit to incorporate python3 features as well as at least on my machine, the extra stop was causing a problem

Found the same issue on my systems and updated it since, see the edit. The extra stop is necessary when the machine crashes (not through the Proxmox system) or the system is migrated and 'post-stop' is never called. So this tries to clean it up before restarting it.

I also made it print the correct sequence of commands, once I verify this always works I may just have an extra option to execute them from the script.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment