Skip to content

Instantly share code, notes, and snippets.

@guruevi
Last active August 26, 2024 14:11
Show Gist options
  • Save guruevi/7d9673c6f44f49b1841eaf49bbd727f9 to your computer and use it in GitHub Desktop.
Save guruevi/7d9673c6f44f49b1841eaf49bbd727f9 to your computer and use it in GitHub Desktop.
Proxmox vGPU Hook Script
#!/usr/bin/env python3
import os
import re
import sys
def get_available_gpu(vgpu_type):
    """Find the first idle NVIDIA vGPU-capable PCI device that can create
    the requested vGPU type.

    Scans /sys/bus/pci/devices/ for devices exposing an ``nvidia`` sysfs
    directory, skips those already hosting a vGPU (current_vgpu_type is
    not ``0``), then substring-matches *vgpu_type* against each line of
    ``creatable_vgpu_types``.

    Returns:
        (device, vgpu_id): the PCI device name and the numeric NVIDIA
        vGPU type ID parsed from the matching line.

    Exits the process with status 404 when no suitable device is found.
    """
    for device in os.listdir('/sys/bus/pci/devices/'):
        # Only NVIDIA vGPU-capable functions expose an 'nvidia' directory
        if not os.path.isdir(f'/sys/bus/pci/devices/{device}/nvidia'):
            continue
        # current_vgpu_type reads '0\n' when the device is free
        with open(f'/sys/bus/pci/devices/{device}/nvidia/current_vgpu_type') as file:
            current_vgpu_type = file.read()
        # If it is in use, continue to the next device
        if current_vgpu_type != '0\n':
            continue
        with open(f'/sys/bus/pci/devices/{device}/nvidia/creatable_vgpu_types') as file:
            available_vgpu_type = file.read()
        # Each line looks like '<id> : <name>'; take the numeric id part
        for line in available_vgpu_type.splitlines():
            if vgpu_type in line:
                print(f'Found available: /sys/bus/pci/devices/{device}')
                print(f'nVIDIA ID, type: {line}')
                vgpu_id = line.split(" : ")[0].strip()
                return device, vgpu_id
    print("No available NVIDIA vGPU found, are virtual functions enabled? (systemctl start nvidia-sriov)")
    # was builtin exit(): that helper comes from the site module and is not
    # guaranteed outside interactive sessions; sys.exit is the correct call
    sys.exit(404)
def parse_vgpu_type_id(config):
    """Extract the NVIDIA vGPU type ID from the VM's tags.

    Looks for a tag of the form ``nvidia-<digits>`` in ``config['tags']``.

    Args:
        config: dict of VM config key -> value strings (see parse_vm_config).

    Returns:
        The numeric type ID as a string, or None when no nvidia tag is
        present — including when the VM has no 'tags' line at all.
    """
    # .get() so a VM without any tags returns None instead of raising KeyError
    match = re.search(r'(.*)nvidia-(\d+)', config.get('tags', ''))
    if match:
        return match.group(2)
    return None
def parse_vgpu_bus_id(config) -> list:
    """Return every vGPU sysfs device path referenced in the VM's args line.

    Args:
        config: dict of VM config key -> value strings (see parse_vm_config).

    Returns:
        A list of '/sys/bus/pci/devices/<addr>' paths; [] when the VM has
        no 'args' line or no vfio-pci devices in it.
    """
    pattern = r'-device vfio-pci,sysfsdev=(/sys/bus/pci/devices/[0-9a-fA-F:.]+)'
    # .get() so a VM without an 'args' line yields [] instead of KeyError;
    # re.findall already returns [] when nothing matches
    return re.findall(pattern, config.get('args', ''))
def parse_vm_config(vmid, from_node):
    """Read the Proxmox VM config file and return it as a key -> value dict.

    When the VM was just migrated (PVE_MIGRATED_FROM is set), the config
    still lives under the source node's directory, so read it from there.

    Args:
        vmid: the VM ID (string) naming the .conf file.
        from_node: source node name, or falsy for the local node.

    Returns:
        Dict of 'key' -> 'value' from each 'key: value' line of the file.
    """
    config_file = f'/etc/pve/qemu-server/{vmid}.conf'
    if from_node:
        config_file = f'/etc/pve/nodes/{from_node}/qemu-server/{vmid}.conf'
    with open(config_file) as file:
        config = file.read()
    config_dict = {}
    for line in config.splitlines():
        # Skip blank lines and non 'key: value' lines (e.g. '[snapshot]'
        # section headers), which previously crashed the unpack below
        if ': ' not in line:
            continue
        # maxsplit=1 keeps values that themselves contain ': ' intact
        key, value = line.split(': ', 1)
        config_dict[key] = value
    return config_dict
def parse_line_config(config_line, item):
    """Parse one comma-separated 'key=value[,key=value...]' config value and
    return the value for *item*.

    Args:
        config_line: e.g. the value of a 'smbios1' line: 'uuid=...,base64=1'.
        item: the key to look up.

    Returns:
        The value for *item*, or None when the key is absent.
    """
    line_dict = {}
    for part in config_line.split(','):
        # partition (not split) so values containing '=' — e.g. base64
        # payloads ending in '==' — don't blow up the unpack; entries
        # without '=' are simply skipped
        key, sep, value = part.partition('=')
        if sep:
            line_dict[key] = value
    return line_dict.get(item, None)
def main():
    """Proxmox qemu hookscript entry point.

    Invocation forms:
        script.py <vmid> <phase>                  (called by Proxmox)
        script.py <vmid> get_command <vgpu_name>  (prints setup commands)

    Phases handled: 'pre-start' allocates the tagged vGPU type on every
    device path found in the VM's 'args' line; 'post-stop' releases them.
    All other phases are no-ops.
    """
    if len(sys.argv) < 3:
        print("Usage: script.py <vmid> <phase>")
        print(" script.py <vmid> get_command <vgpu_name>")
        sys.exit(1)

    vmid, phase = sys.argv[1], sys.argv[2]

    if phase == "get_command":
        if len(sys.argv) < 4:
            print("Usage: script.py <vmid> get_command <vgpu_name>")
            sys.exit(1)
        vgpu_name = sys.argv[3]

    # On migration Proxmox exports PVE_MIGRATED_FROM; the config still
    # lives under the source node's tree at this point.
    from_node = os.environ.get("PVE_MIGRATED_FROM", None)
    config_dict = parse_vm_config(vmid, from_node)

    if phase == 'get_command':
        available_vgpu, gpu_id = get_available_gpu(vgpu_name)
        uuid = parse_line_config(config_dict['smbios1'], 'uuid')
        print(f"qm set {vmid} --hookscript local:snippets/nvidia_allocator.py")
        print(
            f"qm set {vmid} --args \"-device vfio-pci,sysfsdev=/sys/bus/pci/devices/{available_vgpu} -uuid {uuid}\"")
        # Merge the nvidia-<id> tag with whatever tags the VM already has
        tags = set(filter(None, config_dict.get('tags', '').strip().split(';')))
        tags.add(f"nvidia-{gpu_id}")
        print(f"qm set {vmid} --tags \"{';'.join(tags)}\"")
        sys.exit(0)

    # The vGPU type to create comes from the VM's tags
    vgpu_type_id = parse_vgpu_type_id(config_dict)
    if not vgpu_type_id:
        # VM doesn't seem to require a GPU
        sys.exit(0)

    vgpu_paths = parse_vgpu_bus_id(config_dict)
    if not vgpu_paths:
        # No vGPU location specified
        sys.exit(0)

    if phase == 'pre-start':
        # First pass validates every path and frees any stale allocation
        # (covers crashes/migrations where post-stop never ran) before the
        # second pass allocates anything — so a misconfigured later device
        # can't leave an earlier one half-configured.
        for vgpu_path in vgpu_paths:
            if not os.path.exists(vgpu_path):
                print(f"Specified vGPU not found, rerun the nvidia_allocator get_command or check the drivers: {vgpu_path}")
                sys.exit(1)
            stop(vgpu_path)
        for vgpu_path in vgpu_paths:
            # Allocate by writing the type id; any Exception here is left
            # to propagate so the hook aborts with full information
            with open(f'{vgpu_path}/nvidia/current_vgpu_type', 'w') as file:
                file.write(vgpu_type_id)

    if phase == 'post-stop':
        for vgpu_path in vgpu_paths:
            stop(vgpu_path)
def stop(vgpu_path):
    """Release the vGPU at *vgpu_path* by writing '0' to its
    current_vgpu_type sysfs node.

    A missing or unwritable node is treated as already released and only
    logged, so this is safe to call on a device in any state.
    """
    node = f'{vgpu_path}/nvidia/current_vgpu_type'
    try:
        with open(node, 'w') as handle:
            handle.write('0')
    except (FileNotFoundError, PermissionError):
        # Nothing left to release at this path
        print("vGPU already de-allocated")
if __name__ == "__main__":
    main()
    # Defensive: guarantee a zero exit status when main() returns normally
    # (every early-exit path inside main() calls sys.exit itself)
    sys.exit(0)
@ShoGinn
Copy link

ShoGinn commented Aug 1, 2024

Thanks for this! I updated it a bit to incorporate Python 3 features; also, at least on my machine, the extra stop call was causing a problem.

@guruevi
Copy link
Author

guruevi commented Aug 1, 2024

Thanks for this! I updated it a bit to incorporate python3 features as well as at least on my machine, the extra stop was causing a problem

Found the same issue on my systems and updated it since, see the edit. The extra stop is necessary when the machine crashes (not through the Proxmox system) or the system is migrated and 'post-stop' is never called. So this tries to clean it up before restarting it.

I also made it print the correct sequence of commands, once I verify this always works I may just have an extra option to execute them from the script.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment