Last active
February 27, 2024 14:05
-
-
Save aadityabhatia/50574836c727a1add565c7908e22cb98 to your computer and use it in GitHub Desktop.
monitor squeue and send notification when node is allocated
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import subprocess | |
import requests | |
import sys | |
from datetime import datetime | |
import time | |
import random | |
import signal | |
# Lower and upper bound, in seconds, of the randomized pause between two
# squeue polls. Both values are arbitrarily chosen prime numbers.
SLEEP_MIN, SLEEP_MAX = 127, 157
def get_slurm_status_status(username):
    """Return the current squeue entries of *username* as a set of lines.

    Runs ``squeue -u <username> -h -o '%i %N %T'`` and returns one string
    per non-empty output line, with surrounding whitespace removed.

    Args:
        username: User whose jobs are listed (passed to ``squeue -u``).

    Returns:
        A set of status lines; empty when squeue prints nothing.

    Raises:
        subprocess.CalledProcessError: if squeue exits with a non-zero status.
    """
    # The command is executed without a shell, so the format string must not
    # carry its own quotes: squeue would echo them back around every line.
    output = subprocess.check_output(
        ['squeue', '-u', username, '-h', '-o', '%i %N %T']).decode('utf-8')

    # Drop blank lines; a set makes snapshots easy to diff with "-".
    return {
        entry
        for entry in (line.strip() for line in output.splitlines())
        if entry
    }
def send_notification(post_url, message, timeout=30):
    """Send *message* as the body of an HTTP POST to *post_url*.

    The request carries fixed "Title", "Priority" and "Tags" headers.

    Args:
        post_url: URL the notification is posted to.
        message: Notification text (``str``) or already-encoded ``bytes``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the caller indefinitely.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
        Exception: if the server answers with a non-2xx status code.
    """
    # Encode explicitly so non-ASCII text is sent as UTF-8 regardless of the
    # default encoding the HTTP stack applies to str bodies.
    body = message.encode("utf-8") if isinstance(message, str) else message

    response = requests.post(
        post_url,
        data=body,
        headers={
            "Title": "Slurm Status Update",
            "Priority": "max",
            "Tags": "computer",
        },
        timeout=timeout,
    )

    # Any 2xx status means the request was accepted, not only 200.
    if not 200 <= response.status_code < 300:
        raise Exception(
            f"Failed HTTP POST with {response.status_code}")
def _report_changes(previous_status, current_status, post_url):
    """Log the difference between two snapshots and notify about new jobs.

    A notification is sent only when at least one newly seen line contains
    "RUNNING". Returns the snapshot the caller should remember for the next
    comparison: normally *current_status*, but without the new lines when the
    notification failed, so that they are detected (and retried) again.
    """
    added_status = current_status - previous_status
    removed_status = previous_status - current_status
    remembered_status = current_status

    if added_status:
        print(f"{datetime.now()} Added: {added_status}")
        if any("RUNNING" in line for line in added_status):
            # Sort so the message text does not depend on set ordering.
            message = "; ".join(sorted(added_status))
            try:
                send_notification(post_url, message)
            except Exception as err:
                # send_notification signals HTTP errors with a plain
                # Exception, hence the broad clause. Log and retry on the
                # next poll instead of terminating the monitor.
                print(f"{datetime.now()} Notification failed: {err}")
                remembered_status = current_status - added_status
            else:
                print(f"{datetime.now()} Notification sent: {message}")

    if removed_status:
        print(f"{datetime.now()} Removed: {removed_status}")

    return remembered_status


def monitor_slurm_status(username, post_url):
    """Poll the slurm status of *username* forever and notify on changes.

    Every cycle fetches the current status lines, prints the lines that
    appeared or disappeared since the previous cycle, and posts a
    notification to *post_url* when a newly appeared line contains "RUNNING".
    The function never returns; it ends only when an exception that is not
    handled here propagates (for example ``SystemExit`` from a signal
    handler).

    Args:
        username: User whose jobs are monitored.
        post_url: URL notifications are posted to.
    """
    previous_status = set()
    while True:
        try:
            current_status = get_slurm_status_status(username)
        except subprocess.CalledProcessError as err:
            # A single failed squeue call should not end the monitor; keep
            # the previous snapshot and try again after the usual pause.
            print(f"{datetime.now()} squeue failed: {err}")
        else:
            previous_status = _report_changes(
                previous_status, current_status, post_url)

        # Sleep for a random interval between SLEEP_MIN and SLEEP_MAX seconds.
        time.sleep(random.randint(SLEEP_MIN, SLEEP_MAX))
if __name__ == '__main__':
    # Usage: <script> USERNAME POST_URL
    #   USERNAME  user whose jobs are monitored
    #   POST_URL  URL the notifications are posted to
    # Fail with a usage message instead of an IndexError when arguments are
    # missing; extra arguments are ignored.
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} USERNAME POST_URL")
    username, post_url = sys.argv[1], sys.argv[2]

    def signal_handler(sig, frame):
        """Exit cleanly on SIGINT (Ctrl-C) instead of printing a traceback."""
        print("Exiting...")
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    monitor_slurm_status(username, post_url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment