Browse Source

[#60401] watch.py: Added terminating nvidia-smi once readouts from the application is incorrect

Signed-off-by: Grzegorz Latosinski <glatosinski@antmicro.com>
main
Grzegorz Latosinski 2 years ago
parent
commit
a204242eb3
  1. 32
      watch.py

32
watch.py

@ -282,15 +282,15 @@ def watch(session, fsdev, iface, tmpfs_color, other_cache_color):
flags = fcntl.fcntl(sys.stdin, fcntl.F_GETFL) flags = fcntl.fcntl(sys.stdin, fcntl.F_GETFL)
fcntl.fcntl(sys.stdin, fcntl.F_SETFL, flags | os.O_NONBLOCK) fcntl.fcntl(sys.stdin, fcntl.F_SETFL, flags | os.O_NONBLOCK)
readlist = [psar.stdout, sys.stdin]
if pgpu:
readlist.append(pgpu.stdout)
# Gather data from sar output # Gather data from sar output
curr_gpu_util = 0 curr_gpu_util = 0
curr_gpu_mem = 0 curr_gpu_mem = 0
while 1: while 1:
# Await sar output or a command sent from command handler in sargraph.py # Await sar output or a command sent from command handler in sargraph.py
readlist = [psar.stdout, sys.stdin]
if pgpu:
readlist.append(pgpu.stdout)
rlist, _, _ = select.select(readlist, [], [], 0.25) rlist, _, _ = select.select(readlist, [], [], 0.25)
now = datetime.datetime.now() now = datetime.datetime.now()
if sys.stdin in rlist: if sys.stdin in rlist:
@ -394,12 +394,26 @@ def watch(session, fsdev, iface, tmpfs_color, other_cache_color):
if pgpu and pgpu.stdout in rlist: if pgpu and pgpu.stdout in rlist:
line = pgpu.stdout.readline().decode('utf-8') line = pgpu.stdout.readline().decode('utf-8')
curr_gpu_util, curr_gpu_mem = [ if pgpu.poll() is not None:
int(val.strip()) for val in line.split(', ') print("nvidia-smi stopped working, reason:")
] print(line)
if MAX_USED_GPU_RAM < curr_gpu_mem: print(f"Error code: {pgpu.returncode}")
MAX_USED_GPU_RAM = curr_gpu_mem print("Closing the GPU statistics collection")
TOTAL_GPU_LOAD += curr_gpu_util pgpu = None
else:
try:
curr_gpu_util, curr_gpu_mem = [
int(val.strip()) for val in line.split(', ')
]
if MAX_USED_GPU_RAM < curr_gpu_mem:
MAX_USED_GPU_RAM = curr_gpu_mem
TOTAL_GPU_LOAD += curr_gpu_util
except ValueError:
print(f"nvidia-smi error readout: {line}")
if "Unknown Error" in line:
# No valid readouts from now on, let's terminate current nvidia-smi session
pgpu.terminate()
pgpu = None
line = [ line = [
timestamp, timestamp,

Loading…
Cancel
Save