mirror of https://github.com/antmicro/sargraph.git
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
354 lines
11 KiB
#!/usr/bin/env python3 |
|
|
|
# |
|
# (c) 2019-2023 Antmicro <www.antmicro.com> |
|
# License: Apache-2.0 |
|
# |
|
|
|
|
|
import datetime |
|
import fcntl |
|
import os |
|
import re |
|
import select |
|
import signal |
|
import subprocess |
|
import sys |
|
import time |
|
|
|
import graph |
|
|
|
from common import * |
|
|
|
# Set to 1 (by the SIGTERM handler or a 'q' command) to stop the watch() loop
die = 0


# Initialize summary variables
SAMPLE_NUMBER = 0  # number of sar samples processed so far
TOTAL_RAM = 0      # total RAM in kB (from /proc/meminfo or sar's kbmemused+kbmemfree)
START_DATE = ""    # timestamp of the first sample, '%Y-%m-%d %H:%M:%S'
END_DATE = ""      # timestamp of the latest sample, same format
TOTAL_LOAD = 0.0   # running sum of per-sample %user CPU load (averaged in summarize)
MAX_USED_RAM = 0   # peak RAM usage seen, kB
MAX_USED_FS = 0    # peak filesystem usage seen, MB
MAX_TX = 0         # peak transmit rate seen, kB/s
MAX_RX = 0         # peak receive rate seen, kB/s
TOTAL_FS = 0       # total size of the observed filesystem, MB

TOTAL_GPU_LOAD = 0.0  # running sum of per-sample GPU utilization %
TOTAL_GPU_RAM = 0     # total GPU memory, MiB (stays 0 when no GPU is present)
MAX_USED_GPU_RAM = 0  # peak GPU memory usage seen, MiB

FS_NAME = None        # name of the observed filesystem device
FS_SAR_INDEX = None   # row index of that device in sar's FILESYSTEM table

IFACE_NAME = None     # name of the observed network interface
IFACE_SAR_INDEX = None  # row index of that interface in sar's DEV table
|
|
|
# Handle SIGTERM |
|
# Handle SIGTERM
def kill_handler(a, b):
    """Request a clean shutdown of the watch() loop.

    Installed via signal.signal(signal.SIGTERM, kill_handler) in watch().
    a -- signal number, b -- current stack frame (both unused).
    """
    global die
    die = 1
|
|
|
|
|
# Read a single table from sar output |
|
# Read a single table from sar output
def read_table(f):
    """Parse one whitespace-separated table from sar's (binary) stdout.

    sar separates its tables with blank lines; the first non-blank line
    is the column header. Returns a dict mapping column title to a list
    of string values, with the first column renamed to 'time'.

    Raises EOFError if the stream ends before a header is found -- the
    previous version spun forever here when the sar pipe closed, because
    readline() keeps returning b'' at EOF.
    """
    # Find the header, skipping blank separator lines
    while True:
        line = f.readline()
        if not line:
            # EOF: sar exited or the pipe was closed
            raise EOFError("unexpected end of sar output")
        header = line.decode().split()
        if len(header) > 0:
            break

    # The first column is always just time
    header[0] = 'time'

    table = {title: [] for title in header}

    # Read rows until a blank separator line (readline() at EOF also
    # yields an empty row, ending the table cleanly)
    while True:
        row = f.readline().decode().split()
        if len(row) <= 0:
            break

        for i, value in enumerate(row):
            table[header[i]].append(value)

    return table
|
|
|
|
|
# Initialize 'data.txt' where the data is dumped |
|
# Initialize 'data.txt' where the data is dumped
def initialize(session, machine):
    """Create '<session>.txt' and write the header line describing the host.

    machine -- the banner line sar prints first (uname-style, ends with
               e.g. '... (8 CPU)').

    Side effects: sets TOTAL_RAM (kB, from /proc/meminfo) and, when
    nvidia-smi is available, TOTAL_GPU_RAM (MiB, as reported by the driver).
    """
    global TOTAL_RAM
    global TOTAL_GPU_RAM

    with open("/proc/meminfo") as f:
        # MemTotal is reported in kB; raw string avoids the invalid
        # escape-sequence warning the old "\s"/"\d" literals produced
        TOTAL_RAM = int(scan(r"MemTotal:\s+(\d+)", float, f.read()))

    uname = machine.split(" ")[0:2]
    uname = f"{uname[0]} {uname[1]}"

    # CPU count is the number inside the trailing "(N CPU)" of the banner
    cpus = int(machine.split(" CPU)")[0].split("(")[-1])

    cpu_name = "unknown"

    with open("/proc/cpuinfo") as f:
        for line in f:
            if "model name" in line:
                cpu_name = line.replace("\n", "").split(": ")[1]
                break
    header = [
        f"# sargraph version: {SARGRAPH_VERSION}",
        f"pid: {os.getpid()}",
        f"machine: {uname}",
        f"cpu count: {cpus}",
        f"cpu: {cpu_name}"
    ]
    try:
        pgpu = subprocess.run(
            'nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader,nounits'.split(' '),
            capture_output=True
        )
        if pgpu.returncode == 0:
            # NOTE(review): rsplit parses a single CSV line -- assumes one
            # GPU; multi-GPU output would garble these fields. Confirm.
            gpuname, gpudriver, memory_total = pgpu.stdout.decode('utf-8').rsplit(', ', 2)
            header.extend([
                f"gpu: {gpuname}",
                f"gpu driver: {gpudriver}"
            ])
            TOTAL_GPU_RAM = int(memory_total)
    except Exception as e:
        # nvidia-smi missing or output unparsable: log and continue CPU-only
        print(e)
    with open(f"{session}.txt", "w") as f:
        print(*header, sep=", ", file=f)
|
|
|
|
|
# Add a summary comment to 'data.txt' |
|
# Add a summary comment to 'data.txt'
def summarize(session):
    """Append a '#'-prefixed summary record to '<session>.txt'.

    Converts the accumulated module globals (kB / MB / kB/s counters)
    into bytes and Mb/s, computes the average CPU load and run duration,
    and writes everything as one comma-separated comment line.
    """
    # Is there anything to be summarized?
    if SAMPLE_NUMBER == 0:
        return

    samples = float(SAMPLE_NUMBER)
    average_load = TOTAL_LOAD / samples
    # RAM counters are kept in kB, filesystem counters in MB -> bytes
    max_used_ram = MAX_USED_RAM * 1024.0
    total_ram = TOTAL_RAM * 1024.0
    max_used_fs = MAX_USED_FS * 1024.0 * 1024.0
    total_fs = TOTAL_FS * 1024 * 1024
    # Network peaks are stored in kB/s; 128 kB/s == 1 Mb/s
    max_tx = MAX_TX / 128
    max_rx = MAX_RX / 128

    stamp_format = '%Y-%m-%d %H:%M:%S'
    started = datetime.datetime.strptime(START_DATE, stamp_format)
    ended = datetime.datetime.strptime(END_DATE, stamp_format)
    delta_t = (ended - started).total_seconds()

    summary = [
        f"# total ram: {total_ram:.2f} B",
        f"total disk space: {total_fs:.2f} B",
        f"max ram used: {max_used_ram:.2f} B",
        f"max disk used: {max_used_fs:.2f} B",
        f"average load: {average_load:.2f} %",
        f"observed disk: {FS_NAME}",
        f"max data received: {max_rx:.2f} Mb/s",
        f"max data sent: {max_tx:.2f} Mb/s",
        f"observed network: {IFACE_NAME}",
        f"duration: {delta_t} seconds"
    ]

    # GPU fields only appear when a GPU was detected at startup
    if TOTAL_GPU_RAM != 0:
        summary += [
            f"total gpu ram: {TOTAL_GPU_RAM * 1024 * 1024:.2f} B",  # default units are MiB
            f"max gpu ram used: {MAX_USED_GPU_RAM * 1024 * 1024:.2f} B",  # default units are MiB
            f"average gpu load: {TOTAL_GPU_LOAD / SAMPLE_NUMBER:.2f} %"
        ]

    with open(f"{session}.txt", "a") as f:
        f.write(", ".join(summary) + "\n")
|
|
|
|
|
# Run sar and gather data from it |
|
# Run sar and gather data from it
def watch(session, fsdev, iface):
    """Main collection loop: sample sar (and optionally nvidia-smi) forever.

    Spawns `sar` with a 1-second interval, polls its output together with
    commands arriving on stdin ('q:' quit, 's:' save/plot, 'label:'
    annotate), appends one data row per sample to '<session>.txt' and
    keeps the module-level summary globals updated.

    session -- base name of the data file ('<session>.txt')
    fsdev   -- filesystem device to observe, or falsy to auto-pick the largest
    iface   -- network interface to observe, or falsy to auto-pick the one
               currently receiving the most data
    """
    global SAMPLE_NUMBER
    global START_DATE
    global END_DATE
    global TOTAL_LOAD
    global MAX_USED_RAM
    global MAX_USED_FS
    global MAX_TX
    global MAX_RX
    global TOTAL_FS
    global FS_SAR_INDEX
    global TOTAL_RAM
    global FS_NAME
    global IFACE_NAME
    global IFACE_SAR_INDEX
    global TOTAL_GPU_LOAD
    global TOTAL_GPU_RAM
    global MAX_USED_GPU_RAM

    global die

    # Was a graph already produced by save command from sargraph?
    dont_plot = False

    my_env = os.environ
    # Make sar emit ISO (24h) timestamps so parsing is locale-independent
    my_env["S_TIME_FORMAT"] = "ISO"
    # -F filesystems, -u CPU, -r memory, -n DEV network; 1-second interval.
    # The tables below are read back in exactly this order.
    p = run_or_fail("sar", "-F", "-u", "-r", "-n", "DEV", "1", stdout=subprocess.PIPE, env=my_env)
    # subprocess for GPU data fetching in the background
    try:
        pgpu = subprocess.Popen(
            'nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv,noheader,nounits -l 1'.split(' '),
            stdout=subprocess.PIPE,
            env=my_env
        )
    except Exception:
        # No nvidia-smi available: run without GPU stats. (Was a bare
        # 'except:', which would also swallow KeyboardInterrupt/SystemExit.)
        pgpu = None

    # First sar line is the machine banner, second is blank
    machine = p.stdout.readline().decode()
    initialize(session, machine)
    p.stdout.readline()

    signal.signal(signal.SIGTERM, kill_handler)

    # Make stdin nonblocking to continue working when no command is sent
    flags = fcntl.fcntl(sys.stdin, fcntl.F_GETFL)
    fcntl.fcntl(sys.stdin, fcntl.F_SETFL, flags | os.O_NONBLOCK)

    readlist = [p.stdout, sys.stdin]
    if pgpu:
        readlist.append(pgpu.stdout)
    # Last GPU readings, carried over between samples
    curr_gpu_util = 0
    curr_gpu_mem = 0

    # Gather data from sar output
    while 1:
        # Await sar output or a command sent from command handler in sargraph.py
        rlist, _, _ = select.select(readlist, [], [], 0.25)
        now = datetime.datetime.now()
        if sys.stdin in rlist:
            label_line = sys.stdin.readline().replace("\n", "")
            if label_line.startswith("command:"):
                label_line = label_line[len("command:"):]
            if label_line.startswith("q:"):
                # Quit: summarize, optionally plot a final graph, stop loop
                label_line = label_line[len("q:"):]

                summarize(session)
                if label_line == "none":
                    pass
                elif label_line:
                    graph.graph(session, label_line)
                elif not dont_plot:
                    graph.graph(session)
                dont_plot = True
                die = 1
                break
            elif label_line.startswith("s:"):
                # Save: plot now and suppress the automatic plot on exit
                label_line = label_line[len("s:"):]

                dont_plot = True

                if label_line != "none":
                    summarize(session)
                    if not label_line:
                        graph.graph(session)
                    else:
                        graph.graph(session, label_line)
            elif label_line.startswith('label:'):
                # Label: record an annotation comment in the data file
                label_line = label_line[len('label:'):]
                with open(f"{session}.txt", "a") as f:
                    timestamp = now.strftime("%Y-%m-%d-%H:%M:%S")
                    print(f"# {timestamp} label: {label_line}", file=f)
        if p.stdout not in rlist:
            continue

        date = now.strftime("%Y-%m-%d")
        daytime = now.strftime("%H:%M:%S")

        # Read and process CPU data
        cpu_data = read_table(p.stdout)
        if START_DATE == "":
            START_DATE = date + " " + daytime
        TOTAL_LOAD += stof(cpu_data["%user"][0])
        SAMPLE_NUMBER += 1

        # Read and process RAM data
        ram_data = read_table(p.stdout)
        if TOTAL_RAM == 0:
            TOTAL_RAM = (int(ram_data['kbmemused'][0]) + int(ram_data['kbmemfree'][0]))
        if MAX_USED_RAM < int(ram_data['kbmemused'][0]):
            MAX_USED_RAM = int(ram_data['kbmemused'][0])

        # Read and process network data
        net_data = read_table(p.stdout)
        if IFACE_SAR_INDEX is None:
            if iface:
                IFACE_SAR_INDEX = net_data['IFACE'].index(iface)
            else:
                # Auto-pick the interface currently receiving the most data
                maxj, maxv = 0, 0
                for j, used in enumerate(net_data['IFACE']):
                    v = stof(net_data['rxkB/s'][j])
                    if maxv < v:
                        maxj, maxv = j, v
                IFACE_SAR_INDEX = maxj
        if IFACE_NAME is None:
            IFACE_NAME = net_data['IFACE'][IFACE_SAR_INDEX]
        if MAX_RX < stof(net_data['rxkB/s'][IFACE_SAR_INDEX]):
            MAX_RX = stof(net_data['rxkB/s'][IFACE_SAR_INDEX])
        if MAX_TX < stof(net_data['txkB/s'][IFACE_SAR_INDEX]):
            MAX_TX = stof(net_data['txkB/s'][IFACE_SAR_INDEX])

        # Read and process FS data
        fs_data = read_table(p.stdout)
        if FS_SAR_INDEX is None:
            if fsdev:
                FS_SAR_INDEX = fs_data['FILESYSTEM'].index(fsdev)
            else:
                # Auto-pick the largest filesystem (used + free space)
                maxj, maxv = 0, 0
                for j, free in enumerate(fs_data['MBfsfree']):
                    v = stof(fs_data['MBfsfree'][j]) + stof(fs_data['MBfsused'][j])
                    if maxv < v:
                        maxj, maxv = j, v
                FS_SAR_INDEX = maxj
        if FS_NAME is None:
            FS_NAME = fs_data["FILESYSTEM"][FS_SAR_INDEX]
        if TOTAL_FS == 0:
            TOTAL_FS = (stof(fs_data['MBfsused'][FS_SAR_INDEX]) + stof(fs_data['MBfsfree'][FS_SAR_INDEX]))
        if MAX_USED_FS < int(fs_data['MBfsused'][FS_SAR_INDEX]):
            MAX_USED_FS = int(fs_data['MBfsused'][FS_SAR_INDEX])

        END_DATE = date + " " + daytime
        timestamp = date + "-" + daytime

        if pgpu and pgpu.stdout in rlist:
            line = pgpu.stdout.readline().decode('utf-8')
            curr_gpu_util, curr_gpu_mem = [
                int(val.strip()) for val in line.split(', ')
            ]
            if MAX_USED_GPU_RAM < curr_gpu_mem:
                MAX_USED_GPU_RAM = curr_gpu_mem
            TOTAL_GPU_LOAD += curr_gpu_util
        with open(f"{session}.txt", "a") as f:
            line = [
                timestamp,
                cpu_data['%user'][0],
                ram_data['%memused'][0],
                fs_data['%fsused'][FS_SAR_INDEX],
                stof(net_data['rxkB/s'][IFACE_SAR_INDEX])/128,  # kB/s to Mb/s
                stof(net_data['txkB/s'][IFACE_SAR_INDEX])/128   # kB/s to Mb/s
            ]
            if pgpu and TOTAL_GPU_RAM != 0:
                line.extend([
                    f'{curr_gpu_util:.2f}',
                    f'{curr_gpu_mem / TOTAL_GPU_RAM * 100.0:.2f}'
                ])
            print(*line, file=f)

        if die:
            break

    # This runs if we were stopped by SIGTERM and no plot was made so far
    if not dont_plot:
        summarize(session)
        # BUGFIX: was 'plot.plot(session)' -- no 'plot' module is imported
        # anywhere in this file (NameError); plotting lives in 'graph',
        # consistent with every other call above.
        graph.graph(session)
|
|
|