-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmonitor_experiments.sh
More file actions
executable file
·145 lines (126 loc) · 4.95 KB
/
Copy pathmonitor_experiments.sh
File metadata and controls
executable file
·145 lines (126 loc) · 4.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env bash
#
# Monitor running SCL experiments.
#
# Usage:
# ./monitor_experiments.sh # One-shot status
# ./monitor_experiments.sh --watch # Auto-refresh every 10s
# ./monitor_experiments.sh --watch 5 # Auto-refresh every 5s
# Strict mode: abort on errors, on unset variables and on failed pipeline stages.
set -euo pipefail

# Logs are looked up in a "logs" directory next to this script, regardless of
# the directory the script is invoked from.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LOG_DIR="${SCRIPT_DIR}/logs"

# Defaults: print a single report; when watching, refresh every 10 seconds.
WATCH=false
INTERVAL=10

# Command-line handling. Only --watch / -w is recognised; it may be followed
# by an all-digit refresh interval. Every other argument is skipped.
while (( $# > 0 )); do
  if [[ "$1" == "--watch" || "$1" == "-w" ]]; then
    WATCH=true
    shift
    # Consume the next argument as the interval only when it is purely numeric.
    if (( $# > 0 )) && [[ "$1" =~ ^[0-9]+$ ]]; then
      INTERVAL="$1"
      shift
    fi
  else
    shift
  fi
done
#######################################
# Tell whether a log file contains one of the failure markers.
# Arguments: $1 - path to an experiment log
# Returns:   0 when the log contains "Error", "Traceback", "FAILED" or
#            "Exception"; non-zero otherwise (including unreadable files)
#######################################
_log_has_failure() {
  grep -qE 'Error|Traceback|FAILED|Exception' -- "$1" 2>/dev/null
}

#######################################
# Print the last non-blank piece of text in a log, cut to 50 characters.
# Carriage returns are treated as line breaks so that progress output that
# rewrites a single line yields its most recent update, not its oldest.
# Arguments: $1 - path to an experiment log
# Outputs:   the text on stdout, or "—" when nothing printable was found
#######################################
_last_log_line() {
  local raw line last=""
  # Only the tail of the file is inspected; 20 lines is enough to skip a few
  # trailing blank lines without reading the whole log.
  raw=$(tail -n 20 -- "$1" 2>/dev/null | tr '\r' '\n') || raw=""
  while IFS= read -r line; do
    if [[ -n "${line//[[:space:]]/}" ]]; then
      last="$line"
    fi
  done <<< "$raw"
  last="${last:0:50}"
  printf '%s' "${last:-—}"
}

#######################################
# Print how many seconds a process has been running.
# Arguments: $1 - PID
# Outputs:   elapsed seconds on stdout
# Returns:   non-zero when ps is missing, does not support "etimes", or the
#            process cannot be queried
#######################################
_running_elapsed() {
  local secs
  secs=$(ps -o etimes= -p "$1" 2>/dev/null) || return 1
  secs="${secs//[[:space:]]/}"
  [[ "$secs" =~ ^[0-9]+$ ]] || return 1
  printf '%s' "$secs"
}

#######################################
# Render one status report: a table with one row per experiment log, a
# summary line, the tail of the scheduler log and GPU usage.
# Globals:   LOG_DIR (read) - directory holding scl_exp_*.log / *.pid files
#                             and scheduler.log
#            SCL_MAX_EPOCHS (read, optional) - epoch total shown in the EPOCH
#                             column; defaults to 100
# Arguments: none
# Outputs:   the report on stdout
# Side effects: deletes PID files that do not refer to a live process
#######################################
print_status() {
  clear 2>/dev/null || true
  echo "════════════════════════════════════════════════════════════════════"
  echo " SCL Experiment Monitor — $(date '+%Y-%m-%d %H:%M:%S')"
  echo "════════════════════════════════════════════════════════════════════"
  echo ""

  local running=0 finished=0 failed=0
  local log_file exp_name pid_file pid status color
  local epoch epoch_num duration elapsed mtime last_line
  local max_epochs="${SCL_MAX_EPOCHS:-100}"
  local now
  now=$(date +%s)

  printf " %-14s %-10s %-8s %-10s %s\n" "EXPERIMENT" "STATUS" "EPOCH" "DURATION" "LAST LOG LINE"
  printf " %-14s %-10s %-8s %-10s %s\n" "──────────" "──────" "─────" "────────" "─────────────"

  for log_file in "$LOG_DIR"/scl_exp_*.log; do
    # An unmatched glob stays literal; skip it.
    [[ -f "$log_file" ]] || continue
    exp_name="${log_file##*/}"
    exp_name="${exp_name%.log}"
    pid_file="$LOG_DIR/${exp_name}.pid"

    # --- Status -----------------------------------------------------------
    pid=""
    status=""
    if [[ -f "$pid_file" ]]; then
      pid=$(<"$pid_file") || pid=""
      pid="${pid//[[:space:]]/}"
      # Only a strictly positive integer is a usable PID: "kill -0 0" and
      # negative values address process groups and would always "succeed".
      if [[ "$pid" =~ ^[1-9][0-9]*$ ]] && kill -0 "$pid" 2>/dev/null; then
        status="RUNNING"
      else
        # Stale or invalid PID file: drop it and classify from the log.
        rm -f -- "$pid_file"
      fi
    fi
    if [[ -z "$status" ]]; then
      if _log_has_failure "$log_file"; then
        status="FAILED"
      else
        status="FINISHED"
      fi
    fi

    color="\033[0m"
    case "$status" in
      RUNNING)
        running=$((running + 1))
        color="\033[1;33m" # Yellow
        ;;
      FINISHED)
        finished=$((finished + 1))
        color="\033[1;32m" # Green
        ;;
      FAILED)
        failed=$((failed + 1))
        color="\033[1;31m" # Red
        ;;
    esac

    # --- Epoch: number following the last "Epoch " occurrence in the log ---
    epoch="—"
    epoch_num=$(grep -oE 'Epoch [0-9]+' -- "$log_file" 2>/dev/null | tail -n 1) \
      || epoch_num=""
    epoch_num="${epoch_num#Epoch }"
    if [[ -n "$epoch_num" ]]; then
      epoch="${epoch_num}/${max_epochs}"
    fi

    # --- Duration -----------------------------------------------------------
    # Running experiments: elapsed time of the process itself. Otherwise (or
    # when ps cannot provide it): time since the log was last modified, since
    # no start time is recorded anywhere this function can see.
    duration="—"
    elapsed=""
    if [[ "$status" == "RUNNING" ]]; then
      elapsed=$(_running_elapsed "$pid") || elapsed=""
    fi
    if [[ -z "$elapsed" ]]; then
      mtime=$(stat -c %Y -- "$log_file" 2>/dev/null) || mtime=0
      if [[ "$mtime" =~ ^[0-9]+$ ]] && (( mtime > 0 )); then
        elapsed=$((now - mtime))
      fi
    fi
    if [[ -n "$elapsed" ]]; then
      # A timestamp in the future (clock skew) must not yield negative output.
      if (( elapsed < 0 )); then
        elapsed=0
      fi
      duration="$((elapsed / 3600))h$(((elapsed % 3600) / 60))m"
    fi

    last_line=$(_last_log_line "$log_file")

    printf " %-14s ${color}%-10s\033[0m %-8s %-10s %s\n" \
      "$exp_name" "$status" "$epoch" "$duration" "$last_line"
  done

  echo ""
  echo " Summary: $running running, $finished finished, $failed failed"

  # Show scheduler log tail
  if [[ -f "$LOG_DIR/scheduler.log" ]]; then
    echo ""
    echo " Recent scheduler events:"
    tail -n 5 -- "$LOG_DIR/scheduler.log" 2>/dev/null | sed 's/^/ /'
  fi

  # Show GPU utilization, one line per GPU reported by nvidia-smi
  if command -v nvidia-smi &>/dev/null; then
    echo ""
    local gpu_info gpu_idx=0 util mem_used mem_total
    gpu_info=$(nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total \
      --format=csv,noheader,nounits 2>/dev/null) || gpu_info=""
    if [[ -z "$gpu_info" ]]; then
      echo " GPU: —"
    else
      # Each record is "util, used, total"; split on commas and drop the
      # single space that follows each separator.
      while IFS=',' read -r util mem_used mem_total; do
        [[ -n "$util" ]] || continue
        mem_used="${mem_used# }"
        mem_total="${mem_total# }"
        echo " GPU ${gpu_idx}: ${util}% util, ${mem_used} MB used, ${mem_total} MB total"
        gpu_idx=$((gpu_idx + 1))
      done <<< "$gpu_info"
    fi
  fi
}
# Entry point: without watch mode print a single report; in watch mode keep
# redrawing the report, pausing INTERVAL seconds between refreshes.
if [[ "$WATCH" != true ]]; then
  print_status
else
  while :; do
    print_status
    sleep "$INTERVAL"
  done
fi