fix: decouple VRAM usage from saturation status
VRAM percentage no longer marks a GPU as saturated. Saturation is about slot availability (handled by is_gpu_busy()), not memory usage. Added a vram_warning boolean flag (true when VRAM usage is ≥95%) for informational monitoring without affecting routing decisions. 27B Dense now correctly shows healthy at 91% VRAM (previously any usage at or above 90% was reported as saturated).
This commit is contained in:
+3
-2
@@ -102,7 +102,8 @@ def check_gpu_health(model, sidecar_timeout=5, gpu_timeout=3):
|
|||||||
if resp.status_code == 200:
|
if resp.status_code == 200:
|
||||||
d = resp.json()
|
d = resp.json()
|
||||||
pct = (d.get("vram_used_mb",0) / max(d.get("vram_total_mb",1), 1)) * 100
|
pct = (d.get("vram_used_mb",0) / max(d.get("vram_total_mb",1), 1)) * 100
|
||||||
status = "healthy" if pct < 90 else "saturated"
|
status = "healthy" # VRAM usage != saturation; busy slots handled by is_gpu_busy()
|
||||||
|
vram_warning = pct >= 95
|
||||||
# Also check if llama.cpp endpoint is actually responding
|
# Also check if llama.cpp endpoint is actually responding
|
||||||
gpu_url = GPU_URLS.get(model, "")
|
gpu_url = GPU_URLS.get(model, "")
|
||||||
try:
|
try:
|
||||||
@@ -111,7 +112,7 @@ def check_gpu_health(model, sidecar_timeout=5, gpu_timeout=3):
|
|||||||
status = "down"
|
status = "down"
|
||||||
except Exception:
|
except Exception:
|
||||||
status = "down"
|
status = "down"
|
||||||
return {"status": status, "vram_used_mb": d.get("vram_used_mb"), "vram_total_mb": d.get("vram_total_mb"), "vram_pct": round(pct,1), "temp_c": d.get("temp_c"), "gpu_util_pct": d.get("gpu_util_pct"), "gpu_name": d.get("gpu_name"), "power_w": d.get("power_w"), "power_limit_w": d.get("power_limit_w")}
|
return {"status": status, "vram_warning": vram_warning, "vram_used_mb": d.get("vram_used_mb"), "vram_total_mb": d.get("vram_total_mb"), "vram_pct": round(pct,1), "temp_c": d.get("temp_c"), "gpu_util_pct": d.get("gpu_util_pct"), "gpu_name": d.get("gpu_name"), "power_w": d.get("power_w"), "power_limit_w": d.get("power_limit_w")}
|
||||||
except Exception: pass
|
except Exception: pass
|
||||||
return {"status": "down"}
|
return {"status": "down"}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user