diff --git a/router/router.py b/router/router.py
index ffdceb8..fba7836 100644
--- a/router/router.py
+++ b/router/router.py
@@ -59,7 +59,18 @@ def gpu_incr(model):
     if r: r.incr("active:" + model)
 
 def gpu_decr(model):
-    if r: r.decr("active:" + model)
+    """Decrement the "active:" + model counter in `r`; no-op if `r` is falsy.
+
+    If the decrement takes the counter below zero it is reset to 0.
+    """
+    if r:
+        key = "active:" + model
+        v = r.decr(key)
+        if v and int(v) < 0:
+            # Never go negative: an unmatched decrement must not skew the count.
+            # NOTE(review): decr-then-set is two separate calls, so an incr
+            # landing in between would be overwritten -- confirm acceptable.
+            r.set(key, 0)
 
 def check_gpu_health(model):
     url = GPU_SIDECARS.get(model)
@@ -69,7 +80,22 @@ def check_gpu_health(model):
         if resp.status_code == 200:
             d = resp.json()
             pct = (d.get("vram_used_mb",0) / max(d.get("vram_total_mb",1), 1)) * 100
-            return {"status": "healthy" if pct < 90 else "saturated", "vram_used_mb": d.get("vram_used_mb"), "vram_total_mb": d.get("vram_total_mb"), "vram_pct": round(pct,1), "temp_c": d.get("temp_c"), "gpu_util_pct": d.get("gpu_util_pct"), "gpu_name": d.get("gpu_name"), "power_w": d.get("power_w"), "power_limit_w": d.get("power_limit_w")}
+            status = "healthy" if pct < 90 else "saturated"
+            # Also check if llama.cpp endpoint is actually responding; a failed
+            # probe reports "down", overriding "healthy"/"saturated".
+            gpu_url = (GPU_URLS.get(model) or "").rstrip("/")
+            if gpu_url.endswith("/v1"):
+                # Strip only a trailing "/v1": str.replace() would also cut an
+                # earlier "/v1" out of the URL (e.g. inside a "/v100" segment).
+                gpu_url = gpu_url[:-len("/v1")]
+            try:
+                hr = requests.get(gpu_url + "/health", headers={"Authorization": "Bearer not-needed"}, timeout=3)
+                if hr.status_code != 200:
+                    status = "down"
+            except requests.RequestException:
+                # Unreachable endpoint, timeout, or missing/malformed URL.
+                status = "down"
+            return {"status": status, "vram_used_mb": d.get("vram_used_mb"), "vram_total_mb": d.get("vram_total_mb"), "vram_pct": round(pct,1), "temp_c": d.get("temp_c"), "gpu_util_pct": d.get("gpu_util_pct"), "gpu_name": d.get("gpu_name"), "power_w": d.get("power_w"), "power_limit_w": d.get("power_limit_w")}
     except Exception:
         pass
     return {"status": "down"}
@@ -230,10 +256,7 @@ def chat():
         bcast()
         return jsonify(data)
     except requests.Timeout:
-        try: gpu_decr(model)
-        except: pass
         return jsonify({"error":"timeout"}), 504
-
     except Exception as e:
         log.error("Error: %s\n%s", e, traceback.format_exc())
         return jsonify({"error":str(e)}), 500