Router: health check verifies actual llama.cpp endpoint, gpu_decr negative guard, AMD sidecar fixed (sysfs fallback)
This commit is contained in:
+14
-5
@@ -59,7 +59,10 @@ def gpu_incr(model):
|
|||||||
if r: r.incr("active:" + model)
|
if r: r.incr("active:" + model)
|
||||||
|
|
||||||
def gpu_decr(model):
    """Decrement the ``active:<model>`` counter, never leaving it negative.

    Does nothing when the module-level ``r`` handle is falsy (presumably a
    Redis client that could not be created -- confirm where ``r`` is set up).

    Args:
        model: Name appended to the ``"active:"`` prefix to form the key.
    """
    if not r:
        return

    counter_key = "active:" + model
    remaining = r.decr(counter_key)
    if not remaining:
        # Zero or no value returned: nothing to clamp.
        return

    if int(remaining) < 0:
        # NOTE(review): the decrement and this reset are two separate calls,
        # so an increment landing between them may be overwritten -- confirm
        # whether that matters for the callers of this counter.
        r.set(counter_key, 0)  # never go negative
|
||||||
|
|
||||||
def check_gpu_health(model):
|
def check_gpu_health(model):
|
||||||
url = GPU_SIDECARS.get(model)
|
url = GPU_SIDECARS.get(model)
|
||||||
@@ -69,7 +72,16 @@ def check_gpu_health(model):
|
|||||||
if resp.status_code == 200:
|
if resp.status_code == 200:
|
||||||
d = resp.json()
|
d = resp.json()
|
||||||
pct = (d.get("vram_used_mb",0) / max(d.get("vram_total_mb",1), 1)) * 100
|
pct = (d.get("vram_used_mb",0) / max(d.get("vram_total_mb",1), 1)) * 100
|
||||||
return {"status": "healthy" if pct < 90 else "saturated", "vram_used_mb": d.get("vram_used_mb"), "vram_total_mb": d.get("vram_total_mb"), "vram_pct": round(pct,1), "temp_c": d.get("temp_c"), "gpu_util_pct": d.get("gpu_util_pct"), "gpu_name": d.get("gpu_name"), "power_w": d.get("power_w"), "power_limit_w": d.get("power_limit_w")}
|
status = "healthy" if pct < 90 else "saturated"
|
||||||
|
# Also check if llama.cpp endpoint is actually responding
|
||||||
|
gpu_url = GPU_URLS.get(model, "")
|
||||||
|
try:
|
||||||
|
hr = requests.get(gpu_url.replace("/v1","") + "/health", headers={"Authorization": "Bearer not-needed"}, timeout=3)
|
||||||
|
if hr.status_code != 200:
|
||||||
|
status = "down"
|
||||||
|
except Exception:
|
||||||
|
status = "down"
|
||||||
|
return {"status": status, "vram_used_mb": d.get("vram_used_mb"), "vram_total_mb": d.get("vram_total_mb"), "vram_pct": round(pct,1), "temp_c": d.get("temp_c"), "gpu_util_pct": d.get("gpu_util_pct"), "gpu_name": d.get("gpu_name"), "power_w": d.get("power_w"), "power_limit_w": d.get("power_limit_w")}
|
||||||
except Exception: pass
|
except Exception: pass
|
||||||
return {"status": "down"}
|
return {"status": "down"}
|
||||||
|
|
||||||
@@ -230,10 +242,7 @@ def chat():
|
|||||||
bcast()
|
bcast()
|
||||||
return jsonify(data)
|
return jsonify(data)
|
||||||
except requests.Timeout:
|
except requests.Timeout:
|
||||||
try: gpu_decr(model)
|
|
||||||
except: pass
|
|
||||||
return jsonify({"error":"timeout"}), 504
|
return jsonify({"error":"timeout"}), 504
|
||||||
except Exception as e:
|
|
||||||
log.error("Error: %s\n%s", e, traceback.format_exc())
|
log.error("Error: %s\n%s", e, traceback.format_exc())
|
||||||
return jsonify({"error":str(e)}), 500
|
return jsonify({"error":str(e)}), 500
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user