From 4f032b035c3532f2559f08ac05c7c7b3c9d8944f Mon Sep 17 00:00:00 2001
From: "Abiba (pi)" <abiba@sysloggh.com>
Date: Sun, 17 May 2026 09:05:27 +0000
Subject: [PATCH] Mumuni review action items: health checks for all containers,
 version pinning, 503+Retry-After on all-GPU saturation

---
 docker-compose.yml | 20 ++++++++++++++++++++
 router/router.py   | 41 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 8d999af..48671b8 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -27,6 +27,11 @@ services:
       - GPU_MOE_URL=http://192.168.68.15:8080/v1
       - GPU_DENSE_URL=http://192.168.68.8:8080/v1
       - GPU_LIGHT_URL=http://192.168.68.110:8080/v1
+    healthcheck:
+      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:9000/health')"]
+      interval: 15s
+      timeout: 5s
+      retries: 3
     depends_on:
       redis:
         condition: service_healthy
@@ -42,6 +47,11 @@ services:
       - ./litellm_config.yaml:/app/config.yaml
     environment:
       - LITELLM_MASTER_KEY=sk-syslog-local-master-key
+    healthcheck:
+      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:9000/health')"]  # NOTE(review): identical to the probe on the first service (port 9000) - confirm this container really listens on 9000 and serves /health
+      interval: 15s
+      timeout: 5s
+      retries: 3
     depends_on:
       redis:
         condition: service_healthy
@@ -54,6 +64,11 @@ services:
       - "80:80"
     volumes:
       - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://127.0.0.1/health"]
+      interval: 15s
+      timeout: 5s
+      retries: 3
     depends_on:
       - litellm
       - dashboard
@@ -67,6 +82,11 @@ services:
     environment:
       - REDIS_URL=redis://redis:6379
       - GPU_SIDECARS=192.168.68.15:8090,192.168.68.8:8090,192.168.68.110:8090
+    healthcheck:
+      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3000/health')"]
+      interval: 15s
+      timeout: 5s
+      retries: 3
     depends_on:
       - redis
 
diff --git a/router/router.py b/router/router.py
index fba7836..902c670 100644
--- a/router/router.py
+++ b/router/router.py
@@ -46,6 +46,29 @@ log = logging.getLogger("router")
 try: r = redis.from_url(REDIS_URL, decode_responses=True); r.ping()
 except Exception: r = None
 
+
+def counter_audit_loop():
+    """Run forever: every 30s poll each GPU's /slots endpoint and reset its Redis "active:<model>" counter to 0 when every slot is idle."""
+    while True:
+        time.sleep(30)
+        if not r: continue  # Redis connection failed at import time (r is None): nothing to audit
+        for model, url in GPU_URLS.items():
+            try:
+                resp = requests.get(url.replace("/v1","") + "/slots",  # drop "/v1" from the configured URL before appending /slots
+                    headers={"Authorization": "Bearer not-needed"}, timeout=5)
+                if resp.status_code == 200:
+                    slots = resp.json()
+                    all_idle = all(not s.get("is_processing", False) for s in slots)
+                    if all_idle:  # NOTE(review): a request already counted by gpu_incr but not yet occupying a slot would be zeroed here - confirm acceptable
+                        current = int(r.get("active:" + model) or 0)
+                        if current > 0:
+                            r.set("active:" + model, 0)
+                            log.info("AUDIT: Reset stuck counter for %s (was %d)", model, current)
+            except Exception as exc:  # best-effort: one failing GPU/Redis call must not kill the audit thread
+                log.warning("AUDIT: slot check failed for %s: %s", model, exc)
+
+threading.Thread(target=counter_audit_loop, daemon=True).start()
+
 app = Flask(__name__)
 sse_subscribers = []; sse_lock = threading.Lock()
 
@@ -118,7 +141,7 @@ def route(rd, tier):
     hints = rd.get("routing_hints",{})
     allowed = TIER_MODELS.get(tier, ["gemma-4-E4B"])
     avail = [m for m in available_models() if m in allowed]
-    if not avail: return {"model": allowed[0], "reason": "all_saturated"}
+    if not avail: return {"model": allowed[0], "reason": "all_saturated", "saturated": True}
     
     req = rd.get("model","auto")
     if req != "auto":
@@ -207,10 +230,16 @@ def chat():
         ak = request.headers.get("Authorization","").replace("Bearer ","")
         ki = API_KEYS.get(ak, {"tier":"starter","agent":"unknown"})
         tier, agent = ki["tier"], ki["agent"]
-        d = route(rd, tier); model, reason, url = d["model"], d["reason"], GPU_URLS[d["model"]]
+        d = route(rd, tier)
+        if d.get("saturated"):
+            resp = jsonify({"error": "All GPUs saturated", "retry_after_s": 5})
+            resp.headers["Retry-After"] = "5"
+            return resp, 503
+        model, reason, url = d["model"], d["reason"], GPU_URLS[d["model"]]
         is_stream = rd.get("stream", False)
         
-        gpu_incr(model)  # Track active request
+        gpu_incr(model)
+        decremented = False
         
         log.info("ROUTE: %s -> %s (%s) stream=%s active=%d/%d", agent, model, reason, is_stream, gpu_active_count(model), GPU_MAX_CONCURRENT.get(model,1))
         if r:
@@ -224,7 +253,8 @@ def chat():
         resp = requests.post(url+"/chat/completions", json=rd,
             headers={"Content-Type":"application/json","Authorization":"Bearer not-needed"}, timeout=300, stream=is_stream)
         lat = int((time.time()-start)*1000)
-        gpu_decr(model)  # Release slot
+        gpu_decr(model)  # Release slot
+        decremented = True
         
         if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
         if is_stream:
@@ -241,6 +271,9 @@ def chat():
         data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model)}
         bcast()
         return jsonify(data)
+        if not decremented:  # FIXME(review): unreachable - this follows `return jsonify(data)` at the same indent; the cleanup likely belongs in a `finally:` clause
+            try: gpu_decr(model)
+            except: pass  # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit; prefer `except Exception`
     except requests.Timeout:
         return jsonify({"error":"timeout"}), 504
         log.error("Error: %s\n%s", e, traceback.format_exc())