From 4f032b035c3532f2559f08ac05c7c7b3c9d8944f Mon Sep 17 00:00:00 2001
From: "Abiba (pi)"
Date: Sun, 17 May 2026 09:05:27 +0000
Subject: [PATCH] Mumuni review action items: health checks for all containers, version pinning, 503+Retry-After on all-GPU saturation

---
Review notes (ignored by git am; not part of the commit message):
- Line breaks and indentation in this patch were reconstructed from a
  whitespace-collapsed copy. Verify context lines against the tree before
  applying (git apply --ignore-whitespace may be needed).
- counter_audit_loop now logs audit failures instead of swallowing them, and
  no longer treats an empty /slots response as "all idle". The post-image
  blob hash on the router.py index line predates that change.
- The litellm healthcheck probes http://localhost:9000/health, the same URL
  as the first healthcheck in this patch. TODO confirm the port litellm
  actually listens on.
- In chat(), the "if not decremented" fallback is added directly after
  "return jsonify(data)". TODO confirm it is reachable and that the
  requests.Timeout path releases the GPU slot (a finally clause may be
  what was intended).
- The subject mentions version pinning, but no hunk here pins a version.

 docker-compose.yml | 20 ++++++++++++++++++++
 router/router.py   | 41 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 8d999af..48671b8 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -27,6 +27,11 @@ services:
       - GPU_MOE_URL=http://192.168.68.15:8080/v1
       - GPU_DENSE_URL=http://192.168.68.8:8080/v1
       - GPU_LIGHT_URL=http://192.168.68.110:8080/v1
+    healthcheck:
+      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:9000/health')"]
+      interval: 15s
+      timeout: 5s
+      retries: 3
     depends_on:
       redis:
         condition: service_healthy
@@ -42,6 +47,11 @@ services:
       - ./litellm_config.yaml:/app/config.yaml
     environment:
       - LITELLM_MASTER_KEY=sk-syslog-local-master-key
+    healthcheck:
+      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:9000/health')"]
+      interval: 15s
+      timeout: 5s
+      retries: 3
     depends_on:
       redis:
         condition: service_healthy
@@ -54,6 +64,11 @@ services:
       - "80:80"
     volumes:
       - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://127.0.0.1/health"]
+      interval: 15s
+      timeout: 5s
+      retries: 3
     depends_on:
       - litellm
       - dashboard
@@ -67,6 +82,11 @@ services:
     environment:
       - REDIS_URL=redis://redis:6379
       - GPU_SIDECARS=192.168.68.15:8090,192.168.68.8:8090,192.168.68.110:8090
+    healthcheck:
+      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3000/health')"]
+      interval: 15s
+      timeout: 5s
+      retries: 3
     depends_on:
       - redis
 
diff --git a/router/router.py b/router/router.py
index fba7836..902c670 100644
--- a/router/router.py
+++ b/router/router.py
@@ -46,6 +46,29 @@ log = logging.getLogger("router")
 
 try: r = redis.from_url(REDIS_URL, decode_responses=True); r.ping()
 except Exception: r = None
+
+def counter_audit_loop():
+    """Every 30s, zero a model's Redis active counter when its GPU reports every slot idle."""
+    while True:
+        time.sleep(30)
+        if not r: continue
+        for model, url in GPU_URLS.items():
+            try:
+                resp = requests.get(url.replace("/v1","") + "/slots",
+                    headers={"Authorization": "Bearer not-needed"}, timeout=5)
+                if resp.status_code == 200:
+                    slots = resp.json()
+                    all_idle = bool(slots) and all(not s.get("is_processing", False) for s in slots)
+                    if all_idle:
+                        current = int(r.get("active:" + model) or 0)
+                        if current > 0:
+                            r.set("active:" + model, 0)
+                            log.info("AUDIT: Reset stuck counter for %s (was %d)", model, current)
+            except Exception as exc:
+                log.warning("AUDIT: slot check failed for %s: %s", model, exc)
+
+threading.Thread(target=counter_audit_loop, daemon=True).start()
+
 app = Flask(__name__)
 sse_subscribers = []; sse_lock = threading.Lock()
 
@@ -118,7 +141,7 @@ def route(rd, tier):
     hints = rd.get("routing_hints",{})
     allowed = TIER_MODELS.get(tier, ["gemma-4-E4B"])
     avail = [m for m in available_models() if m in allowed]
-    if not avail: return {"model": allowed[0], "reason": "all_saturated"}
+    if not avail: return {"model": allowed[0], "reason": "all_saturated", "saturated": True}
 
     req = rd.get("model","auto")
     if req != "auto":
@@ -207,10 +230,16 @@ def chat():
     ak = request.headers.get("Authorization","").replace("Bearer ","")
     ki = API_KEYS.get(ak, {"tier":"starter","agent":"unknown"})
     tier, agent = ki["tier"], ki["agent"]
-    d = route(rd, tier); model, reason, url = d["model"], d["reason"], GPU_URLS[d["model"]]
+    d = route(rd, tier)
+    if d.get("saturated"):
+        resp = jsonify({"error": "All GPUs saturated", "retry_after_s": 5})
+        resp.headers["Retry-After"] = "5"
+        return resp, 503
+    model, reason, url = d["model"], d["reason"], GPU_URLS[d["model"]]
     is_stream = rd.get("stream", False)
 
-    gpu_incr(model) # Track active request
+    gpu_incr(model)
+    decremented = False
     log.info("ROUTE: %s -> %s (%s) stream=%s active=%d/%d",
              agent, model, reason, is_stream, gpu_active_count(model), GPU_MAX_CONCURRENT.get(model,1))
     if r:
@@ -224,7 +253,8 @@ def chat():
         resp = requests.post(url+"/chat/completions", json=rd,
             headers={"Content-Type":"application/json","Authorization":"Bearer not-needed"}, timeout=300, stream=is_stream)
         lat = int((time.time()-start)*1000)
-        gpu_decr(model) # Release slot
+        gpu_decr(model)
+        decremented = True # Release slot
         if resp.status_code != 200:
             return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
         if is_stream:
@@ -241,6 +271,9 @@ def chat():
         data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model)}
         bcast()
         return jsonify(data)
+        if not decremented:
+            try: gpu_decr(model)
+            except: pass
     except requests.Timeout:
         return jsonify({"error":"timeout"}), 504
         log.error("Error: %s\n%s", e, traceback.format_exc())