Mumuni review action items: health checks for all containers, version pinning, 503+Retry-After on all-GPU saturation
This commit is contained in:
@@ -27,6 +27,11 @@ services:
|
|||||||
- GPU_MOE_URL=http://192.168.68.15:8080/v1
|
- GPU_MOE_URL=http://192.168.68.15:8080/v1
|
||||||
- GPU_DENSE_URL=http://192.168.68.8:8080/v1
|
- GPU_DENSE_URL=http://192.168.68.8:8080/v1
|
||||||
- GPU_LIGHT_URL=http://192.168.68.110:8080/v1
|
- GPU_LIGHT_URL=http://192.168.68.110:8080/v1
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:9000/health')"]
|
||||||
|
interval: 15s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
depends_on:
|
depends_on:
|
||||||
redis:
|
redis:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
@@ -42,6 +47,11 @@ services:
|
|||||||
- ./litellm_config.yaml:/app/config.yaml
|
- ./litellm_config.yaml:/app/config.yaml
|
||||||
environment:
|
environment:
|
||||||
- LITELLM_MASTER_KEY=sk-syslog-local-master-key
|
- LITELLM_MASTER_KEY=sk-syslog-local-master-key
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:9000/health')"]
|
||||||
|
interval: 15s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
depends_on:
|
depends_on:
|
||||||
redis:
|
redis:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
@@ -54,6 +64,11 @@ services:
|
|||||||
- "80:80"
|
- "80:80"
|
||||||
volumes:
|
volumes:
|
||||||
- ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
|
- ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://127.0.0.1/health"]
|
||||||
|
interval: 15s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
depends_on:
|
depends_on:
|
||||||
- litellm
|
- litellm
|
||||||
- dashboard
|
- dashboard
|
||||||
@@ -67,6 +82,11 @@ services:
|
|||||||
environment:
|
environment:
|
||||||
- REDIS_URL=redis://redis:6379
|
- REDIS_URL=redis://redis:6379
|
||||||
- GPU_SIDECARS=192.168.68.15:8090,192.168.68.8:8090,192.168.68.110:8090
|
- GPU_SIDECARS=192.168.68.15:8090,192.168.68.8:8090,192.168.68.110:8090
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3000/health')"]
|
||||||
|
interval: 15s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
depends_on:
|
depends_on:
|
||||||
- redis
|
- redis
|
||||||
|
|
||||||
|
|||||||
+37
-4
@@ -46,6 +46,29 @@ log = logging.getLogger("router")
|
|||||||
try: r = redis.from_url(REDIS_URL, decode_responses=True); r.ping()
|
try: r = redis.from_url(REDIS_URL, decode_responses=True); r.ping()
|
||||||
except Exception: r = None
|
except Exception: r = None
|
||||||
|
|
||||||
|
|
||||||
|
def counter_audit_loop():
    """Periodically reset stuck per-model active-request counters.

    Runs forever, so it is meant to be the target of a daemon thread.
    Every 30 seconds, for each ``model -> url`` entry in ``GPU_URLS``:

    1. Query ``<url without "/v1">/slots`` on the backend.
    2. If it answers 200 and no slot reports ``is_processing``, read the
       Redis counter ``active:<model>``.
    3. If that counter is positive, set it to 0 and log the reset.

    A failure while auditing one model (network error, unexpected JSON
    shape, Redis error) is logged and does not stop the audit of the
    remaining models or terminate the loop.
    """
    while True:
        time.sleep(30)
        if not r:
            # No Redis client available, so there are no counters to audit.
            continue
        for model, url in GPU_URLS.items():
            try:
                resp = requests.get(
                    url.replace("/v1", "") + "/slots",
                    headers={"Authorization": "Bearer not-needed"},
                    timeout=5,
                )
                if resp.status_code != 200:
                    continue
                slots = resp.json()
                # Any slot still processing means the counter may be valid.
                if any(s.get("is_processing", False) for s in slots):
                    continue
                key = "active:" + model
                current = int(r.get(key) or 0)
                if current > 0:
                    # NOTE(review): get-then-set is not atomic; a request
                    # that increments the counter between the slot check and
                    # this write would be lost - confirm this is acceptable.
                    r.set(key, 0)
                    log.info("AUDIT: Reset stuck counter for %s (was %d)", model, current)
            except Exception as exc:
                # Best-effort audit: keep the loop alive, but make the
                # failure visible instead of silently discarding it.
                log.warning("AUDIT: slot check failed for %s: %s", model, exc)
|
||||||
|
|
||||||
|
threading.Thread(target=counter_audit_loop, daemon=True).start()
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
sse_subscribers = []; sse_lock = threading.Lock()
|
sse_subscribers = []; sse_lock = threading.Lock()
|
||||||
|
|
||||||
@@ -118,7 +141,7 @@ def route(rd, tier):
|
|||||||
hints = rd.get("routing_hints",{})
|
hints = rd.get("routing_hints",{})
|
||||||
allowed = TIER_MODELS.get(tier, ["gemma-4-E4B"])
|
allowed = TIER_MODELS.get(tier, ["gemma-4-E4B"])
|
||||||
avail = [m for m in available_models() if m in allowed]
|
avail = [m for m in available_models() if m in allowed]
|
||||||
if not avail: return {"model": allowed[0], "reason": "all_saturated"}
|
if not avail: return {"model": allowed[0], "reason": "all_saturated", "saturated": True}
|
||||||
|
|
||||||
req = rd.get("model","auto")
|
req = rd.get("model","auto")
|
||||||
if req != "auto":
|
if req != "auto":
|
||||||
@@ -207,10 +230,16 @@ def chat():
|
|||||||
ak = request.headers.get("Authorization","").replace("Bearer ","")
|
ak = request.headers.get("Authorization","").replace("Bearer ","")
|
||||||
ki = API_KEYS.get(ak, {"tier":"starter","agent":"unknown"})
|
ki = API_KEYS.get(ak, {"tier":"starter","agent":"unknown"})
|
||||||
tier, agent = ki["tier"], ki["agent"]
|
tier, agent = ki["tier"], ki["agent"]
|
||||||
d = route(rd, tier); model, reason, url = d["model"], d["reason"], GPU_URLS[d["model"]]
|
d = route(rd, tier)
|
||||||
|
if d.get("saturated"):
|
||||||
|
resp = jsonify({"error": "All GPUs saturated", "retry_after_s": 5})
|
||||||
|
resp.headers["Retry-After"] = "5"
|
||||||
|
return resp, 503
|
||||||
|
model, reason, url = d["model"], d["reason"], GPU_URLS[d["model"]]
|
||||||
is_stream = rd.get("stream", False)
|
is_stream = rd.get("stream", False)
|
||||||
|
|
||||||
gpu_incr(model) # Track active request
|
gpu_incr(model)
|
||||||
|
decremented = False
|
||||||
|
|
||||||
log.info("ROUTE: %s -> %s (%s) stream=%s active=%d/%d", agent, model, reason, is_stream, gpu_active_count(model), GPU_MAX_CONCURRENT.get(model,1))
|
log.info("ROUTE: %s -> %s (%s) stream=%s active=%d/%d", agent, model, reason, is_stream, gpu_active_count(model), GPU_MAX_CONCURRENT.get(model,1))
|
||||||
if r:
|
if r:
|
||||||
@@ -224,7 +253,8 @@ def chat():
|
|||||||
resp = requests.post(url+"/chat/completions", json=rd,
|
resp = requests.post(url+"/chat/completions", json=rd,
|
||||||
headers={"Content-Type":"application/json","Authorization":"Bearer not-needed"}, timeout=300, stream=is_stream)
|
headers={"Content-Type":"application/json","Authorization":"Bearer not-needed"}, timeout=300, stream=is_stream)
|
||||||
lat = int((time.time()-start)*1000)
|
lat = int((time.time()-start)*1000)
|
||||||
gpu_decr(model) # Release slot
|
gpu_decr(model)
|
||||||
|
decremented = True # Release slot
|
||||||
|
|
||||||
if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
|
if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
|
||||||
if is_stream:
|
if is_stream:
|
||||||
@@ -241,6 +271,9 @@ def chat():
|
|||||||
data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model)}
|
data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model)}
|
||||||
bcast()
|
bcast()
|
||||||
return jsonify(data)
|
return jsonify(data)
|
||||||
|
if not decremented:
|
||||||
|
try: gpu_decr(model)
|
||||||
|
except: pass
|
||||||
except requests.Timeout:
|
except requests.Timeout:
|
||||||
return jsonify({"error":"timeout"}), 504
|
return jsonify({"error":"timeout"}), 504
|
||||||
log.error("Error: %s\n%s", e, traceback.format_exc())
|
log.error("Error: %s\n%s", e, traceback.format_exc())
|
||||||
|
|||||||
Reference in New Issue
Block a user