May 19, 2026: Full harness update
- Model migration: gemma-4-E4B → qwen3.5-9b-vlm
- Dashboard reorder: Usage Over Time + GPU Metrics to top
- Router counter leak fix (gpu_decr in except handler)
- VLM slot upgrade 1→2
- Redis stale key cleanup
- Automated maintenance cron job
- LiteLLM config update
- GPU router config update
- README update
This commit is contained in:
+18
-19
@@ -10,24 +10,24 @@ GPU_LIGHT_URL = os.environ.get("GPU_LIGHT_URL", "http://192.168.68.110:8080/v1")
|
||||
GPU_SIDECARS = {
|
||||
"qwen3.6-35B-A3B": "http://192.168.68.15:8090",
|
||||
"qwen3.6-27B-code": "http://192.168.68.8:8090",
|
||||
"gemma-4-E4B": "http://192.168.68.110:8090",
|
||||
"qwen3.5-9b-vlm": "http://192.168.68.110:8090",
|
||||
}
|
||||
GPU_URLS = {
|
||||
"qwen3.6-35B-A3B": GPU_MOE_URL,
|
||||
"qwen3.6-27B-code": GPU_DENSE_URL,
|
||||
"gemma-4-E4B": GPU_LIGHT_URL,
|
||||
"qwen3.5-9b-vlm": GPU_LIGHT_URL,
|
||||
}
|
||||
# Max concurrent requests per GPU (based on llama.cpp --parallel)
|
||||
GPU_MAX_CONCURRENT = {
|
||||
"qwen3.6-35B-A3B": 2, # 2 slots
|
||||
"qwen3.6-27B-code": 2, # 2 slots
|
||||
"gemma-4-E4B": 1, # 1 slot
|
||||
"qwen3.5-9b-vlm": 2, # 2 slots (12GB VRAM, 4GB headroom)
|
||||
}
|
||||
|
||||
TIER_MODELS = {
|
||||
"starter": ["gemma-4-E4B"],
|
||||
"professional": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "gemma-4-E4B"],
|
||||
"enterprise": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "gemma-4-E4B"],
|
||||
"starter": ["qwen3.5-9b-vlm"],
|
||||
"professional": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "qwen3.5-9b-vlm"],
|
||||
"enterprise": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "qwen3.5-9b-vlm"],
|
||||
}
|
||||
API_KEYS = {
|
||||
"sk-syslog-local-master-key": {"tier": "enterprise", "agent": "admin"},
|
||||
@@ -139,7 +139,7 @@ def route(rd, tier):
|
||||
sys = any(m.get("role")=="system" for m in msgs)
|
||||
turns = len([m for m in msgs if m.get("role") in ("user","assistant")])
|
||||
hints = rd.get("routing_hints",{})
|
||||
allowed = TIER_MODELS.get(tier, ["gemma-4-E4B"])
|
||||
allowed = TIER_MODELS.get(tier, ["qwen3.5-9b-vlm"])
|
||||
avail = [m for m in available_models() if m in allowed]
|
||||
if not avail: return {"model": allowed[0], "reason": "all_saturated", "saturated": True}
|
||||
|
||||
@@ -155,24 +155,24 @@ def route(rd, tier):
|
||||
return {"model": target, "reason": "explicit"}
|
||||
|
||||
if hints:
|
||||
if hints.get("priority")=="speed" and "gemma-4-E4B" in avail:
|
||||
return select_best_gpu(["gemma-4-E4B"], "hint_speed") or {"model":"gemma-4-E4B","reason":"hint_speed"}
|
||||
if hints.get("priority")=="speed" and "qwen3.5-9b-vlm" in avail:
|
||||
return select_best_gpu(["qwen3.5-9b-vlm"], "hint_speed") or {"model":"qwen3.5-9b-vlm","reason":"hint_speed"}
|
||||
if hints.get("priority")=="quality" and "qwen3.6-27B-code" in avail:
|
||||
return select_best_gpu(["qwen3.6-27B-code"], "hint_quality") or {"model":"qwen3.6-27B-code","reason":"hint_quality"}
|
||||
|
||||
# Heavy -> dense (but fall back to MoE if dense is busy)
|
||||
if t > 4000 or sys or turns > 6:
|
||||
candidates = ["qwen3.6-27B-code","qwen3.6-35B-A3B","gemma-4-E4B"]
|
||||
candidates = ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"]
|
||||
candidates = [m for m in candidates if m in avail]
|
||||
result = select_best_gpu(candidates, "heavy_reasoning")
|
||||
if result: return result
|
||||
|
||||
# Ultra-light -> gemma
|
||||
# Ultra-light -> VLM
|
||||
first_msg = msgs[0].get("content","") if msgs else ""
|
||||
words = len(first_msg.split()) if isinstance(first_msg, str) else 99
|
||||
if words <= 3 and turns <= 1 and not sys and "gemma-4-E4B" in avail:
|
||||
if not is_gpu_busy("gemma-4-E4B"):
|
||||
return {"model":"gemma-4-E4B","reason":"ultra_light"}
|
||||
if words <= 3 and turns <= 1 and not sys and "qwen3.5-9b-vlm" in avail:
|
||||
if not is_gpu_busy("qwen3.5-9b-vlm"):
|
||||
return {"model":"qwen3.5-9b-vlm","reason":"ultra_light"}
|
||||
|
||||
# Default: MoE, fall back to dense if MoE is busy
|
||||
if "qwen3.6-35B-A3B" in avail:
|
||||
@@ -239,7 +239,6 @@ def chat():
|
||||
is_stream = rd.get("stream", False)
|
||||
|
||||
gpu_incr(model)
|
||||
decremented = False
|
||||
|
||||
log.info("ROUTE: %s -> %s (%s) stream=%s active=%d/%d", agent, model, reason, is_stream, gpu_active_count(model), GPU_MAX_CONCURRENT.get(model,1))
|
||||
if r:
|
||||
@@ -254,7 +253,6 @@ def chat():
|
||||
headers={"Content-Type":"application/json","Authorization":"Bearer not-needed"}, timeout=300, stream=is_stream)
|
||||
lat = int((time.time()-start)*1000)
|
||||
gpu_decr(model)
|
||||
decremented = True # Release slot
|
||||
|
||||
if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
|
||||
if is_stream:
|
||||
@@ -271,11 +269,12 @@ def chat():
|
||||
data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model)}
|
||||
bcast()
|
||||
return jsonify(data)
|
||||
if not decremented:
|
||||
try: gpu_decr(model)
|
||||
except: pass
|
||||
except requests.Timeout:
|
||||
gpu_decr(model)
|
||||
log.error("TIMEOUT: %s -> %s", agent, model)
|
||||
return jsonify({"error":"timeout"}), 504
|
||||
except Exception as e:
|
||||
gpu_decr(model)
|
||||
log.error("Error: %s\n%s", e, traceback.format_exc())
|
||||
return jsonify({"error":str(e)}), 500
|
||||
|
||||
|
||||
Reference in New Issue
Block a user