May 19, 2026: Full harness update

- Model migration: gemma-4-E4B → qwen3.5-9b-vlm
- Dashboard reorder: Usage Over Time + GPU Metrics to top
- Router counter leak fix (gpu_decr is now called in the timeout and generic except handlers)
- VLM concurrent slot count increased 1→2 (GPU_MAX_CONCURRENT)
- Redis stale key cleanup
- Automated maintenance cron job
- LiteLLM config update
- GPU router config update
- README update
This commit is contained in:
Abiba
2026-05-19 15:03:34 +00:00
parent 4f032b035c
commit 9c31b5d622
7 changed files with 43 additions and 46 deletions
+18 -19
View File
@@ -10,24 +10,24 @@ GPU_LIGHT_URL = os.environ.get("GPU_LIGHT_URL", "http://192.168.68.110:8080/v1")
GPU_SIDECARS = {
"qwen3.6-35B-A3B": "http://192.168.68.15:8090",
"qwen3.6-27B-code": "http://192.168.68.8:8090",
"gemma-4-E4B": "http://192.168.68.110:8090",
"qwen3.5-9b-vlm": "http://192.168.68.110:8090",
}
GPU_URLS = {
"qwen3.6-35B-A3B": GPU_MOE_URL,
"qwen3.6-27B-code": GPU_DENSE_URL,
"gemma-4-E4B": GPU_LIGHT_URL,
"qwen3.5-9b-vlm": GPU_LIGHT_URL,
}
# Max concurrent requests per GPU (based on llama.cpp --parallel)
GPU_MAX_CONCURRENT = {
"qwen3.6-35B-A3B": 2, # 2 slots
"qwen3.6-27B-code": 2, # 2 slots
"gemma-4-E4B": 1, # 1 slot
"qwen3.5-9b-vlm": 2, # 2 slots (12GB VRAM, 4GB headroom)
}
TIER_MODELS = {
"starter": ["gemma-4-E4B"],
"professional": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "gemma-4-E4B"],
"enterprise": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "gemma-4-E4B"],
"starter": ["qwen3.5-9b-vlm"],
"professional": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "qwen3.5-9b-vlm"],
"enterprise": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "qwen3.5-9b-vlm"],
}
API_KEYS = {
"sk-syslog-local-master-key": {"tier": "enterprise", "agent": "admin"},
@@ -139,7 +139,7 @@ def route(rd, tier):
sys = any(m.get("role")=="system" for m in msgs)
turns = len([m for m in msgs if m.get("role") in ("user","assistant")])
hints = rd.get("routing_hints",{})
allowed = TIER_MODELS.get(tier, ["gemma-4-E4B"])
allowed = TIER_MODELS.get(tier, ["qwen3.5-9b-vlm"])
avail = [m for m in available_models() if m in allowed]
if not avail: return {"model": allowed[0], "reason": "all_saturated", "saturated": True}
@@ -155,24 +155,24 @@ def route(rd, tier):
return {"model": target, "reason": "explicit"}
if hints:
if hints.get("priority")=="speed" and "gemma-4-E4B" in avail:
return select_best_gpu(["gemma-4-E4B"], "hint_speed") or {"model":"gemma-4-E4B","reason":"hint_speed"}
if hints.get("priority")=="speed" and "qwen3.5-9b-vlm" in avail:
return select_best_gpu(["qwen3.5-9b-vlm"], "hint_speed") or {"model":"qwen3.5-9b-vlm","reason":"hint_speed"}
if hints.get("priority")=="quality" and "qwen3.6-27B-code" in avail:
return select_best_gpu(["qwen3.6-27B-code"], "hint_quality") or {"model":"qwen3.6-27B-code","reason":"hint_quality"}
# Heavy -> dense (but fall back to MoE if dense is busy)
if t > 4000 or sys or turns > 6:
candidates = ["qwen3.6-27B-code","qwen3.6-35B-A3B","gemma-4-E4B"]
candidates = ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"]
candidates = [m for m in candidates if m in avail]
result = select_best_gpu(candidates, "heavy_reasoning")
if result: return result
# Ultra-light -> gemma
# Ultra-light -> VLM
first_msg = msgs[0].get("content","") if msgs else ""
words = len(first_msg.split()) if isinstance(first_msg, str) else 99
if words <= 3 and turns <= 1 and not sys and "gemma-4-E4B" in avail:
if not is_gpu_busy("gemma-4-E4B"):
return {"model":"gemma-4-E4B","reason":"ultra_light"}
if words <= 3 and turns <= 1 and not sys and "qwen3.5-9b-vlm" in avail:
if not is_gpu_busy("qwen3.5-9b-vlm"):
return {"model":"qwen3.5-9b-vlm","reason":"ultra_light"}
# Default: MoE, fall back to dense if MoE is busy
if "qwen3.6-35B-A3B" in avail:
@@ -239,7 +239,6 @@ def chat():
is_stream = rd.get("stream", False)
gpu_incr(model)
decremented = False
log.info("ROUTE: %s -> %s (%s) stream=%s active=%d/%d", agent, model, reason, is_stream, gpu_active_count(model), GPU_MAX_CONCURRENT.get(model,1))
if r:
@@ -254,7 +253,6 @@ def chat():
headers={"Content-Type":"application/json","Authorization":"Bearer not-needed"}, timeout=300, stream=is_stream)
lat = int((time.time()-start)*1000)
gpu_decr(model)
decremented = True # Release slot
if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
if is_stream:
@@ -271,11 +269,12 @@ def chat():
data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model)}
bcast()
return jsonify(data)
if not decremented:
try: gpu_decr(model)
except: pass
except requests.Timeout:
gpu_decr(model)
log.error("TIMEOUT: %s -> %s", agent, model)
return jsonify({"error":"timeout"}), 504
except Exception as e:
gpu_decr(model)
log.error("Error: %s\n%s", e, traceback.format_exc())
return jsonify({"error":str(e)}), 500