May 19, 2026: Full harness update

- Model migration: gemma-4-E4B → qwen3.5-9b-vlm
- Dashboard reorder: Usage Over Time + GPU Metrics to top
- Router counter leak fix (gpu_decr in except handler)
- VLM slot upgrade 1→2
- Redis stale key cleanup
- Automated maintenance cron job
- LiteLLM config update
- GPU router config update
- README update
2026-05-19 15:03:34 +00:00
parent 4f032b035c
commit 9c31b5d622
7 changed files with 43 additions and 46 deletions
@@ -10,24 +10,24 @@ GPU_LIGHT_URL = os.environ.get("GPU_LIGHT_URL", "http://192.168.68.110:8080/v1")
 GPU_SIDECARS = {
    "qwen3.6-35B-A3B": "http://192.168.68.15:8090",
    "qwen3.6-27B-code": "http://192.168.68.8:8090",
-    "gemma-4-E4B": "http://192.168.68.110:8090",
+    "qwen3.5-9b-vlm": "http://192.168.68.110:8090",
 }
 GPU_URLS = {
    "qwen3.6-35B-A3B": GPU_MOE_URL,
    "qwen3.6-27B-code": GPU_DENSE_URL,
-    "gemma-4-E4B": GPU_LIGHT_URL,
+    "qwen3.5-9b-vlm": GPU_LIGHT_URL,
 }
 # Max concurrent requests per GPU (based on llama.cpp --parallel)
 GPU_MAX_CONCURRENT = {
    "qwen3.6-35B-A3B": 2,   # 2 slots
    "qwen3.6-27B-code": 2,  # 2 slots
-    "gemma-4-E4B": 1,       # 1 slot
+    "qwen3.5-9b-vlm": 2,       # 2 slots (12GB VRAM, 4GB headroom)
 }

 TIER_MODELS = {
-    "starter": ["gemma-4-E4B"],
-    "professional": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "gemma-4-E4B"],
-    "enterprise": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "gemma-4-E4B"],
+    "starter": ["qwen3.5-9b-vlm"],
+    "professional": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "qwen3.5-9b-vlm"],
+    "enterprise": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "qwen3.5-9b-vlm"],
 }
 API_KEYS = {
    "sk-syslog-local-master-key": {"tier": "enterprise", "agent": "admin"},
@@ -139,7 +139,7 @@ def route(rd, tier):
    sys = any(m.get("role")=="system" for m in msgs)
    turns = len([m for m in msgs if m.get("role") in ("user","assistant")])
    hints = rd.get("routing_hints",{})
-    allowed = TIER_MODELS.get(tier, ["gemma-4-E4B"])
+    allowed = TIER_MODELS.get(tier, ["qwen3.5-9b-vlm"])
    avail = [m for m in available_models() if m in allowed]
    if not avail: return {"model": allowed[0], "reason": "all_saturated", "saturated": True}
    
@@ -155,24 +155,24 @@ def route(rd, tier):
        return {"model": target, "reason": "explicit"}
    
    if hints:
-        if hints.get("priority")=="speed" and "gemma-4-E4B" in avail:
-            return select_best_gpu(["gemma-4-E4B"], "hint_speed") or {"model":"gemma-4-E4B","reason":"hint_speed"}
+        if hints.get("priority")=="speed" and "qwen3.5-9b-vlm" in avail:
+            return select_best_gpu(["qwen3.5-9b-vlm"], "hint_speed") or {"model":"qwen3.5-9b-vlm","reason":"hint_speed"}
        if hints.get("priority")=="quality" and "qwen3.6-27B-code" in avail:
            return select_best_gpu(["qwen3.6-27B-code"], "hint_quality") or {"model":"qwen3.6-27B-code","reason":"hint_quality"}
    
    # Heavy -> dense (but fall back to MoE if dense is busy)
    if t > 4000 or sys or turns > 6:
-        candidates = ["qwen3.6-27B-code","qwen3.6-35B-A3B","gemma-4-E4B"]
+        candidates = ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"]
        candidates = [m for m in candidates if m in avail]
        result = select_best_gpu(candidates, "heavy_reasoning")
        if result: return result
    
-    # Ultra-light -> gemma
+    # Ultra-light -> VLM
    first_msg = msgs[0].get("content","") if msgs else ""
    words = len(first_msg.split()) if isinstance(first_msg, str) else 99
-    if words <= 3 and turns <= 1 and not sys and "gemma-4-E4B" in avail:
-        if not is_gpu_busy("gemma-4-E4B"):
-            return {"model":"gemma-4-E4B","reason":"ultra_light"}
+    if words <= 3 and turns <= 1 and not sys and "qwen3.5-9b-vlm" in avail:
+        if not is_gpu_busy("qwen3.5-9b-vlm"):
+            return {"model":"qwen3.5-9b-vlm","reason":"ultra_light"}
    
    # Default: MoE, fall back to dense if MoE is busy
    if "qwen3.6-35B-A3B" in avail:
@@ -239,7 +239,6 @@ def chat():
        is_stream = rd.get("stream", False)
        
        gpu_incr(model)
-        decremented = False
        
        log.info("ROUTE: %s -> %s (%s) stream=%s active=%d/%d", agent, model, reason, is_stream, gpu_active_count(model), GPU_MAX_CONCURRENT.get(model,1))
        if r:
@@ -254,7 +253,6 @@ def chat():
            headers={"Content-Type":"application/json","Authorization":"Bearer not-needed"}, timeout=300, stream=is_stream)
        lat = int((time.time()-start)*1000)
        gpu_decr(model)
-        decremented = True  # Release slot
        
        if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
        if is_stream:
@@ -271,11 +269,12 @@ def chat():
        data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model)}
        bcast()
        return jsonify(data)
-        if not decremented:
-            try: gpu_decr(model)
-            except: pass
    except requests.Timeout:
+        gpu_decr(model)
+        log.error("TIMEOUT: %s -> %s", agent, model)
        return jsonify({"error":"timeout"}), 504
+    except Exception as e:
+        gpu_decr(model)
        log.error("Error: %s\n%s", e, traceback.format_exc())
        return jsonify({"error":str(e)}), 500