May 19, 2026: Full harness update

- Model migration: gemma-4-E4B → qwen3.5-9b-vlm
- Dashboard reorder: Usage Over Time + GPU Metrics to top
- Router counter leak fix (gpu_decr in except handler)
- VLM slot upgrade 1→2
- Redis stale key cleanup
- Automated maintenance cron job
- LiteLLM config update
- GPU router config update
- README update
2026-05-19 15:03:34 +00:00
parent 4f032b035c
commit 9c31b5d622
7 changed files with 43 additions and 46 deletions
@@ -1,5 +1,3 @@
 .git
 __pycache__/
 *.pyc
 .env
 redis-data/
 ssl/
@@ -8,7 +8,7 @@ CT 116 Docker stack for routing local GPU models through a unified OpenAI-compat
 nginx :80 → router :9000 → GPU backends
                ├─ qwen3.6-35B-A3B (MoE) @ 192.168.68.15:8080
                ├─ qwen3.6-27B-code (Dense) @ 192.168.68.8:8080
-                └─ gemma-4-E4B (Light) @ 192.168.68.110:8080
+                └─ qwen3.5-9b-vlm (VLM) @ 192.168.68.110:8080
 LiteLLM :8081 (fallback) | Dashboard :3000 | Redis :6379 (local)
 ```
@@ -80,17 +80,7 @@ body { background: #0b0f17; color: #bcc3cd; font-family: -apple-system, BlinkMac
 </div>
 <div class="row g-3 align-items-stretch">
-  <!-- ROW 1: 3 GPU Cards -->
+  <!-- ROW 1: Usage Chart (8) + GPU Metrics (4) -->
  <div class="col-md-4"><div class="gpu-card" id="gpu-moe"><div class="text-secondary small">Loading...</div></div></div>
  <div class="col-md-4"><div class="gpu-card" id="gpu-dense"><div class="text-secondary small">Loading...</div></div></div>
  <div class="col-md-4"><div class="gpu-card" id="gpu-light"><div class="text-secondary small">Loading...</div></div></div>
  <!-- ROW 2: Queue + Model + Agent -->
  <div class="col-md-4"><div class="chart-card"><div class="title">Queue Status</div><div class="text-center" id="queue-viz"></div></div></div>
  <div class="col-md-4"><div class="chart-card"><div class="title">Model Distribution</div><div id="route-bars"></div></div></div>
  <div class="col-md-4"><div class="chart-card"><div class="title">Agent Activity</div><div id="agent-bars"></div></div></div>
  <!-- ROW 3: Usage Chart (8) + GPU Metrics (4) -->
  <div class="col-md-8"><div class="chart-card"><div class="title d-flex justify-content-between align-items-center">
    <span>Usage Over Time</span>
    <div class="d-flex gap-1">
@@ -101,6 +91,16 @@ body { background: #0b0f17; color: #bcc3cd; font-family: -apple-system, BlinkMac
  </div><div id="timeseries-chart" style="height:150px"></div><div id="timeseries-legend" class="d-flex justify-content-center gap-3 mt-2 flex-wrap small"></div></div></div>
  <div class="col-md-4"><div class="chart-card"><div class="title">GPU Metrics</div><div id="gpu-metrics-card"></div></div></div>
  <!-- ROW 2: 3 GPU Cards -->
  <div class="col-md-4"><div class="gpu-card" id="gpu-moe"><div class="text-secondary small">Loading...</div></div></div>
  <div class="col-md-4"><div class="gpu-card" id="gpu-dense"><div class="text-secondary small">Loading...</div></div></div>
  <div class="col-md-4"><div class="gpu-card" id="gpu-light"><div class="text-secondary small">Loading...</div></div></div>
  <!-- ROW 3: Queue + Model + Agent -->
  <div class="col-md-4"><div class="chart-card"><div class="title">Queue Status</div><div class="text-center" id="queue-viz"></div></div></div>
  <div class="col-md-4"><div class="chart-card"><div class="title">Model Distribution</div><div id="route-bars"></div></div></div>
  <div class="col-md-4"><div class="chart-card"><div class="title">Agent Activity</div><div id="agent-bars"></div></div></div>
  <!-- ROW 4: Live Stream -->
  <div class="col-12"><div class="chart-card"><div class="title">Live Stream</div>
    <div class="table-responsive"><table class="table table-custom mb-0">
@@ -111,9 +111,9 @@ body { background: #0b0f17; color: #bcc3cd; font-family: -apple-system, BlinkMac
 </div>
 <script>
-var MC={'gemma-4-E4B':'#22c55e','qwen3.6-27B-code':'#f59e0b','qwen3.6-35B-A3B':'#a78bfa'};
+var MC={'qwen3.5-9b-vlm':'#22c55e','qwen3.6-27B-code':'#f59e0b','qwen3.6-35B-A3B':'#a78bfa'};
-var ML={'gemma-4-E4B':'Gemma 4B','qwen3.6-27B-code':'Qwen Code','qwen3.6-35B-A3B':'Qwen MoE'};
+var ML={'qwen3.5-9b-vlm':'Qwen3.5 9B VLM','qwen3.6-27B-code':'Qwen Code','qwen3.6-35B-A3B':'Qwen MoE'};
-var GL={'qwen3.6-35B-A3B':'MoE - Strix Halo','qwen3.6-27B-code':'Dense - RTX 3090','gemma-4-E4B':'Light - RTX 5070'};
+var GL={'qwen3.6-35B-A3B':'MoE - Strix Halo','qwen3.6-27B-code':'Dense - RTX 3090','qwen3.5-9b-vlm':'VLM - RTX 5070'};
 function $(id){return document.getElementById(id);}
 function render(data){
@@ -122,7 +122,7 @@ var t=Object.values(data.route_counts||{}).reduce((a,b)=>a+b,0);
 var ta=0,tm=0;data.gpus.forEach(function(g){ta+=(g.active_requests||0);tm+=(g.max_concurrent||1)});
 $('kpi-total').textContent=t;$('kpi-active').textContent=ta+'/'+tm;$('kpi-agents').textContent=Object.keys(data.agent_counts||{}).length;
 $('update-time').textContent=new Date().toLocaleTimeString();
-var ids={'qwen3.6-35B-A3B':'gpu-moe','qwen3.6-27B-code':'gpu-dense','gemma-4-E4B':'gpu-light'};
+var ids={'qwen3.6-35B-A3B':'gpu-moe','qwen3.6-27B-code':'gpu-dense','qwen3.5-9b-vlm':'gpu-light'};
 data.gpus.forEach(function(g){
 var el=$(ids[g.id]);if(!el)return;
 var a=g.active_requests||0,mx=g.max_concurrent||1;
@@ -154,14 +154,14 @@ var sc=pct>=100?'#ef4444':pct>=50?'#f59e0b':'#22c55e';
 var circ=188.5,dash=(pct/100)*circ;
 var h='<div class=\"d-inline-block position-relative mb-2\"><svg width=\"72\" height=\"72\"><circle cx=\"36\" cy=\"36\" r=\"30\" fill=\"none\" stroke=\"#1e293b\" stroke-width=\"6\"/><circle cx=\"36\" cy=\"36\" r=\"30\" fill=\"none\" stroke=\"'+sc+'\" stroke-width=\"6\" stroke-dasharray=\"'+dash+' '+(circ-dash)+'\" stroke-linecap=\"round\" transform=\"rotate(-90 36 36)\"/></svg><div style=\"position:absolute;top:50%;left:50%;transform:translate(-50%,-50%);text-align:center\"><div class=\"ring-label\" style=\"color:'+sc+'\">'+ta+'</div><div class=\"ring-sublabel\">/ '+tm+' slots</div></div></div>';
 h+='<div class=\"fw-bold mb-2 small\" style=\"color:'+sc+'\">'+st+'</div>';
-var lb={'qwen3.6-35B-A3B':'MoE','qwen3.6-27B-code':'Dense','gemma-4-E4B':'Gemma'};
+var lb={'qwen3.6-35B-A3B':'MoE','qwen3.6-27B-code':'Dense','qwen3.5-9b-vlm':'VLM'};
 data.gpus.forEach(function(g){var a=g.active_requests||0,mx=g.max_concurrent||1,gp=mx>0?Math.round(a/mx*100):0;h+='<div class=\"d-flex align-items-center gap-2 mb-1 justify-content-center\"><span class=\"small\" style=\"min-width:32px;text-align:right;font-size:10px\">'+(lb[g.id]||g.id)+'</span><div style=\"flex:1;max-width:70px;height:3px;background:#1e293b;border-radius:2px;overflow:hidden\"><div style=\"height:100%;width:'+gp+'%;background:'+sc+';border-radius:2px\"></div></div><span class=\"small\" style=\"min-width:22px;font-size:10px\">'+a+'/'+mx+'</span></div>'});
 el.innerHTML=h;
 }
 function renderGPUMetrics(data){
 var el=$('gpu-metrics-card');if(!el)return;
-var lb={'qwen3.6-35B-A3B':'MoE','qwen3.6-27B-code':'Dense','gemma-4-E4B':'Gemma'};
+var lb={'qwen3.6-35B-A3B':'MoE','qwen3.6-27B-code':'Dense','qwen3.5-9b-vlm':'VLM'};
 var h='';data.gpus.forEach(function(g){
 var nm=lb[g.id]||g.id,tp=g.temp_c||0,ut=g.gpu_util_pct||0,pw=g.power_w||0,pl=g.power_limit_w||0;
 var tc=tp>85?'#ef4444':tp>70?'#f59e0b':'#22c55e',uc=ut>90?'#ef4444':ut>70?'#f59e0b':'#22c55e';
@@ -13,7 +13,7 @@ upstream llmgpu_pool {
 }
 upstream ocu_llm_pool {
-    ## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks
+    ## RTX 5070 — qwen3.5-9b-vlm (VLM) — Vision + light tasks
    server 192.168.68.110:8080;
 }
@@ -36,7 +36,7 @@ map $http_x_syslog_model $gpu_upstream {
    "heavy"          llmgpu_pool;
    "qwen3.5-27B"    llmgpu_pool;
    "light"          ocu_llm_pool;
-    "gemma-4"        ocu_llm_pool;
+    "qwen3.5-9b-vlm"        ocu_llm_pool;
 }
 ## Rate limit zone — 10 req/s per IP, burst of 20
@@ -13,7 +13,7 @@ upstream llmgpu_pool {
 }
 upstream ocu_llm_pool {
-    ## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks
+    ## RTX 5070 — qwen3.5-9b-vlm (VLM) — Vision + light tasks
    server 192.168.68.110:8080;
 }
@@ -36,7 +36,7 @@ map $http_x_syslog_model $gpu_upstream {
    "heavy"          llmgpu_pool;
    "qwen3.5-27B"    llmgpu_pool;
    "light"          ocu_llm_pool;
-    "gemma-4"        ocu_llm_pool;
+    "qwen3.5-9b-vlm"        ocu_llm_pool;
 }
 server {
@@ -11,9 +11,9 @@ model_list:
      api_base: http://192.168.68.8:8080/v1
      api_key: "not-needed"
-  - model_name: gemma-4-E4B
+  - model_name: qwen3.5-9b-vlm
    litellm_params:
-      model: openai/gemma-4-E4B
+      model: openai/qwen3.5-9b-vlm
      api_base: http://192.168.68.110:8080/v1
      api_key: "not-needed"
@@ -10,24 +10,24 @@ GPU_LIGHT_URL = os.environ.get("GPU_LIGHT_URL", "http://192.168.68.110:8080/v1")
 GPU_SIDECARS = {
    "qwen3.6-35B-A3B": "http://192.168.68.15:8090",
    "qwen3.6-27B-code": "http://192.168.68.8:8090",
-    "gemma-4-E4B": "http://192.168.68.110:8090",
+    "qwen3.5-9b-vlm": "http://192.168.68.110:8090",
 }
 GPU_URLS = {
    "qwen3.6-35B-A3B": GPU_MOE_URL,
    "qwen3.6-27B-code": GPU_DENSE_URL,
-    "gemma-4-E4B": GPU_LIGHT_URL,
+    "qwen3.5-9b-vlm": GPU_LIGHT_URL,
 }
 # Max concurrent requests per GPU (based on llama.cpp --parallel)
 GPU_MAX_CONCURRENT = {
    "qwen3.6-35B-A3B": 2,   # 2 slots
    "qwen3.6-27B-code": 2,  # 2 slots
-    "gemma-4-E4B": 1,       # 1 slot
+    "qwen3.5-9b-vlm": 2,       # 2 slots (12GB VRAM, 4GB headroom)
 }
 TIER_MODELS = {
-    "starter": ["gemma-4-E4B"],
+    "starter": ["qwen3.5-9b-vlm"],
-    "professional": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "gemma-4-E4B"],
+    "professional": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "qwen3.5-9b-vlm"],
-    "enterprise": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "gemma-4-E4B"],
+    "enterprise": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "qwen3.5-9b-vlm"],
 }
 API_KEYS = {
    "sk-syslog-local-master-key": {"tier": "enterprise", "agent": "admin"},
@@ -139,7 +139,7 @@ def route(rd, tier):
    sys = any(m.get("role")=="system" for m in msgs)
    turns = len([m for m in msgs if m.get("role") in ("user","assistant")])
    hints = rd.get("routing_hints",{})
-    allowed = TIER_MODELS.get(tier, ["gemma-4-E4B"])
+    allowed = TIER_MODELS.get(tier, ["qwen3.5-9b-vlm"])
    avail = [m for m in available_models() if m in allowed]
    if not avail: return {"model": allowed[0], "reason": "all_saturated", "saturated": True}
@@ -155,24 +155,24 @@ def route(rd, tier):
        return {"model": target, "reason": "explicit"}
    if hints:
-        if hints.get("priority")=="speed" and "gemma-4-E4B" in avail:
+        if hints.get("priority")=="speed" and "qwen3.5-9b-vlm" in avail:
-            return select_best_gpu(["gemma-4-E4B"], "hint_speed") or {"model":"gemma-4-E4B","reason":"hint_speed"}
+            return select_best_gpu(["qwen3.5-9b-vlm"], "hint_speed") or {"model":"qwen3.5-9b-vlm","reason":"hint_speed"}
        if hints.get("priority")=="quality" and "qwen3.6-27B-code" in avail:
            return select_best_gpu(["qwen3.6-27B-code"], "hint_quality") or {"model":"qwen3.6-27B-code","reason":"hint_quality"}
    # Heavy -> dense (but fall back to MoE if dense is busy)
    if t > 4000 or sys or turns > 6:
-        candidates = ["qwen3.6-27B-code","qwen3.6-35B-A3B","gemma-4-E4B"]
+        candidates = ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"]
        candidates = [m for m in candidates if m in avail]
        result = select_best_gpu(candidates, "heavy_reasoning")
        if result: return result
-    # Ultra-light -> gemma
+    # Ultra-light -> VLM
    first_msg = msgs[0].get("content","") if msgs else ""
    words = len(first_msg.split()) if isinstance(first_msg, str) else 99
-    if words <= 3 and turns <= 1 and not sys and "gemma-4-E4B" in avail:
+    if words <= 3 and turns <= 1 and not sys and "qwen3.5-9b-vlm" in avail:
-        if not is_gpu_busy("gemma-4-E4B"):
+        if not is_gpu_busy("qwen3.5-9b-vlm"):
-            return {"model":"gemma-4-E4B","reason":"ultra_light"}
+            return {"model":"qwen3.5-9b-vlm","reason":"ultra_light"}
    # Default: MoE, fall back to dense if MoE is busy
    if "qwen3.6-35B-A3B" in avail:
@@ -239,7 +239,6 @@ def chat():
        is_stream = rd.get("stream", False)
        gpu_incr(model)
        decremented = False
        log.info("ROUTE: %s -> %s (%s) stream=%s active=%d/%d", agent, model, reason, is_stream, gpu_active_count(model), GPU_MAX_CONCURRENT.get(model,1))
        if r:
@@ -254,7 +253,6 @@ def chat():
            headers={"Content-Type":"application/json","Authorization":"Bearer not-needed"}, timeout=300, stream=is_stream)
        lat = int((time.time()-start)*1000)
        gpu_decr(model)
        decremented = True  # Release slot
        if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
        if is_stream:
@@ -271,11 +269,12 @@ def chat():
        data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model)}
        bcast()
        return jsonify(data)
        if not decremented:
            try: gpu_decr(model)
            except: pass
    except requests.Timeout:
        gpu_decr(model)
        log.error("TIMEOUT: %s -> %s", agent, model)
        return jsonify({"error":"timeout"}), 504
    except Exception as e:
        gpu_decr(model)
        log.error("Error: %s\n%s", e, traceback.format_exc())
        return jsonify({"error":str(e)}), 500