May 19, 2026: Full harness update

- Model migration: gemma-4-E4B → qwen3.5-9b-vlm
- Dashboard reorder: Usage Over Time + GPU Metrics moved to the top row
- Router counter leak fix (gpu_decr now called in both the Timeout and generic except handlers)
- VLM slot upgrade: max concurrent requests 1→2
- Redis stale key cleanup
- Automated maintenance cron job
- LiteLLM config update
- GPU router config update
- README update
This commit is contained in:
Abiba
2026-05-19 15:03:34 +00:00
parent 4f032b035c
commit 9c31b5d622
7 changed files with 43 additions and 46 deletions
+1 -3
View File
@@ -1,5 +1,3 @@
.git
__pycache__/ __pycache__/
*.pyc *.pyc
.env
redis-data/
ssl/
+1 -1
View File
@@ -8,7 +8,7 @@ CT 116 Docker stack for routing local GPU models through a unified OpenAI-compat
nginx :80 → router :9000 → GPU backends nginx :80 → router :9000 → GPU backends
├─ qwen3.6-35B-A3B (MoE) @ 192.168.68.15:8080 ├─ qwen3.6-35B-A3B (MoE) @ 192.168.68.15:8080
├─ qwen3.6-27B-code (Dense) @ 192.168.68.8:8080 ├─ qwen3.6-27B-code (Dense) @ 192.168.68.8:8080
└─ gemma-4-E4B (Light) @ 192.168.68.110:8080 └─ qwen3.5-9b-vlm (VLM) @ 192.168.68.110:8080
LiteLLM :8081 (fallback) | Dashboard :3000 | Redis :6379 (local) LiteLLM :8081 (fallback) | Dashboard :3000 | Redis :6379 (local)
``` ```
+17 -17
View File
@@ -80,17 +80,7 @@ body { background: #0b0f17; color: #bcc3cd; font-family: -apple-system, BlinkMac
</div> </div>
<div class="row g-3 align-items-stretch"> <div class="row g-3 align-items-stretch">
<!-- ROW 1: 3 GPU Cards --> <!-- ROW 1: Usage Chart (8) + GPU Metrics (4) -->
<div class="col-md-4"><div class="gpu-card" id="gpu-moe"><div class="text-secondary small">Loading...</div></div></div>
<div class="col-md-4"><div class="gpu-card" id="gpu-dense"><div class="text-secondary small">Loading...</div></div></div>
<div class="col-md-4"><div class="gpu-card" id="gpu-light"><div class="text-secondary small">Loading...</div></div></div>
<!-- ROW 2: Queue + Model + Agent -->
<div class="col-md-4"><div class="chart-card"><div class="title">Queue Status</div><div class="text-center" id="queue-viz"></div></div></div>
<div class="col-md-4"><div class="chart-card"><div class="title">Model Distribution</div><div id="route-bars"></div></div></div>
<div class="col-md-4"><div class="chart-card"><div class="title">Agent Activity</div><div id="agent-bars"></div></div></div>
<!-- ROW 3: Usage Chart (8) + GPU Metrics (4) -->
<div class="col-md-8"><div class="chart-card"><div class="title d-flex justify-content-between align-items-center"> <div class="col-md-8"><div class="chart-card"><div class="title d-flex justify-content-between align-items-center">
<span>Usage Over Time</span> <span>Usage Over Time</span>
<div class="d-flex gap-1"> <div class="d-flex gap-1">
@@ -101,6 +91,16 @@ body { background: #0b0f17; color: #bcc3cd; font-family: -apple-system, BlinkMac
</div><div id="timeseries-chart" style="height:150px"></div><div id="timeseries-legend" class="d-flex justify-content-center gap-3 mt-2 flex-wrap small"></div></div></div> </div><div id="timeseries-chart" style="height:150px"></div><div id="timeseries-legend" class="d-flex justify-content-center gap-3 mt-2 flex-wrap small"></div></div></div>
<div class="col-md-4"><div class="chart-card"><div class="title">GPU Metrics</div><div id="gpu-metrics-card"></div></div></div> <div class="col-md-4"><div class="chart-card"><div class="title">GPU Metrics</div><div id="gpu-metrics-card"></div></div></div>
<!-- ROW 2: 3 GPU Cards -->
<div class="col-md-4"><div class="gpu-card" id="gpu-moe"><div class="text-secondary small">Loading...</div></div></div>
<div class="col-md-4"><div class="gpu-card" id="gpu-dense"><div class="text-secondary small">Loading...</div></div></div>
<div class="col-md-4"><div class="gpu-card" id="gpu-light"><div class="text-secondary small">Loading...</div></div></div>
<!-- ROW 3: Queue + Model + Agent -->
<div class="col-md-4"><div class="chart-card"><div class="title">Queue Status</div><div class="text-center" id="queue-viz"></div></div></div>
<div class="col-md-4"><div class="chart-card"><div class="title">Model Distribution</div><div id="route-bars"></div></div></div>
<div class="col-md-4"><div class="chart-card"><div class="title">Agent Activity</div><div id="agent-bars"></div></div></div>
<!-- ROW 4: Live Stream --> <!-- ROW 4: Live Stream -->
<div class="col-12"><div class="chart-card"><div class="title">Live Stream</div> <div class="col-12"><div class="chart-card"><div class="title">Live Stream</div>
<div class="table-responsive"><table class="table table-custom mb-0"> <div class="table-responsive"><table class="table table-custom mb-0">
@@ -111,9 +111,9 @@ body { background: #0b0f17; color: #bcc3cd; font-family: -apple-system, BlinkMac
</div> </div>
<script> <script>
var MC={'gemma-4-E4B':'#22c55e','qwen3.6-27B-code':'#f59e0b','qwen3.6-35B-A3B':'#a78bfa'}; var MC={'qwen3.5-9b-vlm':'#22c55e','qwen3.6-27B-code':'#f59e0b','qwen3.6-35B-A3B':'#a78bfa'};
var ML={'gemma-4-E4B':'Gemma 4B','qwen3.6-27B-code':'Qwen Code','qwen3.6-35B-A3B':'Qwen MoE'}; var ML={'qwen3.5-9b-vlm':'Qwen3.5 9B VLM','qwen3.6-27B-code':'Qwen Code','qwen3.6-35B-A3B':'Qwen MoE'};
var GL={'qwen3.6-35B-A3B':'MoE - Strix Halo','qwen3.6-27B-code':'Dense - RTX 3090','gemma-4-E4B':'Light - RTX 5070'}; var GL={'qwen3.6-35B-A3B':'MoE - Strix Halo','qwen3.6-27B-code':'Dense - RTX 3090','qwen3.5-9b-vlm':'VLM - RTX 5070'};
function $(id){return document.getElementById(id);} function $(id){return document.getElementById(id);}
function render(data){ function render(data){
@@ -122,7 +122,7 @@ var t=Object.values(data.route_counts||{}).reduce((a,b)=>a+b,0);
var ta=0,tm=0;data.gpus.forEach(function(g){ta+=(g.active_requests||0);tm+=(g.max_concurrent||1)}); var ta=0,tm=0;data.gpus.forEach(function(g){ta+=(g.active_requests||0);tm+=(g.max_concurrent||1)});
$('kpi-total').textContent=t;$('kpi-active').textContent=ta+'/'+tm;$('kpi-agents').textContent=Object.keys(data.agent_counts||{}).length; $('kpi-total').textContent=t;$('kpi-active').textContent=ta+'/'+tm;$('kpi-agents').textContent=Object.keys(data.agent_counts||{}).length;
$('update-time').textContent=new Date().toLocaleTimeString(); $('update-time').textContent=new Date().toLocaleTimeString();
var ids={'qwen3.6-35B-A3B':'gpu-moe','qwen3.6-27B-code':'gpu-dense','gemma-4-E4B':'gpu-light'}; var ids={'qwen3.6-35B-A3B':'gpu-moe','qwen3.6-27B-code':'gpu-dense','qwen3.5-9b-vlm':'gpu-light'};
data.gpus.forEach(function(g){ data.gpus.forEach(function(g){
var el=$(ids[g.id]);if(!el)return; var el=$(ids[g.id]);if(!el)return;
var a=g.active_requests||0,mx=g.max_concurrent||1; var a=g.active_requests||0,mx=g.max_concurrent||1;
@@ -154,14 +154,14 @@ var sc=pct>=100?'#ef4444':pct>=50?'#f59e0b':'#22c55e';
var circ=188.5,dash=(pct/100)*circ; var circ=188.5,dash=(pct/100)*circ;
var h='<div class=\"d-inline-block position-relative mb-2\"><svg width=\"72\" height=\"72\"><circle cx=\"36\" cy=\"36\" r=\"30\" fill=\"none\" stroke=\"#1e293b\" stroke-width=\"6\"/><circle cx=\"36\" cy=\"36\" r=\"30\" fill=\"none\" stroke=\"'+sc+'\" stroke-width=\"6\" stroke-dasharray=\"'+dash+' '+(circ-dash)+'\" stroke-linecap=\"round\" transform=\"rotate(-90 36 36)\"/></svg><div style=\"position:absolute;top:50%;left:50%;transform:translate(-50%,-50%);text-align:center\"><div class=\"ring-label\" style=\"color:'+sc+'\">'+ta+'</div><div class=\"ring-sublabel\">/ '+tm+' slots</div></div></div>'; var h='<div class=\"d-inline-block position-relative mb-2\"><svg width=\"72\" height=\"72\"><circle cx=\"36\" cy=\"36\" r=\"30\" fill=\"none\" stroke=\"#1e293b\" stroke-width=\"6\"/><circle cx=\"36\" cy=\"36\" r=\"30\" fill=\"none\" stroke=\"'+sc+'\" stroke-width=\"6\" stroke-dasharray=\"'+dash+' '+(circ-dash)+'\" stroke-linecap=\"round\" transform=\"rotate(-90 36 36)\"/></svg><div style=\"position:absolute;top:50%;left:50%;transform:translate(-50%,-50%);text-align:center\"><div class=\"ring-label\" style=\"color:'+sc+'\">'+ta+'</div><div class=\"ring-sublabel\">/ '+tm+' slots</div></div></div>';
h+='<div class=\"fw-bold mb-2 small\" style=\"color:'+sc+'\">'+st+'</div>'; h+='<div class=\"fw-bold mb-2 small\" style=\"color:'+sc+'\">'+st+'</div>';
var lb={'qwen3.6-35B-A3B':'MoE','qwen3.6-27B-code':'Dense','gemma-4-E4B':'Gemma'}; var lb={'qwen3.6-35B-A3B':'MoE','qwen3.6-27B-code':'Dense','qwen3.5-9b-vlm':'VLM'};
data.gpus.forEach(function(g){var a=g.active_requests||0,mx=g.max_concurrent||1,gp=mx>0?Math.round(a/mx*100):0;h+='<div class=\"d-flex align-items-center gap-2 mb-1 justify-content-center\"><span class=\"small\" style=\"min-width:32px;text-align:right;font-size:10px\">'+(lb[g.id]||g.id)+'</span><div style=\"flex:1;max-width:70px;height:3px;background:#1e293b;border-radius:2px;overflow:hidden\"><div style=\"height:100%;width:'+gp+'%;background:'+sc+';border-radius:2px\"></div></div><span class=\"small\" style=\"min-width:22px;font-size:10px\">'+a+'/'+mx+'</span></div>'}); data.gpus.forEach(function(g){var a=g.active_requests||0,mx=g.max_concurrent||1,gp=mx>0?Math.round(a/mx*100):0;h+='<div class=\"d-flex align-items-center gap-2 mb-1 justify-content-center\"><span class=\"small\" style=\"min-width:32px;text-align:right;font-size:10px\">'+(lb[g.id]||g.id)+'</span><div style=\"flex:1;max-width:70px;height:3px;background:#1e293b;border-radius:2px;overflow:hidden\"><div style=\"height:100%;width:'+gp+'%;background:'+sc+';border-radius:2px\"></div></div><span class=\"small\" style=\"min-width:22px;font-size:10px\">'+a+'/'+mx+'</span></div>'});
el.innerHTML=h; el.innerHTML=h;
} }
function renderGPUMetrics(data){ function renderGPUMetrics(data){
var el=$('gpu-metrics-card');if(!el)return; var el=$('gpu-metrics-card');if(!el)return;
var lb={'qwen3.6-35B-A3B':'MoE','qwen3.6-27B-code':'Dense','gemma-4-E4B':'Gemma'}; var lb={'qwen3.6-35B-A3B':'MoE','qwen3.6-27B-code':'Dense','qwen3.5-9b-vlm':'VLM'};
var h='';data.gpus.forEach(function(g){ var h='';data.gpus.forEach(function(g){
var nm=lb[g.id]||g.id,tp=g.temp_c||0,ut=g.gpu_util_pct||0,pw=g.power_w||0,pl=g.power_limit_w||0; var nm=lb[g.id]||g.id,tp=g.temp_c||0,ut=g.gpu_util_pct||0,pw=g.power_w||0,pl=g.power_limit_w||0;
var tc=tp>85?'#ef4444':tp>70?'#f59e0b':'#22c55e',uc=ut>90?'#ef4444':ut>70?'#f59e0b':'#22c55e'; var tc=tp>85?'#ef4444':tp>70?'#f59e0b':'#22c55e',uc=ut>90?'#ef4444':ut>70?'#f59e0b':'#22c55e';
+2 -2
View File
@@ -13,7 +13,7 @@ upstream llmgpu_pool {
} }
upstream ocu_llm_pool { upstream ocu_llm_pool {
## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks ## RTX 5070 — qwen3.5-9b-vlm (VLM) — Vision + light tasks
server 192.168.68.110:8080; server 192.168.68.110:8080;
} }
@@ -36,7 +36,7 @@ map $http_x_syslog_model $gpu_upstream {
"heavy" llmgpu_pool; "heavy" llmgpu_pool;
"qwen3.5-27B" llmgpu_pool; "qwen3.5-27B" llmgpu_pool;
"light" ocu_llm_pool; "light" ocu_llm_pool;
"gemma-4" ocu_llm_pool; "qwen3.5-9b-vlm" ocu_llm_pool;
} }
## Rate limit zone — 10 req/s per IP, burst of 20 ## Rate limit zone — 10 req/s per IP, burst of 20
+2 -2
View File
@@ -13,7 +13,7 @@ upstream llmgpu_pool {
} }
upstream ocu_llm_pool { upstream ocu_llm_pool {
## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks ## RTX 5070 — qwen3.5-9b-vlm (VLM) — Vision + light tasks
server 192.168.68.110:8080; server 192.168.68.110:8080;
} }
@@ -36,7 +36,7 @@ map $http_x_syslog_model $gpu_upstream {
"heavy" llmgpu_pool; "heavy" llmgpu_pool;
"qwen3.5-27B" llmgpu_pool; "qwen3.5-27B" llmgpu_pool;
"light" ocu_llm_pool; "light" ocu_llm_pool;
"gemma-4" ocu_llm_pool; "qwen3.5-9b-vlm" ocu_llm_pool;
} }
server { server {
+2 -2
View File
@@ -11,9 +11,9 @@ model_list:
api_base: http://192.168.68.8:8080/v1 api_base: http://192.168.68.8:8080/v1
api_key: "not-needed" api_key: "not-needed"
- model_name: gemma-4-E4B - model_name: qwen3.5-9b-vlm
litellm_params: litellm_params:
model: openai/gemma-4-E4B model: openai/qwen3.5-9b-vlm
api_base: http://192.168.68.110:8080/v1 api_base: http://192.168.68.110:8080/v1
api_key: "not-needed" api_key: "not-needed"
+18 -19
View File
@@ -10,24 +10,24 @@ GPU_LIGHT_URL = os.environ.get("GPU_LIGHT_URL", "http://192.168.68.110:8080/v1")
GPU_SIDECARS = { GPU_SIDECARS = {
"qwen3.6-35B-A3B": "http://192.168.68.15:8090", "qwen3.6-35B-A3B": "http://192.168.68.15:8090",
"qwen3.6-27B-code": "http://192.168.68.8:8090", "qwen3.6-27B-code": "http://192.168.68.8:8090",
"gemma-4-E4B": "http://192.168.68.110:8090", "qwen3.5-9b-vlm": "http://192.168.68.110:8090",
} }
GPU_URLS = { GPU_URLS = {
"qwen3.6-35B-A3B": GPU_MOE_URL, "qwen3.6-35B-A3B": GPU_MOE_URL,
"qwen3.6-27B-code": GPU_DENSE_URL, "qwen3.6-27B-code": GPU_DENSE_URL,
"gemma-4-E4B": GPU_LIGHT_URL, "qwen3.5-9b-vlm": GPU_LIGHT_URL,
} }
# Max concurrent requests per GPU (based on llama.cpp --parallel) # Max concurrent requests per GPU (based on llama.cpp --parallel)
GPU_MAX_CONCURRENT = { GPU_MAX_CONCURRENT = {
"qwen3.6-35B-A3B": 2, # 2 slots "qwen3.6-35B-A3B": 2, # 2 slots
"qwen3.6-27B-code": 2, # 2 slots "qwen3.6-27B-code": 2, # 2 slots
"gemma-4-E4B": 1, # 1 slot "qwen3.5-9b-vlm": 2, # 2 slots (12GB VRAM, 4GB headroom)
} }
TIER_MODELS = { TIER_MODELS = {
"starter": ["gemma-4-E4B"], "starter": ["qwen3.5-9b-vlm"],
"professional": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "gemma-4-E4B"], "professional": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "qwen3.5-9b-vlm"],
"enterprise": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "gemma-4-E4B"], "enterprise": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "qwen3.5-9b-vlm"],
} }
API_KEYS = { API_KEYS = {
"sk-syslog-local-master-key": {"tier": "enterprise", "agent": "admin"}, "sk-syslog-local-master-key": {"tier": "enterprise", "agent": "admin"},
@@ -139,7 +139,7 @@ def route(rd, tier):
sys = any(m.get("role")=="system" for m in msgs) sys = any(m.get("role")=="system" for m in msgs)
turns = len([m for m in msgs if m.get("role") in ("user","assistant")]) turns = len([m for m in msgs if m.get("role") in ("user","assistant")])
hints = rd.get("routing_hints",{}) hints = rd.get("routing_hints",{})
allowed = TIER_MODELS.get(tier, ["gemma-4-E4B"]) allowed = TIER_MODELS.get(tier, ["qwen3.5-9b-vlm"])
avail = [m for m in available_models() if m in allowed] avail = [m for m in available_models() if m in allowed]
if not avail: return {"model": allowed[0], "reason": "all_saturated", "saturated": True} if not avail: return {"model": allowed[0], "reason": "all_saturated", "saturated": True}
@@ -155,24 +155,24 @@ def route(rd, tier):
return {"model": target, "reason": "explicit"} return {"model": target, "reason": "explicit"}
if hints: if hints:
if hints.get("priority")=="speed" and "gemma-4-E4B" in avail: if hints.get("priority")=="speed" and "qwen3.5-9b-vlm" in avail:
return select_best_gpu(["gemma-4-E4B"], "hint_speed") or {"model":"gemma-4-E4B","reason":"hint_speed"} return select_best_gpu(["qwen3.5-9b-vlm"], "hint_speed") or {"model":"qwen3.5-9b-vlm","reason":"hint_speed"}
if hints.get("priority")=="quality" and "qwen3.6-27B-code" in avail: if hints.get("priority")=="quality" and "qwen3.6-27B-code" in avail:
return select_best_gpu(["qwen3.6-27B-code"], "hint_quality") or {"model":"qwen3.6-27B-code","reason":"hint_quality"} return select_best_gpu(["qwen3.6-27B-code"], "hint_quality") or {"model":"qwen3.6-27B-code","reason":"hint_quality"}
# Heavy -> dense (but fall back to MoE if dense is busy) # Heavy -> dense (but fall back to MoE if dense is busy)
if t > 4000 or sys or turns > 6: if t > 4000 or sys or turns > 6:
candidates = ["qwen3.6-27B-code","qwen3.6-35B-A3B","gemma-4-E4B"] candidates = ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"]
candidates = [m for m in candidates if m in avail] candidates = [m for m in candidates if m in avail]
result = select_best_gpu(candidates, "heavy_reasoning") result = select_best_gpu(candidates, "heavy_reasoning")
if result: return result if result: return result
# Ultra-light -> gemma # Ultra-light -> VLM
first_msg = msgs[0].get("content","") if msgs else "" first_msg = msgs[0].get("content","") if msgs else ""
words = len(first_msg.split()) if isinstance(first_msg, str) else 99 words = len(first_msg.split()) if isinstance(first_msg, str) else 99
if words <= 3 and turns <= 1 and not sys and "gemma-4-E4B" in avail: if words <= 3 and turns <= 1 and not sys and "qwen3.5-9b-vlm" in avail:
if not is_gpu_busy("gemma-4-E4B"): if not is_gpu_busy("qwen3.5-9b-vlm"):
return {"model":"gemma-4-E4B","reason":"ultra_light"} return {"model":"qwen3.5-9b-vlm","reason":"ultra_light"}
# Default: MoE, fall back to dense if MoE is busy # Default: MoE, fall back to dense if MoE is busy
if "qwen3.6-35B-A3B" in avail: if "qwen3.6-35B-A3B" in avail:
@@ -239,7 +239,6 @@ def chat():
is_stream = rd.get("stream", False) is_stream = rd.get("stream", False)
gpu_incr(model) gpu_incr(model)
decremented = False
log.info("ROUTE: %s -> %s (%s) stream=%s active=%d/%d", agent, model, reason, is_stream, gpu_active_count(model), GPU_MAX_CONCURRENT.get(model,1)) log.info("ROUTE: %s -> %s (%s) stream=%s active=%d/%d", agent, model, reason, is_stream, gpu_active_count(model), GPU_MAX_CONCURRENT.get(model,1))
if r: if r:
@@ -254,7 +253,6 @@ def chat():
headers={"Content-Type":"application/json","Authorization":"Bearer not-needed"}, timeout=300, stream=is_stream) headers={"Content-Type":"application/json","Authorization":"Bearer not-needed"}, timeout=300, stream=is_stream)
lat = int((time.time()-start)*1000) lat = int((time.time()-start)*1000)
gpu_decr(model) gpu_decr(model)
decremented = True # Release slot
if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502 if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
if is_stream: if is_stream:
@@ -271,11 +269,12 @@ def chat():
data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model)} data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model)}
bcast() bcast()
return jsonify(data) return jsonify(data)
if not decremented:
try: gpu_decr(model)
except: pass
except requests.Timeout: except requests.Timeout:
gpu_decr(model)
log.error("TIMEOUT: %s -> %s", agent, model)
return jsonify({"error":"timeout"}), 504 return jsonify({"error":"timeout"}), 504
except Exception as e:
gpu_decr(model)
log.error("Error: %s\n%s", e, traceback.format_exc()) log.error("Error: %s\n%s", e, traceback.format_exc())
return jsonify({"error":str(e)}), 500 return jsonify({"error":str(e)}), 500