May 19, 2026: Full harness update
- Model migration: gemma-4-E4B → qwen3.5-9b-vlm
- Dashboard reorder: Usage Over Time + GPU Metrics to top
- Router counter leak fix (gpu_decr in except handler)
- VLM slot upgrade 1→2
- Redis stale key cleanup
- Automated maintenance cron job
- LiteLLM config update
- GPU router config update
- README update
This commit is contained in:
+1
-3
@@ -1,5 +1,3 @@
|
||||
.git
|
||||
__pycache__/
|
||||
*.pyc
|
||||
.env
|
||||
redis-data/
|
||||
ssl/
|
||||
|
||||
@@ -8,7 +8,7 @@ CT 116 Docker stack for routing local GPU models through a unified OpenAI-compat
|
||||
nginx :80 → router :9000 → GPU backends
|
||||
├─ qwen3.6-35B-A3B (MoE) @ 192.168.68.15:8080
|
||||
├─ qwen3.6-27B-code (Dense) @ 192.168.68.8:8080
|
||||
└─ gemma-4-E4B (Light) @ 192.168.68.110:8080
|
||||
└─ qwen3.5-9b-vlm (VLM) @ 192.168.68.110:8080
|
||||
|
||||
LiteLLM :8081 (fallback) | Dashboard :3000 | Redis :6379 (local)
|
||||
```
|
||||
|
||||
+17
-17
@@ -80,17 +80,7 @@ body { background: #0b0f17; color: #bcc3cd; font-family: -apple-system, BlinkMac
|
||||
</div>
|
||||
|
||||
<div class="row g-3 align-items-stretch">
|
||||
<!-- ROW 1: 3 GPU Cards -->
|
||||
<div class="col-md-4"><div class="gpu-card" id="gpu-moe"><div class="text-secondary small">Loading...</div></div></div>
|
||||
<div class="col-md-4"><div class="gpu-card" id="gpu-dense"><div class="text-secondary small">Loading...</div></div></div>
|
||||
<div class="col-md-4"><div class="gpu-card" id="gpu-light"><div class="text-secondary small">Loading...</div></div></div>
|
||||
|
||||
<!-- ROW 2: Queue + Model + Agent -->
|
||||
<div class="col-md-4"><div class="chart-card"><div class="title">Queue Status</div><div class="text-center" id="queue-viz"></div></div></div>
|
||||
<div class="col-md-4"><div class="chart-card"><div class="title">Model Distribution</div><div id="route-bars"></div></div></div>
|
||||
<div class="col-md-4"><div class="chart-card"><div class="title">Agent Activity</div><div id="agent-bars"></div></div></div>
|
||||
|
||||
<!-- ROW 3: Usage Chart (8) + GPU Metrics (4) -->
|
||||
<!-- ROW 1: Usage Chart (8) + GPU Metrics (4) -->
|
||||
<div class="col-md-8"><div class="chart-card"><div class="title d-flex justify-content-between align-items-center">
|
||||
<span>Usage Over Time</span>
|
||||
<div class="d-flex gap-1">
|
||||
@@ -101,6 +91,16 @@ body { background: #0b0f17; color: #bcc3cd; font-family: -apple-system, BlinkMac
|
||||
</div><div id="timeseries-chart" style="height:150px"></div><div id="timeseries-legend" class="d-flex justify-content-center gap-3 mt-2 flex-wrap small"></div></div></div>
|
||||
<div class="col-md-4"><div class="chart-card"><div class="title">GPU Metrics</div><div id="gpu-metrics-card"></div></div></div>
|
||||
|
||||
<!-- ROW 2: 3 GPU Cards -->
|
||||
<div class="col-md-4"><div class="gpu-card" id="gpu-moe"><div class="text-secondary small">Loading...</div></div></div>
|
||||
<div class="col-md-4"><div class="gpu-card" id="gpu-dense"><div class="text-secondary small">Loading...</div></div></div>
|
||||
<div class="col-md-4"><div class="gpu-card" id="gpu-light"><div class="text-secondary small">Loading...</div></div></div>
|
||||
|
||||
<!-- ROW 3: Queue + Model + Agent -->
|
||||
<div class="col-md-4"><div class="chart-card"><div class="title">Queue Status</div><div class="text-center" id="queue-viz"></div></div></div>
|
||||
<div class="col-md-4"><div class="chart-card"><div class="title">Model Distribution</div><div id="route-bars"></div></div></div>
|
||||
<div class="col-md-4"><div class="chart-card"><div class="title">Agent Activity</div><div id="agent-bars"></div></div></div>
|
||||
|
||||
<!-- ROW 4: Live Stream -->
|
||||
<div class="col-12"><div class="chart-card"><div class="title">Live Stream</div>
|
||||
<div class="table-responsive"><table class="table table-custom mb-0">
|
||||
@@ -111,9 +111,9 @@ body { background: #0b0f17; color: #bcc3cd; font-family: -apple-system, BlinkMac
|
||||
</div>
|
||||
|
||||
<script>
|
||||
var MC={'gemma-4-E4B':'#22c55e','qwen3.6-27B-code':'#f59e0b','qwen3.6-35B-A3B':'#a78bfa'};
|
||||
var ML={'gemma-4-E4B':'Gemma 4B','qwen3.6-27B-code':'Qwen Code','qwen3.6-35B-A3B':'Qwen MoE'};
|
||||
var GL={'qwen3.6-35B-A3B':'MoE - Strix Halo','qwen3.6-27B-code':'Dense - RTX 3090','gemma-4-E4B':'Light - RTX 5070'};
|
||||
var MC={'qwen3.5-9b-vlm':'#22c55e','qwen3.6-27B-code':'#f59e0b','qwen3.6-35B-A3B':'#a78bfa'};
|
||||
var ML={'qwen3.5-9b-vlm':'Qwen3.5 9B VLM','qwen3.6-27B-code':'Qwen Code','qwen3.6-35B-A3B':'Qwen MoE'};
|
||||
var GL={'qwen3.6-35B-A3B':'MoE - Strix Halo','qwen3.6-27B-code':'Dense - RTX 3090','qwen3.5-9b-vlm':'VLM - RTX 5070'};
|
||||
function $(id){return document.getElementById(id);}
|
||||
|
||||
function render(data){
|
||||
@@ -122,7 +122,7 @@ var t=Object.values(data.route_counts||{}).reduce((a,b)=>a+b,0);
|
||||
var ta=0,tm=0;data.gpus.forEach(function(g){ta+=(g.active_requests||0);tm+=(g.max_concurrent||1)});
|
||||
$('kpi-total').textContent=t;$('kpi-active').textContent=ta+'/'+tm;$('kpi-agents').textContent=Object.keys(data.agent_counts||{}).length;
|
||||
$('update-time').textContent=new Date().toLocaleTimeString();
|
||||
var ids={'qwen3.6-35B-A3B':'gpu-moe','qwen3.6-27B-code':'gpu-dense','gemma-4-E4B':'gpu-light'};
|
||||
var ids={'qwen3.6-35B-A3B':'gpu-moe','qwen3.6-27B-code':'gpu-dense','qwen3.5-9b-vlm':'gpu-light'};
|
||||
data.gpus.forEach(function(g){
|
||||
var el=$(ids[g.id]);if(!el)return;
|
||||
var a=g.active_requests||0,mx=g.max_concurrent||1;
|
||||
@@ -154,14 +154,14 @@ var sc=pct>=100?'#ef4444':pct>=50?'#f59e0b':'#22c55e';
|
||||
var circ=188.5,dash=(pct/100)*circ;
|
||||
var h='<div class=\"d-inline-block position-relative mb-2\"><svg width=\"72\" height=\"72\"><circle cx=\"36\" cy=\"36\" r=\"30\" fill=\"none\" stroke=\"#1e293b\" stroke-width=\"6\"/><circle cx=\"36\" cy=\"36\" r=\"30\" fill=\"none\" stroke=\"'+sc+'\" stroke-width=\"6\" stroke-dasharray=\"'+dash+' '+(circ-dash)+'\" stroke-linecap=\"round\" transform=\"rotate(-90 36 36)\"/></svg><div style=\"position:absolute;top:50%;left:50%;transform:translate(-50%,-50%);text-align:center\"><div class=\"ring-label\" style=\"color:'+sc+'\">'+ta+'</div><div class=\"ring-sublabel\">/ '+tm+' slots</div></div></div>';
|
||||
h+='<div class=\"fw-bold mb-2 small\" style=\"color:'+sc+'\">'+st+'</div>';
|
||||
var lb={'qwen3.6-35B-A3B':'MoE','qwen3.6-27B-code':'Dense','gemma-4-E4B':'Gemma'};
|
||||
var lb={'qwen3.6-35B-A3B':'MoE','qwen3.6-27B-code':'Dense','qwen3.5-9b-vlm':'VLM'};
|
||||
data.gpus.forEach(function(g){var a=g.active_requests||0,mx=g.max_concurrent||1,gp=mx>0?Math.round(a/mx*100):0;h+='<div class=\"d-flex align-items-center gap-2 mb-1 justify-content-center\"><span class=\"small\" style=\"min-width:32px;text-align:right;font-size:10px\">'+(lb[g.id]||g.id)+'</span><div style=\"flex:1;max-width:70px;height:3px;background:#1e293b;border-radius:2px;overflow:hidden\"><div style=\"height:100%;width:'+gp+'%;background:'+sc+';border-radius:2px\"></div></div><span class=\"small\" style=\"min-width:22px;font-size:10px\">'+a+'/'+mx+'</span></div>'});
|
||||
el.innerHTML=h;
|
||||
}
|
||||
|
||||
function renderGPUMetrics(data){
|
||||
var el=$('gpu-metrics-card');if(!el)return;
|
||||
var lb={'qwen3.6-35B-A3B':'MoE','qwen3.6-27B-code':'Dense','gemma-4-E4B':'Gemma'};
|
||||
var lb={'qwen3.6-35B-A3B':'MoE','qwen3.6-27B-code':'Dense','qwen3.5-9b-vlm':'VLM'};
|
||||
var h='';data.gpus.forEach(function(g){
|
||||
var nm=lb[g.id]||g.id,tp=g.temp_c||0,ut=g.gpu_util_pct||0,pw=g.power_w||0,pl=g.power_limit_w||0;
|
||||
var tc=tp>85?'#ef4444':tp>70?'#f59e0b':'#22c55e',uc=ut>90?'#ef4444':ut>70?'#f59e0b':'#22c55e';
|
||||
|
||||
@@ -13,7 +13,7 @@ upstream llmgpu_pool {
|
||||
}
|
||||
|
||||
upstream ocu_llm_pool {
|
||||
## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks
|
||||
## RTX 5070 — qwen3.5-9b-vlm (VLM) — Vision + light tasks
|
||||
server 192.168.68.110:8080;
|
||||
}
|
||||
|
||||
@@ -36,7 +36,7 @@ map $http_x_syslog_model $gpu_upstream {
|
||||
"heavy" llmgpu_pool;
|
||||
"qwen3.5-27B" llmgpu_pool;
|
||||
"light" ocu_llm_pool;
|
||||
"gemma-4" ocu_llm_pool;
|
||||
"qwen3.5-9b-vlm" ocu_llm_pool;
|
||||
}
|
||||
|
||||
## Rate limit zone — 10 req/s per IP, burst of 20
|
||||
|
||||
+2
-2
@@ -13,7 +13,7 @@ upstream llmgpu_pool {
|
||||
}
|
||||
|
||||
upstream ocu_llm_pool {
|
||||
## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks
|
||||
## RTX 5070 — qwen3.5-9b-vlm (VLM) — Vision + light tasks
|
||||
server 192.168.68.110:8080;
|
||||
}
|
||||
|
||||
@@ -36,7 +36,7 @@ map $http_x_syslog_model $gpu_upstream {
|
||||
"heavy" llmgpu_pool;
|
||||
"qwen3.5-27B" llmgpu_pool;
|
||||
"light" ocu_llm_pool;
|
||||
"gemma-4" ocu_llm_pool;
|
||||
"qwen3.5-9b-vlm" ocu_llm_pool;
|
||||
}
|
||||
|
||||
server {
|
||||
|
||||
+2
-2
@@ -11,9 +11,9 @@ model_list:
|
||||
api_base: http://192.168.68.8:8080/v1
|
||||
api_key: "not-needed"
|
||||
|
||||
- model_name: gemma-4-E4B
|
||||
- model_name: qwen3.5-9b-vlm
|
||||
litellm_params:
|
||||
model: openai/gemma-4-E4B
|
||||
model: openai/qwen3.5-9b-vlm
|
||||
api_base: http://192.168.68.110:8080/v1
|
||||
api_key: "not-needed"
|
||||
|
||||
|
||||
+18
-19
@@ -10,24 +10,24 @@ GPU_LIGHT_URL = os.environ.get("GPU_LIGHT_URL", "http://192.168.68.110:8080/v1")
|
||||
GPU_SIDECARS = {
|
||||
"qwen3.6-35B-A3B": "http://192.168.68.15:8090",
|
||||
"qwen3.6-27B-code": "http://192.168.68.8:8090",
|
||||
"gemma-4-E4B": "http://192.168.68.110:8090",
|
||||
"qwen3.5-9b-vlm": "http://192.168.68.110:8090",
|
||||
}
|
||||
GPU_URLS = {
|
||||
"qwen3.6-35B-A3B": GPU_MOE_URL,
|
||||
"qwen3.6-27B-code": GPU_DENSE_URL,
|
||||
"gemma-4-E4B": GPU_LIGHT_URL,
|
||||
"qwen3.5-9b-vlm": GPU_LIGHT_URL,
|
||||
}
|
||||
# Max concurrent requests per GPU (based on llama.cpp --parallel)
|
||||
GPU_MAX_CONCURRENT = {
|
||||
"qwen3.6-35B-A3B": 2, # 2 slots
|
||||
"qwen3.6-27B-code": 2, # 2 slots
|
||||
"gemma-4-E4B": 1, # 1 slot
|
||||
"qwen3.5-9b-vlm": 2, # 2 slots (12GB VRAM, 4GB headroom)
|
||||
}
|
||||
|
||||
TIER_MODELS = {
|
||||
"starter": ["gemma-4-E4B"],
|
||||
"professional": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "gemma-4-E4B"],
|
||||
"enterprise": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "gemma-4-E4B"],
|
||||
"starter": ["qwen3.5-9b-vlm"],
|
||||
"professional": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "qwen3.5-9b-vlm"],
|
||||
"enterprise": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "qwen3.5-9b-vlm"],
|
||||
}
|
||||
API_KEYS = {
|
||||
"sk-syslog-local-master-key": {"tier": "enterprise", "agent": "admin"},
|
||||
@@ -139,7 +139,7 @@ def route(rd, tier):
|
||||
sys = any(m.get("role")=="system" for m in msgs)
|
||||
turns = len([m for m in msgs if m.get("role") in ("user","assistant")])
|
||||
hints = rd.get("routing_hints",{})
|
||||
allowed = TIER_MODELS.get(tier, ["gemma-4-E4B"])
|
||||
allowed = TIER_MODELS.get(tier, ["qwen3.5-9b-vlm"])
|
||||
avail = [m for m in available_models() if m in allowed]
|
||||
if not avail: return {"model": allowed[0], "reason": "all_saturated", "saturated": True}
|
||||
|
||||
@@ -155,24 +155,24 @@ def route(rd, tier):
|
||||
return {"model": target, "reason": "explicit"}
|
||||
|
||||
if hints:
|
||||
if hints.get("priority")=="speed" and "gemma-4-E4B" in avail:
|
||||
return select_best_gpu(["gemma-4-E4B"], "hint_speed") or {"model":"gemma-4-E4B","reason":"hint_speed"}
|
||||
if hints.get("priority")=="speed" and "qwen3.5-9b-vlm" in avail:
|
||||
return select_best_gpu(["qwen3.5-9b-vlm"], "hint_speed") or {"model":"qwen3.5-9b-vlm","reason":"hint_speed"}
|
||||
if hints.get("priority")=="quality" and "qwen3.6-27B-code" in avail:
|
||||
return select_best_gpu(["qwen3.6-27B-code"], "hint_quality") or {"model":"qwen3.6-27B-code","reason":"hint_quality"}
|
||||
|
||||
# Heavy -> dense (but fall back to MoE if dense is busy)
|
||||
if t > 4000 or sys or turns > 6:
|
||||
candidates = ["qwen3.6-27B-code","qwen3.6-35B-A3B","gemma-4-E4B"]
|
||||
candidates = ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"]
|
||||
candidates = [m for m in candidates if m in avail]
|
||||
result = select_best_gpu(candidates, "heavy_reasoning")
|
||||
if result: return result
|
||||
|
||||
# Ultra-light -> gemma
|
||||
# Ultra-light -> VLM
|
||||
first_msg = msgs[0].get("content","") if msgs else ""
|
||||
words = len(first_msg.split()) if isinstance(first_msg, str) else 99
|
||||
if words <= 3 and turns <= 1 and not sys and "gemma-4-E4B" in avail:
|
||||
if not is_gpu_busy("gemma-4-E4B"):
|
||||
return {"model":"gemma-4-E4B","reason":"ultra_light"}
|
||||
if words <= 3 and turns <= 1 and not sys and "qwen3.5-9b-vlm" in avail:
|
||||
if not is_gpu_busy("qwen3.5-9b-vlm"):
|
||||
return {"model":"qwen3.5-9b-vlm","reason":"ultra_light"}
|
||||
|
||||
# Default: MoE, fall back to dense if MoE is busy
|
||||
if "qwen3.6-35B-A3B" in avail:
|
||||
@@ -239,7 +239,6 @@ def chat():
|
||||
is_stream = rd.get("stream", False)
|
||||
|
||||
gpu_incr(model)
|
||||
decremented = False
|
||||
|
||||
log.info("ROUTE: %s -> %s (%s) stream=%s active=%d/%d", agent, model, reason, is_stream, gpu_active_count(model), GPU_MAX_CONCURRENT.get(model,1))
|
||||
if r:
|
||||
@@ -254,7 +253,6 @@ def chat():
|
||||
headers={"Content-Type":"application/json","Authorization":"Bearer not-needed"}, timeout=300, stream=is_stream)
|
||||
lat = int((time.time()-start)*1000)
|
||||
gpu_decr(model)
|
||||
decremented = True # Release slot
|
||||
|
||||
if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
|
||||
if is_stream:
|
||||
@@ -271,11 +269,12 @@ def chat():
|
||||
data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model)}
|
||||
bcast()
|
||||
return jsonify(data)
|
||||
if not decremented:
|
||||
try: gpu_decr(model)
|
||||
except: pass
|
||||
except requests.Timeout:
|
||||
gpu_decr(model)
|
||||
log.error("TIMEOUT: %s -> %s", agent, model)
|
||||
return jsonify({"error":"timeout"}), 504
|
||||
except Exception as e:
|
||||
gpu_decr(model)
|
||||
log.error("Error: %s\n%s", e, traceback.format_exc())
|
||||
return jsonify({"error":str(e)}), 500
|
||||
|
||||
|
||||
Reference in New Issue
Block a user