May 19, 2026: Full harness update
- Model migration: gemma-4-E4B → qwen3.5-9b-vlm
- Dashboard reorder: Usage Over Time + GPU Metrics to top
- Router counter leak fix (gpu_decr in except handler)
- VLM slot upgrade 1→2
- Redis stale key cleanup
- Automated maintenance cron job
- LiteLLM config update
- GPU router config update
- README update
This commit is contained in:
+1
-3
@@ -1,5 +1,3 @@
|
|||||||
|
.git
|
||||||
__pycache__/
|
__pycache__/
|
||||||
*.pyc
|
*.pyc
|
||||||
.env
|
|
||||||
redis-data/
|
|
||||||
ssl/
|
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ CT 116 Docker stack for routing local GPU models through a unified OpenAI-compat
|
|||||||
nginx :80 → router :9000 → GPU backends
|
nginx :80 → router :9000 → GPU backends
|
||||||
├─ qwen3.6-35B-A3B (MoE) @ 192.168.68.15:8080
|
├─ qwen3.6-35B-A3B (MoE) @ 192.168.68.15:8080
|
||||||
├─ qwen3.6-27B-code (Dense) @ 192.168.68.8:8080
|
├─ qwen3.6-27B-code (Dense) @ 192.168.68.8:8080
|
||||||
└─ gemma-4-E4B (Light) @ 192.168.68.110:8080
|
└─ qwen3.5-9b-vlm (VLM) @ 192.168.68.110:8080
|
||||||
|
|
||||||
LiteLLM :8081 (fallback) | Dashboard :3000 | Redis :6379 (local)
|
LiteLLM :8081 (fallback) | Dashboard :3000 | Redis :6379 (local)
|
||||||
```
|
```
|
||||||
|
|||||||
+17
-17
@@ -80,17 +80,7 @@ body { background: #0b0f17; color: #bcc3cd; font-family: -apple-system, BlinkMac
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="row g-3 align-items-stretch">
|
<div class="row g-3 align-items-stretch">
|
||||||
<!-- ROW 1: 3 GPU Cards -->
|
<!-- ROW 1: Usage Chart (8) + GPU Metrics (4) -->
|
||||||
<div class="col-md-4"><div class="gpu-card" id="gpu-moe"><div class="text-secondary small">Loading...</div></div></div>
|
|
||||||
<div class="col-md-4"><div class="gpu-card" id="gpu-dense"><div class="text-secondary small">Loading...</div></div></div>
|
|
||||||
<div class="col-md-4"><div class="gpu-card" id="gpu-light"><div class="text-secondary small">Loading...</div></div></div>
|
|
||||||
|
|
||||||
<!-- ROW 2: Queue + Model + Agent -->
|
|
||||||
<div class="col-md-4"><div class="chart-card"><div class="title">Queue Status</div><div class="text-center" id="queue-viz"></div></div></div>
|
|
||||||
<div class="col-md-4"><div class="chart-card"><div class="title">Model Distribution</div><div id="route-bars"></div></div></div>
|
|
||||||
<div class="col-md-4"><div class="chart-card"><div class="title">Agent Activity</div><div id="agent-bars"></div></div></div>
|
|
||||||
|
|
||||||
<!-- ROW 3: Usage Chart (8) + GPU Metrics (4) -->
|
|
||||||
<div class="col-md-8"><div class="chart-card"><div class="title d-flex justify-content-between align-items-center">
|
<div class="col-md-8"><div class="chart-card"><div class="title d-flex justify-content-between align-items-center">
|
||||||
<span>Usage Over Time</span>
|
<span>Usage Over Time</span>
|
||||||
<div class="d-flex gap-1">
|
<div class="d-flex gap-1">
|
||||||
@@ -101,6 +91,16 @@ body { background: #0b0f17; color: #bcc3cd; font-family: -apple-system, BlinkMac
|
|||||||
</div><div id="timeseries-chart" style="height:150px"></div><div id="timeseries-legend" class="d-flex justify-content-center gap-3 mt-2 flex-wrap small"></div></div></div>
|
</div><div id="timeseries-chart" style="height:150px"></div><div id="timeseries-legend" class="d-flex justify-content-center gap-3 mt-2 flex-wrap small"></div></div></div>
|
||||||
<div class="col-md-4"><div class="chart-card"><div class="title">GPU Metrics</div><div id="gpu-metrics-card"></div></div></div>
|
<div class="col-md-4"><div class="chart-card"><div class="title">GPU Metrics</div><div id="gpu-metrics-card"></div></div></div>
|
||||||
|
|
||||||
|
<!-- ROW 2: 3 GPU Cards -->
|
||||||
|
<div class="col-md-4"><div class="gpu-card" id="gpu-moe"><div class="text-secondary small">Loading...</div></div></div>
|
||||||
|
<div class="col-md-4"><div class="gpu-card" id="gpu-dense"><div class="text-secondary small">Loading...</div></div></div>
|
||||||
|
<div class="col-md-4"><div class="gpu-card" id="gpu-light"><div class="text-secondary small">Loading...</div></div></div>
|
||||||
|
|
||||||
|
<!-- ROW 3: Queue + Model + Agent -->
|
||||||
|
<div class="col-md-4"><div class="chart-card"><div class="title">Queue Status</div><div class="text-center" id="queue-viz"></div></div></div>
|
||||||
|
<div class="col-md-4"><div class="chart-card"><div class="title">Model Distribution</div><div id="route-bars"></div></div></div>
|
||||||
|
<div class="col-md-4"><div class="chart-card"><div class="title">Agent Activity</div><div id="agent-bars"></div></div></div>
|
||||||
|
|
||||||
<!-- ROW 4: Live Stream -->
|
<!-- ROW 4: Live Stream -->
|
||||||
<div class="col-12"><div class="chart-card"><div class="title">Live Stream</div>
|
<div class="col-12"><div class="chart-card"><div class="title">Live Stream</div>
|
||||||
<div class="table-responsive"><table class="table table-custom mb-0">
|
<div class="table-responsive"><table class="table table-custom mb-0">
|
||||||
@@ -111,9 +111,9 @@ body { background: #0b0f17; color: #bcc3cd; font-family: -apple-system, BlinkMac
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
var MC={'gemma-4-E4B':'#22c55e','qwen3.6-27B-code':'#f59e0b','qwen3.6-35B-A3B':'#a78bfa'};
|
var MC={'qwen3.5-9b-vlm':'#22c55e','qwen3.6-27B-code':'#f59e0b','qwen3.6-35B-A3B':'#a78bfa'};
|
||||||
var ML={'gemma-4-E4B':'Gemma 4B','qwen3.6-27B-code':'Qwen Code','qwen3.6-35B-A3B':'Qwen MoE'};
|
var ML={'qwen3.5-9b-vlm':'Qwen3.5 9B VLM','qwen3.6-27B-code':'Qwen Code','qwen3.6-35B-A3B':'Qwen MoE'};
|
||||||
var GL={'qwen3.6-35B-A3B':'MoE - Strix Halo','qwen3.6-27B-code':'Dense - RTX 3090','gemma-4-E4B':'Light - RTX 5070'};
|
var GL={'qwen3.6-35B-A3B':'MoE - Strix Halo','qwen3.6-27B-code':'Dense - RTX 3090','qwen3.5-9b-vlm':'VLM - RTX 5070'};
|
||||||
function $(id){return document.getElementById(id);}
|
function $(id){return document.getElementById(id);}
|
||||||
|
|
||||||
function render(data){
|
function render(data){
|
||||||
@@ -122,7 +122,7 @@ var t=Object.values(data.route_counts||{}).reduce((a,b)=>a+b,0);
|
|||||||
var ta=0,tm=0;data.gpus.forEach(function(g){ta+=(g.active_requests||0);tm+=(g.max_concurrent||1)});
|
var ta=0,tm=0;data.gpus.forEach(function(g){ta+=(g.active_requests||0);tm+=(g.max_concurrent||1)});
|
||||||
$('kpi-total').textContent=t;$('kpi-active').textContent=ta+'/'+tm;$('kpi-agents').textContent=Object.keys(data.agent_counts||{}).length;
|
$('kpi-total').textContent=t;$('kpi-active').textContent=ta+'/'+tm;$('kpi-agents').textContent=Object.keys(data.agent_counts||{}).length;
|
||||||
$('update-time').textContent=new Date().toLocaleTimeString();
|
$('update-time').textContent=new Date().toLocaleTimeString();
|
||||||
var ids={'qwen3.6-35B-A3B':'gpu-moe','qwen3.6-27B-code':'gpu-dense','gemma-4-E4B':'gpu-light'};
|
var ids={'qwen3.6-35B-A3B':'gpu-moe','qwen3.6-27B-code':'gpu-dense','qwen3.5-9b-vlm':'gpu-light'};
|
||||||
data.gpus.forEach(function(g){
|
data.gpus.forEach(function(g){
|
||||||
var el=$(ids[g.id]);if(!el)return;
|
var el=$(ids[g.id]);if(!el)return;
|
||||||
var a=g.active_requests||0,mx=g.max_concurrent||1;
|
var a=g.active_requests||0,mx=g.max_concurrent||1;
|
||||||
@@ -154,14 +154,14 @@ var sc=pct>=100?'#ef4444':pct>=50?'#f59e0b':'#22c55e';
|
|||||||
var circ=188.5,dash=(pct/100)*circ;
|
var circ=188.5,dash=(pct/100)*circ;
|
||||||
var h='<div class=\"d-inline-block position-relative mb-2\"><svg width=\"72\" height=\"72\"><circle cx=\"36\" cy=\"36\" r=\"30\" fill=\"none\" stroke=\"#1e293b\" stroke-width=\"6\"/><circle cx=\"36\" cy=\"36\" r=\"30\" fill=\"none\" stroke=\"'+sc+'\" stroke-width=\"6\" stroke-dasharray=\"'+dash+' '+(circ-dash)+'\" stroke-linecap=\"round\" transform=\"rotate(-90 36 36)\"/></svg><div style=\"position:absolute;top:50%;left:50%;transform:translate(-50%,-50%);text-align:center\"><div class=\"ring-label\" style=\"color:'+sc+'\">'+ta+'</div><div class=\"ring-sublabel\">/ '+tm+' slots</div></div></div>';
|
var h='<div class=\"d-inline-block position-relative mb-2\"><svg width=\"72\" height=\"72\"><circle cx=\"36\" cy=\"36\" r=\"30\" fill=\"none\" stroke=\"#1e293b\" stroke-width=\"6\"/><circle cx=\"36\" cy=\"36\" r=\"30\" fill=\"none\" stroke=\"'+sc+'\" stroke-width=\"6\" stroke-dasharray=\"'+dash+' '+(circ-dash)+'\" stroke-linecap=\"round\" transform=\"rotate(-90 36 36)\"/></svg><div style=\"position:absolute;top:50%;left:50%;transform:translate(-50%,-50%);text-align:center\"><div class=\"ring-label\" style=\"color:'+sc+'\">'+ta+'</div><div class=\"ring-sublabel\">/ '+tm+' slots</div></div></div>';
|
||||||
h+='<div class=\"fw-bold mb-2 small\" style=\"color:'+sc+'\">'+st+'</div>';
|
h+='<div class=\"fw-bold mb-2 small\" style=\"color:'+sc+'\">'+st+'</div>';
|
||||||
var lb={'qwen3.6-35B-A3B':'MoE','qwen3.6-27B-code':'Dense','gemma-4-E4B':'Gemma'};
|
var lb={'qwen3.6-35B-A3B':'MoE','qwen3.6-27B-code':'Dense','qwen3.5-9b-vlm':'VLM'};
|
||||||
data.gpus.forEach(function(g){var a=g.active_requests||0,mx=g.max_concurrent||1,gp=mx>0?Math.round(a/mx*100):0;h+='<div class=\"d-flex align-items-center gap-2 mb-1 justify-content-center\"><span class=\"small\" style=\"min-width:32px;text-align:right;font-size:10px\">'+(lb[g.id]||g.id)+'</span><div style=\"flex:1;max-width:70px;height:3px;background:#1e293b;border-radius:2px;overflow:hidden\"><div style=\"height:100%;width:'+gp+'%;background:'+sc+';border-radius:2px\"></div></div><span class=\"small\" style=\"min-width:22px;font-size:10px\">'+a+'/'+mx+'</span></div>'});
|
data.gpus.forEach(function(g){var a=g.active_requests||0,mx=g.max_concurrent||1,gp=mx>0?Math.round(a/mx*100):0;h+='<div class=\"d-flex align-items-center gap-2 mb-1 justify-content-center\"><span class=\"small\" style=\"min-width:32px;text-align:right;font-size:10px\">'+(lb[g.id]||g.id)+'</span><div style=\"flex:1;max-width:70px;height:3px;background:#1e293b;border-radius:2px;overflow:hidden\"><div style=\"height:100%;width:'+gp+'%;background:'+sc+';border-radius:2px\"></div></div><span class=\"small\" style=\"min-width:22px;font-size:10px\">'+a+'/'+mx+'</span></div>'});
|
||||||
el.innerHTML=h;
|
el.innerHTML=h;
|
||||||
}
|
}
|
||||||
|
|
||||||
function renderGPUMetrics(data){
|
function renderGPUMetrics(data){
|
||||||
var el=$('gpu-metrics-card');if(!el)return;
|
var el=$('gpu-metrics-card');if(!el)return;
|
||||||
var lb={'qwen3.6-35B-A3B':'MoE','qwen3.6-27B-code':'Dense','gemma-4-E4B':'Gemma'};
|
var lb={'qwen3.6-35B-A3B':'MoE','qwen3.6-27B-code':'Dense','qwen3.5-9b-vlm':'VLM'};
|
||||||
var h='';data.gpus.forEach(function(g){
|
var h='';data.gpus.forEach(function(g){
|
||||||
var nm=lb[g.id]||g.id,tp=g.temp_c||0,ut=g.gpu_util_pct||0,pw=g.power_w||0,pl=g.power_limit_w||0;
|
var nm=lb[g.id]||g.id,tp=g.temp_c||0,ut=g.gpu_util_pct||0,pw=g.power_w||0,pl=g.power_limit_w||0;
|
||||||
var tc=tp>85?'#ef4444':tp>70?'#f59e0b':'#22c55e',uc=ut>90?'#ef4444':ut>70?'#f59e0b':'#22c55e';
|
var tc=tp>85?'#ef4444':tp>70?'#f59e0b':'#22c55e',uc=ut>90?'#ef4444':ut>70?'#f59e0b':'#22c55e';
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ upstream llmgpu_pool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
upstream ocu_llm_pool {
|
upstream ocu_llm_pool {
|
||||||
## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks
|
## RTX 5070 — qwen3.5-9b-vlm (VLM) — Vision + light tasks
|
||||||
server 192.168.68.110:8080;
|
server 192.168.68.110:8080;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -36,7 +36,7 @@ map $http_x_syslog_model $gpu_upstream {
|
|||||||
"heavy" llmgpu_pool;
|
"heavy" llmgpu_pool;
|
||||||
"qwen3.5-27B" llmgpu_pool;
|
"qwen3.5-27B" llmgpu_pool;
|
||||||
"light" ocu_llm_pool;
|
"light" ocu_llm_pool;
|
||||||
"gemma-4" ocu_llm_pool;
|
"qwen3.5-9b-vlm" ocu_llm_pool;
|
||||||
}
|
}
|
||||||
|
|
||||||
## Rate limit zone — 10 req/s per IP, burst of 20
|
## Rate limit zone — 10 req/s per IP, burst of 20
|
||||||
|
|||||||
+2
-2
@@ -13,7 +13,7 @@ upstream llmgpu_pool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
upstream ocu_llm_pool {
|
upstream ocu_llm_pool {
|
||||||
## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks
|
## RTX 5070 — qwen3.5-9b-vlm (VLM) — Vision + light tasks
|
||||||
server 192.168.68.110:8080;
|
server 192.168.68.110:8080;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -36,7 +36,7 @@ map $http_x_syslog_model $gpu_upstream {
|
|||||||
"heavy" llmgpu_pool;
|
"heavy" llmgpu_pool;
|
||||||
"qwen3.5-27B" llmgpu_pool;
|
"qwen3.5-27B" llmgpu_pool;
|
||||||
"light" ocu_llm_pool;
|
"light" ocu_llm_pool;
|
||||||
"gemma-4" ocu_llm_pool;
|
"qwen3.5-9b-vlm" ocu_llm_pool;
|
||||||
}
|
}
|
||||||
|
|
||||||
server {
|
server {
|
||||||
|
|||||||
+2
-2
@@ -11,9 +11,9 @@ model_list:
|
|||||||
api_base: http://192.168.68.8:8080/v1
|
api_base: http://192.168.68.8:8080/v1
|
||||||
api_key: "not-needed"
|
api_key: "not-needed"
|
||||||
|
|
||||||
- model_name: gemma-4-E4B
|
- model_name: qwen3.5-9b-vlm
|
||||||
litellm_params:
|
litellm_params:
|
||||||
model: openai/gemma-4-E4B
|
model: openai/qwen3.5-9b-vlm
|
||||||
api_base: http://192.168.68.110:8080/v1
|
api_base: http://192.168.68.110:8080/v1
|
||||||
api_key: "not-needed"
|
api_key: "not-needed"
|
||||||
|
|
||||||
|
|||||||
+18
-19
@@ -10,24 +10,24 @@ GPU_LIGHT_URL = os.environ.get("GPU_LIGHT_URL", "http://192.168.68.110:8080/v1")
|
|||||||
GPU_SIDECARS = {
|
GPU_SIDECARS = {
|
||||||
"qwen3.6-35B-A3B": "http://192.168.68.15:8090",
|
"qwen3.6-35B-A3B": "http://192.168.68.15:8090",
|
||||||
"qwen3.6-27B-code": "http://192.168.68.8:8090",
|
"qwen3.6-27B-code": "http://192.168.68.8:8090",
|
||||||
"gemma-4-E4B": "http://192.168.68.110:8090",
|
"qwen3.5-9b-vlm": "http://192.168.68.110:8090",
|
||||||
}
|
}
|
||||||
GPU_URLS = {
|
GPU_URLS = {
|
||||||
"qwen3.6-35B-A3B": GPU_MOE_URL,
|
"qwen3.6-35B-A3B": GPU_MOE_URL,
|
||||||
"qwen3.6-27B-code": GPU_DENSE_URL,
|
"qwen3.6-27B-code": GPU_DENSE_URL,
|
||||||
"gemma-4-E4B": GPU_LIGHT_URL,
|
"qwen3.5-9b-vlm": GPU_LIGHT_URL,
|
||||||
}
|
}
|
||||||
# Max concurrent requests per GPU (based on llama.cpp --parallel)
|
# Max concurrent requests per GPU (based on llama.cpp --parallel)
|
||||||
GPU_MAX_CONCURRENT = {
|
GPU_MAX_CONCURRENT = {
|
||||||
"qwen3.6-35B-A3B": 2, # 2 slots
|
"qwen3.6-35B-A3B": 2, # 2 slots
|
||||||
"qwen3.6-27B-code": 2, # 2 slots
|
"qwen3.6-27B-code": 2, # 2 slots
|
||||||
"gemma-4-E4B": 1, # 1 slot
|
"qwen3.5-9b-vlm": 2, # 2 slots (12GB VRAM, 4GB headroom)
|
||||||
}
|
}
|
||||||
|
|
||||||
TIER_MODELS = {
|
TIER_MODELS = {
|
||||||
"starter": ["gemma-4-E4B"],
|
"starter": ["qwen3.5-9b-vlm"],
|
||||||
"professional": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "gemma-4-E4B"],
|
"professional": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "qwen3.5-9b-vlm"],
|
||||||
"enterprise": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "gemma-4-E4B"],
|
"enterprise": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "qwen3.5-9b-vlm"],
|
||||||
}
|
}
|
||||||
API_KEYS = {
|
API_KEYS = {
|
||||||
"sk-syslog-local-master-key": {"tier": "enterprise", "agent": "admin"},
|
"sk-syslog-local-master-key": {"tier": "enterprise", "agent": "admin"},
|
||||||
@@ -139,7 +139,7 @@ def route(rd, tier):
|
|||||||
sys = any(m.get("role")=="system" for m in msgs)
|
sys = any(m.get("role")=="system" for m in msgs)
|
||||||
turns = len([m for m in msgs if m.get("role") in ("user","assistant")])
|
turns = len([m for m in msgs if m.get("role") in ("user","assistant")])
|
||||||
hints = rd.get("routing_hints",{})
|
hints = rd.get("routing_hints",{})
|
||||||
allowed = TIER_MODELS.get(tier, ["gemma-4-E4B"])
|
allowed = TIER_MODELS.get(tier, ["qwen3.5-9b-vlm"])
|
||||||
avail = [m for m in available_models() if m in allowed]
|
avail = [m for m in available_models() if m in allowed]
|
||||||
if not avail: return {"model": allowed[0], "reason": "all_saturated", "saturated": True}
|
if not avail: return {"model": allowed[0], "reason": "all_saturated", "saturated": True}
|
||||||
|
|
||||||
@@ -155,24 +155,24 @@ def route(rd, tier):
|
|||||||
return {"model": target, "reason": "explicit"}
|
return {"model": target, "reason": "explicit"}
|
||||||
|
|
||||||
if hints:
|
if hints:
|
||||||
if hints.get("priority")=="speed" and "gemma-4-E4B" in avail:
|
if hints.get("priority")=="speed" and "qwen3.5-9b-vlm" in avail:
|
||||||
return select_best_gpu(["gemma-4-E4B"], "hint_speed") or {"model":"gemma-4-E4B","reason":"hint_speed"}
|
return select_best_gpu(["qwen3.5-9b-vlm"], "hint_speed") or {"model":"qwen3.5-9b-vlm","reason":"hint_speed"}
|
||||||
if hints.get("priority")=="quality" and "qwen3.6-27B-code" in avail:
|
if hints.get("priority")=="quality" and "qwen3.6-27B-code" in avail:
|
||||||
return select_best_gpu(["qwen3.6-27B-code"], "hint_quality") or {"model":"qwen3.6-27B-code","reason":"hint_quality"}
|
return select_best_gpu(["qwen3.6-27B-code"], "hint_quality") or {"model":"qwen3.6-27B-code","reason":"hint_quality"}
|
||||||
|
|
||||||
# Heavy -> dense (but fall back to MoE if dense is busy)
|
# Heavy -> dense (but fall back to MoE if dense is busy)
|
||||||
if t > 4000 or sys or turns > 6:
|
if t > 4000 or sys or turns > 6:
|
||||||
candidates = ["qwen3.6-27B-code","qwen3.6-35B-A3B","gemma-4-E4B"]
|
candidates = ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"]
|
||||||
candidates = [m for m in candidates if m in avail]
|
candidates = [m for m in candidates if m in avail]
|
||||||
result = select_best_gpu(candidates, "heavy_reasoning")
|
result = select_best_gpu(candidates, "heavy_reasoning")
|
||||||
if result: return result
|
if result: return result
|
||||||
|
|
||||||
# Ultra-light -> gemma
|
# Ultra-light -> VLM
|
||||||
first_msg = msgs[0].get("content","") if msgs else ""
|
first_msg = msgs[0].get("content","") if msgs else ""
|
||||||
words = len(first_msg.split()) if isinstance(first_msg, str) else 99
|
words = len(first_msg.split()) if isinstance(first_msg, str) else 99
|
||||||
if words <= 3 and turns <= 1 and not sys and "gemma-4-E4B" in avail:
|
if words <= 3 and turns <= 1 and not sys and "qwen3.5-9b-vlm" in avail:
|
||||||
if not is_gpu_busy("gemma-4-E4B"):
|
if not is_gpu_busy("qwen3.5-9b-vlm"):
|
||||||
return {"model":"gemma-4-E4B","reason":"ultra_light"}
|
return {"model":"qwen3.5-9b-vlm","reason":"ultra_light"}
|
||||||
|
|
||||||
# Default: MoE, fall back to dense if MoE is busy
|
# Default: MoE, fall back to dense if MoE is busy
|
||||||
if "qwen3.6-35B-A3B" in avail:
|
if "qwen3.6-35B-A3B" in avail:
|
||||||
@@ -239,7 +239,6 @@ def chat():
|
|||||||
is_stream = rd.get("stream", False)
|
is_stream = rd.get("stream", False)
|
||||||
|
|
||||||
gpu_incr(model)
|
gpu_incr(model)
|
||||||
decremented = False
|
|
||||||
|
|
||||||
log.info("ROUTE: %s -> %s (%s) stream=%s active=%d/%d", agent, model, reason, is_stream, gpu_active_count(model), GPU_MAX_CONCURRENT.get(model,1))
|
log.info("ROUTE: %s -> %s (%s) stream=%s active=%d/%d", agent, model, reason, is_stream, gpu_active_count(model), GPU_MAX_CONCURRENT.get(model,1))
|
||||||
if r:
|
if r:
|
||||||
@@ -254,7 +253,6 @@ def chat():
|
|||||||
headers={"Content-Type":"application/json","Authorization":"Bearer not-needed"}, timeout=300, stream=is_stream)
|
headers={"Content-Type":"application/json","Authorization":"Bearer not-needed"}, timeout=300, stream=is_stream)
|
||||||
lat = int((time.time()-start)*1000)
|
lat = int((time.time()-start)*1000)
|
||||||
gpu_decr(model)
|
gpu_decr(model)
|
||||||
decremented = True # Release slot
|
|
||||||
|
|
||||||
if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
|
if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
|
||||||
if is_stream:
|
if is_stream:
|
||||||
@@ -271,11 +269,12 @@ def chat():
|
|||||||
data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model)}
|
data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model)}
|
||||||
bcast()
|
bcast()
|
||||||
return jsonify(data)
|
return jsonify(data)
|
||||||
if not decremented:
|
|
||||||
try: gpu_decr(model)
|
|
||||||
except: pass
|
|
||||||
except requests.Timeout:
|
except requests.Timeout:
|
||||||
|
gpu_decr(model)
|
||||||
|
log.error("TIMEOUT: %s -> %s", agent, model)
|
||||||
return jsonify({"error":"timeout"}), 504
|
return jsonify({"error":"timeout"}), 504
|
||||||
|
except Exception as e:
|
||||||
|
gpu_decr(model)
|
||||||
log.error("Error: %s\n%s", e, traceback.format_exc())
|
log.error("Error: %s\n%s", e, traceback.format_exc())
|
||||||
return jsonify({"error":str(e)}), 500
|
return jsonify({"error":str(e)}), 500
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user