May 19, 2026: Full harness update
- Model migration: gemma-4-E4B → qwen3.5-9b-vlm
- Dashboard reorder: Usage Over Time + GPU Metrics to top
- Router counter leak fix (gpu_decr in except handler)
- VLM slot upgrade 1→2
- Redis stale key cleanup
- Automated maintenance cron job
- LiteLLM config update
- GPU router config update
- README update
This commit is contained in:
@@ -13,7 +13,7 @@ upstream llmgpu_pool {
|
||||
}
|
||||
|
||||
upstream ocu_llm_pool {
|
||||
## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks
|
||||
## RTX 5070 — qwen3.5-9b-vlm (VLM) — Vision + light tasks
|
||||
server 192.168.68.110:8080;
|
||||
}
|
||||
|
||||
@@ -36,7 +36,7 @@ map $http_x_syslog_model $gpu_upstream {
|
||||
"heavy" llmgpu_pool;
|
||||
"qwen3.5-27B" llmgpu_pool;
|
||||
"light" ocu_llm_pool;
|
||||
"gemma-4" ocu_llm_pool;
|
||||
"qwen3.5-9b-vlm" ocu_llm_pool;
|
||||
}
|
||||
|
||||
## Rate limit zone — 10 req/s per IP, burst of 20
|
||||
|
||||
Reference in New Issue
Block a user