May 19, 2026: Full harness update

- Model migration: gemma-4-E4B → qwen3.5-9b-vlm
- Dashboard reorder: Usage Over Time + GPU Metrics to top
- Router counter leak fix (gpu_decr in except handler)
- VLM slot upgrade 1→2
- Redis stale key cleanup
- Automated maintenance cron job
- LiteLLM config update
- GPU router config update
- README update
2026-05-19 15:03:34 +00:00
parent 4f032b035c
commit 9c31b5d622
7 changed files with 43 additions and 46 deletions
@@ -13,7 +13,7 @@ upstream llmgpu_pool {
 }

 upstream ocu_llm_pool {
-    ## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks
+    ## RTX 5070 — qwen3.5-9b-vlm (VLM) — Vision + light tasks
    server 192.168.68.110:8080;
 }

@@ -36,7 +36,7 @@ map $http_x_syslog_model $gpu_upstream {
    "heavy"          llmgpu_pool;
    "qwen3.5-27B"    llmgpu_pool;
    "light"          ocu_llm_pool;
-    "gemma-4"        ocu_llm_pool;
+    "qwen3.5-9b-vlm"        ocu_llm_pool;
 }

 ## Rate limit zone — 10 req/s per IP, burst of 20