revert: MoE concurrency back to 2 (Dense-first routing handles thermal load)

2026-05-27 00:04:42 +00:00
parent c4ea5e3a98
commit 93d0d3cc4b
1 changed file with 1 addition and 1 deletion
@@ -19,7 +19,7 @@ GPU_URLS = {
 }
 # Max concurrent requests per GPU (based on llama.cpp --parallel)
 GPU_MAX_CONCURRENT = {
-    "qwen3.6-35B-A3B": 1,   # 1 slot (thermal management: 94C at 2 concurrent)
+    "qwen3.6-35B-A3B": 2,   # 2 slots (Dense-first routing reduces thermal load)
    "qwen3.6-27B-code": 1,  # 1 slot (24GB VRAM saturated at 256K ctx)
    "qwen3.5-9b-vlm": 2,       # 2 slots (12GB VRAM, 4GB headroom)
 }