fix: reduce MoE concurrency 2→1 to prevent thermal timeout (94°C)

Strix Halo running qwen3.6-35B-A3B was hitting 94°C with 2 concurrent
slots, causing 300s request timeouts. Mumuni + Koby accumulated 15
timeouts in the last hour. Reduced to 1 slot for thermal headroom.

Medium and Default tiers now route VLM before MoE as fallback (candidate
order changed in this commit), minimizing overflow traffic to the hot GPU.
This commit is contained in:
Abiba
2026-05-26 23:47:08 +00:00
parent b3db0841ef
commit ebe8f9ced4
+3 -3
View File
@@ -19,7 +19,7 @@ GPU_URLS = {
} }
# Max concurrent requests per GPU (based on llama.cpp --parallel) # Max concurrent requests per GPU (based on llama.cpp --parallel)
GPU_MAX_CONCURRENT = { GPU_MAX_CONCURRENT = {
"qwen3.6-35B-A3B": 2, # 2 slots "qwen3.6-35B-A3B": 1, # 1 slot (thermal management: 94C at 2 concurrent)
"qwen3.6-27B-code": 1, # 1 slot (24GB VRAM saturated at 256K ctx) "qwen3.6-27B-code": 1, # 1 slot (24GB VRAM saturated at 256K ctx)
"qwen3.5-9b-vlm": 2, # 2 slots (12GB VRAM, 4GB headroom) "qwen3.5-9b-vlm": 2, # 2 slots (12GB VRAM, 4GB headroom)
} }
@@ -229,7 +229,7 @@ def route(rd, tier):
# TIER 3: Medium complexity — Dense primary (speed), MoE fallback # TIER 3: Medium complexity — Dense primary (speed), MoE fallback
if t <= 25000 and turns <= 15: if t <= 25000 and turns <= 15:
candidates = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"] if m in avail] candidates = [m for m in ["qwen3.6-27B-code","qwen3.5-9b-vlm","qwen3.6-35B-A3B"] if m in avail]
result = select_best_gpu(candidates, "medium") result = select_best_gpu(candidates, "medium")
if result: return result if result: return result
@@ -240,7 +240,7 @@ def route(rd, tier):
if result: return result if result: return result
# TIER 5: Default — balanced distribution: Dense first (speed), MoE second (capacity) # TIER 5: Default — balanced distribution: Dense first (speed), MoE second (capacity)
candidates = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"] if m in avail] candidates = [m for m in ["qwen3.6-27B-code","qwen3.5-9b-vlm","qwen3.6-35B-A3B"] if m in avail]
result = select_best_gpu(candidates, "default") result = select_best_gpu(candidates, "default")
if result: return result if result: return result
return {"model":avail[0],"reason":"last_resort"} return {"model":avail[0],"reason":"last_resort"}