From acbcb208372ccee8cf9d84b23de71633a2ed413d Mon Sep 17 00:00:00 2001
From: Abiba <abiba@sysloggh.com>
Date: Sat, 30 May 2026 12:52:23 +0000
Subject: [PATCH] =?UTF-8?q?fix:=20MoE=20concurrency=202=E2=86=921=20(95C?=
 =?UTF-8?q?=20thermal=20emergency)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The MoE GPU (qwen3.6-35B-A3B) is at 95C with a p50 latency of 13s —
thermal throttling is causing a death spiral. Both slots are stuck
processing, with a p95 latency of 113s. Dense is idle at 38C with 2
free slots. Reducing MoE to 1 slot forces heavy overflow to Dense,
giving MoE thermal headroom.

The heavy-tier fallback order (MoE → Dense → VLM) is still valid: the
first heavy request goes to MoE and the second overflows to Dense.
---
 router/router.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/router/router.py b/router/router.py
index 7fe2f31..53d7b64 100644
--- a/router/router.py
+++ b/router/router.py
@@ -19,7 +19,7 @@ GPU_URLS = {
 }
 # Max concurrent requests per GPU (based on llama.cpp --parallel)
 GPU_MAX_CONCURRENT = {
-    "qwen3.6-35B-A3B": 2,   # 2 slots (Dense-first routing reduces thermal load)
+    "qwen3.6-35B-A3B": 1,   # 1 slot (95C thermal emergency)
     "qwen3.6-27B-code": 2,  # 2 slots (128K context frees VRAM)
     "qwen3.5-9b-vlm": 2,       # 2 slots (12GB VRAM, 4GB headroom)
 }