fix: raise heavy threshold — 4000→12000 tokens, 8→15 turns

Agent conversations with system prompts easily exceed 4000 tokens,
which forced everything to Dense. With the new thresholds (12000 tokens,
15 turns), only truly heavy work triggers Dense.
Most agent conversations will route to MoE (the default) instead.
This commit is contained in:
Abiba
2026-05-19 20:09:59 +00:00
parent b67021ac69
commit 3cbf38e3e2
+2 -2
View File
@@ -181,8 +181,8 @@ def route(rd, tier):
if "qwen3.6-27B-code" in avail and not is_gpu_busy("qwen3.6-27B-code"): if "qwen3.6-27B-code" in avail and not is_gpu_busy("qwen3.6-27B-code"):
return {"model":"qwen3.6-27B-code","reason":"simple_conv_fallback"} return {"model":"qwen3.6-27B-code","reason":"simple_conv_fallback"}
# TIER 3: Heavy reasoning — large context or very long conversations # TIER 3: Heavy reasoning — very large context or very long conversations
if t > 4000 or turns > 8: if t > 12000 or turns > 15:
candidates = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"] if m in avail] candidates = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"] if m in avail]
result = select_best_gpu(candidates, "heavy_reasoning") result = select_best_gpu(candidates, "heavy_reasoning")
if result: return result if result: return result