From 0983337fdbabc0ee09a8def0398c40c671f9fcd4 Mon Sep 17 00:00:00 2001 From: Abiba Date: Tue, 19 May 2026 21:24:36 +0000 Subject: [PATCH] =?UTF-8?q?fix:=20heavy=20tier=20Dense=E2=86=92MoE?= =?UTF-8?q?=E2=86=92VLM?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- router/router.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/router/router.py b/router/router.py index 3b58c54..bfd8d7a 100644 --- a/router/router.py +++ b/router/router.py @@ -27,7 +27,7 @@ GPU_MAX_CONCURRENT = { # Context window sizes (tokens) — used for compaction signals GPU_CONTEXT = { "qwen3.6-35B-A3B": 131072, - "qwen3.6-27B-code": 65536, + "qwen3.6-27B-code": 98304, "qwen3.5-9b-vlm": 131072, } @@ -190,8 +190,8 @@ def route(rd, tier): # TIER 3: Heavy reasoning — extremely large context or very long conversations if t > 50000 or turns > 25: - # Prefer models with larger context windows (MoE/VLM at 131K, Dense at 65K) - candidates = [m for m in ["qwen3.6-35B-A3B","qwen3.5-9b-vlm","qwen3.6-27B-code"] if m in avail] + # Dense first (98K, purpose-built for reasoning), then MoE/VLM 131K + candidates = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"] if m in avail] result = select_best_gpu(candidates, "heavy_reasoning") if result: return result @@ -308,10 +308,10 @@ def chat(): if raw: yield clean_unicode(raw) bcast() ctx_remaining = GPU_CONTEXT.get(model, 65536) - estimate_tokens(rd.get("messages",[])) - r = Response(stream_with_context(gen()), mimetype="text/event-stream") - r.headers["X-Context-Remaining"] = str(max(0, ctx_remaining)) - r.headers["X-Context-Model"] = model - return r + sse_resp = Response(stream_with_context(gen()), mimetype="text/event-stream") + sse_resp.headers["X-Context-Remaining"] = str(max(0, ctx_remaining)) + sse_resp.headers["X-Context-Model"] = model + return sse_resp data = clean_response(resp.json()) for c in data.get("choices",[]): msg = c.get("message",{})