From 28d62e27ba21622260e856e1c4f7a78e4079bbb7 Mon Sep 17 00:00:00 2001 From: Abiba Date: Tue, 19 May 2026 21:13:57 +0000 Subject: [PATCH] feat: context-aware routing + compaction signals --- router/router.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/router/router.py b/router/router.py index fb2b6e1..3b58c54 100644 --- a/router/router.py +++ b/router/router.py @@ -24,6 +24,13 @@ GPU_MAX_CONCURRENT = { "qwen3.5-9b-vlm": 2, # 2 slots (12GB VRAM, 4GB headroom) } +# Context window sizes (tokens) — used for compaction signals +GPU_CONTEXT = { + "qwen3.6-35B-A3B": 131072, + "qwen3.6-27B-code": 65536, + "qwen3.5-9b-vlm": 131072, +} + TIER_MODELS = { "starter": ["qwen3.5-9b-vlm"], "professional": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "qwen3.5-9b-vlm"], @@ -183,12 +190,13 @@ def route(rd, tier): # TIER 3: Heavy reasoning — extremely large context or very long conversations if t > 50000 or turns > 25: - candidates = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"] if m in avail] + # Prefer models with larger context windows (MoE/VLM at 131K, Dense at 65K) + candidates = [m for m in ["qwen3.6-35B-A3B","qwen3.5-9b-vlm","qwen3.6-27B-code"] if m in avail] result = select_best_gpu(candidates, "heavy_reasoning") if result: return result # TIER 4: Default — MoE first, VLM helps, Dense last (slow) - if t <= 4000: + if t <= 50000: candidates = [m for m in ["qwen3.6-35B-A3B","qwen3.5-9b-vlm","qwen3.6-27B-code"] if m in avail] result = select_best_gpu(candidates, "default") if result: return result @@ -299,15 +307,23 @@ def chat(): for raw in resp.iter_content(chunk_size=None, decode_unicode=True): if raw: yield clean_unicode(raw) bcast() - return Response(stream_with_context(gen()), mimetype="text/event-stream") + ctx_remaining = GPU_CONTEXT.get(model, 65536) - estimate_tokens(rd.get("messages",[])) + r = Response(stream_with_context(gen()), mimetype="text/event-stream") + r.headers["X-Context-Remaining"] = str(max(0, ctx_remaining)) + r.headers["X-Context-Model"] = model + return r data = clean_response(resp.json()) for c in data.get("choices",[]): msg = c.get("message",{}) if not msg.get("content") and msg.get("reasoning_content"): msg["content"] = msg["reasoning_content"] - data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model)} + ctx_remaining = GPU_CONTEXT.get(model, 65536) - estimate_tokens(rd.get("messages",[])) + data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model),"context_remaining": max(0, ctx_remaining)} + resp = jsonify(data) + resp.headers["X-Context-Remaining"] = str(max(0, ctx_remaining)) + resp.headers["X-Context-Model"] = model bcast() - return jsonify(data) + return resp except requests.Timeout: gpu_decr(model) log.error("TIMEOUT: %s -> %s", agent, model)