From 941e8db65e3a38785fb046fc7575742a1778d182 Mon Sep 17 00:00:00 2001 From: Abiba Date: Tue, 19 May 2026 17:01:55 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20redesigned=20routing=20tiers=20?= =?UTF-8?q?=E2=80=94=20VLM=20handles=20more=20traffic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New 4-tier routing: - TIER 1 (Lightweight): ≤100 words, single-turn → VLM first, fallback Dense, then MoE - TIER 2 (Simple Conv): ≤1000 tokens, ≤4 turns → VLM preferred, fallback Dense - TIER 3 (Heavy): >4000 tokens, system prompts, >8 turns → Dense→MoE→VLM cascade - TIER 4 (Default): Medium tasks (≤4000 tokens, ≤6 turns) → Dense preferred, MoE default, VLM overflow VLM gets more utilization for simple conversations instead of defaulting everything to MoE. --- router/router.py | 50 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/router/router.py b/router/router.py index c3674fb..238d7ff 100644 --- a/router/router.py +++ b/router/router.py @@ -160,27 +160,45 @@ def route(rd, tier): if hints.get("priority")=="quality" and "qwen3.6-27B-code" in avail: return select_best_gpu(["qwen3.6-27B-code"], "hint_quality") or {"model":"qwen3.6-27B-code","reason":"hint_quality"} - # Heavy -> dense (but fall back to MoE if dense is busy) - if t > 4000 or sys or turns > 6: - candidates = ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"] - candidates = [m for m in candidates if m in avail] + first_msg = msgs[0].get("content","") if msgs else "" + words = len(first_msg.split()) if isinstance(first_msg, str) else 99 + + # TIER 1: Lightweight — single-turn short queries → VLM first + if not sys and turns <= 1 and words <= 100 and "qwen3.5-9b-vlm" in avail: + if not is_gpu_busy("qwen3.5-9b-vlm"): + return {"model":"qwen3.5-9b-vlm","reason":"lightweight"} + # VLM busy — fall back to Dense, then MoE + fallback = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B"] if m in avail] + result = select_best_gpu(fallback, 
"lightweight_fallback") + if result: return result + + # TIER 2: Simple conversations — low token, short context → VLM preferred + if not sys and t <= 1000 and turns <= 4 and "qwen3.5-9b-vlm" in avail: + if not is_gpu_busy("qwen3.5-9b-vlm"): + return {"model":"qwen3.5-9b-vlm","reason":"simple_conv"} + # VLM busy — try Dense + if "qwen3.6-27B-code" in avail and not is_gpu_busy("qwen3.6-27B-code"): + return {"model":"qwen3.6-27B-code","reason":"simple_conv_fallback"} + + # TIER 3: Heavy reasoning — large context, system prompts, long conversations + if t > 4000 or sys or turns > 8: + candidates = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"] if m in avail] result = select_best_gpu(candidates, "heavy_reasoning") if result: return result - # Ultra-light -> VLM - first_msg = msgs[0].get("content","") if msgs else "" - words = len(first_msg.split()) if isinstance(first_msg, str) else 99 - if words <= 3 and turns <= 1 and not sys and "qwen3.5-9b-vlm" in avail: - if not is_gpu_busy("qwen3.5-9b-vlm"): - return {"model":"qwen3.5-9b-vlm","reason":"ultra_light"} + # TIER 4: Default — Dense preferred for medium tasks, MoE as workhorse, VLM as overflow + if turns <= 6 and t <= 4000: + # Medium complexity — try Dense first, then MoE, then VLM + candidates = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"] if m in avail] + result = select_best_gpu(candidates, "medium_task") + if result: return result - # Default: MoE, fall back to dense if MoE is busy - if "qwen3.6-35B-A3B" in avail: - if is_gpu_busy("qwen3.6-35B-A3B") and "qwen3.6-27B-code" in avail: - return {"model": "qwen3.6-27B-code", "reason": "load_balanced_default"} + # Fallback — best available + if "qwen3.6-35B-A3B" in avail and not is_gpu_busy("qwen3.6-35B-A3B"): return {"model":"qwen3.6-35B-A3B","reason":"default_moe"} - - return {"model":avail[0],"reason":"fallback"} + result = select_best_gpu([m for m in avail], "fallback") + if result: return result + return 
{"model":avail[0],"reason":"last_resort"} def clean_unicode(text): if not isinstance(text, str): return text