From b3db0841ef4d836625fe74a31a6af7d11e1a87df Mon Sep 17 00:00:00 2001
From: Abiba <abiba@sysloggh.com>
Date: Tue, 26 May 2026 22:00:20 +0000
Subject: [PATCH] feat: redesigned routing tiers for even GPU distribution +
 speed priority
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

OLD: Dense was the last choice in the lightweight-fallback and default tiers, and got only 4% of auto-routed traffic
NEW: 5-tier routing with speed-first prioritization

Tier 1 (Lightweight): VLM → Dense → MoE    (≤500 tok, ≤100 words, ≤1 turn, no system prompt)
Tier 2 (Simple):      VLM → Dense → MoE    (≤4000 tok, ≤6 turns)
Tier 3 (Medium):      DENSE → MoE → VLM    (≤25000 tok, ≤15 turns)
Tier 4 (Heavy):       MoE → Dense → VLM    (>25000 tok or >15 turns)
Tier 5 (Default):     DENSE → MoE → VLM    (balanced fallback; reached only when an earlier tier's GPU selection returns nothing)

Also: quality hint now routes to MoE instead of Dense (better reasoning)
Bugfix: Tier 1 now checks token count to prevent giant single-word
inputs from being routed as lightweight
---
 router/router.py | 47 +++++++++++++++++++++++------------------------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/router/router.py b/router/router.py
index 169b912..55db6fe 100644
--- a/router/router.py
+++ b/router/router.py
@@ -203,46 +203,45 @@ def route(rd, tier):
     if hints:
         if hints.get("priority")=="speed" and "qwen3.5-9b-vlm" in avail:
             return select_best_gpu(["qwen3.5-9b-vlm"], "hint_speed") or {"model":"qwen3.5-9b-vlm","reason":"hint_speed"}
-        if hints.get("priority")=="quality" and "qwen3.6-27B-code" in avail:
-            return select_best_gpu(["qwen3.6-27B-code"], "hint_quality") or {"model":"qwen3.6-27B-code","reason":"hint_quality"}
+        if hints.get("priority")=="quality" and "qwen3.6-35B-A3B" in avail:
+            return select_best_gpu(["qwen3.6-35B-A3B"], "hint_quality") or {"model":"qwen3.6-35B-A3B","reason":"hint_quality"}
     
     first_msg = msgs[0].get("content","") if msgs else ""
     words = len(first_msg.split()) if isinstance(first_msg, str) else 99
     
-    # TIER 1: Lightweight — single-turn short queries → VLM first
-    if not sys and turns <= 1 and words <= 100 and "qwen3.5-9b-vlm" in avail:
+    # TIER 1: Lightweight — single-turn short queries → VLM (fastest)
+    if not sys and turns <= 1 and t <= 500 and words <= 100 and "qwen3.5-9b-vlm" in avail:
         if not is_gpu_busy("qwen3.5-9b-vlm"):
             return {"model":"qwen3.5-9b-vlm","reason":"lightweight"}
-        # VLM busy — fall back to Dense, then MoE
-        fallback = [m for m in ["qwen3.6-35B-A3B","qwen3.6-27B-code"] if m in avail]
+        # VLM busy — Dense is faster for short queries than MoE
+        fallback = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B"] if m in avail]
         result = select_best_gpu(fallback, "lightweight_fallback")
         if result: return result
     
-    # TIER 2: Simple conversations — short context, any prompt → VLM preferred
-    if t <= 1000 and turns <= 4 and "qwen3.5-9b-vlm" in avail:
+    # TIER 2: Simple conversations — short context, any prompt → VLM first, Dense second
+    if t <= 4000 and turns <= 6 and "qwen3.5-9b-vlm" in avail:
         if not is_gpu_busy("qwen3.5-9b-vlm"):
             return {"model":"qwen3.5-9b-vlm","reason":"simple_conv"}
-        # VLM busy — try Dense
-        if "qwen3.6-27B-code" in avail and not is_gpu_busy("qwen3.6-27B-code"):
-            return {"model":"qwen3.6-27B-code","reason":"simple_conv_fallback"}
+        # VLM busy — fall back to Dense, then MoE
+        fallback = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B"] if m in avail]
+        result = select_best_gpu(fallback, "simple_conv_fallback")
+        if result: return result
     
-    # TIER 3: Heavy reasoning — extremely large context or very long conversations
-    if t > 50000 or turns > 25:
-        # MoE first (131K context handles heavy sessions), then Dense (98K reasoning), then Light (131K fallback)
+    # TIER 3: Medium complexity — Dense primary (speed), MoE fallback
+    if t <= 25000 and turns <= 15:
+        candidates = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"] if m in avail]
+        result = select_best_gpu(candidates, "medium")
+        if result: return result
+    
+    # TIER 4: Heavy reasoning — large context or very long conversations → MoE first
+    if t > 25000 or turns > 15:
         candidates = [m for m in ["qwen3.6-35B-A3B","qwen3.6-27B-code","qwen3.5-9b-vlm"] if m in avail]
         result = select_best_gpu(candidates, "heavy_reasoning")
         if result: return result
     
-    # TIER 4: Default — MoE first, VLM helps, Dense last (slow)
-    if t <= 50000:
-        candidates = [m for m in ["qwen3.6-35B-A3B","qwen3.5-9b-vlm","qwen3.6-27B-code"] if m in avail]
-        result = select_best_gpu(candidates, "default")
-        if result: return result
-    
-    # Fallback — best available
-    if "qwen3.6-35B-A3B" in avail and not is_gpu_busy("qwen3.6-35B-A3B"):
-        return {"model":"qwen3.6-35B-A3B","reason":"default_moe"}
-    result = select_best_gpu([m for m in avail], "fallback")
+    # TIER 5: Default — balanced distribution: Dense first (speed), MoE second (capacity)
+    candidates = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"] if m in avail]
+    result = select_best_gpu(candidates, "default")
     if result: return result
     return {"model":avail[0],"reason":"last_resort"}