feat: redesigned routing tiers for even GPU distribution + speed priority
OLD: Dense was the last choice in every tier and got only 4% of auto-routed traffic.
NEW: 5-tier routing with speed-first prioritization:
- Tier 1 (Lightweight): VLM → Dense → MoE (≤500 tok, ≤100 words)
- Tier 2 (Simple): VLM → Dense → MoE (≤4000 tok, ≤6 turns)
- Tier 3 (Medium): DENSE → MoE → VLM (≤25000 tok, ≤15 turns)
- Tier 4 (Heavy): MoE → Dense → VLM (>25000 tok or >15 turns)
- Tier 5 (Default): DENSE → MoE → VLM (balanced fallback)
Also: the quality hint now routes to MoE (better reasoning).
Bugfix: Tier 1 now checks the token count to prevent giant single-word inputs from being routed as lightweight.
This commit is contained in:
+22
-23
@@ -203,47 +203,46 @@ def route(rd, tier):
|
|||||||
if hints:
|
if hints:
|
||||||
if hints.get("priority")=="speed" and "qwen3.5-9b-vlm" in avail:
|
if hints.get("priority")=="speed" and "qwen3.5-9b-vlm" in avail:
|
||||||
return select_best_gpu(["qwen3.5-9b-vlm"], "hint_speed") or {"model":"qwen3.5-9b-vlm","reason":"hint_speed"}
|
return select_best_gpu(["qwen3.5-9b-vlm"], "hint_speed") or {"model":"qwen3.5-9b-vlm","reason":"hint_speed"}
|
||||||
if hints.get("priority")=="quality" and "qwen3.6-27B-code" in avail:
|
if hints.get("priority")=="quality" and "qwen3.6-35B-A3B" in avail:
|
||||||
return select_best_gpu(["qwen3.6-27B-code"], "hint_quality") or {"model":"qwen3.6-27B-code","reason":"hint_quality"}
|
return select_best_gpu(["qwen3.6-35B-A3B"], "hint_quality") or {"model":"qwen3.6-35B-A3B","reason":"hint_quality"}
|
||||||
|
|
||||||
first_msg = msgs[0].get("content","") if msgs else ""
|
first_msg = msgs[0].get("content","") if msgs else ""
|
||||||
words = len(first_msg.split()) if isinstance(first_msg, str) else 99
|
words = len(first_msg.split()) if isinstance(first_msg, str) else 99
|
||||||
|
|
||||||
# TIER 1: Lightweight — single-turn short queries → VLM first
|
# TIER 1: Lightweight — single-turn short queries → VLM (fastest)
|
||||||
if not sys and turns <= 1 and words <= 100 and "qwen3.5-9b-vlm" in avail:
|
if not sys and turns <= 1 and t <= 500 and words <= 100 and "qwen3.5-9b-vlm" in avail:
|
||||||
if not is_gpu_busy("qwen3.5-9b-vlm"):
|
if not is_gpu_busy("qwen3.5-9b-vlm"):
|
||||||
return {"model":"qwen3.5-9b-vlm","reason":"lightweight"}
|
return {"model":"qwen3.5-9b-vlm","reason":"lightweight"}
|
||||||
# VLM busy — fall back to Dense, then MoE
|
# VLM busy — Dense is faster for short queries than MoE
|
||||||
fallback = [m for m in ["qwen3.6-35B-A3B","qwen3.6-27B-code"] if m in avail]
|
fallback = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B"] if m in avail]
|
||||||
result = select_best_gpu(fallback, "lightweight_fallback")
|
result = select_best_gpu(fallback, "lightweight_fallback")
|
||||||
if result: return result
|
if result: return result
|
||||||
|
|
||||||
# TIER 2: Simple conversations — short context, any prompt → VLM preferred
|
# TIER 2: Simple conversations — short context, any prompt → VLM first, Dense second
|
||||||
if t <= 1000 and turns <= 4 and "qwen3.5-9b-vlm" in avail:
|
if t <= 4000 and turns <= 6 and "qwen3.5-9b-vlm" in avail:
|
||||||
if not is_gpu_busy("qwen3.5-9b-vlm"):
|
if not is_gpu_busy("qwen3.5-9b-vlm"):
|
||||||
return {"model":"qwen3.5-9b-vlm","reason":"simple_conv"}
|
return {"model":"qwen3.5-9b-vlm","reason":"simple_conv"}
|
||||||
# VLM busy — try Dense
|
# VLM busy — fall back to Dense, then MoE
|
||||||
if "qwen3.6-27B-code" in avail and not is_gpu_busy("qwen3.6-27B-code"):
|
fallback = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B"] if m in avail]
|
||||||
return {"model":"qwen3.6-27B-code","reason":"simple_conv_fallback"}
|
result = select_best_gpu(fallback, "simple_conv_fallback")
|
||||||
|
if result: return result
|
||||||
|
|
||||||
# TIER 3: Heavy reasoning — extremely large context or very long conversations
|
# TIER 3: Medium complexity — Dense primary (speed), MoE fallback
|
||||||
if t > 50000 or turns > 25:
|
if t <= 25000 and turns <= 15:
|
||||||
# MoE first (131K context handles heavy sessions), then Dense (98K reasoning), then Light (131K fallback)
|
candidates = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"] if m in avail]
|
||||||
|
result = select_best_gpu(candidates, "medium")
|
||||||
|
if result: return result
|
||||||
|
|
||||||
|
# TIER 4: Heavy reasoning — large context or very long conversations → MoE first
|
||||||
|
if t > 25000 or turns > 15:
|
||||||
candidates = [m for m in ["qwen3.6-35B-A3B","qwen3.6-27B-code","qwen3.5-9b-vlm"] if m in avail]
|
candidates = [m for m in ["qwen3.6-35B-A3B","qwen3.6-27B-code","qwen3.5-9b-vlm"] if m in avail]
|
||||||
result = select_best_gpu(candidates, "heavy_reasoning")
|
result = select_best_gpu(candidates, "heavy_reasoning")
|
||||||
if result: return result
|
if result: return result
|
||||||
|
|
||||||
# TIER 4: Default — MoE first, VLM helps, Dense last (slow)
|
# TIER 5: Default — balanced distribution: Dense first (speed), MoE second (capacity)
|
||||||
if t <= 50000:
|
candidates = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"] if m in avail]
|
||||||
candidates = [m for m in ["qwen3.6-35B-A3B","qwen3.5-9b-vlm","qwen3.6-27B-code"] if m in avail]
|
|
||||||
result = select_best_gpu(candidates, "default")
|
result = select_best_gpu(candidates, "default")
|
||||||
if result: return result
|
if result: return result
|
||||||
|
|
||||||
# Fallback — best available
|
|
||||||
if "qwen3.6-35B-A3B" in avail and not is_gpu_busy("qwen3.6-35B-A3B"):
|
|
||||||
return {"model":"qwen3.6-35B-A3B","reason":"default_moe"}
|
|
||||||
result = select_best_gpu([m for m in avail], "fallback")
|
|
||||||
if result: return result
|
|
||||||
return {"model":avail[0],"reason":"last_resort"}
|
return {"model":avail[0],"reason":"last_resort"}
|
||||||
|
|
||||||
def clean_unicode(text):
|
def clean_unicode(text):
|
||||||
|
|||||||
Reference in New Issue
Block a user