feat: redesigned routing tiers — VLM handles more traffic
New 4-tier routing:
- TIER 1 (Lightweight): ≤100 words, single-turn, no system prompt → VLM first; if the VLM is busy, fall back to Dense, then MoE
- TIER 2 (Simple Conv): ≤1000 tokens, ≤4 turns, no system prompt → VLM preferred; if the VLM is busy, fall back to Dense
- TIER 3 (Heavy): >4000 tokens, system prompts, or >8 turns → Dense→MoE→VLM cascade
- TIER 4 (Default): medium tasks → Dense preferred, MoE as the default workhorse, VLM as overflow

The VLM now gets more utilization for simple conversations, instead of everything defaulting to MoE.
This commit is contained in:
+34
-16
@@ -160,27 +160,45 @@ def route(rd, tier):
|
|||||||
if hints.get("priority")=="quality" and "qwen3.6-27B-code" in avail:
|
if hints.get("priority")=="quality" and "qwen3.6-27B-code" in avail:
|
||||||
return select_best_gpu(["qwen3.6-27B-code"], "hint_quality") or {"model":"qwen3.6-27B-code","reason":"hint_quality"}
|
return select_best_gpu(["qwen3.6-27B-code"], "hint_quality") or {"model":"qwen3.6-27B-code","reason":"hint_quality"}
|
||||||
|
|
||||||
# Heavy -> dense (but fall back to MoE if dense is busy)
|
first_msg = msgs[0].get("content","") if msgs else ""
|
||||||
if t > 4000 or sys or turns > 6:
|
words = len(first_msg.split()) if isinstance(first_msg, str) else 99
|
||||||
candidates = ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"]
|
|
||||||
candidates = [m for m in candidates if m in avail]
|
# TIER 1: Lightweight — single-turn short queries → VLM first
|
||||||
|
if not sys and turns <= 1 and words <= 100 and "qwen3.5-9b-vlm" in avail:
|
||||||
|
if not is_gpu_busy("qwen3.5-9b-vlm"):
|
||||||
|
return {"model":"qwen3.5-9b-vlm","reason":"lightweight"}
|
||||||
|
# VLM busy — fall back to Dense, then MoE
|
||||||
|
fallback = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B"] if m in avail]
|
||||||
|
result = select_best_gpu(fallback, "lightweight_fallback")
|
||||||
|
if result: return result
|
||||||
|
|
||||||
|
# TIER 2: Simple conversations — low token, short context → VLM preferred
|
||||||
|
if not sys and t <= 1000 and turns <= 4 and "qwen3.5-9b-vlm" in avail:
|
||||||
|
if not is_gpu_busy("qwen3.5-9b-vlm"):
|
||||||
|
return {"model":"qwen3.5-9b-vlm","reason":"simple_conv"}
|
||||||
|
# VLM busy — try Dense
|
||||||
|
if "qwen3.6-27B-code" in avail and not is_gpu_busy("qwen3.6-27B-code"):
|
||||||
|
return {"model":"qwen3.6-27B-code","reason":"simple_conv_fallback"}
|
||||||
|
|
||||||
|
# TIER 3: Heavy reasoning — large context, system prompts, long conversations
|
||||||
|
if t > 4000 or sys or turns > 8:
|
||||||
|
candidates = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"] if m in avail]
|
||||||
result = select_best_gpu(candidates, "heavy_reasoning")
|
result = select_best_gpu(candidates, "heavy_reasoning")
|
||||||
if result: return result
|
if result: return result
|
||||||
|
|
||||||
# Ultra-light -> VLM
|
# TIER 4: Default — Dense preferred for medium tasks, MoE as workhorse, VLM as overflow
|
||||||
first_msg = msgs[0].get("content","") if msgs else ""
|
if turns <= 6 and t <= 4000:
|
||||||
words = len(first_msg.split()) if isinstance(first_msg, str) else 99
|
# Medium complexity — try Dense first, then MoE, then VLM
|
||||||
if words <= 3 and turns <= 1 and not sys and "qwen3.5-9b-vlm" in avail:
|
candidates = [m for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B","qwen3.5-9b-vlm"] if m in avail]
|
||||||
if not is_gpu_busy("qwen3.5-9b-vlm"):
|
result = select_best_gpu(candidates, "medium_task")
|
||||||
return {"model":"qwen3.5-9b-vlm","reason":"ultra_light"}
|
if result: return result
|
||||||
|
|
||||||
# Default: MoE, fall back to dense if MoE is busy
|
# Fallback — best available
|
||||||
if "qwen3.6-35B-A3B" in avail:
|
if "qwen3.6-35B-A3B" in avail and not is_gpu_busy("qwen3.6-35B-A3B"):
|
||||||
if is_gpu_busy("qwen3.6-35B-A3B") and "qwen3.6-27B-code" in avail:
|
|
||||||
return {"model": "qwen3.6-27B-code", "reason": "load_balanced_default"}
|
|
||||||
return {"model":"qwen3.6-35B-A3B","reason":"default_moe"}
|
return {"model":"qwen3.6-35B-A3B","reason":"default_moe"}
|
||||||
|
result = select_best_gpu([m for m in avail], "fallback")
|
||||||
return {"model":avail[0],"reason":"fallback"}
|
if result: return result
|
||||||
|
return {"model":avail[0],"reason":"last_resort"}
|
||||||
|
|
||||||
def clean_unicode(text):
|
def clean_unicode(text):
|
||||||
if not isinstance(text, str): return text
|
if not isinstance(text, str): return text
|
||||||
|
|||||||
Reference in New Issue
Block a user