From 34fb7516e13947405f05b16719f194573f7e1334 Mon Sep 17 00:00:00 2001 From: Abiba Date: Sat, 30 May 2026 12:55:29 +0000 Subject: [PATCH] fix: cross-agent GPU spreading prevents hotspot hammering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OLD: checked only if CURRENT agent was on a GPU Tanko→MoE, Mumuni also→MoE (didn't see Tanko) NEW: checks if ANY agent is on a GPU (cross-agent awareness) Pass 1: prefer GPUs with 0 agents Pass 2: prefer GPU this agent is not already on Pass 3: any non-busy GPU Prevents Tanko+Mumuni piling onto same GPU simultaneously even when both slots are free. Combined with MoE=1 slot, guarantees overflow goes to idle Dense. --- router/router.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/router/router.py b/router/router.py index 53d7b64..b480405 100644 --- a/router/router.py +++ b/router/router.py @@ -161,18 +161,26 @@ def is_gpu_busy(model): return active >= max_c def select_best_gpu(candidates, reason, agent=""): - """Pick best GPU, spreading different agents across GPUs when possible.""" - # Track which GPUs this agent is already using - agent_gpus = set() - if agent and r: + """Pick best GPU, spreading agents across GPUs to prevent hotspots.""" + # Count how many distinct agents are on each GPU + gpu_agent_counts = {} + if r: for m in GPU_URLS: - if r.get("agent_gpu:" + agent + ":" + m): - agent_gpus.add(m) - # First pass: prefer GPUs NOT used by this agent + count = 0 + for ak in API_KEYS.values(): + if r.get("agent_gpu:" + ak["agent"] + ":" + m): + count += 1 + gpu_agent_counts[m] = count + # First pass: prefer GPUs with 0 other agents (fresh GPU for this agent) for m in candidates: - if not is_gpu_busy(m) and m not in agent_gpus: + if not is_gpu_busy(m) and gpu_agent_counts.get(m, 0) == 0: return {"model": m, "reason": reason} - # Second pass: any non-busy GPU (agent reuse is ok) + # Second pass: prefer GPU this agent is NOT already on 
(skip own GPU) + if agent: + for m in candidates: + if not is_gpu_busy(m) and not r.get("agent_gpu:" + agent + ":" + m): + return {"model": m, "reason": reason} + # Third pass: any non-busy GPU for m in candidates: if not is_gpu_busy(m): return {"model": m, "reason": reason}