fix: cross-agent GPU spreading prevents hotspot hammering

OLD: checked only whether the CURRENT agent was on a GPU. Tanko→MoE, Mumuni also→MoE (didn't see Tanko). NEW: checks whether ANY agent is on a GPU (cross-agent awareness). Pass 1: prefer GPUs with 0 agents. Pass 2: prefer a GPU this agent is not already on. Pass 3: any non-busy GPU. Prevents Tanko+Mumuni from piling onto the same GPU simultaneously, even when both slots are free. Combined with MoE=1 slot, this guarantees overflow goes to idle Dense.
2026-05-30 12:55:29 +00:00
parent acbcb20837
commit 34fb7516e1
1 changed files with 17 additions and 9 deletions
@@ -161,18 +161,26 @@ def is_gpu_busy(model):
    return active >= max_c
 def select_best_gpu(candidates, reason, agent=""):
-    """Pick best GPU, spreading different agents across GPUs when possible."""
+    """Pick best GPU, spreading agents across GPUs to prevent hotspots."""
-    # Track which GPUs this agent is already using
+    # Count how many distinct agents are on each GPU
-    agent_gpus = set()
+    gpu_agent_counts = {}
-    if agent and r:
+    if r:
        for m in GPU_URLS:
-            if r.get("agent_gpu:" + agent + ":" + m):
+            count = 0
-                agent_gpus.add(m)
+            for ak in API_KEYS.values():
-    # First pass: prefer GPUs NOT used by this agent
+                if r.get("agent_gpu:" + ak["agent"] + ":" + m):
                    count += 1
            gpu_agent_counts[m] = count
    # First pass: prefer GPUs with 0 other agents (fresh GPU for this agent)
    for m in candidates:
-        if not is_gpu_busy(m) and m not in agent_gpus:
+        if not is_gpu_busy(m) and gpu_agent_counts.get(m, 0) == 0:
            return {"model": m, "reason": reason}
-    # Second pass: any non-busy GPU (agent reuse is ok)
+    # Second pass: prefer GPU this agent is NOT already on (skip own GPU)
    if agent:
        for m in candidates:
            if not is_gpu_busy(m) and not r.get("agent_gpu:" + agent + ":" + m):
                return {"model": m, "reason": reason}
    # Third pass: any non-busy GPU
    for m in candidates:
        if not is_gpu_busy(m):
            return {"model": m, "reason": reason}