From 34fb7516e13947405f05b16719f194573f7e1334 Mon Sep 17 00:00:00 2001
From: Abiba <abiba@sysloggh.com>
Date: Sat, 30 May 2026 12:55:29 +0000
Subject: [PATCH] fix: cross-agent GPU spreading prevents hotspot hammering
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

OLD: checked only if CURRENT agent was on a GPU
  Tanko→MoE, Mumuni also→MoE (didn't see Tanko)

NEW: checks if ANY agent is on a GPU (cross-agent awareness)
  Pass 1: prefer GPUs with 0 agents
  Pass 2: prefer GPU this agent is not already on
  Pass 3: any non-busy GPU

Prevents Tanko+Mumuni from piling onto the same GPU simultaneously
even when both slots are free. Combined with MoE=1 slot, this
guarantees overflow goes to idle Dense.
---
 router/router.py | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/router/router.py b/router/router.py
index 53d7b64..b480405 100644
--- a/router/router.py
+++ b/router/router.py
@@ -161,18 +161,26 @@ def is_gpu_busy(model):
     return active >= max_c
 
 def select_best_gpu(candidates, reason, agent=""):
-    """Pick best GPU, spreading different agents across GPUs when possible."""
-    # Track which GPUs this agent is already using
-    agent_gpus = set()
-    if agent and r:
+    """Pick best GPU, spreading agents across GPUs to prevent hotspots."""
+    # Count how many distinct agents are on each GPU
+    gpu_agent_counts = {}
+    if r:
         for m in GPU_URLS:
-            if r.get("agent_gpu:" + agent + ":" + m):
-                agent_gpus.add(m)
-    # First pass: prefer GPUs NOT used by this agent
+            count = 0
+            for ak in API_KEYS.values():
+                if r.get("agent_gpu:" + ak["agent"] + ":" + m):
+                    count += 1
+            gpu_agent_counts[m] = count
+    # First pass: prefer GPUs with 0 other agents (fresh GPU for this agent)
     for m in candidates:
-        if not is_gpu_busy(m) and m not in agent_gpus:
+        if not is_gpu_busy(m) and gpu_agent_counts.get(m, 0) == 0:
             return {"model": m, "reason": reason}
-    # Second pass: any non-busy GPU (agent reuse is ok)
+    # Second pass: prefer GPU this agent is NOT already on (skip own GPU)
+    if agent:
+        for m in candidates:
+            if not is_gpu_busy(m) and not r.get("agent_gpu:" + agent + ":" + m):
+                return {"model": m, "reason": reason}
+    # Third pass: any non-busy GPU
     for m in candidates:
         if not is_gpu_busy(m):
             return {"model": m, "reason": reason}