fix: cross-agent GPU spreading prevents hotspot hammering

OLD: checked only whether the CURRENT agent was on a GPU
  Tanko→MoE, Mumuni also→MoE (didn't see Tanko)

NEW: checks if ANY agent is on a GPU (cross-agent awareness)
  Pass 1: prefer non-busy GPUs with 0 agents
  Pass 2: prefer a non-busy GPU this agent is not already on
  Pass 3: any non-busy GPU

Prevents Tanko+Mumuni from piling onto the same GPU simultaneously,
even when both slots are free. Combined with MoE=1 slot, this
guarantees overflow goes to idle Dense.
This commit is contained in:
Abiba
2026-05-30 12:55:29 +00:00
parent acbcb20837
commit 34fb7516e1
+17 -9
View File
@@ -161,18 +161,26 @@ def is_gpu_busy(model):
return active >= max_c
def select_best_gpu(candidates, reason, agent=""):
"""Pick best GPU, spreading different agents across GPUs when possible."""
# Track which GPUs this agent is already using
agent_gpus = set()
if agent and r:
"""Pick best GPU, spreading agents across GPUs to prevent hotspots."""
# Count how many distinct agents are on each GPU
gpu_agent_counts = {}
if r:
for m in GPU_URLS:
if r.get("agent_gpu:" + agent + ":" + m):
agent_gpus.add(m)
# First pass: prefer GPUs NOT used by this agent
count = 0
for ak in API_KEYS.values():
if r.get("agent_gpu:" + ak["agent"] + ":" + m):
count += 1
gpu_agent_counts[m] = count
# First pass: prefer GPUs with 0 other agents (fresh GPU for this agent)
for m in candidates:
if not is_gpu_busy(m) and m not in agent_gpus:
if not is_gpu_busy(m) and gpu_agent_counts.get(m, 0) == 0:
return {"model": m, "reason": reason}
# Second pass: any non-busy GPU (agent reuse is ok)
# Second pass: prefer GPU this agent is NOT already on (skip own GPU)
if agent:
for m in candidates:
if not is_gpu_busy(m) and not r.get("agent_gpu:" + agent + ":" + m):
return {"model": m, "reason": reason}
# Third pass: any non-busy GPU
for m in candidates:
if not is_gpu_busy(m):
return {"model": m, "reason": reason}