Compare commits
29 Commits
3d42ea4767
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 5116e4b1a7 | |||
| e55bcef21a | |||
| 32bd817e97 | |||
| 79965450bb | |||
| 6c829abef5 | |||
| 6efd5ff51c | |||
| 350a90b524 | |||
| 3156c093d5 | |||
| 3cbf38e3e2 | |||
| b67021ac69 | |||
| 46dda918de | |||
| 7a78c0f98d | |||
| 15c474aea0 | |||
| bfc38f5436 | |||
| f519a3fa60 | |||
| 941e8db65e | |||
| 241de4f38c | |||
| beb2d1790a | |||
| f2f8e8c921 | |||
| 76ade81fda | |||
| 9c31b5d622 | |||
| 4f032b035c | |||
| 8f3b0c6647 | |||
| 808c9d3d13 | |||
| 9817fe2ef2 | |||
| 654cdff718 | |||
| bf90e57c5f | |||
| 2db2796e53 | |||
| ec0f9fac63 |
+1
-3
@@ -1,5 +1,3 @@
|
||||
.git
|
||||
__pycache__/
|
||||
*.pyc
|
||||
.env
|
||||
redis-data/
|
||||
ssl/
|
||||
|
||||
@@ -6,9 +6,10 @@ CT 116 Docker stack for routing local GPU models through a unified OpenAI-compat
|
||||
|
||||
```
|
||||
nginx :80 → router :9000 → GPU backends
|
||||
├─ qwen3.6-35B-A3B (MoE) @ 192.168.68.15:8080
|
||||
├─ qwen3.6-27B-code (Dense) @ 192.168.68.8:8080
|
||||
└─ gemma-4-E4B (Light) @ 192.168.68.110:8080
|
||||
├─ qwen3.6-35B-A3B (MoE) @ 192.168.68.15:8080 [2 slots]
|
||||
├─ qwen3.6-27B-code (Dense) @ 192.168.68.8:8080 [2 slots]
|
||||
└─ qwen3.5-9b-vlm (VLM) @ 192.168.68.110:8080 [2 slots]
|
||||
Total: 6 concurrent slots
|
||||
|
||||
LiteLLM :8081 (fallback) | Dashboard :3000 | Redis :6379 (local)
|
||||
```
|
||||
@@ -24,10 +25,14 @@ docker compose up -d
|
||||
|
||||
| URL | Purpose |
|
||||
|-----|---------|
|
||||
| `/v1/chat/completions` | Inference API (OpenAI-compatible) |
|
||||
| `/v1/chat/completions` | Inference API (OpenAI-compatible) — **API key required** |
|
||||
| `/v1/models` | Available models |
|
||||
| `/` | Dashboard (GPU health, routing, agents, timeseries) |
|
||||
|
||||
## Authentication
|
||||
|
||||
**All `/v1/chat/completions` requests require a valid API key** via `Authorization: Bearer <key>`. Missing or invalid keys return **401 Unauthorized**.
|
||||
|
||||
## Agent API Keys
|
||||
|
||||
| Agent | Key |
|
||||
@@ -37,3 +42,34 @@ docker compose up -d
|
||||
| Tanko | `sk-syslog-tanko` |
|
||||
| Koby | `sk-syslog-koby` |
|
||||
| Kagenz0 | `sk-syslog-kagenz0` |
|
||||
| Koonimo | `sk-syslog-koonimo` |
|
||||
|
||||
## Routing Tiers
|
||||
|
||||
| Tier | Trigger | Priority |
|
||||
|------|---------|----------|
|
||||
| Lightweight | No system prompt, ≤1 turn, ≤100 words | VLM → MoE → Dense |
|
||||
| Simple Conv | ≤1000 tokens, ≤4 turns | VLM → MoE → Dense |
|
||||
| Heavy | >4000 tokens OR >8 turns | Dense → MoE → VLM |
|
||||
| Default | Everything else | MoE → VLM → Dense |
|
||||
|
||||
## Queue
|
||||
|
||||
When all GPUs are saturated, requests enter a polling queue (500ms intervals) instead of returning 503 immediately. Timeout: 30s (configurable via `QUEUE_TIMEOUT` env or `X-Queue-Timeout` header).
|
||||
|
||||
## Models
|
||||
|
||||
| GPU | Model | VRAM | Slots |
|
||||
|-----|-------|------|-------|
|
||||
| Strix Halo | qwen3.6-35B-A3B (MoE) | 65GB | 2 |
|
||||
| RTX 3090 | qwen3.6-27B-code (Dense) | 24GB | 2 |
|
||||
| RTX 5070 | qwen3.5-9b-vlm (VLM) | 12GB | 2 |
|
||||
|
||||
## Maintenance
|
||||
|
||||
Automated cron job runs daily at 3:00 AM UTC (`/opt/inference-harness/maintenance.sh`):
|
||||
- Cleans Redis timeseries keys >60 days
|
||||
- Prunes Docker build cache >7 days
|
||||
- Logs container health and Redis memory
|
||||
|
||||
Logs: `/var/log/harness-maintenance.log`
|
||||
|
||||
+160
-218
@@ -1,13 +1,11 @@
|
||||
"""Harness Dashboard."""
|
||||
"""SyslogAI Harness Dashboard — Modern Design."""
|
||||
import os, json, time, queue, threading
|
||||
import requests
|
||||
from flask import Flask, request, render_template_string, Response, stream_with_context
|
||||
|
||||
ROUTER_METRICS = os.environ.get("ROUTER_METRICS_URL", "http://router:9000/metrics")
|
||||
|
||||
app = Flask(__name__)
|
||||
sse_subscribers = []
|
||||
sse_lock = threading.Lock()
|
||||
sse_subscribers = []; sse_lock = threading.Lock()
|
||||
|
||||
def fetch_state():
|
||||
try:
|
||||
@@ -19,239 +17,188 @@ def fetch_state():
|
||||
def broadcast_loop():
|
||||
while True:
|
||||
time.sleep(3)
|
||||
data = fetch_state()
|
||||
payload = json.dumps(data)
|
||||
data = fetch_state(); payload = json.dumps(data)
|
||||
with sse_lock:
|
||||
dead = []
|
||||
for q in sse_subscribers:
|
||||
try: q.put(payload)
|
||||
except Exception: dead.append(q)
|
||||
dead = [q for q in sse_subscribers if not q.put(payload)]
|
||||
for q in dead: sse_subscribers.remove(q)
|
||||
|
||||
threading.Thread(target=broadcast_loop, daemon=True).start()
|
||||
|
||||
DASHBOARD_HTML = r"""<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<html lang="en" data-bs-theme="dark">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Inference Harness - Syslog Solution LLC</title>
|
||||
<meta charset="UTF-8"><meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>SyslogAI Harness</title>
|
||||
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet">
|
||||
<style>
|
||||
:root {
|
||||
--bg: #0a0e14; --card: #131820; --border: #1e2a3a; --text: #c9d1d9;
|
||||
--dim: #5c6670; --accent: #39bae6; --green: #7fd962; --yellow: #ffb454;
|
||||
--red: #f26d78; --blue: #59c2ff; --purple: #d2a6ff;
|
||||
}
|
||||
* { margin:0; padding:0; box-sizing:border-box; }
|
||||
body {
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'SF Pro Display', 'Segoe UI', system-ui, sans-serif;
|
||||
background: var(--bg); color: var(--text); min-height: 100vh;
|
||||
padding: clamp(12px, 3vw, 32px);
|
||||
}
|
||||
.header {
|
||||
display: flex; align-items: center; justify-content: space-between;
|
||||
flex-wrap: wrap; gap: 12px; margin-bottom: 24px;
|
||||
}
|
||||
.header h1 { font-size: clamp(18px, 4vw, 26px); font-weight: 700; color: #fff; }
|
||||
.header h1 span { color: var(--accent); }
|
||||
.status-bar { display: flex; gap: 16px; align-items: center; flex-wrap: wrap; font-size: 13px; color: var(--dim); }
|
||||
.status-dot { width: 8px; height: 8px; border-radius: 50%; display: inline-block; }
|
||||
.status-dot.live { background: var(--green); animation: pulse 2s infinite; }
|
||||
@keyframes pulse { 0%,100%{opacity:1} 50%{opacity:0.3} }
|
||||
.grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(min(100%, 340px), 1fr)); gap: 16px; }
|
||||
.card {
|
||||
background: var(--card); border: 1px solid var(--border);
|
||||
border-radius: 12px; padding: clamp(12px, 3vw, 20px);
|
||||
}
|
||||
.card-title {
|
||||
font-size: 13px; font-weight: 600; text-transform: uppercase;
|
||||
letter-spacing: 0.5px; color: var(--dim); margin-bottom: 14px;
|
||||
}
|
||||
.gpu-row {
|
||||
display: flex; align-items: center; gap: 14px; padding: 10px 0;
|
||||
border-bottom: 1px solid rgba(255,255,255,0.04);
|
||||
}
|
||||
.gpu-row:last-child { border-bottom: none; }
|
||||
.gpu-icon {
|
||||
width: 40px; height: 40px; border-radius: 10px; display: flex;
|
||||
align-items: center; justify-content: center; font-size: 18px; flex-shrink: 0;
|
||||
}
|
||||
.gpu-icon.green { background: rgba(127,217,98,0.12); color: var(--green); }
|
||||
.gpu-icon.yellow { background: rgba(255,180,84,0.12); color: var(--yellow); }
|
||||
.gpu-icon.red { background: rgba(242,109,120,0.12); color: var(--red); }
|
||||
.gpu-info { flex:1; min-width: 0; }
|
||||
.gpu-name { font-size: 14px; font-weight: 600; color: #e6edf3; }
|
||||
.gpu-metrics { display: flex; gap: 20px; flex-wrap: wrap; margin-top: 6px; }
|
||||
.gpu-metric { font-size: 12px; }
|
||||
.gpu-metric .label { color: var(--dim); }
|
||||
.gpu-metric .value { color: #e6edf3; font-weight: 500; font-variant-numeric: tabular-nums; }
|
||||
.vram-bar { width: 100%; height: 4px; background: rgba(255,255,255,0.06); border-radius: 2px; margin-top: 6px; overflow: hidden; }
|
||||
.vram-fill { height: 100%; border-radius: 2px; transition: width 0.6s ease; }
|
||||
.vram-fill.green { background: var(--green); }
|
||||
.vram-fill.yellow { background: var(--yellow); }
|
||||
.vram-fill.red { background: var(--red); }
|
||||
.bar-row { margin-bottom: 10px; }
|
||||
.bar-label { display: flex; justify-content: space-between; font-size: 12px; margin-bottom: 4px; }
|
||||
.bar-label .name { color: #e6edf3; }
|
||||
.bar-label .count { color: var(--dim); font-variant-numeric: tabular-nums; }
|
||||
.bar-track { height: 6px; background: rgba(255,255,255,0.06); border-radius: 3px; overflow: hidden; }
|
||||
body { background: #0b0f17; color: #bcc3cd; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', system-ui, sans-serif; padding: 20px 24px; }
|
||||
.card { background: #111827; border: 1px solid #1e293b; border-radius: 10px; height: 100%; }
|
||||
.stat-card { background: #111827; border: 1px solid #1e293b; border-radius: 10px; padding: 18px 20px; text-align: center; }
|
||||
.stat-value { font-size: 28px; font-weight: 700; line-height: 1.1; }
|
||||
.stat-label { font-size: 11px; text-transform: uppercase; letter-spacing: 0.6px; color: #64748b; margin-top: 4px; }
|
||||
.gpu-card { background: #111827; border: 1px solid #1e293b; border-radius: 10px; padding: 16px 18px; height: 100%; }
|
||||
.gpu-card .title { font-size: 13px; font-weight: 600; color: #e2e8f0; margin-bottom: 12px; display: flex; align-items: center; gap: 8px; }
|
||||
.gpu-card .status-dot { width: 8px; height: 8px; border-radius: 50%; flex-shrink: 0; }
|
||||
.gpu-card .row-metric { display: flex; justify-content: space-between; font-size: 12px; padding: 2px 0; }
|
||||
.gpu-card .row-metric .lbl { color: #64748b; }
|
||||
.gpu-card .row-metric .val { color: #e2e8f0; font-variant-numeric: tabular-nums; }
|
||||
.gpu-card .slot-bar { display: flex; gap: 3px; margin-top: 8px; }
|
||||
.gpu-card .slot-bar .s { flex: 1; height: 5px; border-radius: 2px; background: #1e293b; }
|
||||
.gpu-card .slot-bar .s.active { background: #38bdf8; }
|
||||
.chart-card { background: #111827; border: 1px solid #1e293b; border-radius: 10px; padding: 16px 18px; height: 100%; display: flex; flex-direction: column; }
|
||||
.chart-card .title { font-size: 13px; font-weight: 600; color: #e2e8f0; margin-bottom: 12px; }
|
||||
.bar-row { margin-bottom: 8px; }
|
||||
.bar-label { display: flex; justify-content: space-between; font-size: 11px; margin-bottom: 3px; color: #64748b; }
|
||||
.bar-label .name { color: #cbd5e1; }
|
||||
.bar-track { height: 5px; background: #1e293b; border-radius: 3px; overflow: hidden; }
|
||||
.bar-fill { height: 100%; border-radius: 3px; transition: width 0.6s ease; }
|
||||
.route-table { width: 100%; font-size: 12px; border-collapse: collapse; }
|
||||
.route-table th, .route-table td { text-align: left; padding: 6px 10px; }
|
||||
.route-table th { color: var(--dim); font-weight: 500; font-size: 11px; text-transform: uppercase; letter-spacing: 0.3px; border-bottom: 1px solid var(--border); }
|
||||
.route-table td { border-bottom: 1px solid rgba(255,255,255,0.03); color: #b0b8c4; }
|
||||
.agent-tag { display: inline-block; padding: 1px 7px; border-radius: 10px; font-size: 11px; font-weight: 600; }
|
||||
.agent-abiba { background: rgba(57,186,230,0.15); color: var(--accent); }
|
||||
.agent-mumuni { background: rgba(210,166,255,0.15); color: var(--purple); }
|
||||
.agent-tanko { background: rgba(255,180,84,0.15); color: var(--yellow); }
|
||||
.agent-koby { background: rgba(89,194,255,0.15); color: var(--blue); }
|
||||
.agent-kagenz0 { background: rgba(127,217,98,0.15); color: var(--green); }
|
||||
.agent-unknown { background: rgba(255,255,255,0.06); color: var(--dim); }
|
||||
.agent-admin { background: rgba(255,255,255,0.08); color: #e6edf3; }
|
||||
.full { grid-column: 1 / -1; }
|
||||
.period-btn {
|
||||
background: var(--card); border: 1px solid var(--border); color: var(--dim);
|
||||
padding: 4px 12px; border-radius: 6px; font-size: 12px; cursor: pointer;
|
||||
font-family: inherit; transition: all 0.2s;
|
||||
}
|
||||
.period-btn.active { background: var(--accent); color: #000; border-color: var(--accent); }
|
||||
.period-btn:hover { border-color: var(--accent); color: #e6edf3; }
|
||||
@media (max-width: 600px) {
|
||||
.gpu-metrics { gap: 10px; }
|
||||
.route-table { font-size: 11px; }
|
||||
.route-table th, .route-table td { padding: 4px 6px; }
|
||||
}
|
||||
.table-custom { font-size: 11px; margin: 0; }
|
||||
.table-custom th { color: #64748b; font-weight: 500; font-size: 10px; text-transform: uppercase; border-color: #1e293b; padding: 8px 10px; }
|
||||
.table-custom td { color: #94a3b8; border-color: rgba(30,41,59,0.5); padding: 6px 10px; }
|
||||
.agent-badge { font-size: 10px; padding: 2px 7px; border-radius: 8px; font-weight: 600; }
|
||||
.btn-sm-period { font-size: 10px; padding: 3px 10px; border-radius: 6px; border: 1px solid #1e293b; color: #64748b; background: transparent; cursor: pointer; }
|
||||
.btn-sm-period.active { background: #1d4ed8; color: #fff; border-color: #1d4ed8; }
|
||||
.ring-label { font-size: 22px; font-weight: 700; }
|
||||
.ring-sublabel { font-size: 10px; color: #64748b; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="header">
|
||||
<h1><span>⚡</span> Inference Harness</h1>
|
||||
<div class="status-bar">
|
||||
<span class="status-dot" id="live-dot"></span>
|
||||
<span id="connection-status">connecting...</span>
|
||||
<span id="update-time"></span>
|
||||
<span id="total-requests">0 requests</span>
|
||||
|
||||
<!-- HEADER -->
|
||||
<div class="d-flex justify-content-between align-items-center mb-4">
|
||||
<div>
|
||||
<h5 class="mb-0 text-white fw-bold">⚡ SyslogAI Harness</h5>
|
||||
<div class="small text-secondary" id="live-indicator">
|
||||
<span class="status-dot" id="live-dot" style="width:6px;height:6px;border-radius:50%;display:inline-block;background:#22c55e;animation:pulse 2s infinite"></span>
|
||||
<span id="connection-status">live</span> · <span id="update-time"></span>
|
||||
</div>
|
||||
</div>
|
||||
<div class="d-flex gap-2">
|
||||
<div class="stat-card" style="min-width:100px"><div class="stat-value text-info" id="kpi-total">0</div><div class="stat-label">Requests</div></div>
|
||||
<div class="stat-card" style="min-width:100px"><div class="stat-value text-warning" id="kpi-active">0</div><div class="stat-label">Active</div></div>
|
||||
<div class="stat-card" style="min-width:100px"><div class="stat-value" style="color:#a78bfa" id="kpi-agents">0</div><div class="stat-label">Agents</div></div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="grid">
|
||||
<div class="card full">
|
||||
<div class="card-title">GPU Health</div>
|
||||
<div id="gpu-container">Loading...</div>
|
||||
</div>
|
||||
<div class="card">
|
||||
<div class="card-title">Model Distribution</div>
|
||||
<div id="route-bars">-</div>
|
||||
</div>
|
||||
<div class="card" style="grid-column: span 2">
|
||||
<div class="card-title" style="display:flex;justify-content:space-between;align-items:center;flex-wrap:wrap;gap:4px">
|
||||
|
||||
<div class="row g-3 align-items-stretch">
|
||||
<!-- ROW 1: Usage Chart (8) + GPU Metrics (4) -->
|
||||
<div class="col-md-8"><div class="chart-card"><div class="title d-flex justify-content-between align-items-center">
|
||||
<span>Usage Over Time</span>
|
||||
<div style="display:flex;gap:4px">
|
||||
<button class="period-btn active" onclick="switchPeriod('day')">24h</button>
|
||||
<button class="period-btn" onclick="switchPeriod('week')">7d</button>
|
||||
<button class="period-btn" onclick="switchPeriod('month')">30d</button>
|
||||
<div class="d-flex gap-1">
|
||||
<button class="btn-sm-period active" onclick="switchPeriod('day')">24h</button>
|
||||
<button class="btn-sm-period" onclick="switchPeriod('week')">7d</button>
|
||||
<button class="btn-sm-period" onclick="switchPeriod('month')">30d</button>
|
||||
</div>
|
||||
</div>
|
||||
<div id="timeseries-chart" style="height:140px;position:relative;overflow:hidden">
|
||||
<div style="color:var(--dim);font-size:13px;padding:50px 0;text-align:center">Loading...</div>
|
||||
</div>
|
||||
<div id="timeseries-legend" style="display:flex;gap:16px;justify-content:center;margin-top:8px;flex-wrap:wrap"></div>
|
||||
</div>
|
||||
<div class="card">
|
||||
<div class="card-title">Agent Activity</div>
|
||||
<div id="agent-bars">-</div>
|
||||
</div>
|
||||
<div class="card full">
|
||||
<div class="card-title">Live Request Stream</div>
|
||||
<div style="overflow-x:auto">
|
||||
<table class="route-table">
|
||||
</div><div id="timeseries-chart" style="height:150px"></div><div id="timeseries-legend" class="d-flex justify-content-center gap-3 mt-2 flex-wrap small"></div></div></div>
|
||||
<div class="col-md-4"><div class="chart-card"><div class="title">GPU Metrics</div><div id="gpu-metrics-card"></div></div></div>
|
||||
|
||||
<!-- ROW 2: 3 GPU Cards -->
|
||||
<div class="col-md-4"><div class="gpu-card" id="gpu-moe"><div class="text-secondary small">Loading...</div></div></div>
|
||||
<div class="col-md-4"><div class="gpu-card" id="gpu-dense"><div class="text-secondary small">Loading...</div></div></div>
|
||||
<div class="col-md-4"><div class="gpu-card" id="gpu-light"><div class="text-secondary small">Loading...</div></div></div>
|
||||
|
||||
<!-- ROW 3: Queue + Model + Agent -->
|
||||
<div class="col-md-4"><div class="chart-card"><div class="title">Queue Status</div><div class="text-center" id="queue-viz"></div></div></div>
|
||||
<div class="col-md-4"><div class="chart-card"><div class="title">Model Distribution</div><div id="route-bars"></div></div></div>
|
||||
<div class="col-md-4"><div class="chart-card"><div class="title">Agent Activity</div><div id="agent-bars"></div></div></div>
|
||||
|
||||
<!-- ROW 4: Live Stream -->
|
||||
<div class="col-12"><div class="chart-card"><div class="title">Live Stream</div>
|
||||
<div class="table-responsive"><table class="table table-custom mb-0">
|
||||
<thead><tr><th>Time</th><th>Agent</th><th>Model</th><th>Reason</th><th>Tier</th></tr></thead>
|
||||
<tbody id="route-tbody"><tr><td colspan="5">Waiting for data...</td></tr></tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
<tbody id="route-tbody"></tbody>
|
||||
</table></div>
|
||||
</div></div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
const MODEL_COLORS = {'gemma-4-E4B':'#7fd962','qwen3.6-27B-code':'#ffb454','qwen3.6-35B-A3B':'#d2a6ff'};
|
||||
const MODEL_LABELS = {'gemma-4-E4B':'Gemma 4B','qwen3.6-27B-code':'Qwen Code 27B','qwen3.6-35B-A3B':'Qwen MoE 35B'};
|
||||
const GPU_LABELS = {'NVIDIA GeForce RTX 5070':'RTX 5070 - Gemma 4B','NVIDIA GeForce RTX 3090':'RTX 3090 - Qwen Code 27B','AMD Radeon (Strix Halo)':'Strix Halo - Qwen MoE 35B'};
|
||||
var MC={'qwen3.5-9b-vlm':'#22c55e','qwen3.6-27B-code':'#f59e0b','qwen3.6-35B-A3B':'#a78bfa'};
|
||||
var ML={'qwen3.5-9b-vlm':'Qwen3.5 9B VLM','qwen3.6-27B-code':'Qwen Code','qwen3.6-35B-A3B':'Qwen MoE'};
|
||||
var GL={'qwen3.6-35B-A3B':'MoE - Strix Halo','qwen3.6-27B-code':'Dense - RTX 3090','qwen3.5-9b-vlm':'VLM - RTX 5070'};
|
||||
function $(id){return document.getElementById(id);}
|
||||
|
||||
function statusIcon(status) {
|
||||
if (status === 'healthy') return '<span class="gpu-icon green">●</span>';
|
||||
if (status === 'saturated') return '<span class="gpu-icon yellow">◉</span>';
|
||||
return '<span class="gpu-icon red">○</span>';
|
||||
}
|
||||
function vramClass(pct) { if(pct>90)return'red';if(pct>75)return'yellow';return'green'; }
|
||||
|
||||
function render(data) {
|
||||
if(!data||!data.gpus)return;
|
||||
const total = Object.values(data.route_counts||{}).reduce((a,b)=>a+b,0);
|
||||
document.getElementById('total-requests').textContent = total + ' requests';
|
||||
document.getElementById('update-time').textContent = new Date().toLocaleTimeString();
|
||||
|
||||
const gpus = data.gpus||[];
|
||||
document.getElementById('gpu-container').innerHTML = gpus.map(g => '<div class="gpu-row">'+statusIcon(g.status)+'<div class="gpu-info"><div class="gpu-name">'+(GPU_LABELS[g.gpu_name]||g.gpu_name||g.id||'?')+'</div><div class="gpu-metrics"><div class="gpu-metric"><span class="label">VRAM</span> <span class="value">'+(g.vram_used_mb||'?')+'/'+(g.vram_total_mb||'?')+' MB</span></div><div class="gpu-metric"><span class="label">Temp</span> <span class="value">'+(g.temp_c||'?')+'C</span></div><div class="gpu-metric"><span class="label">Util</span> <span class="value">'+(g.gpu_util_pct||0)+'%</span></div>'+(g.power_w!=null?'<div class="gpu-metric"><span class="label">Power</span> <span class="value">'+g.power_w+'W</span></div>':'')+'</div><div class="vram-bar"><div class="vram-fill '+vramClass(g.vram_pct||0)+'" style="width:'+(g.vram_pct||0)+'%"></div></div></div><div style="font-size:24px;font-weight:700;color:'+(vramClass(g.vram_pct||0)==='red'?'var(--red)':vramClass(g.vram_pct||0)==='yellow'?'var(--yellow)':'var(--green)')+';min-width:50px;text-align:right">'+(g.vram_pct||0)+'%</div></div>').join('');
|
||||
|
||||
const rc = data.route_counts||{};
|
||||
const maxR = Math.max(1,...Object.values(rc));
|
||||
document.getElementById('route-bars').innerHTML = Object.entries(rc).length ? Object.entries(rc).sort((a,b)=>b[1]-a[1]).map(([m,c])=>'<div class="bar-row"><div class="bar-label"><span class="name">'+(MODEL_LABELS[m]||m)+'</span><span class="count">'+c+' ('+(total?Math.round(c/total*100):0)+'%)</span></div><div class="bar-track"><div class="bar-fill" style="width:'+(c/maxR*100)+'%;background:'+(MODEL_COLORS[m]||'#39bae6')+'"></div></div></div>').join('') : '<div style="color:var(--dim);font-size:13px">No data yet</div>';
|
||||
|
||||
const ac = data.agent_counts||{};
|
||||
const maxA = Math.max(1,...Object.values(ac));
|
||||
document.getElementById('agent-bars').innerHTML = Object.entries(ac).length ? Object.entries(ac).sort((a,b)=>b[1]-a[1]).map(([a,c])=>'<div class="bar-row"><div class="bar-label"><span class="name agent-'+a.toLowerCase().replace(/[^a-z]/g,'')+'">'+a+'</span><span class="count">'+c+' reqs</span></div><div class="bar-track"><div class="bar-fill" style="width:'+(c/maxA*100)+'%;background:var(--accent)"></div></div></div>').join('') : '<div style="color:var(--dim);font-size:13px">No agent activity yet</div>';
|
||||
|
||||
const recent = data.recent||[];
|
||||
document.getElementById('route-tbody').innerHTML = recent.length ? recent.slice(0,25).map(r=>{const d=new Date(r.ts*1000);const a=r.agent||'?';const cl='agent-'+a.toLowerCase().replace(/[^a-z0-9]/g,'');return'<tr><td style="color:var(--dim);font-size:11px">'+d.toLocaleTimeString()+'</td><td><span class="agent-tag '+cl+'">'+a+'</span></td><td>'+(MODEL_LABELS[r.model]||r.model)+'</td><td style="color:var(--dim);font-size:11px">'+(r.reason||'')+'</td><td style="font-size:11px;text-transform:uppercase;color:'+(r.tier==='enterprise'?'var(--purple)':r.tier==='professional'?'var(--blue)':'var(--dim)')+'">'+(r.tier||'')+'</td></tr>';}).join('') : '<tr><td colspan="5" style="color:var(--dim)">Waiting for requests...</td></tr>';
|
||||
function render(data){
|
||||
if(!data||!data.gpus)return;
|
||||
var t=Object.values(data.route_counts||{}).reduce((a,b)=>a+b,0);
|
||||
var ta=0,tm=0;data.gpus.forEach(function(g){ta+=(g.active_requests||0);tm+=(g.max_concurrent||1)});
|
||||
$('kpi-total').textContent=t;$('kpi-active').textContent=ta+'/'+tm;$('kpi-agents').textContent=Object.keys(data.agent_counts||{}).length;
|
||||
$('update-time').textContent=new Date().toLocaleTimeString();
|
||||
var ids={'qwen3.6-35B-A3B':'gpu-moe','qwen3.6-27B-code':'gpu-dense','qwen3.5-9b-vlm':'gpu-light'};
|
||||
data.gpus.forEach(function(g){
|
||||
var el=$(ids[g.id]);if(!el)return;
|
||||
var a=g.active_requests||0,mx=g.max_concurrent||1;
|
||||
var sc=g.status==='healthy'?'#22c55e':g.status==='saturated'?'#f59e0b':'#ef4444';
|
||||
var ss=g.status==='healthy'?'Online':g.status==='saturated'?'Busy':'Offline';
|
||||
var slots='';for(var i=0;i<mx;i++)slots+='<span class=\"s'+(i<a?' active':'')+'\"></span>';
|
||||
var h='<div class=\"title\"><span class=\"status-dot\" style=\"background:'+sc+'\"></span>'+GL[g.id]+'<span class=\"ms-auto small\" style=\"color:'+sc+'\">'+ss+'</span></div>';
|
||||
h+='<div class=\"row-metric\"><span class=\"lbl\">VRAM</span><span class=\"val\">'+g.vram_used_mb+' / '+g.vram_total_mb+' MB</span></div>';
|
||||
h+='<div class=\"row-metric\"><span class=\"lbl\">Utilization</span><span class=\"val\">'+g.gpu_util_pct+'%</span></div>';
|
||||
h+='<div class=\"row-metric\"><span class=\"lbl\">Temperature</span><span class=\"val\" style=\"color:'+(g.temp_c>85?'#ef4444':g.temp_c>70?'#f59e0b':'#22c55e')+'\">'+g.temp_c+'C</span></div>';
|
||||
if(g.power_w)h+='<div class=\"row-metric\"><span class=\"lbl\">Power</span><span class=\"val\">'+g.power_w+'W'+(g.power_limit_w?'/'+g.power_limit_w+'W':'')+'</span></div>';
|
||||
h+='<div class=\"row-metric\"><span class=\"lbl\">Slots</span><span class=\"val\" style=\"color:'+(a>=mx?'#ef4444':'#e2e8f0')+'\">'+a+' / '+mx+'</span></div>';
|
||||
h+='<div class=\"slot-bar\">'+slots+'</div>';el.innerHTML=h;
|
||||
});
|
||||
renderQueue(data);renderGPUMetrics(data);
|
||||
var rc=data.route_counts||{},mr=Math.max(1,...Object.values(rc));
|
||||
$('route-bars').innerHTML=Object.entries(rc).length?Object.entries(rc).sort((a,b)=>b[1]-a[1]).map(function(e){var m=e[0],c=e[1];return'<div class=\"bar-row\"><div class=\"bar-label\"><span class=\"name\">'+(ML[m]||m)+'</span><span>'+c+' ('+(t?Math.round(c/t*100):0)+'%)</span></div><div class=\"bar-track\"><div class=\"bar-fill\" style=\"width:'+(c/mr*100)+'%;background:'+(MC[m]||'#38bdf8')+'\"></div></div></div>';}).join(''):'<div class=\"text-secondary small\">-</div>';
|
||||
var ac=data.agent_counts||{},ma=Math.max(1,...Object.values(ac));
|
||||
$('agent-bars').innerHTML=Object.entries(ac).length?Object.entries(ac).sort((a,b)=>b[1]-a[1]).map(function(e){return'<div class=\"bar-row\"><div class=\"bar-label\"><span class=\"name\">'+e[0]+'</span><span>'+e[1]+'</span></div><div class=\"bar-track\"><div class=\"bar-fill\" style=\"width:'+(e[1]/ma*100)+'%;background:#38bdf8\"></div></div></div>';}).join(''):'<div class=\"text-secondary small\">-</div>';
|
||||
var recent=data.recent||[];
|
||||
$('route-tbody').innerHTML=recent.length?recent.slice(0,20).map(function(r){var d=new Date(r.ts*1000),ag=r.agent||'?';return'<tr><td class=\"text-secondary\">'+d.toLocaleTimeString()+'</td><td><span class=\"agent-badge\" style=\"background:rgba(56,189,248,0.12);color:#38bdf8\">'+ag+'</span></td><td>'+(ML[r.model]||r.model)+'</td><td class=\"text-secondary\">'+(r.reason||'')+'</td><td class=\"text-uppercase\" style=\"font-size:10px;color:'+(r.tier==='enterprise'?'#a78bfa':'#64748b')+'\">'+(r.tier||'')+'</td></tr>';}).join(''):'<tr><td colspan=\"5\" class=\"text-secondary\">Waiting...</td></tr>';
|
||||
}
|
||||
|
||||
let currentPeriod = 'day';
|
||||
async function switchPeriod(p) {
|
||||
currentPeriod = p;
|
||||
document.querySelectorAll('.period-btn').forEach(b => b.classList.remove('active'));
|
||||
document.querySelectorAll('.period-btn').forEach(b => { if(b.textContent.trim().startsWith(p==='day'?'24h':p==='week'?'7d':'30d')) b.classList.add('active'); });
|
||||
await loadTimeseries();
|
||||
}
|
||||
async function loadTimeseries() {
|
||||
try { const r = await fetch('/api/timeseries?period='+currentPeriod); renderTimeseries(await r.json()); } catch(e) {}
|
||||
}
|
||||
function renderTimeseries(d) {
|
||||
const models = d.models||{}, labels = d.labels||[];
|
||||
if(!labels.length)return;
|
||||
const container = document.getElementById('timeseries-chart');
|
||||
const legend = document.getElementById('timeseries-legend');
|
||||
const modelNames = Object.keys(models);
|
||||
if(!modelNames.length){container.innerHTML='<div style="color:var(--dim);font-size:13px;padding:50px 0;text-align:center">No data yet</div>';return;}
|
||||
const colors = {'gemma-4-E4B':'#7fd962','qwen3.6-27B-code':'#ffb454','qwen3.6-35B-A3B':'#d2a6ff'};
|
||||
const shortNames = {'gemma-4-E4B':'Gemma','qwen3.6-27B-code':'Qwen Code','qwen3.6-35B-A3B':'Qwen MoE'};
|
||||
let maxVal = 1;
|
||||
for(const m in models) for(const v of models[m]) if(v>maxVal) maxVal=v;
|
||||
maxVal = Math.ceil(maxVal*1.15)||1;
|
||||
const W = labels.length>1?100/(labels.length-1):100, H=130;
|
||||
let paths='';
|
||||
for(const m of modelNames){const vals=models[m]||[];let d='';for(let i=0;i<vals.length;i++){const x=i*W,y=H-(vals[i]/maxVal)*H;d+=(i===0?'M':'L')+x.toFixed(1)+','+y.toFixed(1)+' ';}paths+='<path d="'+d+'" fill="none" stroke="'+(colors[m]||'#39bae6')+'" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round" opacity="0.85"/>';}
|
||||
let grid='';
|
||||
for(let g=0;g<=4;g++){const y=(g/4)*H;grid+='<line x1="0" y1="'+y.toFixed(1)+'" x2="100" y2="'+y.toFixed(1)+'" stroke="rgba(255,255,255,0.05)" stroke-width="1"/>';}
|
||||
const svg='<svg viewBox="0 0 100 '+(H+16)+'" style="width:100%;height:'+(H+20)+'px;display:block" preserveAspectRatio="none">'+grid+paths+'</svg>';
|
||||
const step=Math.max(1,Math.floor(labels.length/8));
|
||||
let lh='<div style="display:flex;margin-top:2px;font-size:10px;color:var(--dim);overflow:hidden">';
|
||||
for(let i=0;i<labels.length;i+=step) lh+='<div style="flex:1;text-align:center">'+labels[i]+'</div>';
|
||||
lh+='</div>';
|
||||
container.innerHTML=svg+lh;
|
||||
legend.innerHTML=modelNames.map(m=>'<span style="display:flex;align-items:center;gap:6px;font-size:11px;color:var(--dim)"><svg width="18" height="10"><line x1="0" y1="5" x2="18" y2="5" stroke="'+(colors[m]||'#39bae6')+'" stroke-width="2.5"/></svg>'+shortNames[m]+'</span>').join('');
|
||||
function renderQueue(data){
|
||||
var el=$('queue-viz');if(!el)return;
|
||||
var ta=0,tm=0;data.gpus.forEach(function(g){ta+=(g.active_requests||0);tm+=(g.max_concurrent||1)});
|
||||
var pct=tm>0?Math.round(ta/tm*100):0,st=pct>=100?'SATURATED':pct>=50?'BUSY':'IDLE';
|
||||
var sc=pct>=100?'#ef4444':pct>=50?'#f59e0b':'#22c55e';
|
||||
var circ=188.5,dash=(pct/100)*circ;
|
||||
var h='<div class=\"d-inline-block position-relative mb-2\"><svg width=\"72\" height=\"72\"><circle cx=\"36\" cy=\"36\" r=\"30\" fill=\"none\" stroke=\"#1e293b\" stroke-width=\"6\"/><circle cx=\"36\" cy=\"36\" r=\"30\" fill=\"none\" stroke=\"'+sc+'\" stroke-width=\"6\" stroke-dasharray=\"'+dash+' '+(circ-dash)+'\" stroke-linecap=\"round\" transform=\"rotate(-90 36 36)\"/></svg><div style=\"position:absolute;top:50%;left:50%;transform:translate(-50%,-50%);text-align:center\"><div class=\"ring-label\" style=\"color:'+sc+'\">'+ta+'</div><div class=\"ring-sublabel\">/ '+tm+' slots</div></div></div>';
|
||||
h+='<div class=\"fw-bold mb-2 small\" style=\"color:'+sc+'\">'+st+'</div>';
|
||||
var lb={'qwen3.6-35B-A3B':'MoE','qwen3.6-27B-code':'Dense','qwen3.5-9b-vlm':'VLM'};
|
||||
data.gpus.forEach(function(g){var a=g.active_requests||0,mx=g.max_concurrent||1,gp=mx>0?Math.round(a/mx*100):0;h+='<div class=\"d-flex align-items-center gap-2 mb-1 justify-content-center\"><span class=\"small\" style=\"min-width:32px;text-align:right;font-size:10px\">'+(lb[g.id]||g.id)+'</span><div style=\"flex:1;max-width:70px;height:3px;background:#1e293b;border-radius:2px;overflow:hidden\"><div style=\"height:100%;width:'+gp+'%;background:'+sc+';border-radius:2px\"></div></div><span class=\"small\" style=\"min-width:22px;font-size:10px\">'+a+'/'+mx+'</span></div>'});
|
||||
el.innerHTML=h;
|
||||
}
|
||||
|
||||
function poll(){fetch('/api/state').then(r=>r.json()).then(data=>{render(data);document.getElementById('connection-status').textContent='live';document.getElementById('live-dot').className='status-dot live';}).catch(()=>{document.getElementById('connection-status').textContent='reconnecting...';document.getElementById('live-dot').className='status-dot';});}
|
||||
poll();setInterval(poll,3000);loadTimeseries();
|
||||
function renderGPUMetrics(data){
|
||||
var el=$('gpu-metrics-card');if(!el)return;
|
||||
var lb={'qwen3.6-35B-A3B':'MoE','qwen3.6-27B-code':'Dense','qwen3.5-9b-vlm':'VLM'};
|
||||
var h='';data.gpus.forEach(function(g){
|
||||
var nm=lb[g.id]||g.id,tp=g.temp_c||0,ut=g.gpu_util_pct||0,pw=g.power_w||0,pl=g.power_limit_w||0;
|
||||
var tc=tp>85?'#ef4444':tp>70?'#f59e0b':'#22c55e',uc=ut>90?'#ef4444':ut>70?'#f59e0b':'#22c55e';
|
||||
h+='<div class=\"mb-3\"><div class=\"fw-bold small text-white-50 mb-1\">'+nm+'</div>';
|
||||
h+='<div class=\"d-flex align-items-center gap-2 mb-1\"><span class=\"small text-secondary\" style=\"min-width:30px\">T</span><div class=\"flex-grow-1\" style=\"height:3px;background:#1e293b;border-radius:2px;overflow:hidden\"><div style=\"height:100%;width:'+Math.min(tp,100)+'%;background:'+tc+';border-radius:2px\"></div></div><span class=\"small\" style=\"color:'+tc+';min-width:30px;text-align:right\">'+tp+'C</span></div>';
|
||||
h+='<div class=\"d-flex align-items-center gap-2 mb-1\"><span class=\"small text-secondary\" style=\"min-width:30px\">U</span><div class=\"flex-grow-1\" style=\"height:3px;background:#1e293b;border-radius:2px;overflow:hidden\"><div style=\"height:100%;width:'+ut+'%;background:'+uc+';border-radius:2px\"></div></div><span class=\"small\" style=\"color:'+uc+';min-width:30px;text-align:right\">'+ut+'%</span></div>';
|
||||
if(pw>0){var pp=pl>0?Math.round(pw/pl*100):0,pc=pp>90?'#ef4444':pp>70?'#f59e0b':'#22c55e';h+='<div class=\"d-flex align-items-center gap-2\"><span class=\"small text-secondary\" style=\"min-width:30px\">P</span><div class=\"flex-grow-1\" style=\"height:3px;background:#1e293b;border-radius:2px;overflow:hidden\"><div style=\"height:100%;width:'+pp+'%;background:'+pc+';border-radius:2px\"></div></div><span class=\"small\" style=\"color:'+pc+';min-width:30px;text-align:right\">'+pw+'W</span></div>';}
|
||||
h+='</div>';});
|
||||
el.innerHTML=h;
|
||||
}
|
||||
|
||||
var cp='day';
|
||||
function switchPeriod(p){cp=p;document.querySelectorAll('.btn-sm-period').forEach(function(b){b.classList.remove('active')});event.target.classList.add('active');loadTS();}
|
||||
function loadTS(){fetch('/api/timeseries?period='+cp).then(function(r){return r.json()}).then(renderTS).catch(function(){})}
|
||||
function renderTS(d){
|
||||
var models=d.models||{},labels=d.labels||[];
|
||||
if(!labels.length)return;
|
||||
var cn=$('timeseries-chart'),lg=$('timeseries-legend'),mn=Object.keys(models);
|
||||
if(!mn.length){cn.innerHTML='<div class=\"text-secondary small text-center py-4\">-</div>';return;}
|
||||
var mv=1;for(var m in models)for(var i=0;i<models[m].length;i++)if(models[m][i]>mv)mv=models[m][i];mv=Math.ceil(mv*1.15)||1;
|
||||
var W=labels.length>1?100/(labels.length-1):100,H=130;
|
||||
var paths='';for(var mi=0;mi<mn.length;mi++){var m=mn[mi],vals=models[m]||[],d='';for(var i=0;i<vals.length;i++){var x=i*W,y=H-(vals[i]/mv)*H;d+=(i===0?'M':'L')+x.toFixed(1)+','+y.toFixed(1)+' ';}paths+='<path d=\"'+d+'\" fill=\"none\" stroke=\"'+(MC[m]||'#38bdf8')+'\" stroke-width=\"2\" stroke-linecap=\"round\" opacity=\"0.8\"/>';}
|
||||
var grid='';for(var g=0;g<=4;g++){var y=(g/4)*H;grid+='<line x1=\"0\" y1=\"'+y.toFixed(1)+'\" x2=\"100\" y2=\"'+y.toFixed(1)+'\" stroke=\"#1e293b\" stroke-width=\"1\"/>';}
|
||||
cn.innerHTML='<svg viewBox=\"0 0 100 '+(H+16)+'\" style=\"width:100%;height:'+(H+20)+'px;display:block\" preserveAspectRatio=\"none\">'+grid+paths+'</svg>';
|
||||
lg.innerHTML=mn.map(function(m){return'<span class=\"d-flex align-items-center gap-1\"><svg width=\"14\" height=\"8\"><line x1=\"0\" y1=\"4\" x2=\"14\" y2=\"4\" stroke=\"'+(MC[m]||'#38bdf8')+'\" stroke-width=\"2\"/></svg>'+(ML[m]||m)+'</span>';}).join('');
|
||||
}
|
||||
function poll(){fetch('/api/state').then(function(r){return r.json()}).then(function(data){render(data);$('connection-status').textContent='live';}).catch(function(){$('connection-status').textContent='reconnecting';});}
|
||||
poll();setInterval(poll,3000);loadTS();
|
||||
</script>
|
||||
</body>
|
||||
</html>"""
|
||||
|
||||
@app.route("/")
|
||||
def dashboard():
|
||||
return render_template_string(DASHBOARD_HTML)
|
||||
def dashboard(): return render_template_string(DASHBOARD_HTML)
|
||||
|
||||
@app.route("/api/state")
|
||||
def api_state():
|
||||
return fetch_state()
|
||||
def api_state(): return fetch_state()
|
||||
|
||||
@app.route("/api/timeseries")
|
||||
def api_timeseries():
|
||||
@@ -264,27 +211,22 @@ def api_timeseries():
|
||||
|
||||
@app.route("/api/stream")
|
||||
def api_stream():
|
||||
def event_stream():
|
||||
def ev():
|
||||
q = queue.Queue()
|
||||
with sse_lock: sse_subscribers.append(q)
|
||||
try:
|
||||
data = fetch_state()
|
||||
yield "data: " + json.dumps(data) + "\n\n"
|
||||
yield "data: "+json.dumps(fetch_state())+"\n\n"
|
||||
while True:
|
||||
try: msg = q.get(timeout=3); yield "data: " + msg + "\n\n"
|
||||
except queue.Empty:
|
||||
data = fetch_state()
|
||||
yield "data: " + json.dumps(data) + "\n\n"
|
||||
try: msg = q.get(timeout=3); yield "data: "+msg+"\n\n"
|
||||
except queue.Empty: yield "data: "+json.dumps(fetch_state())+"\n\n"
|
||||
except GeneratorExit: pass
|
||||
finally:
|
||||
with sse_lock:
|
||||
if q in sse_subscribers: sse_subscribers.remove(q)
|
||||
return Response(stream_with_context(event_stream()), mimetype="text/event-stream",
|
||||
headers={"Cache-Control":"no-cache","X-Accel-Buffering":"no","Access-Control-Allow-Origin":"*"})
|
||||
return Response(stream_with_context(ev()), mimetype="text/event-stream", headers={"Cache-Control":"no-cache","X-Accel-Buffering":"no","Access-Control-Allow-Origin":"*"})
|
||||
|
||||
@app.route("/health")
|
||||
def health():
|
||||
return {"status":"healthy","service":"harness-dashboard"}
|
||||
def health(): return {"status":"healthy","service":"harness-dashboard"}
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run(host="0.0.0.0", port=3000, debug=False)
|
||||
|
||||
@@ -27,6 +27,11 @@ services:
|
||||
- GPU_MOE_URL=http://192.168.68.15:8080/v1
|
||||
- GPU_DENSE_URL=http://192.168.68.8:8080/v1
|
||||
- GPU_LIGHT_URL=http://192.168.68.110:8080/v1
|
||||
healthcheck:
|
||||
test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:9000/health')"]
|
||||
interval: 15s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
@@ -42,6 +47,11 @@ services:
|
||||
- ./litellm_config.yaml:/app/config.yaml
|
||||
environment:
|
||||
- LITELLM_MASTER_KEY=sk-syslog-local-master-key
|
||||
healthcheck:
|
||||
test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:9000/health')"]
|
||||
interval: 15s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
@@ -54,6 +64,11 @@ services:
|
||||
- "80:80"
|
||||
volumes:
|
||||
- ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://127.0.0.1/health"]
|
||||
interval: 15s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
depends_on:
|
||||
- litellm
|
||||
- dashboard
|
||||
@@ -67,6 +82,11 @@ services:
|
||||
environment:
|
||||
- REDIS_URL=redis://redis:6379
|
||||
- GPU_SIDECARS=192.168.68.15:8090,192.168.68.8:8090,192.168.68.110:8090
|
||||
healthcheck:
|
||||
test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3000/health')"]
|
||||
interval: 15s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
depends_on:
|
||||
- redis
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ upstream llmgpu_pool {
|
||||
}
|
||||
|
||||
upstream ocu_llm_pool {
|
||||
## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks
|
||||
## RTX 5070 — qwen3.5-9b-vlm (VLM) — Vision + light tasks
|
||||
server 192.168.68.110:8080;
|
||||
}
|
||||
|
||||
@@ -36,7 +36,7 @@ map $http_x_syslog_model $gpu_upstream {
|
||||
"heavy" llmgpu_pool;
|
||||
"qwen3.5-27B" llmgpu_pool;
|
||||
"light" ocu_llm_pool;
|
||||
"gemma-4" ocu_llm_pool;
|
||||
"qwen3.5-9b-vlm" ocu_llm_pool;
|
||||
}
|
||||
|
||||
## Rate limit zone — 10 req/s per IP, burst of 20
|
||||
|
||||
+2
-2
@@ -13,7 +13,7 @@ upstream llmgpu_pool {
|
||||
}
|
||||
|
||||
upstream ocu_llm_pool {
|
||||
## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks
|
||||
## RTX 5070 — qwen3.5-9b-vlm (VLM) — Vision + light tasks
|
||||
server 192.168.68.110:8080;
|
||||
}
|
||||
|
||||
@@ -36,7 +36,7 @@ map $http_x_syslog_model $gpu_upstream {
|
||||
"heavy" llmgpu_pool;
|
||||
"qwen3.5-27B" llmgpu_pool;
|
||||
"light" ocu_llm_pool;
|
||||
"gemma-4" ocu_llm_pool;
|
||||
"qwen3.5-9b-vlm" ocu_llm_pool;
|
||||
}
|
||||
|
||||
server {
|
||||
|
||||
+2
-2
@@ -11,9 +11,9 @@ model_list:
|
||||
api_base: http://192.168.68.8:8080/v1
|
||||
api_key: "not-needed"
|
||||
|
||||
- model_name: gemma-4-E4B
|
||||
- model_name: qwen3.5-9b-vlm
|
||||
litellm_params:
|
||||
model: openai/gemma-4-E4B
|
||||
model: openai/qwen3.5-9b-vlm
|
||||
api_base: http://192.168.68.110:8080/v1
|
||||
api_key: "not-needed"
|
||||
|
||||
|
||||
+1
-1
@@ -32,7 +32,7 @@ http {
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header Authorization $http_authorization;
|
||||
proxy_connect_timeout 10s;
|
||||
proxy_read_timeout 300s;
|
||||
proxy_read_timeout 600s;
|
||||
proxy_buffering off;
|
||||
}
|
||||
|
||||
|
||||
+237
-32
@@ -10,17 +10,31 @@ GPU_LIGHT_URL = os.environ.get("GPU_LIGHT_URL", "http://192.168.68.110:8080/v1")
|
||||
GPU_SIDECARS = {
|
||||
"qwen3.6-35B-A3B": "http://192.168.68.15:8090",
|
||||
"qwen3.6-27B-code": "http://192.168.68.8:8090",
|
||||
"gemma-4-E4B": "http://192.168.68.110:8090",
|
||||
"qwen3.5-9b-vlm": "http://192.168.68.110:8090",
|
||||
}
|
||||
GPU_URLS = {
|
||||
"qwen3.6-35B-A3B": GPU_MOE_URL,
|
||||
"qwen3.6-27B-code": GPU_DENSE_URL,
|
||||
"gemma-4-E4B": GPU_LIGHT_URL,
|
||||
"qwen3.5-9b-vlm": GPU_LIGHT_URL,
|
||||
}
|
||||
# Max concurrent requests per GPU (based on llama.cpp --parallel)
|
||||
GPU_MAX_CONCURRENT = {
|
||||
"qwen3.6-35B-A3B": 2, # 2 slots
|
||||
"qwen3.6-27B-code": 2, # 2 slots
|
||||
"qwen3.5-9b-vlm": 2, # 2 slots (12GB VRAM, 4GB headroom)
|
||||
}
|
||||
|
||||
# Context window sizes (tokens) — used for compaction signals
|
||||
GPU_CONTEXT = {
|
||||
"qwen3.6-35B-A3B": 131072,
|
||||
"qwen3.6-27B-code": 98304,
|
||||
"qwen3.5-9b-vlm": 131072,
|
||||
}
|
||||
|
||||
TIER_MODELS = {
|
||||
"starter": ["gemma-4-E4B"],
|
||||
"professional": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "gemma-4-E4B"],
|
||||
"enterprise": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "gemma-4-E4B"],
|
||||
"starter": ["qwen3.5-9b-vlm"],
|
||||
"professional": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "qwen3.5-9b-vlm"],
|
||||
"enterprise": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "qwen3.5-9b-vlm"],
|
||||
}
|
||||
API_KEYS = {
|
||||
"sk-syslog-local-master-key": {"tier": "enterprise", "agent": "admin"},
|
||||
@@ -29,6 +43,7 @@ API_KEYS = {
|
||||
"sk-syslog-tanko": {"tier": "enterprise", "agent": "Tanko"},
|
||||
"sk-syslog-koby": {"tier": "enterprise", "agent": "Koby"},
|
||||
"sk-syslog-kagenz0": {"tier": "enterprise", "agent": "Kagenz0"},
|
||||
"sk-syslog-koonimo": {"tier": "enterprise", "agent": "Koonimo"},
|
||||
"sk-starter-abc123": {"tier": "starter", "agent": "test-starter"},
|
||||
"sk-professional-xyz789": {"tier": "professional", "agent": "test-pro"},
|
||||
}
|
||||
@@ -38,9 +53,47 @@ log = logging.getLogger("router")
|
||||
try: r = redis.from_url(REDIS_URL, decode_responses=True); r.ping()
|
||||
except Exception: r = None
|
||||
|
||||
|
||||
def counter_audit_loop():
|
||||
"""Every 30s, check GPU slots and reset counters if all slots idle."""
|
||||
while True:
|
||||
time.sleep(30)
|
||||
if not r: continue
|
||||
for model, url in GPU_URLS.items():
|
||||
try:
|
||||
resp = requests.get(url.replace("/v1","") + "/slots",
|
||||
headers={"Authorization": "Bearer not-needed"}, timeout=5)
|
||||
if resp.status_code == 200:
|
||||
slots = resp.json()
|
||||
all_idle = all(not s.get("is_processing", False) for s in slots)
|
||||
if all_idle:
|
||||
current = int(r.get("active:" + model) or 0)
|
||||
if current > 0:
|
||||
r.set("active:" + model, 0)
|
||||
log.info("AUDIT: Reset stuck counter for %s (was %d)", model, current)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
threading.Thread(target=counter_audit_loop, daemon=True).start()
|
||||
|
||||
app = Flask(__name__)
|
||||
sse_subscribers = []; sse_lock = threading.Lock()
|
||||
|
||||
def gpu_active_count(model):
|
||||
"""Get number of in-flight requests for a GPU."""
|
||||
if r:
|
||||
return int(r.get("active:" + model) or 0)
|
||||
return 0
|
||||
|
||||
def gpu_incr(model):
|
||||
if r: r.incr("active:" + model)
|
||||
|
||||
def gpu_decr(model):
|
||||
if r:
|
||||
v = r.decr("active:" + model)
|
||||
if v and int(v) < 0:
|
||||
r.set("active:" + model, 0) # never go negative
|
||||
|
||||
def check_gpu_health(model):
|
||||
url = GPU_SIDECARS.get(model)
|
||||
if not url: return {"status": "unknown"}
|
||||
@@ -49,40 +102,124 @@ def check_gpu_health(model):
|
||||
if resp.status_code == 200:
|
||||
d = resp.json()
|
||||
pct = (d.get("vram_used_mb",0) / max(d.get("vram_total_mb",1), 1)) * 100
|
||||
return {"status": "healthy" if pct < 90 else "saturated", "vram_used_mb": d.get("vram_used_mb"), "vram_total_mb": d.get("vram_total_mb"), "vram_pct": round(pct,1), "temp_c": d.get("temp_c"), "gpu_util_pct": d.get("gpu_util_pct"), "gpu_name": d.get("gpu_name"), "power_w": d.get("power_w"), "power_limit_w": d.get("power_limit_w")}
|
||||
status = "healthy" if pct < 90 else "saturated"
|
||||
# Also check if llama.cpp endpoint is actually responding
|
||||
gpu_url = GPU_URLS.get(model, "")
|
||||
try:
|
||||
hr = requests.get(gpu_url.replace("/v1","") + "/health", headers={"Authorization": "Bearer not-needed"}, timeout=3)
|
||||
if hr.status_code != 200:
|
||||
status = "down"
|
||||
except Exception:
|
||||
status = "down"
|
||||
return {"status": status, "vram_used_mb": d.get("vram_used_mb"), "vram_total_mb": d.get("vram_total_mb"), "vram_pct": round(pct,1), "temp_c": d.get("temp_c"), "gpu_util_pct": d.get("gpu_util_pct"), "gpu_name": d.get("gpu_name"), "power_w": d.get("power_w"), "power_limit_w": d.get("power_limit_w")}
|
||||
except Exception: pass
|
||||
return {"status": "down"}
|
||||
|
||||
def available_models(): return [m for m in GPU_URLS if check_gpu_health(m)["status"] in ("healthy","saturated")]
|
||||
|
||||
def estimate_tokens(msgs): return sum(len(str(m.get("content",""))) for m in msgs) // 4
|
||||
def estimate_tokens(msgs):
|
||||
"""Estimate token count from messages. Uses JSON length / 3.5 (closer to real tokenizer ratios for dense text)."""
|
||||
return len(json.dumps(msgs, default=str)) // 3.5
|
||||
|
||||
def is_gpu_busy(model):
|
||||
"""Check if GPU is at or near max concurrent capacity."""
|
||||
active = gpu_active_count(model)
|
||||
max_c = GPU_MAX_CONCURRENT.get(model, 1)
|
||||
return active >= max_c
|
||||
|
||||
def select_best_gpu(candidates, reason):
|
||||
"""Pick the best GPU from candidates IN ORDER — first non-busy one wins."""
|
||||
for m in candidates:
|
||||
if not is_gpu_busy(m):
|
||||
return {"model": m, "reason": reason}
|
||||
# All busy — pick least loaded
|
||||
best = None
|
||||
best_load = 999
|
||||
for m in candidates:
|
||||
load = gpu_active_count(m)
|
||||
if load < best_load:
|
||||
best_load = load
|
||||
best = m
|
||||
if best:
|
||||
return {"model": best, "reason": "load_balanced_" + reason}
|
||||
return None
|
||||
|
||||
def route(rd, tier):
|
||||
msgs = rd.get("messages",[]); t = estimate_tokens(msgs)
|
||||
sys = any(m.get("role")=="system" for m in msgs)
|
||||
turns = len([m for m in msgs if m.get("role") in ("user","assistant")])
|
||||
hints = rd.get("routing_hints",{})
|
||||
allowed = TIER_MODELS.get(tier, ["gemma-4-E4B"])
|
||||
allowed = TIER_MODELS.get(tier, ["qwen3.5-9b-vlm"])
|
||||
avail = [m for m in available_models() if m in allowed]
|
||||
if not avail: return {"model": allowed[0], "reason": "all_saturated"}
|
||||
if not avail: return {"model": allowed[0], "reason": "all_saturated", "saturated": True}
|
||||
# Check if all available GPUs are at max capacity
|
||||
if all(is_gpu_busy(m) for m in avail):
|
||||
return {"model": avail[0], "reason": "all_saturated", "saturated": True}
|
||||
|
||||
req = rd.get("model","auto")
|
||||
if req != "auto": return {"model": req if req in avail else avail[0], "reason": "explicit"}
|
||||
if req != "auto":
|
||||
target = req if req in avail else avail[0]
|
||||
# If explicit model is busy, check if another can take it
|
||||
if is_gpu_busy(target) and req in allowed:
|
||||
alts = [m for m in avail if m != target and m in allowed]
|
||||
if alts:
|
||||
alt = select_best_gpu(alts, "explicit")
|
||||
if alt: return alt
|
||||
return {"model": target, "reason": "explicit"}
|
||||
|
||||
if hints:
|
||||
if hints.get("priority")=="speed" and "gemma-4-E4B" in avail: return {"model":"gemma-4-E4B","reason":"hint_speed"}
|
||||
if hints.get("priority")=="quality" and "qwen3.6-27B-code" in avail: return {"model":"qwen3.6-27B-code","reason":"hint_quality"}
|
||||
if t > 4000 or sys or turns > 6:
|
||||
for m in ["qwen3.6-27B-code","qwen3.6-35B-A3B","gemma-4-E4B"]:
|
||||
if m in avail: return {"model":m,"reason":"heavy_reasoning"}
|
||||
if hints.get("priority")=="speed" and "qwen3.5-9b-vlm" in avail:
|
||||
return select_best_gpu(["qwen3.5-9b-vlm"], "hint_speed") or {"model":"qwen3.5-9b-vlm","reason":"hint_speed"}
|
||||
if hints.get("priority")=="quality" and "qwen3.6-27B-code" in avail:
|
||||
return select_best_gpu(["qwen3.6-27B-code"], "hint_quality") or {"model":"qwen3.6-27B-code","reason":"hint_quality"}
|
||||
|
||||
first_msg = msgs[0].get("content","") if msgs else ""
|
||||
words = len(first_msg.split()) if isinstance(first_msg, str) else 99
|
||||
if words <= 3 and turns <= 1 and not sys and "gemma-4-E4B" in avail:
|
||||
return {"model":"gemma-4-E4B","reason":"ultra_light"}
|
||||
if "qwen3.6-35B-A3B" in avail: return {"model":"qwen3.6-35B-A3B","reason":"default_moe"}
|
||||
return {"model":avail[0],"reason":"fallback"}
|
||||
|
||||
# TIER 1: Lightweight — single-turn short queries → VLM first
|
||||
if not sys and turns <= 1 and words <= 100 and "qwen3.5-9b-vlm" in avail:
|
||||
if not is_gpu_busy("qwen3.5-9b-vlm"):
|
||||
return {"model":"qwen3.5-9b-vlm","reason":"lightweight"}
|
||||
# VLM busy — fall back to Dense, then MoE
|
||||
fallback = [m for m in ["qwen3.6-35B-A3B","qwen3.6-27B-code"] if m in avail]
|
||||
result = select_best_gpu(fallback, "lightweight_fallback")
|
||||
if result: return result
|
||||
|
||||
# TIER 2: Simple conversations — short context, any prompt → VLM preferred
|
||||
if t <= 1000 and turns <= 4 and "qwen3.5-9b-vlm" in avail:
|
||||
if not is_gpu_busy("qwen3.5-9b-vlm"):
|
||||
return {"model":"qwen3.5-9b-vlm","reason":"simple_conv"}
|
||||
# VLM busy — try Dense
|
||||
if "qwen3.6-27B-code" in avail and not is_gpu_busy("qwen3.6-27B-code"):
|
||||
return {"model":"qwen3.6-27B-code","reason":"simple_conv_fallback"}
|
||||
|
||||
# TIER 3: Heavy reasoning — extremely large context or very long conversations
|
||||
if t > 50000 or turns > 25:
|
||||
# MoE first (131K context handles heavy sessions), then Dense (98K reasoning), then Light (131K fallback)
|
||||
candidates = [m for m in ["qwen3.6-35B-A3B","qwen3.6-27B-code","qwen3.5-9b-vlm"] if m in avail]
|
||||
result = select_best_gpu(candidates, "heavy_reasoning")
|
||||
if result: return result
|
||||
|
||||
# TIER 4: Default — MoE first, VLM helps, Dense last (slow)
|
||||
if t <= 50000:
|
||||
candidates = [m for m in ["qwen3.6-35B-A3B","qwen3.5-9b-vlm","qwen3.6-27B-code"] if m in avail]
|
||||
result = select_best_gpu(candidates, "default")
|
||||
if result: return result
|
||||
|
||||
# Fallback — best available
|
||||
if "qwen3.6-35B-A3B" in avail and not is_gpu_busy("qwen3.6-35B-A3B"):
|
||||
return {"model":"qwen3.6-35B-A3B","reason":"default_moe"}
|
||||
result = select_best_gpu([m for m in avail], "fallback")
|
||||
if result: return result
|
||||
return {"model":avail[0],"reason":"last_resort"}
|
||||
|
||||
def clean_unicode(text):
|
||||
if not isinstance(text, str): return text
|
||||
return text.replace("\u2014","-").replace("\u2013","-").replace("\u2018",").replace(u2019,").replace("\u201c",').replace(u201d,').replace("\u2026","...").replace("\u00a0"," ")
|
||||
text = text.replace(chr(0x2014), "-"); text = text.replace(chr(0x2013), "-")
|
||||
text = text.replace(chr(0x2018), "'"); text = text.replace(chr(0x2019), "'")
|
||||
text = text.replace(chr(0x201C), '"'); text = text.replace(chr(0x201D), '"')
|
||||
text = text.replace(chr(0x2026), "..."); text = text.replace(chr(0x00A0), " ")
|
||||
return text.encode("ascii", "ignore").decode("ascii")
|
||||
|
||||
def clean_response(d):
|
||||
if isinstance(d, dict): return {k: clean_response(v) for k,v in d.items()}
|
||||
@@ -91,10 +228,11 @@ def clean_response(d):
|
||||
return d
|
||||
|
||||
def get_metrics():
|
||||
d = {"gpus":[],"route_counts":{},"agent_counts":{},"tier_counts":{},"recent":[],"timestamp":time.time()}
|
||||
d = {"gpus":[],"route_counts":{},"agent_counts":{},"tier_counts":{},"recent":[],"timestamp":time.time(),"active_requests":{}}
|
||||
for m in GPU_URLS:
|
||||
h = check_gpu_health(m)
|
||||
d["gpus"].append({"id":m,"gpu_name":h.get("gpu_name",m),"status":h.get("status"),"vram_used_mb":h.get("vram_used_mb"),"vram_total_mb":h.get("vram_total_mb"),"vram_pct":h.get("vram_pct"),"temp_c":h.get("temp_c"),"gpu_util_pct":h.get("gpu_util_pct"),"power_w":h.get("power_w"),"power_limit_w":h.get("power_limit_w")})
|
||||
d["gpus"].append({"id":m,"gpu_name":h.get("gpu_name",m),"status":h.get("status"),"vram_used_mb":h.get("vram_used_mb"),"vram_total_mb":h.get("vram_total_mb"),"vram_pct":h.get("vram_pct"),"temp_c":h.get("temp_c"),"gpu_util_pct":h.get("gpu_util_pct"),"power_w":h.get("power_w"),"power_limit_w":h.get("power_limit_w"),"active_requests":gpu_active_count(m), "max_concurrent": GPU_MAX_CONCURRENT.get(m, 1)})
|
||||
d["active_requests"][m] = gpu_active_count(m)
|
||||
if r:
|
||||
try:
|
||||
for m in GPU_URLS: d["route_counts"][m] = int(r.get("routes:"+m) or 0)
|
||||
@@ -116,16 +254,56 @@ def bcast():
|
||||
except Exception: dead.append(q)
|
||||
for q in dead: sse_subscribers.remove(q)
|
||||
|
||||
QUEUE_TIMEOUT = int(os.environ.get("QUEUE_TIMEOUT", "30")) # max seconds to queue before 503
|
||||
|
||||
@app.route("/v1/chat/completions", methods=["POST"])
|
||||
def chat():
|
||||
try:
|
||||
rd = request.get_json(force=True)
|
||||
ak = request.headers.get("Authorization","").replace("Bearer ","")
|
||||
ki = API_KEYS.get(ak, {"tier":"starter","agent":"unknown"})
|
||||
if not ak or ak not in API_KEYS:
|
||||
log.warning("AUTH_REJECTED: no/invalid API key from %s", request.remote_addr)
|
||||
return jsonify({"error": "Unauthorized — valid API key required"}), 401
|
||||
ki = API_KEYS[ak]
|
||||
tier, agent = ki["tier"], ki["agent"]
|
||||
d = route(rd, tier); model, reason, url = d["model"], d["reason"], GPU_URLS[d["model"]]
|
||||
|
||||
# Allow agent to override queue timeout via header
|
||||
q_timeout = int(request.headers.get("X-Queue-Timeout", str(QUEUE_TIMEOUT)))
|
||||
|
||||
# Cross-turn context tracking: accumulate tokens per session
|
||||
session_id = request.headers.get("X-Session-Id", "")
|
||||
session_tokens = 0
|
||||
if session_id and r:
|
||||
try:
|
||||
prev = int(r.get("session:" + session_id) or 0)
|
||||
current = estimate_tokens(rd.get("messages",[]))
|
||||
session_tokens = max(prev, current) # context only grows
|
||||
r.set("session:" + session_id, session_tokens, ex=86400) # TTL 24h
|
||||
except Exception: pass
|
||||
|
||||
d = route(rd, tier)
|
||||
queue_start = time.time()
|
||||
|
||||
# Queue loop: wait for a GPU slot instead of immediate 503
|
||||
while d.get("saturated"):
|
||||
elapsed = time.time() - queue_start
|
||||
if elapsed > q_timeout:
|
||||
resp = jsonify({"error": "All GPUs saturated", "queued_s": round(elapsed,1), "retry_after_s": 5})
|
||||
resp.headers["Retry-After"] = "5"
|
||||
log.warning("QUEUE_TIMEOUT: %s waited %.1fs, all GPUs saturated", agent, elapsed)
|
||||
return resp, 503
|
||||
time.sleep(0.5) # poll every 500ms
|
||||
d = route(rd, tier)
|
||||
|
||||
waited = time.time() - queue_start
|
||||
if waited > 0.5:
|
||||
log.info("QUEUED: %s waited %.1fs before slot opened", agent, waited)
|
||||
model, reason, url = d["model"], d["reason"], GPU_URLS[d["model"]]
|
||||
is_stream = rd.get("stream", False)
|
||||
log.info("ROUTE: %s -> %s (%s) stream=%s", agent, model, reason, is_stream)
|
||||
|
||||
gpu_incr(model)
|
||||
|
||||
log.info("ROUTE: %s -> %s (%s) stream=%s active=%d/%d", agent, model, reason, is_stream, gpu_active_count(model), GPU_MAX_CONCURRENT.get(model,1))
|
||||
if r:
|
||||
try:
|
||||
r.incr("routes:"+model); r.incr("routes:tier:"+tier); r.incr("routes:agent:"+agent)
|
||||
@@ -135,25 +313,45 @@ def chat():
|
||||
except Exception: pass
|
||||
start = time.time()
|
||||
resp = requests.post(url+"/chat/completions", json=rd,
|
||||
headers={"Content-Type":"application/json","Authorization":"Bearer not-needed"}, timeout=120, stream=is_stream)
|
||||
headers={"Content-Type":"application/json","Authorization":"Bearer not-needed"}, timeout=300, stream=is_stream)
|
||||
lat = int((time.time()-start)*1000)
|
||||
gpu_decr(model)
|
||||
|
||||
if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
|
||||
if is_stream:
|
||||
def gen():
|
||||
for raw in resp.iter_content(chunk_size=None, decode_unicode=True):
|
||||
if raw: yield clean_unicode(raw)
|
||||
bcast()
|
||||
return Response(stream_with_context(gen()), mimetype="text/event-stream")
|
||||
ctx_remaining = GPU_CONTEXT.get(model, 65536) - max(session_tokens, estimate_tokens(rd.get("messages",[])))
|
||||
ctx_pct = ctx_remaining / GPU_CONTEXT.get(model, 65536) * 100
|
||||
ctx_warning = "compact_urgent" if ctx_pct < 5 else ("compact_recommended" if ctx_pct < 15 else ("compact_soon" if ctx_pct < 30 else "ok"))
|
||||
sse_resp = Response(stream_with_context(gen()), mimetype="text/event-stream")
|
||||
sse_resp.headers["X-Context-Remaining"] = str(max(0, ctx_remaining))
|
||||
sse_resp.headers["X-Context-Warning"] = ctx_warning
|
||||
sse_resp.headers["X-Context-Model"] = model
|
||||
return sse_resp
|
||||
data = clean_response(resp.json())
|
||||
for c in data.get("choices",[]):
|
||||
msg = c.get("message",{})
|
||||
if not msg.get("content") and msg.get("reasoning_content"):
|
||||
msg["content"] = msg["reasoning_content"]
|
||||
data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat}
|
||||
ctx_remaining = GPU_CONTEXT.get(model, 65536) - max(session_tokens, estimate_tokens(rd.get("messages",[])))
|
||||
ctx_pct = ctx_remaining / GPU_CONTEXT.get(model, 65536) * 100
|
||||
ctx_warning = "compact_urgent" if ctx_pct < 5 else ("compact_recommended" if ctx_pct < 15 else ("compact_soon" if ctx_pct < 30 else "ok"))
|
||||
data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model),"context_remaining": max(0, ctx_remaining),"context_pct": round(ctx_pct,1),"context_warning": ctx_warning}
|
||||
resp = jsonify(data)
|
||||
resp.headers["X-Context-Remaining"] = str(max(0, ctx_remaining))
|
||||
resp.headers["X-Context-Warning"] = ctx_warning
|
||||
resp.headers["X-Context-Model"] = model
|
||||
bcast()
|
||||
return jsonify(data)
|
||||
except requests.Timeout: return jsonify({"error":"timeout"}), 504
|
||||
return resp
|
||||
except requests.Timeout:
|
||||
gpu_decr(model)
|
||||
log.error("TIMEOUT: %s -> %s", agent, model)
|
||||
return jsonify({"error":"timeout"}), 504
|
||||
except Exception as e:
|
||||
gpu_decr(model)
|
||||
log.error("Error: %s\n%s", e, traceback.format_exc())
|
||||
return jsonify({"error":str(e)}), 500
|
||||
|
||||
@@ -161,7 +359,14 @@ def chat():
|
||||
def models(): return jsonify({"object":"list","data":[{"id":m,"object":"model","owned_by":"syslog","status":check_gpu_health(m).get("status"),"gpu":check_gpu_health(m).get("gpu_name")} for m in GPU_URLS]})
|
||||
|
||||
@app.route("/health")
|
||||
def health(): return jsonify({"status":"healthy","redis":"connected" if r else "down","gpus":{m:check_gpu_health(m) for m in GPU_URLS},"available_models":available_models()})
|
||||
def health():
|
||||
gpus = {}
|
||||
for m in GPU_URLS:
|
||||
h = check_gpu_health(m)
|
||||
h["active_requests"] = gpu_active_count(m)
|
||||
h["max_concurrent"] = GPU_MAX_CONCURRENT.get(m, 1)
|
||||
gpus[m] = h
|
||||
return jsonify({"status":"healthy","redis":"connected" if r else "down","gpus":gpus,"available_models":available_models()})
|
||||
|
||||
@app.route("/metrics")
|
||||
def metrics(): return jsonify(get_metrics())
|
||||
@@ -209,5 +414,5 @@ def stream():
|
||||
headers={"Cache-Control":"no-cache","X-Accel-Buffering":"no","Access-Control-Allow-Origin":"*"})
|
||||
|
||||
if __name__ == "__main__":
|
||||
log.info("Router on :9000")
|
||||
log.info("Router on :9000 (load-aware)")
|
||||
app.run(host="0.0.0.0", port=9000, debug=False)
|
||||
|
||||
Reference in New Issue
Block a user