Compare commits
2 Commits
b65ea22765
...
3d42ea4767
| Author | SHA1 | Date | |
|---|---|---|---|
| 3d42ea4767 | |||
| 7b6c6aabe1 |
@@ -0,0 +1,5 @@
|
|||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
.env
|
||||||
|
redis-data/
|
||||||
|
ssl/
|
||||||
@@ -1,63 +1,39 @@
|
|||||||
# Syslog Harness
|
# syslog-harness — Inference API Harness
|
||||||
|
|
||||||
Operational orchestration layer for Syslog's internal AI agents.
|
CT 116 Docker stack for routing local GPU models through a unified OpenAI-compatible API.
|
||||||
|
|
||||||
## Architecture
|
## Architecture
|
||||||
|
|
||||||
```
|
```
|
||||||
┌─────────────┐ ┌──────────────┐ ┌─────────────┐
|
nginx :80 → router :9000 → GPU backends
|
||||||
│ Agent │────>│ Nginx │────>│ GPU Pool │
|
├─ qwen3.6-35B-A3B (MoE) @ 192.168.68.15:8080
|
||||||
│ (Hermes) │ │ Router │ │ (MoE/Dense)│
|
├─ qwen3.6-27B-code (Dense) @ 192.168.68.8:8080
|
||||||
└─────────────┘ └──────────────┘ └─────────────┘
|
└─ gemma-4-E4B (Light) @ 192.168.68.110:8080
|
||||||
│
|
|
||||||
├──> :8091 Queue Service (Docker)
|
LiteLLM :8081 (fallback) | Dashboard :3000 | Redis :6379 (local)
|
||||||
│
|
|
||||||
└──> :3001 Dashboard (Docker)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Components
|
## Deploy
|
||||||
|
|
||||||
| Service | Port | Container | Purpose |
|
|
||||||
|---|---|---|---|
|
|
||||||
| Nginx Router | 8080 | Host | Routes requests to GPU backends |
|
|
||||||
| Queue Service | 8091 | `syslog-queue` | Enqueues requests when GPUs are down |
|
|
||||||
| Dashboard | 3001 | `syslog-dashboard` | Observability UI + API |
|
|
||||||
|
|
||||||
## GPU Routing
|
|
||||||
|
|
||||||
| Header `X-Syslog-Model` | Backend | Model |
|
|
||||||
|---|---|---|
|
|
||||||
| (none) / `standard` | amdpve (.15) | qwen3.6-35B-A3B (MoE) |
|
|
||||||
| `heavy` / `qwen3.5-27B` | llmgpu (.8) | qwen3.5-27B (Dense) |
|
|
||||||
| `light` / `gemma-4` | ocu_llm (.110) | gemma-4-E4B (Light) |
|
|
||||||
|
|
||||||
## Quick Start
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Build & start
|
cd /opt/inference-harness
|
||||||
docker compose build
|
|
||||||
docker compose up -d
|
docker compose up -d
|
||||||
|
|
||||||
# Verify
|
|
||||||
curl http://localhost:8091/health
|
|
||||||
curl http://localhost:3001/api/status
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Dashboard
|
## Endpoints
|
||||||
|
|
||||||
- **UI:** `http://<host>:8080/dashboard/harness.html`
|
| URL | Purpose |
|
||||||
- **API:** `http://<host>:8080/dashboard/api/status`
|
|-----|---------|
|
||||||
|
| `/v1/chat/completions` | Inference API (OpenAI-compatible) |
|
||||||
|
| `/v1/models` | Available models |
|
||||||
|
| `/` | Dashboard (GPU health, routing, agents, timeseries) |
|
||||||
|
|
||||||
## Circuit Breaker
|
## Agent API Keys
|
||||||
|
|
||||||
- Rate limit: 10 req/s per IP
|
| Agent | Key |
|
||||||
- Burst: 20 requests
|
|-------|-----|
|
||||||
- Excess returns 503
|
| Abiba | `sk-syslog-abiba` |
|
||||||
- Queue fallback on GPU 502/503
|
| Mumuni | `sk-syslog-mumuni` |
|
||||||
|
| Tanko | `sk-syslog-tanko` |
|
||||||
## Production Migration
|
| Koby | `sk-syslog-koby` |
|
||||||
|
| Kagenz0 | `sk-syslog-kagenz0` |
|
||||||
See [MIGRATION_PLAN.md](./MIGRATION_PLAN.md)
|
|
||||||
|
|
||||||
---
|
|
||||||
*Built for Syslog Solution LLC — Quality over speed.*
|
|
||||||
|
|||||||
@@ -0,0 +1,7 @@
|
|||||||
|
FROM python:3.12-slim
|
||||||
|
WORKDIR /app
|
||||||
|
COPY requirements.txt .
|
||||||
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
COPY dashboard.py .
|
||||||
|
EXPOSE 3000
|
||||||
|
CMD ["python", "dashboard.py"]
|
||||||
@@ -0,0 +1,290 @@
|
|||||||
|
"""Harness Dashboard."""
|
||||||
|
import os, json, time, queue, threading
|
||||||
|
import requests
|
||||||
|
from flask import Flask, request, render_template_string, Response, stream_with_context
|
||||||
|
|
||||||
|
ROUTER_METRICS = os.environ.get("ROUTER_METRICS_URL", "http://router:9000/metrics")
|
||||||
|
|
||||||
|
app = Flask(__name__)
|
||||||
|
sse_subscribers = []
|
||||||
|
sse_lock = threading.Lock()
|
||||||
|
|
||||||
|
def fetch_state():
    """Return the router's current metrics snapshot.

    Issues a GET to ROUTER_METRICS with a 5 second timeout. On an HTTP 200
    reply the decoded JSON body is returned. For any other status, or if the
    request or decoding raises, an empty snapshot stamped with the current
    time is returned instead.
    """
    try:
        reply = requests.get(ROUTER_METRICS, timeout=5)
        if reply.status_code == 200:
            return reply.json()
    except Exception:
        # Deliberate best effort: fall through to the empty snapshot below.
        pass

    return {
        "gpus": [],
        "route_counts": {},
        "agent_counts": {},
        "recent": [],
        "timestamp": time.time(),
    }
|
||||||
|
|
||||||
|
def broadcast_loop():
    """Push a fresh metrics snapshot to every SSE subscriber every 3 seconds.

    Runs forever, so it is meant to be the target of a background thread.
    Each cycle sleeps 3 seconds, then (only if at least one subscriber queue
    is registered) fetches the state once, serialises it to JSON and enqueues
    the same payload on every queue in ``sse_subscribers``. Queues whose
    enqueue raises are removed from the subscriber list.
    """
    while True:
        time.sleep(3)

        with sse_lock:
            if not sse_subscribers:
                # Nobody is listening: skip the router round-trip and the
                # JSON serialisation entirely.
                continue

        # Fetch outside the lock so a slow router never blocks subscribers
        # that are registering or unregistering.
        payload = json.dumps(fetch_state())

        with sse_lock:
            stale = []
            for subscriber in sse_subscribers:
                try:
                    # put_nowait never blocks, so the lock cannot be held
                    # hostage by a full (bounded) queue.
                    subscriber.put_nowait(payload)
                except Exception:
                    stale.append(subscriber)
            for subscriber in stale:
                sse_subscribers.remove(subscriber)
|
||||||
|
|
||||||
|
threading.Thread(target=broadcast_loop, daemon=True).start()
|
||||||
|
|
||||||
|
DASHBOARD_HTML = r"""<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Inference Harness - Syslog Solution LLC</title>
|
||||||
|
<style>
|
||||||
|
:root {
|
||||||
|
--bg: #0a0e14; --card: #131820; --border: #1e2a3a; --text: #c9d1d9;
|
||||||
|
--dim: #5c6670; --accent: #39bae6; --green: #7fd962; --yellow: #ffb454;
|
||||||
|
--red: #f26d78; --blue: #59c2ff; --purple: #d2a6ff;
|
||||||
|
}
|
||||||
|
* { margin:0; padding:0; box-sizing:border-box; }
|
||||||
|
body {
|
||||||
|
font-family: -apple-system, BlinkMacSystemFont, 'SF Pro Display', 'Segoe UI', system-ui, sans-serif;
|
||||||
|
background: var(--bg); color: var(--text); min-height: 100vh;
|
||||||
|
padding: clamp(12px, 3vw, 32px);
|
||||||
|
}
|
||||||
|
.header {
|
||||||
|
display: flex; align-items: center; justify-content: space-between;
|
||||||
|
flex-wrap: wrap; gap: 12px; margin-bottom: 24px;
|
||||||
|
}
|
||||||
|
.header h1 { font-size: clamp(18px, 4vw, 26px); font-weight: 700; color: #fff; }
|
||||||
|
.header h1 span { color: var(--accent); }
|
||||||
|
.status-bar { display: flex; gap: 16px; align-items: center; flex-wrap: wrap; font-size: 13px; color: var(--dim); }
|
||||||
|
.status-dot { width: 8px; height: 8px; border-radius: 50%; display: inline-block; }
|
||||||
|
.status-dot.live { background: var(--green); animation: pulse 2s infinite; }
|
||||||
|
@keyframes pulse { 0%,100%{opacity:1} 50%{opacity:0.3} }
|
||||||
|
.grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(min(100%, 340px), 1fr)); gap: 16px; }
|
||||||
|
.card {
|
||||||
|
background: var(--card); border: 1px solid var(--border);
|
||||||
|
border-radius: 12px; padding: clamp(12px, 3vw, 20px);
|
||||||
|
}
|
||||||
|
.card-title {
|
||||||
|
font-size: 13px; font-weight: 600; text-transform: uppercase;
|
||||||
|
letter-spacing: 0.5px; color: var(--dim); margin-bottom: 14px;
|
||||||
|
}
|
||||||
|
.gpu-row {
|
||||||
|
display: flex; align-items: center; gap: 14px; padding: 10px 0;
|
||||||
|
border-bottom: 1px solid rgba(255,255,255,0.04);
|
||||||
|
}
|
||||||
|
.gpu-row:last-child { border-bottom: none; }
|
||||||
|
.gpu-icon {
|
||||||
|
width: 40px; height: 40px; border-radius: 10px; display: flex;
|
||||||
|
align-items: center; justify-content: center; font-size: 18px; flex-shrink: 0;
|
||||||
|
}
|
||||||
|
.gpu-icon.green { background: rgba(127,217,98,0.12); color: var(--green); }
|
||||||
|
.gpu-icon.yellow { background: rgba(255,180,84,0.12); color: var(--yellow); }
|
||||||
|
.gpu-icon.red { background: rgba(242,109,120,0.12); color: var(--red); }
|
||||||
|
.gpu-info { flex:1; min-width: 0; }
|
||||||
|
.gpu-name { font-size: 14px; font-weight: 600; color: #e6edf3; }
|
||||||
|
.gpu-metrics { display: flex; gap: 20px; flex-wrap: wrap; margin-top: 6px; }
|
||||||
|
.gpu-metric { font-size: 12px; }
|
||||||
|
.gpu-metric .label { color: var(--dim); }
|
||||||
|
.gpu-metric .value { color: #e6edf3; font-weight: 500; font-variant-numeric: tabular-nums; }
|
||||||
|
.vram-bar { width: 100%; height: 4px; background: rgba(255,255,255,0.06); border-radius: 2px; margin-top: 6px; overflow: hidden; }
|
||||||
|
.vram-fill { height: 100%; border-radius: 2px; transition: width 0.6s ease; }
|
||||||
|
.vram-fill.green { background: var(--green); }
|
||||||
|
.vram-fill.yellow { background: var(--yellow); }
|
||||||
|
.vram-fill.red { background: var(--red); }
|
||||||
|
.bar-row { margin-bottom: 10px; }
|
||||||
|
.bar-label { display: flex; justify-content: space-between; font-size: 12px; margin-bottom: 4px; }
|
||||||
|
.bar-label .name { color: #e6edf3; }
|
||||||
|
.bar-label .count { color: var(--dim); font-variant-numeric: tabular-nums; }
|
||||||
|
.bar-track { height: 6px; background: rgba(255,255,255,0.06); border-radius: 3px; overflow: hidden; }
|
||||||
|
.bar-fill { height: 100%; border-radius: 3px; transition: width 0.6s ease; }
|
||||||
|
.route-table { width: 100%; font-size: 12px; border-collapse: collapse; }
|
||||||
|
.route-table th, .route-table td { text-align: left; padding: 6px 10px; }
|
||||||
|
.route-table th { color: var(--dim); font-weight: 500; font-size: 11px; text-transform: uppercase; letter-spacing: 0.3px; border-bottom: 1px solid var(--border); }
|
||||||
|
.route-table td { border-bottom: 1px solid rgba(255,255,255,0.03); color: #b0b8c4; }
|
||||||
|
.agent-tag { display: inline-block; padding: 1px 7px; border-radius: 10px; font-size: 11px; font-weight: 600; }
|
||||||
|
.agent-abiba { background: rgba(57,186,230,0.15); color: var(--accent); }
|
||||||
|
.agent-mumuni { background: rgba(210,166,255,0.15); color: var(--purple); }
|
||||||
|
.agent-tanko { background: rgba(255,180,84,0.15); color: var(--yellow); }
|
||||||
|
.agent-koby { background: rgba(89,194,255,0.15); color: var(--blue); }
|
||||||
|
.agent-kagenz0 { background: rgba(127,217,98,0.15); color: var(--green); }
|
||||||
|
.agent-unknown { background: rgba(255,255,255,0.06); color: var(--dim); }
|
||||||
|
.agent-admin { background: rgba(255,255,255,0.08); color: #e6edf3; }
|
||||||
|
.full { grid-column: 1 / -1; }
|
||||||
|
.period-btn {
|
||||||
|
background: var(--card); border: 1px solid var(--border); color: var(--dim);
|
||||||
|
padding: 4px 12px; border-radius: 6px; font-size: 12px; cursor: pointer;
|
||||||
|
font-family: inherit; transition: all 0.2s;
|
||||||
|
}
|
||||||
|
.period-btn.active { background: var(--accent); color: #000; border-color: var(--accent); }
|
||||||
|
.period-btn:hover { border-color: var(--accent); color: #e6edf3; }
|
||||||
|
@media (max-width: 600px) {
|
||||||
|
.gpu-metrics { gap: 10px; }
|
||||||
|
.route-table { font-size: 11px; }
|
||||||
|
.route-table th, .route-table td { padding: 4px 6px; }
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="header">
|
||||||
|
<h1><span>⚡</span> Inference Harness</h1>
|
||||||
|
<div class="status-bar">
|
||||||
|
<span class="status-dot" id="live-dot"></span>
|
||||||
|
<span id="connection-status">connecting...</span>
|
||||||
|
<span id="update-time"></span>
|
||||||
|
<span id="total-requests">0 requests</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="grid">
|
||||||
|
<div class="card full">
|
||||||
|
<div class="card-title">GPU Health</div>
|
||||||
|
<div id="gpu-container">Loading...</div>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-title">Model Distribution</div>
|
||||||
|
<div id="route-bars">-</div>
|
||||||
|
</div>
|
||||||
|
<div class="card" style="grid-column: span 2">
|
||||||
|
<div class="card-title" style="display:flex;justify-content:space-between;align-items:center;flex-wrap:wrap;gap:4px">
|
||||||
|
<span>Usage Over Time</span>
|
||||||
|
<div style="display:flex;gap:4px">
|
||||||
|
<button class="period-btn active" onclick="switchPeriod('day')">24h</button>
|
||||||
|
<button class="period-btn" onclick="switchPeriod('week')">7d</button>
|
||||||
|
<button class="period-btn" onclick="switchPeriod('month')">30d</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div id="timeseries-chart" style="height:140px;position:relative;overflow:hidden">
|
||||||
|
<div style="color:var(--dim);font-size:13px;padding:50px 0;text-align:center">Loading...</div>
|
||||||
|
</div>
|
||||||
|
<div id="timeseries-legend" style="display:flex;gap:16px;justify-content:center;margin-top:8px;flex-wrap:wrap"></div>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-title">Agent Activity</div>
|
||||||
|
<div id="agent-bars">-</div>
|
||||||
|
</div>
|
||||||
|
<div class="card full">
|
||||||
|
<div class="card-title">Live Request Stream</div>
|
||||||
|
<div style="overflow-x:auto">
|
||||||
|
<table class="route-table">
|
||||||
|
<thead><tr><th>Time</th><th>Agent</th><th>Model</th><th>Reason</th><th>Tier</th></tr></thead>
|
||||||
|
<tbody id="route-tbody"><tr><td colspan="5">Waiting for data...</td></tr></tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<script>
|
||||||
|
const MODEL_COLORS = {'gemma-4-E4B':'#7fd962','qwen3.6-27B-code':'#ffb454','qwen3.6-35B-A3B':'#d2a6ff'};
|
||||||
|
const MODEL_LABELS = {'gemma-4-E4B':'Gemma 4B','qwen3.6-27B-code':'Qwen Code 27B','qwen3.6-35B-A3B':'Qwen MoE 35B'};
|
||||||
|
const GPU_LABELS = {'NVIDIA GeForce RTX 5070':'RTX 5070 - Gemma 4B','NVIDIA GeForce RTX 3090':'RTX 3090 - Qwen Code 27B','AMD Radeon (Strix Halo)':'Strix Halo - Qwen MoE 35B'};
|
||||||
|
|
||||||
|
// Map a GPU status string to its coloured icon markup. Anything other than
// 'healthy' or 'saturated' gets the red "down" icon.
function statusIcon(status) {
  switch (status) {
    case 'healthy':
      return '<span class="gpu-icon green">●</span>';
    case 'saturated':
      return '<span class="gpu-icon yellow">◉</span>';
    default:
      return '<span class="gpu-icon red">○</span>';
  }
}
|
||||||
|
// Colour class for a VRAM usage percentage: above 90 is 'red', above 75 is
// 'yellow', everything else is 'green'.
function vramClass(pct) {
  if (pct > 90) {
    return 'red';
  }
  return pct > 75 ? 'yellow' : 'green';
}
|
||||||
|
|
||||||
|
// Repaint every live panel (header totals, GPU health, model distribution,
// agent activity, recent-request table) from one /api/state snapshot.
// Does nothing when the snapshot is missing or has no `gpus` field.
function render(data) {
  if (!data || !data.gpus) return;

  // Snapshot values are interpolated into innerHTML, so escape them first.
  const esc = v => String(v).replace(/[&<>"']/g, ch => ({'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'})[ch]);
  // Single slug rule for both agent views. Digits are kept so that an agent
  // such as "Kagenz0" maps onto the .agent-kagenz0 CSS class.
  const agentClass = name => 'agent-' + String(name).toLowerCase().replace(/[^a-z0-9]/g, '');
  const byCountDesc = (x, y) => y[1] - x[1];
  const placeholder = text => '<div style="color:var(--dim);font-size:13px">' + text + '</div>';
  const byId = id => document.getElementById(id);

  // Header totals.
  const routeCounts = data.route_counts || {};
  const total = Object.values(routeCounts).reduce((sum, n) => sum + n, 0);
  byId('total-requests').textContent = total + ' requests';
  byId('update-time').textContent = new Date().toLocaleTimeString();

  // GPU health rows.
  const metric = (label, value) => '<div class="gpu-metric"><span class="label">' + label + '</span> <span class="value">' + value + '</span></div>';
  byId('gpu-container').innerHTML = (data.gpus || []).map(g => {
    const pct = g.vram_pct || 0;
    const level = vramClass(pct); // 'red' | 'yellow' | 'green', also the CSS variable name
    return '<div class="gpu-row">' + statusIcon(g.status)
      + '<div class="gpu-info"><div class="gpu-name">' + esc(GPU_LABELS[g.gpu_name] || g.gpu_name || g.id || '?') + '</div>'
      + '<div class="gpu-metrics">'
      + metric('VRAM', esc(g.vram_used_mb || '?') + '/' + esc(g.vram_total_mb || '?') + ' MB')
      + metric('Temp', esc(g.temp_c || '?') + 'C')
      + metric('Util', esc(g.gpu_util_pct || 0) + '%')
      + (g.power_w != null ? metric('Power', esc(g.power_w) + 'W') : '')
      + '</div>'
      + '<div class="vram-bar"><div class="vram-fill ' + level + '" style="width:' + esc(pct) + '%"></div></div></div>'
      + '<div style="font-size:24px;font-weight:700;color:var(--' + level + ');min-width:50px;text-align:right">' + esc(pct) + '%</div></div>';
  }).join('');

  // Model distribution bars, busiest model first, scaled to the largest count.
  const routeEntries = Object.entries(routeCounts).sort(byCountDesc);
  const maxRoute = Math.max(1, ...Object.values(routeCounts));
  byId('route-bars').innerHTML = routeEntries.length
    ? routeEntries.map(([model, count]) =>
        '<div class="bar-row"><div class="bar-label"><span class="name">' + esc(MODEL_LABELS[model] || model) + '</span>'
        + '<span class="count">' + esc(count) + ' (' + (total ? Math.round(count / total * 100) : 0) + '%)</span></div>'
        + '<div class="bar-track"><div class="bar-fill" style="width:' + (count / maxRoute * 100) + '%;background:' + (MODEL_COLORS[model] || '#39bae6') + '"></div></div></div>'
      ).join('')
    : placeholder('No data yet');

  // Agent activity bars, busiest agent first.
  const agentCounts = data.agent_counts || {};
  const agentEntries = Object.entries(agentCounts).sort(byCountDesc);
  const maxAgent = Math.max(1, ...Object.values(agentCounts));
  byId('agent-bars').innerHTML = agentEntries.length
    ? agentEntries.map(([agent, count]) =>
        '<div class="bar-row"><div class="bar-label"><span class="name ' + agentClass(agent) + '">' + esc(agent) + '</span>'
        + '<span class="count">' + esc(count) + ' reqs</span></div>'
        + '<div class="bar-track"><div class="bar-fill" style="width:' + (count / maxAgent * 100) + '%;background:var(--accent)"></div></div></div>'
      ).join('')
    : placeholder('No agent activity yet');

  // Recent requests table (at most 25 rows).
  const recent = data.recent || [];
  byId('route-tbody').innerHTML = recent.length
    ? recent.slice(0, 25).map(r => {
        const when = new Date(r.ts * 1000); // r.ts is multiplied by 1000 to get milliseconds
        const agent = r.agent || '?';
        const tierColor = r.tier === 'enterprise' ? 'var(--purple)' : r.tier === 'professional' ? 'var(--blue)' : 'var(--dim)';
        return '<tr><td style="color:var(--dim);font-size:11px">' + when.toLocaleTimeString() + '</td>'
          + '<td><span class="agent-tag ' + agentClass(agent) + '">' + esc(agent) + '</span></td>'
          + '<td>' + esc(MODEL_LABELS[r.model] || r.model) + '</td>'
          + '<td style="color:var(--dim);font-size:11px">' + esc(r.reason || '') + '</td>'
          + '<td style="font-size:11px;text-transform:uppercase;color:' + tierColor + '">' + esc(r.tier || '') + '</td></tr>';
      }).join('')
    : '<tr><td colspan="5" style="color:var(--dim)">Waiting for requests...</td></tr>';
}
|
||||||
|
|
||||||
|
let currentPeriod = 'day';
|
||||||
|
// Select the chart period ('day', 'week', anything else means 30d), highlight
// the matching period button and reload the time-series chart.
async function switchPeriod(p) {
  currentPeriod = p;
  const buttonText = p === 'day' ? '24h' : p === 'week' ? '7d' : '30d';
  for (const button of document.querySelectorAll('.period-btn')) {
    // Exactly the button(s) whose caption starts with the period text stay active.
    button.classList.toggle('active', button.textContent.trim().startsWith(buttonText));
  }
  await loadTimeseries();
}
|
||||||
|
// Fetch the time series for the currently selected period and draw it.
// Any fetch, parse or render error is ignored and the chart is left as it was.
async function loadTimeseries() {
  try {
    const response = await fetch('/api/timeseries?period=' + currentPeriod);
    const series = await response.json();
    renderTimeseries(series);
  } catch (e) {
    // Intentionally ignored.
  }
}
|
||||||
|
// Draw the per-model usage line chart and its legend from an
// /api/timeseries payload: `d.labels` is the list of x-axis labels and
// `d.models` maps a model name to its list of values.
function renderTimeseries(d) {
  const models = d.models || {}, labels = d.labels || [];
  const container = document.getElementById('timeseries-chart');
  const legend = document.getElementById('timeseries-legend');
  const modelNames = Object.keys(models);

  if (!labels.length || !modelNames.length) {
    // Also covers the empty-labels payload, so the initial "Loading..." text
    // or a chart from a previously selected period is never left on screen.
    container.innerHTML = '<div style="color:var(--dim);font-size:13px;padding:50px 0;text-align:center">No data yet</div>';
    legend.innerHTML = '';
    return;
  }

  // Labels and model names come from the server and go into innerHTML.
  const esc = v => String(v).replace(/[&<>"']/g, ch => ({'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'})[ch]);
  const colors = {'gemma-4-E4B': '#7fd962', 'qwen3.6-27B-code': '#ffb454', 'qwen3.6-35B-A3B': '#d2a6ff'};
  const shortNames = {'gemma-4-E4B': 'Gemma', 'qwen3.6-27B-code': 'Qwen Code', 'qwen3.6-35B-A3B': 'Qwen MoE'};
  const colorOf = m => colors[m] || '#39bae6';

  // Vertical scale: largest value plus 15% headroom, never below 1.
  let maxVal = 1;
  for (const m of modelNames) for (const v of models[m] || []) if (v > maxVal) maxVal = v;
  maxVal = Math.ceil(maxVal * 1.15) || 1;

  // The SVG viewBox is 100 units wide; W is the x distance between samples.
  const H = 130;
  const W = labels.length > 1 ? 100 / (labels.length - 1) : 100;

  const paths = modelNames.map(m => {
    const points = (models[m] || []).map((v, i) =>
      (i === 0 ? 'M' : 'L') + (i * W).toFixed(1) + ',' + (H - (v / maxVal) * H).toFixed(1) + ' '
    ).join('');
    return '<path d="' + points + '" fill="none" stroke="' + colorOf(m) + '" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round" opacity="0.85"/>';
  }).join('');

  // Five evenly spaced horizontal grid lines.
  let grid = '';
  for (let g = 0; g <= 4; g++) {
    const y = ((g / 4) * H).toFixed(1);
    grid += '<line x1="0" y1="' + y + '" x2="100" y2="' + y + '" stroke="rgba(255,255,255,0.05)" stroke-width="1"/>';
  }

  const svg = '<svg viewBox="0 0 100 ' + (H + 16) + '" style="width:100%;height:' + (H + 20) + 'px;display:block" preserveAspectRatio="none">' + grid + paths + '</svg>';

  // Show roughly eight x-axis labels regardless of how many samples exist.
  const step = Math.max(1, Math.floor(labels.length / 8));
  let axis = '<div style="display:flex;margin-top:2px;font-size:10px;color:var(--dim);overflow:hidden">';
  for (let i = 0; i < labels.length; i += step) axis += '<div style="flex:1;text-align:center">' + esc(labels[i]) + '</div>';
  axis += '</div>';

  container.innerHTML = svg + axis;
  // Fall back to the raw model name so unknown models never render as "undefined".
  legend.innerHTML = modelNames.map(m =>
    '<span style="display:flex;align-items:center;gap:6px;font-size:11px;color:var(--dim)"><svg width="18" height="10"><line x1="0" y1="5" x2="18" y2="5" stroke="' + colorOf(m) + '" stroke-width="2.5"/></svg>' + esc(shortNames[m] || m) + '</span>'
  ).join('');
}
|
||||||
|
|
||||||
|
// Fetch /api/state once, repaint the dashboard and update the connection
// indicator: "live" on success, "reconnecting..." on any failure.
function poll() {
  const showConnection = (text, dotClass) => {
    document.getElementById('connection-status').textContent = text;
    document.getElementById('live-dot').className = dotClass;
  };

  fetch('/api/state')
    .then(response => response.json())
    .then(snapshot => {
      render(snapshot);
      showConnection('live', 'status-dot live');
    })
    .catch(() => {
      showConnection('reconnecting...', 'status-dot');
    });
}
|
||||||
|
poll();setInterval(poll,3000);loadTimeseries();
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>"""
|
||||||
|
|
||||||
|
@app.route("/")
def dashboard():
    """Serve the dashboard page rendered from the DASHBOARD_HTML template."""
    page = render_template_string(DASHBOARD_HTML)
    return page
|
||||||
|
|
||||||
|
@app.route("/api/state")
def api_state():
    """Return the current metrics snapshot produced by fetch_state()."""
    snapshot = fetch_state()
    return snapshot
|
||||||
|
|
||||||
|
@app.route("/api/timeseries")
def api_timeseries():
    """Proxy the router's time-series metrics for the requested period.

    Reads the ``period`` query parameter (default ``"day"``) and forwards it
    to the router's ``/timeseries`` endpoint under ROUTER_METRICS with a
    5 second timeout. Returns the router's JSON on HTTP 200; for any other
    status or any exception an empty ``{"models": {}, "labels": []}`` payload
    is returned.
    """
    period = request.args.get("period", "day")
    # Derive the endpoint from the configurable metrics URL rather than a
    # second hard-coded router address; with the default value this is the
    # same URL as before.
    timeseries_url = ROUTER_METRICS.rstrip("/") + "/timeseries"
    try:
        # `params` URL-encodes the client-supplied value, so it cannot smuggle
        # extra query parameters or a fragment into the upstream request.
        reply = requests.get(timeseries_url, params={"period": period}, timeout=5)
        if reply.status_code == 200:
            return reply.json()
    except Exception:
        # Best effort: the chart falls back to its empty state.
        pass
    return {"models": {}, "labels": []}
|
||||||
|
|
||||||
|
@app.route("/api/stream")
def api_stream():
    """Stream metrics snapshots to the client as server-sent events.

    The generator registers a private queue in ``sse_subscribers``, emits one
    snapshot immediately, then forwards every payload placed on the queue.
    If nothing arrives within 3 seconds it fetches and emits a snapshot
    itself. The queue is unregistered when the generator finishes or is
    closed.
    """

    def event_stream():
        inbox = queue.Queue()
        with sse_lock:
            sse_subscribers.append(inbox)
        try:
            # Initial snapshot so the client has data straight away.
            yield "data: " + json.dumps(fetch_state()) + "\n\n"
            while True:
                try:
                    message = inbox.get(timeout=3)
                except queue.Empty:
                    # Nothing was broadcast in time; produce a snapshot here.
                    message = json.dumps(fetch_state())
                yield "data: " + message + "\n\n"
        except GeneratorExit:
            # Generator closed; cleanup happens in `finally`.
            pass
        finally:
            with sse_lock:
                if inbox in sse_subscribers:
                    sse_subscribers.remove(inbox)

    sse_headers = {
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
    }
    return Response(
        stream_with_context(event_stream()),
        mimetype="text/event-stream",
        headers=sse_headers,
    )
|
||||||
|
|
||||||
|
@app.route("/health")
def health():
    """Liveness endpoint: report the dashboard service as healthy."""
    body = {"status": "healthy", "service": "harness-dashboard"}
    return body
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
app.run(host="0.0.0.0", port=3000, debug=False)
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
flask==3.1.*
|
||||||
|
requests==2.32.*
|
||||||
+66
-16
@@ -1,27 +1,77 @@
|
|||||||
|
version: '3.8'
|
||||||
|
|
||||||
services:
|
services:
|
||||||
queue-service:
|
redis:
|
||||||
build: ./queue-service
|
image: redis:7-alpine
|
||||||
container_name: syslog-queue
|
container_name: harness-redis
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
ports:
|
ports:
|
||||||
- "8091:8091"
|
- "127.0.0.1:6379:6379"
|
||||||
|
volumes:
|
||||||
|
- redis-data:/data
|
||||||
|
command: redis-server --appendonly yes --maxmemory 256mb --maxmemory-policy allkeys-lru
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "redis-cli", "ping"]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 3s
|
||||||
|
retries: 5
|
||||||
|
|
||||||
|
router:
|
||||||
|
build: ./router
|
||||||
|
container_name: harness-router
|
||||||
|
restart: unless-stopped
|
||||||
|
ports:
|
||||||
|
- "9000:9000"
|
||||||
environment:
|
environment:
|
||||||
- REDIS_HOST=192.168.68.7
|
- REDIS_URL=redis://redis:6379
|
||||||
- REDIS_PORT=6379
|
- GPU_MOE_URL=http://192.168.68.15:8080/v1
|
||||||
networks:
|
- GPU_DENSE_URL=http://192.168.68.8:8080/v1
|
||||||
- harness-net
|
- GPU_LIGHT_URL=http://192.168.68.110:8080/v1
|
||||||
|
depends_on:
|
||||||
|
redis:
|
||||||
|
condition: service_healthy
|
||||||
|
|
||||||
|
litellm:
|
||||||
|
image: ghcr.io/berriai/litellm:main-stable
|
||||||
|
command: ["--config", "/app/config.yaml", "--port", "4000"]
|
||||||
|
container_name: harness-litellm
|
||||||
|
restart: unless-stopped
|
||||||
|
ports:
|
||||||
|
- "8081:4000"
|
||||||
|
volumes:
|
||||||
|
- ./litellm_config.yaml:/app/config.yaml
|
||||||
|
environment:
|
||||||
|
- LITELLM_MASTER_KEY=sk-syslog-local-master-key
|
||||||
|
depends_on:
|
||||||
|
redis:
|
||||||
|
condition: service_healthy
|
||||||
|
|
||||||
|
nginx:
  image: nginx:alpine
  container_name: harness-nginx
  restart: unless-stopped
  ports:
    - "80:80"
  volumes:
    - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
  depends_on:
    # nginx.conf declares upstreams for router, litellm and dashboard, and
    # nginx resolves upstream hostnames when it starts, so every proxied
    # service has to be started first.
    - router
    - litellm
    - dashboard
|
||||||
|
|
||||||
dashboard:
|
dashboard:
|
||||||
build: ./dashboard
|
build: ./dashboard
|
||||||
container_name: syslog-dashboard
|
container_name: harness-dashboard
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
ports:
|
ports:
|
||||||
- "3001:3001"
|
- "3000:3000"
|
||||||
|
environment:
|
||||||
|
- REDIS_URL=redis://redis:6379
|
||||||
|
- GPU_SIDECARS=192.168.68.15:8090,192.168.68.8:8090,192.168.68.110:8090
|
||||||
depends_on:
|
depends_on:
|
||||||
- queue-service
|
- redis
|
||||||
networks:
|
|
||||||
- harness-net
|
|
||||||
|
|
||||||
networks:
|
volumes:
|
||||||
harness-net:
|
redis-data:
|
||||||
driver: bridge
|
|
||||||
|
# NOTE: the `command` override on the litellm service (--config /app/config.yaml)
# was added to fix a config loading issue; it makes LiteLLM load the mounted config file.
|
||||||
|
|||||||
@@ -0,0 +1,25 @@
|
|||||||
|
model_list:
|
||||||
|
- model_name: qwen3.6-35B-A3B
|
||||||
|
litellm_params:
|
||||||
|
model: openai/qwen3.6-35B-A3B
|
||||||
|
api_base: http://192.168.68.15:8080/v1
|
||||||
|
api_key: "not-needed"
|
||||||
|
|
||||||
|
- model_name: qwen3.6-27B-code
|
||||||
|
litellm_params:
|
||||||
|
model: openai/qwen3.6-27B-code-text
|
||||||
|
api_base: http://192.168.68.8:8080/v1
|
||||||
|
api_key: "not-needed"
|
||||||
|
|
||||||
|
- model_name: gemma-4-E4B
|
||||||
|
litellm_params:
|
||||||
|
model: openai/gemma-4-E4B
|
||||||
|
api_base: http://192.168.68.110:8080/v1
|
||||||
|
api_key: "not-needed"
|
||||||
|
|
||||||
|
general_settings:
|
||||||
|
master_key: sk-syslog-local-master-key
|
||||||
|
|
||||||
|
litellm_settings:
|
||||||
|
drop_params: true
|
||||||
|
request_timeout: 120
|
||||||
@@ -0,0 +1,79 @@
|
|||||||
|
worker_processes auto;
|
||||||
|
error_log /var/log/nginx/error.log warn;
|
||||||
|
pid /var/run/nginx.pid;
|
||||||
|
|
||||||
|
events { worker_connections 1024; }
|
||||||
|
|
||||||
|
http {
|
||||||
|
include /etc/nginx/mime.types;
|
||||||
|
default_type application/octet-stream;
|
||||||
|
|
||||||
|
# Access-log format: the standard combined fields plus rt= (request
# processing time in seconds). The format must stay inside single quotes so
# the $variables reach nginx literally instead of being expanded by a shell.
log_format main '$remote_addr - $remote_user [$time_local] "$request" '
                '$status $body_bytes_sent "$http_referer" '
                '"$http_user_agent" rt=$request_time';
|
||||||
|
access_log /var/log/nginx/access.log main;
|
||||||
|
error_log /var/log/nginx/error.log;
|
||||||
|
sendfile on;
|
||||||
|
keepalive_timeout 65;
|
||||||
|
|
||||||
|
upstream router_api { server router:9000; }
|
||||||
|
upstream dashboard_ui { server dashboard:3000; }
|
||||||
|
upstream litellm_backend { server litellm:4000; }
|
||||||
|
|
||||||
|
server {
|
||||||
|
listen 80;
|
||||||
|
|
||||||
|
# Disable buffering for SSE streams
|
||||||
|
proxy_buffering off;
|
||||||
|
|
||||||
|
# API — through router
|
||||||
|
location /v1/ {
|
||||||
|
proxy_pass http://router_api;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header Authorization $http_authorization;
|
||||||
|
proxy_connect_timeout 10s;
|
||||||
|
proxy_read_timeout 300s;
|
||||||
|
proxy_buffering off;
|
||||||
|
}
|
||||||
|
|
||||||
|
# SSE streaming endpoint
|
||||||
|
location /stream {
|
||||||
|
proxy_pass http://router_api;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header Connection "";
|
||||||
|
proxy_buffering off;
|
||||||
|
chunked_transfer_encoding off;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Dashboard API proxy for SSE
|
||||||
|
location /api/ {
|
||||||
|
proxy_pass http://dashboard_ui;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_buffering off;
|
||||||
|
}
|
||||||
|
|
||||||
|
# LiteLLM debug
|
||||||
|
location /litellm/ {
|
||||||
|
rewrite ^/litellm/(.*) /$1 break;
|
||||||
|
proxy_pass http://litellm_backend;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header Authorization $http_authorization;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Dashboard
|
||||||
|
location / {
|
||||||
|
proxy_pass http://dashboard_ui;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_buffering off;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Liveness probe answered by nginx itself; no upstream is contacted.
location /health {
    # default_type sets the Content-Type of the `return` body. Using
    # add_header for this would send a second Content-Type header next to
    # the http-level default (application/octet-stream).
    default_type application/json;
    return 200 '{"status":"healthy"}';
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,9 @@
|
|||||||
|
FROM python:3.12-slim
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
COPY requirements.txt .
|
||||||
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
COPY router.py .
|
||||||
|
|
||||||
|
EXPOSE 9000
|
||||||
|
CMD ["python", "router.py"]
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
flask==3.1.*
|
||||||
|
redis==5.2.*
|
||||||
|
requests==2.32.*
|
||||||
@@ -0,0 +1,213 @@
|
|||||||
|
import os, json, time, logging, traceback, threading, queue
|
||||||
|
import requests, redis
|
||||||
|
from flask import Flask, request, jsonify, Response, stream_with_context
|
||||||
|
|
||||||
|
REDIS_URL = os.environ.get("REDIS_URL", "redis://redis:6379")
|
||||||
|
GPU_MOE_URL = os.environ.get("GPU_MOE_URL", "http://192.168.68.15:8080/v1")
|
||||||
|
GPU_DENSE_URL = os.environ.get("GPU_DENSE_URL", "http://192.168.68.8:8080/v1")
|
||||||
|
GPU_LIGHT_URL = os.environ.get("GPU_LIGHT_URL", "http://192.168.68.110:8080/v1")
|
||||||
|
|
||||||
|
GPU_SIDECARS = {
|
||||||
|
"qwen3.6-35B-A3B": "http://192.168.68.15:8090",
|
||||||
|
"qwen3.6-27B-code": "http://192.168.68.8:8090",
|
||||||
|
"gemma-4-E4B": "http://192.168.68.110:8090",
|
||||||
|
}
|
||||||
|
GPU_URLS = {
|
||||||
|
"qwen3.6-35B-A3B": GPU_MOE_URL,
|
||||||
|
"qwen3.6-27B-code": GPU_DENSE_URL,
|
||||||
|
"gemma-4-E4B": GPU_LIGHT_URL,
|
||||||
|
}
|
||||||
|
TIER_MODELS = {
|
||||||
|
"starter": ["gemma-4-E4B"],
|
||||||
|
"professional": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "gemma-4-E4B"],
|
||||||
|
"enterprise": ["qwen3.6-35B-A3B", "qwen3.6-27B-code", "gemma-4-E4B"],
|
||||||
|
}
|
||||||
|
API_KEYS = {
|
||||||
|
"sk-syslog-local-master-key": {"tier": "enterprise", "agent": "admin"},
|
||||||
|
"sk-syslog-abiba": {"tier": "enterprise", "agent": "Abiba"},
|
||||||
|
"sk-syslog-mumuni": {"tier": "enterprise", "agent": "Mumuni"},
|
||||||
|
"sk-syslog-tanko": {"tier": "enterprise", "agent": "Tanko"},
|
||||||
|
"sk-syslog-koby": {"tier": "enterprise", "agent": "Koby"},
|
||||||
|
"sk-syslog-kagenz0": {"tier": "enterprise", "agent": "Kagenz0"},
|
||||||
|
"sk-starter-abc123": {"tier": "starter", "agent": "test-starter"},
|
||||||
|
"sk-professional-xyz789": {"tier": "professional", "agent": "test-pro"},
|
||||||
|
}
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s [ROUTER] %(levelname)s %(message)s")
|
||||||
|
log = logging.getLogger("router")
|
||||||
|
try: r = redis.from_url(REDIS_URL, decode_responses=True); r.ping()
|
||||||
|
except Exception: r = None
|
||||||
|
|
||||||
|
app = Flask(__name__)
|
||||||
|
sse_subscribers = []; sse_lock = threading.Lock()
|
||||||
|
|
||||||
|
def check_gpu_health(model):
    """Query the GPU sidecar for ``model`` and summarise its health.

    Args:
        model: Model name used as the key into GPU_SIDECARS.

    Returns:
        ``{"status": "unknown"}`` when the model has no sidecar URL;
        ``{"status": "down"}`` when the sidecar does not answer HTTP 200
        within 5 seconds or its reply cannot be processed; otherwise a dict
        with ``status`` ("healthy" below 90% VRAM usage, else "saturated"),
        the computed ``vram_pct`` and the sidecar's raw ``vram_used_mb``,
        ``vram_total_mb``, ``temp_c``, ``gpu_util_pct``, ``gpu_name``,
        ``power_w`` and ``power_limit_w`` values.
    """
    url = GPU_SIDECARS.get(model)
    if not url:
        return {"status": "unknown"}
    try:
        resp = requests.get(url, timeout=5)
        if resp.status_code == 200:
            d = resp.json()
            # A sidecar may report null VRAM figures; treat them as 0 instead
            # of letting the arithmetic fail and mark a live GPU as down.
            used = d.get("vram_used_mb") or 0
            total = d.get("vram_total_mb") or 0
            pct = (used / max(total, 1)) * 100  # max() guards against a zero total
            return {
                "status": "healthy" if pct < 90 else "saturated",
                "vram_used_mb": d.get("vram_used_mb"),
                "vram_total_mb": d.get("vram_total_mb"),
                "vram_pct": round(pct, 1),
                "temp_c": d.get("temp_c"),
                "gpu_util_pct": d.get("gpu_util_pct"),
                "gpu_name": d.get("gpu_name"),
                "power_w": d.get("power_w"),
                "power_limit_w": d.get("power_limit_w"),
            }
    except Exception:
        # Best effort: an unreachable or malformed sidecar counts as "down",
        # but leave a trace for debugging instead of failing silently.
        log.debug("GPU health check failed for %s", model, exc_info=True)
    return {"status": "down"}
|
||||||
|
|
||||||
|
def available_models():
    """Return the ``GPU_URLS`` models whose health status is healthy or saturated.

    Every call probes each model's sidecar through ``check_gpu_health``.
    """
    usable = []
    for name in GPU_URLS:
        if check_gpu_health(name)["status"] in ("healthy", "saturated"):
            usable.append(name)
    return usable
|
||||||
|
|
||||||
|
def estimate_tokens(msgs):
    """Roughly estimate the token count of a chat message list.

    The ``content`` of every message is stringified (a missing key counts as
    an empty string), the character lengths are summed, and the total is
    integer-divided by four.
    """
    total_chars = 0
    for message in msgs:
        total_chars += len(str(message.get("content", "")))
    return total_chars // 4
|
||||||
|
|
||||||
|
def route(rd, tier):
    """Choose the backend model for a chat request.

    ``rd`` is the request body and ``tier`` the caller's tier. Candidates
    are the currently available models that the tier allows (unknown tiers
    are limited to ``gemma-4-E4B``). The result is ``{"model", "reason"}``,
    decided by the first matching rule:

    1. nothing available        -> first allowed model, ``all_saturated``
    2. explicit ``model``       -> that model if available, else the first
                                   candidate, ``explicit``
    3. ``routing_hints`` priority ``speed`` / ``quality``
    4. > 4000 estimated tokens, a system message, or > 6 turns
                                -> ``heavy_reasoning``
    5. first message of <= 3 words, <= 1 turn, no system message
                                -> ``ultra_light``
    6. ``qwen3.6-35B-A3B`` if available -> ``default_moe``
    7. otherwise the first candidate    -> ``fallback``
    """
    messages = rd.get("messages", [])
    token_estimate = estimate_tokens(messages)
    has_system = any(m.get("role") == "system" for m in messages)
    turns = sum(1 for m in messages if m.get("role") in ("user", "assistant"))
    hints = rd.get("routing_hints", {})
    allowed = TIER_MODELS.get(tier, ["gemma-4-E4B"])
    candidates = [name for name in available_models() if name in allowed]

    if not candidates:
        return {"model": allowed[0], "reason": "all_saturated"}

    requested = rd.get("model", "auto")
    if requested != "auto":
        chosen = requested if requested in candidates else candidates[0]
        return {"model": chosen, "reason": "explicit"}

    if hints:
        priority = hints.get("priority")
        if priority == "speed" and "gemma-4-E4B" in candidates:
            return {"model": "gemma-4-E4B", "reason": "hint_speed"}
        if priority == "quality" and "qwen3.6-27B-code" in candidates:
            return {"model": "qwen3.6-27B-code", "reason": "hint_quality"}

    if token_estimate > 4000 or has_system or turns > 6:
        heavy_order = ("qwen3.6-27B-code", "qwen3.6-35B-A3B", "gemma-4-E4B")
        heavy_pick = next((name for name in heavy_order if name in candidates), None)
        if heavy_pick is not None:
            return {"model": heavy_pick, "reason": "heavy_reasoning"}

    opening = messages[0].get("content", "") if messages else ""
    # Non-string content (e.g. structured parts) is never treated as "ultra light".
    word_count = len(opening.split()) if isinstance(opening, str) else 99
    is_tiny = word_count <= 3 and turns <= 1 and not has_system
    if is_tiny and "gemma-4-E4B" in candidates:
        return {"model": "gemma-4-E4B", "reason": "ultra_light"}

    if "qwen3.6-35B-A3B" in candidates:
        return {"model": "qwen3.6-35B-A3B", "reason": "default_moe"}
    return {"model": candidates[0], "reason": "fallback"}
|
||||||
|
|
||||||
|
# Typographic punctuation -> plain ASCII, applied in a single pass.
_ASCII_PUNCTUATION = str.maketrans({
    "\u2014": "-",    # em dash
    "\u2013": "-",    # en dash
    "\u2018": "'",    # left single quotation mark
    "\u2019": "'",    # right single quotation mark
    "\u201c": '"',    # left double quotation mark
    "\u201d": '"',    # right double quotation mark
    "\u2026": "...",  # horizontal ellipsis
    "\u00a0": " ",    # no-break space
})


def clean_unicode(text):
    """Replace typographic punctuation in *text* with ASCII equivalents.

    Dashes become ``-``, curly single/double quotes become ``'`` / ``"``,
    the ellipsis character becomes ``...`` and the no-break space becomes a
    regular space. Non-string values are returned unchanged.

    The previous implementation had lost its quote literals: a left curly
    quote was replaced with a fragment of source text and right curly quotes
    were never replaced at all.
    """
    if not isinstance(text, str):
        return text
    return text.translate(_ASCII_PUNCTUATION)
|
||||||
|
|
||||||
|
def clean_response(d):
    """Recursively apply ``clean_unicode`` to every string inside *d*.

    Dict values and list items are walked recursively and rebuilt as new
    containers; dict keys are left as they are. Any other value is returned
    unchanged.
    """
    if isinstance(d, str):
        return clean_unicode(d)
    if isinstance(d, list):
        return [clean_response(item) for item in d]
    if isinstance(d, dict):
        return {key: clean_response(value) for key, value in d.items()}
    return d
|
||||||
|
|
||||||
|
def get_metrics():
    """Build the metrics snapshot served to the dashboard.

    The snapshot holds one health entry per model in ``GPU_URLS`` plus, when
    Redis is connected, per-model / per-agent / per-tier route counters and
    the 50 most recent routing records. Redis errors are ignored, leaving
    whatever was collected up to that point.
    """
    snapshot = {
        "gpus": [],
        "route_counts": {},
        "agent_counts": {},
        "tier_counts": {},
        "recent": [],
        "timestamp": time.time(),
    }

    passthrough = (
        "status", "vram_used_mb", "vram_total_mb", "vram_pct",
        "temp_c", "gpu_util_pct", "power_w", "power_limit_w",
    )
    for name in GPU_URLS:
        report = check_gpu_health(name)
        # Fall back to the model id when the sidecar gave no GPU name.
        entry = {"id": name, "gpu_name": report.get("gpu_name", name)}
        for field in passthrough:
            entry[field] = report.get(field)
        snapshot["gpus"].append(entry)

    if not r:
        return snapshot

    try:
        for name in GPU_URLS:
            snapshot["route_counts"][name] = int(r.get("routes:" + name) or 0)
        for info in API_KEYS.values():
            agent = info["agent"]
            hits = int(r.get("routes:agent:" + agent) or 0)
            if hits > 0:
                snapshot["agent_counts"][agent] = hits
        for tier in TIER_MODELS:
            snapshot["tier_counts"][tier] = int(r.get("routes:tier:" + tier) or 0)
        recent_raw = r.lrange("routes:recent", 0, 49)
        snapshot["recent"] = [json.loads(item) for item in recent_raw] if recent_raw else []
    except Exception:
        # Counters are best-effort; serve the partial snapshot.
        pass
    return snapshot
|
||||||
|
|
||||||
|
def bcast():
    """Push a fresh metrics snapshot to every connected SSE subscriber.

    The snapshot is serialised once; subscribers whose queue rejects the
    payload are dropped from ``sse_subscribers``.
    """
    payload = json.dumps(get_metrics())
    with sse_lock:
        unreachable = []
        for subscriber in sse_subscribers:
            try:
                subscriber.put(payload)
            except Exception:
                unreachable.append(subscriber)
        # Remove after the loop so the list is not mutated while iterating.
        for subscriber in unreachable:
            sse_subscribers.remove(subscriber)
|
||||||
|
|
||||||
|
@app.route("/v1/chat/completions", methods=["POST"])
def chat():
    """Route a chat-completion request to a GPU backend and relay the answer.

    The bearer token selects tier and agent (unknown tokens are treated as
    tier ``starter`` / agent ``unknown``), ``route`` picks the model, and
    the request body is forwarded unchanged to that model's backend.

    Responses:
      * 200 - backend JSON (with a ``routing`` block added) or, for
              ``stream`` requests, the relayed event stream;
      * 400 - the request body is not a JSON object;
      * 502 - the backend answered with a non-200 status;
      * 504 - the backend call timed out;
      * 500 - any other error.
    """
    try:
        rd = request.get_json(force=True)
        if not isinstance(rd, dict):
            # Without this guard a JSON array/null body crashed into a 500.
            return jsonify({"error": "request body must be a JSON object"}), 400

        ak = request.headers.get("Authorization", "").replace("Bearer ", "")
        key_info = API_KEYS.get(ak, {"tier": "starter", "agent": "unknown"})
        tier, agent = key_info["tier"], key_info["agent"]

        decision = route(rd, tier)
        model, reason = decision["model"], decision["reason"]
        url = GPU_URLS[model]
        is_stream = rd.get("stream", False)
        log.info("ROUTE: %s -> %s (%s) stream=%s", agent, model, reason, is_stream)

        if r:
            try:
                r.incr("routes:" + model)
                r.incr("routes:tier:" + tier)
                r.incr("routes:agent:" + agent)
                # Hourly bucket in UTC: /metrics/timeseries reads these keys
                # with time.gmtime(), so writing local time would misalign
                # the buckets whenever the host is not on UTC.
                r.incr("ts:" + model + ":" + time.strftime("%Y%m%d%H", time.gmtime()))
                r.lpush("routes:recent", json.dumps({"ts": time.time(), "model": model, "reason": reason, "tier": tier, "agent": agent}))
                r.ltrim("routes:recent", 0, 999)
            except Exception:
                # Accounting is best-effort and must never fail the request.
                log.debug("route accounting skipped", exc_info=True)

        start = time.time()
        resp = requests.post(
            url + "/chat/completions",
            json=rd,
            headers={"Content-Type": "application/json", "Authorization": "Bearer not-needed"},
            timeout=120,
            stream=is_stream,
        )
        lat = int((time.time() - start) * 1000)

        if resp.status_code != 200:
            # Release the upstream connection (matters for stream=True).
            resp.close()
            return jsonify({"error": "GPU error " + str(resp.status_code)}), 502

        if is_stream:
            def gen():
                try:
                    for raw in resp.iter_content(chunk_size=None, decode_unicode=True):
                        if raw:
                            yield clean_unicode(raw)
                    bcast()
                finally:
                    # Also runs when the client disconnects mid-stream.
                    resp.close()
            return Response(stream_with_context(gen()), mimetype="text/event-stream")

        data = clean_response(resp.json())
        for choice in data.get("choices", []):
            msg = choice.get("message", {})
            # Surface reasoning text when the backend left "content" empty.
            if not msg.get("content") and msg.get("reasoning_content"):
                msg["content"] = msg["reasoning_content"]
        data["routing"] = {"model": model, "reason": reason, "gpu": url, "tier": tier, "agent": agent, "latency_ms": lat}
        bcast()
        return jsonify(data)
    except requests.Timeout:
        return jsonify({"error": "timeout"}), 504
    except Exception as e:
        log.error("Error: %s\n%s", e, traceback.format_exc())
        return jsonify({"error": str(e)}), 500
|
||||||
|
|
||||||
|
@app.route("/v1/models")
def models():
    """List every model in ``GPU_URLS`` with its health status and GPU name.

    Each model is probed exactly once per request; the earlier version
    called ``check_gpu_health`` twice per model, doubling the sidecar
    round-trips and allowing ``status`` and ``gpu`` to come from different
    probes.
    """
    entries = []
    for model_id in GPU_URLS:
        report = check_gpu_health(model_id)
        entries.append({
            "id": model_id,
            "object": "model",
            "owned_by": "syslog",
            "status": report.get("status"),
            "gpu": report.get("gpu_name"),
        })
    return jsonify({"object": "list", "data": entries})
|
||||||
|
|
||||||
|
@app.route("/health")
def health():
    """Report router health: Redis connectivity and per-GPU status.

    ``available_models`` lists the models whose status is healthy or
    saturated. It is derived from the same probe results as ``gpus`` so each
    sidecar is queried once per request and both fields always agree (the
    earlier version probed every GPU a second time via
    ``available_models()``).
    """
    gpus = {model_id: check_gpu_health(model_id) for model_id in GPU_URLS}
    available = [
        model_id
        for model_id, report in gpus.items()
        if report["status"] in ("healthy", "saturated")
    ]
    return jsonify({
        "status": "healthy",
        "redis": "connected" if r else "down",
        "gpus": gpus,
        "available_models": available,
    })
|
||||||
|
|
||||||
|
@app.route("/metrics")
def metrics():
    """Serve the current metrics snapshot as JSON."""
    snapshot = get_metrics()
    return jsonify(snapshot)
|
||||||
|
|
||||||
|
@app.route("/metrics/timeseries")
def metrics_timeseries():
    """Return per-model request counts bucketed over time (UTC).

    Query parameter ``period``:
      * ``day`` (default) - 24 hourly buckets, labels ``HH:00``;
      * ``week``          - 7 daily buckets, labels are weekday names;
      * anything else     - 30 daily buckets, labels ``MM/DD``.

    Counters live in Redis under ``ts:<model>:<YYYYMMDDHH>``; a daily bucket
    is the sum of its 24 hourly keys. Without Redis, ``models`` is empty.
    """
    period = request.args.get("period", "day")
    # Sample the clock once so bucket keys and labels describe the same
    # instants even when the request straddles an hour/day boundary.
    now = time.time()

    hourly = period == "day"
    if hourly:
        stamps = [time.gmtime(now - h * 3600) for h in range(23, -1, -1)]
        key_format, label_format = "%Y%m%d%H", "%H:00"
    elif period == "week":
        stamps = [time.gmtime(now - d * 86400) for d in range(6, -1, -1)]
        key_format, label_format = "%Y%m%d", "%a"
    else:
        stamps = [time.gmtime(now - d * 86400) for d in range(29, -1, -1)]
        key_format, label_format = "%Y%m%d", "%m/%d"

    buckets = [time.strftime(key_format, stamp) for stamp in stamps]
    data = {"models": {}, "labels": [time.strftime(label_format, stamp) for stamp in stamps]}

    if r:
        for model in GPU_URLS:
            counts = []
            for bucket in buckets:
                prefix = "ts:" + model + ":" + bucket
                if hourly:
                    total = int(r.get(prefix) or 0)
                else:
                    # Every non-"day" period uses daily buckets, so always
                    # sum the hourly keys. Previously an unrecognised period
                    # looked up the bare daily key and reported all zeros.
                    total = sum(int(r.get(prefix + "{:02d}".format(hh)) or 0) for hh in range(24))
                counts.append(total)
            data["models"][model] = counts
    return jsonify(data)
|
||||||
|
|
||||||
|
@app.route("/stream")
def stream():
    """Server-sent-events feed of metrics snapshots.

    Each client gets its own queue registered in ``sse_subscribers``. The
    client receives one snapshot immediately, then every payload placed on
    its queue; when nothing arrives within 3 seconds a fresh snapshot is
    sent instead. The queue is unregistered when the generator finishes.
    """
    def frame(payload):
        # One SSE "data:" event.
        return "data: " + payload + "\n\n"

    def ev():
        inbox = queue.Queue()
        with sse_lock:
            sse_subscribers.append(inbox)
        try:
            yield frame(json.dumps(get_metrics()))
            while True:
                try:
                    payload = inbox.get(timeout=3)
                except queue.Empty:
                    payload = json.dumps(get_metrics())
                yield frame(payload)
        except GeneratorExit:
            pass
        finally:
            with sse_lock:
                if inbox in sse_subscribers:
                    sse_subscribers.remove(inbox)

    sse_headers = {
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
    }
    return Response(stream_with_context(ev()), mimetype="text/event-stream", headers=sse_headers)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Direct-execution entry point: serve on all interfaces, port 9000,
    # with Flask's debug mode off.
    log.info("Router on :9000")
    app.run(
        debug=False,
        port=9000,
        host="0.0.0.0",
    )
|
||||||
Reference in New Issue
Block a user