Add GPU dashboard container + Nginx routing
This commit is contained in:
@@ -0,0 +1,14 @@
|
|||||||
|
# GPU dashboard image: runs the metrics collector and a static file server
# (port 8092) side by side in one container.
FROM python:3.11-slim

# Flush Python stdout/stderr immediately so collector log lines show up in
# `docker logs` instead of sitting in a block buffer.
ENV PYTHONUNBUFFERED=1

# --no-cache-dir keeps pip's download cache out of the image layer.
# NOTE(review): gpu_collector.py as committed only uses urllib; confirm whether
# `requests` is still needed, and pin its version if it is.
RUN pip install --no-cache-dir requests

WORKDIR /app
COPY gpu-dashboard/ /app/

# /app/public is the web root served by http.server. The collector rewrites
# gpu_metrics.json (via a .tmp file in the same directory) at runtime, so the
# directory is handed to the unprivileged user the container runs as.
RUN useradd --system --uid 10001 --no-create-home gpudash && \
    mkdir -p /app/public && \
    cp gpu.html /app/public/ && \
    touch /app/public/gpu_metrics.json && \
    chown -R gpudash /app/public

USER gpudash

EXPOSE 8092

# Both processes are backgrounded and the shell `wait`s on them, so the
# container keeps running until both have exited.
CMD ["sh", "-c", "python3 gpu_collector.py & python3 -m http.server 8092 --directory /app/public & wait"]
|
||||||
@@ -0,0 +1,54 @@
|
|||||||
|
# Compose stack for the GPU router: Redis, the queue service, the harness
# dashboard and the GPU dashboard, all attached to one bridge network.
version: "3.8"

services:
  redis:
    image: redis:7-alpine
    restart: always
    networks: [gpu-router-net]
    volumes:
      - redis-data:/data

  queue-service:
    build:
      context: .
      dockerfile: Dockerfile.queue
    restart: always
    networks: [gpu-router-net]
    ports: ["8091:8091"]
    depends_on: [redis]
    environment:
      REDIS_HOST: redis
      REDIS_PORT: "6379"

  dashboard:
    build:
      context: .
      dockerfile: Dockerfile.dashboard
    restart: always
    networks: [gpu-router-net]
    ports: ["3001:3001"]
    depends_on: [redis]

  # Static GPU monitor page plus the metrics collector (see Dockerfile.gpu).
  gpu-dashboard:
    build:
      context: .
      dockerfile: Dockerfile.gpu
    restart: always
    networks: [gpu-router-net]
    ports: ["8092:8092"]

networks:
  gpu-router-net:
    driver: bridge

volumes:
  redis-data: {}
|
||||||
@@ -0,0 +1,183 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>GPU Monitor</title>
|
||||||
|
<style>
|
||||||
|
* { margin: 0; padding: 0; box-sizing: border-box; }
|
||||||
|
body { background: #0d1117; color: #c9d1d9; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; padding: 20px; }
|
||||||
|
h1 { font-size: 1.3em; margin-bottom: 4px; }
|
||||||
|
.topbar { display: flex; justify-content: space-between; align-items: center; margin-bottom: 20px; padding-bottom: 12px; border-bottom: 1px solid #21262d; }
|
||||||
|
.topbar .status { font-size: 0.85em; color: #8b949e; }
|
||||||
|
.topbar .status .dot { display: inline-block; width: 8px; height: 8px; border-radius: 50%; margin-right: 6px; }
|
||||||
|
.dot.green { background: #3fb950; }
|
||||||
|
.dot.yellow { background: #d2991d; }
|
||||||
|
.dot.red { background: #f85149; }
|
||||||
|
.cards { display: grid; grid-template-columns: repeat(auto-fit, minmax(320px, 1fr)); gap: 16px; }
|
||||||
|
.card { background: #161b22; border: 1px solid #21262d; border-radius: 8px; padding: 16px; }
|
||||||
|
.card.stale { opacity: 0.5; }
|
||||||
|
.card.dead { opacity: 0.3; border-color: #f85149; }
|
||||||
|
.card-header { display: flex; justify-content: space-between; align-items: center; margin-bottom: 12px; }
|
||||||
|
.card-header .name { font-weight: 600; font-size: 1.05em; }
|
||||||
|
.card-header .host { font-size: 0.8em; color: #8b949e; }
|
||||||
|
.card-header .state { font-size: 0.75em; padding: 2px 8px; border-radius: 10px; font-weight: 600; }
|
||||||
|
.state.idle { background: #1b3826; color: #3fb950; }
|
||||||
|
.state.busy { background: #3d1f1a; color: #f85149; }
|
||||||
|
.state.unknown { background: #21262d; color: #8b949e; }
|
||||||
|
.metric { margin-bottom: 10px; }
|
||||||
|
.metric-label { display: flex; justify-content: space-between; font-size: 0.82em; color: #8b949e; margin-bottom: 2px; }
|
||||||
|
.metric-label .val { color: #c9d1d9; font-weight: 500; }
|
||||||
|
.bar { height: 6px; border-radius: 3px; background: #21262d; overflow: hidden; }
|
||||||
|
.bar-fill { height: 100%; border-radius: 3px; transition: width 0.5s ease; }
|
||||||
|
.bar-fill.temp-cool { background: #3fb950; }
|
||||||
|
.bar-fill.temp-warm { background: #d2991d; }
|
||||||
|
.bar-fill.temp-hot { background: #f85149; }
|
||||||
|
.bar-fill.util { background: #58a6ff; }
|
||||||
|
.bar-fill.vram { background: #bc8cff; }
|
||||||
|
.bar-fill.power { background: #f0883e; }
|
||||||
|
.model-line { font-size: 0.82em; color: #8b949e; margin-top: 8px; padding-top: 8px; border-top: 1px solid #21262d; }
|
||||||
|
.model-line span { color: #c9d1d9; }
|
||||||
|
.error { color: #f85149; font-size: 0.85em; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="topbar">
|
||||||
|
<div>
|
||||||
|
<h1><a href="/" style="color:#58a6ff;text-decoration:none;">← Workspace</a> · GPU Monitor</h1>
|
||||||
|
<span class="status"><span class="dot green" id="status-dot"></span><span id="status-text">Loading...</span></span>
|
||||||
|
</div>
|
||||||
|
<div class="status" id="age">—</div>
|
||||||
|
</div>
|
||||||
|
<div class="cards" id="cards"></div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// Polling period for refresh(), in milliseconds.
const INTERVAL = 5000;
// Epoch seconds of the last successful metrics fetch; null until one succeeds.
let lastFetchTime = null;

// Rewrites the "age" label in the top bar: a dash before the first successful
// fetch, then "updated Ns ago" up to 60 seconds and "stale Ns ago" beyond that.
function updateClock() {
  const label = document.getElementById('age');
  if (!lastFetchTime) {
    label.textContent = '—';
    return;
  }
  const nowSeconds = Date.now() / 1000;
  const age = Math.round(nowSeconds - lastFetchTime);
  if (age <= 60) {
    label.textContent = `updated ${age}s ago`;
  } else {
    label.textContent = `stale ${age}s ago`;
  }
}

// Tick the age label once per second, independently of the fetch interval.
setInterval(updateClock, 1000);
|
||||||
|
|
||||||
|
// Temperature thresholds (°C) above which the bar turns warm / hot.
const TEMP_WARN = 70;
const TEMP_HOT = 82;
// VRAM usage thresholds (percent) above which the bar turns warm / hot.
const VRAM_WARN = 80;
const VRAM_HOT = 92;

// Picks the bar colour class for a temperature reading in °C.
function tempClass(c) {
  if (c > TEMP_HOT) return 'temp-hot';
  if (c > TEMP_WARN) return 'temp-warm';
  return 'temp-cool';
}

// Picks the bar colour class for a VRAM usage percentage (reuses the temp-* classes).
function vramClass(pct) {
  if (pct > VRAM_HOT) return 'temp-hot';
  if (pct > VRAM_WARN) return 'temp-warm';
  return 'temp-cool';
}
|
||||||
|
// Returns val as a whole-number percentage of max; 0 when max is falsy (avoids dividing by zero).
function pct(val, max) {
  if (!max) {
    return 0;
  }
  const ratio = val / max;
  return Math.round(ratio * 100);
}
|
||||||
|
// Formats a size in megabytes as gigabytes with one decimal place.
// Returns '—' only when the value is missing or not a finite number; a genuine
// 0 MB reading is formatted as "0.0" rather than being treated as absent.
function mbToGB(mb) {
  if (mb === null || mb === undefined || mb === '') return '—';
  const n = Number(mb);
  if (!Number.isFinite(n)) return '—';
  return (n / 1024).toFixed(1);
}
|
||||||
|
|
||||||
|
// Escapes a value for safe interpolation into HTML text or a double-quoted attribute.
function escapeHtml(value) {
  const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  return String(value).replace(/[&<>"']/g, ch => entities[ch]);
}

// Clamps a metric to a CSS width percentage in [0, 100]; non-numeric input yields 0.
function barWidth(value) {
  const n = Number(value);
  return Number.isFinite(n) ? Math.min(100, Math.max(0, n)) : 0;
}

// Builds one metric row: a label/value line plus, when fillClass is given, a progress bar.
function metricRow(label, valueText, fillClass, width) {
  let row = `<div class="metric"><div class="metric-label"><span>${label}</span><span class="val">${escapeHtml(valueText)}</span></div>`;
  if (fillClass) {
    row += `<div class="bar"><div class="bar-fill ${fillClass}" style="width:${barWidth(width)}%"></div></div>`;
  }
  return row + `</div>`;
}

/**
 * Renders one GPU entry from gpu_metrics.json as a card HTML string.
 *
 * Reads g.gpu_name, g.host, g.online, g.stale, g.hardware and g.inference.
 * Every value taken from the JSON is HTML-escaped before interpolation,
 * because the result is assigned to innerHTML by the caller.
 *
 * @param {object} g one element of the "gpus" array
 * @returns {string} HTML for a single .card element
 */
function renderCard(g) {
  const hw = g.hardware || {};
  const inf = g.inference || {};
  const online = g.online !== false; // a missing flag counts as online
  const stale = g.stale === true;

  let cardClass = '';
  if (!online) cardClass = 'dead';
  else if (stale) cardClass = 'stale';

  let stateClass = inf.state || 'unknown';
  let stateLabel = inf.state ? String(inf.state).toUpperCase() : 'UNKNOWN';
  if (!online) { stateClass = 'unknown'; stateLabel = 'OFFLINE'; }

  const temp = hw.temp_c;
  const util = hw.gpu_util_pct;
  const vramUsed = hw.vram_used_mb;
  const vramTotal = hw.vram_total_mb;
  const power = hw.power_w;
  const powerLimit = hw.power_limit_w;
  const fan = hw.fan_pct;

  let html = `<div class="card ${cardClass}">`;
  html += `<div class="card-header">`;
  html += `<div><div class="name">${escapeHtml(g.gpu_name)}</div><div class="host">${escapeHtml(g.host)}</div></div>`;
  html += `<div class="state ${escapeHtml(stateClass)}">${escapeHtml(stateLabel)}</div>`;
  html += `</div>`;

  if (!online) {
    html += `<div class="error">Unreachable</div>`;
  } else if (hw.error) {
    html += `<div class="error">${escapeHtml(hw.error)}</div>`;
  } else {
    // Temperature
    if (temp != null) {
      html += metricRow('Temperature', `${temp}°C`, tempClass(temp), temp);
    }
    // Utilization
    if (util != null) {
      html += metricRow('GPU Utilization', `${util}%`, 'util', util);
    }
    // VRAM
    if (vramUsed != null && vramTotal != null) {
      const vramPct = pct(vramUsed, vramTotal);
      html += metricRow('VRAM', `${mbToGB(vramUsed)} / ${mbToGB(vramTotal)} GB`, vramClass(vramPct), vramPct);
    }
    // Power: the bar is only drawn when a power limit is known
    if (power != null) {
      const powerText = powerLimit ? `${power}W / ${powerLimit}W` : `${power}W`;
      html += metricRow('Power', powerText, powerLimit ? 'power' : null, powerLimit ? pct(power, powerLimit) : 0);
    }
    // Fan (NVIDIA only)
    if (fan != null) {
      html += metricRow('Fan Speed', `${fan}%`, 'util', fan);
    }
  }

  // Model loaded
  html += `<div class="model-line">Model: <span>${escapeHtml(inf.model || '—')}</span></div>`;
  html += `</div>`;
  return html;
}
|
||||||
|
|
||||||
|
/**
 * Fetches gpu_metrics.json, re-renders all cards and updates the top-bar status.
 *
 * Any failure (network error, non-2xx response, invalid JSON) is reported as
 * "Collector down" with a red dot; the previously rendered cards are left in place
 * and lastFetchTime is not advanced.
 */
async function refresh() {
  const dot = document.getElementById('status-dot');
  const txt = document.getElementById('status-text');
  try {
    // The timestamp query parameter defeats HTTP caching of the metrics file.
    const resp = await fetch('gpu_metrics.json?t=' + Date.now());
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
    const data = await resp.json();
    const gpus = Array.isArray(data.gpus) ? data.gpus : [];

    document.getElementById('cards').innerHTML = gpus.map(renderCard).join('');

    // Top bar status
    const online = gpus.filter(g => g.online !== false).length;
    const total = gpus.length;
    if (total === 0) {
      // An empty list is not "all healthy": do not show it as green 0/0.
      dot.className = 'dot yellow';
      txt.textContent = 'No GPUs reported';
    } else if (online === total) {
      dot.className = 'dot green';
      txt.textContent = `${online}/${total} online`;
    } else if (online > 0) {
      dot.className = 'dot yellow';
      txt.textContent = `${online}/${total} online`;
    } else {
      dot.className = 'dot red';
      txt.textContent = 'All offline';
    }

    // Capture fetch time for live clock
    lastFetchTime = Date.now() / 1000;
  } catch (e) {
    dot.className = 'dot red';
    txt.textContent = 'Collector down';
  }
}
|
||||||
|
|
||||||
|
// Render skeletons instantly
|
||||||
|
// Placeholder cards shown immediately, before the first metrics fetch completes.
const SKELETONS = [
  { host: 'amdpve', gpu_name: 'AMD Strix Halo', hardware: {}, inference: {}, online: true },
  { host: 'llmgpu', gpu_name: 'RTX 3090', hardware: {}, inference: {}, online: true },
  { host: 'ocu-llm', gpu_name: 'RTX 5070', hardware: {}, inference: {}, online: true },
];

{
  // Concatenate one static "Loading metrics..." card per skeleton entry.
  let skeletonHtml = '';
  for (const g of SKELETONS) {
    skeletonHtml += `<div class="card"><div class="card-header"><div><div class="name">${g.gpu_name}</div><div class="host">${g.host}</div></div><div class="state unknown">···</div></div><div class="model-line" style="color:#8b949e;">Loading metrics...</div></div>`;
  }
  document.getElementById('cards').innerHTML = skeletonHtml;
}

// First fetch right away, then poll every INTERVAL milliseconds.
refresh();
setInterval(refresh, INTERVAL);
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
@@ -0,0 +1,115 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""GPU metrics collector — polls sidecars + llama.cpp every 10s, writes to Workspace."""
|
||||||
|
|
||||||
|
import urllib.request, json, time, os
|
||||||
|
|
||||||
|
# Hosts to poll. "name" is the label written to the output as "host", "host" is
# the address that is actually contacted, "gpu" is the display name and
# "llama_port" is the port queried for /health, /v1/models and /slots.
HOSTS = [
    dict(name="amdpve", host="192.168.68.15", gpu="AMD Strix Halo", llama_port=8080),
    dict(name="llmgpu", host="192.168.68.8", gpu="RTX 3090", llama_port=8080),
    dict(name="ocu-llm", host="192.168.68.110", gpu="RTX 5070", llama_port=8080),
]

# JSON file rewritten on every poll cycle.
OUTPUT = "/app/public/gpu_metrics.json"

# Seconds between the start of consecutive poll cycles.
INTERVAL = 10

# Seconds since a host's sidecar last answered before it is flagged stale / unreachable.
STALE_THRESHOLD = 30
DEAD_THRESHOLD = 60

# Host name -> time.time() of the most recent successful sidecar poll.
last_seen = dict()
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_json(url, timeout=3):
    """Fetch ``url`` and return its body parsed as JSON, or ``None`` on any failure.

    Best-effort by design: connection errors, timeouts, HTTP errors and
    undecodable or invalid JSON bodies all yield ``None`` rather than raising.

    Args:
        url: URL to request.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The decoded JSON value, or ``None`` if the fetch or parse failed.
    """
    try:
        req = urllib.request.Request(url)
        # The context manager closes the response even when read/parse fails,
        # so a long-running poll loop does not leak sockets.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode())
    except Exception:
        return None
|
||||||
|
|
||||||
|
|
||||||
|
# Host name -> last hardware payload received from that host's sidecar, so a
# host inside its grace period can still be shown with its last known readings.
_last_hardware = {}


def collect_one(h):
    """Collect GPU hardware + llama.cpp inference state for one host.

    Polls the sidecar on port 8090 for hardware metrics and the llama.cpp
    server on ``h["llama_port"]`` for /health, /v1/models and /slots.

    Availability rules (driven by the sidecar poll only):
      * sidecar answered: online, fresh hardware.
      * sidecar silent for up to DEAD_THRESHOLD seconds since its last answer:
        still online, last known hardware is reused, and ``"stale": True`` is
        added once the silence exceeds STALE_THRESHOLD.
      * sidecar silent for longer, or never seen: offline, no hardware.

    Args:
        h: One entry of HOSTS ("name", "host", "gpu", "llama_port").

    Returns:
        A dict with "host", "gpu_name", "inference", "hardware", "online",
        "timestamp" and, only when applicable, "stale".
    """
    name = h["name"]
    host = h["host"]
    llama_base = f"http://{host}:{h['llama_port']}"
    now = time.time()

    # GPU hardware from sidecar
    gpu = fetch_json(f"http://{host}:8090/")

    # llama.cpp inference state
    llamacpp_health = fetch_json(f"{llama_base}/health")
    llamacpp_models = fetch_json(f"{llama_base}/v1/models")

    # First listed model id, if any. Shape checks keep an unexpected JSON
    # reply (e.g. a list instead of an object) from crashing the poll loop.
    model_name = None
    if isinstance(llamacpp_models, dict):
        models = llamacpp_models.get("data") or []
        if isinstance(models, list) and models and isinstance(models[0], dict):
            model_name = models[0].get("id")

    # A healthy server is busy when it reports processing slots, idle otherwise.
    inference_state = "unknown"
    if isinstance(llamacpp_health, dict) and llamacpp_health.get("status", "") == "ok":
        inference_state = "busy" if llamacpp_health.get("slots_processing", 0) else "idle"

    # /slots gives per-slot detail: any slot that is processing means busy.
    slots = fetch_json(f"{llama_base}/slots")
    if isinstance(slots, list) and any(
        isinstance(slot, dict) and slot.get("is_processing") for slot in slots
    ):
        inference_state = "busy"

    online = gpu is not None
    hardware = gpu if gpu else None
    stale = False

    if online:
        last_seen[name] = now
        _last_hardware[name] = hardware
    elif name in last_seen:
        age = now - last_seen[name]
        if age <= DEAD_THRESHOLD:
            # Grace period: a missed poll does not immediately flip the host
            # to offline; show the last readings and flag them once old.
            online = True
            hardware = _last_hardware.get(name)
            stale = age > STALE_THRESHOLD

    result = {
        "host": name,
        "gpu_name": h["gpu"],
        "inference": {
            "state": inference_state,
            "model": model_name,
        },
        "hardware": hardware,
        "online": online,
        "timestamp": now,
    }
    if stale:
        result["stale"] = True

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Poll every host in HOSTS forever and publish the results to OUTPUT.

    Each cycle writes the payload to ``OUTPUT + ".tmp"`` and then atomically
    replaces OUTPUT, so readers never observe a half-written file. A failing
    cycle is logged and skipped instead of terminating the collector; the next
    cycle starts INTERVAL seconds after the failed one started.
    """
    print(f"GPU collector starting, output={OUTPUT}, interval={INTERVAL}s", flush=True)
    os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)
    tmp_path = OUTPUT + ".tmp"

    while True:
        start = time.time()
        try:
            results = [collect_one(h) for h in HOSTS]

            payload = {
                "updated": start,
                "gpus": results,
            }

            with open(tmp_path, "w") as f:
                json.dump(payload, f)
            # os.replace overwrites an existing destination on every platform.
            os.replace(tmp_path, OUTPUT)
        except Exception as exc:
            # Keep the daemon alive: one bad cycle must not stop all future updates.
            print(f"GPU collector: poll cycle failed: {exc!r}", flush=True)

        # Sleep only for what is left of the interval after the work done above.
        elapsed = time.time() - start
        sleep_for = max(0, INTERVAL - elapsed)
        time.sleep(sleep_for)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the collector loop only when executed as a script, not when imported.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||||
@@ -0,0 +1,122 @@
|
|||||||
|
## Syslog GPU Router — Nginx Configuration (Docker-internal)
|
||||||
|
## Routes incoming agent requests to the appropriate GPU backend
|
||||||
|
## based on the X-Syslog-Model header.
|
||||||
|
|
||||||
|
## GPU backends, addressed by LAN IP.

## Strix Halo 395 — qwen3.6-35B-A3B (MoE) — Default workhorse
upstream amdpve_pool  { server 192.168.68.15:8080; }

## RTX 3090 — qwen3.5-27B (Dense) — Heavy reasoning
upstream llmgpu_pool  { server 192.168.68.8:8080; }

## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks
upstream ocu_llm_pool { server 192.168.68.110:8080; }

## Docker containers, addressed by Compose service name.

## Agent queue with circuit breaker
upstream queue_service     { server queue-service:8091; }

## Harness dashboard
upstream dashboard_service { server dashboard:3001; }
|
||||||
|
|
||||||
|
upstream gpu_dashboard_pool {
    ## GPU dashboard (Docker container).
    ## Addressed by its Compose service name, like the queue and dashboard
    ## upstreams: the generated container name
    ## (<project>-gpu-dashboard-<index>) changes with the project directory
    ## name and replica index, while the service name is stable on the
    ## shared network.
    server gpu-dashboard:8092;
}
|
||||||
|
|
||||||
|
## ------------------------------------------------------------------
|
||||||
|
## Mapping: X-Syslog-Model header → upstream backend
|
||||||
|
## ------------------------------------------------------------------
|
||||||
|
map $http_x_syslog_model $gpu_upstream {
    ## Header missing or unrecognised
    default         amdpve_pool;

    ## Strix Halo
    "standard"      amdpve_pool;

    ## RTX 3090 — selectable by tier or by model name
    "heavy"         llmgpu_pool;
    "qwen3.5-27B"   llmgpu_pool;

    ## RTX 5070 — selectable by tier or by model name
    "light"         ocu_llm_pool;
    "gemma-4"       ocu_llm_pool;
}
|
||||||
|
|
||||||
|
## Rate limit zone — 10 req/s per IP, burst of 20
|
||||||
|
## Keyed by client address; state is kept in the 10 MB shared zone "perip".
limit_req_zone $binary_remote_addr
               zone=perip:10m
               rate=10r/s;
|
||||||
|
|
||||||
|
server {
    listen 80;
    server_name _;

    ## ------------------------------------------------------------------
    ## Dashboard — observability UI (MUST be before / catch-all)
    ## ------------------------------------------------------------------
    ## NOTE(review): this prefix has no trailing slash, so it also matches
    ## /dashboardXYZ, and /dashboard/x is forwarded upstream as //x. Confirm
    ## the dashboard tolerates that before tightening it like /gpu below.
    location /dashboard {
        proxy_pass http://dashboard_service/;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    }

    ## ------------------------------------------------------------------
    ## GPU Dashboard — observability UI (MUST be before / catch-all)
    ## ------------------------------------------------------------------
    ## The page loads "gpu_metrics.json" with a relative URL, so it must be
    ## served from a URL ending in "/gpu/"; from "/gpu" the fetch would
    ## resolve to /gpu_metrics.json and fall through to the GPU backends.
    location = /gpu {
        ## Relative Location header: stays correct behind a published port.
        absolute_redirect off;
        return 301 /gpu/;
    }

    ## The container serves a plain directory with gpu.html (no index.html),
    ## so the bare /gpu/ URL is mapped onto the page explicitly.
    location = /gpu/ {
        proxy_pass http://gpu_dashboard_pool/gpu.html;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }

    ## Everything else under /gpu/ (gpu_metrics.json, gpu.html) with the
    ## /gpu/ prefix replaced by /.
    location /gpu/ {
        proxy_pass http://gpu_dashboard_pool/;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }

    ## ------------------------------------------------------------------
    ## Main location — proxy to selected upstream
    ## ------------------------------------------------------------------
    location / {
        limit_req zone=perip burst=20 nodelay;
        ## NOTE(review): rate-limited requests are rejected with 503, which
        ## the error_page below also redirects to the queue fallback —
        ## confirm that enqueueing throttled requests is intended.
        limit_req_status 503;
        proxy_pass http://$gpu_upstream;

        ## Preserve original host and headers
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        ## Forward the model header so backends can log it. Stated with
        ## proxy_set_header because proxy_pass_header only affects *response*
        ## headers coming back from the upstream, not request headers.
        proxy_set_header X-Syslog-Model $http_x_syslog_model;

        ## Streaming support (SSE for LLM responses)
        proxy_buffering off;
        proxy_cache off;
        proxy_read_timeout 300s;
        proxy_send_timeout 300s;

        ## Basic failover — retry on error or timeout
        proxy_next_upstream error timeout http_502 http_503;
        proxy_next_upstream_tries 2;

        ## Add a response header for observability
        add_header X-Routed-To $gpu_upstream always;

        ## Fallback to queue when all GPU upstreams are down
        error_page 502 503 504 = @queue_fallback;
    }

    ## ------------------------------------------------------------------
    ## Queue fallback — enqueue when GPUs are unavailable
    ## ------------------------------------------------------------------
    location @queue_fallback {
        ## Every failed request is re-targeted at the queue's /enqueue endpoint.
        rewrite ^ /enqueue break;
        proxy_pass http://queue_service;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_set_header Content-Type $content_type;
        proxy_pass_request_body on;
    }
}
|
||||||
Reference in New Issue
Block a user