b09a93f45c
- SMART_QUEUE_IMPLEMENTATION.md: Complete implementation draft (1572 lines) with 10 quick-win fixes and full smart queue consumer rewrite - ARCHITECTURE_REVIEW.md: 26-issue audit with prioritized findings - Verified all 3 GPUs live: amdpve (73% util), llmgpu (idle), ocu_llm (idle) - Redis 7.4.9 confirmed streams support - GPU sidecar metrics verified on all hosts Key fixes: - QW-1: Dockerfile path mismatch (Dockerfile.queue -> queue-service/Dockerfile) - QW-2: Nginx fallback only on ALL-GPU failure (not single GPU) - QW-3: Container names fixed to Docker service names - QW-4: Redis host default fixed (192.168.68.7 -> redis) - QW-5: Dependency version pinning - QW-7-10: Health checks, restart policy, Gunicorn, single-process collector Smart queue features: - Redis Streams + consumer groups - GPU-aware load balancing via sidecar metrics - Per-GPU circuit breakers with half-open recovery - Adaptive backpressure (0-30 normal, 30-40 warn, 40-50 503, >50 open) - Dead letter queue with retry endpoint - Job ID tracking and /status/<job_id> API
134 lines
5.6 KiB
Python
134 lines
5.6 KiB
Python
#!/usr/bin/env python3
|
|
"""Syslog Harness Dashboard — Simple HTTP server exposing GPU health + metrics."""
|
|
|
|
import json
|
|
import os
|
|
import time
|
|
import urllib.request
|
|
from http.server import HTTPServer, SimpleHTTPRequestHandler
|
|
from datetime import datetime
|
|
|
|
# Registry of the GPU inference hosts probed by the dashboard.
# Each entry maps a GPU name to:
#   endpoint - "host:port" of the HTTP service, overridable via an env var
#   model    - display label of the model served there
#   vram     - display label of the card's memory size
GPUS = {
    "amdpve": {
        # AMDPVE_EP follows the <KEY>_EP pattern used by the other entries.
        # The original (misspelt) AMDVE_EP is still honoured as a fallback so
        # existing deployments keep working.
        "endpoint": os.getenv("AMDPVE_EP", os.getenv("AMDVE_EP", "192.168.68.15:8080")),
        "model": "qwen3.6-35B-A3B (MoE)",
        "vram": "65GB",
    },
    "llmgpu": {
        "endpoint": os.getenv("LLMGPU_EP", "192.168.68.8:8080"),
        "model": "qwen3.5-27B (Dense)",
        "vram": "24GB",
    },
    "ocu_llm": {
        "endpoint": os.getenv("OCU_LLM_EP", "192.168.68.110:8080"),
        "model": "gemma-4-E4B (Light)",
        "vram": "12GB",
    },
}
|
|
|
|
|
|
def check_gpu(name, info):
    """Probe one GPU endpoint with an HTTP GET and report whether it answered.

    Args:
        name: GPU identifier. Not used by the probe itself; kept so callers
            can pass a ``GPUS`` key together with its config.
        info: Mapping with ``endpoint`` ("host:port"), ``model`` and ``vram``.

    Returns:
        ``{"status": "up", "latency_ms", "model", "vram"}`` when the request
        succeeds, otherwise ``{"status": "down", "error", "model", "vram"}``
        where ``error`` is the exception text truncated to 50 characters.
        Any failure (connection error, timeout, HTTP error status, missing
        ``endpoint`` key) is reported as "down" rather than raised.
    """
    try:
        # perf_counter is monotonic, so clock adjustments cannot skew latency.
        start = time.perf_counter()
        # The context manager closes the response; without it every poll
        # would leave a socket open until garbage collection.
        with urllib.request.urlopen(f"http://{info['endpoint']}/", timeout=3):
            latency = (time.perf_counter() - start) * 1000
    except Exception as e:
        # Best-effort health probe: the dashboard must render even when a
        # host is unreachable, so the failure is returned as data.
        return {
            "status": "down",
            "error": str(e)[:50],
            "model": info["model"],
            "vram": info["vram"],
        }
    return {
        "status": "up",
        "latency_ms": round(latency, 1),
        "model": info["model"],
        "vram": info["vram"],
    }
|
|
|
|
|
|
def get_queue_status(url="http://queue-service:8091/status", timeout=2):
    """Fetch the queue service's status document.

    Args:
        url: Status endpoint to query. Defaults to the queue service's
            ``/status`` URL used by the dashboard.
        timeout: Seconds to wait for the request before giving up.

    Returns:
        The decoded JSON object on success. If the request fails, the body is
        not valid JSON, or the JSON is not an object, a placeholder dict is
        returned instead: ``{"queue_depth": -1, "circuit_breaker": "unknown",
        "gpu_health": {}}``.
    """
    fallback = {"queue_depth": -1, "circuit_breaker": "unknown", "gpu_health": {}}
    try:
        # Context manager guarantees the connection is closed after reading.
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            payload = json.loads(resp.read())
    except Exception:
        # Best-effort: an unreachable queue service must not break the
        # dashboard, so the caller gets the placeholder instead.
        return fallback
    # Callers use dict.get() on the result; reject lists/strings/numbers.
    return payload if isinstance(payload, dict) else fallback
|
|
|
|
|
|
# Single-page dashboard served at "/" and "/harness.html".
# The page fetches /api/status once on load, fills in the queue card and the
# GPU table, and reloads itself every 10 seconds.
# Table cells are written with textContent (never innerHTML) because the
# values include raw exception text from failed probes.
DASHBOARD_HTML = """
<!DOCTYPE html>
<html><head><meta charset="utf-8"><title>🦅 Syslog Harness</title>
<style>
  body { background: #1a1a2e; color: #e0e0e0; font-family: monospace; margin: 0; padding: 20px; }
  .card { background: #16213e; border-radius: 8px; padding: 16px; margin: 10px 0; border-left: 4px solid #0f3460; }
  .up { border-left-color: #00d26a; } .down { border-left-color: #ff4757; }
  .warn { border-left-color: #ffa502; }
  h1 { color: #00d26a; font-size: 24px; } h2 { color: #0f3460; font-size: 16px; }
  .metric { display: inline-block; margin: 4px 12px; }
  .value { font-weight: bold; color: #00d26a; }
  #refresh { position: fixed; top: 10px; right: 10px; background: #0f3460; color: white;
             border: none; padding: 8px 16px; border-radius: 4px; cursor: pointer; }
  table { width: 100%; border-collapse: collapse; margin: 10px 0; }
  th, td { text-align: left; padding: 8px; border-bottom: 1px solid #0f3460; }
  th { color: #00d26a; }
</style></head><body>
<button id="refresh" onclick="location.reload()">↻ Refresh</button>
<h1>🦅 Syslog Harness Dashboard</h1>
<h2>Updated: <span id="ts"></span></h2>

<div class="card" id="queue-card">
  <h2>Queue &amp; Circuit Breaker</h2>
  <div class="metric">Depth: <span class="value" id="depth">--</span></div>
  <div class="metric">Circuit: <span class="value" id="circuit">--</span></div>
  <div class="metric">Threshold: <span class="value" id="threshold">--</span></div>
</div>

<div class="card">
  <h2>GPU Endpoints</h2>
  <table>
    <thead><tr><th>GPU</th><th>Model</th><th>VRAM</th><th>Status</th><th>Latency</th></tr></thead>
    <tbody id="gpu-table"></tbody>
  </table>
</div>

<script>
document.getElementById('ts').textContent = new Date().toISOString();
const queueCard = document.getElementById('queue-card');
fetch('/api/status')
  .then(r => {
    if (!r.ok) throw new Error('HTTP ' + r.status);
    return r.json();
  })
  .then(data => {
    document.getElementById('depth').textContent = data.queue_depth;
    document.getElementById('circuit').textContent = data.circuit_breaker;
    document.getElementById('threshold').textContent = 'warn:' + data.thresholds.warn + ' / open:' + data.thresholds.open;
    const state = data.circuit_breaker;
    if (state === 'open') queueCard.className = 'card down';
    else if (state === 'warn' || state === 'unknown') queueCard.className = 'card warn';
    else queueCard.className = 'card up';
    const tbody = document.getElementById('gpu-table');
    tbody.textContent = '';
    for (const [name, gpu] of Object.entries(data.gpu_health)) {
      const up = gpu.status === 'up';
      const row = document.createElement('tr');
      if (!up) row.className = 'down';
      const cells = [name, gpu.model, gpu.vram, up ? '✅' : '❌', up ? gpu.latency_ms + 'ms' : gpu.error];
      for (const text of cells) {
        const cell = document.createElement('td');
        cell.textContent = text;
        row.appendChild(cell);
      }
      tbody.appendChild(row);
    }
  })
  .catch(err => {
    queueCard.className = 'card down';
    document.getElementById('circuit').textContent = 'unavailable (' + err.message + ')';
  });
setInterval(() => location.reload(), 10000);
</script></body></html>
"""
|
|
|
|
|
|
class Handler(SimpleHTTPRequestHandler):
    """Request handler for the dashboard page and its JSON status API.

    Routes (query string and fragment are ignored when matching):
        ``/`` and ``/harness.html``  -> the dashboard HTML page
        ``/api/status``              -> JSON with queue state and GPU probes
        anything else                -> 404 with an empty body

    HEAD requests use the same routing and headers but send no body. This
    overrides the inherited ``SimpleHTTPRequestHandler.do_HEAD``, which would
    otherwise answer from the working directory.
    """

    def do_GET(self):
        """Serve the matched route including its body."""
        self._respond(include_body=True)

    def do_HEAD(self):
        """Serve the matched route's status line and headers only."""
        self._respond(include_body=False)

    def _respond(self, include_body):
        """Resolve the route, then write status, headers and optional body."""
        # Match on the path alone so "/harness.html?ts=1" is not a 404.
        route = self.path.split("?", 1)[0].split("#", 1)[0]
        if route in ("/", "/harness.html"):
            code = 200
            content_type = "text/html; charset=utf-8"
            payload = DASHBOARD_HTML.encode()
        elif route == "/api/status":
            code = 200
            content_type = "application/json"
            payload = json.dumps(self._collect_status()).encode()
        else:
            code = 404
            content_type = None
            payload = b""

        self.send_response(code)
        if content_type is not None:
            self.send_header("Content-Type", content_type)
        # An explicit length lets clients detect truncated responses.
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        if include_body:
            self.wfile.write(payload)

    def _collect_status(self):
        """Merge the queue service status with a fresh probe of every GPU."""
        status = get_queue_status()
        return {
            "queue_depth": status.get("queue_depth", -1),
            "circuit_breaker": status.get("circuit_breaker", "unknown"),
            "thresholds": status.get("thresholds", {"warn": 30, "open": 50}),
            # Probes run one after another; each one that times out adds
            # its full timeout to the response time.
            "gpu_health": {name: check_gpu(name, info) for name, info in GPUS.items()},
        }

    def log_message(self, format, *args):
        """Suppress the default per-request log line."""
        pass
|
|
|
|
|
|
if __name__ == "__main__":
    # Imported here so the file's top-level import block is left untouched.
    from http.server import ThreadingHTTPServer

    # Port defaults to 3001; DASHBOARD_PORT overrides it.
    port = int(os.getenv("DASHBOARD_PORT", "3001"))
    # One thread per request: a /api/status call can block for several
    # seconds on probe timeouts and must not stall other clients.
    server = ThreadingHTTPServer(("0.0.0.0", port), Handler)
    print(f"Dashboard running on :{port}/harness.html", flush=True)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the server; exit quietly.
        pass
    finally:
        server.server_close()
|