# syslog-harness/dashboard/harness-dashboard.py

#!/usr/bin/env python3
"""Syslog Harness Dashboard — Simple HTTP server exposing GPU health + metrics."""

import json
import os
import time
import urllib.request
from http.server import HTTPServer, SimpleHTTPRequestHandler
from datetime import datetime

# Inference backends shown on the dashboard, keyed by host label. Every entry
# carries an "endpoint" in host:port form (overridable through the environment
# variable named in its lookup) plus the "model" and "vram" label strings.
GPUS = dict(
    amdpve=dict(
        endpoint=os.environ.get("AMDVE_EP", "192.168.68.15:8080"),
        model="qwen3.6-35B-A3B (MoE)",
        vram="65GB",
    ),
    llmgpu=dict(
        endpoint=os.environ.get("LLMGPU_EP", "192.168.68.8:8080"),
        model="qwen3.5-27B (Dense)",
        vram="24GB",
    ),
    ocu_llm=dict(
        endpoint=os.environ.get("OCU_LLM_EP", "192.168.68.110:8080"),
        model="gemma-4-E4B (Light)",
        vram="12GB",
    ),
)


def check_gpu(name, info):
    """Probe one GPU endpoint with an HTTP GET and summarize the outcome.

    Args:
        name: Label of the GPU host. Not used by the probe itself; kept so
            the call signature stays stable for existing callers.
        info: Mapping with "endpoint" (host:port), "model" and "vram".

    Returns:
        A dict with "status" ("up" or "down"), "model" and "vram". An "up"
        result also has "latency_ms" (rounded to 0.1 ms); a "down" result
        has "error", the exception text truncated to 50 characters.
    """
    try:
        # perf_counter is monotonic, so a wall-clock adjustment during the
        # request cannot produce a negative or inflated latency.
        start = time.perf_counter()
        # The context manager closes the response (and its socket) as soon
        # as the probe is done instead of leaving it to the garbage collector.
        with urllib.request.urlopen(f"http://{info['endpoint']}/", timeout=3):
            latency = (time.perf_counter() - start) * 1000
    except Exception as e:
        # Best-effort health check: any failure (refused connection, timeout,
        # malformed endpoint, HTTP error status raised by urlopen) means "down".
        return {"status": "down", "error": str(e)[:50], "model": info["model"], "vram": info["vram"]}
    return {
        "status": "up",
        "latency_ms": round(latency, 1),
        "model": info["model"],
        "vram": info["vram"],
    }


def get_queue_status():
    """Fetch the status document from the queue service.

    Returns:
        The decoded JSON object served at http://queue-service:8091/status.
        When the request fails, the body is not valid JSON, or the payload is
        not a JSON object, a placeholder dict is returned instead
        (queue_depth -1, circuit_breaker "unknown", empty gpu_health).
    """
    fallback = {"queue_depth": -1, "circuit_breaker": "unknown", "gpu_health": {}}
    try:
        req = urllib.request.Request("http://queue-service:8091/status")
        # Close the response deterministically rather than leaking the socket.
        with urllib.request.urlopen(req, timeout=2) as resp:
            payload = json.loads(resp.read())
    except Exception:
        # Best effort: the dashboard must still render when the queue
        # service is unreachable or misbehaving.
        return fallback
    # Callers use dict methods on the result, so a list/scalar payload is
    # treated the same as an unreachable service.
    return payload if isinstance(payload, dict) else fallback


# Single-page dashboard served at "/" and "/harness.html". The inline script
# fetches /api/status once, fills in the queue card and the GPU table, and
# reloads the whole page every 10 seconds.
#
# Table cells are filled through textContent rather than innerHTML: error
# strings such as "<urlopen error ...>" would otherwise be parsed as HTML tags
# and render as nothing (and any markup in them would be injected).
DASHBOARD_HTML = """
<!DOCTYPE html>
<html><head><meta charset="utf-8"><title>🦅 Syslog Harness</title>
<style>
  body { background: #1a1a2e; color: #e0e0e0; font-family: monospace; margin: 0; padding: 20px; }
  .card { background: #16213e; border-radius: 8px; padding: 16px; margin: 10px 0; border-left: 4px solid #0f3460; }
  .up { border-left-color: #00d26a; } .down { border-left-color: #ff4757; }
  .warn { border-left-color: #ffa502; }
  h1 { color: #00d26a; font-size: 24px; } h2 { color: #0f3460; font-size: 16px; }
  .metric { display: inline-block; margin: 4px 12px; }
  .value { font-weight: bold; color: #00d26a; }
  #refresh { position: fixed; top: 10px; right: 10px; background: #0f3460; color: white;
             border: none; padding: 8px 16px; border-radius: 4px; cursor: pointer; }
  table { width: 100%; border-collapse: collapse; margin: 10px 0; }
  th, td { text-align: left; padding: 8px; border-bottom: 1px solid #0f3460; }
  th { color: #00d26a; }
</style></head><body>
<button id="refresh" onclick="location.reload()">↻ Refresh</button>
<h1>🦅 Syslog Harness Dashboard</h1>
<h2>Updated: <span id="ts"></span></h2>

<div class="card" id="queue-card">
  <h2>Queue &amp; Circuit Breaker</h2>
  <div class="metric">Depth: <span class="value" id="depth">--</span></div>
  <div class="metric">Circuit: <span class="value" id="circuit">--</span></div>
  <div class="metric">Threshold: <span class="value" id="threshold">--</span></div>
</div>

<div class="card">
  <h2>GPU Endpoints</h2>
  <table><tr><th>GPU</th><th>Model</th><th>VRAM</th><th>Status</th><th>Latency</th></tr>
  <tbody id="gpu-table"></tbody></table>
</div>

<script>
  document.getElementById('ts').textContent = new Date().toISOString();
  fetch('/api/status').then(r => r.json()).then(data => {
    document.getElementById('depth').textContent = data.queue_depth;
    document.getElementById('circuit').textContent = data.circuit_breaker;
    document.getElementById('threshold').textContent = 'warn:' + data.thresholds.warn + ' / open:' + data.thresholds.open;
    const card = document.getElementById('queue-card');
    if (data.circuit_breaker === 'open') card.className = 'card down';
    else if (data.circuit_breaker === 'warn' || data.circuit_breaker === 'unknown') card.className = 'card warn';
    else card.className = 'card up';
    const tbody = document.getElementById('gpu-table');
    tbody.textContent = '';
    for (const [name, gpu] of Object.entries(data.gpu_health)) {
      const isUp = gpu.status === 'up';
      const row = document.createElement('tr');
      if (!isUp) row.className = 'down';
      const cells = [name, gpu.model, gpu.vram, isUp ? '✅' : '❌', isUp ? gpu.latency_ms + 'ms' : gpu.error];
      for (const text of cells) {
        const td = document.createElement('td');
        td.textContent = String(text);
        row.appendChild(td);
      }
      tbody.appendChild(row);
    }
  }).catch(() => {
    document.getElementById('circuit').textContent = 'unavailable';
    document.getElementById('queue-card').className = 'card down';
  });
  setInterval(() => location.reload(), 10000);
</script></body></html>
"""


class Handler(SimpleHTTPRequestHandler):
    """Serve the dashboard page and its JSON status API.

    GET routes (the query string is ignored when matching):
        /, /harness.html  -> the embedded DASHBOARD_HTML page
        /api/status       -> queue status merged with a live probe of each GPU
        anything else     -> 404 with no body
    """

    # NOTE(review): only GET is overridden here. HEAD still falls through to
    # SimpleHTTPRequestHandler, which answers from the process working
    # directory -- confirm whether that is intended.

    def do_GET(self):
        """Dispatch a GET request to the matching route."""
        # Match on the path component only, so "/?t=1" or
        # "/api/status?nocache=1" are routed instead of returning 404.
        route = self.path.partition("?")[0]
        if route in ("/", "/harness.html"):
            self._send_body("text/html; charset=utf-8", DASHBOARD_HTML.encode())
        elif route == "/api/status":
            payload = json.dumps(self._build_status()).encode()
            self._send_body("application/json", payload)
        else:
            self.send_response(404)
            self.end_headers()

    def _send_body(self, content_type, body):
        """Send a 200 response carrying *body* (bytes) with the given type."""
        self.send_response(200)
        self.send_header("Content-Type", content_type)
        # An explicit length lets clients detect a truncated response.
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def _build_status(self):
        """Combine the queue service status with fresh GPU probe results."""
        from concurrent.futures import ThreadPoolExecutor

        status = get_queue_status()
        if not isinstance(status, dict):
            # Defensive: the fields below are read with dict.get().
            status = {}

        # Probe all endpoints concurrently so one slow or dead host costs a
        # single probe timeout rather than one timeout per host in sequence.
        names = list(GPUS)
        with ThreadPoolExecutor(max_workers=max(1, len(names))) as pool:
            probes = list(pool.map(lambda gpu: check_gpu(gpu, GPUS[gpu]), names))

        return {
            "queue_depth": status.get("queue_depth", -1),
            "circuit_breaker": status.get("circuit_breaker", "unknown"),
            "thresholds": status.get("thresholds", {"warn": 30, "open": 50}),
            # zip keeps the entries in GPUS order, as the sequential loop did.
            "gpu_health": dict(zip(names, probes)),
        }

    def log_message(self, format, *args):
        pass  # Suppress request logs


if __name__ == "__main__":
    # Listen on all interfaces, port 3001. The context manager closes the
    # listening socket on the way out, and Ctrl-C ends the serve loop without
    # a traceback.
    with HTTPServer(("0.0.0.0", 3001), Handler) as server:
        print("Dashboard running on :3001/harness.html")
        try:
            server.serve_forever()
        except KeyboardInterrupt:
            pass  # Normal operator shutdown.