#!/usr/bin/env python3 """Syslog Harness Dashboard — Simple HTTP server exposing GPU health + metrics.""" import json import os import time import urllib.request from http.server import HTTPServer, SimpleHTTPRequestHandler from datetime import datetime GPUS = { "amdpve": {"endpoint": os.getenv("AMDVE_EP", "192.168.68.15:8080"), "model": "qwen3.6-35B-A3B (MoE)", "vram": "65GB"}, "llmgpu": {"endpoint": os.getenv("LLMGPU_EP", "192.168.68.8:8080"), "model": "qwen3.5-27B (Dense)", "vram": "24GB"}, "ocu_llm": {"endpoint": os.getenv("OCU_LLM_EP", "192.168.68.110:8080"), "model": "gemma-4-E4B (Light)", "vram": "12GB"}, } def check_gpu(name, info): try: start = time.time() # Use simple HTTP GET to check if the GPU endpoint is alive resp = urllib.request.urlopen(f"http://{info['endpoint']}/", timeout=3) latency = (time.time() - start) * 1000 return { "status": "up", "latency_ms": round(latency, 1), "model": info["model"], "vram": info["vram"], } except Exception as e: return {"status": "down", "error": str(e)[:50], "model": info["model"], "vram": info["vram"]} def get_queue_status(): try: req = urllib.request.Request("http://queue-service:8091/status") resp = urllib.request.urlopen(req, timeout=2) return json.loads(resp.read()) except Exception: return {"queue_depth": -1, "circuit_breaker": "unknown", "gpu_health": {}} DASHBOARD_HTML = """
| GPU | Model | VRAM | Status | Latency |
|---|