feat: Smart Queue Consumer implementation draft + architecture review

- SMART_QUEUE_IMPLEMENTATION.md: Complete implementation draft (1572 lines) with 10 quick-win fixes and a full smart queue consumer rewrite
- ARCHITECTURE_REVIEW.md: 26-issue audit with prioritized findings
- Verified all 3 GPUs live: amdpve (73% util), llmgpu (idle), ocu_llm (idle)
- Redis 7.4.9: streams support confirmed
- GPU sidecar metrics verified on all hosts

Key fixes:
- QW-1: Dockerfile path mismatch (Dockerfile.queue -> queue-service/Dockerfile)
- QW-2: Nginx fallback only on ALL-GPU failure (not on a single GPU failure)
- QW-3: Container names fixed to Docker service names
- QW-4: Redis host default fixed (192.168.68.7 -> redis)
- QW-5: Dependency version pinning
- QW-7 to QW-10: Health checks, restart policy, Gunicorn, single-process collector

Smart queue features:
- Redis Streams + consumer groups
- GPU-aware load balancing via sidecar metrics
- Per-GPU circuit breakers with half-open recovery
- Adaptive backpressure (0-30 normal, 30-40 warn, 40-50 HTTP 503, >50 open)
- Dead letter queue with retry endpoint
- Job ID tracking and /status/<job_id> API
2026-05-17 03:55:20 +00:00
parent e95475f431
commit b09a93f45c
15 changed files with 3895 additions and 1 deletions
@@ -0,0 +1,8 @@
+# Image that runs harness-dashboard.py on the official slim Python base.
+FROM python:3.13-slim
+
+# Python block-buffers stdout when it is not a TTY; disable that so the
+# script's startup message reaches `docker logs` immediately.
+ENV PYTHONUNBUFFERED=1
+
+WORKDIR /app
+COPY harness-dashboard.py /app/harness-dashboard.py
+
+# Port the dashboard's HTTP server listens on.
+EXPOSE 3001
+
+# NOTE(review): the process runs as root; consider adding a non-root USER.
+CMD ["python3", "harness-dashboard.py"]
@@ -0,0 +1,5 @@
+# Image that runs harness-dashboard.py on the official slim Python base.
+# NOTE(review): the other dashboard Dockerfile in this change uses
+# python:3.13-slim — confirm which base version is intended.
+FROM python:3.11-slim
+
+# Python block-buffers stdout when it is not a TTY; disable that so the
+# script's startup message reaches `docker logs` immediately.
+ENV PYTHONUNBUFFERED=1
+
+WORKDIR /app
+COPY harness-dashboard.py .
+
+# Port the dashboard's HTTP server listens on.
+EXPOSE 3001
+CMD ["python3", "harness-dashboard.py"]
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+"""Syslog Harness Dashboard — Simple HTTP server exposing GPU health + metrics."""
+
+import json
+import os
+import time
+import urllib.request
+from http.server import HTTPServer, SimpleHTTPRequestHandler
+from datetime import datetime
+
+# Inventory of the GPU inference hosts shown on the dashboard. Each entry maps
+# a display name to its "host:port" endpoint (overridable through an
+# environment variable) plus descriptive model / VRAM labels.
+GPUS = {
+    "amdpve": {
+        # AMDPVE_EP matches the host name like the other two variables do; the
+        # misspelled AMDVE_EP is still honoured so existing deployments that
+        # set it keep working.
+        "endpoint": os.getenv("AMDPVE_EP") or os.getenv("AMDVE_EP", "192.168.68.15:8080"),
+        "model": "qwen3.6-35B-A3B (MoE)",
+        "vram": "65GB",
+    },
+    "llmgpu": {
+        "endpoint": os.getenv("LLMGPU_EP", "192.168.68.8:8080"),
+        "model": "qwen3.5-27B (Dense)",
+        "vram": "24GB",
+    },
+    "ocu_llm": {
+        "endpoint": os.getenv("OCU_LLM_EP", "192.168.68.110:8080"),
+        "model": "gemma-4-E4B (Light)",
+        "vram": "12GB",
+    },
+}
+
+
+def check_gpu(name, info):
+    """Probe one GPU endpoint with an HTTP GET and report whether it answered.
+
+    Args:
+        name: GPU display name. Not used by the probe itself; kept so the
+            signature stays compatible with existing callers.
+        info: Mapping with ``endpoint`` ("host:port"), ``model`` and ``vram``.
+
+    Returns:
+        ``{"status": "up", "latency_ms", "model", "vram"}`` when the request
+        succeeds, otherwise ``{"status": "down", "error", "model", "vram"}``
+        with the error text cut to 50 characters.
+    """
+    try:
+        # Monotonic clock: wall-clock adjustments must not skew the latency.
+        start = time.monotonic()
+        # The context manager closes the connection; the body is not needed.
+        with urllib.request.urlopen(f"http://{info['endpoint']}/", timeout=3):
+            latency = (time.monotonic() - start) * 1000
+        return {
+            "status": "up",
+            "latency_ms": round(latency, 1),
+            "model": info["model"],
+            "vram": info["vram"],
+        }
+    except Exception as e:
+        # Best-effort probe: any failure (refused connection, timeout, HTTP
+        # error status) is reported as "down" instead of being raised.
+        return {"status": "down", "error": str(e)[:50], "model": info["model"], "vram": info["vram"]}
+
+
+def get_queue_status():
+    """Fetch the queue service's status document.
+
+    Returns:
+        The decoded JSON object from ``http://queue-service:8091/status``, or a
+        placeholder dict (``queue_depth`` -1, ``circuit_breaker`` "unknown",
+        empty ``gpu_health``) when the request fails or the reply is not a
+        JSON object.
+    """
+    fallback = {"queue_depth": -1, "circuit_breaker": "unknown", "gpu_health": {}}
+    try:
+        req = urllib.request.Request("http://queue-service:8091/status")
+        # The context manager closes the connection once the body is read.
+        with urllib.request.urlopen(req, timeout=2) as resp:
+            payload = json.loads(resp.read())
+    except Exception:
+        # Best-effort: an unreachable service or invalid JSON yields the
+        # placeholder rather than an error.
+        return fallback
+    # Callers use dict.get() on the result, so reject any other JSON type.
+    return payload if isinstance(payload, dict) else fallback
+
+
+# Single-page dashboard served at "/" and "/harness.html". Its inline script
+# fetches /api/status once, fills in the queue card and GPU table, and reloads
+# the whole page every 10 seconds.
+DASHBOARD_HTML = """
+<!DOCTYPE html>
+<html><head><meta charset="utf-8"><title>🦅 Syslog Harness</title>
+<style>
+  body { background: #1a1a2e; color: #e0e0e0; font-family: monospace; margin: 0; padding: 20px; }
+  .card { background: #16213e; border-radius: 8px; padding: 16px; margin: 10px 0; border-left: 4px solid #0f3460; }
+  .up { border-left-color: #00d26a; } .down { border-left-color: #ff4757; }
+  .warn { border-left-color: #ffa502; }
+  h1 { color: #00d26a; font-size: 24px; } h2 { color: #0f3460; font-size: 16px; }
+  .metric { display: inline-block; margin: 4px 12px; }
+  .value { font-weight: bold; color: #00d26a; }
+  #refresh { position: fixed; top: 10px; right: 10px; background: #0f3460; color: white;
+             border: none; padding: 8px 16px; border-radius: 4px; cursor: pointer; }
+  table { width: 100%; border-collapse: collapse; margin: 10px 0; }
+  th, td { text-align: left; padding: 8px; border-bottom: 1px solid #0f3460; }
+  th { color: #00d26a; }
+</style></head><body>
+<button id="refresh" onclick="location.reload()">↻ Refresh</button>
+<h1>🦅 Syslog Harness Dashboard</h1>
+<h2>Updated: <span id="ts"></span></h2>
+
+<div class="card" id="queue-card">
+  <h2>Queue & Circuit Breaker</h2>
+  <div class="metric">Depth: <span class="value" id="depth">--</span></div>
+  <div class="metric">Circuit: <span class="value" id="circuit">--</span></div>
+  <div class="metric">Threshold: <span class="value" id="threshold">--</span></div>
+</div>
+
+<div class="card">
+  <h2>GPU Endpoints</h2>
+  <table><tr><th>GPU</th><th>Model</th><th>VRAM</th><th>Status</th><th>Latency</th></tr>
+  <tbody id="gpu-table"></tbody></table>
+</div>
+
+<script>
+  document.getElementById('ts').textContent = new Date().toISOString();
+  // Cells are filled through textContent: server-supplied strings such as
+  // "<urlopen error ...>" would otherwise be parsed as HTML tags and vanish.
+  function addCell(row, text) {
+    const cell = document.createElement('td');
+    cell.textContent = text;
+    row.appendChild(cell);
+  }
+  fetch('/api/status').then(r => r.json()).then(data => {
+    document.getElementById('depth').textContent = data.queue_depth;
+    document.getElementById('circuit').textContent = data.circuit_breaker;
+    document.getElementById('threshold').textContent = 'warn:' + data.thresholds.warn + ' / open:' + data.thresholds.open;
+    const card = document.getElementById('queue-card');
+    // open -> red, warn/unknown -> amber, anything else -> green.
+    if (data.circuit_breaker === 'open') card.className = 'card down';
+    else if (data.circuit_breaker === 'warn' || data.circuit_breaker === 'unknown') card.className = 'card warn';
+    else card.className = 'card up';
+    const tbody = document.getElementById('gpu-table');
+    tbody.textContent = '';
+    for (const [name, gpu] of Object.entries(data.gpu_health)) {
+      const up = gpu.status === 'up';
+      const row = document.createElement('tr');
+      if (!up) row.className = 'down';
+      addCell(row, name);
+      addCell(row, gpu.model);
+      addCell(row, gpu.vram);
+      addCell(row, up ? '✅' : '❌');
+      addCell(row, up ? gpu.latency_ms + 'ms' : gpu.error);
+      tbody.appendChild(row);
+    }
+  }).catch(() => {
+    // Status feed unreachable or malformed: say so instead of leaving "--".
+    document.getElementById('circuit').textContent = 'unavailable';
+    document.getElementById('queue-card').className = 'card down';
+  });
+  setInterval(() => location.reload(), 10000);
+</script></body></html>
+"""
+
+
+class Handler(SimpleHTTPRequestHandler):
+    """HTTP handler serving the dashboard page and its JSON status feed.
+
+    Routes (any query string is ignored):
+        ``/`` and ``/harness.html`` -> the embedded dashboard HTML.
+        ``/api/status`` -> queue status merged with a live probe of each GPU.
+        anything else -> 404.
+    """
+
+    def do_GET(self):
+        """Answer a GET request with headers and body."""
+        self._route(send_body=True)
+
+    def do_HEAD(self):
+        """Answer a HEAD request with the GET headers but no body.
+
+        Overridden because the inherited implementation answers from files in
+        the working directory rather than from this handler's routes.
+        """
+        self._route(send_body=False)
+
+    def _route(self, send_body):
+        """Dispatch on the request path and send the matching response."""
+        route = self.path.split("?", 1)[0]
+        if route in ("/", "/harness.html"):
+            self._reply(200, "text/html; charset=utf-8", DASHBOARD_HTML.encode(), send_body)
+        elif route == "/api/status":
+            payload = json.dumps(self._build_status()).encode()
+            self._reply(200, "application/json", payload, send_body)
+        else:
+            self._reply(404, None, b"", send_body)
+
+    def _build_status(self):
+        """Combine the queue service's status with a fresh probe of every GPU."""
+        status = get_queue_status()
+        return {
+            "queue_depth": status.get("queue_depth", -1),
+            "circuit_breaker": status.get("circuit_breaker", "unknown"),
+            "thresholds": status.get("thresholds", {"warn": 30, "open": 50}),
+            "gpu_health": {name: check_gpu(name, info) for name, info in GPUS.items()},
+        }
+
+    def _reply(self, code, content_type, body, send_body):
+        """Send status line, headers and (optionally) the body bytes."""
+        self.send_response(code)
+        if content_type is not None:
+            self.send_header("Content-Type", content_type)
+        # Explicit length lets clients delimit the response without waiting
+        # for the connection to close.
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        if send_body and body:
+            self.wfile.write(body)
+
+    def log_message(self, format, *args):
+        pass  # Suppress request logs
+
+
+if __name__ == "__main__":
+    # Imported here so the entry point does not depend on the top-of-file
+    # import list.
+    from http.server import ThreadingHTTPServer
+
+    # One thread per connection: a slow /api/status (each GPU probe may wait
+    # out its timeout) or an idle browser connection must not stall every
+    # other request, as it would with the single-threaded HTTPServer.
+    with ThreadingHTTPServer(("0.0.0.0", 3001), Handler) as server:
+        print("Dashboard running on :3001/harness.html")
+        try:
+            server.serve_forever()
+        except KeyboardInterrupt:
+            pass  # Ctrl-C: leave the with-block so the socket is closed