feat: Smart Queue Consumer implementation draft + architecture review

- SMART_QUEUE_IMPLEMENTATION.md: Complete implementation draft (1572 lines)
  with 10 quick-win fixes and full smart queue consumer rewrite
- ARCHITECTURE_REVIEW.md: 26-issue audit with prioritized findings
- Verified all 3 GPUs live: amdpve (73% util), llmgpu (idle), ocu_llm (idle)
- Redis 7.4.9 confirmed streams support
- GPU sidecar metrics verified on all hosts

Key fixes:
- QW-1: Dockerfile path mismatch (Dockerfile.queue -> queue-service/Dockerfile)
- QW-2: Nginx fallback only on ALL-GPU failure (not single GPU)
- QW-3: Container names fixed to Docker service names
- QW-4: Redis host default fixed (192.168.68.7 -> redis)
- QW-5: Dependency version pinning
- QW-7 through QW-10: Health checks, restart policy, Gunicorn, single-process collector

Smart queue features:
- Redis Streams + consumer groups
- GPU-aware load balancing via sidecar metrics
- Per-GPU circuit breakers with half-open recovery
- Adaptive backpressure by queue depth (0-30 normal, 30-40 warn, 40-50 return HTTP 503, >50 circuit open)
- Dead letter queue with retry endpoint
- Job ID tracking and /status/<job_id> API
This commit is contained in:
SyslogBot
2026-05-17 03:55:20 +00:00
parent e95475f431
commit b09a93f45c
15 changed files with 3895 additions and 1 deletions
+8
View File
@@ -0,0 +1,8 @@
# Image that runs the single-file script harness-dashboard.py.
FROM python:3.13-slim
# Place the script at an absolute path under /app.
COPY harness-dashboard.py /app/harness-dashboard.py
# Make /app the working directory so the relative script name in CMD resolves.
WORKDIR /app
# Declare 3001 as the port the container listens on.
EXPOSE 3001
# Start the script with the image's python3 interpreter.
CMD ["python3", "harness-dashboard.py"]
+5
View File
@@ -0,0 +1,5 @@
# Image that runs the single-file script harness-dashboard.py.
FROM python:3.11-slim
# All following relative paths resolve against /app.
WORKDIR /app
# Copy the script into the working directory.
COPY harness-dashboard.py .
# Declare 3001 as the port the container listens on.
EXPOSE 3001
# Start the script with the image's python3 interpreter.
CMD ["python3", "harness-dashboard.py"]
+133
View File
@@ -0,0 +1,133 @@
#!/usr/bin/env python3
"""Syslog Harness Dashboard — Simple HTTP server exposing GPU health + metrics."""
import json
import os
import time
import urllib.request
from http.server import HTTPServer, SimpleHTTPRequestHandler
from datetime import datetime
# Static description of the GPU hosts shown on the dashboard, one row per host:
# (display name, env var overriding the endpoint, default "host:port",
#  model label, VRAM label).
# NOTE(review): the amdpve override is read from "AMDVE_EP" (no "P"), while the
# other two variable names mirror their GPU names — confirm this is intended.
_GPU_TABLE = (
    ("amdpve", "AMDVE_EP", "192.168.68.15:8080", "qwen3.6-35B-A3B (MoE)", "65GB"),
    ("llmgpu", "LLMGPU_EP", "192.168.68.8:8080", "qwen3.5-27B (Dense)", "24GB"),
    ("ocu_llm", "OCU_LLM_EP", "192.168.68.110:8080", "gemma-4-E4B (Light)", "12GB"),
)

# GPU name -> {"endpoint", "model", "vram"}; endpoints are resolved from the
# environment once, at import time.
GPUS = {
    gpu_name: {
        "endpoint": os.getenv(env_var, default_endpoint),
        "model": model_label,
        "vram": vram_label,
    }
    for gpu_name, env_var, default_endpoint, model_label, vram_label in _GPU_TABLE
}
def check_gpu(name, info):
    """Probe one GPU endpoint with an HTTP GET and report liveness and latency.

    Args:
        name: Display name of the GPU. The probe itself does not use it; it is
            kept so existing callers can keep passing it.
        info: Mapping with "endpoint" ("host:port"), "model" and "vram".

    Returns:
        ``{"status": "up", "latency_ms", "model", "vram"}`` when the GET on
        ``http://<endpoint>/`` succeeds, otherwise
        ``{"status": "down", "error", "model", "vram"}`` where ``error`` is the
        exception text truncated to 50 characters. HTTP error statuses raise
        inside ``urlopen`` and are therefore reported as "down".
    """
    try:
        # perf_counter is monotonic, so a wall-clock adjustment during the
        # probe cannot produce a negative or inflated latency.
        started = time.perf_counter()
        # The context manager closes the response; without it every poll
        # leaves a socket open until garbage collection.
        with urllib.request.urlopen(f"http://{info['endpoint']}/", timeout=3):
            latency = (time.perf_counter() - started) * 1000
    except Exception as e:
        # Best effort by design: any failure just marks the GPU as down.
        return {
            "status": "down",
            "error": str(e)[:50],
            "model": info["model"],
            "vram": info["vram"],
        }
    return {
        "status": "up",
        "latency_ms": round(latency, 1),
        "model": info["model"],
        "vram": info["vram"],
    }
def get_queue_status(url="http://queue-service:8091/status", timeout=2):
    """Fetch the queue service's status document.

    Args:
        url: Status endpoint to query; defaults to the queue service URL.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        The decoded JSON object on success. On any failure (network error,
        invalid JSON, or a payload that is not a JSON object) returns the
        placeholder ``{"queue_depth": -1, "circuit_breaker": "unknown",
        "gpu_health": {}}``.
    """
    fallback = {"queue_depth": -1, "circuit_breaker": "unknown", "gpu_health": {}}
    try:
        # Close the response explicitly so repeated polls do not leak sockets.
        with urllib.request.urlopen(urllib.request.Request(url), timeout=timeout) as resp:
            payload = json.loads(resp.read())
    except Exception:
        # Best effort by design: the dashboard must render even when the
        # queue service is unreachable.
        return fallback
    # Callers use .get() on the result, so anything but an object is unusable.
    return payload if isinstance(payload, dict) else fallback
# Single-page dashboard served at "/" and "/harness.html". The inline script
# fetches /api/status once, fills in the queue card and the GPU table, and
# reloads the whole page every 10 seconds.
DASHBOARD_HTML = """
<!DOCTYPE html>
<html><head><meta charset="utf-8"><title>🦅 Syslog Harness</title>
<style>
body { background: #1a1a2e; color: #e0e0e0; font-family: monospace; margin: 0; padding: 20px; }
.card { background: #16213e; border-radius: 8px; padding: 16px; margin: 10px 0; border-left: 4px solid #0f3460; }
.up { border-left-color: #00d26a; } .down { border-left-color: #ff4757; }
.warn { border-left-color: #ffa502; }
h1 { color: #00d26a; font-size: 24px; } h2 { color: #0f3460; font-size: 16px; }
.metric { display: inline-block; margin: 4px 12px; }
.value { font-weight: bold; color: #00d26a; }
#refresh { position: fixed; top: 10px; right: 10px; background: #0f3460; color: white;
border: none; padding: 8px 16px; border-radius: 4px; cursor: pointer; }
table { width: 100%; border-collapse: collapse; margin: 10px 0; }
th, td { text-align: left; padding: 8px; border-bottom: 1px solid #0f3460; }
th { color: #00d26a; }
</style></head><body>
<button id="refresh" onclick="location.reload()">↻ Refresh</button>
<h1>🦅 Syslog Harness Dashboard</h1>
<h2>Updated: <span id="ts"></span></h2>
<div class="card" id="queue-card">
<h2>Queue &amp; Circuit Breaker</h2>
<div class="metric">Depth: <span class="value" id="depth">--</span></div>
<div class="metric">Circuit: <span class="value" id="circuit">--</span></div>
<div class="metric">Threshold: <span class="value" id="threshold">--</span></div>
</div>
<div class="card">
<h2>GPU Endpoints</h2>
<table><tr><th>GPU</th><th>Model</th><th>VRAM</th><th>Status</th><th>Latency</th></tr>
<tbody id="gpu-table"></tbody></table>
</div>
<script>
// Escape a value for insertion into innerHTML; error strings come from the
// probed hosts and must not be interpreted as markup.
const esc = (value) => {
  const node = document.createElement('span');
  node.textContent = String(value);
  return node.innerHTML;
};
document.getElementById('ts').textContent = new Date().toISOString();
fetch('/api/status').then(r => r.json()).then(data => {
  document.getElementById('depth').textContent = data.queue_depth;
  document.getElementById('circuit').textContent = data.circuit_breaker;
  document.getElementById('threshold').textContent = 'warn:' + data.thresholds.warn + ' / open:' + data.thresholds.open;
  const card = document.getElementById('queue-card');
  // An open breaker is the failure state, so it gets the red "down" accent.
  if (data.circuit_breaker === 'open') card.className = 'card down';
  else if (data.circuit_breaker === 'warn') card.className = 'card warn';
  else card.className = 'card up';
  let html = '';
  for (const [name, gpu] of Object.entries(data.gpu_health)) {
    const status = gpu.status === 'up' ? 'UP' : 'DOWN';
    const latency = gpu.status === 'up' ? gpu.latency_ms + 'ms' : gpu.error;
    const rowClass = gpu.status === 'up' ? '' : 'down';
    html += `<tr class="${rowClass}"><td>${esc(name)}</td><td>${esc(gpu.model)}</td><td>${esc(gpu.vram)}</td><td>${status}</td><td>${esc(latency)}</td></tr>`;
  }
  document.getElementById('gpu-table').innerHTML = html;
});
setInterval(() => location.reload(), 10000);
</script></body></html>
"""
class Handler(SimpleHTTPRequestHandler):
    """HTTP handler serving the dashboard page and its JSON status API.

    GET routes:
        "/" and "/harness.html"  -> the dashboard HTML page
        "/api/status"            -> queue status merged with live GPU probes
        anything else            -> 404
    """

    def do_GET(self):
        """Dispatch a GET request to the page, the status API, or a 404."""
        # Match on the path only, so URLs carrying a query string or fragment
        # (e.g. cache-busting "?t=...") still reach the right route.
        route = self.path.split("?", 1)[0].split("#", 1)[0]
        if route in ("/", "/harness.html"):
            self._send_body(200, "text/html; charset=utf-8", DASHBOARD_HTML.encode())
        elif route == "/api/status":
            status = get_queue_status()
            if not isinstance(status, dict):
                # A non-object payload has no usable fields; use the defaults.
                status = {}
            enriched = {
                "queue_depth": status.get("queue_depth", -1),
                "circuit_breaker": status.get("circuit_breaker", "unknown"),
                "thresholds": status.get("thresholds", {"warn": 30, "open": 50}),
                # GPU health always comes from this process's own probes, not
                # from the queue service's "gpu_health" field.
                "gpu_health": {name: check_gpu(name, info) for name, info in GPUS.items()},
            }
            self._send_body(200, "application/json", json.dumps(enriched).encode())
        else:
            self.send_response(404)
            self.send_header("Content-Length", "0")
            self.end_headers()

    def _send_body(self, code, content_type, payload):
        """Send a complete response with Content-Type and Content-Length."""
        self.send_response(code)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)

    def log_message(self, format, *args):
        """Suppress the base class's per-request logging."""
        pass  # Suppress request logs
if __name__ == "__main__":
    # Local import: the file-level import only brings in HTTPServer.
    from http.server import ThreadingHTTPServer

    # Handle each request on its own thread. /api/status makes several
    # blocking probes with multi-second timeouts, and a single-threaded server
    # would stall every other request (including the page itself) meanwhile.
    server = ThreadingHTTPServer(("0.0.0.0", 3001), Handler)
    # Flush so the startup line appears immediately when stdout is piped.
    print("Dashboard running on :3001/harness.html", flush=True)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        pass  # Ctrl-C is the normal way to stop the server.
    finally:
        # Release the listening socket on the way out.
        server.server_close()