feat: Smart Queue Consumer implementation draft + architecture review

- SMART_QUEUE_IMPLEMENTATION.md: Complete implementation draft (1572 lines) with 10 quick-win fixes and full smart queue consumer rewrite
- ARCHITECTURE_REVIEW.md: 26-issue audit with prioritized findings
- Verified all 3 GPUs live: amdpve (73% util), llmgpu (idle), ocu_llm (idle)
- Redis 7.4.9 confirmed streams support
- GPU sidecar metrics verified on all hosts

Key fixes:
- QW-1: Dockerfile path mismatch (Dockerfile.queue -> queue-service/Dockerfile)
- QW-2: Nginx fallback only on ALL-GPU failure (not single GPU)
- QW-3: Container names fixed to Docker service names
- QW-4: Redis host default fixed (192.168.68.7 -> redis)
- QW-5: Dependency version pinning
- QW-7-10: Health checks, restart policy, Gunicorn, single-process collector

Smart queue features:
- Redis Streams + consumer groups
- GPU-aware load balancing via sidecar metrics
- Per-GPU circuit breakers with half-open recovery
- Adaptive backpressure (0-30 normal, 30-40 warn, 40-50 503, >50 open)
- Dead letter queue with retry endpoint
- Job ID tracking and /status/<job_id> API
2026-05-17 03:55:20 +00:00
parent e95475f431
commit b09a93f45c
15 changed files with 3895 additions and 1 deletions
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+"""GPU metrics collector — polls sidecars + llama.cpp every 10s, writes to Workspace."""
+
+import urllib.request, json, time, os
+
+# Hosts to poll. collect_one() reads these keys from each entry:
+#   name       - reported as "host" in the output and used as the last_seen key
+#   host       - address queried for the GPU sidecar (port 8090) and llama.cpp
+#   gpu        - human-readable label reported as "gpu_name"
+#   llama_port - port of the llama.cpp HTTP server on that host
+HOSTS = [
+    {"name": "amdpve", "host": "192.168.68.15", "gpu": "AMD Strix Halo", "llama_port": 8080},
+    {"name": "llmgpu", "host": "192.168.68.8", "gpu": "RTX 3090", "llama_port": 8080},
+    {"name": "ocu-llm", "host": "192.168.68.110", "gpu": "RTX 5070", "llama_port": 8080},
+]
+# Path of the JSON snapshot that main() publishes after every polling cycle.
+OUTPUT = "/root/hermes-workspace/public/gpu_metrics.json"
+# Target number of seconds between the starts of consecutive polling cycles.
+INTERVAL = 10
+# Both thresholds are compared against the time elapsed since the last
+# successful sidecar fetch for a host (see last_seen below).
+STALE_THRESHOLD = 30  # seconds before marking stale
+DEAD_THRESHOLD = 60   # seconds before marking unreachable
+
+# Maps host name -> time.time() of the most recent successful sidecar fetch;
+# written by collect_one() and kept for the lifetime of the process.
+last_seen = {}
+
+
+def fetch_json(url, timeout=3):
+    """Fetch *url* and return its body parsed as JSON, or None on failure.
+
+    Args:
+        url: URL to request.
+        timeout: Timeout in seconds handed to ``urllib.request.urlopen``.
+
+    Returns:
+        The decoded JSON value, or ``None`` when the request fails, times
+        out, returns an HTTP error, or the body is not valid JSON.
+    """
+    try:
+        req = urllib.request.Request(url)
+        # The context manager closes the response (and its socket) even if
+        # reading or parsing raises, so a long-running poller does not leak
+        # one connection per request.
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            return json.loads(resp.read().decode())
+    except Exception:
+        # Deliberately best-effort: callers treat None as "endpoint
+        # unavailable" and must keep running whatever went wrong here.
+        return None
+
+
+def collect_one(h):
+    """Collect GPU hardware + llama.cpp inference state for one host.
+
+    Args:
+        h: Host entry providing the "name", "host", "gpu" and "llama_port"
+            keys.
+
+    Returns:
+        A dict with the keys "host", "gpu_name", "inference" (a dict holding
+        "state" and "model"), "hardware", "online" and "timestamp". The key
+        "stale" is added (set to True) only when the sidecar has not answered
+        for more than STALE_THRESHOLD but at most DEAD_THRESHOLD seconds.
+
+    Side effects:
+        Records the current time in the module-level ``last_seen`` dict when
+        the sidecar answers.
+    """
+    name = h["name"]
+    host = h["host"]
+    now = time.time()
+    llama_base = f"http://{host}:{h['llama_port']}"
+
+    # GPU hardware from sidecar
+    gpu = fetch_json(f"http://{host}:8090/")
+
+    # llama.cpp inference state
+    llamacpp_health = fetch_json(f"{llama_base}/health")
+    llamacpp_models = fetch_json(f"{llama_base}/v1/models")
+
+    # The endpoints may return any JSON value, so every payload is
+    # type-checked before use; an unexpected shape must not raise and take
+    # down the polling loop.
+    model_name = None
+    if isinstance(llamacpp_models, dict):
+        models = llamacpp_models.get("data")
+        if isinstance(models, list) and models and isinstance(models[0], dict):
+            model_name = models[0].get("id")
+
+    # A healthy server is "busy" when it reports slots being processed and
+    # "idle" otherwise; anything else leaves the state "unknown".
+    inference_state = "unknown"
+    if isinstance(llamacpp_health, dict) and llamacpp_health.get("status") == "ok":
+        if llamacpp_health.get("slots_processing", 0):
+            inference_state = "busy"
+        else:
+            inference_state = "idle"
+
+    # Check for /slots endpoint for is_processing detail. Every slot is
+    # inspected: the host is busy if ANY slot is processing, not just the
+    # first one.
+    slots = fetch_json(f"{llama_base}/slots")
+    if isinstance(slots, list) and any(
+        isinstance(slot, dict) and slot.get("is_processing") for slot in slots
+    ):
+        inference_state = "busy"
+
+    result = {
+        "host": name,
+        "gpu_name": h["gpu"],
+        "inference": {
+            "state": inference_state,
+            "model": model_name,
+        },
+        "hardware": gpu if gpu else None,
+        "online": gpu is not None,
+        "timestamp": now,
+    }
+
+    if gpu is not None:
+        last_seen[name] = now
+
+    # Age is measured from the last successful sidecar fetch. After a
+    # successful fetch it is zero, so the flags below only ever apply to a
+    # host whose sidecar is currently not answering.
+    if name in last_seen:
+        age = now - last_seen[name]
+        if age > DEAD_THRESHOLD:
+            result["online"] = False
+        elif age > STALE_THRESHOLD:
+            result["stale"] = True
+
+    return result
+
+
+def main():
+    """Poll every host in HOSTS forever and publish a JSON snapshot each cycle.
+
+    Each cycle collects one result per host, writes
+    ``{"updated": <cycle start time>, "gpus": [...]}`` to a temporary file
+    next to OUTPUT and renames it over OUTPUT, then sleeps for whatever
+    remains of INTERVAL. This function does not return.
+    """
+    print(f"GPU collector starting, output={OUTPUT}, interval={INTERVAL}s")
+    os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)
+    tmp_path = OUTPUT + ".tmp"
+
+    while True:
+        # Wall-clock time is what gets published; the monotonic clock paces
+        # the loop so that a system clock adjustment (NTP step, manual set)
+        # cannot produce a negative "elapsed" and a very long sleep.
+        start = time.time()
+        cycle_started = time.monotonic()
+
+        results = [collect_one(h) for h in HOSTS]
+
+        payload = {
+            "updated": start,
+            "gpus": results,
+        }
+
+        # Write to a sibling temp file and rename it into place so readers
+        # never observe a partially written snapshot.
+        with open(tmp_path, "w") as f:
+            json.dump(payload, f)
+        os.rename(tmp_path, OUTPUT)
+
+        elapsed = time.monotonic() - cycle_started
+        time.sleep(max(0, INTERVAL - elapsed))
+
+
+if __name__ == "__main__":
+    main()