feat: Smart Queue Consumer implementation draft + architecture review
- SMART_QUEUE_IMPLEMENTATION.md: Complete implementation draft (1572 lines) with 10 quick-win fixes and a full smart queue consumer rewrite
- ARCHITECTURE_REVIEW.md: 26-issue audit with prioritized findings
- Verified all 3 GPUs live: amdpve (73% util), llmgpu (idle), ocu_llm (idle)
- Redis 7.4.9 confirmed to support streams
- GPU sidecar metrics verified on all hosts

Key fixes:
- QW-1: Dockerfile path mismatch (Dockerfile.queue -> queue-service/Dockerfile)
- QW-2: Nginx fallback only on ALL-GPU failure (not single GPU)
- QW-3: Container names fixed to Docker service names
- QW-4: Redis host default fixed (192.168.68.7 -> redis)
- QW-5: Dependency version pinning
- QW-7-10: Health checks, restart policy, Gunicorn, single-process collector

Smart queue features:
- Redis Streams + consumer groups
- GPU-aware load balancing via sidecar metrics
- Per-GPU circuit breakers with half-open recovery
- Adaptive backpressure (0-30 normal, 30-40 warn, 40-50 503, >50 open)
- Dead letter queue with retry endpoint
- Job ID tracking and /status/<job_id> API
This commit is contained in:
@@ -0,0 +1,115 @@
|
||||
#!/usr/bin/env python3
|
||||
"""GPU metrics collector — polls sidecars + llama.cpp every 10s, writes to Workspace."""
|
||||
|
||||
import urllib.request, json, time, os
|
||||
|
||||
# Inference hosts polled on every cycle. For each entry, collect_one() queries
# the GPU sidecar on port 8090 and the llama.cpp server on "llama_port".
HOSTS = [
    {"name": "amdpve", "host": "192.168.68.15", "gpu": "AMD Strix Halo", "llama_port": 8080},
    {"name": "llmgpu", "host": "192.168.68.8", "gpu": "RTX 3090", "llama_port": 8080},
    {"name": "ocu-llm", "host": "192.168.68.110", "gpu": "RTX 5070", "llama_port": 8080},
]

# Path of the JSON snapshot that main() rewrites on every cycle.
OUTPUT = "/root/hermes-workspace/public/gpu_metrics.json"

# Target seconds between the start of two consecutive polling cycles.
INTERVAL = 10

STALE_THRESHOLD = 30  # seconds before marking stale
DEAD_THRESHOLD = 60  # seconds before marking unreachable

# Host name -> timestamp of the last cycle in which its GPU sidecar answered.
last_seen = {}
|
||||
|
||||
|
||||
def fetch_json(url, timeout=3):
    """Fetch *url* and return its body parsed as JSON, or None on any failure.

    Args:
        url: URL to request.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The decoded JSON value, or ``None`` if the request, the decoding or
        the JSON parsing failed for any reason.
    """
    try:
        # The context manager closes the response (and its socket) even when
        # reading or parsing raises, instead of leaving it to the GC.
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            return json.loads(resp.read().decode())
    except Exception:
        # Deliberately best-effort: an unreachable or misbehaving host must
        # never take the collector down; callers treat None as "no data".
        return None
|
||||
|
||||
|
||||
def _first_model_id(models_payload):
    """Return the "id" of the first entry under "data" in a /v1/models reply, or None."""
    if not isinstance(models_payload, dict):
        return None
    models = models_payload.get("data") or []
    if isinstance(models, list) and models and isinstance(models[0], dict):
        return models[0].get("id")
    return None


def _inference_state(health, slots):
    """Derive "idle" / "busy" / "unknown" from the /health and /slots replies."""
    state = "unknown"
    if isinstance(health, dict) and health.get("status", "") == "ok":
        # A healthy server counts as idle unless it reports slots in use.
        state = "busy" if health.get("slots_processing", 0) else "idle"

    # /slots gives per-slot detail; any slot that is processing means busy,
    # not only the first one.
    if isinstance(slots, list) and any(
        isinstance(slot, dict) and slot.get("is_processing") for slot in slots
    ):
        state = "busy"
    return state


def collect_one(h):
    """Collect GPU hardware + llama.cpp inference state for one host.

    Args:
        h: Host entry with the keys "name", "host", "gpu" and "llama_port".

    Returns:
        A dict with the keys "host", "gpu_name", "inference" (containing
        "state" and "model"), "hardware", "online" and "timestamp", plus
        "stale": True when the sidecar has not answered for more than
        STALE_THRESHOLD (but at most DEAD_THRESHOLD) seconds.

    Side effects:
        Updates the module-level ``last_seen`` entry for this host whenever
        the sidecar answers.
    """
    name = h["name"]
    host = h["host"]
    now = time.time()
    llama_base = f"http://{host}:{h['llama_port']}"

    # GPU hardware from sidecar
    gpu = fetch_json(f"http://{host}:8090/")

    # llama.cpp inference state
    llamacpp_health = fetch_json(f"{llama_base}/health")
    llamacpp_models = fetch_json(f"{llama_base}/v1/models")
    slots = fetch_json(f"{llama_base}/slots")

    result = {
        "host": name,
        "gpu_name": h["gpu"],
        "inference": {
            "state": _inference_state(llamacpp_health, slots),
            "model": _first_model_id(llamacpp_models),
        },
        "hardware": gpu if gpu else None,
        "online": gpu is not None,
        "timestamp": now,
    }

    if gpu is not None:
        last_seen[name] = now

    if name in last_seen:
        age = now - last_seen[name]
        # NOTE(review): "online" is already False whenever the sidecar poll
        # fails, so the DEAD_THRESHOLD branch only stops "stale" from being
        # set after that point. Confirm whether a grace period was intended.
        if age > DEAD_THRESHOLD:
            result["online"] = False
        elif age > STALE_THRESHOLD:
            result["stale"] = True

    return result
|
||||
|
||||
|
||||
def main():
    """Poll every host in HOSTS forever and publish the results to OUTPUT.

    Each cycle collects one result per host, writes the payload
    ``{"updated": <cycle start time>, "gpus": [...]}`` to ``OUTPUT + ".tmp"``
    and then moves it over OUTPUT so readers never see a half-written file.
    The loop sleeps for whatever remains of INTERVAL after the cycle's work.
    This function does not return.
    """
    print(f"GPU collector starting, output={OUTPUT}, interval={INTERVAL}s")
    os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)
    tmp_path = OUTPUT + ".tmp"

    while True:
        start = time.time()  # wall-clock stamp published in the payload
        tick = time.monotonic()  # pacing clock, immune to wall-clock jumps

        payload = {
            "updated": start,
            "gpus": [collect_one(h) for h in HOSTS],
        }

        try:
            with open(tmp_path, "w") as f:
                json.dump(payload, f)
            # os.replace overwrites an existing destination on every
            # platform; os.rename fails on Windows if OUTPUT already exists.
            os.replace(tmp_path, OUTPUT)
        except OSError as exc:
            # A transient write failure (full disk, permissions) should cost
            # one snapshot, not the whole collector.
            print(f"GPU collector: failed to write {OUTPUT}: {exc}")

        elapsed = time.monotonic() - tick
        time.sleep(max(0, INTERVAL - elapsed))
|
||||
|
||||
|
||||
# Start the collector only when run as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user