b09a93f45c
- SMART_QUEUE_IMPLEMENTATION.md: Complete implementation draft (1572 lines) with 10 quick-win fixes and full smart queue consumer rewrite - ARCHITECTURE_REVIEW.md: 26-issue audit with prioritized findings - Verified all 3 GPUs live: amdpve (73% util), llmgpu (idle), ocu_llm (idle) - Redis 7.4.9 confirmed streams support - GPU sidecar metrics verified on all hosts Key fixes: - QW-1: Dockerfile path mismatch (Dockerfile.queue -> queue-service/Dockerfile) - QW-2: Nginx fallback only on ALL-GPU failure (not single GPU) - QW-3: Container names fixed to Docker service names - QW-4: Redis host default fixed (192.168.68.7 -> redis) - QW-5: Dependency version pinning - QW-7-10: Health checks, restart policy, Gunicorn, single-process collector Smart queue features: - Redis Streams + consumer groups - GPU-aware load balancing via sidecar metrics - Per-GPU circuit breakers with half-open recovery - Adaptive backpressure (0-30 normal, 30-40 warn, 40-50 503, >50 open) - Dead letter queue with retry endpoint - Job ID tracking and /status/<job_id> API
116 lines
3.3 KiB
Python
116 lines
3.3 KiB
Python
#!/usr/bin/env python3
|
|
"""GPU metrics collector — polls sidecars + llama.cpp every 10s, writes to Workspace."""
|
|
|
|
import urllib.request, json, time, os
|
|
|
|
# GPU hosts to poll. Each entry carries the display name, the host address,
# a human-readable GPU label and the port of that host's llama.cpp server.
HOSTS = [
    {
        "name": "amdpve",
        "host": "192.168.68.15",
        "gpu": "AMD Strix Halo",
        "llama_port": 8080,
    },
    {
        "name": "llmgpu",
        "host": "192.168.68.8",
        "gpu": "RTX 3090",
        "llama_port": 8080,
    },
    {
        "name": "ocu-llm",
        "host": "192.168.68.110",
        "gpu": "RTX 5070",
        "llama_port": 8080,
    },
]

# Path of the JSON snapshot that main() rewrites on every cycle.
OUTPUT = "/root/hermes-workspace/public/gpu_metrics.json"

# Target number of seconds between the starts of two polling cycles.
INTERVAL = 10

# Seconds since the last successful sidecar poll before a host is flagged stale.
STALE_THRESHOLD = 30

# Seconds since the last successful sidecar poll before a host is marked
# unreachable.
DEAD_THRESHOLD = 60

# Host name -> time.time() of the most recent successful sidecar poll.
last_seen = {}
|
|
|
|
|
|
def fetch_json(url, timeout=3):
    """Fetch *url* and return its body parsed as JSON, or None on any failure.

    This is a best-effort probe: an invalid URL, a connection error, a
    timeout, an HTTP error status, or a body that is not valid JSON all
    yield None instead of raising, so callers can treat None as "endpoint
    unavailable".

    Args:
        url: Address to request.
        timeout: Socket timeout in seconds for the request.

    Returns:
        The decoded JSON value, or None if the request or the parsing failed.
    """
    try:
        req = urllib.request.Request(url)
        # The context manager closes the response (and its socket) even when
        # reading or decoding fails; this function runs in a tight polling
        # loop, so leaked connections would otherwise accumulate.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode())
    except Exception:
        # Deliberately broad: any failure simply means "no data this cycle".
        return None
|
|
|
|
|
|
def _first_model_id(models_payload):
    """Return the ``id`` of the first entry in a ``/v1/models`` payload, or None."""
    if not isinstance(models_payload, dict):
        return None
    entries = models_payload.get("data") or []
    if isinstance(entries, list) and entries and isinstance(entries[0], dict):
        return entries[0].get("id")
    return None


def _inference_state(health, slots):
    """Derive "idle" / "busy" / "unknown" from the ``/health`` and ``/slots`` payloads."""
    state = "unknown"
    if isinstance(health, dict) and health.get("status", "") == "ok":
        # A healthy server counts as idle unless it reports slots in use.
        state = "busy" if health.get("slots_processing", 0) else "idle"

    # /slots gives per-slot detail. Inspect every slot, not just the first:
    # a server with several slots is busy as soon as any one is processing.
    if isinstance(slots, list) and any(
        isinstance(slot, dict) and slot.get("is_processing") for slot in slots
    ):
        state = "busy"
    return state


def collect_one(h):
    """Collect GPU hardware + llama.cpp inference state for one host.

    Args:
        h: Host entry with the keys ``name``, ``host``, ``gpu`` and
            ``llama_port``.

    Returns:
        A dict with the keys ``host``, ``gpu_name``, ``inference`` (``state``
        and ``model``), ``hardware`` (the sidecar payload, or None),
        ``online`` and ``timestamp``. A ``stale`` key set to True is added
        when the last successful sidecar poll is older than STALE_THRESHOLD
        but not older than DEAD_THRESHOLD.

    Side effects:
        Records the current time in ``last_seen`` for this host whenever the
        sidecar poll succeeds.
    """
    name = h["name"]
    host = h["host"]
    now = time.time()
    llama_base = f"http://{host}:{h['llama_port']}"

    # GPU hardware from sidecar
    gpu = fetch_json(f"http://{host}:8090/")

    # llama.cpp inference state. Every payload is validated by the helpers
    # before use, so an unexpected JSON shape degrades to "unknown"/None
    # instead of raising and taking the whole collector down.
    llamacpp_health = fetch_json(f"{llama_base}/health")
    llamacpp_models = fetch_json(f"{llama_base}/v1/models")
    slots = fetch_json(f"{llama_base}/slots")

    result = {
        "host": name,
        "gpu_name": h["gpu"],
        "inference": {
            "state": _inference_state(llamacpp_health, slots),
            "model": _first_model_id(llamacpp_models),
        },
        "hardware": gpu if gpu else None,
        "online": gpu is not None,
        "timestamp": now,
    }

    if gpu is not None:
        last_seen[name] = now

    # NOTE(review): "online" is already False whenever the sidecar poll fails,
    # so the DEAD_THRESHOLD branch never changes the result; only the "stale"
    # flag is effective here. Confirm whether hosts were meant to stay online
    # until DEAD_THRESHOLD has elapsed.
    if name in last_seen:
        age = now - last_seen[name]
        if age > DEAD_THRESHOLD:
            result["online"] = False
        elif age > STALE_THRESHOLD:
            result["stale"] = True

    return result
|
|
|
|
|
|
def _write_json_atomic(path, payload):
    """Write *payload* as JSON to ``path + ".tmp"``, then move it over *path*."""
    tmp_path = path + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(payload, f)
    # os.replace overwrites an existing destination in a single step, so
    # readers never observe a partially written file.
    os.replace(tmp_path, path)


def main():
    """Poll every host in HOSTS forever and publish the snapshot to OUTPUT.

    Each cycle collects one result per host, writes
    ``{"updated": <cycle start>, "gpus": [...]}`` to OUTPUT, and then sleeps
    for whatever remains of INTERVAL. A cycle that raises is reported on
    stdout and skipped, so one bad cycle does not end the collector.

    Raises:
        OSError: If the output directory cannot be created at startup.
    """
    print(f"GPU collector starting, output={OUTPUT}, interval={INTERVAL}s", flush=True)
    os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)

    while True:
        start = time.time()
        try:
            payload = {
                "updated": start,
                "gpus": [collect_one(h) for h in HOSTS],
            }
            _write_json_atomic(OUTPUT, payload)
        except Exception as exc:
            # Top-level boundary of a long-running daemon: report the failure
            # and retry on the next cycle instead of exiting.
            print(f"GPU collector cycle failed: {exc!r}", flush=True)

        # Sleep only for the rest of the interval; a slow cycle is followed
        # immediately by the next one.
        elapsed = time.time() - start
        time.sleep(max(0, INTERVAL - elapsed))
|
|
|
|
|
|
# Start the polling loop only when this file is executed as a script, so that
# importing the module does not block.
if __name__ == "__main__":
    main()
|