From e95475f43194aac844b7763ed2fee2bf9bdaea94 Mon Sep 17 00:00:00 2001
From: SyslogBot
Date: Fri, 15 May 2026 22:25:56 +0000
Subject: [PATCH] Add GPU dashboard container + Nginx routing

---
 Dockerfile.gpu                 |  20 +++
 docker-compose.yml             |  54 ++++++++++
 gpu-dashboard/gpu.html         | 183 +++++++++++++++++++++++++++++++++
 gpu-dashboard/gpu_collector.py | 115 +++++++++++++++++++++
 gpu-router-docker.conf         | 122 ++++++++++++++++++++++
 5 files changed, 494 insertions(+)
 create mode 100644 Dockerfile.gpu
 create mode 100644 docker-compose.yml
 create mode 100644 gpu-dashboard/gpu.html
 create mode 100644 gpu-dashboard/gpu_collector.py
 create mode 100644 gpu-router-docker.conf

diff --git a/Dockerfile.gpu b/Dockerfile.gpu
new file mode 100644
index 0000000..6086f35
--- /dev/null
+++ b/Dockerfile.gpu
@@ -0,0 +1,20 @@
+FROM python:3.11-slim
+
+# Unbuffered stdout/stderr so the collector's log lines show up immediately.
+ENV PYTHONUNBUFFERED=1
+
+RUN pip install --no-cache-dir requests
+
+COPY gpu-dashboard/ /app/
+WORKDIR /app
+
+# Static web root. Seed gpu_metrics.json with an empty-but-valid payload:
+# a zero-byte file (from `touch`) is not parseable JSON before the first poll.
+RUN mkdir -p /app/public && \
+    cp gpu.html /app/public/ && \
+    echo '{"updated": 0, "gpus": []}' > /app/public/gpu_metrics.json
+
+EXPOSE 8092
+
+# Run the collector and the static file server side by side.
+CMD ["sh", "-c", "python3 gpu_collector.py & python3 -m http.server 8092 --directory /app/public & wait"]
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..8833d42
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,54 @@
+version: "3.8"
+
+services:
+  redis:
+    image: redis:7-alpine
+    restart: always
+    networks:
+      - gpu-router-net
+    volumes:
+      - redis-data:/data
+
+  queue-service:
+    build:
+      context: .
+      dockerfile: Dockerfile.queue
+    restart: always
+    networks:
+      - gpu-router-net
+    ports:
+      - "8091:8091"
+    depends_on:
+      - redis
+    environment:
+      - REDIS_HOST=redis
+      - REDIS_PORT=6379
+
+  dashboard:
+    build:
+      context: .
+      dockerfile: Dockerfile.dashboard
+    restart: always
+    networks:
+      - gpu-router-net
+    ports:
+      - "3001:3001"
+    depends_on:
+      - redis
+
+  gpu-dashboard:
+    build:
+      context: .
+      dockerfile: Dockerfile.gpu
+    restart: always
+    networks:
+      - gpu-router-net
+    ports:
+      - "8092:8092"
+
+networks:
+  gpu-router-net:
+    driver: bridge
+
+volumes:
+  redis-data:
diff --git a/gpu-dashboard/gpu.html b/gpu-dashboard/gpu.html
new file mode 100644
index 0000000..b461682
--- /dev/null
+++ b/gpu-dashboard/gpu.html
@@ -0,0 +1,183 @@
+
+
+
+
+
+GPU Monitor
+
+
+
+
+
+

← Workspace · GPU Monitor

+ Loading... +
+
+
+
#!/usr/bin/env python3
# File: gpu-dashboard/gpu_collector.py (new file in this patch, mode 100644)
"""GPU metrics collector.

Polls each host's GPU sidecar and llama.cpp server every ``INTERVAL`` seconds
and atomically writes the combined snapshot as JSON to ``OUTPUT``.
"""

import json
import os
import sys
import time
import urllib.request
from concurrent.futures import ThreadPoolExecutor

HOSTS = [
    {"name": "amdpve", "host": "192.168.68.15", "gpu": "AMD Strix Halo", "llama_port": 8080},
    {"name": "llmgpu", "host": "192.168.68.8", "gpu": "RTX 3090", "llama_port": 8080},
    {"name": "ocu-llm", "host": "192.168.68.110", "gpu": "RTX 5070", "llama_port": 8080},
]
SIDECAR_PORT = 8090  # port of the per-host GPU hardware sidecar
OUTPUT = "/app/public/gpu_metrics.json"
INTERVAL = 10
STALE_THRESHOLD = 30  # seconds before marking stale
DEAD_THRESHOLD = 60  # seconds before marking unreachable

# Host name -> time of the last successful sidecar poll.
last_seen = {}
# Host name -> sidecar payload returned by that last successful poll.
last_hardware = {}


def fetch_json(url, timeout=3):
    """Return the JSON document served at *url*, or ``None`` on any failure.

    Args:
        url: Address to GET.
        timeout: Socket timeout in seconds.
    """
    try:
        # The context manager closes the response even if decoding fails.
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            return json.loads(resp.read().decode())
    except Exception:
        # Deliberately broad: polling is best-effort, and an unreachable host,
        # an HTTP error and an unparseable body all just mean "no data".
        return None


def _model_name(models):
    """Return the id of the first entry of a ``/v1/models`` reply, if any."""
    if not isinstance(models, dict):
        return None
    entries = models.get("data")
    if isinstance(entries, list) and entries and isinstance(entries[0], dict):
        return entries[0].get("id")
    return None


def _inference_state(health, slots):
    """Derive ``"idle"``, ``"busy"`` or ``"unknown"`` from ``/health`` and ``/slots``."""
    state = "unknown"
    if isinstance(health, dict) and health.get("status") == "ok":
        state = "busy" if health.get("slots_processing", 0) else "idle"
    # Any processing slot means busy, not just the first one in the list.
    if isinstance(slots, list) and any(
        isinstance(slot, dict) and slot.get("is_processing") for slot in slots
    ):
        state = "busy"
    return state


def collect_one(h, *, fetch=None, now=None):
    """Collect GPU hardware + llama.cpp inference state for one host.

    A host stays ``online`` (reporting its last known hardware payload) until
    its sidecar has been silent for more than ``DEAD_THRESHOLD`` seconds; once
    the silence exceeds ``STALE_THRESHOLD`` the result also carries
    ``"stale": True``. A host whose sidecar never answered is offline.

    Args:
        h: Host entry with ``name``, ``host``, ``gpu`` and ``llama_port`` keys.
        fetch: Callable taking a URL and returning decoded JSON or ``None``;
            defaults to :func:`fetch_json`.
        now: Timestamp to stamp the result with; defaults to ``time.time()``.

    Returns:
        Dict with ``host``, ``gpu_name``, ``inference``, ``hardware``,
        ``online``, ``timestamp`` and, when applicable, ``stale``.
    """
    fetch = fetch_json if fetch is None else fetch
    now = time.time() if now is None else now
    name = h["name"]
    host = h["host"]
    llama = f"http://{host}:{h['llama_port']}"

    gpu = fetch(f"http://{host}:{SIDECAR_PORT}/")
    health = fetch(f"{llama}/health")
    models = fetch(f"{llama}/v1/models")
    slots = fetch(f"{llama}/slots")

    if gpu is not None:
        last_seen[name] = now
        last_hardware[name] = gpu

    seen_at = last_seen.get(name)
    age = None if seen_at is None else now - seen_at
    online = age is not None and age <= DEAD_THRESHOLD
    if not online:
        # Past the grace period the cached snapshot is no longer meaningful.
        last_hardware.pop(name, None)

    result = {
        "host": name,
        "gpu_name": h["gpu"],
        "inference": {
            "state": _inference_state(health, slots),
            "model": _model_name(models),
        },
        "hardware": (last_hardware.get(name) or None) if online else None,
        "online": online,
        "timestamp": now,
    }
    if online and age > STALE_THRESHOLD:
        result["stale"] = True
    return result


def collect_all(hosts=None, *, fetch=None, now=None):
    """Poll every host concurrently and return the results in input order.

    Polling in parallel keeps one unreachable host (several requests, each
    waiting for its timeout) from delaying the snapshot of the others.
    Each worker only reads and writes its own host's cache entries.
    """
    hosts = HOSTS if hosts is None else hosts
    if not hosts:
        return []
    with ThreadPoolExecutor(max_workers=len(hosts)) as pool:
        return list(pool.map(lambda h: collect_one(h, fetch=fetch, now=now), hosts))


def write_metrics(payload, path=OUTPUT):
    """Write *payload* as JSON to *path* via a temp file and atomic replace."""
    tmp = path + ".tmp"
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(payload, f)
    # os.replace overwrites the destination atomically, so a reader never
    # observes a partially written file.
    os.replace(tmp, path)


def main():
    """Run the poll/write loop forever, one cycle per ``INTERVAL`` seconds."""
    print(f"GPU collector starting, output={OUTPUT}, interval={INTERVAL}s", flush=True)
    os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)

    while True:
        start = time.time()
        payload = {
            "updated": start,
            "gpus": collect_all(),
        }
        try:
            write_metrics(payload)
        except OSError as exc:
            # Keep polling: a transient write failure should not stop the
            # collector for good.
            print(f"GPU collector: cannot write {OUTPUT}: {exc}", file=sys.stderr, flush=True)

        elapsed = time.time() - start
        time.sleep(max(0, INTERVAL - elapsed))


if __name__ == "__main__":
    main()

# ---------------------------------------------------------------------------
# Patch boundary: the next file is gpu-router-docker.conf (new file, mode
# 100644). Its leading header comments follow.
# ---------------------------------------------------------------------------
## Syslog GPU Router — Nginx Configuration (Docker-internal)
## Routes incoming agent requests to the appropriate GPU backend
## based on the X-Syslog-Model header.
upstream amdpve_pool {
    ## Strix Halo 395 — qwen3.6-35B-A3B (MoE) — Default workhorse
    server 192.168.68.15:8080;
}

upstream llmgpu_pool {
    ## RTX 3090 — qwen3.5-27B (Dense) — Heavy reasoning
    server 192.168.68.8:8080;
}

upstream ocu_llm_pool {
    ## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks
    server 192.168.68.110:8080;
}

upstream queue_service {
    ## Agent queue with circuit breaker (Docker container)
    server queue-service:8091;
}

upstream dashboard_service {
    ## Harness dashboard (Docker container)
    server dashboard:3001;
}

upstream gpu_dashboard_pool {
    ## GPU dashboard (Docker container). Addressed by its Compose service
    ## name, like queue-service and dashboard above; a generated container
    ## name (PROJECT-gpu-dashboard-1) changes with the Compose project name.
    server gpu-dashboard:8092;
}

## ------------------------------------------------------------------
## Mapping: X-Syslog-Model header → upstream backend
## Unknown or missing values fall through to the default pool.
## ------------------------------------------------------------------
map $http_x_syslog_model $gpu_upstream {
    default        amdpve_pool;
    "standard"     amdpve_pool;
    "heavy"        llmgpu_pool;
    "qwen3.5-27B"  llmgpu_pool;
    "light"        ocu_llm_pool;
    "gemma-4"      ocu_llm_pool;
}

## Rate limit zone — 10 req/s per IP, burst of 20
limit_req_zone $binary_remote_addr zone=perip:10m rate=10r/s;

server {
    listen 80;
    server_name _;

    ## ------------------------------------------------------------------
    ## Dashboard — observability UI
    ## Prefix locations are chosen by longest match, not by file order.
    ## The trailing slash keeps unrelated paths such as /dashboardfoo out,
    ## and makes the proxy_pass URI replacement map /dashboard/x to /x
    ## instead of //x.
    ## ------------------------------------------------------------------
    location = /dashboard {
        return 301 /dashboard/$is_args$args;
    }

    location /dashboard/ {
        proxy_pass http://dashboard_service/;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }

    ## ------------------------------------------------------------------
    ## GPU Dashboard — observability UI
    ## Same trailing-slash handling as /dashboard: /gpu/gpu.html is
    ## forwarded as /gpu.html, and /gpu_metrics.json no longer matches.
    ## ------------------------------------------------------------------
    location = /gpu {
        return 301 /gpu/$is_args$args;
    }

    location /gpu/ {
        proxy_pass http://gpu_dashboard_pool/;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }

    ## ------------------------------------------------------------------
    ## Main location — proxy to selected upstream
    ## ------------------------------------------------------------------
    location / {
        limit_req zone=perip burst=20 nodelay;
        ## NOTE(review): throttled requests are rejected with 503 as well, so
        ## the error_page below also sends them to @queue_fallback — confirm
        ## that is intended, otherwise use a distinct status such as 429.
        limit_req_status 503;
        proxy_pass http://$gpu_upstream;

        ## Preserve original host and headers
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        ## Forward the model header explicitly so backends can log it.
        ## (proxy_pass_header only controls which upstream *response*
        ## headers reach the client; it does nothing for request headers.)
        proxy_set_header X-Syslog-Model $http_x_syslog_model;

        ## Streaming support (SSE for LLM responses)
        proxy_buffering off;
        proxy_cache off;
        proxy_read_timeout 300s;
        proxy_send_timeout 300s;

        ## Retry policy — moves a failed request to the next server of the
        ## selected upstream. Each pool above lists a single server, so this
        ## only takes effect once a pool gains a second one.
        proxy_next_upstream error timeout http_502 http_503;
        proxy_next_upstream_tries 2;

        ## Add a response header for observability
        add_header X-Routed-To $gpu_upstream always;

        ## Fallback to queue when the selected GPU upstream is unavailable
        error_page 502 503 504 = @queue_fallback;
    }

    ## ------------------------------------------------------------------
    ## Queue fallback — enqueue when GPUs are unavailable
    ## The request is re-sent to the queue service as /enqueue.
    ## ------------------------------------------------------------------
    location @queue_fallback {
        rewrite ^ /enqueue break;
        proxy_pass http://queue_service;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_set_header Content-Type $content_type;
        proxy_pass_request_body on;
    }
}