Files
syslog-harness/gpu-router-docker.conf
T
SyslogBot b09a93f45c feat: Smart Queue Consumer implementation draft + architecture review
- SMART_QUEUE_IMPLEMENTATION.md: Complete implementation draft (1572 lines)
  with 10 quick-win fixes and full smart queue consumer rewrite
- ARCHITECTURE_REVIEW.md: 26-issue audit with prioritized findings
- Verified all 3 GPUs live: amdpve (73% util), llmgpu (idle), ocu_llm (idle)
- Redis 7.4.9 confirmed streams support
- GPU sidecar metrics verified on all hosts

Key fixes:
- QW-1: Dockerfile path mismatch (Dockerfile.queue -> queue-service/Dockerfile)
- QW-2: Nginx fallback only on ALL-GPU failure (not single GPU)
- QW-3: Container names fixed to Docker service names
- QW-4: Redis host default fixed (192.168.68.7 -> redis)
- QW-5: Dependency version pinning
- QW-7-10: Health checks, restart policy, Gunicorn, single-process collector

Smart queue features:
- Redis Streams + consumer groups
- GPU-aware load balancing via sidecar metrics
- Per-GPU circuit breakers with half-open recovery
- Adaptive backpressure (0-30 normal, 30-40 warn, 40-50 503, >50 open)
- Dead letter queue with retry endpoint
- Job ID tracking and /status/<job_id> API
2026-05-17 03:55:20 +00:00

123 lines
4.3 KiB
Plaintext

## Syslog GPU Router — Nginx Configuration (Docker-internal)
## Routes incoming agent requests to the appropriate GPU backend
## based on the X-Syslog-Model header.
## Default workhorse pool: Strix Halo 395 serving qwen3.6-35B-A3B (MoE).
upstream amdpve_pool {
    server 192.168.68.15:8080;
}
## Heavy-reasoning pool: RTX 3090 serving qwen3.5-27B (dense).
upstream llmgpu_pool {
    server 192.168.68.8:8080;
}
## Ultra-light pool: RTX 5070 serving gemma-4 (dense, 4B).
upstream ocu_llm_pool {
    server 192.168.68.110:8080;
}
## Agent queue with circuit breaker, reached by its Docker hostname.
upstream queue_service {
    server queue-service:8091;
}
## Harness dashboard container.
## NOTE(review): the hostname looks like a Compose-generated container name
## (project-service-index) rather than a service name; confirm it stays
## stable if the project is renamed or the service is scaled.
upstream dashboard_service {
    server syslog-harness-dashboard-1:3001;
}
## GPU dashboard container.
## NOTE(review): the hostname looks like a Compose-generated container name
## (project-service-index) rather than a service name; confirm it stays
## stable if the project is renamed or the service is scaled.
upstream gpu_dashboard_pool {
    server syslog-harness-gpu-dashboard-1:8092;
}
## ------------------------------------------------------------------
## Mapping: X-Syslog-Model header → upstream backend
## ------------------------------------------------------------------
map $http_x_syslog_model $gpu_upstream {
    ## Header absent or not listed below: use the default pool.
    default         amdpve_pool;

    ## Tier aliases
    "standard"      amdpve_pool;
    "heavy"         llmgpu_pool;
    "light"         ocu_llm_pool;

    ## Explicit model names
    "qwen3.5-27B"   llmgpu_pool;
    "gemma-4"       ocu_llm_pool;
}
## Rate-limit zone "perip": 10 MB of shared state keyed by the binary client
## address, capped at 10 requests per second per key. The burst allowance is
## not part of the zone; it is set on the limit_req directive that uses it.
limit_req_zone $binary_remote_addr
    zone=perip:10m
    rate=10r/s;
server {
listen 80;
server_name _;
## ------------------------------------------------------------------
## Dashboard — observability UI (prefix locations are chosen by longest match, so this takes precedence over the / catch-all regardless of order)
## ------------------------------------------------------------------
## Bare /dashboard: redirect to the slash-terminated form so the prefix
## location below serves it. The redirect is relative (absolute_redirect off)
## so the browser keeps whatever host and port it used to reach this proxy.
location = /dashboard {
    absolute_redirect off;
    return 301 /dashboard/$is_args$args;
}
## Everything under /dashboard/ is proxied to the harness dashboard with the
## "/dashboard/" prefix replaced by "/".
## The location ends in a slash on purpose: with "location /dashboard" and a
## proxy_pass URI of "/", a request for /dashboard/app.js was forwarded as
## //app.js, and unrelated paths such as /dashboardfoo were captured too.
location /dashboard/ {
    proxy_pass http://dashboard_service/;
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    ## Same forwarded-scheme header the other proxied locations send
    proxy_set_header X-Forwarded-Proto $scheme;
}
## ------------------------------------------------------------------
## GPU Dashboard — observability UI (prefix locations are chosen by longest match, so this takes precedence over the / catch-all regardless of order)
## ------------------------------------------------------------------
## Bare /gpu: redirect to the slash-terminated form so the prefix location
## below serves it. The redirect is relative (absolute_redirect off) so the
## browser keeps whatever host and port it used to reach this proxy.
location = /gpu {
    absolute_redirect off;
    return 301 /gpu/$is_args$args;
}
## Everything under /gpu/ is proxied to the GPU dashboard with the "/gpu/"
## prefix replaced by "/".
## The location ends in a slash on purpose: with "location /gpu" and a
## proxy_pass URI of "/", a request for /gpu/metrics was forwarded as
## //metrics, and any path merely starting with "/gpu" (for example
## /gpu-status) was captured instead of reaching the catch-all location.
location /gpu/ {
    proxy_pass http://gpu_dashboard_pool/;
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;
}
## ------------------------------------------------------------------
## Main location — proxy to selected upstream
## ------------------------------------------------------------------
location / {
    ## Per-client throttle using the "perip" zone: up to 20 excess requests
    ## are served immediately (nodelay), anything beyond that is rejected.
    ## Rejections use 429, not 503: the error_page directive at the end of
    ## this location traps 503, so a 503 here would push throttled clients
    ## into the queue fallback instead of turning them away.
    limit_req zone=perip burst=20 nodelay;
    limit_req_status 429;

    ## $gpu_upstream holds the name of the upstream group selected from the
    ## X-Syslog-Model request header.
    proxy_pass http://$gpu_upstream;

    ## Preserve original host and client details
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;

    ## Forward the model header so backends can log it. This must be
    ## proxy_set_header: proxy_pass_header controls which *response* headers
    ## from the upstream reach the client and has no effect on the request.
    proxy_set_header X-Syslog-Model $http_x_syslog_model;

    ## Streaming support (SSE for LLM responses): no response buffering or
    ## caching, and generous timeouts for long generations.
    proxy_buffering off;
    proxy_cache off;
    proxy_read_timeout 300s;
    proxy_send_timeout 300s;

    ## Retry on another server of the same upstream group on error/timeout.
    ## Each GPU pool currently lists a single server, so there is no second
    ## peer to retry on until more servers are added to a pool.
    proxy_next_upstream error timeout http_502 http_503;
    proxy_next_upstream_tries 2;

    ## Expose the chosen pool to the client for observability
    add_header X-Routed-To $gpu_upstream always;

    ## Hand the request to the queue when nginx itself fails to get a
    ## response from the selected pool (502/504, or a 503 it generates).
    ## NOTE(review): proxy_intercept_errors is not enabled here, so a
    ## 502/503/504 returned by a backend is passed to the client unchanged —
    ## confirm that is the intended behavior.
    error_page 502 503 504 = @queue_fallback;
}
## ------------------------------------------------------------------
## Queue fallback — enqueue when GPUs are unavailable
## ------------------------------------------------------------------
location @queue_fallback {
    ## Whatever path the client asked for, the queue service receives
    ## /enqueue; "break" stops rewrite processing and stays in this location.
    rewrite ^ /enqueue break;

    ## Forward the request body and its content type
    proxy_pass_request_body on;
    proxy_set_header Content-Type $content_type;

    ## Client and scheme details for the queue service
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;

    ## No URI part on proxy_pass, so the rewritten /enqueue path is sent
    proxy_pass http://queue_service;
}
}