## Syslog GPU Router — Nginx Configuration (Docker-internal)
## Routes incoming agent requests to the appropriate GPU backend
## based on the X-Syslog-Model request header.
##
## upstream, map and limit_req_zone are http-context directives, so this
## file must be included from inside the main http {} block.

upstream amdpve_pool {
    ## Strix Halo 395 — qwen3.6-35B-A3B (MoE) — Default workhorse
    server 192.168.68.15:8080;
}

upstream llmgpu_pool {
    ## RTX 3090 — qwen3.5-27B (Dense) — Heavy reasoning
    server 192.168.68.8:8080;
}

upstream ocu_llm_pool {
    ## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks
    server 192.168.68.110:8080;
}

upstream queue_service {
    ## Agent queue with circuit breaker (Docker container)
    server queue-service:8091;
}

upstream dashboard_service {
    ## Harness dashboard (Docker container)
    server dashboard:3001;
}

upstream gpu_dashboard_pool {
    ## GPU dashboard (Docker container)
    server syslog-harness-gpu-dashboard-1:8092;
}

## ------------------------------------------------------------------
## Mapping: X-Syslog-Model header → upstream backend
## A missing or unrecognised header value falls through to "default".
## ------------------------------------------------------------------
map $http_x_syslog_model $gpu_upstream {
    default         amdpve_pool;
    "standard"      amdpve_pool;
    "heavy"         llmgpu_pool;
    "qwen3.5-27B"   llmgpu_pool;
    "light"         ocu_llm_pool;
    "gemma-4"       ocu_llm_pool;
}

## Rate limit zone — 10 req/s per client IP, 10 MB of shared state.
## The burst allowance (20) is configured on the limit_req directive below.
limit_req_zone $binary_remote_addr zone=perip:10m rate=10r/s;

server {
    listen 80;
    server_name _;

    ## ------------------------------------------------------------------
    ## Dashboard — observability UI
    ## Prefix locations are selected by longest match, not by their order
    ## in this file, so these always win over the "/" catch-all.
    ## ------------------------------------------------------------------

    ## Bare /dashboard: send the browser to the slash-terminated form.
    ## absolute_redirect off keeps the Location header relative, so the
    ## redirect stays valid when the container port is published on a
    ## different host port.
    location = /dashboard {
        absolute_redirect off;
        return 301 /dashboard/$is_args$args;
    }

    ## The trailing slash on both the location and the proxy_pass URI makes
    ## /dashboard/foo reach the backend as /foo (not //foo), and keeps
    ## unrelated paths such as /dashboardfoo out of this location.
    location /dashboard/ {
        proxy_pass http://dashboard_service/;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }

    ## ------------------------------------------------------------------
    ## GPU Dashboard — observability UI
    ## Same prefix-stripping scheme as /dashboard/ above.
    ## ------------------------------------------------------------------
    location = /gpu {
        absolute_redirect off;
        return 301 /gpu/$is_args$args;
    }

    location /gpu/ {
        proxy_pass http://gpu_dashboard_pool/;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }

    ## ------------------------------------------------------------------
    ## Main location — proxy to the upstream selected by the map above
    ## ------------------------------------------------------------------
    location / {
        limit_req zone=perip burst=20 nodelay;
        ## Reject throttled clients with 429. Using 503 here would match the
        ## error_page rule below and push rate-limited requests into the
        ## queue service, defeating the limit.
        limit_req_status 429;

        ## $gpu_upstream always holds the name of an upstream {} group
        ## defined above, so no resolver is required.
        proxy_pass http://$gpu_upstream;

        ## Preserve original host and headers
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        ## Forward the model header so backends can log it.
        ## (proxy_pass_header only affects upstream *response* headers;
        ## an empty value means the header is simply not sent.)
        proxy_set_header X-Syslog-Model $http_x_syslog_model;

        ## Streaming support (SSE for LLM responses)
        proxy_http_version 1.1;   # allows chunked responses from the backend
        proxy_buffering off;
        proxy_cache off;
        proxy_read_timeout 300s;
        proxy_send_timeout 300s;

        ## Basic failover — retry on error or timeout.
        ## NOTE: every pool above lists a single server, so there is no
        ## second peer to retry against until more servers are added.
        proxy_next_upstream error timeout http_502 http_503;
        proxy_next_upstream_tries 2;

        ## Add a response header for observability
        add_header X-Routed-To $gpu_upstream always;

        ## Fallback to the queue when the selected GPU upstream is unreachable.
        ## proxy_intercept_errors is left at its default (off), so this only
        ## catches statuses nginx generates itself (connect failure → 502,
        ## timeout → 504); a 5xx sent by a backend is relayed unchanged.
        error_page 502 503 504 = @queue_fallback;
    }

    ## ------------------------------------------------------------------
    ## Queue fallback — enqueue when GPUs are unavailable
    ## ------------------------------------------------------------------
    location @queue_fallback {
        ## Named locations cannot carry a URI on proxy_pass, so the target
        ## path is set with a rewrite; "break" stops further rewriting and
        ## proxies the request as /enqueue.
        rewrite ^ /enqueue break;
        proxy_pass http://queue_service;

        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_set_header Content-Type $content_type;
        proxy_pass_request_body on;
    }
}