From b65ea22765fcff6cd96028eedc6c781be1db6125 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mumuni=20=F0=9F=A6=85=20=28Syslog=20Falcon=29?=
Date: Fri, 15 May 2026 21:35:13 +0000
Subject: [PATCH] Update Nginx Docker config

---
Note: the hunk below was revised by hand after generation; the blob id on
the "index" line was not regenerated.

 gpu-router-docker.conf | 133 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 gpu-router-docker.conf

diff --git a/gpu-router-docker.conf b/gpu-router-docker.conf
new file mode 100644
index 0000000..10e930f
--- /dev/null
+++ b/gpu-router-docker.conf
@@ -0,0 +1,133 @@
+## Syslog GPU Router — Nginx Configuration (Docker-internal)
+## Routes incoming agent requests to the appropriate GPU backend
+## based on the X-Syslog-Model header.
+
+upstream amdpve_pool {
+    ## Strix Halo 395 — qwen3.6-35B-A3B (MoE) — Default workhorse
+    server 192.168.68.15:8080;
+}
+
+upstream llmgpu_pool {
+    ## RTX 3090 — qwen3.5-27B (Dense) — Heavy reasoning
+    server 192.168.68.8:8080;
+}
+
+upstream ocu_llm_pool {
+    ## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks
+    server 192.168.68.110:8080;
+}
+
+upstream queue_service {
+    ## Agent queue with circuit breaker (Docker container)
+    server queue-service:8091;
+}
+
+upstream dashboard_service {
+    ## Harness dashboard (Docker container)
+    server dashboard:3001;
+}
+
+## ------------------------------------------------------------------
+## Mapping: X-Syslog-Model header → upstream backend
+## Requests with a missing or unlisted header value use the default pool.
+## ------------------------------------------------------------------
+map $http_x_syslog_model $gpu_upstream {
+    default        amdpve_pool;
+    "standard"     amdpve_pool;
+    "heavy"        llmgpu_pool;
+    "qwen3.5-27B"  llmgpu_pool;
+    "light"        ocu_llm_pool;
+    "gemma-4"      ocu_llm_pool;
+}
+
+## Rate limit zone — 10 req/s per client IP. The burst of 20 is set by the
+## limit_req directive in "location /" below.
+limit_req_zone $binary_remote_addr zone=perip:10m rate=10r/s;
+
+server {
+    listen 80;
+    server_name _;
+
+    ## ------------------------------------------------------------------
+    ## Dashboard — observability UI
+    ## (prefix locations match by longest prefix, so position relative to
+    ## the "/" catch-all does not matter)
+    ## ------------------------------------------------------------------
+
+    ## Bare /dashboard: redirect to the slash-terminated form handled below.
+    location = /dashboard {
+        ## Emit a relative Location header so the redirect stays valid when
+        ## the container port is published on a different host port.
+        absolute_redirect off;
+        return 301 /dashboard/;
+    }
+
+    ## The trailing slash on both the location and the proxy_pass URI makes
+    ## nginx replace the "/dashboard/" prefix with "/": /dashboard/x goes
+    ## upstream as /x (not //x), and /dashboardfoo no longer matches here.
+    location /dashboard/ {
+        proxy_pass http://dashboard_service/;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+    }
+
+    ## ------------------------------------------------------------------
+    ## Main location — proxy to selected upstream
+    ## ------------------------------------------------------------------
+    location / {
+        limit_req zone=perip burst=20 nodelay;
+        ## Throttled requests get 429. This must not be one of the codes
+        ## listed in error_page below, otherwise rate-limited requests
+        ## would be redirected to the queue instead of being rejected.
+        limit_req_status 429;
+        proxy_pass http://$gpu_upstream;
+
+        ## Preserve original host and headers
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+
+        ## Forward the model header so backends can log it. Client request
+        ## headers are forwarded by default; this makes the intent explicit.
+        ## (proxy_pass_header only affects upstream *response* headers.)
+        proxy_set_header X-Syslog-Model $http_x_syslog_model;
+
+        ## Streaming support (SSE for LLM responses)
+        proxy_buffering off;
+        proxy_cache off;
+        proxy_read_timeout 300s;
+        proxy_send_timeout 300s;
+
+        ## Basic failover — retry on error or timeout. Each GPU pool above
+        ## currently lists a single server, so a retry only has another
+        ## target once more servers are added to a pool.
+        proxy_next_upstream error timeout http_502 http_503;
+        proxy_next_upstream_tries 2;
+
+        ## Add a response header for observability
+        add_header X-Routed-To $gpu_upstream always;
+
+        ## Fallback to queue when the selected GPU upstream is unavailable.
+        ## proxy_intercept_errors is left at its default (off), so this
+        ## handles errors nginx raises itself (connect failure, timeout).
+        error_page 502 503 504 = @queue_fallback;
+    }
+
+    ## ------------------------------------------------------------------
+    ## Queue fallback — enqueue when GPUs are unavailable
+    ## ------------------------------------------------------------------
+    location @queue_fallback {
+        ## Every request that lands here is sent to the queue's /enqueue
+        ## endpoint, whatever the original URI was.
+        rewrite ^ /enqueue break;
+        proxy_pass http://queue_service;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_set_header Content-Type $content_type;
+        proxy_pass_request_body on;
+    }
+}