Merge: add Abiba harness code — nginx, LiteLLM, router, dashboard, Redis
This commit is contained in:
@@ -0,0 +1,8 @@
|
|||||||
|
# Syslog Harness Environment
|
||||||
|
REDIS_HOST=192.168.68.8
|
||||||
|
REDIS_PORT=6379
|
||||||
|
AMDPVE_ENDPOINT=http://192.168.68.15:8080
|
||||||
|
LLMGPU_ENDPOINT=http://192.168.68.8:8080
|
||||||
|
OCU_LLM_ENDPOINT=http://192.168.68.110:8080
|
||||||
|
CIRCUIT_BREAKER_THRESHOLD=5
|
||||||
|
CIRCUIT_BREAKER_TIMEOUT=30
|
||||||
@@ -0,0 +1,71 @@
|
|||||||
|
# Syslog Harness — Production Migration Plan
|
||||||
|
|
||||||
|
## Current State (Development)
|
||||||
|
- **Host:** CT 114 (192.168.68.123)
|
||||||
|
- **Docker containers:** `syslog-queue` (:8091), `syslog-dashboard` (:3001)
|
||||||
|
- **Nginx:** Local on CT 114, routing to GPUs + Docker services
|
||||||
|
- **Status:** All components verified and operational
|
||||||
|
|
||||||
|
## Target State (Production)
|
||||||
|
- **Host:** New CT (e.g., `docker-vm` on 192.168.68.x)
|
||||||
|
- **Docker containers:** Same queue + dashboard services
|
||||||
|
- **Nginx:** Containerized on production CT
|
||||||
|
- **GPU backends:** Same (192.168.68.15, .8, .110)
|
||||||
|
|
||||||
|
## Migration Steps
|
||||||
|
|
||||||
|
### 1. Prepare Production CT
|
||||||
|
```bash
|
||||||
|
# Create new CT on Proxmox
|
||||||
|
# Install Docker
|
||||||
|
apt update && apt install -y docker.io docker-compose-plugin
|
||||||
|
|
||||||
|
# Clone the harness repo
|
||||||
|
git clone <repo-url> /root/syslog-harness
|
||||||
|
cd /root/syslog-harness
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Update docker-compose.yml for Production
|
||||||
|
- Change `REDIS_HOST` to production Redis IP
|
||||||
|
- Update GPU endpoint env vars if IPs change
|
||||||
|
- Add volume mounts for persistence
|
||||||
|
|
||||||
|
### 3. Build & Deploy
|
||||||
|
```bash
|
||||||
|
# Build images
|
||||||
|
docker compose build
|
||||||
|
|
||||||
|
# Start services
|
||||||
|
docker compose up -d
|
||||||
|
|
||||||
|
# Verify health
|
||||||
|
curl http://localhost:8091/health
|
||||||
|
curl http://localhost:3001/api/status
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Configure Nginx
|
||||||
|
- Copy `/etc/nginx/conf.d/gpu-router.conf` to production CT
|
||||||
|
- Update upstream IPs if needed
|
||||||
|
- Test and reload
|
||||||
|
|
||||||
|
### 5. DNS / Routing Update
|
||||||
|
- Point agent traffic to new CT IP
|
||||||
|
- Update Hermes config `inference_api_url`
|
||||||
|
- Test agent routing
|
||||||
|
|
||||||
|
### 6. Verification Checklist
|
||||||
|
- [ ] Queue service health check passes
|
||||||
|
- [ ] Dashboard API returns GPU health
|
||||||
|
- [ ] Nginx routes to correct GPU based on header
|
||||||
|
- [ ] Circuit breaker triggers on excess load
|
||||||
|
- [ ] Queue fallback works when GPUs down
|
||||||
|
- [ ] Agent requests reach correct model
|
||||||
|
|
||||||
|
## Rollback Plan
|
||||||
|
- Keep CT 114 running as backup
|
||||||
|
- Revert DNS/routing to CT 114 (192.168.68.123) if issues arise
|
||||||
|
- Docker containers can be stopped/started instantly
|
||||||
|
|
||||||
|
---
|
||||||
|
*Created: May 15, 2026*
|
||||||
|
*Status: Development verified, ready for production migration*
|
||||||
@@ -0,0 +1,106 @@
|
|||||||
|
## Syslog GPU Router — Nginx Configuration (Docker-internal)
|
||||||
|
## Routes incoming agent requests to the appropriate GPU backend
|
||||||
|
## based on the X-Syslog-Model header.
|
||||||
|
|
||||||
|
upstream amdpve_pool {
    ## Strix Halo 395 — qwen3.6-35B-A3B (MoE) — Default workhorse
    server 192.168.68.15:8080;
}

upstream llmgpu_pool {
    ## RTX 3090 — qwen3.5-27B (Dense) — Heavy reasoning
    server 192.168.68.8:8080;
}

upstream ocu_llm_pool {
    ## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks
    server 192.168.68.110:8080;
}

upstream queue_service {
    ## Agent queue with circuit breaker (Docker container)
    server queue-service:8091;
}

upstream dashboard_service {
    ## Harness dashboard (Docker container)
    server dashboard:3001;
}

## ------------------------------------------------------------------
## Mapping: X-Syslog-Model header → upstream backend
## A missing or unlisted header value falls through to `default`.
## ------------------------------------------------------------------
map $http_x_syslog_model $gpu_upstream {
    default         amdpve_pool;
    "standard"      amdpve_pool;
    "heavy"         llmgpu_pool;
    "qwen3.5-27B"   llmgpu_pool;
    "light"         ocu_llm_pool;
    "gemma-4"       ocu_llm_pool;
}

## Rate limit zone — 10 req/s per client IP. The burst of 20 is configured
## on the limit_req directive inside `location /`.
limit_req_zone $binary_remote_addr zone=perip:10m rate=10r/s;

server {
    listen 80;
    server_name _;

    ## ------------------------------------------------------------------
    ## Dashboard — observability UI
    ## Prefix locations are chosen by longest match, not by file order, so
    ## this always wins over `location /` for /dashboard/... requests.
    ## The trailing slash on BOTH the location and proxy_pass makes
    ## /dashboard/foo reach the dashboard as /foo (without it the upstream
    ## would see //foo). nginx answers a bare /dashboard with a 301 to
    ## /dashboard/.
    ## ------------------------------------------------------------------
    location /dashboard/ {
        proxy_pass http://dashboard_service/;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    }

    ## ------------------------------------------------------------------
    ## Main location — proxy to the upstream selected by the map above
    ## ------------------------------------------------------------------
    location / {
        ## NOTE(review): throttled requests are rejected with 503, and 503 is
        ## also listed in error_page below, so rate-limited clients appear to
        ## be redirected to @queue_fallback instead of being refused.
        ## Confirm this is intended; otherwise use a distinct status (429).
        limit_req zone=perip burst=20 nodelay;
        limit_req_status 503;

        proxy_pass http://$gpu_upstream;

        ## Preserve original host and client address information
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        ## Forward the model header so backends can log it. Client request
        ## headers are forwarded by default; this just makes it explicit.
        ## (proxy_pass_header only affects *response* headers coming back
        ## from the upstream, so it was the wrong directive here.)
        proxy_set_header X-Syslog-Model $http_x_syslog_model;

        ## Streaming support (SSE for LLM responses)
        proxy_buffering off;
        proxy_cache off;
        proxy_read_timeout 300s;
        proxy_send_timeout 300s;

        ## Basic failover — retry on error or timeout.
        ## NOTE: every upstream group above contains a single server, so
        ## there is currently no second server for nginx to retry against.
        proxy_next_upstream error timeout http_502 http_503;
        proxy_next_upstream_tries 2;

        ## Add a response header for observability
        add_header X-Routed-To $gpu_upstream always;

        ## Fallback to queue when the selected GPU upstream is unavailable.
        ## proxy_intercept_errors is left at its default (off), so only
        ## statuses generated by nginx itself are redirected; an error
        ## response returned by a live backend is passed to the client.
        error_page 502 503 504 = @queue_fallback;
    }

    ## ------------------------------------------------------------------
    ## Queue fallback — enqueue when GPUs are unavailable
    ## ------------------------------------------------------------------
    location @queue_fallback {
        ## Send every fallback request to the queue service's /enqueue route.
        rewrite ^ /enqueue break;
        proxy_pass http://queue_service;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_set_header Content-Type $content_type;
        proxy_pass_request_body on;
    }
}
|
||||||
+106
@@ -0,0 +1,106 @@
|
|||||||
|
## Syslog GPU Router — Nginx Configuration
|
||||||
|
## Routes incoming agent requests to the appropriate GPU backend
|
||||||
|
## based on the X-Syslog-Model header.
|
||||||
|
|
||||||
|
upstream amdpve_pool {
    ## Strix Halo 395 — qwen3.6-35B-A3B (MoE) — Default workhorse
    server 192.168.68.15:8080;
}

upstream llmgpu_pool {
    ## RTX 3090 — qwen3.5-27B (Dense) — Heavy reasoning
    server 192.168.68.8:8080;
}

upstream ocu_llm_pool {
    ## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks
    server 192.168.68.110:8080;
}

upstream queue_service {
    ## Agent queue with circuit breaker (Docker container)
    server 127.0.0.1:8091;
}

upstream dashboard_service {
    ## Harness dashboard (Docker container)
    server 127.0.0.1:3001;
}

## ------------------------------------------------------------------
## Mapping: X-Syslog-Model header → upstream backend
## ------------------------------------------------------------------
map $http_x_syslog_model $gpu_upstream {
    default         amdpve_pool;   # missing header → default workhorse
    "standard"      amdpve_pool;
    "heavy"         llmgpu_pool;
    "qwen3.5-27B"   llmgpu_pool;
    "light"         ocu_llm_pool;
    "gemma-4"       ocu_llm_pool;
}

## Rate limit zone — 10 req/s per client IP. The burst of 20 is configured
## on the limit_req directive inside `location /`.
## limit_req_zone is only allowed in the http context, so it must live
## here and not inside the server block (nginx refuses to load otherwise).
limit_req_zone $binary_remote_addr zone=perip:10m rate=10r/s;

server {
    listen 8080;
    server_name _;

    ## ------------------------------------------------------------------
    ## Dashboard — observability UI
    ## Prefix locations are chosen by longest match, not by file order, so
    ## this always wins over `location /` for /dashboard/... requests.
    ## The trailing slash on BOTH the location and proxy_pass makes
    ## /dashboard/foo reach the dashboard as /foo (without it the upstream
    ## would see //foo). nginx answers a bare /dashboard with a 301 to
    ## /dashboard/.
    ## ------------------------------------------------------------------
    location /dashboard/ {
        proxy_pass http://dashboard_service/;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    }

    ## ------------------------------------------------------------------
    ## Main location — proxy to the upstream selected by the map above
    ## ------------------------------------------------------------------
    location / {
        ## NOTE(review): throttled requests are rejected with 503, and 503 is
        ## also listed in error_page below, so rate-limited clients appear to
        ## be redirected to @queue_fallback instead of being refused.
        ## Confirm this is intended; otherwise use a distinct status (429).
        limit_req zone=perip burst=20 nodelay;
        limit_req_status 503;

        proxy_pass http://$gpu_upstream;

        ## Preserve original host and client address information
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        ## Forward the model header so backends can log it. Client request
        ## headers are forwarded by default; this just makes it explicit.
        ## (proxy_pass_header only affects *response* headers coming back
        ## from the upstream, so it was the wrong directive here.)
        proxy_set_header X-Syslog-Model $http_x_syslog_model;

        ## Streaming support (SSE for LLM responses)
        proxy_buffering off;
        proxy_cache off;
        proxy_read_timeout 300s;
        proxy_send_timeout 300s;

        ## Basic failover — retry on error or timeout.
        ## NOTE: every upstream group above contains a single server, so
        ## there is currently no second server for nginx to retry against.
        proxy_next_upstream error timeout http_502 http_503;
        proxy_next_upstream_tries 2;

        ## Add a response header for observability
        add_header X-Routed-To $gpu_upstream always;

        ## Fallback to queue when the selected GPU upstream is unavailable.
        ## proxy_intercept_errors is left at its default (off), so only
        ## statuses generated by nginx itself are redirected; an error
        ## response returned by a live backend is passed to the client.
        error_page 502 503 504 = @queue_fallback;
    }

    ## ------------------------------------------------------------------
    ## Queue fallback — enqueue when GPUs are unavailable
    ## ------------------------------------------------------------------
    location @queue_fallback {
        ## Send every fallback request to the queue service's /enqueue route.
        rewrite ^ /enqueue break;
        proxy_pass http://queue_service;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_set_header Content-Type $content_type;
        proxy_pass_request_body on;
    }
}
|
||||||
@@ -0,0 +1,121 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Syslog Inference Queue Service — Circuit breaker + request queuing.
|
||||||
|
|
||||||
|
Ports: 8091
|
||||||
|
Endpoints:
|
||||||
|
/health — liveness probe (Nginx upstream check)
|
||||||
|
/enqueue — POST inference request into queue (fallback from Nginx)
|
||||||
|
/status — GET queue depth + circuit breaker state
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import urllib.request
|
||||||
|
from flask import Flask, request, jsonify
|
||||||
|
|
||||||
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
# NOTE(review): the harness .env sets REDIS_HOST=192.168.68.8, which differs
# from the fallback below; confirm which address is the real Redis host.
REDIS_HOST = os.getenv("REDIS_HOST", "192.168.68.7")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
QUEUE_KEY = "inference:requests"

# Queue-depth thresholds: at OPEN /enqueue rejects new work with 503, at WARN
# requests are still accepted but the reported circuit state is "warn".
# NOTE(review): the harness .env defines CIRCUIT_BREAKER_THRESHOLD and
# CIRCUIT_BREAKER_TIMEOUT, but this module does not read either of them.
CIRCUIT_OPEN_THRESHOLD = 50
CIRCUIT_WARN_THRESHOLD = 30


def gpu_endpoint(env_name, default):
    """Return the ``host:port`` of a GPU backend.

    The value is taken from the environment variable *env_name* and falls
    back to *default* when the variable is unset or blank. A leading URL
    scheme (``http://``) and trailing slashes are stripped, because
    ``check_gpu_health`` prepends ``http://`` itself.
    """
    raw = os.getenv(env_name, "").strip() or default
    return raw.split("://", 1)[-1].rstrip("/")


# GPU endpoints for draining. The *_ENDPOINT variables are the ones declared
# in the harness .env; the defaults are the previously hard-coded addresses.
GPUS = {
    "amdpve": gpu_endpoint("AMDPVE_ENDPOINT", "192.168.68.15:8080"),
    "llmgpu": gpu_endpoint("LLMGPU_ENDPOINT", "192.168.68.8:8080"),
    "ocu_llm": gpu_endpoint("OCU_LLM_ENDPOINT", "192.168.68.110:8080"),
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_redis():
    """Return a reachable Redis client, or ``None`` if Redis cannot be used.

    ``redis.Redis(...)`` connects lazily, so constructing the client alone
    never fails when the server is down. The client is therefore pinged
    before it is handed out, which lets callers rely on a ``None`` result to
    report "Redis unavailable".

    Returns:
        A ``redis.Redis`` client with ``decode_responses=True``, or ``None``
        when the ``redis`` package is missing or the server does not answer.
    """
    try:
        import redis
    except ImportError:
        return None

    try:
        client = redis.Redis(
            host=REDIS_HOST,
            port=REDIS_PORT,
            decode_responses=True,
            # Bound connect/read time so a dead Redis host cannot hang a request.
            socket_connect_timeout=2,
            socket_timeout=2,
        )
        client.ping()
    except (redis.RedisError, OSError):
        return None
    return client
|
||||||
|
|
||||||
|
|
||||||
|
def get_queue_depth(r):
    """Return the length of the request queue stored under ``QUEUE_KEY``.

    Any failure while asking *r* for the list length (including *r* not
    being a usable client) is swallowed and reported as a depth of ``0``.
    """
    try:
        depth = r.llen(QUEUE_KEY)
    except Exception:
        depth = 0
    return depth
|
||||||
|
|
||||||
|
|
||||||
|
def check_gpu_health(endpoint, timeout=3):
    """Report whether a GPU backend answers its ``/v1/models`` endpoint.

    Args:
        endpoint: Backend address as ``host:port`` (no scheme).
        timeout: Seconds to wait for the HTTP request; defaults to 3.

    Returns:
        ``True`` only when the backend replies with HTTP 200; ``False`` for
        any other status or any error while connecting or reading.
    """
    try:
        probe = urllib.request.Request(
            f"http://{endpoint}/v1/models",
            headers={"User-Agent": "queue-service/1.0"},
        )
        # The context manager closes the response (and its socket) even on
        # the early return, so repeated probes do not leak connections.
        with urllib.request.urlopen(probe, timeout=timeout) as resp:
            return resp.status == 200
    except Exception:
        # Best-effort probe: any failure simply means "not healthy".
        return False
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/health")
def health():
    """Liveness probe: always answers 200 with a fixed JSON body."""
    body = {"status": "ok", "service": "queue-service"}
    return jsonify(body), 200
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/enqueue", methods=["POST"])
def enqueue():
    """Fallback endpoint — store an inference request in the Redis queue.

    Responses:
        202 — request stored; body carries its queue position and the
              circuit state ("warn" at or above the warn threshold,
              otherwise "closed").
        503 — Redis is unavailable, the push failed, or the queue depth
              has reached ``CIRCUIT_OPEN_THRESHOLD`` (circuit breaker open).
    """
    r = get_redis()
    if not r:
        return jsonify({"error": "Redis unavailable"}), 503

    depth = get_queue_depth(r)
    if depth >= CIRCUIT_OPEN_THRESHOLD:
        return jsonify({
            "error": "Circuit breaker OPEN",
            "queue_depth": depth,
            "threshold": CIRCUIT_OPEN_THRESHOLD
        }), 503

    # Store the raw body plus the caller's X-* headers so the request can be
    # replayed later.
    payload = request.get_data(as_text=True)
    headers = {k: v for k, v in request.headers if k.startswith("X-")}
    entry = json.dumps({
        "payload": payload,
        "headers": headers,
        "queued_at": time.time()
    })

    try:
        # RPUSH returns the list length after the push, which is exactly this
        # request's position; no second LLEN round trip is needed.
        new_depth = r.rpush(QUEUE_KEY, entry)
    except Exception:
        # Report a lost Redis connection as 503 instead of an unhandled 500.
        app.logger.exception("Failed to push request onto %s", QUEUE_KEY)
        return jsonify({"error": "Redis unavailable"}), 503

    return jsonify({
        "status": "queued",
        "position": new_depth,
        "circuit": "warn" if new_depth >= CIRCUIT_WARN_THRESHOLD else "closed"
    }), 202
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/status")
def status():
    """Report queue depth, circuit breaker state, GPU health and thresholds.

    The depth is ``-1`` when ``get_redis()`` yields no client. Each entry in
    ``GPUS`` is probed with ``check_gpu_health`` and reported as "up"/"down".
    """
    client = get_redis()
    depth = get_queue_depth(client) if client else -1

    if depth >= CIRCUIT_OPEN_THRESHOLD:
        circuit = "open"
    elif depth >= CIRCUIT_WARN_THRESHOLD:
        circuit = "warn"
    else:
        circuit = "closed"

    gpu_health = {
        name: ("up" if check_gpu_health(endpoint) else "down")
        for name, endpoint in GPUS.items()
    }

    limits = {
        "warn": CIRCUIT_WARN_THRESHOLD,
        "open": CIRCUIT_OPEN_THRESHOLD
    }
    return jsonify({
        "queue_depth": depth,
        "circuit_breaker": circuit,
        "gpu_health": gpu_health,
        "thresholds": limits
    })
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Direct execution: serve on port 8091, listening on every interface.
    app.run(port=8091, host="0.0.0.0")
|
||||||
Reference in New Issue
Block a user