Merge: add Abiba harness code — nginx, LiteLLM, router, dashboard, Redis

2026-05-16 18:53:31 +00:00
parent 7b6c6aabe1 b65ea22765
commit 3d42ea4767
8 changed files with 412 additions and 0 deletions
@@ -0,0 +1,8 @@
 # Syslog Harness Environment
 # Redis instance used for the inference request queue.
 # NOTE(review): the queue service's built-in REDIS_HOST fallback is
 # 192.168.68.7, not .8 — confirm which address is the real Redis host.
 REDIS_HOST=192.168.68.8
 REDIS_PORT=6379
 # GPU backend base URLs (same hosts as the nginx upstream pools).
 # NOTE(review): the queue service hard-codes these hosts in its GPUS table
 # (without the http:// scheme) and does not read these variables — confirm
 # which component consumes them.
 AMDPVE_ENDPOINT=http://192.168.68.15:8080
 LLMGPU_ENDPOINT=http://192.168.68.8:8080
 OCU_LLM_ENDPOINT=http://192.168.68.110:8080
 # NOTE(review): the queue service uses hard-coded thresholds
 # (CIRCUIT_OPEN_THRESHOLD = 50, CIRCUIT_WARN_THRESHOLD = 30) and does not
 # read the two variables below — confirm units and what consumes them.
 CIRCUIT_BREAKER_THRESHOLD=5
 CIRCUIT_BREAKER_TIMEOUT=30
@@ -0,0 +1,71 @@
 # Syslog Harness — Production Migration Plan
 ## Current State (Development)
 - **Host:** CT 114 (192.168.68.123)
 - **Docker containers:** `syslog-queue` (:8091), `syslog-dashboard` (:3001)
 - **Nginx:** Local on CT 114, routing to GPUs + Docker services
 - **Status:** All components verified and operational
 ## Target State (Production)
 - **Host:** New CT (e.g., `docker-vm` on 192.168.68.x)
 - **Docker containers:** Same queue + dashboard services
 - **Nginx:** Containerized on production CT
 - **GPU backends:** Same (192.168.68.15, .8, .110)
 ## Migration Steps
 ### 1. Prepare Production CT
 ```bash
 # Create new CT on Proxmox
 # Install Docker
 apt update && apt install -y docker.io docker-compose-plugin
 # Clone the harness repo
 git clone <repo-url> /root/syslog-harness
 cd /root/syslog-harness
 ```
 ### 2. Update docker-compose.yml for Production
 - Change `REDIS_HOST` to production Redis IP
 - Update GPU endpoint env vars if IPs change
 - Add volume mounts for persistence
 ### 3. Build & Deploy
 ```bash
 # Build images
 docker compose build
 # Start services
 docker compose up -d
 # Verify health
 curl http://localhost:8091/health
 curl http://localhost:3001/api/status
 ```
 ### 4. Configure Nginx
 - Copy `/etc/nginx/conf.d/gpu-router.conf` to production CT
 - Update upstream IPs if needed
 - Test and reload
 ### 5. DNS / Routing Update
 - Point agent traffic to new CT IP
 - Update Hermes config `inference_api_url`
 - Test agent routing
 ### 6. Verification Checklist
 - [ ] Queue service health check passes
 - [ ] Dashboard API returns GPU health
 - [ ] Nginx routes to correct GPU based on header
 - [ ] Circuit breaker triggers on excess load
 - [ ] Queue fallback works when GPUs down
 - [ ] Agent requests reach correct model
 ## Rollback Plan
 - Keep CT 114 running as backup
 - Revert DNS/routing to CT 114 (192.168.68.123) if issues arise
 - Docker containers can be stopped/started instantly
 ---
 *Created: May 15, 2026*
 *Status: Development verified, ready for production migration*
@@ -0,0 +1,106 @@
 ## Syslog GPU Router — Nginx Configuration (Docker-internal)
 ## Routes incoming agent requests to the appropriate GPU backend
 ## based on the X-Syslog-Model header.
 ## Upstream pools. The three GPU pool names are the values the
 ## $gpu_upstream map variable resolves to, so renaming a pool here
 ## requires the same rename in the map block. Every pool lists exactly
 ## one server, so there is no second peer inside a pool to fail over to.
 upstream amdpve_pool {
    ## Strix Halo 395 — qwen3.6-35B-A3B (MoE) — Default workhorse
    server 192.168.68.15:8080;
 }
 upstream llmgpu_pool {
    ## RTX 3090 — qwen3.5-27B (Dense) — Heavy reasoning
    server 192.168.68.8:8080;
 }
 upstream ocu_llm_pool {
    ## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks
    server 192.168.68.110:8080;
 }
 upstream queue_service {
    ## Agent queue with circuit breaker (Docker container)
    ## Addressed by hostname — presumably the Compose service name; confirm.
    server queue-service:8091;
 }
 upstream dashboard_service {
    ## Harness dashboard (Docker container)
    ## Addressed by hostname — presumably the Compose service name; confirm.
    server dashboard:3001;
 }
 ## ------------------------------------------------------------------
 ## Mapping: X-Syslog-Model header → upstream backend
 ## ------------------------------------------------------------------
 ## Sets $gpu_upstream to an upstream pool name based on the X-Syslog-Model
 ## request header. A missing header, or any value not listed below, falls
 ## through to `default`. Both tier aliases (standard/heavy/light) and two
 ## concrete model names are accepted.
 map $http_x_syslog_model $gpu_upstream {
    default          amdpve_pool;
    "standard"       amdpve_pool;
    "heavy"          llmgpu_pool;
    "qwen3.5-27B"    llmgpu_pool;
    "light"          ocu_llm_pool;
    "gemma-4"        ocu_llm_pool;
 }
 ## Rate limit zone — 10 req/s per client IP (the burst of 20 is set on the limit_req directive)
 limit_req_zone $binary_remote_addr zone=perip:10m rate=10r/s;
 server {
    listen 80;
    server_name _;
    ## ------------------------------------------------------------------
    ## Dashboard — observability UI
    ## The location prefix and the proxy_pass URI both end in "/", so nginx
    ## swaps the "/dashboard/" prefix for "/" cleanly:
    ##   /dashboard/api/status → /api/status
    ## (A bare "location /dashboard" would produce "//api/status" upstream
    ## and would also match unrelated paths such as /dashboard-old.)
    ## ------------------------------------------------------------------
    location = /dashboard {
        ## Canonicalise the slash-less form onto the proxied prefix.
        return 301 /dashboard/;
    }
    location /dashboard/ {
        proxy_pass http://dashboard_service/;
        proxy_set_header Host              $host;
        proxy_set_header X-Real-IP         $remote_addr;
        proxy_set_header X-Forwarded-For   $proxy_add_x_forwarded_for;
    }
    ## ------------------------------------------------------------------
    ## Main location — proxy to the pool selected by the $gpu_upstream map
    ## ------------------------------------------------------------------
    location / {
        ## NOTE(review): rate-limited requests are rejected with 503, which
        ## the error_page rule below also catches — so throttled requests
        ## are sent to the queue rather than refused. Confirm this is intended.
        limit_req zone=perip burst=20 nodelay;
        limit_req_status 503;
        proxy_pass http://$gpu_upstream;
        ## Preserve original host and headers
        proxy_set_header Host              $host;
        proxy_set_header X-Real-IP         $remote_addr;
        proxy_set_header X-Forwarded-For   $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        ## Forward the model header so backends can log it. Request headers
        ## are forwarded by default; this makes it explicit. (proxy_pass_header
        ## only controls *response* headers and had no effect here.)
        proxy_set_header X-Syslog-Model    $http_x_syslog_model;
        ## Streaming support (SSE for LLM responses)
        proxy_buffering off;
        proxy_cache     off;
        proxy_read_timeout  300s;
        proxy_send_timeout  300s;
        ## Basic failover — retry on error or timeout.
        ## Each GPU pool currently has a single server, so there is no
        ## alternate peer to retry against until more servers are added.
        proxy_next_upstream error timeout http_502 http_503;
        proxy_next_upstream_tries 2;
        ## Add a response header for observability
        add_header X-Routed-To $gpu_upstream always;
        ## Fallback to queue when the selected GPU upstream is unreachable.
        ## NOTE(review): without proxy_intercept_errors, only errors generated
        ## by nginx itself (connect failure, timeout, rate limit) reach this
        ## handler; a 502/503/504 returned by the backend is passed through.
        error_page 502 503 504 = @queue_fallback;
    }
    ## ------------------------------------------------------------------
    ## Queue fallback — enqueue when GPUs are unavailable
    ## ------------------------------------------------------------------
    location @queue_fallback {
        ## Send every fallback request to the queue service's /enqueue route.
        rewrite ^ /enqueue break;
        proxy_pass http://queue_service;
        proxy_set_header Host              $host;
        proxy_set_header X-Real-IP         $remote_addr;
        proxy_set_header X-Forwarded-For   $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_set_header Content-Type      $content_type;
        proxy_pass_request_body            on;
    }
 }
@@ -0,0 +1,106 @@
 ## Syslog GPU Router — Nginx Configuration
 ## Routes incoming agent requests to the appropriate GPU backend
 ## based on the X-Syslog-Model header.
 ## Upstream pools. The three GPU pool names are the values the
 ## $gpu_upstream map variable resolves to, so renaming a pool here
 ## requires the same rename in the map block. Every pool lists exactly
 ## one server, so there is no second peer inside a pool to fail over to.
 upstream amdpve_pool {
    ## Strix Halo 395 — qwen3.6-35B-A3B (MoE) — Default workhorse
    server 192.168.68.15:8080;
 }
 upstream llmgpu_pool {
    ## RTX 3090 — qwen3.5-27B (Dense) — Heavy reasoning
    server 192.168.68.8:8080;
 }
 upstream ocu_llm_pool {
    ## RTX 5070 — gemma-4 (Dense 4B) — Ultra-light tasks
    server 192.168.68.110:8080;
 }
 upstream queue_service {
    ## Agent queue with circuit breaker (Docker container)
    ## Reached on loopback — assumes the container publishes 8091 on this host.
    server 127.0.0.1:8091;
 }
 upstream dashboard_service {
    ## Harness dashboard (Docker container)
    ## Reached on loopback — assumes the container publishes 3001 on this host.
    server 127.0.0.1:3001;
 }
 ## ------------------------------------------------------------------
 ## Mapping: X-Syslog-Model header → upstream backend
 ## ------------------------------------------------------------------
 ## Sets $gpu_upstream to an upstream pool name based on the X-Syslog-Model
 ## request header. Any value not listed below also falls through to
 ## `default`. Both tier aliases (standard/heavy/light) and two concrete
 ## model names are accepted.
 map $http_x_syslog_model $gpu_upstream {
    default          amdpve_pool;   # missing header → default workhorse
    "standard"       amdpve_pool;
    "heavy"          llmgpu_pool;
    "qwen3.5-27B"    llmgpu_pool;
    "light"          ocu_llm_pool;
    "gemma-4"        ocu_llm_pool;
 }
 # Rate limit zone — 10 req/s per client IP (burst of 20 is set on limit_req).
 # limit_req_zone is only valid in the http context, so it must be declared
 # outside the server block; inside it nginx rejects the configuration.
 limit_req_zone $binary_remote_addr zone=perip:10m rate=10r/s;
 server {
    listen 8080;
    server_name _;
    ## ------------------------------------------------------------------
    ## Dashboard — observability UI
    ## The location prefix and the proxy_pass URI both end in "/", so nginx
    ## swaps the "/dashboard/" prefix for "/" cleanly:
    ##   /dashboard/api/status → /api/status
    ## (A bare "location /dashboard" would produce "//api/status" upstream
    ## and would also match unrelated paths such as /dashboard-old.)
    ## ------------------------------------------------------------------
    location = /dashboard {
        ## Canonicalise the slash-less form onto the proxied prefix.
        return 301 /dashboard/;
    }
    location /dashboard/ {
        proxy_pass http://dashboard_service/;
        proxy_set_header Host              $host;
        proxy_set_header X-Real-IP         $remote_addr;
        proxy_set_header X-Forwarded-For   $proxy_add_x_forwarded_for;
    }
    ## ------------------------------------------------------------------
    ## Main location — proxy to the pool selected by the $gpu_upstream map
    ## ------------------------------------------------------------------
    location / {
        ## NOTE(review): rate-limited requests are rejected with 503, which
        ## the error_page rule below also catches — so throttled requests
        ## are sent to the queue rather than refused. Confirm this is intended.
        limit_req zone=perip burst=20 nodelay;
        limit_req_status 503;
        proxy_pass http://$gpu_upstream;
        ## Preserve original host and headers
        proxy_set_header Host              $host;
        proxy_set_header X-Real-IP         $remote_addr;
        proxy_set_header X-Forwarded-For   $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        ## Forward the model header so backends can log it. Request headers
        ## are forwarded by default; this makes it explicit. (proxy_pass_header
        ## only controls *response* headers and had no effect here.)
        proxy_set_header X-Syslog-Model    $http_x_syslog_model;
        ## Streaming support (SSE for LLM responses)
        proxy_buffering off;
        proxy_cache     off;
        proxy_read_timeout  300s;
        proxy_send_timeout  300s;
        ## Basic failover — retry on error or timeout.
        ## Each GPU pool currently has a single server, so there is no
        ## alternate peer to retry against until more servers are added.
        proxy_next_upstream error timeout http_502 http_503;
        proxy_next_upstream_tries 2;
        ## Add a response header for observability
        add_header X-Routed-To $gpu_upstream always;
        ## Fallback to queue when the selected GPU upstream is unreachable.
        ## NOTE(review): without proxy_intercept_errors, only errors generated
        ## by nginx itself (connect failure, timeout, rate limit) reach this
        ## handler; a 502/503/504 returned by the backend is passed through.
        error_page 502 503 504 = @queue_fallback;
    }
    ## ------------------------------------------------------------------
    ## Queue fallback — enqueue when GPUs are unavailable
    ## ------------------------------------------------------------------
    location @queue_fallback {
        ## Send every fallback request to the queue service's /enqueue route.
        rewrite ^ /enqueue break;
        proxy_pass http://queue_service;
        proxy_set_header Host              $host;
        proxy_set_header X-Real-IP         $remote_addr;
        proxy_set_header X-Forwarded-For   $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_set_header Content-Type      $content_type;
        proxy_pass_request_body            on;
    }
 }
@@ -0,0 +1,121 @@
 #!/usr/bin/env python3
 """Syslog Inference Queue Service — Circuit breaker + request queuing.
 Ports: 8091
 Endpoints:
  /health          — liveness probe (Nginx upstream check)
  /enqueue         — POST inference request into queue (fallback from Nginx)
  /status          — GET queue depth + circuit breaker state
 """
 import json
 import os
 import sys
 import time
 import urllib.request
 from flask import Flask, request, jsonify
 # Flask application object; the route decorators below register on it.
 app = Flask(__name__)
 # Configuration
 # Redis connection settings, overridable through the environment.
 # NOTE(review): the harness env file sets REDIS_HOST=192.168.68.8 while the
 # fallback here is 192.168.68.7 — confirm which address is correct.
 REDIS_HOST = os.getenv("REDIS_HOST", "192.168.68.7")
 REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
 # Redis list key that holds the queued requests.
 QUEUE_KEY = "inference:requests"
 # Queue-depth thresholds: at or above OPEN new requests are rejected, at or
 # above WARN the circuit is reported as "warn".
 # NOTE(review): these are hard-coded; the CIRCUIT_BREAKER_THRESHOLD /
 # CIRCUIT_BREAKER_TIMEOUT variables in the env file are not read here.
 CIRCUIT_OPEN_THRESHOLD = 50
 CIRCUIT_WARN_THRESHOLD = 30
 # GPU endpoints for draining
 # Values are "host:port" without a scheme; check_gpu_health() prepends
 # "http://". Only /status reads this table in the code visible here.
 GPUS = {
    "amdpve": "192.168.68.15:8080",
    "llmgpu": "192.168.68.8:8080",
    "ocu_llm": "192.168.68.110:8080",
 }
 def get_redis():
     """Return a connected Redis client, or None when Redis is unusable.

     redis.Redis() connects lazily, so constructing the client never fails
     on an unreachable server. A PING is issued here so that callers'
     "Redis unavailable" branches actually trigger when the server is down.

     Returns:
         A redis.Redis client (responses decoded to str) that answered PING,
         or None if the redis package cannot be imported or the server does
         not respond.
     """
     try:
         # Imported lazily so a missing package degrades to "unavailable".
         import redis
         client = redis.Redis(
             host=REDIS_HOST,
             port=REDIS_PORT,
             decode_responses=True,
             # Bound the wait so a dead Redis cannot stall a request.
             socket_connect_timeout=2,
             socket_timeout=2,
         )
         client.ping()
         return client
     except Exception:
         # Deliberately broad: import errors and connection errors both
         # mean "no usable Redis" to the callers.
         return None
 def get_queue_depth(r):
     """Return the length of the request queue, or 0 if it cannot be read.

     Args:
         r: Redis client exposing ``llen``.

     Returns:
         The LLEN result for QUEUE_KEY; 0 when the lookup raises.
     """
     try:
         pending = r.llen(QUEUE_KEY)
     except Exception:
         # Best-effort: any failure is reported as an empty queue.
         pending = 0
     return pending
 def check_gpu_health(endpoint):
     """Report whether a GPU backend answers its model-listing endpoint.

     Issues ``GET http://<endpoint>/v1/models`` with a 3-second timeout.

     Args:
         endpoint: Backend address as "host:port" (no scheme).

     Returns:
         True if the response status is 200; False on any other status or
         on any error (connection failure, timeout, malformed address).
     """
     try:
         req = urllib.request.Request(
             f"http://{endpoint}/v1/models",
             headers={"User-Agent": "queue-service/1.0"},
         )
         # Context manager closes the response so probes do not leak sockets.
         with urllib.request.urlopen(req, timeout=3) as resp:
             return resp.status == 200
     except Exception:
         # Any failure counts as "down"; the probe must never raise.
         return False
@app.route("/health")
def health():
    """Liveness probe: always answers 200 with a fixed JSON body."""
    body = {"status": "ok", "service": "queue-service"}
    return jsonify(body), 200
@app.route("/enqueue", methods=["POST"])
def enqueue():
    """Queue an inference request that Nginx could not deliver to a GPU.

    The raw request body and every ``X-`` request header are stored as one
    JSON entry appended to the Redis list QUEUE_KEY.

    Returns:
        202 with ``status``, ``position`` (list length after the push) and
        ``circuit`` ("warn" or "closed") on success.
        503 when Redis is unavailable, when the push fails, or when the
        queue depth has reached CIRCUIT_OPEN_THRESHOLD.
    """
    r = get_redis()
    if not r:
        return jsonify({"error": "Redis unavailable"}), 503
    depth = get_queue_depth(r)
    if depth >= CIRCUIT_OPEN_THRESHOLD:
        return jsonify({
            "error": "Circuit breaker OPEN",
            "queue_depth": depth,
            "threshold": CIRCUIT_OPEN_THRESHOLD
        }), 503
    # Store the request in queue
    entry = json.dumps({
        "payload": request.get_data(as_text=True),
        "headers": {k: v for k, v in request.headers if k.startswith("X-")},
        "queued_at": time.time()
    })
    try:
        # RPUSH returns the list length after the push, which is this
        # request's position; no second LLEN round trip is needed.
        new_depth = r.rpush(QUEUE_KEY, entry)
    except Exception:
        # Broad on purpose, matching the helpers: any Redis failure here
        # means the request was not queued, so report it as unavailable
        # instead of letting it surface as a 500.
        return jsonify({"error": "Redis unavailable"}), 503
    return jsonify({
        "status": "queued",
        "position": new_depth,
        "circuit": "warn" if new_depth >= CIRCUIT_WARN_THRESHOLD else "closed"
    }), 202
@app.route("/status")
def status():
    """Report queue depth, circuit-breaker state and per-GPU health.

    ``queue_depth`` is -1 when no Redis client could be obtained. Each GPU
    in GPUS is probed in turn and reported as "up" or "down".
    """
    client = get_redis()
    depth = get_queue_depth(client) if client else -1

    # Classify the depth against the two thresholds, highest first.
    if depth >= CIRCUIT_OPEN_THRESHOLD:
        circuit = "open"
    elif depth >= CIRCUIT_WARN_THRESHOLD:
        circuit = "warn"
    else:
        circuit = "closed"

    gpu_health = {
        name: ("up" if check_gpu_health(endpoint) else "down")
        for name, endpoint in GPUS.items()
    }

    report = {
        "queue_depth": depth,
        "circuit_breaker": circuit,
        "gpu_health": gpu_health,
        "thresholds": {
            "warn": CIRCUIT_WARN_THRESHOLD,
            "open": CIRCUIT_OPEN_THRESHOLD
        }
    }
    return jsonify(report)
 # Script entry point: serve on all interfaces, port 8091 (the port the
 # nginx queue_service upstream targets).
 # NOTE(review): app.run() starts Flask's built-in server — confirm whether
 # production should run behind a WSGI server instead.
 if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8091)