May 19, 2026: Full harness update

- Model migration: gemma-4-E4B → qwen3.5-9b-vlm - Dashboard reorder: Usage Over Time + GPU Metrics to top - Router counter leak fix (gpu_decr in except handler) - VLM slot upgrade 1→2 - Automated maintenance cron job - LiteLLM config update
2026-05-19 15:03:47 +00:00
commit 28fc57c5c7
15 changed files with 1455 additions and 0 deletions
@@ -0,0 +1,90 @@
+# Insert streaming support before the gpu_resp call
+import re
+with open('/opt/inference-harness/router/router.py') as f:
+    code = f.read()
+
+# Find the gpu_resp block and replace with streaming-aware version
+old = '''        start = time.time()
+        gpu_resp = requests.post(
+            gpu_url + "/chat/completions",
+            json=req_data,
+            headers={"Content-Type": "application/json", "Authorization": "Bearer not-needed"},
+            timeout=120,
+        )
+        latency_ms = int((time.time() - start) * 1000)
+
+        if gpu_resp.status_code != 200:
+            log.error("GPU error: %s %s", gpu_resp.status_code, gpu_resp.text[:200])
+            return jsonify({"error": "GPU backend returned " + str(gpu_resp.status_code)}), 502
+
+        response_data = gpu_resp.json()
+        response_data = fix_reasoning_content(response_data)
+
+        response_data["routing"] = {
+            "model": model, "reason": reason, "gpu": gpu_url,
+            "tier": tier, "agent": agent, "latency_ms": latency_ms,
+        }
+
+        return jsonify(response_data)'''
+
+new = '''        start = time.time()
+        is_stream = req_data.get("stream", False)
+        
+        gpu_resp = requests.post(
+            gpu_url + "/chat/completions",
+            json=req_data,
+            headers={"Content-Type": "application/json", "Authorization": "Bearer not-needed"},
+            timeout=120,
+            stream=is_stream,
+        )
+        latency_ms = int((time.time() - start) * 1000)
+
+        if gpu_resp.status_code != 200:
+            log.error("GPU error: %s %s", gpu_resp.status_code, gpu_resp.text[:200])
+            return jsonify({"error": "GPU backend returned " + str(gpu_resp.status_code)}), 502
+
+        if is_stream:
+            # Stream response back to client
+            def generate():
+                first = True
+                for line in gpu_resp.iter_lines(decode_unicode=True):
+                    if line:
+                        if first and line.startswith("data: "):
+                            # Inject routing into first chunk
+                            try:
+                                chunk = json.loads(line[6:])
+                                chunk["routing"] = {
+                                    "model": model, "reason": reason, "gpu": gpu_url,
+                                    "tier": tier, "agent": agent, "latency_ms": latency_ms,
+                                }
+                                yield "data: " + json.dumps(chunk) + "\n\n"
+                                first = False
+                                continue
+                            except Exception:
+                                pass
+                        yield line + "\n"
+                yield "data: [DONE]\n\n"
+            return Response(stream_with_context(generate()), mimetype="text/event-stream")
+
+        response_data = gpu_resp.json()
+        response_data = fix_reasoning_content(response_data)
+
+        response_data["routing"] = {
+            "model": model, "reason": reason, "gpu": gpu_url,
+            "tier": tier, "agent": agent, "latency_ms": latency_ms,
+        }
+
+        return jsonify(response_data)'''
+
+code = code.replace(old, new)
+
+# Add missing import
+if 'from flask import Flask, request, jsonify' in code:
+    code = code.replace(
+        'from flask import Flask, request, jsonify',
+        'from flask import Flask, request, jsonify, Response, stream_with_context'
+    )
+
+with open('/opt/inference-harness/router/router.py', 'w') as f:
+    f.write(code)
+print('Streaming support added')