May 19, 2026: Full harness update
- Model migration: gemma-4-E4B → qwen3.5-9b-vlm
- Dashboard reorder: Usage Over Time + GPU Metrics moved to top
- Router counter leak fix (gpu_decr in except handler)
- VLM slot upgrade: 1 → 2
- Automated maintenance cron job
- LiteLLM config update
This commit is contained in:
@@ -0,0 +1,90 @@
|
||||
"""Patch router.py so its chat-completions proxy can stream responses.

The script locates the block that posts to the GPU backend and returns
``jsonify(response_data)``, swaps it for a streaming-aware version, and makes
sure ``Response`` and ``stream_with_context`` are imported from flask.

The anchor is matched line by line with indentation ignored, and the
replacement is re-indented to wherever the anchor was found. If the anchor is
missing the file is left untouched and the script exits non-zero instead of
reporting success.
"""
import re
import sys
import textwrap

ROUTER_PATH = '/opt/inference-harness/router/router.py'

# Only the patched code contains this, so it identifies an already-patched file.
PATCH_MARKER = 'stream=is_stream,'

REQUIRED_FLASK_NAMES = ('Response', 'stream_with_context')

# Single-line "from flask import a, b, c" (no parentheses, continuation or comment).
FLASK_IMPORT_RE = re.compile(
    r'^(?P<head>[ \t]*from[ \t]+flask[ \t]+import[ \t]+)'
    r'(?P<names>[^()#\\\n]+?)[ \t]*$',
    re.MULTILINE,
)

# The block to find, written relative to its own first line.
OLD_BLOCK = r'''
start = time.time()
gpu_resp = requests.post(
    gpu_url + "/chat/completions",
    json=req_data,
    headers={"Content-Type": "application/json", "Authorization": "Bearer not-needed"},
    timeout=120,
)
latency_ms = int((time.time() - start) * 1000)

if gpu_resp.status_code != 200:
    log.error("GPU error: %s %s", gpu_resp.status_code, gpu_resp.text[:200])
    return jsonify({"error": "GPU backend returned " + str(gpu_resp.status_code)}), 502

response_data = gpu_resp.json()
response_data = fix_reasoning_content(response_data)

response_data["routing"] = {
    "model": model, "reason": reason, "gpu": gpu_url,
    "tier": tier, "agent": agent, "latency_ms": latency_ms,
}

return jsonify(response_data)
'''.strip('\n')

# The replacement. This MUST be a raw string: the injected source contains
# "\n" escapes that have to reach router.py as backslash-n, not as real
# newlines (which would split the string literals and break the file).
NEW_BLOCK = r'''
start = time.time()
is_stream = req_data.get("stream", False)

gpu_resp = requests.post(
    gpu_url + "/chat/completions",
    json=req_data,
    headers={"Content-Type": "application/json", "Authorization": "Bearer not-needed"},
    timeout=120,
    stream=is_stream,
)
latency_ms = int((time.time() - start) * 1000)

if gpu_resp.status_code != 200:
    log.error("GPU error: %s %s", gpu_resp.status_code, gpu_resp.text[:200])
    return jsonify({"error": "GPU backend returned " + str(gpu_resp.status_code)}), 502

if is_stream:
    # Event streams are UTF-8; without this requests guesses a legacy
    # charset for text/* responses and non-ASCII tokens get mangled.
    gpu_resp.encoding = "utf-8"

    # Stream response back to client
    def generate():
        import json  # local import: module-level json import is not guaranteed

        first = True
        saw_done = False
        try:
            # Blank lines are forwarded as well: they terminate each SSE event.
            for line in gpu_resp.iter_lines(decode_unicode=True):
                if first and line.startswith("data: "):
                    # Inject routing into first chunk
                    try:
                        chunk = json.loads(line[6:])
                        chunk["routing"] = {
                            "model": model, "reason": reason, "gpu": gpu_url,
                            "tier": tier, "agent": agent, "latency_ms": latency_ms,
                        }
                        line = "data: " + json.dumps(chunk)
                        first = False
                    except (ValueError, TypeError):
                        # Not a JSON object: forward unchanged, retry on the next chunk.
                        pass
                if line.strip() == "data: [DONE]":
                    saw_done = True
                yield line + "\n"
            if not saw_done:
                # Leading newline closes any event the backend left unterminated.
                yield "\ndata: [DONE]\n\n"
        finally:
            # Release the backend connection even if the client disconnects early.
            gpu_resp.close()

    return Response(stream_with_context(generate()), mimetype="text/event-stream")

response_data = gpu_resp.json()
response_data = fix_reasoning_content(response_data)

response_data["routing"] = {
    "model": model, "reason": reason, "gpu": gpu_url,
    "tier": tier, "agent": agent, "latency_ms": latency_ms,
}

return jsonify(response_data)
'''.strip('\n')


def block_pattern(block):
    """Compile a regex that matches *block* line by line, ignoring indentation.

    The indentation of the first matched line is captured as group ``indent``
    so the replacement can be re-indented to the same depth.
    """
    parts = []
    for text in block.split('\n'):
        stripped = text.strip()
        if stripped:
            parts.append(r'[ \t]*' + re.escape(stripped) + r'[ \t]*')
        else:
            parts.append(r'[ \t]*')
    # Turn the first line's leading-whitespace matcher into the named group.
    parts[0] = r'(?P<indent>[ \t]*)' + parts[0][len(r'[ \t]*'):]
    return re.compile('^' + '\n'.join(parts) + '$', re.MULTILINE)


def patch_router_source(code):
    """Replace every occurrence of OLD_BLOCK in *code* with NEW_BLOCK.

    Returns ``(patched_code, replacements)``; ``replacements`` is 0 when the
    anchor block was not found, in which case *code* is returned unchanged.
    """
    def replacement(match):
        # A callable is used so backslashes in NEW_BLOCK are not treated as
        # regex replacement escapes.
        return textwrap.indent(NEW_BLOCK, match.group('indent'))

    return block_pattern(OLD_BLOCK).subn(replacement, code)


def ensure_flask_imports(code):
    """Add the flask names the streaming code needs to the flask import line.

    Returns ``(code, ok)``. ``ok`` is False when no single-line
    ``from flask import ...`` statement was found, and *code* is then
    unchanged. Names already imported are never added twice, so the function
    is safe to run repeatedly.
    """
    match = FLASK_IMPORT_RE.search(code)
    if match is None:
        return code, False

    present = [name.strip() for name in match.group('names').split(',') if name.strip()]
    imported = {name.split()[0] for name in present}  # ignore "as alias" parts
    missing = [name for name in REQUIRED_FLASK_NAMES if name not in imported]
    if not missing:
        return code, True

    names = ', '.join(present + missing)
    return code[:match.start('names')] + names + code[match.end('names'):], True


def main(path=ROUTER_PATH):
    """Patch the router file at *path* in place; return a process exit code."""
    with open(path, encoding='utf-8') as f:
        code = f.read()

    patched, count = patch_router_source(code)
    if count == 0:
        if PATCH_MARKER in code:
            print('Streaming support already present; nothing to do')
            return 0
        print('ERROR: gpu_resp block not found in ' + path + '; file left unchanged',
              file=sys.stderr)
        return 1

    patched, imports_ok = ensure_flask_imports(patched)
    if not imports_ok:
        # Best effort, as before: the names may be imported some other way.
        print('WARNING: no single-line "from flask import ..." found; make sure '
              + ' and '.join(REQUIRED_FLASK_NAMES) + ' are imported',
              file=sys.stderr)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(patched)

    print('Streaming support added')
    return 0


if __name__ == '__main__':
    sys.exit(main())
|
||||
Reference in New Issue
Block a user