# inference-harness/router/http_patch.py

# Insert streaming support before the gpu_resp call
import re
# One-shot patch script: rewrites the router so that its "/chat/completions"
# proxy call can relay Server-Sent-Events streams instead of always buffering
# the whole GPU response.  Run as a script; importing the module only defines
# the helpers below and does not touch the router file.

ROUTER_PATH = '/opt/inference-harness/router/router.py'

# Exact text of the non-streaming block expected in router.py.  It is matched
# byte-for-byte, so do not reformat it.
OLD_BLOCK = '''        start = time.time()
        gpu_resp = requests.post(
            gpu_url + "/chat/completions",
            json=req_data,
            headers={"Content-Type": "application/json", "Authorization": "Bearer not-needed"},
            timeout=120,
        )
        latency_ms = int((time.time() - start) * 1000)

        if gpu_resp.status_code != 200:
            log.error("GPU error: %s %s", gpu_resp.status_code, gpu_resp.text[:200])
            return jsonify({"error": "GPU backend returned " + str(gpu_resp.status_code)}), 502

        response_data = gpu_resp.json()
        response_data = fix_reasoning_content(response_data)

        response_data["routing"] = {
            "model": model, "reason": reason, "gpu": gpu_url,
            "tier": tier, "agent": agent, "latency_ms": latency_ms,
        }

        return jsonify(response_data)'''

# Streaming-aware replacement.  This MUST be a raw string: the "\n" escapes
# below have to reach router.py as backslash-n inside its string literals.
# In a normal string they would be written as real newlines and leave
# router.py with unterminated string literals.
NEW_BLOCK = r'''        start = time.time()
        is_stream = req_data.get("stream", False)

        gpu_resp = requests.post(
            gpu_url + "/chat/completions",
            json=req_data,
            headers={"Content-Type": "application/json", "Authorization": "Bearer not-needed"},
            timeout=120,
            stream=is_stream,
        )
        latency_ms = int((time.time() - start) * 1000)

        if gpu_resp.status_code != 200:
            log.error("GPU error: %s %s", gpu_resp.status_code, gpu_resp.text[:200])
            return jsonify({"error": "GPU backend returned " + str(gpu_resp.status_code)}), 502

        if is_stream:
            # Event streams are UTF-8; without this, requests may fall back to
            # ISO-8859-1 for text/* responses and garble non-ASCII tokens.
            gpu_resp.encoding = "utf-8"
            routing = {
                "model": model, "reason": reason, "gpu": gpu_url,
                "tier": tier, "agent": agent, "latency_ms": latency_ms,
            }

            def generate():
                # Local import so the patch works even if router.py never
                # imported json at module level.
                import json

                routing_sent = False
                done_seen = False
                try:
                    # Blank lines are forwarded too: they are what terminates
                    # each SSE event for the client.
                    for line in gpu_resp.iter_lines(decode_unicode=True):
                        if line.startswith("data: "):
                            payload = line[6:]
                            if payload.strip() == "[DONE]":
                                done_seen = True
                            elif not routing_sent:
                                # Inject routing into the first JSON chunk;
                                # if it cannot be parsed, forward it untouched
                                # and retry on the next data line.
                                try:
                                    chunk = json.loads(payload)
                                    chunk["routing"] = routing
                                except (ValueError, TypeError):
                                    pass
                                else:
                                    line = "data: " + json.dumps(chunk)
                                    routing_sent = True
                        yield line + "\n"
                    if not done_seen:
                        # Backend ended without the terminator clients expect.
                        yield "data: [DONE]\n\n"
                finally:
                    # Release the upstream connection, including when the
                    # client disconnects mid-stream.
                    gpu_resp.close()
            return Response(stream_with_context(generate()), mimetype="text/event-stream")

        response_data = gpu_resp.json()
        response_data = fix_reasoning_content(response_data)

        response_data["routing"] = {
            "model": model, "reason": reason, "gpu": gpu_url,
            "tier": tier, "agent": agent, "latency_ms": latency_ms,
        }

        return jsonify(response_data)'''

FLASK_IMPORT = 'from flask import Flask, request, jsonify'
FLASK_IMPORT_STREAMING = FLASK_IMPORT + ', Response, stream_with_context'


def apply_streaming_patch(code):
    """Return *code* (the router source) with streaming support patched in.

    Replaces every occurrence of OLD_BLOCK with NEW_BLOCK and extends the
    flask import line with ``Response`` and ``stream_with_context``.  The
    function is idempotent: source that already contains NEW_BLOCK is
    returned without a second replacement or a duplicated import.

    Raises:
        ValueError: if neither OLD_BLOCK nor NEW_BLOCK is present, i.e. the
            router source does not look like what this patch was written for.
    """
    if OLD_BLOCK in code:
        code = code.replace(OLD_BLOCK, NEW_BLOCK)
    elif NEW_BLOCK not in code:
        raise ValueError(
            "expected gpu_resp block not found in router source; nothing patched"
        )

    # FLASK_IMPORT is a prefix of FLASK_IMPORT_STREAMING, so check for the
    # extended form first or a re-run would append the names a second time.
    if FLASK_IMPORT_STREAMING not in code and FLASK_IMPORT in code:
        code = code.replace(FLASK_IMPORT, FLASK_IMPORT_STREAMING)
    return code


def main(path=ROUTER_PATH):
    """Patch the router file at *path* in place and report what happened."""
    with open(path, encoding='utf-8') as f:
        code = f.read()

    try:
        patched = apply_streaming_patch(code)
    except ValueError as err:
        # Fail loudly instead of rewriting the file and claiming success.
        raise SystemExit('error: ' + str(err)) from err

    if FLASK_IMPORT_STREAMING not in patched:
        print('warning: flask import line not recognised; make sure Response '
              'and stream_with_context are imported in ' + path)

    if patched == code:
        print('Streaming support already present; nothing to do')
        return

    with open(path, 'w', encoding='utf-8') as f:
        f.write(patched)
    print('Streaming support added')


if __name__ == '__main__':
    main()