"""Insert streaming support around the gpu_resp call in the router.

The script rewrites ``router.py`` in place:

* the blocking ``requests.post(...)`` / ``jsonify(...)`` section is replaced
  by a version that relays the backend's server-sent events when the client
  asked for ``"stream": true``;
* ``Response`` and ``stream_with_context`` are added to the router's
  ``from flask import ...`` line when they are not imported yet.

The script is idempotent, and it refuses to touch the file when the section
it is supposed to replace cannot be found.
"""
import re
import sys
import textwrap

ROUTER_PATH = '/opt/inference-harness/router/router.py'

# Names the injected code needs from flask.
REQUIRED_FLASK_NAMES = ("Response", "stream_with_context")

# Text that only exists in an already patched router.
PATCHED_MARKER = "stream_with_context(generate())"

# Section of router.py to replace.  It is matched token by token with any
# whitespace in between, so the layout below does not have to agree with
# router.py byte for byte.
OLD_BLOCK = '''    start = time.time()
    gpu_resp = requests.post(
        gpu_url + "/chat/completions",
        json=req_data,
        headers={"Content-Type": "application/json", "Authorization": "Bearer not-needed"},
        timeout=120,
    )
    latency_ms = int((time.time() - start) * 1000)

    if gpu_resp.status_code != 200:
        log.error("GPU error: %s %s", gpu_resp.status_code, gpu_resp.text[:200])
        return jsonify({"error": "GPU backend returned " + str(gpu_resp.status_code)}), 502

    response_data = gpu_resp.json()
    response_data = fix_reasoning_content(response_data)
    response_data["routing"] = {
        "model": model,
        "reason": reason,
        "gpu": gpu_url,
        "tier": tier,
        "agent": agent,
        "latency_ms": latency_ms,
    }
    return jsonify(response_data)'''

# Streaming-aware replacement.  This MUST be a raw string: the "\n" escapes
# have to reach router.py as backslash-n, not as real line breaks inside a
# string literal (which would make the patched router a syntax error).
NEW_BLOCK = r'''    start = time.time()
    is_stream = req_data.get("stream", False)
    gpu_resp = requests.post(
        gpu_url + "/chat/completions",
        json=req_data,
        headers={"Content-Type": "application/json", "Authorization": "Bearer not-needed"},
        timeout=120,
        stream=is_stream,
    )
    latency_ms = int((time.time() - start) * 1000)

    if gpu_resp.status_code != 200:
        log.error("GPU error: %s %s", gpu_resp.status_code, gpu_resp.text[:200])
        return jsonify({"error": "GPU backend returned " + str(gpu_resp.status_code)}), 502

    if is_stream:
        # Event streams are UTF-8 by definition; without this, requests falls
        # back to ISO-8859-1 for text/* responses that declare no charset.
        gpu_resp.encoding = "utf-8"

        # Stream response back to client
        def generate():
            # Local import: the router is not guaranteed to import json.
            import json

            injected = False
            saw_done = False
            try:
                for line in gpu_resp.iter_lines(decode_unicode=True):
                    if line.startswith("data: "):
                        payload = line[6:]
                        if payload.strip() == "[DONE]":
                            saw_done = True
                        elif not injected:
                            # Inject routing into first JSON chunk
                            try:
                                chunk = json.loads(payload)
                                chunk["routing"] = {
                                    "model": model,
                                    "reason": reason,
                                    "gpu": gpu_url,
                                    "tier": tier,
                                    "agent": agent,
                                    "latency_ms": latency_ms,
                                }
                                line = "data: " + json.dumps(chunk)
                                injected = True
                            except (ValueError, TypeError):
                                # Not a JSON object: forward it untouched.
                                pass
                    # Blank lines are forwarded as well: they are what
                    # terminates each server-sent event.
                    yield line + "\n"
                if not saw_done:
                    yield "data: [DONE]\n\n"
            finally:
                # Release the backend connection even if the client leaves.
                gpu_resp.close()

        return Response(stream_with_context(generate()), mimetype="text/event-stream")

    response_data = gpu_resp.json()
    response_data = fix_reasoning_content(response_data)
    response_data["routing"] = {
        "model": model,
        "reason": reason,
        "gpu": gpu_url,
        "tier": tier,
        "agent": agent,
        "latency_ms": latency_ms,
    }
    return jsonify(response_data)'''

# OLD_BLOCK as a whitespace-insensitive pattern; "indent" captures the
# indentation of the first matched line so the replacement can reuse it.
ANCHOR_RE = re.compile(
    r"^(?P<indent>[ \t]*)"
    + r"\s+".join(re.escape(token) for token in OLD_BLOCK.split()),
    re.MULTILINE,
)

# A single-line "from flask import a, b, c" statement (no parentheses,
# no line continuation, no trailing comment).
FLASK_IMPORT_RE = re.compile(r"^from flask import ([^\n#()\\]+)$", re.MULTILINE)


class PatchError(Exception):
    """Raised when the router source cannot be patched safely."""


def missing_flask_names(code):
    """Return the REQUIRED_FLASK_NAMES not bound by a flask import in *code*.

    Only single-line ``from flask import ...`` statements are inspected; a
    name imported under an alias (``Response as R``) counts as missing.
    """
    bound = set()
    for match in FLASK_IMPORT_RE.finditer(code):
        for part in match.group(1).split(","):
            words = part.split()
            if words:
                bound.add(words[-1])
    return [name for name in REQUIRED_FLASK_NAMES if name not in bound]


def ensure_flask_imports(code):
    """Append any missing required names to the first flask import line.

    Returns *code* unchanged when nothing is missing or when no single-line
    ``from flask import ...`` statement exists (the caller warns about that).
    """
    missing = missing_flask_names(code)
    first = FLASK_IMPORT_RE.search(code)
    if not missing or first is None:
        return code
    extended = first.group(0).rstrip() + ", " + ", ".join(missing)
    return code[:first.start()] + extended + code[first.end():]


def patch_router_source(code):
    """Return *code* with streaming support added.

    Already patched source is returned with at most its imports completed,
    so running the patch twice is harmless.

    Raises:
        PatchError: the section to replace is absent and the source does not
            look patched either.
    """
    body = textwrap.dedent(NEW_BLOCK)
    # A function is used as the replacement so that backslashes in the new
    # code are inserted literally instead of being read as regex escapes.
    patched, count = ANCHOR_RE.subn(
        lambda match: textwrap.indent(body, match.group("indent")), code
    )
    if count == 0 and PATCHED_MARKER not in code:
        raise PatchError(
            "gpu_resp block not found; router.py does not match the expected code"
        )
    return ensure_flask_imports(patched)


def main(path=ROUTER_PATH):
    """Patch the router at *path* in place and return a process exit status."""
    with open(path, encoding="utf-8") as f:
        code = f.read()

    try:
        patched = patch_router_source(code)
    except PatchError as exc:
        # Leave the file untouched rather than report a patch that did not apply.
        print("error:", exc, file=sys.stderr)
        return 1

    missing = missing_flask_names(patched)
    if missing:
        print(
            "warning: could not verify a flask import for: " + ", ".join(missing),
            file=sys.stderr,
        )

    if patched == code:
        print('Streaming support already present; nothing to do')
        return 0

    with open(path, 'w', encoding="utf-8") as f:
        f.write(patched)
    print('Streaming support added')
    return 0


if __name__ == "__main__":
    sys.exit(main())