diff --git a/router/router.py b/router/router.py index 2b03199..7fe2f31 100644 --- a/router/router.py +++ b/router/router.py @@ -366,22 +366,27 @@ def chat(): if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502 if is_stream: - # Buffer stream to capture timings from final SSE chunk + # Buffer SSE chunks, handle split lines for large responses chunks = [] stream_timings = {} + buf = "" # accumulate partial lines for raw in resp.iter_content(chunk_size=None, decode_unicode=True): if raw: cleaned = clean_unicode(raw) chunks.append(cleaned) - # Parse last content chunk (before [DONE]) for timings - if not stream_timings and '"timings"' in cleaned and '"predicted_n"' in cleaned: - try: - json_str = cleaned.replace("data: ", "").strip() - if json_str.startswith("{"): - tj = json.loads(json_str).get("timings", {}) - if tj: - stream_timings = tj - except: pass + buf += cleaned + # Process complete lines from buffer + while "\n" in buf: + line, buf = buf.split("\n", 1) + line = line.strip() + if line.startswith("data: ") and not stream_timings: + js = line[6:].strip() + if js.startswith("{") and "timings" in js and "predicted_n" in js: + try: + tj = json.loads(js).get("timings", {}) + if tj: + stream_timings = tj + except: pass # Store perf record with real token counts from stream if stream_timings: pt = stream_timings.get("prompt_n", 0)