fix: buffer SSE chunks for large streaming responses

Mumuni 23K-token responses split the final SSE timings chunk
across HTTP frames. The old per-chunk check missed the timings
whenever that chunk was split. The stream handler now accumulates
incoming data in a buffer and parses only complete lines.

Also fixed: restored store_perf_record, which was accidentally
dropped in a prior edit.
This commit is contained in:
Abiba
2026-05-29 09:45:41 +00:00
parent d53685d874
commit a3bca93d9b
+15 -10
View File
@@ -366,22 +366,27 @@ def chat():
if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502 if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
if is_stream: if is_stream:
# Buffer stream to capture timings from final SSE chunk # Buffer SSE chunks, handle split lines for large responses
chunks = [] chunks = []
stream_timings = {} stream_timings = {}
buf = "" # accumulate partial lines
for raw in resp.iter_content(chunk_size=None, decode_unicode=True): for raw in resp.iter_content(chunk_size=None, decode_unicode=True):
if raw: if raw:
cleaned = clean_unicode(raw) cleaned = clean_unicode(raw)
chunks.append(cleaned) chunks.append(cleaned)
# Parse last content chunk (before [DONE]) for timings buf += cleaned
if not stream_timings and '"timings"' in cleaned and '"predicted_n"' in cleaned: # Process complete lines from buffer
try: while "\n" in buf:
json_str = cleaned.replace("data: ", "").strip() line, buf = buf.split("\n", 1)
if json_str.startswith("{"): line = line.strip()
tj = json.loads(json_str).get("timings", {}) if line.startswith("data: ") and not stream_timings:
if tj: js = line[6:].strip()
stream_timings = tj if js.startswith("{") and "timings" in js and "predicted_n" in js:
except: pass try:
tj = json.loads(js).get("timings", {})
if tj:
stream_timings = tj
except: pass
# Store perf record with real token counts from stream # Store perf record with real token counts from stream
if stream_timings: if stream_timings:
pt = stream_timings.get("prompt_n", 0) pt = stream_timings.get("prompt_n", 0)