fix: buffer SSE chunks for large streaming responses

Mumuni 23K-token responses split the final SSE timings chunk
across HTTP frames. The old per-chunk check missed the timings
whenever that chunk was split. The handler now accumulates incoming
data in a buffer and parses only complete lines.

Also restores store_perf_record, which was accidentally dropped in a prior edit.
This commit is contained in:
Abiba
2026-05-29 09:45:41 +00:00
parent d53685d874
commit a3bca93d9b
+11 -6
View File
@@ -366,19 +366,24 @@ def chat():
if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
if is_stream:
# Buffer stream to capture timings from final SSE chunk
# Buffer SSE chunks, handle split lines for large responses
chunks = []
stream_timings = {}
buf = "" # accumulate partial lines
for raw in resp.iter_content(chunk_size=None, decode_unicode=True):
if raw:
cleaned = clean_unicode(raw)
chunks.append(cleaned)
# Parse last content chunk (before [DONE]) for timings
if not stream_timings and '"timings"' in cleaned and '"predicted_n"' in cleaned:
buf += cleaned
# Process complete lines from buffer
while "\n" in buf:
line, buf = buf.split("\n", 1)
line = line.strip()
if line.startswith("data: ") and not stream_timings:
js = line[6:].strip()
if js.startswith("{") and "timings" in js and "predicted_n" in js:
try:
json_str = cleaned.replace("data: ", "").strip()
if json_str.startswith("{"):
tj = json.loads(json_str).get("timings", {})
tj = json.loads(js).get("timings", {})
if tj:
stream_timings = tj
except: pass