fix: buffer SSE chunks for large streaming responses

Mumuni 23K-token responses split the final SSE timings chunk across HTTP frames. The old per-chunk check missed the timings whenever that chunk was split. The stream loop now accumulates incoming data in a buffer and parses only complete lines. Also restores store_perf_record, which was accidentally dropped in a prior edit.
2026-05-29 09:45:41 +00:00
parent d53685d874
commit a3bca93d9b
1 changed files with 15 additions and 10 deletions
@@ -366,19 +366,24 @@ def chat():
        
        if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
        if is_stream:
-            # Buffer stream to capture timings from final SSE chunk
+            # Buffer SSE chunks, handle split lines for large responses
            chunks = []
            stream_timings = {}
+            buf = ""  # accumulate partial lines
            for raw in resp.iter_content(chunk_size=None, decode_unicode=True):
                if raw:
                    cleaned = clean_unicode(raw)
                    chunks.append(cleaned)
-                    # Parse last content chunk (before [DONE]) for timings
-                    if not stream_timings and '"timings"' in cleaned and '"predicted_n"' in cleaned:
+                    buf += cleaned
+                    # Process complete lines from buffer
+                    while "\n" in buf:
+                        line, buf = buf.split("\n", 1)
+                        line = line.strip()
+                        if line.startswith("data: ") and not stream_timings:
+                            js = line[6:].strip()
+                            if js.startswith("{") and "timings" in js and "predicted_n" in js:
                                try:
-                            json_str = cleaned.replace("data: ", "").strip()
-                            if json_str.startswith("{"):
-                                tj = json.loads(json_str).get("timings", {})
+                                    tj = json.loads(js).get("timings", {})
                                    if tj:
                                        stream_timings = tj
                                except: pass