fix: buffer SSE chunks for large streaming responses

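With Mumuni 23K-token responses, the final SSE chunk carrying the timings is split across HTTP frames. The old per-chunk check needed the entire timings JSON to arrive in a single chunk, so it missed the timings whenever that chunk was split. The stream handler now accumulates incoming data in a buffer and parses only complete, newline-terminated lines. Also restores store_perf_record, which was accidentally dropped in a prior edit.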
2026-05-29 09:45:41 +00:00
parent d53685d874
commit a3bca93d9b
1 changed file with 15 additions and 10 deletions
@@ -366,22 +366,27 @@ def chat():
        if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
        if is_stream:
-            # Buffer stream to capture timings from final SSE chunk
+            # Buffer SSE chunks, handle split lines for large responses
            chunks = []
            stream_timings = {}
            buf = ""  # accumulate partial lines
            for raw in resp.iter_content(chunk_size=None, decode_unicode=True):
                if raw:
                    cleaned = clean_unicode(raw)
                    chunks.append(cleaned)
-                    # Parse last content chunk (before [DONE]) for timings
+                    buf += cleaned
-                    if not stream_timings and '"timings"' in cleaned and '"predicted_n"' in cleaned:
+                    # Process complete lines from buffer
-                        try:
+                    while "\n" in buf:
-                            json_str = cleaned.replace("data: ", "").strip()
+                        line, buf = buf.split("\n", 1)
-                            if json_str.startswith("{"):
+                        line = line.strip()
-                                tj = json.loads(json_str).get("timings", {})
+                        if line.startswith("data: ") and not stream_timings:
-                                if tj:
+                            js = line[6:].strip()
-                                    stream_timings = tj
+                            if js.startswith("{") and "timings" in js and "predicted_n" in js:
-                        except: pass
+                                try:
                                    tj = json.loads(js).get("timings", {})
                                    if tj:
                                        stream_timings = tj
                                except: pass
            # Store perf record with real token counts from stream
            if stream_timings:
                pt = stream_timings.get("prompt_n", 0)