fix: buffer SSE chunks for large streaming responses
With Mumuni's 23K-token responses, the final SSE chunk carrying the timings was split across HTTP frames. The old per-chunk check missed the timings whenever that chunk was split. The stream handler now accumulates incoming data in a buffer and parses only complete lines. Also fixed: store_perf_record had been accidentally dropped in a prior edit.
This commit is contained in:
+11
-6
@@ -366,19 +366,24 @@ def chat():
|
||||
|
||||
if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
|
||||
if is_stream:
|
||||
# Buffer stream to capture timings from final SSE chunk
|
||||
# Buffer SSE chunks, handle split lines for large responses
|
||||
chunks = []
|
||||
stream_timings = {}
|
||||
buf = "" # accumulate partial lines
|
||||
for raw in resp.iter_content(chunk_size=None, decode_unicode=True):
|
||||
if raw:
|
||||
cleaned = clean_unicode(raw)
|
||||
chunks.append(cleaned)
|
||||
# Parse last content chunk (before [DONE]) for timings
|
||||
if not stream_timings and '"timings"' in cleaned and '"predicted_n"' in cleaned:
|
||||
buf += cleaned
|
||||
# Process complete lines from buffer
|
||||
while "\n" in buf:
|
||||
line, buf = buf.split("\n", 1)
|
||||
line = line.strip()
|
||||
if line.startswith("data: ") and not stream_timings:
|
||||
js = line[6:].strip()
|
||||
if js.startswith("{") and "timings" in js and "predicted_n" in js:
|
||||
try:
|
||||
json_str = cleaned.replace("data: ", "").strip()
|
||||
if json_str.startswith("{"):
|
||||
tj = json.loads(json_str).get("timings", {})
|
||||
tj = json.loads(js).get("timings", {})
|
||||
if tj:
|
||||
stream_timings = tj
|
||||
except: pass
|
||||
|
||||
Reference in New Issue
Block a user