fix: buffer SSE chunks for large streaming responses
Mumuni 23K-token responses caused the final SSE timings chunk to be split across HTTP frames. The old per-chunk check missed the timings whenever that chunk was split. The stream handler now accumulates incoming chunks in a buffer and parses only complete lines. Also fixed: store_perf_record, which was accidentally dropped in a prior edit.
This commit is contained in:
+15
-10
@@ -366,22 +366,27 @@ def chat():
|
|||||||
|
|
||||||
if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
|
if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
|
||||||
if is_stream:
|
if is_stream:
|
||||||
# Buffer stream to capture timings from final SSE chunk
|
# Buffer SSE chunks, handle split lines for large responses
|
||||||
chunks = []
|
chunks = []
|
||||||
stream_timings = {}
|
stream_timings = {}
|
||||||
|
buf = "" # accumulate partial lines
|
||||||
for raw in resp.iter_content(chunk_size=None, decode_unicode=True):
|
for raw in resp.iter_content(chunk_size=None, decode_unicode=True):
|
||||||
if raw:
|
if raw:
|
||||||
cleaned = clean_unicode(raw)
|
cleaned = clean_unicode(raw)
|
||||||
chunks.append(cleaned)
|
chunks.append(cleaned)
|
||||||
# Parse last content chunk (before [DONE]) for timings
|
buf += cleaned
|
||||||
if not stream_timings and '"timings"' in cleaned and '"predicted_n"' in cleaned:
|
# Process complete lines from buffer
|
||||||
try:
|
while "\n" in buf:
|
||||||
json_str = cleaned.replace("data: ", "").strip()
|
line, buf = buf.split("\n", 1)
|
||||||
if json_str.startswith("{"):
|
line = line.strip()
|
||||||
tj = json.loads(json_str).get("timings", {})
|
if line.startswith("data: ") and not stream_timings:
|
||||||
if tj:
|
js = line[6:].strip()
|
||||||
stream_timings = tj
|
if js.startswith("{") and "timings" in js and "predicted_n" in js:
|
||||||
except: pass
|
try:
|
||||||
|
tj = json.loads(js).get("timings", {})
|
||||||
|
if tj:
|
||||||
|
stream_timings = tj
|
||||||
|
except: pass
|
||||||
# Store perf record with real token counts from stream
|
# Store perf record with real token counts from stream
|
||||||
if stream_timings:
|
if stream_timings:
|
||||||
pt = stream_timings.get("prompt_n", 0)
|
pt = stream_timings.get("prompt_n", 0)
|
||||||
|
|||||||
Reference in New Issue
Block a user