diff --git a/router/router.py b/router/router.py index 67b49a9..a0fde97 100644 --- a/router/router.py +++ b/router/router.py @@ -352,11 +352,34 @@ def chat(): if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502 if is_stream: + # Buffer stream to capture timings from final SSE chunk + chunks = [] + stream_timings = {} + for raw in resp.iter_content(chunk_size=None, decode_unicode=True): + if raw: + cleaned = clean_unicode(raw) + chunks.append(cleaned) + # Parse last content chunk (before [DONE]) for timings + if not stream_timings and '"timings"' in cleaned and '"predicted_n"' in cleaned: + try: + json_str = cleaned.replace("data: ", "").strip() + if json_str.startswith("{"): + tj = json.loads(json_str).get("timings", {}) + if tj: + stream_timings = tj + except: pass + # Store perf record with real token counts from stream + if stream_timings: + pt = stream_timings.get("prompt_n", 0) + ct = stream_timings.get("predicted_n", 0) + tps = stream_timings.get("predicted_per_second", 0) + gen_ms = stream_timings.get("predicted_ms", lat) + store_perf_record(model, agent, tier, reason, queue_ms, gen_ms, pt, ct, True) + else: + store_perf_record(model, agent, tier, reason, queue_ms, lat, estimate_tokens(rd.get("messages",[])), 0, True) + # Yield all chunks to client def gen(): - for raw in resp.iter_content(chunk_size=None, decode_unicode=True): - if raw: yield clean_unicode(raw) - # Streaming: can't get token counts without parsing stream, store latency + estimated tokens - store_perf_record(model, agent, tier, reason, queue_ms, lat, estimate_tokens(rd.get("messages",[])), 0, True) + for c in chunks: yield c bcast() ctx_remaining = GPU_CONTEXT.get(model, 65536) - max(session_tokens, estimate_tokens(rd.get("messages",[]))) ctx_pct = ctx_remaining / GPU_CONTEXT.get(model, 65536) * 100