diff --git a/router/router.py b/router/router.py
index 67b49a9..a0fde97 100644
--- a/router/router.py
+++ b/router/router.py
@@ -352,11 +352,34 @@ def chat():
         
         if resp.status_code != 200: return jsonify({"error":"GPU error "+str(resp.status_code)}), 502
         if is_stream:
+            # Buffer stream to capture timings from final SSE chunk
+            chunks = []
+            stream_timings = {}
+            for raw in resp.iter_content(chunk_size=None, decode_unicode=True):
+                if raw:
+                    cleaned = clean_unicode(raw)
+                    chunks.append(cleaned)
+                    # Parse last content chunk (before [DONE]) for timings
+                    if not stream_timings and '"timings"' in cleaned and '"predicted_n"' in cleaned:
+                        try:
+                            json_str = cleaned.replace("data: ", "").strip()
+                            if json_str.startswith("{"):
+                                tj = json.loads(json_str).get("timings", {})
+                                if tj:
+                                    stream_timings = tj
+                        except: pass
+            # Store perf record with real token counts from stream
+            if stream_timings:
+                pt = stream_timings.get("prompt_n", 0)
+                ct = stream_timings.get("predicted_n", 0)
+                tps = stream_timings.get("predicted_per_second", 0)
+                gen_ms = stream_timings.get("predicted_ms", lat)
+                store_perf_record(model, agent, tier, reason, queue_ms, gen_ms, pt, ct, True)
+            else:
+                store_perf_record(model, agent, tier, reason, queue_ms, lat, estimate_tokens(rd.get("messages",[])), 0, True)
+            # Yield all chunks to client
             def gen():
-                for raw in resp.iter_content(chunk_size=None, decode_unicode=True):
-                    if raw: yield clean_unicode(raw)
-            # Streaming: can't get token counts without parsing stream, store latency + estimated tokens
-            store_perf_record(model, agent, tier, reason, queue_ms, lat, estimate_tokens(rd.get("messages",[])), 0, True)
+                for c in chunks: yield c
             bcast()
             ctx_remaining = GPU_CONTEXT.get(model, 65536) - max(session_tokens, estimate_tokens(rd.get("messages",[])))
             ctx_pct = ctx_remaining / GPU_CONTEXT.get(model, 65536) * 100