diff --git a/router/router.py b/router/router.py index 0b7449d..85c59e8 100644 --- a/router/router.py +++ b/router/router.py @@ -324,8 +324,11 @@ def chat(): if raw: yield clean_unicode(raw) bcast() ctx_remaining = GPU_CONTEXT.get(model, 65536) - max(session_tokens, estimate_tokens(rd.get("messages",[]))) + ctx_pct = ctx_remaining / GPU_CONTEXT.get(model, 65536) * 100 + ctx_warning = "compact_urgent" if ctx_pct < 5 else ("compact_recommended" if ctx_pct < 15 else ("compact_soon" if ctx_pct < 30 else "ok")) sse_resp = Response(stream_with_context(gen()), mimetype="text/event-stream") sse_resp.headers["X-Context-Remaining"] = str(max(0, ctx_remaining)) + sse_resp.headers["X-Context-Warning"] = ctx_warning sse_resp.headers["X-Context-Model"] = model return sse_resp data = clean_response(resp.json()) @@ -334,9 +337,12 @@ def chat(): if not msg.get("content") and msg.get("reasoning_content"): msg["content"] = msg["reasoning_content"] ctx_remaining = GPU_CONTEXT.get(model, 65536) - max(session_tokens, estimate_tokens(rd.get("messages",[]))) - data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model),"context_remaining": max(0, ctx_remaining)} + ctx_pct = ctx_remaining / GPU_CONTEXT.get(model, 65536) * 100 + ctx_warning = "compact_urgent" if ctx_pct < 5 else ("compact_recommended" if ctx_pct < 15 else ("compact_soon" if ctx_pct < 30 else "ok")) + data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model),"context_remaining": max(0, ctx_remaining),"context_pct": round(ctx_pct,1),"context_warning": ctx_warning} resp = jsonify(data) resp.headers["X-Context-Remaining"] = str(max(0, ctx_remaining)) + resp.headers["X-Context-Warning"] = ctx_warning resp.headers["X-Context-Model"] = model bcast() return resp