router: heavy tier Dense→MoE→Light + X-Context-Warning headers (compact_soon/compact_recommended/compact_urgent)
This commit is contained in:
+7
-1
@@ -324,8 +324,11 @@ def chat():
|
|||||||
if raw: yield clean_unicode(raw)
|
if raw: yield clean_unicode(raw)
|
||||||
bcast()
|
bcast()
|
||||||
ctx_remaining = GPU_CONTEXT.get(model, 65536) - max(session_tokens, estimate_tokens(rd.get("messages",[])))
|
ctx_remaining = GPU_CONTEXT.get(model, 65536) - max(session_tokens, estimate_tokens(rd.get("messages",[])))
|
||||||
|
ctx_pct = ctx_remaining / GPU_CONTEXT.get(model, 65536) * 100
|
||||||
|
ctx_warning = "compact_urgent" if ctx_pct < 5 else ("compact_recommended" if ctx_pct < 15 else ("compact_soon" if ctx_pct < 30 else "ok"))
|
||||||
sse_resp = Response(stream_with_context(gen()), mimetype="text/event-stream")
|
sse_resp = Response(stream_with_context(gen()), mimetype="text/event-stream")
|
||||||
sse_resp.headers["X-Context-Remaining"] = str(max(0, ctx_remaining))
|
sse_resp.headers["X-Context-Remaining"] = str(max(0, ctx_remaining))
|
||||||
|
sse_resp.headers["X-Context-Warning"] = ctx_warning
|
||||||
sse_resp.headers["X-Context-Model"] = model
|
sse_resp.headers["X-Context-Model"] = model
|
||||||
return sse_resp
|
return sse_resp
|
||||||
data = clean_response(resp.json())
|
data = clean_response(resp.json())
|
||||||
@@ -334,9 +337,12 @@ def chat():
|
|||||||
if not msg.get("content") and msg.get("reasoning_content"):
|
if not msg.get("content") and msg.get("reasoning_content"):
|
||||||
msg["content"] = msg["reasoning_content"]
|
msg["content"] = msg["reasoning_content"]
|
||||||
ctx_remaining = GPU_CONTEXT.get(model, 65536) - max(session_tokens, estimate_tokens(rd.get("messages",[])))
|
ctx_remaining = GPU_CONTEXT.get(model, 65536) - max(session_tokens, estimate_tokens(rd.get("messages",[])))
|
||||||
data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model),"context_remaining": max(0, ctx_remaining)}
|
ctx_pct = ctx_remaining / GPU_CONTEXT.get(model, 65536) * 100
|
||||||
|
ctx_warning = "compact_urgent" if ctx_pct < 5 else ("compact_recommended" if ctx_pct < 15 else ("compact_soon" if ctx_pct < 30 else "ok"))
|
||||||
|
data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model),"context_remaining": max(0, ctx_remaining),"context_pct": round(ctx_pct,1),"context_warning": ctx_warning}
|
||||||
resp = jsonify(data)
|
resp = jsonify(data)
|
||||||
resp.headers["X-Context-Remaining"] = str(max(0, ctx_remaining))
|
resp.headers["X-Context-Remaining"] = str(max(0, ctx_remaining))
|
||||||
|
resp.headers["X-Context-Warning"] = ctx_warning
|
||||||
resp.headers["X-Context-Model"] = model
|
resp.headers["X-Context-Model"] = model
|
||||||
bcast()
|
bcast()
|
||||||
return resp
|
return resp
|
||||||
|
|||||||
Reference in New Issue
Block a user