feat: per-request performance tracking + /metrics/performance endpoint

router/router.py (+158 lines):
- store_perf_record(): captures queue_ms, inference_ms, prompt_tokens,
  completion_tokens, tokens_per_sec per request in Redis
- Per-model, per-reason, per-agent rolling windows (last 200-500)
- /metrics/performance?window=N endpoint with percentiles (p50/p95/p99)
  for latency, throughput, and queue time per model/reason/agent
- Queue time now surfaced in routing metadata and routes:recent
- Streaming requests tracked with estimated prompt tokens

nginx/nginx.conf:
- Added /metrics/ proxy pass to router_api

Enables model performance comparison and routing tier validation.
This commit is contained in:
Abiba
2026-05-25 16:50:45 +00:00
parent b7882b2434
commit b849cd3395
2 changed files with 171 additions and 6 deletions
+7
View File
@@ -78,6 +78,13 @@ http {
proxy_buffering off; proxy_buffering off;
} }
# Performance analytics
location /metrics/ {
proxy_pass http://router_api;
proxy_http_version 1.1;
proxy_set_header Host $host;
}
location /health { location /health {
proxy_pass http://router_api/health; proxy_pass http://router_api/health;
proxy_http_version 1.1; proxy_http_version 1.1;
+164 -6
View File
@@ -1,4 +1,4 @@
import os, json, time, logging, traceback, threading, queue import os, json, time, logging, traceback, threading, queue, statistics, math
import requests, redis import requests, redis
from flask import Flask, request, jsonify, Response, stream_with_context from flask import Flask, request, jsonify, Response, stream_with_context
@@ -122,6 +122,38 @@ def estimate_tokens(msgs):
"""Estimate token count from messages. Uses JSON length / 3.5 (closer to real tokenizer ratios for dense text).""" """Estimate token count from messages. Uses JSON length / 3.5 (closer to real tokenizer ratios for dense text)."""
return len(json.dumps(msgs, default=str)) // 3.5 return len(json.dumps(msgs, default=str)) // 3.5
def store_perf_record(model, agent, tier, reason, queue_ms, inference_ms, prompt_tokens, completion_tokens, stream):
"""Store detailed performance record in Redis for analytics."""
if not r: return
try:
total_ms = queue_ms + inference_ms
tps = completion_tokens / (inference_ms / 1000) if inference_ms > 0 and completion_tokens > 0 else 0
rec = json.dumps({
"ts": time.time(),
"model": model, "agent": agent, "tier": tier, "reason": reason,
"queue_ms": round(queue_ms, 1),
"inference_ms": round(inference_ms, 1),
"total_ms": round(total_ms, 1),
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"tokens_per_sec": round(tps, 1),
"stream": stream
})
# Global recent list (last 500)
r.lpush("perf:recent", rec)
r.ltrim("perf:recent", 0, 499)
# Per-model list (last 200)
r.lpush("perf:model:" + model, rec)
r.ltrim("perf:model:" + model, 0, 199)
# Per-reason list (last 200)
r.lpush("perf:reason:" + reason, rec)
r.ltrim("perf:reason:" + reason, 0, 199)
# Per-agent list (last 200)
r.lpush("perf:agent:" + agent, rec)
r.ltrim("perf:agent:" + agent, 0, 199)
except Exception:
pass
def is_gpu_busy(model): def is_gpu_busy(model):
"""Check if GPU is at or near max concurrent capacity.""" """Check if GPU is at or near max concurrent capacity."""
active = gpu_active_count(model) active = gpu_active_count(model)
@@ -296,9 +328,9 @@ def chat():
time.sleep(0.5) # poll every 500ms time.sleep(0.5) # poll every 500ms
d = route(rd, tier) d = route(rd, tier)
waited = time.time() - queue_start queue_ms = (time.time() - queue_start) * 1000
if waited > 0.5: if queue_ms > 500:
log.info("QUEUED: %s waited %.1fs before slot opened", agent, waited) log.info("QUEUED: %s waited %.0fms before slot opened", agent, queue_ms)
model, reason, url = d["model"], d["reason"], GPU_URLS[d["model"]] model, reason, url = d["model"], d["reason"], GPU_URLS[d["model"]]
is_stream = rd.get("stream", False) is_stream = rd.get("stream", False)
@@ -309,7 +341,7 @@ def chat():
try: try:
r.incr("routes:"+model); r.incr("routes:tier:"+tier); r.incr("routes:agent:"+agent) r.incr("routes:"+model); r.incr("routes:tier:"+tier); r.incr("routes:agent:"+agent)
r.incr("ts:"+model+":"+time.strftime("%Y%m%d%H")) r.incr("ts:"+model+":"+time.strftime("%Y%m%d%H"))
r.lpush("routes:recent", json.dumps({"ts":time.time(),"model":model,"reason":reason,"tier":tier,"agent":agent})) r.lpush("routes:recent", json.dumps({"ts":time.time(),"model":model,"reason":reason,"tier":tier,"agent":agent,"queue_ms": round(queue_ms,1)}))
r.ltrim("routes:recent",0,999) r.ltrim("routes:recent",0,999)
except Exception: pass except Exception: pass
start = time.time() start = time.time()
@@ -323,6 +355,8 @@ def chat():
def gen(): def gen():
for raw in resp.iter_content(chunk_size=None, decode_unicode=True): for raw in resp.iter_content(chunk_size=None, decode_unicode=True):
if raw: yield clean_unicode(raw) if raw: yield clean_unicode(raw)
# Streaming: can't get token counts without parsing stream, store latency + estimated tokens
store_perf_record(model, agent, tier, reason, queue_ms, lat, estimate_tokens(rd.get("messages",[])), 0, True)
bcast() bcast()
ctx_remaining = GPU_CONTEXT.get(model, 65536) - max(session_tokens, estimate_tokens(rd.get("messages",[]))) ctx_remaining = GPU_CONTEXT.get(model, 65536) - max(session_tokens, estimate_tokens(rd.get("messages",[])))
ctx_pct = ctx_remaining / GPU_CONTEXT.get(model, 65536) * 100 ctx_pct = ctx_remaining / GPU_CONTEXT.get(model, 65536) * 100
@@ -337,10 +371,17 @@ def chat():
msg = c.get("message",{}) msg = c.get("message",{})
if not msg.get("content") and msg.get("reasoning_content"): if not msg.get("content") and msg.get("reasoning_content"):
msg["content"] = msg["reasoning_content"] msg["content"] = msg["reasoning_content"]
# Extract performance data from llama.cpp response
usage = data.get("usage", {})
timings = data.get("timings", {})
prompt_tokens = usage.get("prompt_tokens", 0)
completion_tokens = usage.get("completion_tokens", 0)
inference_ms = lat # total GPU round-trip
store_perf_record(model, agent, tier, reason, queue_ms, inference_ms, prompt_tokens, completion_tokens, False)
ctx_remaining = GPU_CONTEXT.get(model, 65536) - max(session_tokens, estimate_tokens(rd.get("messages",[]))) ctx_remaining = GPU_CONTEXT.get(model, 65536) - max(session_tokens, estimate_tokens(rd.get("messages",[])))
ctx_pct = ctx_remaining / GPU_CONTEXT.get(model, 65536) * 100 ctx_pct = ctx_remaining / GPU_CONTEXT.get(model, 65536) * 100
ctx_warning = "compact_urgent" if ctx_pct < 5 else ("compact_recommended" if ctx_pct < 15 else ("compact_soon" if ctx_pct < 30 else "ok")) ctx_warning = "compact_urgent" if ctx_pct < 5 else ("compact_recommended" if ctx_pct < 15 else ("compact_soon" if ctx_pct < 30 else "ok"))
data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"active_gpu":gpu_active_count(model),"context_remaining": max(0, ctx_remaining),"context_pct": round(ctx_pct,1),"context_warning": ctx_warning} data["routing"] = {"model":model,"reason":reason,"gpu":url,"tier":tier,"agent":agent,"latency_ms":lat,"queue_ms": round(queue_ms,1),"active_gpu":gpu_active_count(model),"context_remaining": max(0, ctx_remaining),"context_pct": round(ctx_pct,1),"context_warning": ctx_warning}
resp = jsonify(data) resp = jsonify(data)
resp.headers["X-Context-Remaining"] = str(max(0, ctx_remaining)) resp.headers["X-Context-Remaining"] = str(max(0, ctx_remaining))
resp.headers["X-Context-Warning"] = ctx_warning resp.headers["X-Context-Warning"] = ctx_warning
@@ -356,6 +397,123 @@ def chat():
log.error("Error: %s\n%s", e, traceback.format_exc()) log.error("Error: %s\n%s", e, traceback.format_exc())
return jsonify({"error":str(e)}), 500 return jsonify({"error":str(e)}), 500
@app.route("/metrics/performance")
def performance():
"""Per-request performance analytics with percentiles per model/reason/agent."""
if not r: return jsonify({"error": "Redis unavailable"}), 503
try:
window_hours = int(request.args.get("window", "24"))
model_filter = request.args.get("model", "all")
# Load recent records
cutoff = time.time() - (window_hours * 3600)
raw = r.lrange("perf:recent", 0, -1)
records = []
for x in raw:
try:
rec = json.loads(x)
if rec["ts"] >= cutoff:
records.append(rec)
except: pass
# Filter by model if specified
if model_filter != "all":
records = [r for r in records if r["model"] == model_filter]
if not records:
return jsonify({"models": [], "reasons": [], "agents": [], "summary": {"total_requests": 0}})
def pct(values, p):
if not values: return 0
return round(statistics.quantiles(sorted(values), n=100, method='inclusive')[min(p-1, 98)], 1)
# Per-model stats
model_groups = {}
for rec in records:
m = rec["model"]
if m not in model_groups: model_groups[m] = []
model_groups[m].append(rec)
models = []
for m, recs in sorted(model_groups.items()):
latencies = [r["total_ms"] for r in recs]
tps_vals = [r["tokens_per_sec"] for r in recs if r["tokens_per_sec"] > 0]
non_stream = [r for r in recs if not r["stream"]]
queue_times = [r["queue_ms"] for r in non_stream]
models.append({
"model": m,
"count": len(recs),
"stream_pct": round(len([r for r in recs if r["stream"]]) / len(recs) * 100, 1),
"latency": {
"avg": round(statistics.mean(latencies), 1),
"p50": pct(latencies, 50),
"p95": pct(latencies, 95),
"p99": pct(latencies, 99)
},
"throughput": {
"avg_tokens_per_sec": round(statistics.mean(tps_vals), 1) if tps_vals else 0,
"p50": pct(tps_vals, 50) if tps_vals else 0,
"p95": pct(tps_vals, 95) if tps_vals else 0,
},
"queue": {
"avg_ms": round(statistics.mean(queue_times), 1) if queue_times else 0,
"p95_ms": pct(queue_times, 95) if queue_times else 0,
} if queue_times else None
})
# Per-reason stats
reason_groups = {}
for rec in records:
rsn = rec["reason"]
if rsn not in reason_groups: reason_groups[rsn] = []
reason_groups[rsn].append(rec)
reasons = []
for rsn, recs in sorted(reason_groups.items(), key=lambda x: -len(x[1])):
latencies = [r["total_ms"] for r in recs]
reasons.append({
"reason": rsn,
"count": len(recs),
"avg_total_ms": round(statistics.mean(latencies), 1),
"p95_total_ms": pct(latencies, 95)
})
# Per-agent stats
agent_groups = {}
for rec in records:
ag = rec["agent"]
if ag not in agent_groups: agent_groups[ag] = []
agent_groups[ag].append(rec)
agents = []
for ag, recs in sorted(agent_groups.items(), key=lambda x: -len(x[1])):
latencies = [r["total_ms"] for r in recs]
tps_vals = [r["tokens_per_sec"] for r in recs if r["tokens_per_sec"] > 0]
agents.append({
"agent": ag,
"count": len(recs),
"avg_total_ms": round(statistics.mean(latencies), 1),
"avg_tokens_per_sec": round(statistics.mean(tps_vals), 1) if tps_vals else 0
})
all_lat = [r["total_ms"] for r in records]
all_tps = [r["tokens_per_sec"] for r in records if r["tokens_per_sec"] > 0]
summary = {
"total_requests": len(records),
"window_hours": window_hours,
"latency": {
"avg_ms": round(statistics.mean(all_lat), 1),
"p50_ms": pct(all_lat, 50),
"p95_ms": pct(all_lat, 95),
"p99_ms": pct(all_lat, 99)
},
"throughput_avg_tps": round(statistics.mean(all_tps), 1) if all_tps else 0
}
return jsonify({"models": models, "reasons": reasons, "agents": agents, "summary": summary})
except Exception as e:
return jsonify({"error": str(e)}), 500
@app.route("/v1/models") @app.route("/v1/models")
def models(): def models():
def _h(m): return check_gpu_health(m, sidecar_timeout=1.5, gpu_timeout=1) def _h(m): return check_gpu_health(m, sidecar_timeout=1.5, gpu_timeout=1)