feat: latency vs prompt size scatter plot on dashboard

Router: new /metrics/scatter endpoint returns individual data points
(model, agent, reason, prompt_tokens, completion_tokens, inference_ms,
tokens_per_sec, stream) for scatter visualization.

Dashboard: new panel showing latency vs prompt size by model.
- Log-scale X axis (prompt tokens) with model color coding
- Dropdown to filter by individual model or view all
- Hover tooltips with details per point
- Auto-refresh every 30s

Enables direct observation of the context-length vs latency
relationship, which helps validate routing tier decisions.
This commit is contained in:
Abiba
2026-05-26 12:18:31 +00:00
parent cfb05fa501
commit f47c3f3304
2 changed files with 92 additions and 3 deletions
+30
View File
@@ -537,6 +537,36 @@ def performance():
except Exception as e:
return jsonify({"error": str(e)}), 500
@app.route("/metrics/scatter")
def scatter():
    """Return individual data points for scatter plots (prompt_tokens vs latency).

    Query parameters:
        window: look-back window in hours (default "24"); parsed with int().
        model: exact model name to keep, or "all" (default) to keep every model.

    Returns:
        A JSON body {"points": [...], "count": <len(points)>} built from the
        entries of the Redis list "perf:recent" whose "ts" is at or after the
        cutoff and whose "model" matches the filter.
        ({"error": "Redis unavailable"}, 503) when the Redis handle is falsy.
        ({"error": <message>}, 500) on any other failure, including a
        non-integer ``window`` value.
    """
    if not r:
        return jsonify({"error": "Redis unavailable"}), 503
    try:
        window_hours = int(request.args.get("window", "24"))
        model_filter = request.args.get("model", "all")
        cutoff = time.time() - (window_hours * 3600)
        points = []
        for raw in r.lrange("perf:recent", 0, -1):
            try:
                rec = json.loads(raw)
                if rec["ts"] >= cutoff and (
                    model_filter == "all" or rec["model"] == model_filter
                ):
                    points.append({
                        "model": rec["model"],
                        "agent": rec["agent"],
                        "reason": rec["reason"],
                        "prompt_tokens": int(rec.get("prompt_tokens", 0)),
                        "completion_tokens": rec.get("completion_tokens", 0),
                        "inference_ms": round(rec["inference_ms"], 1),
                        "tokens_per_sec": rec.get("tokens_per_sec", 0),
                        "stream": rec.get("stream", False),
                    })
            except Exception:
                # Best-effort: one malformed record (bad JSON, missing key,
                # wrong type) must not fail the whole response. Catching
                # Exception rather than using a bare except lets
                # KeyboardInterrupt/SystemExit propagate.
                continue
        return jsonify({"points": points, "count": len(points)})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
@app.route("/v1/models")
def models():
def _h(m): return check_gpu_health(m, sidecar_timeout=1.5, gpu_timeout=1)