From f47c3f3304fb0989ffc27bd0b8dd18bdaf8b63f2 Mon Sep 17 00:00:00 2001 From: Abiba Date: Tue, 26 May 2026 12:18:31 +0000 Subject: [PATCH] feat: latency vs prompt size scatter plot on dashboard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Router: new /metrics/scatter endpoint returns individual data points (prompt_tokens, inference_ms, model, agent, reason, stream) for scatter visualization. Dashboard: new panel showing latency vs prompt size by model. - Log-scale X axis (prompt tokens) with model color coding - Dropdown to filter by individual model or view all - Hover tooltips with details per point - Auto-refresh every 30s Enables direct observation of context-length vs latency relationship — validates routing tier decisions. --- dashboard/dashboard.py | 65 ++++++++++++++++++++++++++++++++++++++++-- router/router.py | 30 +++++++++++++++++++ 2 files changed, 92 insertions(+), 3 deletions(-) diff --git a/dashboard/dashboard.py b/dashboard/dashboard.py index f141330..fe3c3c2 100644 --- a/dashboard/dashboard.py +++ b/dashboard/dashboard.py @@ -113,7 +113,20 @@ body { background: #0b0f17; color: #bcc3cd; font-family: -apple-system, BlinkMac
Routing Effectiveness — by Reason
Agent Performance
- + +
+ Latency vs Prompt Size — by Model +
+ +
+
+ +
Live Stream
@@ -244,7 +257,43 @@ $('perf-agents').innerHTML=aHTML; }else{$('perf-agents').innerHTML='
-
';} } function poll(){fetch('/api/state').then(function(r){return r.json()}).then(function(data){render(data);$('connection-status').textContent='live';}).catch(function(){$('connection-status').textContent='reconnecting';});} -poll();setInterval(poll,3000);loadTS();loadPerf();setInterval(loadPerf,15000); +function loadScatter(){ +var m=$('scatter-model').value; +fetch('/api/scatter?window=24&model='+m).then(function(r){return r.json()}).then(renderScatter).catch(function(){}); +} +function renderScatter(d){ +var pts=d.points||[],el=$('scatter-plot'),lg=$('scatter-legend'); +if(!pts.length){el.innerHTML='
No data yet
';return;} +var mcol={'qwen3.6-35B-A3B':'#a78bfa','qwen3.6-27B-code':'#f59e0b','qwen3.5-9b-vlm':'#22c55e','unknown':'#38bdf8'}; +var mlab={'qwen3.6-35B-A3B':'35B MoE','qwen3.6-27B-code':'27B Dense','qwen3.5-9b-vlm':'9B VLM'}; +var maxX=Math.max.apply(null,pts.map(function(p){return p.prompt_tokens||0}))||1000; +var maxY=Math.max.apply(null,pts.map(function(p){return p.inference_ms||0}))||5000; +// Log scale for X axis (prompt tokens vary widely) +var toX=function(t){return Math.log10(Math.max(t,1))/Math.log10(Math.max(maxX,10))*100;}; +var toY=function(t){return (t/maxY)*100;}; +var dots=''; +pts.forEach(function(p){ +var x=toX(p.prompt_tokens),y=toY(p.inference_ms),c=mcol[p.model]||'#38bdf8'; +var r=p.stream?1.5:2.5,o=p.stream?0.4:0.8; +dots+=''+mlab[p.model]+' | '+p.prompt_tokens+' tok | '+p.inference_ms+'ms | '+p.agent+''; +}); +// Grid lines +var grid=''; +for(var i=1;i<=4;i++){grid+='';} +for(var i=1;i<=4;i++){grid+='';} +// Axis labels +var xTicks=''; +var xVals=[10,100,1000,10000,100000]; +xVals.forEach(function(v){if(v<=maxX)xTicks+=''+(v>=1000?(v/1000)+'k':v)+'';}); +var yTicks=''; +var yVals=[500,1000,5000,10000,50000,100000]; +yVals.forEach(function(v){if(v<=maxY)yTicks+=''+(v>=1000?(v/1000)+'s':v+'ms')+'';}); +el.innerHTML=''+grid+dots+xTicks+yTicks+'Prompt Tokens (log scale)Inference Time'; +// Legend +var models=[];pts.forEach(function(p){if(models.indexOf(p.model)===-1)models.push(p.model);}); +lg.innerHTML=models.map(function(m){return''+mlab[m]+'';}).join(''); +} +poll();setInterval(poll,3000);loadTS();loadPerf();setInterval(loadPerf,15000);loadScatter();setInterval(loadScatter,30000); """ @@ -255,7 +304,17 @@ def dashboard(): return render_template_string(DASHBOARD_HTML) @app.route("/api/state") def api_state(): return fetch_state() -@app.route("/api/performance") +@app.route("/api/scatter") +def api_scatter(): + window = request.args.get("window", "24") + model = request.args.get("model", "all") + try: + r = 
requests.get("http://router:9000/metrics/scatter", params={"window": window, "model": model}, timeout=10)  # params= URL-encodes the caller-supplied values
+        if r.status_code == 200: return r.json()
+    except Exception: pass  # best-effort proxy: any router failure falls through to the empty payload
+    return {"points": [], "count": 0}
+
+@app.route("/api/performance")
 def api_performance():
     window = request.args.get("window", "24")
     model = request.args.get("model", "all")
diff --git a/router/router.py b/router/router.py
index a0fde97..169b912 100644
--- a/router/router.py
+++ b/router/router.py
@@ -537,6 +537,36 @@ def performance():
     except Exception as e:
         return jsonify({"error": str(e)}), 500
 
+@app.route("/metrics/scatter")
+def scatter():
+    """Return per-request points (prompt_tokens vs inference_ms) for scatter plots; query args: window (hours), model (name or "all")."""
+    if not r: return jsonify({"error": "Redis unavailable"}), 503
+    try:
+        window_hours = int(request.args.get("window", "24"))
+    except ValueError:
+        return jsonify({"error": "window must be an integer number of hours"}), 400  # client error, not a server fault
+    model_filter = request.args.get("model", "all")
+    try:
+        cutoff = time.time() - (window_hours * 3600)
+        points = []
+        for x in r.lrange("perf:recent", 0, -1):
+            try:
+                rec = json.loads(x)
+                if rec["ts"] >= cutoff and model_filter in ("all", rec["model"]):
+                    points.append({
+                        "model": rec["model"], "agent": rec["agent"], "reason": rec["reason"],
+                        "prompt_tokens": int(rec.get("prompt_tokens", 0)),
+                        "completion_tokens": rec.get("completion_tokens", 0),
+                        "inference_ms": round(rec["inference_ms"], 1),
+                        "tokens_per_sec": rec.get("tokens_per_sec", 0),
+                        "stream": rec.get("stream", False),
+                    })
+            except (ValueError, KeyError, TypeError, OverflowError):
+                continue  # skip a malformed record (bad JSON, missing key, wrong type) but keep the rest
+        return jsonify({"points": points, "count": len(points)})
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+
 @app.route("/v1/models")
 def models():
     def _h(m): return check_gpu_health(m, sidecar_timeout=1.5, gpu_timeout=1)
TimeAgentModelReasonTier