# syslog-harness/gpu-dashboard/gpu_collector.py

#!/usr/bin/env python3
"""GPU metrics collector — polls sidecars + llama.cpp every 10s, writes to Workspace."""

import urllib.request, json, time, os

# Machines to poll. Each entry carries the display name, the address used for
# both the sidecar and the llama.cpp server, a human-readable GPU label, and
# the port the llama.cpp server listens on.
HOSTS = [
    {
        "name": "amdpve",
        "host": "192.168.68.15",
        "gpu": "AMD Strix Halo",
        "llama_port": 8080,
    },
    {
        "name": "llmgpu",
        "host": "192.168.68.8",
        "gpu": "RTX 3090",
        "llama_port": 8080,
    },
    {
        "name": "ocu-llm",
        "host": "192.168.68.110",
        "gpu": "RTX 5070",
        "llama_port": 8080,
    },
]

# Path of the JSON snapshot rewritten after every polling round.
OUTPUT = "/app/public/gpu_metrics.json"

# Seconds between the start of one polling round and the start of the next.
INTERVAL = 10

# Seconds since the last successful sidecar reply before a host is flagged stale.
STALE_THRESHOLD = 30

# Seconds since the last successful sidecar reply before a host is treated as
# unreachable.
DEAD_THRESHOLD = 60

# Host name -> time.time() value of the latest poll whose sidecar request succeeded.
last_seen = {}


def fetch_json(url, timeout=3):
    """Fetch *url* and return its body parsed as JSON, or None on any failure.

    Args:
        url: URL to request.
        timeout: Timeout in seconds handed to ``urllib.request.urlopen``.

    Returns:
        The decoded JSON value, or ``None`` when building the request,
        opening it, reading it, decoding it or parsing it raised.
    """
    try:
        req = urllib.request.Request(url)
        # The context manager closes the response even when read(), decode()
        # or json.loads() raises, so failed polls do not leak open sockets.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode())
    except Exception:
        # Deliberately best-effort: callers treat None as "no answer", so any
        # failure (bad URL, network error, timeout, invalid JSON) maps to None.
        return None


def _inference_state(health, slots):
    """Map llama.cpp /health and /slots replies to "idle", "busy" or "unknown"."""
    state = "unknown"
    if isinstance(health, dict) and health.get("status", "") == "ok":
        # A healthy server is idle unless it reports slots being processed.
        state = "busy" if health.get("slots_processing", 0) else "idle"

    # /slots gives per-slot detail; any processing slot means the server is
    # busy, not only the first one.
    if isinstance(slots, list) and any(
        isinstance(slot, dict) and slot.get("is_processing") for slot in slots
    ):
        state = "busy"
    return state


def collect_one(h):
    """Collect GPU hardware + llama.cpp inference state for one host.

    Args:
        h: Host entry with the keys "name", "host", "gpu" and "llama_port".

    Returns:
        A dict with the keys "host", "gpu_name", "inference" (containing
        "state" and "model"), "hardware", "online" and "timestamp". The key
        "stale" is added (set to True) when the last successful sidecar reply
        is older than STALE_THRESHOLD but not older than DEAD_THRESHOLD.

    Side effects:
        Records the poll time in ``last_seen`` when the sidecar answered.
    """
    name = h["name"]
    host = h["host"]
    llama_base = f"http://{host}:{h['llama_port']}"
    now = time.time()

    # GPU hardware from sidecar
    gpu = fetch_json(f"http://{host}:8090/")

    # llama.cpp inference state
    llamacpp_health = fetch_json(f"{llama_base}/health")
    llamacpp_models = fetch_json(f"{llama_base}/v1/models")
    slots = fetch_json(f"{llama_base}/slots")

    # The first listed model is reported; replies of an unexpected JSON shape
    # are ignored rather than allowed to raise and stop the polling loop.
    model_name = None
    if isinstance(llamacpp_models, dict):
        models = llamacpp_models.get("data", [])
        if isinstance(models, list) and models and isinstance(models[0], dict):
            model_name = models[0].get("id")

    result = {
        "host": name,
        "gpu_name": h["gpu"],
        "inference": {
            "state": _inference_state(llamacpp_health, slots),
            "model": model_name,
        },
        "hardware": gpu if gpu else None,
        "online": gpu is not None,
        "timestamp": now,
    }

    if gpu is not None:
        last_seen[name] = now

    # After a successful poll the age is 0, so the branches below only fire
    # when this poll failed but an earlier one had succeeded.
    if name in last_seen:
        age = now - last_seen[name]
        if age > DEAD_THRESHOLD:
            result["online"] = False
        elif age > STALE_THRESHOLD:
            result["stale"] = True

    return result


def main():
    """Poll every host in HOSTS forever and rewrite OUTPUT after each round.

    Each round collects all hosts, writes the payload to a temporary file next
    to OUTPUT and then moves it into place, so readers never see a partially
    written file. The loop sleeps for whatever remains of INTERVAL after the
    round; it never returns.
    """
    # flush=True so the message shows up immediately when stdout is a pipe;
    # the loop below never exits, so a buffered message might never appear.
    print(f"GPU collector starting, output={OUTPUT}, interval={INTERVAL}s", flush=True)

    # os.makedirs("") raises, so only create the directory when OUTPUT has one.
    out_dir = os.path.dirname(OUTPUT)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    tmp_path = OUTPUT + ".tmp"

    while True:
        start = time.time()
        results = [collect_one(h) for h in HOSTS]

        payload = {
            "updated": start,
            "gpus": results,
        }

        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(payload, f)
        # os.replace overwrites an existing destination on every platform,
        # whereas os.rename fails on Windows when the target already exists.
        os.replace(tmp_path, OUTPUT)

        # Keep the round period at INTERVAL; skip the sleep if polling overran.
        elapsed = time.time() - start
        time.sleep(max(0, INTERVAL - elapsed))


# Start the collector loop only when run as a script, not when imported.
if __name__ == "__main__":
    main()