#!/usr/bin/env python3
"""GPU metrics collector — polls sidecars + llama.cpp every 10s, writes to Workspace."""

import json
import os
import time
import urllib.request

HOSTS = [
    {"name": "amdpve", "host": "192.168.68.15", "gpu": "AMD Strix Halo", "llama_port": 8080},
    {"name": "llmgpu", "host": "192.168.68.8", "gpu": "RTX 3090", "llama_port": 8080},
    {"name": "ocu-llm", "host": "192.168.68.110", "gpu": "RTX 5070", "llama_port": 8080},
]

OUTPUT = "/app/public/gpu_metrics.json"
INTERVAL = 10
STALE_THRESHOLD = 30  # seconds before marking stale
DEAD_THRESHOLD = 60  # seconds before marking unreachable
SIDECAR_PORT = 8090  # port of the per-host GPU hardware sidecar

# Host name -> wall-clock time of the last successful sidecar poll.
last_seen = {}


def fetch_json(url, timeout=3):
    """Fetch *url* and return its body parsed as JSON, or None on any failure.

    Args:
        url: URL to request.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The decoded JSON value, or None if the request, the UTF-8 decode or
        the JSON parse fails for any reason.
    """
    try:
        req = urllib.request.Request(url)
        # The context manager closes the response even if read/parse raises.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode())
    except Exception:
        # Deliberate best-effort boundary: callers treat None as "no data".
        return None


def _first_model_id(models_payload):
    """Return the ``id`` of the first entry under ``data``, or None."""
    if not isinstance(models_payload, dict):
        return None
    entries = models_payload.get("data")
    if not isinstance(entries, list) or not entries:
        return None
    first = entries[0]
    return first.get("id") if isinstance(first, dict) else None


def _any_slot_processing(slots_payload):
    """Return True if any entry of a /slots list has a truthy ``is_processing``."""
    if not isinstance(slots_payload, list):
        return False
    return any(
        isinstance(slot, dict) and slot.get("is_processing")
        for slot in slots_payload
    )


def collect_one(h):
    """Collect GPU hardware + llama.cpp inference state for one host.

    Args:
        h: Host entry with the keys ``name``, ``host``, ``gpu`` and
            ``llama_port`` (see ``HOSTS``).

    Returns:
        A dict with ``host``, ``gpu_name``, ``inference`` (``state`` and
        ``model``), ``hardware``, ``online`` and ``timestamp``. ``stale`` is
        added (as True) only when the sidecar last answered more than
        STALE_THRESHOLD but at most DEAD_THRESHOLD seconds ago.

    Side effects:
        Records the poll time in the module-level ``last_seen`` whenever the
        sidecar answers.
    """
    name = h["name"]
    host = h["host"]
    llama_base = f"http://{host}:{h['llama_port']}"
    now = time.time()

    # GPU hardware from sidecar
    gpu = fetch_json(f"http://{host}:{SIDECAR_PORT}/")

    # llama.cpp inference state
    llamacpp_health = fetch_json(f"{llama_base}/health")
    llamacpp_models = fetch_json(f"{llama_base}/v1/models")

    model_name = _first_model_id(llamacpp_models)

    inference_state = "unknown"
    if isinstance(llamacpp_health, dict) and llamacpp_health.get("status", "") == "ok":
        # A healthy server is idle unless it reports slots being processed.
        inference_state = "busy" if llamacpp_health.get("slots_processing", 0) else "idle"
        # /slots gives per-slot detail; any processing slot means busy,
        # not just the first one.
        if _any_slot_processing(fetch_json(f"{llama_base}/slots")):
            inference_state = "busy"

    result = {
        "host": name,
        "gpu_name": h["gpu"],
        "inference": {
            "state": inference_state,
            "model": model_name,
        },
        "hardware": gpu or None,
        "online": gpu is not None,
        "timestamp": now,
    }

    if gpu is not None:
        last_seen[name] = now

    if name in last_seen:
        age = now - last_seen[name]
        if age > DEAD_THRESHOLD:
            # age is only non-zero when this poll failed, so this re-asserts
            # the offline state already set above.
            result["online"] = False
        elif age > STALE_THRESHOLD:
            result["stale"] = True

    return result


def _write_json_atomically(path, payload):
    """Write *payload* as JSON to ``path + ".tmp"`` and move it over *path*."""
    tmp_path = path + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(payload, f)
    # os.replace overwrites an existing destination on every platform,
    # whereas os.rename fails on Windows if the destination exists.
    os.replace(tmp_path, path)


def main():
    """Poll every host in HOSTS forever, rewriting OUTPUT once per INTERVAL."""
    print(f"GPU collector starting, output={OUTPUT}, interval={INTERVAL}s")
    out_dir = os.path.dirname(OUTPUT)
    if out_dir:  # os.makedirs("") raises, so skip it for bare filenames
        os.makedirs(out_dir, exist_ok=True)

    while True:
        start = time.time()
        # Monotonic clock for pacing so wall-clock jumps do not distort sleeps.
        cycle_start = time.monotonic()

        payload = {
            "updated": start,
            "gpus": [collect_one(h) for h in HOSTS],
        }
        try:
            _write_json_atomically(OUTPUT, payload)
        except OSError as exc:
            # Keep polling: one failed write should not stop the collector.
            print(f"GPU collector: failed to write {OUTPUT}: {exc}")

        elapsed = time.monotonic() - cycle_start
        time.sleep(max(0, INTERVAL - elapsed))


if __name__ == "__main__":
    main()