Mumuni review action items: health checks for all containers, version pinning, 503+Retry-After on all-GPU saturation

This commit is contained in:
Abiba (pi)
2026-05-17 09:05:27 +00:00
parent 8f3b0c6647
commit 4f032b035c
2 changed files with 57 additions and 4 deletions
+20
View File
@@ -27,6 +27,11 @@ services:
- GPU_MOE_URL=http://192.168.68.15:8080/v1
- GPU_DENSE_URL=http://192.168.68.8:8080/v1
- GPU_LIGHT_URL=http://192.168.68.110:8080/v1
# Container health probe: runs python3 inside the container and requests
# http://localhost:9000/health. urlopen raises on a refused connection or an
# HTTP error status, which makes the command exit non-zero (probe failure).
healthcheck:
  test:
    - CMD
    - python3
    - "-c"
    - "import urllib.request; urllib.request.urlopen('http://localhost:9000/health')"
  # Probe every 15s, allow 5s per attempt, and report unhealthy after
  # 3 consecutive failures.
  interval: 15s
  timeout: 5s
  retries: 3
depends_on:
redis:
condition: service_healthy
@@ -42,6 +47,11 @@ services:
- ./litellm_config.yaml:/app/config.yaml
environment:
- LITELLM_MASTER_KEY=sk-syslog-local-master-key
# Liveness probe for the LiteLLM proxy. It targets the unauthenticated
# /health/liveliness endpoint rather than /health: with LITELLM_MASTER_KEY set,
# /health expects a bearer token (an anonymous request is rejected, urlopen
# raises, and the container is reported unhealthy) and it also issues real
# requests to every configured model on each probe.
# NOTE(review): port 9000 is identical to the probe of another service in this
# file; LiteLLM listens on 4000 by default - confirm this service's port.
healthcheck:
  test:
    - CMD
    - python3
    - "-c"
    - "import urllib.request; urllib.request.urlopen('http://localhost:9000/health/liveliness')"
  # Probe every 15s, allow 5s per attempt, and report unhealthy after
  # 3 consecutive failures.
  interval: 15s
  timeout: 5s
  retries: 3
depends_on:
redis:
condition: service_healthy
@@ -54,6 +64,11 @@ services:
- "80:80"
volumes:
- ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
# Container health probe: curl requests http://127.0.0.1/health from inside
# the container; -f makes curl exit non-zero on an HTTP error status.
# NOTE(review): presumably nginx.conf defines a /health location - confirm.
healthcheck:
  test:
    - CMD
    - curl
    - "-f"
    - "http://127.0.0.1/health"
  # Probe every 15s, allow 5s per attempt, and report unhealthy after
  # 3 consecutive failures.
  interval: 15s
  timeout: 5s
  retries: 3
depends_on:
- litellm
- dashboard
@@ -67,6 +82,11 @@ services:
environment:
- REDIS_URL=redis://redis:6379
- GPU_SIDECARS=192.168.68.15:8090,192.168.68.8:8090,192.168.68.110:8090
# Container health probe: runs python3 inside the container and requests
# http://localhost:3000/health. urlopen raises on a refused connection or an
# HTTP error status, which makes the command exit non-zero (probe failure).
healthcheck:
  test:
    - CMD
    - python3
    - "-c"
    - "import urllib.request; urllib.request.urlopen('http://localhost:3000/health')"
  # Probe every 15s, allow 5s per attempt, and report unhealthy after
  # 3 consecutive failures.
  interval: 15s
  timeout: 5s
  retries: 3
depends_on:
- redis