revert: MoE concurrency back to 2 (Dense-first routing handles thermal)
This commit is contained in:
+1
-1
@@ -19,7 +19,7 @@ GPU_URLS = {
|
|||||||
}
|
}
|
||||||
# Max concurrent requests per GPU (based on llama.cpp --parallel)
|
# Max concurrent requests per GPU (based on llama.cpp --parallel)
|
||||||
GPU_MAX_CONCURRENT = {
|
GPU_MAX_CONCURRENT = {
|
||||||
"qwen3.6-35B-A3B": 1, # 1 slot (thermal management: 94C at 2 concurrent)
|
"qwen3.6-35B-A3B": 2, # 2 slots (Dense-first routing reduces thermal load)
|
||||||
"qwen3.6-27B-code": 1, # 1 slot (24GB VRAM saturated at 256K ctx)
|
"qwen3.6-27B-code": 1, # 1 slot (24GB VRAM saturated at 256K ctx)
|
||||||
"qwen3.5-9b-vlm": 2, # 2 slots (12GB VRAM, 4GB headroom)
|
"qwen3.5-9b-vlm": 2, # 2 slots (12GB VRAM, 4GB headroom)
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user