none/vllm.sh

2 lines
229 B
Bash

# Start vLLM's OpenAI-compatible API server for the checkpoint stored under
# /model, listening on port 12345.
#
# Flags, one per line below:
#   --model                   path of the checkpoint to load
#   --served-model-name       model name passed to the server for serving
#   --tensor-parallel-size    tensor-parallel degree (4)
#   --limit-mm-per-prompt     per-prompt multimodal limit (image=2)
#   --gpu-memory-utilization  GPU memory fraction passed to vLLM (0.95)
#   --port                    TCP port for the HTTP server
#
# NOTE(review): the served model name ("Qwen2-7B-Instruct") does not match the
# checkpoint being loaded (Qwen2-VL-72B-Instruct-GPTQ-Int4). Presumably kept
# so existing clients keep working - confirm before changing it.
python -m vllm.entrypoints.openai.api_server \
  --model /model/Qwen2-VL-72B-Instruct-GPTQ-Int4 \
  --served-model-name Qwen2-7B-Instruct \
  --tensor-parallel-size 4 \
  --limit-mm-per-prompt image=2 \
  --gpu-memory-utilization 0.95 \
  --port 12345