license: apache-2.0
Install vLLM
Use private vLLM branch: https://github.com/mistralai/vllm-private/compare/main...add_ml3_v4
You can install it by using the public vLLM docker image:
ghcr.io/mistralai/vllm/vllm-openai:latest
and then installing vLLM from the branch checkout with:
cd vllm
VLLM_USE_PRECOMPILED=1 pip install --editable .
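For reference, a minimal sketch of the full flow; the clone URL and the add_ml3_v4 branch name are inferred from the compare link above, and the commands assume you have access to the private repository (run them inside the container or your own environment):
# assumes access to the private mistralai/vllm-private repository
git clone https://github.com/mistralai/vllm-private.git vllm
cd vllm
git checkout add_ml3_v4
VLLM_USE_PRECOMPILED=1 pip install --editable .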
Launch
We tested ML3 by launching it on two 8xH200 nodes in parallel.
To start the model, connect to two 8xH200 nodes (connected via InfiniBand) and make sure to launch Ray on each node. On one node you should start Ray as the "head", on the other as a "worker".
1.) Start Ray
As soon as everything is installed, make sure to start Ray on all GPU nodes.
Important: Make sure that on each node the command line has access to a python command. If it doesn't exist, symlink or alias python3 to python.
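For example, one possible way to do this (a sketch, assuming python3 is on the PATH and you can write to /usr/local/bin):
# make `python` point to python3
ln -s "$(command -v python3)" /usr/local/bin/python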
Then save the following code in a "start_ray.py" script and execute it as follows:
python3 start_ray.py --is_head --address 172.17.199.135 --ray_port 6379 --num_gpus 8 --num_cpus 20 --nnodes 2
on the "HEAD" node with the correct IP address (you can retrieve it via hostname -I).
On the worker nodes, execute exactly the same command, just without --is_head and --nnodes.
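For example, on each worker node:
python3 start_ray.py --address 172.17.199.135 --ray_port 6379 --num_gpus 8 --num_cpus 20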
import argparse
import os
from typing import Union


def generate_head_script(
    address: str, ray_port: int, num_cpus: str, num_gpus: str, nnodes: int
) -> str:
    num_cpus_str = str(num_cpus) if num_cpus.isdigit() else num_cpus
    num_gpus_str = str(num_gpus) if num_gpus.isdigit() else num_gpus
    script = f"""# Spawning Ray cluster (head node)
echo "Ray: Starting HEAD at $(hostname)..."
export RAY_memory_monitor_refresh_ms=0

ray start \\
    --head \\
    --node-ip-address={address} \\
    --port={ray_port} \\
    --num-cpus {num_cpus_str} \\
    --num-gpus {num_gpus_str}

# Ray cluster needs to be initialized before spawning workers
echo "Waiting for {nnodes} worker nodes to connect..."

START_TIME=$(date +%s)
TIMEOUT=120  # seconds
INTERVAL=1

while :; do
    # Count alive nodes
    WORKER_COUNT=$(python -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))')

    if [ "$WORKER_COUNT" -ge "{nnodes}" ]; then
        echo "Ray: ✅ Found all ($WORKER_COUNT) nodes."
        break
    fi

    NOW=$(date +%s)
    ELAPSED=$(( NOW - START_TIME ))
    if [ "$ELAPSED" -ge "$TIMEOUT" ]; then
        echo "Ray: ❌ Timeout after $TIMEOUT seconds: not enough workers joined."
        exit 1
    fi

    echo "⏳ Still waiting... ($WORKER_COUNT found)"
    sleep "$INTERVAL"
done
"""
    return script


def get_start_ray_worker_cmd(
    main_address: str,
    ray_port: int,
    num_cpus: Union[int, str],
    num_gpus: Union[int, str],
) -> str:
    num_cpus_str = str(num_cpus) if isinstance(num_cpus, int) else num_cpus
    num_gpus_str = str(num_gpus) if isinstance(num_gpus, int) else num_gpus
    return f"""echo "Ray: Starting WORKER at $(hostname)..."
export RAY_memory_monitor_refresh_ms=0

ray start \\
    --address {main_address}:{ray_port} \\
    --num-cpus {num_cpus_str} \\
    --num-gpus {num_gpus_str} \\
    --block
"""


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Generate Ray cluster scripts for head or worker nodes."
    )
    parser.add_argument(
        "--is_head",
        action="store_true",
        help="Generate script for the head node (default: worker).",
    )
    parser.add_argument(
        "--address", type=str, required=True, help="IP address of the head node."
    )
    parser.add_argument(
        "--ray_port", type=int, required=True, help="Port for the Ray cluster."
    )
    parser.add_argument(
        "--num_cpus",
        type=str,
        required=True,
        help="Number of CPUs to allocate (e.g., '4' or 'auto').",
    )
    parser.add_argument(
        "--num_gpus",
        type=str,
        required=True,
        help="Number of GPUs to allocate (e.g., '1' or 'auto').",
    )
    parser.add_argument(
        "--nnodes",
        type=int,
        help="Total number of nodes to wait for (only for head node).",
    )
    args = parser.parse_args()
    print(f"Ray: Args: {args}")

    if args.is_head:
        script = generate_head_script(
            address=args.address,
            ray_port=args.ray_port,
            num_cpus=args.num_cpus,
            num_gpus=args.num_gpus,
            nnodes=args.nnodes,
        )
    else:
        assert args.nnodes is None, "nnodes is not used for worker nodes"
        script = get_start_ray_worker_cmd(
            main_address=args.address,
            ray_port=args.ray_port,
            num_cpus=args.num_cpus,
            num_gpus=args.num_gpus,
        )

    os.system(script)


if __name__ == "__main__":
    main()
Run this script on both machines.
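Optionally, before launching vLLM you can check from the head node that both nodes have joined the cluster:
# should list both nodes and all 16 GPUs
ray status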
2.) Once you've started Ray, you can launch vLLM on the HEAD node only:
vllm serve mistralai/mistral-large-3 \
--tokenizer_mode mistral --config_format mistral \
--load_format mistral --tool-call-parser mistral \
--enable-auto-tool-choice \
--limit-mm-per-prompt '{"image":10}' \
--tensor-parallel-size 16 \
--max_model_len 65536 \
--max_num_seqs 128 \
--enforce_eager
Loading the checkpoint will take a while. Once the model is loaded, you can ping it, for example with the code below.
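To check when the server is up, you can poll vLLM's OpenAI-compatible /v1/models endpoint (a sketch, assuming the default port 8000):
curl http://<your-server-url>:8000/v1/models
Once this returns the model list instead of a connection error, the server is ready to accept requests.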
Client
import requests
import json

url = "http://<your-server-url>:8000/v1/chat/completions"
headers = {"Content-Type": "application/json", "Authorization": "Bearer token"}

model = "mistralai/mistral-large-3"
image_url = "https://huggingface.co/datasets/patrickvonplaten/random_img/resolve/main/europe.png"

messages = [
    {
        "role": "user",
        "content": "Without browsing the web, how many days ago was Mistral founded?"
    },
]
# Alternative: multimodal request with an image (uncomment to use instead):
# messages = [
#     {
#         "role": "user",
#         "content": [
#             {
#                 "type": "text",
#                 "text": "Which of the depicted countries has the best food? Which the second and third and fourth? Name the country, its color on the map and one of its cities that is visible on the map, but is not the capital. Make absolutely sure to only name a city that can be seen on the map.",
#             },
#             {"type": "image_url", "image_url": {"url": image_url}},
#         ],
#     },
# ]

# max_tokens belongs in the request payload
data = {"model": model, "messages": messages, "max_tokens": 16}

response = requests.post(url, headers=headers, data=json.dumps(data))
print(response.json()["choices"][0]["message"]["content"])
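Equivalently, you can send the same request with curl (same endpoint, headers, and payload as in the Python client above):
curl http://<your-server-url>:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer token" \
  -d '{"model": "mistralai/mistral-large-3", "messages": [{"role": "user", "content": "Without browsing the web, how many days ago was Mistral founded?"}], "max_tokens": 16}'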