This guide covers running and trying out Red Hat AI Inference Server to serve the Mistral Voxtral-Mini-4B-Realtime-2602 model, powered by vLLM.
You can find the Voxtral Mini model card @ https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602
From the model card:
Voxtral Mini 4B Realtime 2602 is a multilingual, realtime speech-transcription model and among the first open-source solutions to achieve accuracy comparable to offline systems with a delay of <500ms. It supports 13 languages and outperforms existing open-source baselines across a range of tasks, making it ideal for applications like voice assistants and live subtitling.
Prerequisites:

- NVIDIA GPU with 16GB+ VRAM
- Podman
- Python 3.9+ with websockets, librosa, and numpy installed
Grab a sample audio file to transcribe:

curl -o CSX_Wikipedia_Detector_demo.wav \
  https://upload.wikimedia.org/wikipedia/commons/5/5a/CSX_Wikipedia_Detector_demo.wav

(It's some sample audio on Wikimedia Commons for a train line's defect detection system.)
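If you want to sanity-check the download, librosa (installed below for the test client) can report its duration and native sample rate; the client resamples to 16 kHz regardless:

import librosa

# sr=None keeps the file's native sample rate instead of resampling
audio, sr = librosa.load("CSX_Wikipedia_Detector_demo.wav", sr=None, mono=True)
print(f"{len(audio) / sr:.1f} s at {sr} Hz")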
Pre-download the model weights (the container runs with HF_HUB_OFFLINE=1, so they must already be in your Hugging Face cache):

hf download mistralai/Voxtral-Mini-4B-Realtime-2602
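To confirm the weights actually landed in your cache, a quick check with huggingface_hub:

from huggingface_hub import snapshot_download

# local_files_only=True raises an error if the model isn't already cached
path = snapshot_download(
    "mistralai/Voxtral-Mini-4B-Realtime-2602",
    local_files_only=True,
)
print(path)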
The container image is available @ registry.redhat.io/rhaiis-preview/vllm-cuda-rhel9:voxtral-realtime
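If you haven't pulled it before, note that registry.redhat.io requires logging in with your Red Hat account:

podman login registry.redhat.io
podman pull registry.redhat.io/rhaiis-preview/vllm-cuda-rhel9:voxtral-realtime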
Run:
podman run --rm \
--device nvidia.com/gpu=0 \
--security-opt=label=disable \
--shm-size=4g \
-p 8000:8000 \
-v ~/.cache/huggingface:/hf:Z \
-e HF_HUB_OFFLINE=1 \
-e VLLM_DISABLE_COMPILE_CACHE=1 \
-e HF_HOME=/hf \
registry.redhat.io/rhaiis-preview/vllm-cuda-rhel9:voxtral-realtime \
--model mistralai/Voxtral-Mini-4B-Realtime-2602 \
--tokenizer-mode mistral \
--config-format mistral \
--load-format mistral \
--trust-remote-code \
--compilation-config '{"cudagraph_mode":"PIECEWISE"}' \
--tensor-parallel-size 1 \
--max-model-len 45000 \
--max-num-batched-tokens 8192 \
--max-num-seqs 16 \
--gpu-memory-utilization 0.90 \
--host 0.0.0.0 --port 8000

NOTE: You may want to customize the --device used, as well as the mounts for your HuggingFace cache.
Tested using an NVIDIA A100.
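Before wiring up the WebSocket client, you can confirm the server came up cleanly. vLLM's OpenAI-compatible server exposes /health and /v1/models endpoints; here's a minimal check, assuming the port mapping above (8000 on localhost):

import json
import urllib.request

BASE = "http://127.0.0.1:8000"

# /health returns HTTP 200 once the model has finished loading
with urllib.request.urlopen(f"{BASE}/health") as resp:
    print("health:", resp.status)

# /v1/models lists the model ID the server is serving
with urllib.request.urlopen(f"{BASE}/v1/models") as resp:
    models = json.load(resp)
    print("models:", [m["id"] for m in models["data"]])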
- WebSocket Required: Must use the /v1/realtime endpoint with audio streaming
Install the client dependencies:

pip install websockets librosa numpy

Save this as realtime_test.py:
#!/usr/bin/env python3
"""
Simplified realtime client for testing Voxtral.
"""
import argparse
import asyncio
import base64
import json
import librosa
import numpy as np
import websockets
def audio_to_pcm16_base64(audio_path: str) -> str:
"""Load an audio file and convert it to base64-encoded PCM16 @ 16kHz."""
audio, _ = librosa.load(audio_path, sr=16000, mono=True)
    # Clip to [-1, 1] before scaling so resampled peaks can't overflow int16
    pcm16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
return base64.b64encode(pcm16.tobytes()).decode("utf-8")
async def realtime_transcribe(audio_path: str, host: str, port: int, model: str):
"""Connect to the Realtime API and transcribe an audio file."""
uri = f"ws://{host}:{port}/v1/realtime"
print(f"Connecting to {uri}...")
async with websockets.connect(uri) as ws:
# Wait for session.created
response = json.loads(await ws.recv())
if response["type"] == "session.created":
print(f"Session created: {response['id']}")
else:
print(f"Unexpected response: {response}")
return
# Validate model
await ws.send(json.dumps({"type": "session.update", "model": model}))
# Signal ready to start
await ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
# Convert audio file to base64 PCM16
print(f"Loading audio from: {audio_path}")
audio_base64 = audio_to_pcm16_base64(audio_path)
# Send audio in chunks
chunk_size = 4096
audio_bytes = base64.b64decode(audio_base64)
total_chunks = (len(audio_bytes) + chunk_size - 1) // chunk_size
print(f"Sending {total_chunks} audio chunks...")
for i in range(0, len(audio_bytes), chunk_size):
chunk = audio_bytes[i : i + chunk_size]
await ws.send(
json.dumps(
{
"type": "input_audio_buffer.append",
"audio": base64.b64encode(chunk).decode("utf-8"),
}
)
)
# Signal all audio is sent
await ws.send(json.dumps({"type": "input_audio_buffer.commit", "final": True}))
print("Audio sent. Waiting for transcription...\n")
# Receive transcription
print("Transcription: ", end="", flush=True)
while True:
response = json.loads(await ws.recv())
if response["type"] == "transcription.delta":
print(response["delta"], end="", flush=True)
elif response["type"] == "transcription.done":
print(f"\n\nFinal transcription: {response['text']}")
if response.get("usage"):
print(f"Usage: {response['usage']}")
break
elif response["type"] == "error":
print(f"\nError: {response['error']}")
break
def main():
parser = argparse.ArgumentParser(description="Realtime WebSocket Transcription Client")
parser.add_argument("--model", type=str, default="mistralai/Voxtral-Mini-4B-Realtime-2602")
parser.add_argument("--audio_path", type=str, required=True)
parser.add_argument("--host", type=str, default="127.0.0.1")
parser.add_argument("--port", type=int, default=8000)
args = parser.parse_args()
asyncio.run(realtime_transcribe(args.audio_path, args.host, args.port, args.model))
if __name__ == "__main__":
    main()

Run it against the sample audio:

python3 realtime_test.py --audio_path CSX_Wikipedia_Detector_demo.wav --host 127.0.0.1 --port 8000

You should see output like:

Connecting to ws://127.0.0.1:8000/v1/realtime...
Session created: sess-bb5031bf3b8b2daa
Loading audio from: /home/dougbtv/CSX_Wikipedia_Detector_demo.wav
Sending 167 audio chunks...
Audio sent. Waiting for transcription...
Transcription: CSX Equipment Defect Detector Milepost 700.1 No Defects No Defects Total Axle 738 Train Length 13764 Speed 45. End of
Final transcription: CSX Equipment Defect Detector Milepost 700.1 No Defects No Defects Total Axle 738 Train Length 13764 Speed 45. End of
Usage: {'prompt_tokens': 39, 'total_tokens': 298, 'completion_tokens': 259, 'prompt_tokens_details': None}
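As a quick consistency check on those numbers: 167 chunks of 4096-byte PCM16 at 16 kHz works out to roughly 21 seconds of audio.

chunk_size = 4096              # bytes per input_audio_buffer.append message
bytes_sent = 167 * chunk_size  # upper bound; the final chunk may be shorter
bytes_per_second = 16000 * 2   # 16 kHz mono PCM16 = 2 bytes per sample
print(f"~{bytes_sent / bytes_per_second:.1f} s of audio")  # ~21.4 s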