@alexjyong
Created December 9, 2025 03:44
streamlined_qwen.sh
#!/bin/bash
set -e
# === Configuration ===
CONTAINER_NAME="vllm"
PORT=8000
MODEL_FOLDER="/root/.cache/huggingface/hub"
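# NOTE (assumption): a container named "$CONTAINER_NAME" with GPU access and vLLM installed
# must already be running (the script launches the server inside it via `docker exec` in Step 10),
# and the HuggingFace cache is expected to be visible at the same path inside that container
# so downloaded weights are reused. Adjust the values above to match your setup.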
# === Step 1: Setup Conda Environment ===
echo "[INFO] Setting up environment..."
if ! conda env list | grep -qE "^inference[[:space:]]"; then
  conda create -n inference python=3.12 -y
fi
eval "$(conda shell.bash hook)"
conda activate inference
# === Step 2: Install Dependencies ===
if ! python3 -c "import huggingface_hub" &>/dev/null; then
  pip install -q -U huggingface_hub
fi
mkdir -p "$MODEL_FOLDER"
# === Step 3: HuggingFace Authentication ===
python3 << 'PYEOF'
from huggingface_hub import HfApi, login
import getpass, os, sys

try:
    HfApi().whoami()
    print("[INFO] Already authenticated")
except Exception:
    # stdin is this heredoc, so prompt via getpass (it reads from the controlling tty)
    token = os.environ.get('HUGGINGFACE_TOKEN') or getpass.getpass("[INPUT] HuggingFace token: ").strip()
    if token:
        login(token=token, add_to_git_credential=True)
    else:
        print("[ERROR] No token provided")
        sys.exit(1)
PYEOF
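# Tip: export HUGGINGFACE_TOKEN=hf_... beforehand to authenticate non-interactively.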
[ $? -ne 0 ] && exit 1
# === Step 4: Detect GPU ===
echo
GPU_INFO=$(nvidia-smi --query-gpu=name --format=csv,noheader,nounits | head -n1)
VRAM_GB=$(($(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | head -n1) / 1024))
echo "[INFO] GPU: $GPU_INFO ($VRAM_GB GB VRAM)"
# === Step 5: Model Selection ===
echo
echo "Available Qwen Models:"
echo " [1] Qwen3-Coder-30B-A3B-Instruct-FP8 (⚑ RECOMMENDED - Fast, efficient)"
echo " [2] Qwen3-Coder-30B-A3B-Instruct (Full precision)"
echo " [3] Custom Qwen model"
echo
read -rp "Choice (1-3): " choice
case "$choice" in
1) MODEL="Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"; IS_FP8=true ;;
2) MODEL="Qwen/Qwen3-Coder-30B-A3B-Instruct"; IS_FP8=false ;;
3)
read -rp "Model name (e.g., Qwen/Model-Name): " MODEL
[[ "$MODEL" == *"FP8"* ]] && IS_FP8=true || IS_FP8=false
;;
*) echo "[ERROR] Invalid choice"; exit 1 ;;
esac
# === Step 6: Auto-calculate Optimal Settings ===
# Determine model size from name
if [[ "$MODEL" == *"72B"* ]]; then MODEL_SIZE=72; BASE_VRAM_FP16=144; BASE_VRAM_FP8=72
elif [[ "$MODEL" == *"70B"* ]]; then MODEL_SIZE=70; BASE_VRAM_FP16=140; BASE_VRAM_FP8=70
elif [[ "$MODEL" == *"30B"* ]]; then MODEL_SIZE=30; BASE_VRAM_FP16=60; BASE_VRAM_FP8=30
elif [[ "$MODEL" == *"14B"* ]]; then MODEL_SIZE=14; BASE_VRAM_FP16=28; BASE_VRAM_FP8=14
elif [[ "$MODEL" == *"8B"* ]] || [[ "$MODEL" == *"7B"* ]]; then MODEL_SIZE=8; BASE_VRAM_FP16=16; BASE_VRAM_FP8=8
else MODEL_SIZE=30; BASE_VRAM_FP16=60; BASE_VRAM_FP8=30; fi
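# Example: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8" matches *30B*, giving MODEL_SIZE=30,
# BASE_VRAM_FP16=60 and BASE_VRAM_FP8=30 (rule of thumb: ~2 GB per billion params at FP16, ~1 GB at FP8).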
# Calculate VRAM allocation
BASE_VRAM=$( [ "$IS_FP8" = true ] && echo $BASE_VRAM_FP8 || echo $BASE_VRAM_FP16 )
OVERHEAD=10 # Reserve for system/ops
AVAILABLE_KV=$((VRAM_GB - BASE_VRAM - OVERHEAD))
echo
echo "[INFO] Model: ${MODEL_SIZE}B parameters ($([ "$IS_FP8" = true ] && echo "FP8" || echo "FP16"))"
echo "[INFO] Base model VRAM: ${BASE_VRAM}GB | Available for context: ${AVAILABLE_KV}GB"
# Calculate max context based on available VRAM
# KV cache memory per 1K tokens: ~0.8GB (FP16) or ~0.4GB (FP8) for 30B model
if [ "$IS_FP8" = true ]; then
if [ $AVAILABLE_KV -ge 100 ]; then MAX_CTX=262144 # 256K
elif [ $AVAILABLE_KV -ge 50 ]; then MAX_CTX=131072 # 128K
elif [ $AVAILABLE_KV -ge 25 ]; then MAX_CTX=65536 # 64K
elif [ $AVAILABLE_KV -ge 12 ]; then MAX_CTX=32768 # 32K
else MAX_CTX=16384; fi # 16K
else
if [ $AVAILABLE_KV -ge 100 ]; then MAX_CTX=131072 # 128K
elif [ $AVAILABLE_KV -ge 50 ]; then MAX_CTX=65536 # 64K
elif [ $AVAILABLE_KV -ge 25 ]; then MAX_CTX=32768 # 32K
elif [ $AVAILABLE_KV -ge 12 ]; then MAX_CTX=16384 # 16K
else MAX_CTX=8192; fi # 8K
fi
MAX_BATCHED=$((MAX_CTX * 6))
echo "[INFO] Context window: $MAX_CTX tokens | Batched tokens: $MAX_BATCHED"
# GPU-specific recommendations
echo
if [ $VRAM_GB -lt $((BASE_VRAM + 10)) ]; then
  echo "⚠️ WARNING: GPU may be insufficient for this model"
  echo " Required: ~$((BASE_VRAM + 10))GB | Available: ${VRAM_GB}GB"
  [ "$IS_FP8" = false ] && echo " 💡 Try the FP8 variant to cut model VRAM roughly in half"
  read -rp "Continue anyway? (y/N): " confirm
  [[ ! "$confirm" =~ ^[Yy]$ ]] && exit 1
elif [ "$IS_FP8" = false ] && [ $VRAM_GB -ge 140 ]; then
  echo "💡 TIP: You have ${VRAM_GB}GB VRAM but are using the FP16 model"
  echo " Consider FP8 for faster inference and a larger context window"
elif [ "$IS_FP8" = true ] && [ $VRAM_GB -ge 140 ]; then
  echo "💡 TIP: With ${VRAM_GB}GB VRAM (e.g., an H200), this FP8 model can run a 256K context"
fi
# === Step 7: Download Model ===
MODEL_PATH="$MODEL_FOLDER/$MODEL"
if [ ! -d "$MODEL_PATH" ]; then
  echo
  echo "[INFO] Downloading $MODEL..."
  python3 << PYEOF
from huggingface_hub import snapshot_download
import sys
try:
    snapshot_download(
        repo_id="$MODEL",
        local_dir="$MODEL_PATH",
        ignore_patterns=["original/*", "*.md"],
        local_dir_use_symlinks=False
    )
    print("[INFO] Download complete!")
except Exception as e:
    print(f"[ERROR] Download failed: {e}")
    sys.exit(1)
PYEOF
  [ $? -ne 0 ] && exit 1
else
  echo "[INFO] Model already cached"
fi
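# Optional speed-up: `pip install hf_transfer` and `export HF_HUB_ENABLE_HF_TRANSFER=1`
# let huggingface_hub use its Rust-based downloader for large shards.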
# === Step 8: Cleanup Existing Containers ===
echo
echo "[INFO] Preparing container..."
docker ps -a --filter ancestor=vllm/vllm-openai:v0.9.0 --format "{{.ID}}" | xargs -r docker rm -f 2>/dev/null || true
sleep 3
# === Step 9: Build vLLM Command ===
VLLM_CMD="python3 -m vllm.entrypoints.openai.api_server \
--model '$MODEL' \
--port $PORT \
--tensor-parallel-size 1 \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_CTX \
--max-num-batched-tokens $MAX_BATCHED \
--enable-chunked-prefill \
--kv-cache-dtype auto \
--enable-prefix-caching \
--disable-log-requests \
--swap-space 16 \
--enable-auto-tool-choice \
--tool-call-parser hermes"
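# Flag notes (as documented for vLLM 0.9.x; verify with --help in your version):
#   --gpu-memory-utilization 0.95   fraction of GPU memory vLLM may claim (weights + KV cache)
#   --max-model-len / --max-num-batched-tokens   context window and scheduler token budget computed above
#   --enable-chunked-prefill        interleaves long prompt prefills with decode steps
#   --enable-prefix-caching         reuses KV cache across requests sharing a prompt prefix
#   --swap-space 16                 GiB of CPU swap space for preempted sequences
#   --enable-auto-tool-choice + --tool-call-parser hermes   OpenAI-style tool calls parsed in Hermes format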
# Add FP8 quantization if needed
[ "$IS_FP8" = true ] && VLLM_CMD="$VLLM_CMD --quantization fp8"
# === Step 10: Launch Model ===
echo "[INFO] Starting model server..."
docker exec -d "$CONTAINER_NAME" bash -c "export CUDA_VISIBLE_DEVICES='0' && $VLLM_CMD > /var/log/vllm.log 2>&1"
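# CUDA_VISIBLE_DEVICES='0' pins the server to the first GPU, matching --tensor-parallel-size 1;
# expose more devices and raise the tensor-parallel size for multi-GPU setups.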
# === Step 11: Wait for API Ready ===
echo
echo "⏳ Waiting for model to load (typically 3-8 minutes)..."
MAX_WAIT=900
INTERVAL=10
ELAPSED=0
while [ $ELAPSED -lt $MAX_WAIT ]; do
  if curl -s -o /dev/null -w "%{http_code}" "http://localhost:$PORT/v1/models" | grep -q "200"; then
    echo
    echo "✅ MODEL READY!"
    echo
    echo "═══════════════════════════════════════════════"
    echo "Model: $MODEL"
    echo "Context: $MAX_CTX tokens | Precision: $([ "$IS_FP8" = true ] && echo "FP8 ⚡" || echo "FP16")"
    echo "Endpoint: http://localhost:$PORT/v1"
    echo "═══════════════════════════════════════════════"
    echo
    echo "Quick test:"
    echo " curl http://localhost:$PORT/v1/models"
    echo
    echo "Chat example:"
    # Unquoted heredoc so $PORT and $MODEL expand in the printed example; \\ prints a literal backslash
    cat << EOF
curl http://localhost:$PORT/v1/chat/completions \\
 -H "Content-Type: application/json" \\
 -d '{
 "model": "$MODEL",
 "messages": [{"role": "user", "content": "Write a Python quicksort"}],
 "max_tokens": 2048
 }'
EOF
    echo
    echo "View logs:"
    echo " docker exec $CONTAINER_NAME tail -f /var/log/vllm.log"
    echo "═══════════════════════════════════════════════"
    exit 0
  fi
  printf "."
  sleep $INTERVAL
  ELAPSED=$((ELAPSED + INTERVAL))
done
echo
echo "❌ Timeout after $((MAX_WAIT/60)) minutes"
echo "Check logs: docker exec $CONTAINER_NAME tail -n 100 /var/log/vllm.log"
exit 1
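# --- Usage sketch (assumes the "vllm" container described above is already running) ---
#   chmod +x streamlined_qwen.sh
#   HUGGINGFACE_TOKEN=hf_xxx ./streamlined_qwen.sh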