@alexjyong
Created December 9, 2025 03:44
streamlined_qwen.sh
#!/bin/bash
set -e
# === Configuration ===
CONTAINER_NAME="vllm"
PORT=8000
MODEL_FOLDER="/root/.cache/huggingface/hub"
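# NOTE (assumption): a container named "$CONTAINER_NAME" with GPU access and vLLM installed
# must already be running (the script launches the server inside it via `docker exec` in Step 10),
# and the HuggingFace cache is expected to be visible at the same path inside that container
# so downloaded weights are reused. Adjust the values above to match your setup.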
# === Step 1: Setup Conda Environment ===
echo "[INFO] Setting up environment..."
if ! conda env list | grep -qE "^inference[[:space:]]"; then
  conda create -n inference python=3.12 -y
fi
eval "$(conda shell.bash hook)"
conda activate inference
# === Step 2: Install Dependencies ===
if ! python3 -c "import huggingface_hub" &>/dev/null; then
  pip install -q -U huggingface_hub
fi
mkdir -p "$MODEL_FOLDER"
# === Step 3: HuggingFace Authentication ===
python3 << 'PYEOF'
from huggingface_hub import HfApi, login
import getpass, os, sys

try:
    HfApi().whoami()
    print("[INFO] Already authenticated")
except Exception:
    # stdin is this heredoc, so prompt via getpass (it reads from the controlling tty)
    token = os.environ.get('HUGGINGFACE_TOKEN') or getpass.getpass("[INPUT] HuggingFace token: ").strip()
    if token:
        login(token=token, add_to_git_credential=True)
    else:
        print("[ERROR] No token provided")
        sys.exit(1)
PYEOF
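# Tip: export HUGGINGFACE_TOKEN=hf_... beforehand to authenticate non-interactively.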
[ $? -ne 0 ] && exit 1
# === Step 4: Detect GPU ===
echo
GPU_INFO=$(nvidia-smi --query-gpu=name --format=csv,noheader,nounits | head -n1)
VRAM_GB=$(($(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | head -n1) / 1024))
echo "[INFO] GPU: $GPU_INFO ($VRAM_GB GB VRAM)"
# === Step 5: Model Selection ===
echo
echo "Available Qwen Models:"
echo " [1] Qwen3-Coder-30B-A3B-Instruct-FP8 (⚑ RECOMMENDED - Fast, efficient)"
echo " [2] Qwen3-Coder-30B-A3B-Instruct (Full precision)"
echo " [3] Custom Qwen model"
echo
read -rp "Choice (1-3): " choice
case "$choice" in
1) MODEL="Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"; IS_FP8=true ;;
2) MODEL="Qwen/Qwen3-Coder-30B-A3B-Instruct"; IS_FP8=false ;;
3)
read -rp "Model name (e.g., Qwen/Model-Name): " MODEL
[[ "$MODEL" == *"FP8"* ]] && IS_FP8=true || IS_FP8=false
;;
*) echo "[ERROR] Invalid choice"; exit 1 ;;
esac
# === Step 6: Auto-calculate Optimal Settings ===
# Determine model size from name
if [[ "$MODEL" == *"72B"* ]]; then MODEL_SIZE=72; BASE_VRAM_FP16=144; BASE_VRAM_FP8=72
elif [[ "$MODEL" == *"70B"* ]]; then MODEL_SIZE=70; BASE_VRAM_FP16=140; BASE_VRAM_FP8=70
elif [[ "$MODEL" == *"30B"* ]]; then MODEL_SIZE=30; BASE_VRAM_FP16=60; BASE_VRAM_FP8=30
elif [[ "$MODEL" == *"14B"* ]]; then MODEL_SIZE=14; BASE_VRAM_FP16=28; BASE_VRAM_FP8=14
elif [[ "$MODEL" == *"8B"* ]] || [[ "$MODEL" == *"7B"* ]]; then MODEL_SIZE=8; BASE_VRAM_FP16=16; BASE_VRAM_FP8=8
else MODEL_SIZE=30; BASE_VRAM_FP16=60; BASE_VRAM_FP8=30; fi
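# Example: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8" matches *30B*, giving MODEL_SIZE=30,
# BASE_VRAM_FP16=60 and BASE_VRAM_FP8=30 (rule of thumb: ~2 GB per billion params at FP16, ~1 GB at FP8).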
# Calculate VRAM allocation
BASE_VRAM=$( [ "$IS_FP8" = true ] && echo $BASE_VRAM_FP8 || echo $BASE_VRAM_FP16 )
OVERHEAD=10 # Reserve for system/ops
AVAILABLE_KV=$((VRAM_GB - BASE_VRAM - OVERHEAD))
echo
echo "[INFO] Model: ${MODEL_SIZE}B parameters ($([ "$IS_FP8" = true ] && echo "FP8" || echo "FP16"))"
echo "[INFO] Base model VRAM: ${BASE_VRAM}GB | Available for context: ${AVAILABLE_KV}GB"
# Calculate max context based on available VRAM
# KV cache memory per 1K tokens: ~0.8GB (FP16) or ~0.4GB (FP8) for 30B model
if [ "$IS_FP8" = true ]; then
if [ $AVAILABLE_KV -ge 100 ]; then MAX_CTX=262144 # 256K
elif [ $AVAILABLE_KV -ge 50 ]; then MAX_CTX=131072 # 128K
elif [ $AVAILABLE_KV -ge 25 ]; then MAX_CTX=65536 # 64K
elif [ $AVAILABLE_KV -ge 12 ]; then MAX_CTX=32768 # 32K
else MAX_CTX=16384; fi # 16K
else
if [ $AVAILABLE_KV -ge 100 ]; then MAX_CTX=131072 # 128K
elif [ $AVAILABLE_KV -ge 50 ]; then MAX_CTX=65536 # 64K
elif [ $AVAILABLE_KV -ge 25 ]; then MAX_CTX=32768 # 32K
elif [ $AVAILABLE_KV -ge 12 ]; then MAX_CTX=16384 # 16K
else MAX_CTX=8192; fi # 8K
fi
MAX_BATCHED=$((MAX_CTX * 6))
echo "[INFO] Context window: $MAX_CTX tokens | Batched tokens: $MAX_BATCHED"
# GPU-specific recommendations
echo
if [ $VRAM_GB -lt $((BASE_VRAM + 10)) ]; then
  echo "⚠️ WARNING: GPU may be insufficient for this model"
  echo " Required: ~$((BASE_VRAM + 10))GB | Available: ${VRAM_GB}GB"
  [ "$IS_FP8" = false ] && echo " 💡 Try the FP8 variant to cut model VRAM roughly in half"
  read -rp "Continue anyway? (y/N): " confirm
  [[ ! "$confirm" =~ ^[Yy]$ ]] && exit 1
elif [ "$IS_FP8" = false ] && [ $VRAM_GB -ge 140 ]; then
  echo "💡 TIP: You have ${VRAM_GB}GB VRAM but are using the FP16 model"
  echo " Consider FP8 for faster inference and a larger context window"
elif [ "$IS_FP8" = true ] && [ $VRAM_GB -ge 140 ]; then
  echo "💡 TIP: With ${VRAM_GB}GB VRAM (e.g., an H200), this FP8 model can run a 256K context"
fi
# === Step 7: Download Model ===
MODEL_PATH="$MODEL_FOLDER/$MODEL"
if [ ! -d "$MODEL_PATH" ]; then
  echo
  echo "[INFO] Downloading $MODEL..."
  python3 << PYEOF
from huggingface_hub import snapshot_download
import sys
try:
    snapshot_download(
        repo_id="$MODEL",
        local_dir="$MODEL_PATH",
        ignore_patterns=["original/*", "*.md"],
        local_dir_use_symlinks=False
    )
    print("[INFO] Download complete!")
except Exception as e:
    print(f"[ERROR] Download failed: {e}")
    sys.exit(1)
PYEOF
  [ $? -ne 0 ] && exit 1
else
  echo "[INFO] Model already cached"
fi
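# Optional speed-up: `pip install hf_transfer` and `export HF_HUB_ENABLE_HF_TRANSFER=1`
# let huggingface_hub use its Rust-based downloader for large shards.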
# === Step 8: Cleanup Existing Containers ===
echo
echo "[INFO] Preparing container..."
docker ps -a --filter ancestor=vllm/vllm-openai:v0.9.0 --format "{{.ID}}" | xargs -r docker rm -f 2>/dev/null || true
sleep 3
# === Step 9: Build vLLM Command ===
VLLM_CMD="python3 -m vllm.entrypoints.openai.api_server \
--model '$MODEL' \
--port $PORT \
--tensor-parallel-size 1 \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_CTX \
--max-num-batched-tokens $MAX_BATCHED \
--enable-chunked-prefill \
--kv-cache-dtype auto \
--enable-prefix-caching \
--disable-log-requests \
--swap-space 16 \
--enable-auto-tool-choice \
--tool-call-parser hermes"
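# Flag notes (as documented for vLLM 0.9.x; verify with --help in your version):
#   --gpu-memory-utilization 0.95   fraction of GPU memory vLLM may claim (weights + KV cache)
#   --max-model-len / --max-num-batched-tokens   context window and scheduler token budget computed above
#   --enable-chunked-prefill        interleaves long prompt prefills with decode steps
#   --enable-prefix-caching         reuses KV cache across requests sharing a prompt prefix
#   --swap-space 16                 GiB of CPU swap space for preempted sequences
#   --enable-auto-tool-choice + --tool-call-parser hermes   OpenAI-style tool calls parsed in Hermes format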
# Add FP8 quantization if needed
[ "$IS_FP8" = true ] && VLLM_CMD="$VLLM_CMD --quantization fp8"
# === Step 10: Launch Model ===
echo "[INFO] Starting model server..."
docker exec -d "$CONTAINER_NAME" bash -c "export CUDA_VISIBLE_DEVICES='0' && $VLLM_CMD > /var/log/vllm.log 2>&1"
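# CUDA_VISIBLE_DEVICES='0' pins the server to the first GPU, matching --tensor-parallel-size 1;
# expose more devices and raise the tensor-parallel size for multi-GPU setups.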
# === Step 11: Wait for API Ready ===
echo
echo "⏳ Waiting for model to load (typically 3-8 minutes)..."
MAX_WAIT=900
INTERVAL=10
ELAPSED=0
while [ $ELAPSED -lt $MAX_WAIT ]; do
  if curl -s -o /dev/null -w "%{http_code}" "http://localhost:$PORT/v1/models" | grep -q "200"; then
    echo
    echo "✅ MODEL READY!"
    echo
    echo "═══════════════════════════════════════════════"
    echo "Model: $MODEL"
    echo "Context: $MAX_CTX tokens | Precision: $([ "$IS_FP8" = true ] && echo "FP8 ⚡" || echo "FP16")"
    echo "Endpoint: http://localhost:$PORT/v1"
    echo "═══════════════════════════════════════════════"
    echo
    echo "Quick test:"
    echo " curl http://localhost:$PORT/v1/models"
    echo
    echo "Chat example:"
    # Unquoted heredoc so $PORT and $MODEL expand in the printed example; \\ prints a literal backslash
    cat << EOF
curl http://localhost:$PORT/v1/chat/completions \\
 -H "Content-Type: application/json" \\
 -d '{
 "model": "$MODEL",
 "messages": [{"role": "user", "content": "Write a Python quicksort"}],
 "max_tokens": 2048
 }'
EOF
    echo
    echo "View logs:"
    echo " docker exec $CONTAINER_NAME tail -f /var/log/vllm.log"
    echo "═══════════════════════════════════════════════"
    exit 0
  fi
  printf "."
  sleep $INTERVAL
  ELAPSED=$((ELAPSED + INTERVAL))
done
echo
echo "❌ Timeout after $((MAX_WAIT/60)) minutes"
echo "Check logs: docker exec $CONTAINER_NAME tail -n 100 /var/log/vllm.log"
exit 1
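# --- Usage sketch (assumes the "vllm" container described above is already running) ---
#   chmod +x streamlined_qwen.sh
#   HUGGINGFACE_TOKEN=hf_xxx ./streamlined_qwen.sh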