streamlined_qwen.sh
#!/bin/bash
set -e

# === Configuration ===
CONTAINER_NAME="vllm"
PORT=8000
MODEL_FOLDER="/root/.cache/huggingface/hub"
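# NOTE: CONTAINER_NAME refers to an already-running container (Step 10 uses
# `docker exec`); this script does not create the container itself.
# MODEL_FOLDER matches the default Hugging Face cache location on the host.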
# === Step 1: Setup Conda Environment ===
echo "[INFO] Setting up environment..."
if ! conda env list | grep -qE "^inference[[:space:]]"; then
    conda create -n inference python=3.12 -y
fi
eval "$(conda shell.bash hook)"
conda activate inference
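# `conda shell.bash hook` makes `conda activate` work in this non-interactive
# shell, where conda's usual bashrc initialization has not been sourced.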
# === Step 2: Install Dependencies ===
if ! python3 -c "import huggingface_hub" &>/dev/null; then
    pip install -q -U huggingface_hub
fi
mkdir -p "$MODEL_FOLDER"
# === Step 3: HuggingFace Authentication ===
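# Reuses an existing cached login if one exists; otherwise takes the token from
# the HUGGINGFACE_TOKEN environment variable or prompts for it interactively.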
python3 << 'PYEOF'
from huggingface_hub import HfApi, login
import sys, os
try:
    HfApi().whoami()
    print("[INFO] Already authenticated")
except Exception:
    token = os.environ.get('HUGGINGFACE_TOKEN') or input("[INPUT] HuggingFace token: ").strip()
    if token:
        login(token=token, add_to_git_credential=True)
    else:
        print("[ERROR] No token provided")
        sys.exit(1)
PYEOF
[ $? -ne 0 ] && exit 1
# === Step 4: Detect GPU ===
echo
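# Assumes nvidia-smi is on PATH and sizes everything from the first GPU only
# (the launch below pins CUDA_VISIBLE_DEVICES=0, i.e. single-GPU serving).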
GPU_INFO=$(nvidia-smi --query-gpu=name --format=csv,noheader,nounits | head -n1)
VRAM_GB=$(($(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | head -n1) / 1024))
echo "[INFO] GPU: $GPU_INFO ($VRAM_GB GB VRAM)"
# === Step 5: Model Selection ===
echo
echo "Available Qwen Models:"
echo " [1] Qwen3-Coder-30B-A3B-Instruct-FP8 (⚡ RECOMMENDED - Fast, efficient)"
echo " [2] Qwen3-Coder-30B-A3B-Instruct (Full precision)"
echo " [3] Custom Qwen model"
echo
read -rp "Choice (1-3): " choice
case "$choice" in
    1) MODEL="Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"; IS_FP8=true ;;
    2) MODEL="Qwen/Qwen3-Coder-30B-A3B-Instruct"; IS_FP8=false ;;
    3)
        read -rp "Model name (e.g., Qwen/Model-Name): " MODEL
        [[ "$MODEL" == *"FP8"* ]] && IS_FP8=true || IS_FP8=false
        ;;
    *) echo "[ERROR] Invalid choice"; exit 1 ;;
esac
# === Step 6: Auto-calculate Optimal Settings ===
# Determine model size from name
if [[ "$MODEL" == *"72B"* ]]; then MODEL_SIZE=72; BASE_VRAM_FP16=144; BASE_VRAM_FP8=72
elif [[ "$MODEL" == *"70B"* ]]; then MODEL_SIZE=70; BASE_VRAM_FP16=140; BASE_VRAM_FP8=70
elif [[ "$MODEL" == *"30B"* ]]; then MODEL_SIZE=30; BASE_VRAM_FP16=60; BASE_VRAM_FP8=30
elif [[ "$MODEL" == *"14B"* ]]; then MODEL_SIZE=14; BASE_VRAM_FP16=28; BASE_VRAM_FP8=14
elif [[ "$MODEL" == *"8B"* ]] || [[ "$MODEL" == *"7B"* ]]; then MODEL_SIZE=8; BASE_VRAM_FP16=16; BASE_VRAM_FP8=8
else MODEL_SIZE=30; BASE_VRAM_FP16=60; BASE_VRAM_FP8=30; fi
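# Rough weight-memory heuristic: ~2 bytes/parameter at FP16 and ~1 byte/parameter
# at FP8 (e.g. 30B params -> ~60GB FP16 / ~30GB FP8); unrecognized model names
# fall back to the 30B figures.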
# Calculate VRAM allocation
BASE_VRAM=$( [ "$IS_FP8" = true ] && echo $BASE_VRAM_FP8 || echo $BASE_VRAM_FP16 )
OVERHEAD=10  # Reserve for system/ops
AVAILABLE_KV=$((VRAM_GB - BASE_VRAM - OVERHEAD))
echo
echo "[INFO] Model: ${MODEL_SIZE}B parameters ($([ "$IS_FP8" = true ] && echo "FP8" || echo "FP16"))"
echo "[INFO] Base model VRAM: ${BASE_VRAM}GB | Available for context: ${AVAILABLE_KV}GB"
# Calculate max context based on available VRAM
# KV cache memory per 1K tokens: ~0.8GB (FP16) or ~0.4GB (FP8) for 30B model
if [ "$IS_FP8" = true ]; then
    if   [ $AVAILABLE_KV -ge 100 ]; then MAX_CTX=262144  # 256K
    elif [ $AVAILABLE_KV -ge 50 ];  then MAX_CTX=131072  # 128K
    elif [ $AVAILABLE_KV -ge 25 ];  then MAX_CTX=65536   # 64K
    elif [ $AVAILABLE_KV -ge 12 ];  then MAX_CTX=32768   # 32K
    else MAX_CTX=16384; fi  # 16K
else
    if   [ $AVAILABLE_KV -ge 100 ]; then MAX_CTX=131072  # 128K
    elif [ $AVAILABLE_KV -ge 50 ];  then MAX_CTX=65536   # 64K
    elif [ $AVAILABLE_KV -ge 25 ];  then MAX_CTX=32768   # 32K
    elif [ $AVAILABLE_KV -ge 12 ];  then MAX_CTX=16384   # 16K
    else MAX_CTX=8192; fi  # 8K
fi
MAX_BATCHED=$((MAX_CTX * 6))
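# Heuristic: budget up to ~6x the context window for in-flight batched tokens
# (fed to --max-num-batched-tokens below) so chunked prefill can serve several
# concurrent requests; lower it if the vLLM logs show preemption or OOM.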
echo "[INFO] Context window: $MAX_CTX tokens | Batched tokens: $MAX_BATCHED"
# GPU-specific recommendations
echo
if [ $VRAM_GB -lt $((BASE_VRAM + 10)) ]; then
    echo "⚠️  WARNING: GPU may be insufficient for this model"
    echo "   Required: ~$((BASE_VRAM + 10))GB | Available: ${VRAM_GB}GB"
    [ "$IS_FP8" = false ] && echo "   💡 Try the FP8 variant to cut VRAM use roughly in half"
    read -rp "Continue anyway? (y/N): " confirm
    [[ ! "$confirm" =~ ^[Yy]$ ]] && exit 1
elif [ "$IS_FP8" = false ] && [ $VRAM_GB -ge 140 ]; then
    echo "💡 TIP: You have ${VRAM_GB}GB VRAM but are running an FP16 model"
    echo "   Consider FP8 for higher throughput and a larger context window"
elif [ "$IS_FP8" = true ] && [ $VRAM_GB -ge 140 ]; then
    echo "💡 TIP: With ${VRAM_GB}GB VRAM (H200-class), this FP8 model can run a 256K context!"
fi
# === Step 7: Download Model ===
MODEL_PATH="$MODEL_FOLDER/$MODEL"
if [ ! -d "$MODEL_PATH" ]; then
    echo
    echo "[INFO] Downloading $MODEL..."
    python3 << PYEOF
import sys
from huggingface_hub import snapshot_download
try:
    snapshot_download(
        repo_id="$MODEL",
        local_dir="$MODEL_PATH",
        ignore_patterns=["original/*", "*.md"],
        local_dir_use_symlinks=False
    )
    print("[INFO] Download complete!")
except Exception as e:
    print(f"[ERROR] Download failed: {e}")
    sys.exit(1)
PYEOF
    [ $? -ne 0 ] && exit 1
else
    echo "[INFO] Model already cached"
fi
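# Manual alternative (illustrative, assumes the huggingface_hub CLI is installed):
#   huggingface-cli download "$MODEL" --local-dir "$MODEL_PATH"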
# === Step 8: Cleanup Existing Containers ===
echo
echo "[INFO] Preparing container..."
docker ps -a --filter ancestor=vllm/vllm-openai:v0.9.0 --format "{{.ID}}" | xargs -r docker rm -f 2>/dev/null || true
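# Assumes the "$CONTAINER_NAME" container itself is NOT based on
# vllm/vllm-openai:v0.9.0; otherwise this cleanup would remove it and the
# `docker exec` in Step 10 would fail.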
sleep 3
# === Step 9: Build vLLM Command ===
VLLM_CMD="python3 -m vllm.entrypoints.openai.api_server \
    --model '$MODEL' \
    --port $PORT \
    --tensor-parallel-size 1 \
    --gpu-memory-utilization 0.95 \
    --max-model-len $MAX_CTX \
    --max-num-batched-tokens $MAX_BATCHED \
    --enable-chunked-prefill \
    --kv-cache-dtype auto \
    --enable-prefix-caching \
    --disable-log-requests \
    --swap-space 16 \
    --enable-auto-tool-choice \
    --tool-call-parser hermes"
# Add FP8 quantization if needed
[ "$IS_FP8" = true ] && VLLM_CMD="$VLLM_CMD --quantization fp8"
# === Step 10: Launch Model ===
echo "[INFO] Starting model server..."
docker exec -d "$CONTAINER_NAME" bash -c "export CUDA_VISIBLE_DEVICES='0' && $VLLM_CMD > /var/log/vllm.log 2>&1"
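# `docker exec -d` returns immediately; the server keeps loading in the
# background and writes its output to /var/log/vllm.log inside the container.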
# === Step 11: Wait for API Ready ===
echo
echo "⏳ Waiting for model to load (typically 3-8 minutes)..."
MAX_WAIT=900
INTERVAL=10
ELAPSED=0
while [ $ELAPSED -lt $MAX_WAIT ]; do
    if curl -s -o /dev/null -w "%{http_code}" "http://localhost:$PORT/v1/models" | grep -q "200"; then
        echo
        echo "✅ MODEL READY!"
        echo
        echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
        echo "Model: $MODEL"
        echo "Context: $MAX_CTX tokens | Precision: $([ "$IS_FP8" = true ] && echo "FP8 ⚡" || echo "FP16")"
        echo "Endpoint: http://localhost:$PORT/v1"
        echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
        echo
        echo "Quick test:"
        echo "  curl http://localhost:$PORT/v1/models"
        echo
        echo "Chat example:"
        # Unquoted delimiter so $PORT and $MODEL expand into the printed example;
        # trailing backslashes are escaped so they survive into the output.
        cat << EOF
curl http://localhost:$PORT/v1/chat/completions \\
  -H "Content-Type: application/json" \\
  -d '{
    "model": "$MODEL",
    "messages": [{"role": "user", "content": "Write a Python quicksort"}],
    "max_tokens": 2048
  }'
EOF
        echo
        echo "View logs:"
        echo "  docker exec $CONTAINER_NAME tail -f /var/log/vllm.log"
        echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
        exit 0
    fi
    printf "."
    sleep $INTERVAL
    ELAPSED=$((ELAPSED + INTERVAL))
done
echo
echo "❌ Timeout after $((MAX_WAIT/60)) minutes"
echo "Check logs: docker exec $CONTAINER_NAME tail -n 100 /var/log/vllm.log"
exit 1