|
# Adapted from: |
|
# https://huggingface.co/docs/google-cloud/en/examples/vertex-ai-notebooks-deploy-llama-3-1-405b-on-vertex-ai |
|
|
|
# %% |
|
import os |
|
from google.cloud import aiplatform |
|
from huggingface_hub import interpreter_login |
|
from huggingface_hub import get_token |
|
from dotenv import load_dotenv |
|
|
|
|
|
# %% |
|
# Configure the Vertex AI SDK from .env and authenticate with the HF Hub.
load_dotenv()

project_id = os.getenv("PROJECT_ID")
region = os.getenv("LOCATION")

aiplatform.init(project=project_id, location=region)

# Interactive Hugging Face login so that get_token() can return a token later.
interpreter_login()
|
|
|
# %% |
|
|
|
# TGI context-window settings. Values are strings because they are injected
# as container environment variables.
MAX_INPUT_TOKENS = "3072"
MAX_TOTAL_TOKENS = "4096"
# TGI requires max_batch_prefill_tokens >= max_input_tokens.
MAX_BATCH_PREFILL_TOKENS = "3072"

# Register EuroLLM-9B-Instruct with Vertex AI, served by the Hugging Face
# Text Generation Inference (TGI) deep-learning container.
model = aiplatform.Model.upload(
    display_name="eurollm-9b-instruct",
    serving_container_image_uri="us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310",
    serving_container_environment_variables={
        "MODEL_ID": "utter-project/EuroLLM-9B-Instruct",
        "HUGGING_FACE_HUB_TOKEN": get_token(),
        "HF_HUB_ENABLE_HF_TRANSFER": "1",

        # Hardware & precision: single L4 GPU, bfloat16 for efficiency.
        # (The original dict listed NUM_SHARD and DTYPE twice; duplicate keys
        # in a Python dict literal are silently collapsed to the last value,
        # so each is declared exactly once here.)
        "NUM_SHARD": "1",
        "DTYPE": "bfloat16",

        # Context window configuration (see constants above).
        "MAX_INPUT_TOKENS": MAX_INPUT_TOKENS,
        "MAX_TOTAL_TOKENS": MAX_TOTAL_TOKENS,
        "MAX_BATCH_PREFILL_TOKENS": MAX_BATCH_PREFILL_TOKENS,

        # Memory headroom on the 24 GB L4: ~18 GB of bf16 weights leaves
        # ~6 GB for the KV cache. If you hit OOM, lower MAX_TOTAL_TOKENS
        # (e.g. to 2048).
        # NOTE(review): TGI reads CUDA_MEMORY_FRACTION; the original key
        # GPU_MEMORY_UTILIZATION is a vLLM option that TGI silently ignores.
        "CUDA_MEMORY_FRACTION": "0.95",
    },
    serving_container_ports=[8080],
    serving_container_predict_route="/predict",
    serving_container_health_route="/healthz",
)
# Block until the model registration/upload completes.
model.wait()
|
|
|
# %% |
|
# Provision a dedicated Vertex AI endpoint to host the deployed model.
endpoint_display_name = "eurollm-9b-endpoint"
endpoint = aiplatform.Endpoint.create(display_name=endpoint_display_name)
|
|
|
# %% |
|
# Deploy the uploaded model onto the endpoint: one g2-standard-8 VM with a
# single NVIDIA L4 (24 GB VRAM), request/response access logging enabled.
deployment_config = dict(
    endpoint=endpoint,
    machine_type="g2-standard-8",  # L4 host machine, 24 GB VRAM
    accelerator_type="NVIDIA_L4",
    accelerator_count=1,
    enable_access_logging=True,
)
deployed_model = model.deploy(**deployment_config)