Scripts to run Hugging Face models on Vertex AI

Based on the tutorial: https://huggingface.co/docs/google-cloud/en/examples/vertex-ai-notebooks-deploy-llama-3-1-405b-on-vertex-ai

uv venv
source .venv/bin/activate
uv pip install -r requirements.txt
source .env  # loads PROJECT_ID and LOCATION (see example below)
gcloud auth login
gcloud auth application-default login  # for local development
gcloud config set project $PROJECT_ID
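# A minimal example .env (assumed contents; PROJECT_ID and LOCATION are the
# only variables the Python scripts below read; substitute your own values):
#   PROJECT_ID=your-gcp-project-id
#   LOCATION=us-central1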

# One-time setup: enable the required Google Cloud APIs
# gcloud services enable aiplatform.googleapis.com
# gcloud services enable compute.googleapis.com
# gcloud services enable container.googleapis.com
# gcloud services enable containerregistry.googleapis.com
# gcloud services enable containerfilesystem.googleapis.com
# Adapted from:
# https://huggingface.co/docs/google-cloud/en/examples/vertex-ai-notebooks-deploy-llama-3-1-405b-on-vertex-ai
# %%
import os
from google.cloud import aiplatform
from huggingface_hub import interpreter_login
from huggingface_hub import get_token
from dotenv import load_dotenv
# %%
load_dotenv()
aiplatform.init(
    project=os.getenv("PROJECT_ID"),
    location=os.getenv("LOCATION"),
)
interpreter_login()
# %%
# TGI context-window settings: MAX_INPUT_TOKENS must be smaller than
# MAX_TOTAL_TOKENS, and MAX_BATCH_PREFILL_TOKENS must be >= MAX_INPUT_TOKENS.
MAX_INPUT_TOKENS = "3072"
MAX_TOTAL_TOKENS = "4096"
MAX_BATCH_PREFILL_TOKENS = "3072"
model = aiplatform.Model.upload(
    display_name="eurollm-9b-instruct",
    serving_container_image_uri="us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310",
    serving_container_environment_variables={
        "MODEL_ID": "utter-project/EuroLLM-9B-Instruct",
        "HUGGING_FACE_HUB_TOKEN": get_token(),
        "HF_HUB_ENABLE_HF_TRANSFER": "1",
        # Hardware & precision
        "NUM_SHARD": "1",  # run on a single GPU
        "DTYPE": "bfloat16",  # crucial for L4 efficiency
        # Context-window config
        "MAX_INPUT_TOKENS": MAX_INPUT_TOKENS,
        "MAX_TOTAL_TOKENS": MAX_TOTAL_TOKENS,
        "MAX_BATCH_PREFILL_TOKENS": MAX_BATCH_PREFILL_TOKENS,  # matches MAX_INPUT_TOKENS
        # Memory optimization for an L4 (24 GB): the 9B bfloat16 weights take
        # ~18 GB, leaving ~6 GB for the KV cache. If you see OOM errors,
        # lower MAX_TOTAL_TOKENS to 2048.
        "GPU_MEMORY_UTILIZATION": "0.95",
    },
    serving_container_ports=[8080],
    serving_container_predict_route="/predict",
    serving_container_health_route="/healthz",
)
model.wait()
# %%
endpoint = aiplatform.Endpoint.create(display_name="eurollm-9b-endpoint")
# %%
deployed_model = model.deploy(
    endpoint=endpoint,
    machine_type="g2-standard-8",  # G2 machine with one NVIDIA L4 (24 GB VRAM)
    accelerator_type="NVIDIA_L4",
    accelerator_count=1,
    enable_access_logging=True,
)
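# %%
# Smoke-test the endpoint (a sketch, not part of the original gist):
# Vertex AI TGI containers take the usual text-generation payload, an
# "inputs" string plus TGI "parameters". The prompt and parameters here
# are illustrative; adjust them to your use case.
output = endpoint.predict(
    instances=[
        {
            "inputs": "Translate to German: 'Hello, how are you?'",
            "parameters": {"max_new_tokens": 128, "temperature": 0.7},
        }
    ],
)
print(output.predictions[0])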
# Adapted from:
# https://huggingface.co/docs/google-cloud/en/examples/vertex-ai-notebooks-deploy-llama-3-1-405b-on-vertex-ai
# %%
import os
from google.cloud import aiplatform
from huggingface_hub import interpreter_login
from huggingface_hub import get_token
from dotenv import load_dotenv
# %%
load_dotenv()
aiplatform.init(
    project=os.getenv("PROJECT_ID"),
    location=os.getenv("LOCATION"),
)
interpreter_login()
# %%
model = aiplatform.Model.upload(
    display_name="meta-llama--Meta-Llama-3-8B",
    serving_container_image_uri="us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310",
    serving_container_environment_variables={
        "MODEL_ID": "meta-llama/Meta-Llama-3-8B",
        "HUGGING_FACE_HUB_TOKEN": get_token(),
        "HF_HUB_ENABLE_HF_TRANSFER": "1",
        "NUM_SHARD": "1",
        "MAX_INPUT_TOKENS": "1024",  # adjust for your short-context needs
        "MAX_TOTAL_TOKENS": "2048",  # adjust for your short-context needs
        "MAX_BATCH_PREFILL_TOKENS": "2048",  # optimize for latency
    },
    serving_container_ports=[8080],  # TGI listens on 8080 by default
    serving_container_predict_route="/predict",
    serving_container_health_route="/healthz",
)
model.wait()
# %%
deployed_model = model.deploy(
    endpoint=aiplatform.Endpoint.create(display_name="llama-3-8B-endpoint"),
    machine_type="g2-standard-4",  # G2 machine with one NVIDIA L4 (24 GB VRAM)
    accelerator_type="NVIDIA_L4",
    accelerator_count=1,
    enable_access_logging=True,
)
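# %%
# Same smoke test for the Llama endpoint (a sketch; model.deploy() returns
# the aiplatform.Endpoint, so deployed_model.predict() works directly).
# Meta-Llama-3-8B is a base model, so use a completion-style prompt.
output = deployed_model.predict(
    instances=[
        {"inputs": "The capital of France is", "parameters": {"max_new_tokens": 32}}
    ],
)
print(output.predictions[0])

# %%
# Tear-down, using standard google-cloud-aiplatform calls: the L4 instance
# bills while deployed, so undeploy and delete resources when finished.
deployed_model.undeploy_all()
deployed_model.delete()
model.delete()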
# requirements.txt, installed above with `uv pip install -r requirements.txt`
google-cloud-aiplatform
huggingface_hub
python-dotenv
transformers