|
# Adapted from: |
|
# https://huggingface.co/docs/google-cloud/en/examples/vertex-ai-notebooks-deploy-llama-3-1-405b-on-vertex-ai |
|
|
|
# %% |
|
import os |
|
from google.cloud import aiplatform |
|
from huggingface_hub import interpreter_login |
|
from huggingface_hub import get_token |
|
from dotenv import load_dotenv |
|
|
|
|
|
# %% |
|
# Configure the Vertex AI SDK from .env and authenticate with the HF Hub.
load_dotenv()

project_id = os.getenv("PROJECT_ID")
region = os.getenv("LOCATION")

aiplatform.init(project=project_id, location=region)

# Interactive Hugging Face login so that get_token() can return a token later.
interpreter_login()
|
|
|
# %% |
|
|
|
# TGI context-window settings. Values are strings because they are injected
# as container environment variables.
MAX_INPUT_TOKENS = "3072"
MAX_TOTAL_TOKENS = "4096"
# TGI requires max_batch_prefill_tokens >= max_input_tokens.
MAX_BATCH_PREFILL_TOKENS = "3072"

# Register EuroLLM-9B-Instruct with Vertex AI, served by the Hugging Face
# Text Generation Inference (TGI) deep-learning container.
model = aiplatform.Model.upload(
    display_name="eurollm-9b-instruct",
    serving_container_image_uri="us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310",
    serving_container_environment_variables={
        "MODEL_ID": "utter-project/EuroLLM-9B-Instruct",
        "HUGGING_FACE_HUB_TOKEN": get_token(),
        "HF_HUB_ENABLE_HF_TRANSFER": "1",

        # Hardware & precision: single L4 GPU, bfloat16 for efficiency.
        # (The original dict listed NUM_SHARD and DTYPE twice; duplicate keys
        # in a Python dict literal are silently collapsed to the last value,
        # so each is declared exactly once here.)
        "NUM_SHARD": "1",
        "DTYPE": "bfloat16",

        # Context window configuration (see constants above).
        "MAX_INPUT_TOKENS": MAX_INPUT_TOKENS,
        "MAX_TOTAL_TOKENS": MAX_TOTAL_TOKENS,
        "MAX_BATCH_PREFILL_TOKENS": MAX_BATCH_PREFILL_TOKENS,

        # Memory headroom on the 24 GB L4: ~18 GB of bf16 weights leaves
        # ~6 GB for the KV cache. If you hit OOM, lower MAX_TOTAL_TOKENS
        # (e.g. to 2048).
        # NOTE(review): TGI reads CUDA_MEMORY_FRACTION; the original key
        # GPU_MEMORY_UTILIZATION is a vLLM option that TGI silently ignores.
        "CUDA_MEMORY_FRACTION": "0.95",
    },
    serving_container_ports=[8080],
    serving_container_predict_route="/predict",
    serving_container_health_route="/healthz",
)
# Block until the model registration/upload completes.
model.wait()
|
|
|
# %% |
|
# Provision a dedicated Vertex AI endpoint to host the deployed model.
endpoint_display_name = "eurollm-9b-endpoint"
endpoint = aiplatform.Endpoint.create(display_name=endpoint_display_name)
|
|
|
# %% |
|
# Deploy the uploaded model onto the endpoint: one g2-standard-8 VM with a
# single NVIDIA L4 (24 GB VRAM), request/response access logging enabled.
deployment_config = dict(
    endpoint=endpoint,
    machine_type="g2-standard-8",  # L4 host machine, 24 GB VRAM
    accelerator_type="NVIDIA_L4",
    accelerator_count=1,
    enable_access_logging=True,
)
deployed_model = model.deploy(**deployment_config)