Skip to content

Instantly share code, notes, and snippets.

@alexgodin
Created February 12, 2026 14:50
Show Gist options
  • Select an option

  • Save alexgodin/e1abfeb8bdfa4d25a2678bd1cb07928a to your computer and use it in GitHub Desktop.

Select an option

Save alexgodin/e1abfeb8bdfa4d25a2678bd1cb07928a to your computer and use it in GitHub Desktop.
# Configuration for Cog ⚙️
# Reference: https://cog.run/yaml
build:
  # set to true if your model requires a GPU
  gpu: true
  # a list of ubuntu apt packages to install
  # system_packages:
  #   - "libgl1-mesa-glx"
  #   - "libglib2.0-0"
  # python version in the form '3.11' or '3.11.4'
  python_version: "3.11"
  # a list of packages in the format <package-name>==<version>
  python_packages:
    - "torch"
    - "Pillow" # for PIL/Image
    - "open_clip_torch" # for open_clip
    - "numpy"
    - "transformers"
  # commands run after the environment is setup
  # run:
  #   - "echo env is ready!"
  #   - "echo another command if needed"
# predict.py defines how predictions are run on your model
predict: "predict.py:Predictor"
image: 'r8.im/alexgodin/fashion-siglip'
# Prediction interface for Cog ⚙️
# https://cog.run/python
import open_clip
import torch
from PIL import Image
from cog import BasePredictor, Input, Path
import numpy as np
from typing import List
class Predictor(BasePredictor):
    """Cog predictor that serves Marqo fashion-SigLIP embeddings.

    Accepts EITHER an image or a text string and returns the normalized
    embedding vector as a flat list of floats, so image and text embeddings
    live in the same space and can be compared with a dot product.
    """

    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient."""
        # cog.yaml requests a GPU; the original code never moved the model off
        # the CPU, so inference silently ran without it. Select the device once
        # here and reuse it for every prediction.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(
            'hf-hub:Marqo/marqo-fashionSigLIP'
        )
        # eval() disables dropout/batch-norm updates for deterministic inference.
        self.model = self.model.to(self.device).eval()
        self.tokenizer = open_clip.get_tokenizer('hf-hub:Marqo/marqo-fashionSigLIP')

    def predict(
        self,
        image: Path = Input(description="Input image to generate embeddings for", default=None),
        text: str = Input(description="Input text to generate embeddings for", default=None),
    ) -> List[float]:
        """Generate CLIP embeddings for either the input image or text.

        Exactly one of `image` / `text` must be provided.

        Raises:
            ValueError: if both inputs or neither input is given.
        """
        # Validate inputs: exactly one of the two must be set.
        if (image is None and text is None) or (image is not None and text is not None):
            raise ValueError("Please provide either an image or text input, but not both or neither")

        # torch.cuda.amp.autocast is deprecated; torch.autocast is the modern
        # equivalent. `enabled` keeps autocast off on CPU, matching the original
        # behavior (the old context was a no-op without CUDA).
        use_amp = self.device == "cuda"

        if text is not None:
            # Process text input
            tokens = self.tokenizer(text).to(self.device)
            with torch.no_grad(), torch.autocast(device_type=self.device, enabled=use_amp):
                text_features = self.model.encode_text(tokens, normalize=True)
            return text_features.cpu().numpy().flatten().tolist()
        else:
            # Process image input. Context manager closes the underlying file
            # handle, which Image.open() otherwise leaks.
            with Image.open(image) as img:
                processed_image = self.preprocess(img).unsqueeze(0).to(self.device)
            with torch.no_grad(), torch.autocast(device_type=self.device, enabled=use_amp):
                image_features = self.model.encode_image(processed_image, normalize=True)
            return image_features.cpu().numpy().flatten().tolist()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment