Skip to content

Instantly share code, notes, and snippets.

@alexgodin
Created February 12, 2026 14:50
Show Gist options
  • Select an option

  • Save alexgodin/e1abfeb8bdfa4d25a2678bd1cb07928a to your computer and use it in GitHub Desktop.

Select an option

Save alexgodin/e1abfeb8bdfa4d25a2678bd1cb07928a to your computer and use it in GitHub Desktop.
# Configuration for Cog ⚙️
# Reference: https://cog.run/yaml
build:
  # set to true if your model requires a GPU
  gpu: true
  # a list of ubuntu apt packages to install
  # system_packages:
  #   - "libgl1-mesa-glx"
  #   - "libglib2.0-0"
  # python version in the form '3.11' or '3.11.4'
  python_version: "3.11"
  # a list of packages in the format <package-name>==<version>
  python_packages:
    - "torch"
    - "Pillow" # for PIL/Image
    - "open_clip_torch" # for open_clip
    - "numpy"
    - "transformers"
  # commands run after the environment is setup
  # run:
  #   - "echo env is ready!"
  #   - "echo another command if needed"
# predict.py defines how predictions are run on your model
predict: "predict.py:Predictor"
image: 'r8.im/alexgodin/fashion-siglip'
# Prediction interface for Cog ⚙️
# https://cog.run/python
import open_clip
import torch
from PIL import Image
from cog import BasePredictor, Input, Path
import numpy as np
from typing import List
class Predictor(BasePredictor):
    """Cog predictor that serves Marqo fashion-SigLIP embeddings.

    Accepts EITHER an image or a text string and returns the normalized
    embedding vector as a flat list of floats, so image and text embeddings
    live in the same space and can be compared with a dot product.
    """

    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient."""
        # cog.yaml requests a GPU; the original code never moved the model off
        # the CPU, so inference silently ran without it. Select the device once
        # here and reuse it for every prediction.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(
            'hf-hub:Marqo/marqo-fashionSigLIP'
        )
        # eval() disables dropout/batch-norm updates for deterministic inference.
        self.model = self.model.to(self.device).eval()
        self.tokenizer = open_clip.get_tokenizer('hf-hub:Marqo/marqo-fashionSigLIP')

    def predict(
        self,
        image: Path = Input(description="Input image to generate embeddings for", default=None),
        text: str = Input(description="Input text to generate embeddings for", default=None),
    ) -> List[float]:
        """Generate CLIP embeddings for either the input image or text.

        Exactly one of `image` / `text` must be provided.

        Raises:
            ValueError: if both inputs or neither input is given.
        """
        # Validate inputs: exactly one of the two must be set.
        if (image is None and text is None) or (image is not None and text is not None):
            raise ValueError("Please provide either an image or text input, but not both or neither")

        # torch.cuda.amp.autocast is deprecated; torch.autocast is the modern
        # equivalent. `enabled` keeps autocast off on CPU, matching the original
        # behavior (the old context was a no-op without CUDA).
        use_amp = self.device == "cuda"

        if text is not None:
            # Process text input
            tokens = self.tokenizer(text).to(self.device)
            with torch.no_grad(), torch.autocast(device_type=self.device, enabled=use_amp):
                text_features = self.model.encode_text(tokens, normalize=True)
            return text_features.cpu().numpy().flatten().tolist()
        else:
            # Process image input. Context manager closes the underlying file
            # handle, which Image.open() otherwise leaks.
            with Image.open(image) as img:
                processed_image = self.preprocess(img).unsqueeze(0).to(self.device)
            with torch.no_grad(), torch.autocast(device_type=self.device, enabled=use_amp):
                image_features = self.model.encode_image(processed_image, normalize=True)
            return image_features.cpu().numpy().flatten().tolist()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment