Aquileo | jinaai/jina-embeddings-v5-omni-small · Why does the model take more than 8 minutes to load?

Why does the model take more than 8 minutes to load?

#2
by tepirale - opened

Why does the model take more than 8 minutes to load?

!pip install --force-reinstall --no-deps transformers==5.5.0 "tokenizers>=0.22.0,<=0.23.0"
# At the BEGINNING of the file, before any import of transformers/sentence_transformers
import os
# Hub/transformers environment switches, applied in one call. Per the original
# author's notes these belong at the very top of the file, before
# transformers / sentence_transformers is imported.
os.environ.update(
    {
        # After the first download: removes network checks to the Hub.
        "HF_HUB_OFFLINE": "1",
        "TRANSFORMERS_OFFLINE": "1",
        # Metadata check: 2s max.
        "HF_HUB_ETAG_TIMEOUT": "2",
        "HF_HUB_DOWNLOAD_TIMEOUT": "5",
        "HF_HUB_DISABLE_TELEMETRY": "1",
    }
)


# Model identifier handed to the loader (the "org/name" id of the model repo).
JINA_ID        = "jinaai/jina-embeddings-v5-omni-small"
# Device string the model is placed on.
JINA_DEVICE    = "cuda:0"
# Modality forwarded to the model loader.
# NOTE(review): the author's note at the load call says "vision" loads
# vision + text and skips audio — confirm against the model card.
JINA_MODALITY  = "vision"
  • code: 1
# Image placeholder that the model docs require for the vision tower
# (per the original author's comment); it is appended to the text prompt
# when embedding an image.
JINA_IMG_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>"

class MultimodalEmbedder:
    """Embed text and images with the Jina model loaded through ``transformers``.

    The model and its processor are loaded once in ``__init__``. Each public
    ``embed_*`` method builds a prefixed prompt and delegates to ``_embed``.
    """

    def __init__(self):
        """Load the model onto ``JINA_DEVICE`` in eval mode, then its processor."""
        from transformers import AutoModel, AutoProcessor

        print(f"[load] embebedor multimodal: {JINA_ID} (modality={JINA_MODALITY})")
        load_options = {
            "trust_remote_code": True,
            # Previously passed inside model_kwargs; here it is a direct keyword.
            "default_task": "retrieval",
            # Author's note: "vision" loads vision + text and skips audio.
            "modality": JINA_MODALITY,
            # Optional; author's note says the docs do not require it.
            "dtype": torch.bfloat16,
        }
        loaded = AutoModel.from_pretrained(JINA_ID, **load_options)
        # Author's note on the eval() call: "enable or disable".
        self.model = loaded.to(JINA_DEVICE).eval()
        self.proc = AutoProcessor.from_pretrained(JINA_ID, trust_remote_code=True)

    @torch.inference_mode()
    def _embed(self, **proc_kwargs):
        """Run the processor and the model on one input and return its vector.

        ``proc_kwargs`` are forwarded verbatim to the processor. The first row
        of the model output is converted to a float32 NumPy array on the CPU
        and passed through ``_l2`` (defined elsewhere in the file).
        """
        batch = self.proc(return_tensors="pt", **proc_kwargs)
        batch = batch.to(self.model.device)
        # Author's note: the model output is already L2-normalized.
        embedded = self.model.embed(**batch)
        # Author's note: shape (dim,), float32.
        first_row = embedded[0].float().cpu().numpy()
        return _l2(first_row)

    def embed_query_text(self, text):
        """Embed ``text`` as a query; a falsy ``text`` is treated as empty."""
        body = text or ""
        return self._embed(text="Query: " + body)

    def embed_doc_text(self, text):
        """Embed ``text`` as a document; a falsy ``text`` is treated as empty."""
        body = text or ""
        return self._embed(text="Document: " + body)

    def embed_doc_image(self, pil_image):
        """Embed ``pil_image`` as a document, using the image placeholder prompt."""
        prompt = "Document: " + JINA_IMG_PLACEHOLDER
        return self._embed(images=pil_image, text=prompt)
  • code: 2
class MultimodalEmbedder:
    """Embed text and images with the Jina model via ``sentence_transformers``.

    Every public ``embed_*`` method picks one of the model's encode functions
    and routes the input through ``_enc``.
    """

    def __init__(self):
        """Create the ``SentenceTransformer`` on ``JINA_DEVICE``."""
        from sentence_transformers import SentenceTransformer

        print(f"[load] embebedor multimodal: {JINA_ID} (modality={JINA_MODALITY})")
        loader_kwargs = {"default_task": "retrieval", "modality": JINA_MODALITY}
        self.m = SentenceTransformer(
            JINA_ID,
            trust_remote_code=True,
            device=JINA_DEVICE,
            model_kwargs=loader_kwargs,
            # Left disabled by the author:
            # max_memory={0: GPU_MTP, "cpu": CPU_OFFLOAD},
        )

    def _enc(self, fn, x):
        """Apply encode function ``fn`` to ``x`` and pass the result to ``_l2``.

        ``_l2`` is defined elsewhere in the file.
        """
        encoded = fn(x)
        return _l2(encoded)

    def embed_query_text(self, text):
        """Embed ``text`` with the model's query encoder."""
        encoder = self.m.encode_query
        return self._enc(encoder, text)

    def embed_doc_text(self, text):
        """Embed ``text`` with the model's document encoder."""
        encoder = self.m.encode_document
        return self._enc(encoder, text)

    def embed_doc_image(self, pil_image):
        """Embed ``pil_image`` with the model's document encoder."""
        encoder = self.m.encode_document
        return self._enc(encoder, pil_image)

thank you so much

Jina AI org

Hi @tepirale, does the 8 minutes happen only on the first download, or on every load after the model is already cached?

Every time I load the model (without downloading).

Jina AI org

OK, thanks. This seems like an environment- or hardware-specific issue. Are you using Colab or Kaggle? Have you checked whether loading is slow for other models too?

Sign up or log in to comment