Why does the model take more than 8 minutes to load?

by tepirale - opened 6 days ago

•

Why does the model take more than 8 minutes to load?

!pip install --force-reinstall --no-deps transformers==5.5.0 "tokenizers>=0.22.0,<=0.23.0"

# Must run at the very TOP of the file, before transformers or
# sentence_transformers are imported.
import os

os.environ.update(
    {
        # After the first download: removes network checks to the Hub.
        "HF_HUB_OFFLINE": "1",
        "TRANSFORMERS_OFFLINE": "1",
        # Metadata check: 2 s max.
        "HF_HUB_ETAG_TIMEOUT": "2",
        "HF_HUB_DOWNLOAD_TIMEOUT": "5",
        "HF_HUB_DISABLE_TELEMETRY": "1",
    }
)


# Model identifier handed to the model loader.
JINA_ID        = "jinaai/jina-embeddings-v5-omni-small"
# Device the loaded model is placed on.
JINA_DEVICE    = "cuda:0"
# Value passed as the loader's `modality` option.
JINA_MODALITY  = "vision"

code: 1

# Image placeholder that the docs require for the vision tower
JINA_IMG_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>"

class MultimodalEmbedder:
    """Embed text and images with the Jina model through plain ``transformers``.

    The model and its processor are loaded once in ``__init__``; the public
    ``embed_*`` methods each return a single vector for a single input.

    ``torch`` and ``transformers`` are imported lazily inside the methods, so
    this class does not depend on a module-level ``import torch``.

    NOTE(review): ``_l2`` is not defined in this snippet; it is presumably an
    L2-normalization helper defined elsewhere -- confirm.
    """

    def __init__(self):
        """Load the model onto ``JINA_DEVICE`` and load the matching processor."""
        import torch
        from transformers import AutoModel, AutoProcessor

        print(f"[load] embebedor multimodal: {JINA_ID} (modality={JINA_MODALITY})")
        model = AutoModel.from_pretrained(
            JINA_ID,
            trust_remote_code=True,
            default_task="retrieval",  # previously passed via model_kwargs; here it is passed directly
            modality=JINA_MODALITY,  # "vision" -> loads vision + text, skips audio
            dtype=torch.bfloat16,  # optional; the docs say it is not mandatory
        )
        # The weights are moved to the target device after loading, and the
        # module is switched to evaluation mode since it is only used for
        # inference here.
        self.model = model.to(JINA_DEVICE).eval()
        self.proc = AutoProcessor.from_pretrained(JINA_ID, trust_remote_code=True)

    def _embed(self, **proc_kwargs):
        """Run the processor and the model on one input and return its vector.

        Args:
            **proc_kwargs: Keyword arguments forwarded to the processor
                (``text=...`` and, for images, ``images=...``).

        Returns:
            The result of ``_l2`` applied to the first embedding, converted to
            a float32 NumPy array on the CPU.
        """
        import torch

        # No autograd bookkeeping is needed for embedding extraction.
        with torch.inference_mode():
            inputs = self.proc(return_tensors="pt", **proc_kwargs).to(self.model.device)
            # Per the original author's note, embed() already returns
            # L2-normalized vectors; _l2 is still applied to the result.
            vec = self.model.embed(**inputs)
            return _l2(vec[0].float().cpu().numpy())  # (dim,) in float32

    def embed_query_text(self, text):
        """Embed ``text`` as a query; ``None`` or empty text becomes ``""``."""
        return self._embed(text="Query: " + (text or ""))

    def embed_doc_text(self, text):
        """Embed ``text`` as a document; ``None`` or empty text becomes ``""``."""
        return self._embed(text="Document: " + (text or ""))

    def embed_doc_image(self, pil_image):
        """Embed ``pil_image`` as a document, using the image placeholder as its text."""
        return self._embed(images=pil_image, text="Document: " + JINA_IMG_PLACEHOLDER)

code: 2

class MultimodalEmbedder:
    """Embed text and images with the Jina model through ``sentence_transformers``.

    NOTE(review): ``_l2`` is not defined in this snippet; it is presumably an
    L2-normalization helper defined elsewhere -- confirm.
    """

    def __init__(self):
        """Load the SentenceTransformer wrapper for ``JINA_ID`` on ``JINA_DEVICE``."""
        from sentence_transformers import SentenceTransformer

        print(f"[load] embebedor multimodal: {JINA_ID} (modality={JINA_MODALITY})")
        # Options forwarded to the underlying model loader.
        loader_options = {"default_task": "retrieval", "modality": JINA_MODALITY}
        self.m = SentenceTransformer(
            JINA_ID,
            trust_remote_code=True,
            device=JINA_DEVICE,
            model_kwargs=loader_options,
        )

    def _enc(self, fn, x):
        """Call the encoder ``fn`` on ``x`` and pass the result through ``_l2``."""
        raw = fn(x)
        return _l2(raw)

    def embed_query_text(self, text):
        """Embed ``text`` with the model's query encoder."""
        encoder = self.m.encode_query
        return self._enc(encoder, text)

    def embed_doc_text(self, text):
        """Embed ``text`` with the model's document encoder."""
        encoder = self.m.encode_document
        return self._enc(encoder, text)

    def embed_doc_image(self, pil_image):
        """Embed ``pil_image`` with the model's document encoder."""
        encoder = self.m.encode_document
        return self._enc(encoder, pil_image)

Thank you so much.

jupyterjazz

Jina AI org 6 days ago

Hi @tepirale, is the 8 minutes for the first download, or for every load after the model is already cached?

tepirale

6 days ago

Every time I load the model, even when it is already cached and nothing is downloaded.

jupyterjazz

Jina AI org 2 days ago

OK, thanks. This looks like an environment- or hardware-specific issue. Are you using Colab or Kaggle? Have you checked whether loading is slow for other models too?

Upload images, audio, and videos by dragging in the text input, pasting, or clicking here.

Tap or paste here to upload images

· Sign up or log in to comment