import io
from typing import List, Optional, Union

import numpy as np
import torch
from PIL import Image
from transformers import AutoModel


class JinaClipEmbeddingService:
    """
    Jina CLIP v2 embedding service with Vietnamese language support.

    Uses AutoModel with trust_remote_code to load the model.
    """

    def __init__(self, model_path: str = "jinaai/jina-clip-v2"):
        """
        Initialize the Jina CLIP v2 model.

        Args:
            model_path: Local path to the model, or a HuggingFace model name.
        """
        print(f"Loading Jina CLIP v2 model from {model_path}...")

        self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
        self.model.eval()

        # Prefer GPU when available; fall back to CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

        print(f"✓ Loaded Jina CLIP v2 model on: {self.device}")

    def encode_text(
        self,
        text: Union[str, List[str]],
        truncate_dim: Optional[int] = None,
        normalize: bool = True,
    ) -> np.ndarray:
        """
        Encode text into vector embeddings (Vietnamese supported).

        Args:
            text: A text or list of texts (Vietnamese supported).
            truncate_dim: Matryoshka dimension (64-1024; None = full 1024).
            normalize: Whether to L2-normalize the embeddings.

        Returns:
            numpy array of embeddings, shape (batch, dim).
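
        Example (illustrative; `service` is assumed to be an instance
        of this class):
            emb = service.encode_text("Xin chào Việt Nam", truncate_dim=512)
            # emb.shape -> (1, 512)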
        """
        if isinstance(text, str):
            text = [text]

        # The remote-code model exposes encode_text with Matryoshka truncation.
        embeddings = self.model.encode_text(
            text,
            truncate_dim=truncate_dim,
        )

        # Depending on the model code, the result may be a tensor or an ndarray.
        if isinstance(embeddings, torch.Tensor):
            embeddings = embeddings.detach().cpu().numpy()

        if normalize:
            embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

        return embeddings

    def encode_image(
        self,
        image: Union[Image.Image, bytes, str, List],
        truncate_dim: Optional[int] = None,
        normalize: bool = True,
    ) -> np.ndarray:
        """
        Encode images into vector embeddings.

        Args:
            image: A PIL Image, raw bytes, URL string, or a list of these.
            truncate_dim: Matryoshka dimension (64-1024; None = full 1024).
            normalize: Whether to L2-normalize the embeddings.

        Returns:
            numpy array of embeddings, shape (batch, dim).
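
        Example (illustrative; the file path is hypothetical):
            img = Image.open("photo.jpg")
            emb = service.encode_image(img)
            # emb.shape -> (1, 1024)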
        """
        # Normalize the input into a list so the model always returns a
        # 2D (batch, dim) array.
        if isinstance(image, bytes):
            image = [Image.open(io.BytesIO(image)).convert('RGB')]
        elif isinstance(image, list):
            processed_images = []
            for img in image:
                if isinstance(img, bytes):
                    processed_images.append(Image.open(io.BytesIO(img)).convert('RGB'))
                else:
                    # PIL Images and URL strings pass through unchanged.
                    processed_images.append(img)
            image = processed_images
        else:
            # A single PIL Image or URL string.
            image = [image]

        embeddings = self.model.encode_image(
            image,
            truncate_dim=truncate_dim,
        )

        if isinstance(embeddings, torch.Tensor):
            embeddings = embeddings.detach().cpu().numpy()

        if normalize:
            embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

        return embeddings

    def encode_multimodal(
        self,
        text: Optional[Union[str, List[str]]] = None,
        image: Optional[Union[Image.Image, bytes, List]] = None,
        truncate_dim: Optional[int] = None,
        normalize: bool = True,
    ) -> np.ndarray:
        """
        Encode text and/or image and return a combined embedding
        (the element-wise mean of the modalities).

        Args:
            text: A text or list of texts (Vietnamese supported).
            image: A PIL Image, raw bytes, or a list of images.
            truncate_dim: Matryoshka dimension (64-1024; None = full 1024).
            normalize: Whether to L2-normalize the combined embeddings.

        Returns:
            numpy array of embeddings, shape (batch, dim).
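
        Example (illustrative; `img` is assumed to be a PIL Image):
            emb = service.encode_multimodal(text="một con mèo", image=img)
            # emb.shape -> (1, 1024)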
        """
        embeddings = []

        if text is not None:
            text_emb = self.encode_text(text, truncate_dim=truncate_dim, normalize=False)
            embeddings.append(text_emb)

        if image is not None:
            image_emb = self.encode_image(image, truncate_dim=truncate_dim, normalize=False)
            embeddings.append(image_emb)

        if len(embeddings) == 2:
            # Average the two modalities; assumes the text and image batches
            # have the same length.
            combined = np.mean(embeddings, axis=0)
        elif len(embeddings) == 1:
            combined = embeddings[0]
        else:
            raise ValueError("At least one of text or image must be provided")

        if normalize:
            combined = combined / np.linalg.norm(combined, axis=1, keepdims=True)

        return combined

    def get_embedding_dimension(self) -> int:
        """
        Return the embedding dimension (1024 for Jina CLIP v2; smaller only
        if truncate_dim is passed at encode time).
        """
        return 1024
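

# Minimal usage sketch (illustrative): build the service, embed Vietnamese
# texts and a query, and rank by cosine similarity. Since the embeddings are
# L2-normalized above, cosine similarity reduces to a dot product. The sample
# texts are arbitrary placeholders.
if __name__ == "__main__":
    service = JinaClipEmbeddingService()

    docs = ["một con mèo đang ngủ trên ghế", "a dog running on the beach"]
    doc_emb = service.encode_text(docs, truncate_dim=512)

    query_emb = service.encode_text("con mèo", truncate_dim=512)

    # Cosine similarity via dot product on normalized embeddings.
    scores = (query_emb @ doc_emb.T)[0]
    for doc, score in zip(docs, scores):
        print(f"{score:.4f}  {doc}")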