# RaviGohelAI's picture
# Upload 34 files
# bf961d3 verified
"""
Embedding utilities for LifeUnity AI Cognitive Twin System.
Provides text embedding functionality using Sentence-BERT.
"""
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Union
import torch
from app.utils.logger import get_logger
# Module-level logger shared by everything in this file; obtained from the
# project's get_logger helper with the name argument "Embedder".
logger = get_logger("Embedder")
class TextEmbedder:
    """Text embedding handler using Sentence-BERT.

    The sentence-transformers model is loaded lazily: constructing the
    object only records the model name, and the model itself is created on
    the first call to :meth:`load_model` or :meth:`embed_text`.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """
        Initialize the text embedder.

        Args:
            model_name: Name of the sentence-transformers model
        """
        self.model_name = model_name
        self.model = None          # set by load_model()
        self.embedding_dim = None  # set by load_model()
        logger.info(f"Initializing TextEmbedder with model: {model_name}")

    def load_model(self):
        """Load the sentence transformer model if it is not loaded yet.

        Raises:
            Exception: Any error raised while constructing the model is
                logged and re-raised unchanged.
        """
        if self.model is not None:
            return
        try:
            logger.info(f"Loading model: {self.model_name}")
            model = SentenceTransformer(self.model_name)
            # Get embedding dimension
            dim = model.get_sentence_embedding_dimension()
            # Commit both attributes together so a failure above never
            # leaves the instance with a model but no embedding_dim.
            self.model = model
            self.embedding_dim = dim
            logger.info(f"Model loaded successfully. Embedding dim: {self.embedding_dim}")
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}", exc_info=True)
            raise

    def embed_text(self, text: Union[str, List[str]]) -> np.ndarray:
        """
        Generate embeddings for text.

        Args:
            text: Single text string or list of text strings

        Returns:
            Numpy array of embeddings, one entry per input text. A single
            string is wrapped in a list first, so its embedding is at
            index 0 of the result.

        Raises:
            Exception: Any error from model loading or encoding is logged
                and re-raised.
        """
        if self.model is None:
            self.load_model()
        try:
            # Handle single string
            if isinstance(text, str):
                text = [text]
            # Generate embeddings
            embeddings = self.model.encode(
                text,
                convert_to_numpy=True,
                show_progress_bar=False
            )
            logger.debug(f"Generated embeddings for {len(text)} texts")
            return embeddings
        except Exception as e:
            logger.error(f"Error generating embeddings: {str(e)}", exc_info=True)
            raise

    @staticmethod
    def _cosine_scores(query_vec: np.ndarray, matrix: np.ndarray) -> np.ndarray:
        """Return the cosine similarity of ``query_vec`` to each row of ``matrix``.

        Rows (or a query) with zero norm get a score of 0.0 instead of NaN.
        """
        query_vec = np.asarray(query_vec, dtype=float)
        matrix = np.asarray(matrix, dtype=float)
        dots = matrix @ query_vec
        norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vec)
        # Only divide where the denominator is usable; everything else keeps
        # the 0.0 that `out` was initialised with.
        return np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    def compute_similarity(self, text1: str, text2: str) -> float:
        """
        Compute cosine similarity between two texts.

        Args:
            text1: First text
            text2: Second text

        Returns:
            Cosine similarity in the range [-1, 1]. Returns 0.0 when either
            embedding has zero norm, or when embedding fails (the error is
            logged rather than raised).
        """
        try:
            embeddings = self.embed_text([text1, text2])
            # Compute cosine similarity (second embedding kept 2-D for the helper)
            scores = self._cosine_scores(embeddings[0], embeddings[1:2])
            return float(scores[0])
        except Exception as e:
            logger.error(f"Error computing similarity: {str(e)}", exc_info=True)
            return 0.0

    def find_most_similar(
        self,
        query: str,
        candidates: List[str],
        top_k: int = 5
    ) -> List[tuple]:
        """
        Find most similar texts to a query.

        Args:
            query: Query text
            candidates: List of candidate texts
            top_k: Number of top results to return. Zero or a negative
                value yields an empty list; ``None`` returns every candidate.

        Returns:
            List of (index, text, similarity_score) tuples sorted by score,
            highest first; ties keep the candidates' original order. An
            empty list is returned for empty input or when an error occurs
            (the error is logged rather than raised).
        """
        try:
            if len(candidates) == 0:
                return []
            # A negative top_k would otherwise slice off the *end* of the ranking.
            if top_k is not None and top_k <= 0:
                return []
            # Embed query and candidates
            query_vec = self.embed_text(query)[0]
            candidate_matrix = self.embed_text(candidates)
            # Compute similarities for all candidates at once
            scores = self._cosine_scores(query_vec, candidate_matrix)
            # Sort by similarity (descending); stable so ties keep input order
            ranking = np.argsort(-scores, kind="stable")[:top_k]
            return [
                (int(idx), candidates[int(idx)], float(scores[idx]))
                for idx in ranking
            ]
        except Exception as e:
            logger.error(f"Error finding similar texts: {str(e)}", exc_info=True)
            return []
# Global embedder instance, created on the first get_embedder() call
_embedder = None


def get_embedder(model_name: str = 'all-MiniLM-L6-v2') -> TextEmbedder:
    """
    Get or create a global embedder instance.

    The first call creates the shared instance with ``model_name``. Later
    calls always return that same instance; if they ask for a different
    model name the request is not honoured, and a warning is logged so the
    mismatch is visible instead of silent.

    Args:
        model_name: Name of the sentence-transformers model

    Returns:
        TextEmbedder instance
    """
    global _embedder
    if _embedder is None:
        _embedder = TextEmbedder(model_name)
    elif _embedder.model_name != model_name:
        # Keep the existing instance (callers may already hold a reference
        # to it) but surface that the requested model is being ignored.
        logger.warning(
            f"get_embedder called with model '{model_name}' but the global "
            f"embedder already uses '{_embedder.model_name}'; returning the existing instance"
        )
    return _embedder