File size: 4,877 Bytes
bf961d3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
"""
Embedding utilities for LifeUnity AI Cognitive Twin System.
Provides text embedding functionality using Sentence-BERT.
"""
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Union
import torch
from app.utils.logger import get_logger
logger = get_logger("Embedder")
class TextEmbedder:
    """Text embedding handler using Sentence-BERT.

    The underlying sentence-transformers model is loaded lazily on first
    use, so constructing a TextEmbedder is cheap and never touches disk
    or the network by itself.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """
        Initialize the text embedder.

        Args:
            model_name: Name of the sentence-transformers model
        """
        self.model_name = model_name
        # Populated by load_model() on first use.
        self.model = None
        self.embedding_dim = None
        logger.info(f"Initializing TextEmbedder with model: {model_name}")

    def load_model(self):
        """Load the sentence transformer model (idempotent).

        Raises:
            Exception: re-raises any failure from SentenceTransformer
                after logging it.
        """
        try:
            if self.model is None:
                logger.info(f"Loading model: {self.model_name}")
                self.model = SentenceTransformer(self.model_name)
                # Cache the output dimension for downstream consumers.
                self.embedding_dim = self.model.get_sentence_embedding_dimension()
                logger.info(f"Model loaded successfully. Embedding dim: {self.embedding_dim}")
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}", exc_info=True)
            raise

    @staticmethod
    def _cosine(a: np.ndarray, b: np.ndarray) -> float:
        """Cosine similarity between two 1-D vectors.

        Returns 0.0 when either vector has zero norm, avoiding a
        division-by-zero NaN.
        """
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0.0:
            return 0.0
        return float(np.dot(a, b) / denom)

    def embed_text(self, text: Union[str, List[str]]) -> np.ndarray:
        """
        Generate embeddings for text.

        Args:
            text: Single text string or list of text strings

        Returns:
            Numpy array of embeddings, one row per input text

        Raises:
            Exception: re-raises any encoding failure after logging it.
        """
        if self.model is None:
            self.load_model()
        try:
            # Normalize a single string to a one-element batch so the
            # return shape is always 2-D.
            if isinstance(text, str):
                text = [text]
            embeddings = self.model.encode(
                text,
                convert_to_numpy=True,
                show_progress_bar=False
            )
            logger.debug(f"Generated embeddings for {len(text)} texts")
            return embeddings
        except Exception as e:
            logger.error(f"Error generating embeddings: {str(e)}", exc_info=True)
            raise

    def compute_similarity(self, text1: str, text2: str) -> float:
        """
        Compute cosine similarity between two texts.

        Args:
            text1: First text
            text2: Second text

        Returns:
            Cosine similarity score in [-1, 1]; 0.0 on any error
            (best-effort: failures are logged, not raised).
        """
        try:
            embeddings = self.embed_text([text1, text2])
            return self._cosine(embeddings[0], embeddings[1])
        except Exception as e:
            logger.error(f"Error computing similarity: {str(e)}", exc_info=True)
            return 0.0

    def find_most_similar(
        self,
        query: str,
        candidates: List[str],
        top_k: int = 5
    ) -> List[tuple]:
        """
        Find most similar texts to a query.

        Args:
            query: Query text
            candidates: List of candidate texts
            top_k: Number of top results to return

        Returns:
            List of (index, text, similarity_score) tuples sorted by
            descending similarity; empty list on any error.
        """
        try:
            # Encode query and candidates in a single model call:
            # row 0 is the query, the rest are the candidates.
            all_embeddings = self.embed_text([query] + candidates)
            query_embedding = all_embeddings[0]
            similarities = [
                (idx, candidates[idx], self._cosine(query_embedding, candidate_emb))
                for idx, candidate_emb in enumerate(all_embeddings[1:])
            ]
            # Sort by similarity (descending)
            similarities.sort(key=lambda x: x[2], reverse=True)
            return similarities[:top_k]
        except Exception as e:
            logger.error(f"Error finding similar texts: {str(e)}", exc_info=True)
            return []
# Global embedder instance (module-level singleton, created lazily).
_embedder = None


def get_embedder(model_name: str = 'all-MiniLM-L6-v2') -> TextEmbedder:
    """
    Get or create a global embedder instance.

    The embedder is created on the first call; subsequent calls return
    the same instance. If a later call requests a different model, the
    existing embedder is kept and a warning is logged so the mismatch
    is not silently ignored.

    Args:
        model_name: Name of the sentence-transformers model

    Returns:
        TextEmbedder instance
    """
    global _embedder
    if _embedder is None:
        _embedder = TextEmbedder(model_name)
    elif _embedder.model_name != model_name:
        # The singleton was already built with a different model; keep it
        # but surface the discrepancy to the caller via the log.
        logger.warning(
            "get_embedder called with model '%s' but singleton uses '%s'; "
            "returning existing embedder",
            model_name,
            _embedder.model_name,
        )
    return _embedder
|