File size: 4,877 Bytes
bf961d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""
Embedding utilities for LifeUnity AI Cognitive Twin System.
Provides text embedding functionality using Sentence-BERT.
"""

from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Union
import torch
from app.utils.logger import get_logger

logger = get_logger("Embedder")


class TextEmbedder:
    """Text embedding handler using Sentence-BERT.

    The sentence-transformers model is loaded lazily on first use, so
    constructing an instance is cheap and performs no model download/IO.
    """
    
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """
        Initialize the text embedder.
        
        Args:
            model_name: Name of the sentence-transformers model
        """
        self.model_name = model_name
        self.model = None          # lazily populated by load_model()
        self.embedding_dim = None  # known only once the model is loaded
        logger.info("Initializing TextEmbedder with model: %s", model_name)
    
    @staticmethod
    def _cosine(vec_a: np.ndarray, vec_b: np.ndarray) -> float:
        """
        Cosine similarity of two 1-D vectors.
        
        Returns 0.0 when either vector has zero norm, avoiding the
        division-by-zero / NaN the inline formula produced for
        degenerate (all-zero) embeddings.
        """
        denom = float(np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
        if denom == 0.0:
            return 0.0
        return float(np.dot(vec_a, vec_b) / denom)
    
    def load_model(self):
        """Load the sentence transformer model (idempotent).
        
        Raises:
            Exception: re-raised after logging if loading fails.
        """
        try:
            if self.model is None:
                logger.info("Loading model: %s", self.model_name)
                self.model = SentenceTransformer(self.model_name)
                # Cache the output dimensionality for downstream consumers.
                self.embedding_dim = self.model.get_sentence_embedding_dimension()
                logger.info("Model loaded successfully. Embedding dim: %s",
                            self.embedding_dim)
        except Exception as e:
            logger.error("Error loading model: %s", e, exc_info=True)
            raise
    
    def embed_text(self, text: Union[str, List[str]]) -> np.ndarray:
        """
        Generate embeddings for text.
        
        Args:
            text: Single text string or list of text strings
            
        Returns:
            Numpy array of embeddings (one row per input text)
        
        Raises:
            Exception: re-raised after logging if encoding fails.
        """
        if self.model is None:
            self.load_model()
        
        try:
            # Normalize to a list so encode() always returns a 2-D array.
            if isinstance(text, str):
                text = [text]
            
            embeddings = self.model.encode(
                text,
                convert_to_numpy=True,
                show_progress_bar=False
            )
            
            logger.debug("Generated embeddings for %d texts", len(text))
            return embeddings
            
        except Exception as e:
            logger.error("Error generating embeddings: %s", e, exc_info=True)
            raise
    
    def compute_similarity(self, text1: str, text2: str) -> float:
        """
        Compute cosine similarity between two texts.
        
        Args:
            text1: First text
            text2: Second text
            
        Returns:
            Similarity score (0-1); 0.0 on any error (best-effort contract
            preserved from the original implementation).
        """
        try:
            embeddings = self.embed_text([text1, text2])
            return self._cosine(embeddings[0], embeddings[1])
        except Exception as e:
            logger.error("Error computing similarity: %s", e, exc_info=True)
            return 0.0
    
    def find_most_similar(
        self,
        query: str,
        candidates: List[str],
        top_k: int = 5
    ) -> List[tuple]:
        """
        Find most similar texts to a query.
        
        Args:
            query: Query text
            candidates: List of candidate texts
            top_k: Number of top results to return
            
        Returns:
            List of (index, text, similarity_score) tuples, best first.
            Empty list for empty candidates or on any error.
        """
        if not candidates:
            # Nothing to rank; avoid loading the model for an empty request.
            return []
        
        try:
            query_embedding = self.embed_text(query)[0]
            candidate_embeddings = self.embed_text(candidates)
            
            # Score every candidate against the query via the shared helper.
            similarities = [
                (idx, candidates[idx], self._cosine(query_embedding, cand_emb))
                for idx, cand_emb in enumerate(candidate_embeddings)
            ]
            
            # Highest similarity first.
            similarities.sort(key=lambda item: item[2], reverse=True)
            
            return similarities[:top_k]
            
        except Exception as e:
            logger.error("Error finding similar texts: %s", e, exc_info=True)
            return []


# Global embedder instance (process-wide singleton, created on first use)
_embedder = None


def get_embedder(model_name: str = 'all-MiniLM-L6-v2') -> TextEmbedder:
    """
    Get or create a global embedder instance.
    
    Note: model_name is honored only on the first call; later calls reuse
    the existing singleton. A differing model_name is now logged as a
    warning instead of being silently ignored.
    
    Args:
        model_name: Name of the sentence-transformers model
        
    Returns:
        TextEmbedder instance
    """
    global _embedder
    if _embedder is None:
        _embedder = TextEmbedder(model_name)
    elif model_name != _embedder.model_name:
        # Surface the mismatch so callers notice they got a stale model.
        logger.warning(
            "get_embedder called with model '%s' but singleton uses '%s'",
            model_name, _embedder.model_name
        )
    return _embedder