Spaces:
Sleeping
Sleeping
File size: 6,424 Bytes
dca679b afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 dca679b afecdc5 7793bb6 afecdc5 159faf0 7793bb6 48155ff afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 0a7f9b4 32e4125 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 0a7f9b4 32e4125 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 0a7f9b4 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 0a7f9b4 7793bb6 afecdc5 0a7f9b4 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 0a7f9b4 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 0a7f9b4 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 663a3b7 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 0a7f9b4 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 dca679b afecdc5 7793bb6 afecdc5 dca679b 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 |
from typing import List
from src.embedding.embedding_service import EmbeddingService
def test_embedding_service_initialization():
    """A default-constructed EmbeddingService exposes the expected defaults."""
    default_service = EmbeddingService()

    assert default_service is not None

    # With no arguments, the service is expected to report this model name
    # and to run on the CPU.
    expected_model = "sentence-transformers/all-MiniLM-L6-v2"
    assert default_service.model_name == expected_model
    assert default_service.device == "cpu"
def test_embedding_service_with_custom_config():
    """Constructor keyword arguments are reflected as instance attributes."""
    overrides = {
        "model_name": "all-MiniLM-L12-v2",
        "device": "cpu",
        "batch_size": 16,
    }

    configured = EmbeddingService(**overrides)

    # Each override must be readable back under the same attribute name.
    for attribute, expected in overrides.items():
        assert getattr(configured, attribute) == expected
def test_single_text_embedding():
    """Embedding one string yields a flat list of 384 numeric components."""
    # Vector length this suite expects from the default-configured service.
    expected_dim = 384

    embedder = EmbeddingService()
    vector = embedder.embed_text("This is a test document about company policies.")

    # The result must be a plain Python list, not an array type.
    assert isinstance(vector, list)
    assert len(vector) == expected_dim
    for component in vector:
        assert isinstance(component, (float, int))
def test_batch_text_embedding():
    """Embedding a list of strings yields one 384-component vector per string."""
    embedder = EmbeddingService()
    documents = [
        "This is the first document about remote work policy.",
        "This is the second document about employee benefits.",
        "This is the third document about code of conduct.",
    ]

    vectors = embedder.embed_texts(documents)

    # One output vector per input document, collected in a plain list.
    assert isinstance(vectors, list)
    assert len(vectors) == 3

    # Every vector is itself a list of numbers with the expected length.
    for vector in vectors:
        assert isinstance(vector, list)
        assert len(vector) == 384
        for component in vector:
            assert isinstance(component, (float, int))
def test_embedding_consistency():
    """Embedding the same text twice on one service gives identical vectors."""
    embedder = EmbeddingService()
    sample = "Consistent embedding test text."

    first_pass, second_pass = (embedder.embed_text(sample) for _ in range(2))

    # Exact equality: the test requires repeated calls to be deterministic.
    assert first_pass == second_pass
def test_different_texts_different_embeddings():
    """Distinct texts map to distinct vectors of the same length."""
    embedder = EmbeddingService()

    remote_work_vec = embedder.embed_text("This is about remote work policy.")
    benefits_vec = embedder.embed_text("This is about employee benefits and healthcare.")

    # Different content must not collapse onto the same vector...
    assert remote_work_vec != benefits_vec

    # ...while both vectors keep the same, expected dimensionality.
    assert len(remote_work_vec) == len(benefits_vec)
    assert len(benefits_vec) == 384
def test_empty_text_handling():
    """Empty and whitespace-only inputs still produce a 384-component list."""
    embedder = EmbeddingService()

    # First the empty string, then a whitespace-only string.
    blank_inputs = ("", " \n\t ")

    for blank in blank_inputs:
        vector = embedder.embed_text(blank)
        assert isinstance(vector, list)
        assert len(vector) == 384
def test_very_long_text_handling():
    """A very long input is embedded into a single 384-component list."""
    embedder = EmbeddingService()

    # 1000 repetitions of a 30-character sentence (~30,000 characters),
    # intended to probe how the service copes with input length limits.
    sentence = "This is a very long document. "
    oversized_text = sentence * 1000

    vector = embedder.embed_text(oversized_text)

    assert isinstance(vector, list)
    assert len(vector) == 384
def test_batch_size_handling():
    """All inputs are embedded when there are more inputs than batch_size."""
    # Five inputs with batch_size=2, so the input count exceeds one batch.
    small_batch_service = EmbeddingService(batch_size=2)
    snippets = [
        "Text one about policy",
        "Text two about procedures",
        "Text three about guidelines",
        "Text four about regulations",
        "Text five about rules",
    ]

    vectors = small_batch_service.embed_texts(snippets)

    # Nothing may be dropped because of the small batch size.
    assert len(vectors) == 5

    # Every returned vector must have the expected length.
    assert all(len(vector) == 384 for vector in vectors)
def test_special_characters_handling():
    """Emoji, accents, symbols and markdown are all embedded without loss."""
    embedder = EmbeddingService()

    # Inputs mixing emoji, accented letters, punctuation-heavy numbers,
    # markdown syntax with newlines, and typographic dashes.
    tricky_inputs = [
        "Policy with emojis 😀 and úñicode",
        "Text with numbers: 123,456.78 and symbols @#$%",
        "Markdown: # Header\n## Subheader\n- List item",
        "Mixed: Policy-2024 (v1.2) — updated 12/01/2025",
    ]

    vectors = embedder.embed_texts(tricky_inputs)

    assert len(vectors) == 4
    assert all(len(vector) == 384 for vector in vectors)
def test_similarity_makes_sense():
    """Related texts score a higher cosine similarity than unrelated ones."""

    def cosine_similarity(a: List[float], b: List[float]) -> float:
        """Return the cosine of the angle between vectors *a* and *b*."""
        import numpy as np

        vec_a, vec_b = np.array(a), np.array(b)
        norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
        return float(np.dot(vec_a, vec_b) / norm_product)

    embedder = EmbeddingService()

    # Two phrasings of the same topic, plus one unrelated topic.
    remote_policy = "Employee remote work policy guidelines"
    home_working = "Guidelines for working from home policies"
    expenses = "Financial expense reimbursement procedures"

    remote_vec = embedder.embed_text(remote_policy)
    home_vec = embedder.embed_text(home_working)
    expenses_vec = embedder.embed_text(expenses)

    related_score = cosine_similarity(remote_vec, home_vec)
    unrelated_score = cosine_similarity(remote_vec, expenses_vec)

    # The related pair must beat the unrelated pair and clear a 0.5 floor.
    assert related_score > unrelated_score
    assert related_score > 0.5
def test_model_loading_performance():
    """Check that a second EmbeddingService is not much slower to build.

    Two services are constructed back to back and each construction is
    timed. The test passes when the second construction takes at most twice
    as long as the first. The factor of two is slack for timing noise.

    NOTE(review): the test presumes EmbeddingService reuses an
    already-loaded model between instances; confirm against the
    implementation. If it does not, this check mostly measures noise.
    """
    import time

    def _timed_construction() -> float:
        """Return the seconds taken to construct one EmbeddingService."""
        # perf_counter is monotonic and high resolution, so the interval
        # cannot be skewed by system clock adjustments the way time.time()
        # intervals can.
        started = time.perf_counter()
        EmbeddingService()
        return time.perf_counter() - started

    first_load_time = _timed_construction()
    second_load_time = _timed_construction()

    # The second initialization should not be dramatically slower than the
    # first; report both timings on failure to make flakiness diagnosable.
    assert second_load_time <= first_load_time * 2, (
        f"second init took {second_load_time:.4f}s, "
        f"first init took {first_load_time:.4f}s"
    )
|