# tests/test_embedding/test_embedding_service.py
# Author: sethmcknight
# Refactor test cases for improved readability and consistency (commit 159faf0)
from typing import List
from src.embedding.embedding_service import EmbeddingService
def test_embedding_service_initialization():
    """Verify a default-constructed EmbeddingService exposes the expected defaults."""
    svc = EmbeddingService()
    assert svc is not None
    # Default model and device as configured by the service itself.
    expected_model = "sentence-transformers/all-MiniLM-L6-v2"
    assert svc.model_name == expected_model
    assert svc.device == "cpu"
def test_embedding_service_with_custom_config():
    """Verify custom constructor arguments are stored on the service."""
    svc = EmbeddingService(
        model_name="all-MiniLM-L12-v2",
        device="cpu",
        batch_size=16,
    )
    assert svc.batch_size == 16
    assert svc.device == "cpu"
    assert svc.model_name == "all-MiniLM-L12-v2"
def test_single_text_embedding():
    """Test embedding generation for a single text.

    The service default model (sentence-transformers/all-MiniLM-L6-v2,
    asserted in test_embedding_service_initialization) produces
    384-dimensional vectors.
    """
    service = EmbeddingService()
    text = "This is a test document about company policies."
    embedding = service.embed_text(text)
    # Should return a list of floats (embedding vector)
    assert isinstance(embedding, list)
    # Fixed: comment previously referenced paraphrase-MiniLM-L3-v2, but the
    # default model under test is all-MiniLM-L6-v2 (also 384-dim).
    assert len(embedding) == 384  # all-MiniLM-L6-v2 embedding dimension
    assert all(isinstance(x, (float, int)) for x in embedding)
def test_batch_text_embedding():
    """Test embedding generation for multiple texts in one call."""
    service = EmbeddingService()
    documents = [
        "This is the first document about remote work policy.",
        "This is the second document about employee benefits.",
        "This is the third document about code of conduct.",
    ]
    vectors = service.embed_texts(documents)

    # One embedding per input document.
    assert isinstance(vectors, list)
    assert len(vectors) == 3

    # Every embedding is a 384-dimensional list of numbers.
    for vec in vectors:
        assert isinstance(vec, list)
        assert len(vec) == 384
        assert all(isinstance(component, (float, int)) for component in vec)
def test_embedding_consistency():
    """Embedding the same text twice must yield identical vectors (determinism)."""
    service = EmbeddingService()
    sample = "Consistent embedding test text."
    first = service.embed_text(sample)
    second = service.embed_text(sample)
    assert first == second
def test_different_texts_different_embeddings():
    """Distinct texts should map to distinct vectors of equal dimensionality."""
    service = EmbeddingService()
    vec_a = service.embed_text("This is about remote work policy.")
    vec_b = service.embed_text("This is about employee benefits and healthcare.")
    # Vectors differ for different inputs...
    assert vec_a != vec_b
    # ...but both share the model's output dimension.
    assert len(vec_a) == 384
    assert len(vec_b) == 384
def test_empty_text_handling():
    """Empty and whitespace-only inputs still produce full-size vectors."""
    service = EmbeddingService()
    # Degenerate inputs: empty string, then whitespace-only.
    for degenerate_input in ("", " \n\t "):
        vector = service.embed_text(degenerate_input)
        assert isinstance(vector, list)
        assert len(vector) == 384
def test_very_long_text_handling():
    """Inputs far beyond typical model token limits are still embedded."""
    service = EmbeddingService()
    # Roughly 30,000 characters — exercises tokenization/truncation behavior.
    sentence = "This is a very long document. "
    vector = service.embed_text(sentence * 1000)
    assert isinstance(vector, list)
    assert len(vector) == 384
def test_batch_size_handling():
    """A batch size smaller than the input count must not drop any texts."""
    service = EmbeddingService(batch_size=2)  # forces multiple internal batches
    inputs = [
        "Text one about policy",
        "Text two about procedures",
        "Text three about guidelines",
        "Text four about regulations",
        "Text five about rules",
    ]
    results = service.embed_texts(inputs)
    # All five texts are embedded despite the small batch size.
    assert len(results) == 5
    assert all(len(vec) == 384 for vec in results)
def test_special_characters_handling():
    """Unicode, emoji, markdown, and symbol-heavy text embed without errors."""
    service = EmbeddingService()
    tricky_inputs = [
        "Policy with emojis 😀 and úñicode",
        "Text with numbers: 123,456.78 and symbols @#$%",
        "Markdown: # Header\n## Subheader\n- List item",
        "Mixed: Policy-2024 (v1.2) — updated 12/01/2025",
    ]
    vectors = service.embed_texts(tricky_inputs)
    assert len(vectors) == 4
    assert all(len(vec) == 384 for vec in vectors)
def test_similarity_makes_sense():
    """Semantically related texts should be closer in embedding space."""
    service = EmbeddingService()

    def cosine_similarity(a: List[float], b: List[float]) -> float:
        """Cosine of the angle between two embedding vectors."""
        import numpy as np

        u = np.array(a)
        v = np.array(b)
        return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

    related_a = service.embed_text("Employee remote work policy guidelines")
    related_b = service.embed_text("Guidelines for working from home policies")
    unrelated = service.embed_text("Financial expense reimbursement procedures")

    sim_related = cosine_similarity(related_a, related_b)
    sim_unrelated = cosine_similarity(related_a, unrelated)

    # Related texts must be closer to each other than to the unrelated one,
    # and reasonably similar in absolute terms.
    assert sim_related > sim_unrelated
    assert sim_related > 0.5
def test_model_loading_performance():
    """Constructing a second service should reuse the cached model.

    Fixed: durations are now measured with time.perf_counter(), a monotonic
    high-resolution clock, instead of time.time(), whose wall-clock value can
    jump (e.g. NTP adjustments) and make elapsed-time comparisons unreliable.
    """
    import time

    start = time.perf_counter()
    EmbeddingService()  # first construction pays the model-load cost
    first_load_time = time.perf_counter() - start

    start = time.perf_counter()
    EmbeddingService()  # second construction should hit the cache
    second_load_time = time.perf_counter() - start

    # Second initialization should be no slower than twice the first; the
    # 2x factor absorbs normal timing jitter. This is a heuristic check —
    # it depends on the implementation actually caching the model.
    assert second_load_time <= first_load_time * 2