from typing import List

from src.embedding.embedding_service import EmbeddingService


def test_embedding_service_initialization():
    """Test EmbeddingService initialization"""
    service = EmbeddingService()

    assert service is not None
    assert service.model_name == "sentence-transformers/all-MiniLM-L6-v2"
    assert service.device == "cpu"


def test_embedding_service_with_custom_config():
    """Test EmbeddingService initialization with custom configuration"""
    service = EmbeddingService(model_name="all-MiniLM-L12-v2", device="cpu", batch_size=16)

    assert service.model_name == "all-MiniLM-L12-v2"
    assert service.device == "cpu"
    assert service.batch_size == 16


def test_single_text_embedding():
    """Test embedding generation for a single text"""
    service = EmbeddingService()
    text = "This is a test document about company policies."

    embedding = service.embed_text(text)

    # Should return a list of floats (embedding vector)
    assert isinstance(embedding, list)
    assert len(embedding) == 384  # all-MiniLM-L6-v2 embedding dimension
    assert all(isinstance(x, (float, int)) for x in embedding)


def test_batch_text_embedding():
    """Test embedding generation for multiple texts"""
    service = EmbeddingService()
    texts = [
        "This is the first document about remote work policy.",
        "This is the second document about employee benefits.",
        "This is the third document about code of conduct.",
    ]

    embeddings = service.embed_texts(texts)

    # Should return a list of embeddings
    assert isinstance(embeddings, list)
    assert len(embeddings) == 3

    # Each embedding should have the correct dimension
    for embedding in embeddings:
        assert isinstance(embedding, list)
        assert len(embedding) == 384
        assert all(isinstance(x, (float, int)) for x in embedding)


def test_embedding_consistency():
    """Test that the same text produces the same embedding"""
    service = EmbeddingService()
    text = "Consistent embedding test text."

    embedding1 = service.embed_text(text)
    embedding2 = service.embed_text(text)

    # Should be identical (deterministic)
    assert embedding1 == embedding2


def test_different_texts_different_embeddings():
    """Test that different texts produce different embeddings"""
    service = EmbeddingService()
    text1 = "This is about remote work policy."
    text2 = "This is about employee benefits and healthcare."

    embedding1 = service.embed_text(text1)
    embedding2 = service.embed_text(text2)

    # Should be different
    assert embedding1 != embedding2
    # But should have the same dimension
    assert len(embedding1) == len(embedding2) == 384


def test_empty_text_handling():
    """Test handling of empty or whitespace-only text"""
    service = EmbeddingService()

    # Empty string
    embedding_empty = service.embed_text("")
    assert isinstance(embedding_empty, list)
    assert len(embedding_empty) == 384

    # Whitespace only
    embedding_whitespace = service.embed_text(" \n\t ")
    assert isinstance(embedding_whitespace, list)
    assert len(embedding_whitespace) == 384


def test_very_long_text_handling():
    """Test handling of very long texts"""
    service = EmbeddingService()
    # Create a very long text (should exercise tokenization limits)
    long_text = "This is a very long document. " * 1000  # ~30,000 characters

    embedding = service.embed_text(long_text)

    assert isinstance(embedding, list)
    assert len(embedding) == 384


def test_batch_size_handling():
    """Test that batch processing works correctly"""
    service = EmbeddingService(batch_size=2)  # Small batch for testing
    texts = [
        "Text one about policy",
        "Text two about procedures",
        "Text three about guidelines",
        "Text four about regulations",
        "Text five about rules",
    ]

    embeddings = service.embed_texts(texts)

    # Should process all texts despite the small batch size
    assert len(embeddings) == 5

    # All embeddings should be valid
    for embedding in embeddings:
        assert len(embedding) == 384


def test_special_characters_handling():
    """Test handling of special characters and unicode"""
    service = EmbeddingService()
    texts_with_special_chars = [
        "Policy with emojis 😀 and úñicode",
        "Text with numbers: 123,456.78 and symbols @#$%",
        "Markdown: # Header\n## Subheader\n- List item",
        "Mixed: Policy-2024 (v1.2) — updated 12/01/2025",
    ]

    embeddings = service.embed_texts(texts_with_special_chars)

    assert len(embeddings) == 4
    for embedding in embeddings:
        assert len(embedding) == 384


def test_similarity_makes_sense():
    """Test that semantically similar texts have similar embeddings"""
    service = EmbeddingService()

    # Similar texts
    text1 = "Employee remote work policy guidelines"
    text2 = "Guidelines for working from home policies"
    # Different text
    text3 = "Financial expense reimbursement procedures"

    embed1 = service.embed_text(text1)
    embed2 = service.embed_text(text2)
    embed3 = service.embed_text(text3)

    # Simple cosine similarity helper (for validation only)
    def cosine_similarity(a: List[float], b: List[float]) -> float:
        import numpy as np

        a_np = np.array(a)
        b_np = np.array(b)
        return float(np.dot(a_np, b_np) / (np.linalg.norm(a_np) * np.linalg.norm(b_np)))

    sim_1_2 = cosine_similarity(embed1, embed2)  # Similar texts
    sim_1_3 = cosine_similarity(embed1, embed3)  # Different texts

    # Similar texts should score higher than different texts
    assert sim_1_2 > sim_1_3
    assert sim_1_2 > 0.5  # Should be reasonably similar


def test_model_loading_performance():
    """Test that model loading doesn't happen repeatedly"""
    # This test ensures the model is cached after the first load
    import time

    start_time = time.time()
    EmbeddingService()  # First service
    first_load_time = time.time() - start_time

    start_time = time.time()
    EmbeddingService()  # Second service
    second_load_time = time.time() - start_time

    # Second initialization should be faster (model already cached).
    # Note: this may not always hold depending on the implementation,
    # but it checks the general behavior.
    assert second_load_time <= first_load_time * 2  # Allow some variance