from typing import List

from src.embedding.embedding_service import EmbeddingService


def test_embedding_service_initialization():
    """Test EmbeddingService initialization"""
    service = EmbeddingService()

    assert service is not None
    assert service.model_name == "sentence-transformers/all-MiniLM-L6-v2"
    assert service.device == "cpu"


def test_embedding_service_with_custom_config():
    """Test EmbeddingService initialization with custom configuration"""
    service = EmbeddingService(model_name="all-MiniLM-L12-v2", device="cpu", batch_size=16)

    assert service.model_name == "all-MiniLM-L12-v2"
    assert service.device == "cpu"
    assert service.batch_size == 16


def test_single_text_embedding():
    """Test embedding generation for a single text"""
    service = EmbeddingService()
    text = "This is a test document about company policies."

    embedding = service.embed_text(text)

    # Should return a list of floats (embedding vector)
    assert isinstance(embedding, list)
    assert len(embedding) == 384  # all-MiniLM-L6-v2 embedding dimension
    assert all(isinstance(x, (float, int)) for x in embedding)


def test_batch_text_embedding():
    """Test embedding generation for multiple texts"""
    service = EmbeddingService()
    texts = [
        "This is the first document about remote work policy.",
        "This is the second document about employee benefits.",
        "This is the third document about code of conduct.",
    ]

    embeddings = service.embed_texts(texts)

    # Should return list of embeddings
    assert isinstance(embeddings, list)
    assert len(embeddings) == 3

    # Each embedding should be the correct dimension
    for embedding in embeddings:
        assert isinstance(embedding, list)
        assert len(embedding) == 384
        assert all(isinstance(x, (float, int)) for x in embedding)


def test_embedding_consistency():
    """Test that same text produces same embedding"""
    service = EmbeddingService()
    text = "Consistent embedding test text."

    embedding1 = service.embed_text(text)
    embedding2 = service.embed_text(text)

    # Should be identical (deterministic)
    assert embedding1 == embedding2


def test_different_texts_different_embeddings():
    """Test that different texts produce different embeddings"""
    service = EmbeddingService()
    text1 = "This is about remote work policy."
    text2 = "This is about employee benefits and healthcare."

    embedding1 = service.embed_text(text1)
    embedding2 = service.embed_text(text2)

    # Should be different
    assert embedding1 != embedding2
    # But should have the same dimension
    assert len(embedding1) == len(embedding2) == 384


def test_empty_text_handling():
    """Test handling of empty or whitespace-only text"""
    service = EmbeddingService()

    # Empty string
    embedding_empty = service.embed_text("")
    assert isinstance(embedding_empty, list)
    assert len(embedding_empty) == 384

    # Whitespace only
    embedding_whitespace = service.embed_text(" \n\t ")
    assert isinstance(embedding_whitespace, list)
    assert len(embedding_whitespace) == 384


def test_very_long_text_handling():
    """Test handling of very long texts"""
    service = EmbeddingService()

    # Create a very long text (should test tokenization limits)
    long_text = "This is a very long document. " * 1000  # ~30,000 characters

    embedding = service.embed_text(long_text)

    assert isinstance(embedding, list)
    assert len(embedding) == 384


def test_batch_size_handling():
    """Test that batch processing works correctly"""
    service = EmbeddingService(batch_size=2)  # Small batch for testing
    texts = [
        "Text one about policy",
        "Text two about procedures",
        "Text three about guidelines",
        "Text four about regulations",
        "Text five about rules",
    ]

    embeddings = service.embed_texts(texts)

    # Should process all texts despite small batch size
    assert len(embeddings) == 5

    # All embeddings should be valid
    for embedding in embeddings:
        assert len(embedding) == 384


def test_special_characters_handling():
    """Test handling of special characters and unicode"""
    service = EmbeddingService()
    texts_with_special_chars = [
        "Policy with emojis 😀 and úñicode",
        "Text with numbers: 123,456.78 and symbols @#$%",
        "Markdown: # Header\n## Subheader\n- List item",
        "Mixed: Policy-2024 (v1.2) — updated 12/01/2025",
    ]

    embeddings = service.embed_texts(texts_with_special_chars)

    assert len(embeddings) == 4
    for embedding in embeddings:
        assert len(embedding) == 384


def test_similarity_makes_sense():
    """Test that semantically similar texts have similar embeddings"""
    service = EmbeddingService()

    # Similar texts
    text1 = "Employee remote work policy guidelines"
    text2 = "Guidelines for working from home policies"
    # Different text
    text3 = "Financial expense reimbursement procedures"

    embed1 = service.embed_text(text1)
    embed2 = service.embed_text(text2)
    embed3 = service.embed_text(text3)

    # Simple cosine similarity helper (for validation only)
    def cosine_similarity(a: List[float], b: List[float]) -> float:
        import numpy as np
        a_np = np.array(a)
        b_np = np.array(b)
        return float(np.dot(a_np, b_np) / (np.linalg.norm(a_np) * np.linalg.norm(b_np)))

    sim_1_2 = cosine_similarity(embed1, embed2)  # Similar texts
    sim_1_3 = cosine_similarity(embed1, embed3)  # Different texts

    # Similar texts should have higher similarity than different texts
    assert sim_1_2 > sim_1_3
    assert sim_1_2 > 0.5  # Should be reasonably similar


def test_model_loading_performance():
    """Test that model loading doesn't happen repeatedly"""
    # This test checks that the underlying model is cached after the first load
    import time

    start_time = time.time()
    EmbeddingService()  # First service triggers the model load
    first_load_time = time.time() - start_time

    start_time = time.time()
    EmbeddingService()  # Second service should reuse the cached model
    second_load_time = time.time() - start_time

    # The second initialization should not be meaningfully slower than the first.
    # Whether it is actually faster depends on how the implementation caches the
    # model, so allow generous variance rather than asserting a strict speedup.
    assert second_load_time <= first_load_time * 2
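

# ---------------------------------------------------------------------------
# Reference sketch (an assumption, not the project's actual implementation).
# The tests above only pin down an interface: a constructor accepting
# model_name, device and batch_size, plus embed_text() / embed_texts()
# returning plain Python lists of 384-dimensional float vectors, with the
# underlying model reused across instances. A minimal service built on the
# sentence-transformers library could look roughly like the class below; the
# real src.embedding.embedding_service.EmbeddingService may differ. It is
# named _EmbeddingServiceSketch so it does not shadow the imported service.
# ---------------------------------------------------------------------------
class _EmbeddingServiceSketch:
    # Cache loaded models per (model_name, device) so repeated instantiations
    # do not reload weights -- the behavior test_model_loading_performance probes.
    _model_cache: dict = {}

    def __init__(
        self,
        model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
        device: str = "cpu",
        batch_size: int = 32,
    ):
        self.model_name = model_name
        self.device = device
        self.batch_size = batch_size

    def _model(self):
        # Lazy import and lazy load: the model is only fetched on first use.
        from sentence_transformers import SentenceTransformer

        key = (self.model_name, self.device)
        if key not in self._model_cache:
            self._model_cache[key] = SentenceTransformer(self.model_name, device=self.device)
        return self._model_cache[key]

    def embed_text(self, text: str) -> List[float]:
        # encode() returns a numpy vector; convert it to a plain list of floats.
        return self._model().encode(text).tolist()

    def embed_texts(self, texts: List[str]) -> List[List[float]]:
        vectors = self._model().encode(texts, batch_size=self.batch_size)
        return [vector.tolist() for vector in vectors]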