Spaces:
Sleeping
Sleeping
| """Integration tests for Phase 2A components.""" | |
| import shutil | |
| import tempfile | |
| from src.embedding.embedding_service import EmbeddingService | |
| from src.vector_store.vector_db import VectorDatabase | |
class TestPhase2AIntegration:
    """Integration tests that feed EmbeddingService output into VectorDatabase.

    Every test runs against a VectorDatabase persisted under its own
    temporary directory; the directory is created in ``setup_method`` and
    removed in ``teardown_method``.
    """

    def setup_method(self):
        """Create the temporary persistence directory and the objects under test."""
        self.test_dir = tempfile.mkdtemp()
        try:
            self.embedding_service = EmbeddingService()
            self.vector_db = VectorDatabase(
                persist_path=self.test_dir,
                collection_name="test_integration",
            )
        except Exception:
            # pytest does not call teardown_method when setup_method raises,
            # so the directory must be removed here or it would be leaked.
            shutil.rmtree(self.test_dir, ignore_errors=True)
            raise

    def teardown_method(self):
        """Remove the temporary persistence directory, ignoring removal errors."""
        if hasattr(self, "test_dir"):
            shutil.rmtree(self.test_dir, ignore_errors=True)

    def test_embedding_vector_storage_workflow(self):
        """Test the complete workflow: text -> embedding -> storage -> search."""
        # Sample policy texts
        documents = [
            (
                "Employees must complete security training annually to "
                "maintain access to company systems."
            ),
            (
                "Remote work policy allows employees to work from home up to "
                "3 days per week."
            ),
            (
                "All expenses over $500 require manager approval before "
                "reimbursement."
            ),
            (
                "Code review is mandatory for all pull requests before "
                "merging to main branch."
            ),
        ]

        # Generate one embedding per document
        embeddings = self.embedding_service.embed_texts(documents)
        assert len(embeddings) == len(documents)

        # Every embedding must have the dimension the service reports
        expected_dim = self.embedding_service.get_embedding_dimension()
        assert all(len(emb) == expected_dim for emb in embeddings)

        # Store embeddings with metadata (using the collection from setup)
        doc_ids = [f"doc_{i}" for i in range(len(documents))]
        metadatas = [{"type": "policy", "doc_id": doc_id} for doc_id in doc_ids]
        success = self.vector_db.add_embeddings(
            embeddings=embeddings,
            chunk_ids=doc_ids,
            documents=documents,
            metadatas=metadatas,
        )
        assert success is True

        # Search the stored documents
        query = "remote work from home policy"
        query_embedding = self.embedding_service.embed_text(query)
        results = self.vector_db.search(query_embedding=query_embedding, top_k=2)

        # Results are a list of dictionaries, capped at top_k entries
        assert isinstance(results, list)
        assert len(results) <= 2

        # Four documents were just stored successfully, so an empty result
        # means the workflow is broken; fail instead of passing vacuously.
        assert results, "search returned no results after documents were stored"
        assert all(isinstance(result, dict) for result in results)

        # At least one hit must be the remote-work policy document.
        # ``or ""`` keeps a None document from raising inside ``.lower()``.
        documents_found = [(result.get("document") or "") for result in results]
        remote_work_found = any(
            "remote work" in doc.lower() or "work from home" in doc.lower()
            for doc in documents_found
        )
        assert remote_work_found

    def test_basic_embedding_dimension_consistency(self):
        """Test that texts of different lengths yield same-dimension embeddings."""
        # Short, medium and long inputs
        texts = [
            "Short text.",
            (
                "This is a medium length text with several words to test "
                "embedding consistency."
            ),
            (
                "This is a much longer text that contains multiple sentences "
                "and various types of content to ensure that the embedding "
                "service can handle longer inputs without issues and still "
                "produce consistent dimensional output vectors."
            ),
        ]

        embeddings = self.embedding_service.embed_texts(texts)

        # Guard the dimensions[0] lookups below: without this, an empty
        # return would surface as an IndexError instead of a clear failure.
        assert len(embeddings) == len(texts)

        # All embeddings should have the same dimension
        dimensions = [len(emb) for emb in embeddings]
        assert all(dim == dimensions[0] for dim in dimensions)

        # Dimension should match the service's reported dimension
        assert dimensions[0] == self.embedding_service.get_embedding_dimension()

    def test_empty_collection_handling(self):
        """Test that searching a collection with no documents returns an empty list."""
        # Nothing has been added to the collection in this test
        query_embedding = self.embedding_service.embed_text("test query")
        results = self.vector_db.search(query_embedding=query_embedding, top_k=5)

        # Should handle the empty collection gracefully
        assert isinstance(results, list)
        assert len(results) == 0