"""Integration tests for Phase 2A components.""" import shutil import tempfile from src.embedding.embedding_service import EmbeddingService from src.vector_store.vector_db import VectorDatabase class TestPhase2AIntegration: """Test integration between EmbeddingService and VectorDatabase""" def setup_method(self): """Set up test environment with temporary database""" self.test_dir = tempfile.mkdtemp() self.embedding_service = EmbeddingService() self.vector_db = VectorDatabase(persist_path=self.test_dir, collection_name="test_integration") def teardown_method(self): """Clean up temporary resources""" if hasattr(self, "test_dir"): shutil.rmtree(self.test_dir, ignore_errors=True) def test_embedding_vector_storage_workflow(self): """Test complete workflow: text → embedding → storage → search""" # Sample policy texts documents = [ ("Employees must complete security training annually to " "maintain access to company systems."), ("Remote work policy allows employees to work from home up to " "3 days per week."), ("All expenses over $500 require manager approval before " "reimbursement."), ("Code review is mandatory for all pull requests before " "merging to main branch."), ] # Generate embeddings embeddings = self.embedding_service.embed_texts(documents) # Verify embeddings were generated assert len(embeddings) == len(documents) assert all(len(emb) == self.embedding_service.get_embedding_dimension() for emb in embeddings) # Store embeddings with metadata (using existing collection) doc_ids = [f"doc_{i}" for i in range(len(documents))] metadatas = [{"type": "policy", "doc_id": doc_id} for doc_id in doc_ids] success = self.vector_db.add_embeddings( embeddings=embeddings, chunk_ids=doc_ids, documents=documents, metadatas=metadatas, ) assert success is True # Test search functionality query = "remote work from home policy" query_embedding = self.embedding_service.embed_text(query) results = self.vector_db.search(query_embedding=query_embedding, top_k=2) # Verify search results (should return list of dictionaries) assert isinstance(results, list) assert len(results) <= 2 # Should return at most 2 results if results: # If we have results assert all(isinstance(result, dict) for result in results) # Check that at least one result contains remote work related content documents_found = [result.get("document", "") for result in results] remote_work_found = any( "remote work" in doc.lower() or "work from home" in doc.lower() for doc in documents_found ) assert remote_work_found def test_basic_embedding_dimension_consistency(self): """Test that embeddings have consistent dimensions""" # Test different text lengths texts = [ "Short text.", ("This is a medium length text with several words to test " "embedding consistency."), ( "This is a much longer text that contains multiple sentences " "and various types of content to ensure that the embedding " "service can handle longer inputs without issues and still " "produce consistent dimensional output vectors." ), ] # Generate embeddings embeddings = self.embedding_service.embed_texts(texts) # All embeddings should have the same dimension dimensions = [len(emb) for emb in embeddings] assert all(dim == dimensions[0] for dim in dimensions) # Dimension should match the service's reported dimension assert dimensions[0] == self.embedding_service.get_embedding_dimension() def test_empty_collection_handling(self): """Test behavior with empty collection""" # Search in empty collection query_embedding = self.embedding_service.embed_text("test query") results = self.vector_db.search(query_embedding=query_embedding, top_k=5) # Should handle empty collection gracefully assert isinstance(results, list) assert len(results) == 0