# msse-ai-engineering/tests/test_phase2a_integration.py
# sethmcknight
# Refactor test cases for improved readability and consistency
# 159faf0
"""Integration tests for Phase 2A components."""
import shutil
import tempfile
from src.embedding.embedding_service import EmbeddingService
from src.vector_store.vector_db import VectorDatabase
class TestPhase2AIntegration:
    """Test integration between EmbeddingService and VectorDatabase.

    Every test runs against a freshly constructed ``EmbeddingService`` and a
    ``VectorDatabase`` persisted in its own temporary directory; the directory
    is removed again once the test has finished.
    """

    def setup_method(self):
        """Set up the test environment with a temporary database.

        Raises:
            Exception: Whatever ``EmbeddingService()`` or ``VectorDatabase()``
                raised. The temporary directory is removed before the
                exception propagates.
        """
        self.test_dir = tempfile.mkdtemp()
        try:
            self.embedding_service = EmbeddingService()
            self.vector_db = VectorDatabase(persist_path=self.test_dir, collection_name="test_integration")
        except Exception:
            # pytest does not call teardown_method when setup_method fails, so
            # clean up here to avoid leaking the temporary directory.
            shutil.rmtree(self.test_dir, ignore_errors=True)
            raise

    def teardown_method(self):
        """Clean up the temporary directory created by ``setup_method``."""
        # The attribute is missing if setup failed before mkdtemp() returned.
        if hasattr(self, "test_dir"):
            shutil.rmtree(self.test_dir, ignore_errors=True)

    def test_embedding_vector_storage_workflow(self):
        """Test the complete workflow: text -> embedding -> storage -> search."""
        # Sample policy texts
        documents = [
            "Employees must complete security training annually to maintain access to company systems.",
            "Remote work policy allows employees to work from home up to 3 days per week.",
            "All expenses over $500 require manager approval before reimbursement.",
            "Code review is mandatory for all pull requests before merging to main branch.",
        ]

        # Generate embeddings: one vector per document, each with the
        # dimension the service reports.
        embeddings = self.embedding_service.embed_texts(documents)
        assert len(embeddings) == len(documents)
        expected_dimension = self.embedding_service.get_embedding_dimension()
        assert all(len(emb) == expected_dimension for emb in embeddings)

        # Store embeddings with metadata (using the collection from setup)
        doc_ids = [f"doc_{i}" for i in range(len(documents))]
        metadatas = [{"type": "policy", "doc_id": doc_id} for doc_id in doc_ids]
        success = self.vector_db.add_embeddings(
            embeddings=embeddings,
            chunk_ids=doc_ids,
            documents=documents,
            metadatas=metadatas,
        )
        assert success is True

        # Search for the remote-work policy
        query = "remote work from home policy"
        query_embedding = self.embedding_service.embed_text(query)
        results = self.vector_db.search(query_embedding=query_embedding, top_k=2)

        # Search should return a list of at most ``top_k`` dictionaries.
        assert isinstance(results, list)
        # Four documents were stored successfully just above, so an empty
        # result means the search step is broken; fail instead of silently
        # skipping the content checks.
        assert results, "search returned no results after documents were stored"
        assert len(results) <= 2
        assert all(isinstance(result, dict) for result in results)

        # At least one hit must contain remote-work related content.
        documents_found = [result.get("document", "") for result in results]
        assert any("remote work" in doc.lower() or "work from home" in doc.lower() for doc in documents_found)

    def test_basic_embedding_dimension_consistency(self):
        """Test that embeddings have consistent dimensions across text lengths."""
        # Short, medium and long inputs
        texts = [
            "Short text.",
            "This is a medium length text with several words to test embedding consistency.",
            (
                "This is a much longer text that contains multiple sentences "
                "and various types of content to ensure that the embedding "
                "service can handle longer inputs without issues and still "
                "produce consistent dimensional output vectors."
            ),
        ]

        embeddings = self.embedding_service.embed_texts(texts)
        # Guard the dimensions[0] lookup below with a clear failure.
        assert len(embeddings) == len(texts)

        # All embeddings should have the same dimension
        dimensions = [len(emb) for emb in embeddings]
        assert all(dim == dimensions[0] for dim in dimensions)

        # Dimension should match the service's reported dimension
        assert dimensions[0] == self.embedding_service.get_embedding_dimension()

    def test_empty_collection_handling(self):
        """Test that searching an empty collection returns an empty list."""
        # Nothing has been added to the collection created in setup.
        query_embedding = self.embedding_service.embed_text("test query")
        results = self.vector_db.search(query_embedding=query_embedding, top_k=5)

        # Should handle the empty collection gracefully
        assert isinstance(results, list)
        assert len(results) == 0