Spaces:

sethmcknight
/

msse-ai-engineering

Sleeping

msse-ai-engineering / tests /test_phase2a_integration.py

sethmcknight

Refactor test cases for improved readability and consistency

159faf0 about 2 months ago

4.45 kB

	"""Integration tests for Phase 2A components."""

	import shutil
	import tempfile

	from src.embedding.embedding_service import EmbeddingService
	from src.vector_store.vector_db import VectorDatabase


	class TestPhase2AIntegration:
	    """Integration tests for EmbeddingService working together with VectorDatabase.

	    ``setup_method`` builds a new EmbeddingService and a VectorDatabase
	    persisted in a temporary directory; ``teardown_method`` removes that
	    directory again.
	    """

	    def setup_method(self):
	        """Create the temporary persistence directory and the services under test."""
	        self.test_dir = tempfile.mkdtemp()
	        try:
	            self.embedding_service = EmbeddingService()
	            self.vector_db = VectorDatabase(
	                persist_path=self.test_dir,
	                collection_name="test_integration",
	            )
	        except Exception:
	            # pytest does not run teardown_method when setup_method raises,
	            # so the directory has to be removed here to avoid leaking it.
	            shutil.rmtree(self.test_dir, ignore_errors=True)
	            raise

	    def teardown_method(self):
	        """Remove the temporary directory created by setup_method, if any."""
	        if hasattr(self, "test_dir"):
	            shutil.rmtree(self.test_dir, ignore_errors=True)

	    def test_embedding_vector_storage_workflow(self):
	        """Exercise the full workflow: text -> embedding -> storage -> search."""
	        # Sample policy texts
	        documents = [
	            (
	                "Employees must complete security training annually to "
	                "maintain access to company systems."
	            ),
	            (
	                "Remote work policy allows employees to work from home up to "
	                "3 days per week."
	            ),
	            (
	                "All expenses over $500 require manager approval before "
	                "reimbursement."
	            ),
	            (
	                "Code review is mandatory for all pull requests before "
	                "merging to main branch."
	            ),
	        ]

	        # One embedding per document, each with the dimension the service reports
	        embeddings = self.embedding_service.embed_texts(documents)
	        assert len(embeddings) == len(documents)
	        expected_dim = self.embedding_service.get_embedding_dimension()
	        assert all(len(emb) == expected_dim for emb in embeddings)

	        # Store embeddings with metadata (using the collection from setup_method)
	        doc_ids = [f"doc_{i}" for i in range(len(documents))]
	        metadatas = [{"type": "policy", "doc_id": doc_id} for doc_id in doc_ids]
	        success = self.vector_db.add_embeddings(
	            embeddings=embeddings,
	            chunk_ids=doc_ids,
	            documents=documents,
	            metadatas=metadatas,
	        )
	        assert success is True

	        # Search for the remote-work policy
	        query = "remote work from home policy"
	        query_embedding = self.embedding_service.embed_text(query)
	        results = self.vector_db.search(query_embedding=query_embedding, top_k=2)

	        # Results should be a list of dictionaries, capped at top_k entries
	        assert isinstance(results, list)
	        # Four documents were stored successfully above, so an empty result
	        # must fail the test instead of silently skipping the checks below.
	        assert results, "search returned no results after documents were stored"
	        assert len(results) <= 2
	        assert all(isinstance(result, dict) for result in results)

	        # At least one hit must contain remote-work related content
	        documents_found = [result.get("document", "") for result in results]
	        assert any(
	            "remote work" in doc.lower() or "work from home" in doc.lower()
	            for doc in documents_found
	        )

	    def test_basic_embedding_dimension_consistency(self):
	        """Check that texts of different lengths yield same-sized embeddings."""
	        # Short, medium and long inputs
	        texts = [
	            "Short text.",
	            (
	                "This is a medium length text with several words to test "
	                "embedding consistency."
	            ),
	            (
	                "This is a much longer text that contains multiple sentences "
	                "and various types of content to ensure that the embedding "
	                "service can handle longer inputs without issues and still "
	                "produce consistent dimensional output vectors."
	            ),
	        ]

	        embeddings = self.embedding_service.embed_texts(texts)

	        # Guard the indexing below: without this an empty return value would
	        # surface as an IndexError rather than a clear assertion failure.
	        assert len(embeddings) == len(texts)

	        # All embeddings should have the same dimension
	        dimensions = [len(emb) for emb in embeddings]
	        assert all(dim == dimensions[0] for dim in dimensions)

	        # Dimension should match the service's reported dimension
	        assert dimensions[0] == self.embedding_service.get_embedding_dimension()

	    def test_empty_collection_handling(self):
	        """Check that searching a collection with no stored embeddings returns []."""
	        query_embedding = self.embedding_service.embed_text("test query")

	        results = self.vector_db.search(query_embedding=query_embedding, top_k=5)

	        # Nothing was added in this test, so the result must be an empty list
	        assert isinstance(results, list)
	        assert len(results) == 0