Spaces:
Sleeping
Sleeping
File size: 4,448 Bytes
afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 159faf0 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 159faf0 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 159faf0 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 159faf0 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 159faf0 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 afecdc5 7793bb6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
"""Integration tests for Phase 2A components."""
import shutil
import tempfile
from src.embedding.embedding_service import EmbeddingService
from src.vector_store.vector_db import VectorDatabase
class TestPhase2AIntegration:
    """Test integration between EmbeddingService and VectorDatabase.

    Every test runs against a fresh ``EmbeddingService`` and a
    ``VectorDatabase`` persisted in its own temporary directory; the
    directory is removed again once the test has finished.
    """

    def setup_method(self):
        """Set up test environment with temporary database."""
        self.test_dir = tempfile.mkdtemp()
        try:
            self.embedding_service = EmbeddingService()
            self.vector_db = VectorDatabase(
                persist_path=self.test_dir,
                collection_name="test_integration",
            )
        except Exception:
            # pytest does not run teardown_method when setup_method fails,
            # so the temporary directory has to be removed here.
            shutil.rmtree(self.test_dir, ignore_errors=True)
            raise

    def teardown_method(self):
        """Clean up temporary resources."""
        # test_dir is missing if setup_method failed before creating it.
        test_dir = getattr(self, "test_dir", None)
        if test_dir is not None:
            shutil.rmtree(test_dir, ignore_errors=True)

    def test_embedding_vector_storage_workflow(self):
        """Test complete workflow: text → embedding → storage → search."""
        # Sample policy texts
        documents = [
            "Employees must complete security training annually to maintain access to company systems.",
            "Remote work policy allows employees to work from home up to 3 days per week.",
            "All expenses over $500 require manager approval before reimbursement.",
            "Code review is mandatory for all pull requests before merging to main branch.",
        ]

        # Generate embeddings: one vector per document, each of the
        # dimension reported by the service.
        embeddings = self.embedding_service.embed_texts(documents)
        expected_dim = self.embedding_service.get_embedding_dimension()
        assert len(embeddings) == len(documents)
        assert all(len(emb) == expected_dim for emb in embeddings)

        # Store embeddings with metadata (using existing collection)
        doc_ids = [f"doc_{i}" for i in range(len(documents))]
        metadatas = [{"type": "policy", "doc_id": doc_id} for doc_id in doc_ids]
        success = self.vector_db.add_embeddings(
            embeddings=embeddings,
            chunk_ids=doc_ids,
            documents=documents,
            metadatas=metadatas,
        )
        assert success is True

        # Search for the remote-work policy
        query = "remote work from home policy"
        query_embedding = self.embedding_service.embed_text(query)
        results = self.vector_db.search(query_embedding=query_embedding, top_k=2)

        # Verify search results (should return list of dictionaries)
        assert isinstance(results, list)
        # Four documents were stored above, so an empty result is a failure
        # of the workflow and must not let the test pass vacuously.
        assert results, "search returned no results after storing documents"
        assert len(results) <= 2  # Should return at most 2 results
        assert all(isinstance(result, dict) for result in results)

        # At least one hit must contain remote-work related content
        documents_found = [result.get("document", "").lower() for result in results]
        assert any("remote work" in doc or "work from home" in doc for doc in documents_found)

    def test_basic_embedding_dimension_consistency(self):
        """Test that embeddings have consistent dimensions."""
        # Inputs of clearly different lengths
        texts = [
            "Short text.",
            "This is a medium length text with several words to test embedding consistency.",
            (
                "This is a much longer text that contains multiple sentences "
                "and various types of content to ensure that the embedding "
                "service can handle longer inputs without issues and still "
                "produce consistent dimensional output vectors."
            ),
        ]

        embeddings = self.embedding_service.embed_texts(texts)
        # Guard the indexing below: one embedding per input text is expected.
        assert len(embeddings) == len(texts)

        # All embeddings should have the same dimension
        dimensions = [len(emb) for emb in embeddings]
        assert all(dim == dimensions[0] for dim in dimensions)

        # Dimension should match the service's reported dimension
        assert dimensions[0] == self.embedding_service.get_embedding_dimension()

    def test_empty_collection_handling(self):
        """Test behavior with empty collection."""
        # Nothing has been added to the collection in this test
        query_embedding = self.embedding_service.embed_text("test query")
        results = self.vector_db.search(query_embedding=query_embedding, top_k=5)

        # Should handle empty collection gracefully
        assert isinstance(results, list)
        assert len(results) == 0
|