File size: 4,448 Bytes
afecdc5
 
 
7793bb6
afecdc5
 
 
 
 
 
 
7793bb6
afecdc5
 
 
 
159faf0
7793bb6
afecdc5
 
7793bb6
afecdc5
7793bb6
afecdc5
 
7793bb6
afecdc5
 
159faf0
 
 
 
afecdc5
7793bb6
afecdc5
 
7793bb6
afecdc5
 
159faf0
7793bb6
afecdc5
 
 
7793bb6
afecdc5
 
 
 
7793bb6
afecdc5
7793bb6
afecdc5
7793bb6
afecdc5
 
 
7793bb6
 
 
afecdc5
 
 
7793bb6
afecdc5
 
 
7793bb6
 
159faf0
7793bb6
afecdc5
7793bb6
afecdc5
 
7793bb6
afecdc5
 
 
159faf0
7793bb6
 
 
 
 
 
afecdc5
7793bb6
afecdc5
 
7793bb6
afecdc5
 
 
7793bb6
afecdc5
 
7793bb6
afecdc5
 
7793bb6
afecdc5
 
7793bb6
 
 
afecdc5
 
7793bb6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""Integration tests for Phase 2A components."""

import shutil
import tempfile

from src.embedding.embedding_service import EmbeddingService
from src.vector_store.vector_db import VectorDatabase


class TestPhase2AIntegration:
    """Integration tests exercising EmbeddingService together with VectorDatabase.

    Each test gets a fresh temporary directory (passed to ``VectorDatabase`` as
    ``persist_path``) and fresh service instances via ``setup_method``; the
    directory is removed again in ``teardown_method``.
    """

    def setup_method(self):
        """Create a temporary directory and the two components under test.

        pytest does not invoke ``teardown_method`` when ``setup_method`` raises,
        so the temporary directory is removed here if constructing either
        component fails; the original exception is then re-raised.
        """
        self.test_dir = tempfile.mkdtemp()
        try:
            self.embedding_service = EmbeddingService()
            self.vector_db = VectorDatabase(persist_path=self.test_dir, collection_name="test_integration")
        except Exception:
            # Avoid leaking the directory: teardown will not run for this test.
            shutil.rmtree(self.test_dir, ignore_errors=True)
            raise

    def teardown_method(self):
        """Remove the temporary directory created by ``setup_method``."""
        test_dir = getattr(self, "test_dir", None)
        if test_dir is not None:
            # Best-effort cleanup; a failed removal must not fail the test.
            shutil.rmtree(test_dir, ignore_errors=True)

    def test_embedding_vector_storage_workflow(self):
        """Test complete workflow: text → embedding → storage → search."""

        # Sample policy texts
        documents = [
            "Employees must complete security training annually to maintain access to company systems.",
            "Remote work policy allows employees to work from home up to 3 days per week.",
            "All expenses over $500 require manager approval before reimbursement.",
            "Code review is mandatory for all pull requests before merging to main branch.",
        ]

        # Generate embeddings
        embeddings = self.embedding_service.embed_texts(documents)

        # One embedding per document, each with the dimension the service reports
        assert len(embeddings) == len(documents)
        expected_dim = self.embedding_service.get_embedding_dimension()
        assert all(len(emb) == expected_dim for emb in embeddings)

        # Store embeddings with metadata in the collection created in setup
        doc_ids = [f"doc_{i}" for i in range(len(documents))]
        metadatas = [{"type": "policy", "doc_id": doc_id} for doc_id in doc_ids]

        success = self.vector_db.add_embeddings(
            embeddings=embeddings,
            chunk_ids=doc_ids,
            documents=documents,
            metadatas=metadatas,
        )

        assert success is True

        # Query for the remote-work document
        query = "remote work from home policy"
        query_embedding = self.embedding_service.embed_text(query)

        results = self.vector_db.search(query_embedding=query_embedding, top_k=2)

        # Search returns a list of dictionaries, capped at top_k entries
        assert isinstance(results, list)
        assert len(results) <= 2  # Should return at most 2 results

        # Four documents were stored successfully, so an empty result would mean
        # the store/search round trip is broken; do not let the relevance check
        # below be skipped silently.
        assert results, "search returned no results after documents were stored"

        assert all(isinstance(result, dict) for result in results)
        # At least one result must contain remote-work related content
        documents_found = [result.get("document", "") for result in results]
        remote_work_found = any(
            "remote work" in doc.lower() or "work from home" in doc.lower() for doc in documents_found
        )
        assert remote_work_found

    def test_basic_embedding_dimension_consistency(self):
        """Test that embeddings have consistent dimensions across input lengths."""

        # Short, medium and long inputs
        texts = [
            "Short text.",
            "This is a medium length text with several words to test embedding consistency.",
            (
                "This is a much longer text that contains multiple sentences "
                "and various types of content to ensure that the embedding "
                "service can handle longer inputs without issues and still "
                "produce consistent dimensional output vectors."
            ),
        ]

        # Generate embeddings
        embeddings = self.embedding_service.embed_texts(texts)

        # All embeddings should have the same dimension
        dimensions = [len(emb) for emb in embeddings]
        assert all(dim == dimensions[0] for dim in dimensions)

        # Dimension should match the service's reported dimension
        assert dimensions[0] == self.embedding_service.get_embedding_dimension()

    def test_empty_collection_handling(self):
        """Test that searching a collection with nothing added returns an empty list."""

        # Nothing has been added to the collection in this test
        query_embedding = self.embedding_service.embed_text("test query")

        results = self.vector_db.search(query_embedding=query_embedding, top_k=5)

        # Should handle empty collection gracefully
        assert isinstance(results, list)
        assert len(results) == 0