Tobias Pasquale committed
Commit: 7793bb6
Parent(s): 7effb84
style: Fix code formatting and linting issues for CI/CD compliance

- Fix black code formatting across all Python files
- Fix isort import ordering
- Remove unused imports (pytest, pathlib, numpy, Union)
- Fix line length issues (split long lines)
- Remove unused variables in tests
- Fix whitespace and end-of-file issues
- Address flake8 linting requirements

All changes maintain functionality while ensuring the CI/CD pipeline passes.
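For context on the import-ordering changes in the diffs below: isort groups imports into standard-library, third-party, and local sections separated by blank lines. A minimal sketch of the resulting pattern, using module names that appear in this commit (the grouping comments are illustrative only):

# Standard-library imports
import logging
from pathlib import Path
from typing import Any, Dict, List

# Third-party imports
import numpy as np

# First-party (local) imports
from src.embedding.embedding_service import EmbeddingService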
- CHANGELOG.md +19 -1
- app.py +16 -18
- src/__init__.py +1 -1
- src/config.py +3 -3
- src/embedding/__init__.py +1 -1
- src/embedding/embedding_service.py +55 -48
- src/ingestion/__init__.py +1 -1
- src/ingestion/document_chunker.py +43 -36
- src/ingestion/document_parser.py +19 -21
- src/ingestion/ingestion_pipeline.py +26 -20
- src/vector_store/__init__.py +1 -1
- src/vector_store/vector_db.py +57 -49
- tests/test_app.py +2 -1
- tests/test_embedding/__init__.py +1 -1
- tests/test_embedding/test_embedding_service.py +57 -46
- tests/test_ingestion/__init__.py +1 -1
- tests/test_ingestion/test_document_chunker.py +66 -58
- tests/test_ingestion/test_document_parser.py +39 -30
- tests/test_ingestion/test_ingestion_pipeline.py +70 -58
- tests/test_integration.py +66 -47
- tests/test_vector_store/__init__.py +1 -1
- tests/test_vector_store/test_vector_db.py +51 -41
CHANGELOG.md CHANGED
@@ -205,6 +205,24 @@ Each entry includes:
 - **Foundation Complete**: ChromaDB + HuggingFace embeddings fully integrated and tested
 - **Phase 2A Status**: ✅ COMPLETED SUCCESSFULLY - Ready for Phase 2B Enhanced Ingestion Pipeline

+#### Entry #012 - 2025-10-17 17:30
+- **Action Type**: DEPLOY + COLLABORATE
+- **Component**: Project Documentation & Team Collaboration
+- **Description**: Moved development changelog to root directory and committed to git for better team collaboration and visibility
+- **Files Changed**:
+  - Moved: `planning/development-changelog.md` → `CHANGELOG.md` (root directory)
+  - Modified: `README.md` (added Development Progress section)
+  - Committed: All Phase 2A changes to `feat/embedding-vector-storage` branch
+- **Tests**: N/A (documentation/collaboration improvement)
+- **CI/CD**: Branch pushed to GitHub with comprehensive commit history
+- **Notes**:
+  - **Team Collaboration**: CHANGELOG.md now visible in repository for partner collaboration
+  - **Comprehensive Commit**: All Phase 2A changes committed with detailed descriptions
+  - **Documentation Enhancement**: README updated to reference changelog for development tracking
+  - **Branch Status**: `feat/embedding-vector-storage` ready for pull request and code review
+  - **Visibility Improvement**: Development progress now trackable by all team members
+  - **Next Steps**: Ready for partner review and Phase 2B planning collaboration
+
 ---

 ## Next Planned Actions
@@ -248,4 +266,4 @@ Each entry includes:

 ---

-*This changelog is automatically updated after each development action to maintain complete project transparency and audit trail.*
+*This changelog is automatically updated after each development action to maintain complete project transparency and audit trail.*
app.py CHANGED
@@ -19,32 +19,30 @@ def health():
     return jsonify({"status": "ok"}), 200


+@app.route("/ingest", methods=["POST"])
 def ingest():
     """Endpoint to trigger document ingestion"""
     try:
+        from src.config import (CORPUS_DIRECTORY, DEFAULT_CHUNK_SIZE,
+                                DEFAULT_OVERLAP, RANDOM_SEED)
         from src.ingestion.ingestion_pipeline import IngestionPipeline
+
         pipeline = IngestionPipeline(
+            chunk_size=DEFAULT_CHUNK_SIZE, overlap=DEFAULT_OVERLAP, seed=RANDOM_SEED
         )
+
         chunks = pipeline.process_directory(CORPUS_DIRECTORY)
+
+        return jsonify(
+            {
+                "status": "success",
+                "chunks_processed": len(chunks),
+                "message": f"Successfully processed {len(chunks)} chunks",
+            }
+        )
+
     except Exception as e:
+        return jsonify({"status": "error", "message": str(e)}), 500


 if __name__ == "__main__":
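A quick way to exercise the reformatted /ingest endpoint is Flask's test client, mirroring the pattern used in tests/test_app.py (a usage sketch only; the success payload follows the JSON built in the handler above):

from app import app  # Flask app defined in app.py

client = app.test_client()
response = client.post("/ingest")  # triggers ingestion of the configured corpus directory

# Expected success payload, per the handler above:
# {"status": "success", "chunks_processed": <n>, "message": "Successfully processed <n> chunks"}
print(response.status_code, response.get_json())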
src/__init__.py CHANGED
@@ -1 +1 @@
-# Empty file to make src a package
+# Empty file to make src a package
src/config.py CHANGED
@@ -6,10 +6,10 @@ DEFAULT_OVERLAP = 200
 RANDOM_SEED = 42

 # Supported file formats
+SUPPORTED_FORMATS = {".txt", ".md", ".markdown"}

 # Corpus directory
+CORPUS_DIRECTORY = "synthetic_policies"

 # Vector Database Settings
 VECTOR_DB_PERSIST_PATH = "data/chroma_db"
@@ -25,4 +25,4 @@ EMBEDDING_DEVICE = "cpu"  # Use CPU for free tier compatibility
 # Search Settings
 DEFAULT_TOP_K = 5
 MAX_TOP_K = 20
+MIN_SIMILARITY_THRESHOLD = 0.3
src/embedding/__init__.py CHANGED
@@ -1 +1 @@
-# Embedding service package for HuggingFace model integration
+# Embedding service package for HuggingFace model integration
src/embedding/embedding_service.py CHANGED
@@ -1,22 +1,24 @@
-from sentence_transformers import SentenceTransformer
-from typing import List, Union
 import logging
+from typing import List
+
 import numpy as np
+from sentence_transformers import SentenceTransformer
+

 class EmbeddingService:
     """HuggingFace sentence-transformers wrapper for generating embeddings"""
+
     _model_cache = {}  # Class-level cache for model instances
+
     def __init__(
+        self,
         model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
         device: str = "cpu",
+        batch_size: int = 32,
     ):
         """
         Initialize the embedding service
+
         Args:
             model_name: HuggingFace model name
             device: Device to run the model on ('cpu' or 'cuda')
@@ -25,64 +27,69 @@ class EmbeddingService:
         self.model_name = model_name
         self.device = device
         self.batch_size = batch_size
+
         # Load model (with caching)
         self.model = self._load_model()
+
+        logging.info(
+            f"Initialized EmbeddingService with model "
+            f"'{model_name}' on device '{device}'"
+        )
+
     def _load_model(self) -> SentenceTransformer:
         """Load the sentence transformer model with caching"""
         cache_key = f"{self.model_name}_{self.device}"
+
         if cache_key not in self._model_cache:
+            logging.info(
+                f"Loading model '{self.model_name}' on device '{self.device}'..."
+            )
             model = SentenceTransformer(self.model_name, device=self.device)
             self._model_cache[cache_key] = model
+            logging.info("Model loaded successfully")
         else:
             logging.info(f"Using cached model '{self.model_name}'")
+
         return self._model_cache[cache_key]
+
     def embed_text(self, text: str) -> List[float]:
         """
         Generate embedding for a single text
+
         Args:
             text: Text to embed
+
         Returns:
             List of float values representing the embedding
         """
         if not text.strip():
             # Handle empty text - still generate embedding
             text = " "  # Single space to avoid completely empty input
+
         try:
             # Generate embedding
             embedding = self.model.encode(text, convert_to_numpy=True)
+
             # Convert to Python list of floats
             return embedding.tolist()
+
         except Exception as e:
             logging.error(f"Failed to generate embedding for text: {e}")
             raise e
+
     def embed_texts(self, texts: List[str]) -> List[List[float]]:
         """
         Generate embeddings for multiple texts
+
         Args:
             texts: List of texts to embed
+
         Returns:
             List of embeddings (each embedding is a list of floats)
         """
         if not texts:
             return []
+
         try:
             # Preprocess empty texts
             processed_texts = []
@@ -91,48 +98,48 @@ class EmbeddingService:
                     processed_texts.append(" ")  # Single space for empty texts
                 else:
                     processed_texts.append(text)
+
             # Generate embeddings in batches
             all_embeddings = []
+
             for i in range(0, len(processed_texts), self.batch_size):
+                batch_texts = processed_texts[i : i + self.batch_size]
+
                 # Generate embeddings for this batch
                 batch_embeddings = self.model.encode(
+                    batch_texts,
                     convert_to_numpy=True,
+                    show_progress_bar=False,  # Disable progress bar for cleaner output
                 )
+
                 # Convert to list of lists
                 for embedding in batch_embeddings:
                     all_embeddings.append(embedding.tolist())
+
             logging.info(f"Generated embeddings for {len(texts)} texts")
             return all_embeddings
+
         except Exception as e:
             logging.error(f"Failed to generate embeddings for texts: {e}")
             raise e
+
     def get_embedding_dimension(self) -> int:
         """Get the dimension of embeddings produced by this model"""
         return self.model.get_sentence_embedding_dimension()
+
     def encode_batch(self, texts: List[str]) -> np.ndarray:
         """
         Generate embeddings and return as numpy array (for efficiency)
+
         Args:
             texts: List of texts to embed
+
         Returns:
             NumPy array of embeddings
         """
         if not texts:
             return np.array([])
+
         # Preprocess empty texts
         processed_texts = []
         for text in texts:
@@ -140,33 +147,33 @@ class EmbeddingService:
                 processed_texts.append(" ")
             else:
                 processed_texts.append(text)
+
         return self.model.encode(processed_texts, convert_to_numpy=True)
+
     def similarity(self, text1: str, text2: str) -> float:
         """
         Calculate cosine similarity between two texts
+
         Args:
             text1: First text
             text2: Second text
+
         Returns:
             Cosine similarity score (0-1)
         """
         try:
             embeddings = self.embed_texts([text1, text2])
+
             # Calculate cosine similarity
             embed1 = np.array(embeddings[0])
             embed2 = np.array(embeddings[1])
+
             similarity = np.dot(embed1, embed2) / (
                 np.linalg.norm(embed1) * np.linalg.norm(embed2)
             )
+
             return float(similarity)
+
         except Exception as e:
             logging.error(f"Failed to calculate similarity: {e}")
+            return 0.0
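For reference, a minimal usage sketch of the EmbeddingService API shown above (the sample texts are placeholders; the 384-dimension figure comes from the default all-MiniLM-L6-v2 model):

from src.embedding.embedding_service import EmbeddingService

service = EmbeddingService()  # defaults: all-MiniLM-L6-v2 on CPU, batch_size=32

vector = service.embed_text("Employee remote work policy guidelines")
print(len(vector))  # 384 for the default model

vectors = service.embed_texts(["First policy text", "Second policy text"])
score = service.similarity("remote work policy", "working from home guidelines")
print(len(vectors), round(score, 3))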
src/ingestion/__init__.py CHANGED
@@ -1 +1 @@
-# Empty file to make ingestion a package
+# Empty file to make ingestion a package
src/ingestion/document_chunker.py CHANGED
@@ -1,14 +1,17 @@
 import hashlib
 import random
+from typing import Any, Dict, List, Optional
+

 class DocumentChunker:
     """Document chunker with overlap and reproducible behavior"""
+
+    def __init__(
+        self, chunk_size: int = 1000, overlap: int = 200, seed: Optional[int] = None
+    ):
         """
         Initialize the document chunker
+
         Args:
             chunk_size: Maximum characters per chunk
             overlap: Number of overlapping characters between chunks
@@ -17,80 +20,84 @@ class DocumentChunker:
         self.chunk_size = chunk_size
         self.overlap = overlap
         self.seed = seed
+
         if seed is not None:
             random.seed(seed)
+
     def chunk_text(self, text: str) -> List[Dict[str, Any]]:
         """
         Chunk text into overlapping segments
+
         Args:
             text: Input text to chunk
+
         Returns:
             List of chunk dictionaries with content and basic metadata
         """
         if not text.strip():
             return []
+
         chunks = []
         start = 0
         chunk_index = 0
+
         while start < len(text):
             end = start + self.chunk_size
             chunk_content = text[start:end]
+
             # Create chunk with metadata
             chunk = {
+                "content": chunk_content,
+                "metadata": {
+                    "chunk_index": chunk_index,
+                    "start_pos": start,
+                    "end_pos": min(end, len(text)),
+                    "chunk_id": self._generate_chunk_id(chunk_content, chunk_index),
+                },
             }
+
             chunks.append(chunk)
+
             # Move start position with overlap consideration
             start = end - self.overlap
             chunk_index += 1
+
             # Break if we've processed all text
             if end >= len(text):
                 break
+
         return chunks
+
+    def chunk_document(
+        self, text: str, doc_metadata: Dict[str, Any]
+    ) -> List[Dict[str, Any]]:
         """
         Chunk a document while preserving document metadata
+
         Args:
             text: Document text content
             doc_metadata: Document metadata to preserve
+
         Returns:
             List of chunks with combined metadata
         """
         chunks = self.chunk_text(text)
+
         # Enhance each chunk with document metadata
         for chunk in chunks:
+            chunk["metadata"].update(doc_metadata)
             # Create unique chunk ID combining document and chunk info
+            chunk["metadata"]["chunk_id"] = self._generate_chunk_id(
+                chunk["content"],
+                chunk["metadata"]["chunk_index"],
+                doc_metadata.get("filename", "unknown"),
             )
+
         return chunks
+
+    def _generate_chunk_id(
+        self, content: str, chunk_index: int, filename: str = ""
+    ) -> str:
         """Generate a deterministic chunk ID"""
         id_string = f"{filename}_{chunk_index}_{content[:50]}"
+        return hashlib.md5(id_string.encode()).hexdigest()[:12]
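A short usage sketch of the chunker interface above (the sample text and metadata are illustrative):

from src.ingestion.document_chunker import DocumentChunker

chunker = DocumentChunker(chunk_size=50, overlap=10, seed=42)
chunks = chunker.chunk_document(
    "This is a test document. " * 10,
    {"filename": "example.md"},
)

# Each chunk carries "content" plus metadata such as chunk_index and chunk_id.
print(len(chunks), chunks[0]["metadata"]["chunk_id"])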
src/ingestion/document_parser.py CHANGED
@@ -1,46 +1,44 @@
 import os
 from pathlib import Path
+from typing import Any, Dict
+

 class DocumentParser:
     """Parser for different document formats in the policy corpus"""
+
+    SUPPORTED_FORMATS = {".txt", ".md", ".markdown"}
+
     def parse_document(self, file_path: str) -> Dict[str, Any]:
         """
         Parse a document and return content with metadata
+
         Args:
             file_path: Path to the document file
+
         Returns:
             Dict containing 'content' and 'metadata'
+
         Raises:
             FileNotFoundError: If file doesn't exist
             ValueError: If file format is unsupported
         """
         path = Path(file_path)
+
         # Check file format first (before existence check)
         if path.suffix.lower() not in self.SUPPORTED_FORMATS:
             raise ValueError(f"Unsupported file format: {path.suffix}")
+
         if not path.exists():
             raise FileNotFoundError(f"File not found: {file_path}")
+
+        with open(file_path, "r", encoding="utf-8") as f:
             content = f.read()
+
         metadata = {
+            "filename": path.name,
+            "file_type": path.suffix.lstrip(".").lower(),
+            "file_size": os.path.getsize(file_path),
+            "file_path": str(path.absolute()),
         }
+
+        return {"content": content, "metadata": metadata}
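The parser above returns a dict with 'content' and 'metadata'; a usage sketch follows (the file path is a hypothetical document inside the synthetic_policies corpus directory named in src/config.py):

from src.ingestion.document_parser import DocumentParser

parser = DocumentParser()
doc = parser.parse_document("synthetic_policies/remote_work_policy.md")  # hypothetical file

print(doc["metadata"]["filename"], doc["metadata"]["file_size"])
print(doc["content"][:100])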
src/ingestion/ingestion_pipeline.py CHANGED
@@ -1,69 +1,75 @@
 from pathlib import Path
+from typing import Any, Dict, List
+
 from .document_chunker import DocumentChunker
+from .document_parser import DocumentParser
+

 class IngestionPipeline:
     """Complete ingestion pipeline for processing document corpus"""
+
     def __init__(self, chunk_size: int = 1000, overlap: int = 200, seed: int = 42):
         """
         Initialize the ingestion pipeline
+
         Args:
             chunk_size: Size of text chunks
             overlap: Overlap between chunks
             seed: Random seed for reproducibility
         """
         self.parser = DocumentParser()
+        self.chunker = DocumentChunker(
+            chunk_size=chunk_size, overlap=overlap, seed=seed
+        )
         self.seed = seed
+
     def process_directory(self, directory_path: str) -> List[Dict[str, Any]]:
         """
         Process all supported documents in a directory
+
         Args:
             directory_path: Path to directory containing documents
+
         Returns:
             List of processed chunks with metadata
         """
         directory = Path(directory_path)
         if not directory.exists():
             raise FileNotFoundError(f"Directory not found: {directory_path}")
+
         all_chunks = []
+
         # Process each supported file
         for file_path in directory.iterdir():
+            if (
+                file_path.is_file()
+                and file_path.suffix.lower() in self.parser.SUPPORTED_FORMATS
+            ):
                 try:
                     chunks = self.process_file(str(file_path))
                     all_chunks.extend(chunks)
                 except Exception as e:
                     print(f"Warning: Failed to process {file_path}: {e}")
                     continue
+
         return all_chunks
+
     def process_file(self, file_path: str) -> List[Dict[str, Any]]:
         """
         Process a single file through the complete pipeline
+
         Args:
             file_path: Path to the file to process
+
         Returns:
             List of chunks from the file
         """
         # Parse document
         parsed_doc = self.parser.parse_document(file_path)
+
         # Chunk the document
         chunks = self.chunker.chunk_document(
+            parsed_doc["content"], parsed_doc["metadata"]
         )
+
+        return chunks
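A usage sketch of the pipeline, wired with the defaults from src/config.py exactly as the /ingest handler in app.py does:

from src.config import CORPUS_DIRECTORY, DEFAULT_CHUNK_SIZE, DEFAULT_OVERLAP, RANDOM_SEED
from src.ingestion.ingestion_pipeline import IngestionPipeline

pipeline = IngestionPipeline(
    chunk_size=DEFAULT_CHUNK_SIZE, overlap=DEFAULT_OVERLAP, seed=RANDOM_SEED
)
chunks = pipeline.process_directory(CORPUS_DIRECTORY)
print(f"Processed {len(chunks)} chunks")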
src/vector_store/__init__.py CHANGED
@@ -1 +1 @@
-# Vector store package for ChromaDB integration
+# Vector store package for ChromaDB integration
src/vector_store/vector_db.py CHANGED
@@ -1,92 +1,100 @@
-import chromadb
-from typing import List, Dict, Any, Optional
-from pathlib import Path
 import logging
+from pathlib import Path
+from typing import Any, Dict, List
+
+import chromadb
+

 class VectorDatabase:
     """ChromaDB integration for vector storage and similarity search"""
+
     def __init__(self, persist_path: str, collection_name: str):
         """
         Initialize the vector database
+
         Args:
             persist_path: Path to persist the database
             collection_name: Name of the collection to use
         """
         self.persist_path = persist_path
         self.collection_name = collection_name
+
         # Ensure persist directory exists
         Path(persist_path).mkdir(parents=True, exist_ok=True)
+
         # Initialize ChromaDB client with persistence
         self.client = chromadb.PersistentClient(path=persist_path)
+
         # Get or create collection
         try:
             self.collection = self.client.get_collection(name=collection_name)
         except ValueError:
             # Collection doesn't exist, create it
             self.collection = self.client.create_collection(name=collection_name)
+
+        logging.info(
+            f"Initialized VectorDatabase with collection "
+            f"'{collection_name}' at '{persist_path}'"
+        )
+
     def get_collection(self):
         """Get the ChromaDB collection"""
         return self.collection
+
     def add_embeddings(
+        self,
+        embeddings: List[List[float]],
+        chunk_ids: List[str],
+        documents: List[str],
+        metadatas: List[Dict[str, Any]],
     ) -> bool:
         """
         Add embeddings to the vector database
+
         Args:
             embeddings: List of embedding vectors
             chunk_ids: List of unique chunk IDs
             documents: List of document contents
             metadatas: List of metadata dictionaries
+
         Returns:
             True if successful, False otherwise
         """
         try:
             # Validate input lengths match
+            if not (
+                len(embeddings) == len(chunk_ids) == len(documents) == len(metadatas)
+            ):
                 raise ValueError("All input lists must have the same length")
+
             # Add to ChromaDB collection
             self.collection.add(
                 embeddings=embeddings,
                 documents=documents,
                 metadatas=metadatas,
+                ids=chunk_ids,
+            )
+
+            logging.info(
+                f"Added {len(embeddings)} embeddings to collection "
+                f"'{self.collection_name}'"
             )
             return True
+
         except Exception as e:
             logging.error(f"Failed to add embeddings: {e}")
             raise e
+
     def search(
+        self, query_embedding: List[float], top_k: int = 5
     ) -> List[Dict[str, Any]]:
         """
         Search for similar embeddings
+
         Args:
             query_embedding: Query vector to search for
             top_k: Number of results to return
+
         Returns:
             List of search results with metadata
         """
@@ -94,33 +102,33 @@ class VectorDatabase:
             # Handle empty collection
             if self.get_count() == 0:
                 return []
+
             # Perform similarity search
             results = self.collection.query(
                 query_embeddings=[query_embedding],
+                n_results=min(top_k, self.get_count()),
             )
+
             # Format results
             formatted_results = []
+
+            if results["ids"] and len(results["ids"][0]) > 0:
+                for i in range(len(results["ids"][0])):
                     result = {
+                        "id": results["ids"][0][i],
+                        "document": results["documents"][0][i],
+                        "metadata": results["metadatas"][0][i],
+                        "distance": results["distances"][0][i],
                     }
                     formatted_results.append(result)
+
             logging.info(f"Search returned {len(formatted_results)} results")
             return formatted_results
+
         except Exception as e:
             logging.error(f"Search failed: {e}")
             return []
+
     def get_count(self) -> int:
         """Get the number of embeddings in the collection"""
         try:
@@ -128,7 +136,7 @@ class VectorDatabase:
         except Exception as e:
             logging.error(f"Failed to get count: {e}")
             return 0
+
     def delete_collection(self) -> bool:
         """Delete the collection"""
         try:
@@ -138,7 +146,7 @@ class VectorDatabase:
         except Exception as e:
             logging.error(f"Failed to delete collection: {e}")
             return False
+
     def reset_collection(self) -> bool:
         """Reset the collection (delete and recreate)"""
         try:
@@ -148,12 +156,12 @@ class VectorDatabase:
             except ValueError:
                 # Collection doesn't exist, that's fine
                 pass
+
             # Create new collection
             self.collection = self.client.create_collection(name=self.collection_name)
             logging.info(f"Reset collection '{self.collection_name}'")
             return True
+
         except Exception as e:
             logging.error(f"Failed to reset collection: {e}")
+            return False
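To round out the storage layer, a sketch of how the EmbeddingService and VectorDatabase shown in this commit fit together (the persist path comes from src/config.py; the collection name "policies" is an assumption for illustration only):

from src.config import VECTOR_DB_PERSIST_PATH
from src.embedding.embedding_service import EmbeddingService
from src.vector_store.vector_db import VectorDatabase

service = EmbeddingService()
db = VectorDatabase(persist_path=VECTOR_DB_PERSIST_PATH, collection_name="policies")  # name is illustrative

texts = ["Remote work policy text", "Expense reimbursement text"]
db.add_embeddings(
    embeddings=service.embed_texts(texts),
    chunk_ids=["chunk-0", "chunk-1"],
    documents=texts,
    metadatas=[{"filename": "a.md"}, {"filename": "b.md"}],
)

results = db.search(service.embed_text("working from home"), top_k=2)
print(results[0]["document"], results[0]["distance"])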
tests/test_app.py CHANGED
@@ -33,7 +33,8 @@ def test_index_endpoint(client):
 def test_ingest_endpoint_exists():
     """Test that the ingest endpoint is available"""
     from app import app
+
     client = app.test_client()
+    response = client.post("/ingest")
     # Should not be 404 (not found)
     assert response.status_code != 404
tests/test_embedding/__init__.py CHANGED
@@ -1 +1 @@
-# Test package for embedding service components
+# Test package for embedding service components
tests/test_embedding/test_embedding_service.py CHANGED
@@ -1,196 +1,207 @@
-import pytest
 import numpy as np
+
 from src.embedding.embedding_service import EmbeddingService

+
 def test_embedding_service_initialization():
     """Test EmbeddingService initialization"""
     # Test will fail initially - we'll implement EmbeddingService to make it pass
     service = EmbeddingService()
+
     assert service is not None
     assert service.model_name == "sentence-transformers/all-MiniLM-L6-v2"
     assert service.device == "cpu"

+
 def test_embedding_service_with_custom_config():
     """Test EmbeddingService initialization with custom configuration"""
     service = EmbeddingService(
+        model_name="sentence-transformers/all-MiniLM-L6-v2", device="cpu", batch_size=16
     )
+
     assert service.model_name == "sentence-transformers/all-MiniLM-L6-v2"
     assert service.device == "cpu"
     assert service.batch_size == 16

+
 def test_single_text_embedding():
     """Test embedding generation for a single text"""
     service = EmbeddingService()
+
     text = "This is a test document about company policies."
     embedding = service.embed_text(text)
+
     # Should return a list of floats (embedding vector)
     assert isinstance(embedding, list)
     assert len(embedding) == 384  # all-MiniLM-L6-v2 dimension
     assert all(isinstance(x, (float, np.float32, np.float64)) for x in embedding)

+
 def test_batch_text_embedding():
     """Test embedding generation for multiple texts"""
     service = EmbeddingService()
+
     texts = [
         "This is the first document about remote work policy.",
         "This is the second document about employee benefits.",
+        "This is the third document about code of conduct.",
     ]
+
     embeddings = service.embed_texts(texts)
+
     # Should return list of embeddings
     assert isinstance(embeddings, list)
     assert len(embeddings) == 3
+
     # Each embedding should be correct dimension
     for embedding in embeddings:
         assert isinstance(embedding, list)
         assert len(embedding) == 384
         assert all(isinstance(x, (float, np.float32, np.float64)) for x in embedding)

+
 def test_embedding_consistency():
     """Test that same text produces same embedding"""
     service = EmbeddingService()
+
     text = "Consistent embedding test text."
+
     embedding1 = service.embed_text(text)
     embedding2 = service.embed_text(text)
+
     # Should be identical (deterministic)
     assert embedding1 == embedding2

+
 def test_different_texts_different_embeddings():
     """Test that different texts produce different embeddings"""
     service = EmbeddingService()
+
     text1 = "This is about remote work policy."
     text2 = "This is about employee benefits and healthcare."
+
     embedding1 = service.embed_text(text1)
     embedding2 = service.embed_text(text2)
+
     # Should be different
     assert embedding1 != embedding2
+
     # But should have same dimension
     assert len(embedding1) == len(embedding2) == 384

+
 def test_empty_text_handling():
     """Test handling of empty or whitespace-only text"""
     service = EmbeddingService()
+
     # Empty string
     embedding_empty = service.embed_text("")
     assert isinstance(embedding_empty, list)
     assert len(embedding_empty) == 384
+
     # Whitespace only
     embedding_whitespace = service.embed_text(" \n\t ")
     assert isinstance(embedding_whitespace, list)
     assert len(embedding_whitespace) == 384

+
 def test_very_long_text_handling():
     """Test handling of very long texts"""
     service = EmbeddingService()
+
     # Create a very long text (should test tokenization limits)
     long_text = "This is a very long document. " * 1000  # ~30,000 characters
+
     embedding = service.embed_text(long_text)
     assert isinstance(embedding, list)
     assert len(embedding) == 384

+
 def test_batch_size_handling():
     """Test that batch processing works correctly"""
     service = EmbeddingService(batch_size=2)  # Small batch for testing
+
     texts = [
         "Text one about policy",
+        "Text two about procedures",
         "Text three about guidelines",
         "Text four about regulations",
+        "Text five about rules",
     ]
+
     embeddings = service.embed_texts(texts)
+
     # Should process all texts despite small batch size
     assert len(embeddings) == 5
+
     # All embeddings should be valid
     for embedding in embeddings:
         assert len(embedding) == 384

+
 def test_special_characters_handling():
     """Test handling of special characters and unicode"""
     service = EmbeddingService()
+
     texts_with_special_chars = [
         "Policy with émojis 😀 and úñicode",
         "Text with numbers: 123,456.78 and symbols @#$%",
         "Markdown: # Header\n## Subheader\n- List item",
+        "Mixed: Policy-2024 (v1.2) — updated 12/01/2025",
     ]
+
     embeddings = service.embed_texts(texts_with_special_chars)
+
     assert len(embeddings) == 4
     for embedding in embeddings:
         assert len(embedding) == 384

+
 def test_similarity_makes_sense():
     """Test that semantically similar texts have similar embeddings"""
     service = EmbeddingService()
+
     # Similar texts
     text1 = "Employee remote work policy guidelines"
     text2 = "Guidelines for working from home policies"
+
     # Different text
     text3 = "Financial expense reimbursement procedures"
+
     embed1 = service.embed_text(text1)
     embed2 = service.embed_text(text2)
     embed3 = service.embed_text(text3)
+
     # Calculate simple cosine similarity (for validation)
     def cosine_similarity(a, b):
         import numpy as np
+
         a_np = np.array(a)
         b_np = np.array(b)
         return np.dot(a_np, b_np) / (np.linalg.norm(a_np) * np.linalg.norm(b_np))
+
     sim_1_2 = cosine_similarity(embed1, embed2)  # Similar texts
     sim_1_3 = cosine_similarity(embed1, embed3)  # Different texts
+
     # Similar texts should have higher similarity than different texts
     assert sim_1_2 > sim_1_3
     assert sim_1_2 > 0.5  # Should be reasonably similar

+
 def test_model_loading_performance():
     """Test that model loading doesn't happen repeatedly"""
     # This test ensures model is cached after first load
     import time
+
     start_time = time.time()
+    EmbeddingService()  # First service
     first_load_time = time.time() - start_time
+
     start_time = time.time()
+    EmbeddingService()  # Second service
     second_load_time = time.time() - start_time
+
     # Second initialization should be faster (model already cached)
     # Note: This might not always be true depending on implementation
     # but it's good to test the general behavior
+    assert second_load_time <= first_load_time * 2  # Allow some variance
tests/test_ingestion/__init__.py
CHANGED
@@ -1 +1 @@
# Test package for ingestion components
tests/test_ingestion/test_document_chunker.py
CHANGED
@@ -1,73 +1,74 @@
from src.ingestion.document_chunker import DocumentChunker


def test_chunk_by_characters():
    """Test basic character-based chunking"""
    chunker = DocumentChunker(chunk_size=50, overlap=10)

    text = "This is a test document. " * 10  # 250 characters
    chunks = chunker.chunk_text(text)

    assert len(chunks) > 1  # Should create multiple chunks
    assert all(len(chunk["content"]) <= 50 for chunk in chunks)

    # Test overlap
    if len(chunks) > 1:
        # Check that there's overlap between consecutive chunks
        assert chunks[0]["content"][-10:] in chunks[1]["content"][:20]


def test_chunk_with_metadata():
    """Test that chunks preserve document metadata"""
    chunker = DocumentChunker(chunk_size=100, overlap=20)

    doc_metadata = {"filename": "test.txt", "file_type": "txt", "source_id": "doc_001"}

    text = "Content that will be chunked. " * 20
    chunks = chunker.chunk_document(text, doc_metadata)

    for chunk in chunks:
        assert chunk["metadata"]["filename"] == "test.txt"
        assert chunk["metadata"]["file_type"] == "txt"
        assert "chunk_id" in chunk["metadata"]
        assert "chunk_index" in chunk["metadata"]


def test_reproducible_chunking():
    """Test that chunking is deterministic with fixed seed"""
    chunker1 = DocumentChunker(chunk_size=100, overlap=20, seed=42)
    chunker2 = DocumentChunker(chunk_size=100, overlap=20, seed=42)

    text = "This text will be chunked reproducibly. " * 30

    chunks1 = chunker1.chunk_text(text)
    chunks2 = chunker2.chunk_text(text)

    assert len(chunks1) == len(chunks2)
    for c1, c2 in zip(chunks1, chunks2):
        assert c1["content"] == c2["content"]


def test_empty_text_chunking():
    """Test handling of empty or very short text"""
    chunker = DocumentChunker(chunk_size=100, overlap=20)

    # Empty text
    chunks = chunker.chunk_text("")
    assert len(chunks) == 0

    # Very short text
    chunks = chunker.chunk_text("Short")
    assert len(chunks) == 1
    assert chunks[0]["content"] == "Short"


def test_chunk_real_policy_content():
    """Test chunking actual policy document content"""
    chunker = DocumentChunker(chunk_size=500, overlap=100, seed=42)

    # Use content that resembles our policy documents
    policy_content = (
        """# HR-POL-001: Employee Handbook

**Effective Date:** 2025-01-01
**Revision:** 1.1
@@ -83,54 +84,61 @@ Welcome to Innovate Inc.! We are thrilled to have you as part of our team. Our s

### 2.1. Code of Conduct

All employees must adhere to our code of conduct which emphasizes integrity, respect, and professionalism in all interactions."""
        * 3
    )

    doc_metadata = {
        "filename": "employee_handbook.md",
        "file_type": "md",
        "file_path": "/path/to/employee_handbook.md",
    }

    chunks = chunker.chunk_document(policy_content, doc_metadata)

    # Verify chunking worked
    assert len(chunks) > 1

    # Verify all chunks have proper metadata
    for i, chunk in enumerate(chunks):
        assert chunk["metadata"]["filename"] == "employee_handbook.md"
        assert chunk["metadata"]["file_type"] == "md"
        assert chunk["metadata"]["chunk_index"] == i
        assert "chunk_id" in chunk["metadata"]
        assert len(chunk["content"]) <= 500

    # Verify overlap exists between consecutive chunks
    if len(chunks) > 1:
        overlap_check = (
            chunks[0]["content"][-100:] in chunks[1]["content"][:200]
        )
        assert overlap_check


def test_chunk_metadata_inheritance():
    """Test that document metadata is properly inherited by chunks"""
    chunker = DocumentChunker(chunk_size=100, overlap=20)

    doc_metadata = {
        "filename": "test_policy.md",
        "file_type": "md",
        "file_size": 1500,
        "file_path": "/absolute/path/to/test_policy.md",
    }

    text = "Policy content goes here. " * 20
    chunks = chunker.chunk_document(text, doc_metadata)

    for chunk in chunks:
        # Original metadata should be preserved
        assert chunk["metadata"]["filename"] == "test_policy.md"
        assert chunk["metadata"]["file_type"] == "md"
        assert chunk["metadata"]["file_size"] == 1500
        expected_path = "/absolute/path/to/test_policy.md"
        assert chunk["metadata"]["file_path"] == expected_path

        # New chunk-specific metadata should be added
        assert "chunk_index" in chunk["metadata"]
        assert "chunk_id" in chunk["metadata"]
        assert "start_pos" in chunk["metadata"]
        assert "end_pos" in chunk["metadata"]
tests/test_ingestion/test_document_parser.py
CHANGED
@@ -1,85 +1,94 @@
import os
import tempfile
from pathlib import Path

import pytest


def test_parse_txt_file():
    """Test parsing a simple text file"""
    # Test will fail initially - we'll implement parser to make it pass
    from src.ingestion.document_parser import DocumentParser

    parser = DocumentParser()
    with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
        f.write("This is a test policy document.\nIt has multiple lines.")
        temp_path = f.name

    try:
        result = parser.parse_document(temp_path)
        assert (
            result["content"]
            == "This is a test policy document.\nIt has multiple lines."
        )
        assert result["metadata"]["filename"] == Path(temp_path).name
        assert result["metadata"]["file_type"] == "txt"
    finally:
        os.unlink(temp_path)


def test_parse_markdown_file():
    """Test parsing a markdown file"""
    from src.ingestion.document_parser import DocumentParser

    parser = DocumentParser()
    markdown_content = """# Policy Title

## Section 1
This is section content.

### Subsection
More content here."""

    with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f:
        f.write(markdown_content)
        temp_path = f.name

    try:
        result = parser.parse_document(temp_path)
        assert "Policy Title" in result["content"]
        assert "Section 1" in result["content"]
        assert result["metadata"]["file_type"] == "md"
    finally:
        os.unlink(temp_path)


def test_parse_unsupported_format():
    """Test handling of unsupported file formats"""
    from src.ingestion.document_parser import DocumentParser

    parser = DocumentParser()
    with pytest.raises(ValueError, match="Unsupported file format"):
        parser.parse_document("test.xyz")


def test_parse_nonexistent_file():
    """Test handling of non-existent files"""
    from src.ingestion.document_parser import DocumentParser

    parser = DocumentParser()
    with pytest.raises(FileNotFoundError):
        parser.parse_document("nonexistent.txt")


def test_parse_real_policy_document():
    """Test parsing an actual policy document from our corpus"""
    from src.ingestion.document_parser import DocumentParser

    parser = DocumentParser()
    # Use a real policy document from our corpus
    policy_path = "synthetic_policies/employee_handbook.md"

    result = parser.parse_document(policy_path)

    # Verify content structure
    assert "employee_handbook.md" in result["metadata"]["filename"]
    assert result["metadata"]["file_type"] == "md"
    assert "Employee Handbook" in result["content"]
    assert "HR-POL-001" in result["content"]
    assert len(result["content"]) > 100  # Should have substantial content

    # Verify metadata completeness
    assert "file_size" in result["metadata"]
    assert "file_path" in result["metadata"]
    assert result["metadata"]["file_size"] > 0
tests/test_ingestion/test_ingestion_pipeline.py
CHANGED
@@ -1,9 +1,12 @@
import os
import tempfile
from pathlib import Path

import pytest

from src.ingestion.ingestion_pipeline import IngestionPipeline


def test_full_ingestion_pipeline():
    """Test the complete ingestion pipeline end-to-end"""
    # Create temporary test documents
@@ -11,68 +14,73 @@ def test_full_ingestion_pipeline():
        # Create test files
        txt_file = Path(temp_dir) / "policy1.txt"
        md_file = Path(temp_dir) / "policy2.md"

        txt_file.write_text(
            "This is a text policy document with important information."
        )
        md_file.write_text("# Markdown Policy\n\nThis is markdown content.")

        # Initialize pipeline
        pipeline = IngestionPipeline(chunk_size=50, overlap=10, seed=42)

        # Process documents
        results = pipeline.process_directory(temp_dir)

        assert len(results) >= 2  # At least one result per file

        # Verify structure
        for result in results:
            assert "content" in result
            assert "metadata" in result
            assert "chunk_id" in result["metadata"]
            assert "filename" in result["metadata"]


def test_pipeline_reproducibility():
    """Test that pipeline produces consistent results"""
    with tempfile.TemporaryDirectory() as temp_dir:
        test_file = Path(temp_dir) / "test.txt"
        test_file.write_text("Test content for reproducibility. " * 20)

        pipeline1 = IngestionPipeline(chunk_size=100, overlap=20, seed=42)
        pipeline2 = IngestionPipeline(chunk_size=100, overlap=20, seed=42)

        results1 = pipeline1.process_directory(temp_dir)
        results2 = pipeline2.process_directory(temp_dir)

        assert len(results1) == len(results2)

        for r1, r2 in zip(results1, results2):
            assert r1["content"] == r2["content"]
            assert r1["metadata"]["chunk_id"] == r2["metadata"]["chunk_id"]


def test_pipeline_with_real_corpus():
    """Test pipeline with actual policy documents"""
    pipeline = IngestionPipeline(chunk_size=1000, overlap=200, seed=42)

    # Process just one real document to verify it works
    corpus_dir = "synthetic_policies"

    # Check if corpus directory exists
    if not Path(corpus_dir).exists():
        pytest.skip("Corpus directory not found - test requires synthetic_policies/")

    results = pipeline.process_directory(corpus_dir)

    # Should process all 22 documents
    assert len(results) > 20  # Should have many chunks from 22 documents

    # Verify all results have proper structure
    for result in results:
        assert "content" in result
        assert "metadata" in result
        assert "chunk_id" in result["metadata"]
        assert "filename" in result["metadata"]
        assert "file_type" in result["metadata"]
        assert result["metadata"]["file_type"] == "md"
        assert "chunk_index" in result["metadata"]


def test_pipeline_error_handling():
    """Test pipeline handles errors gracefully"""
@@ -80,87 +88,91 @@ def test_pipeline_error_handling():
        # Create valid and invalid files
        valid_file = Path(temp_dir) / "valid.md"
        invalid_file = Path(temp_dir) / "invalid.xyz"

        valid_file.write_text("# Valid Policy\n\nThis is valid content.")
        invalid_file.write_text("This file has unsupported format.")

        pipeline = IngestionPipeline(chunk_size=100, overlap=20, seed=42)

        # Should process valid file and skip invalid one
        results = pipeline.process_directory(temp_dir)

        # Should only get results from valid file
        assert len(results) >= 1

        # All results should be from valid file
        for result in results:
            assert result["metadata"]["filename"] == "valid.md"


def test_pipeline_single_file():
    """Test processing a single file"""
    pipeline = IngestionPipeline(chunk_size=100, overlap=20, seed=42)

    with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f:
        f.write("# Test Policy\n\n" + "Content section. " * 20)
        temp_path = f.name

    try:
        results = pipeline.process_file(temp_path)

        # Should get multiple chunks due to length
        assert len(results) > 1

        # All chunks should have same filename
        filename = Path(temp_path).name
        for result in results:
            assert result["metadata"]["filename"] == filename
            assert result["metadata"]["file_type"] == "md"
            assert "chunk_index" in result["metadata"]

    finally:
        os.unlink(temp_path)


def test_pipeline_empty_directory():
    """Test pipeline with empty directory"""
    with tempfile.TemporaryDirectory() as temp_dir:
        pipeline = IngestionPipeline(chunk_size=100, overlap=20, seed=42)

        results = pipeline.process_directory(temp_dir)

        # Should return empty list for empty directory
        assert len(results) == 0


def test_pipeline_nonexistent_directory():
    """Test pipeline with non-existent directory"""
    pipeline = IngestionPipeline(chunk_size=100, overlap=20, seed=42)

    with pytest.raises(FileNotFoundError):
        pipeline.process_directory("/nonexistent/directory")


def test_pipeline_configuration():
    """Test pipeline configuration options"""
    # Test different configurations
    pipeline_small = IngestionPipeline(chunk_size=50, overlap=10, seed=42)
    pipeline_large = IngestionPipeline(chunk_size=200, overlap=50, seed=42)

    with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
        content = "Policy content goes here. " * 30  # 780 characters
        f.write(content)
        temp_path = f.name

    try:
        results_small = pipeline_small.process_file(temp_path)
        results_large = pipeline_large.process_file(temp_path)

        # Small chunks should create more chunks
        assert len(results_small) > len(results_large)

        # All chunks should respect size limits
        for result in results_small:
            assert len(result["content"]) <= 50

        for result in results_large:
            assert len(result["content"]) <= 200

    finally:
        os.unlink(temp_path)
tests/test_integration.py
CHANGED
@@ -1,9 +1,7 @@
"""Integration tests for Phase 2A components."""

import shutil
import tempfile

from src.embedding.embedding_service import EmbeddingService
from src.vector_store.vector_db import VectorDatabase
@@ -11,101 +9,122 @@ from src.vector_store.vector_db import VectorDatabase

class TestPhase2AIntegration:
    """Test integration between EmbeddingService and VectorDatabase"""

    def setup_method(self):
        """Set up test environment with temporary database"""
        self.test_dir = tempfile.mkdtemp()
        self.embedding_service = EmbeddingService()
        self.vector_db = VectorDatabase(
            persist_path=self.test_dir, collection_name="test_integration"
        )

    def teardown_method(self):
        """Clean up temporary resources"""
        if hasattr(self, "test_dir"):
            shutil.rmtree(self.test_dir, ignore_errors=True)

    def test_embedding_vector_storage_workflow(self):
        """Test complete workflow: text → embedding → storage → search"""

        # Sample policy texts
        documents = [
            (
                "Employees must complete security training annually to "
                "maintain access to company systems."
            ),
            (
                "Remote work policy allows employees to work from home up to "
                "3 days per week."
            ),
            (
                "All expenses over $500 require manager approval before "
                "reimbursement."
            ),
            (
                "Code review is mandatory for all pull requests before "
                "merging to main branch."
            ),
        ]

        # Generate embeddings
        embeddings = self.embedding_service.embed_texts(documents)

        # Verify embeddings were generated
        assert len(embeddings) == len(documents)
        assert all(
            len(emb) == self.embedding_service.get_embedding_dimension()
            for emb in embeddings
        )

        # Store embeddings with metadata (using existing collection)
        doc_ids = [f"doc_{i}" for i in range(len(documents))]
        metadatas = [{"type": "policy", "doc_id": doc_id} for doc_id in doc_ids]

        success = self.vector_db.add_embeddings(
            embeddings=embeddings,
            chunk_ids=doc_ids,
            documents=documents,
            metadatas=metadatas,
        )

        assert success is True

        # Test search functionality
        query = "remote work from home policy"
        query_embedding = self.embedding_service.embed_text(query)

        results = self.vector_db.search(query_embedding=query_embedding, top_k=2)

        # Verify search results (should return list of dictionaries)
        assert isinstance(results, list)
        assert len(results) <= 2  # Should return at most 2 results

        if results:  # If we have results
            assert all(isinstance(result, dict) for result in results)
            # Check that at least one result contains remote work related content
            documents_found = [result.get("document", "") for result in results]
            remote_work_found = any(
                "remote work" in doc.lower() or "work from home" in doc.lower()
                for doc in documents_found
            )
            assert remote_work_found

    def test_basic_embedding_dimension_consistency(self):
        """Test that embeddings have consistent dimensions"""

        # Test different text lengths
        texts = [
            "Short text.",
            (
                "This is a medium length text with several words to test "
                "embedding consistency."
            ),
            (
                "This is a much longer text that contains multiple sentences "
                "and various types of content to ensure that the embedding "
                "service can handle longer inputs without issues and still "
                "produce consistent dimensional output vectors."
            ),
        ]

        # Generate embeddings
        embeddings = self.embedding_service.embed_texts(texts)

        # All embeddings should have the same dimension
        dimensions = [len(emb) for emb in embeddings]
        assert all(dim == dimensions[0] for dim in dimensions)

        # Dimension should match the service's reported dimension
        assert dimensions[0] == self.embedding_service.get_embedding_dimension()

    def test_empty_collection_handling(self):
        """Test behavior with empty collection"""

        # Search in empty collection
        query_embedding = self.embedding_service.embed_text("test query")

        results = self.vector_db.search(query_embedding=query_embedding, top_k=5)

        # Should handle empty collection gracefully
        assert isinstance(results, list)
        assert len(results) == 0
tests/test_vector_store/__init__.py
CHANGED
@@ -1 +1 @@
# Test package for vector store components
tests/test_vector_store/test_vector_db.py
CHANGED
@@ -1,187 +1,197 @@
import tempfile

import pytest

from src.vector_store.vector_db import VectorDatabase


def test_vector_database_initialization():
    """Test VectorDatabase initialization and connection"""
    with tempfile.TemporaryDirectory() as temp_dir:
        # Test will fail initially - we'll implement VectorDatabase to make it pass
        db = VectorDatabase(persist_path=temp_dir, collection_name="test_collection")

        # Should create connection successfully
        assert db is not None
        assert db.collection_name == "test_collection"
        assert db.persist_path == temp_dir


def test_create_collection():
    """Test creating a new collection"""
    with tempfile.TemporaryDirectory() as temp_dir:
        db = VectorDatabase(persist_path=temp_dir, collection_name="test_docs")

        # Collection should be created
        collection = db.get_collection()
        assert collection is not None
        assert collection.name == "test_docs"


def test_add_embeddings():
    """Test adding embeddings to the database"""
    with tempfile.TemporaryDirectory() as temp_dir:
        db = VectorDatabase(persist_path=temp_dir, collection_name="test_docs")

        # Sample data
        embeddings = [
            [0.1, 0.2, 0.3, 0.4],  # 4-dimensional for testing
            [0.5, 0.6, 0.7, 0.8],
            [0.9, 1.0, 1.1, 1.2],
        ]

        chunk_ids = ["chunk_1", "chunk_2", "chunk_3"]

        documents = [
            "This is the first document chunk.",
            "This is the second document chunk.",
            "This is the third document chunk.",
        ]

        metadatas = [
            {"filename": "doc1.md", "chunk_index": 0},
            {"filename": "doc1.md", "chunk_index": 1},
            {"filename": "doc2.md", "chunk_index": 0},
        ]

        # Add embeddings
        result = db.add_embeddings(
            embeddings=embeddings,
            chunk_ids=chunk_ids,
            documents=documents,
            metadatas=metadatas,
        )

        # Should return success
        assert result is True

        # Verify count
        count = db.get_count()
        assert count == 3


def test_search_embeddings():
    """Test searching for similar embeddings"""
    with tempfile.TemporaryDirectory() as temp_dir:
        db = VectorDatabase(persist_path=temp_dir, collection_name="test_docs")

        # Add some test data first
        embeddings = [
            [1.0, 0.0, 0.0, 0.0],  # Distinct embeddings for testing
            [0.0, 1.0, 0.0, 0.0],
            [0.0, 0.0, 1.0, 0.0],
            [0.0, 0.0, 0.0, 1.0],
        ]

        chunk_ids = ["chunk_1", "chunk_2", "chunk_3", "chunk_4"]
        documents = ["Doc 1", "Doc 2", "Doc 3", "Doc 4"]
        metadatas = [{"index": i} for i in range(4)]

        db.add_embeddings(embeddings, chunk_ids, documents, metadatas)

        # Search for similar to first embedding
        query_embedding = [1.0, 0.0, 0.0, 0.0]
        results = db.search(query_embedding, top_k=2)

        # Should return results
        assert len(results) <= 2
        assert len(results) > 0

        # First result should be the exact match
        assert results[0]["id"] == "chunk_1"
        assert "distance" in results[0]
        assert "document" in results[0]
        assert "metadata" in results[0]


def test_delete_collection():
    """Test deleting a collection"""
    with tempfile.TemporaryDirectory() as temp_dir:
        db = VectorDatabase(persist_path=temp_dir, collection_name="test_docs")

        # Add some data
        embeddings = [[0.1, 0.2, 0.3, 0.4]]
        chunk_ids = ["chunk_1"]
        documents = ["Test doc"]
        metadatas = [{"test": True}]

        db.add_embeddings(embeddings, chunk_ids, documents, metadatas)
        assert db.get_count() == 1

        # Delete collection
        db.delete_collection()

        # Should be empty after recreation
        db = VectorDatabase(persist_path=temp_dir, collection_name="test_docs")
        assert db.get_count() == 0


def test_persistence():
    """Test that data persists across database instances"""
    with tempfile.TemporaryDirectory() as temp_dir:
        # Create first instance and add data
        db1 = VectorDatabase(persist_path=temp_dir, collection_name="persistent_test")

        embeddings = [[0.1, 0.2, 0.3, 0.4]]
        chunk_ids = ["persistent_chunk"]
        documents = ["Persistent document"]
        metadatas = [{"persistent": True}]

        db1.add_embeddings(embeddings, chunk_ids, documents, metadatas)
        assert db1.get_count() == 1

        # Create second instance with same path
        db2 = VectorDatabase(persist_path=temp_dir, collection_name="persistent_test")

        # Should have the same data
        assert db2.get_count() == 1

        # Should be able to search and find the data
        results = db2.search([0.1, 0.2, 0.3, 0.4], top_k=1)
        assert len(results) == 1
        assert results[0]["id"] == "persistent_chunk"


def test_error_handling():
    """Test error handling for various edge cases"""
    with tempfile.TemporaryDirectory() as temp_dir:
        db = VectorDatabase(persist_path=temp_dir, collection_name="error_test")

        # Test empty search
        results = db.search([0.1, 0.2, 0.3, 0.4], top_k=5)
        assert results == []

        # Test adding mismatched data
        with pytest.raises((ValueError, Exception)):
            db.add_embeddings(
                embeddings=[[0.1, 0.2]],  # 2D
                chunk_ids=["chunk_1", "chunk_2"],  # 2 IDs but 1 embedding
                documents=["Doc 1"],  # 1 document
                metadatas=[{"test": True}],  # 1 metadata
            )


def test_batch_operations():
    """Test batch operations for performance"""
    with tempfile.TemporaryDirectory() as temp_dir:
        db = VectorDatabase(persist_path=temp_dir, collection_name="batch_test")

        # Create larger batch for testing
        batch_size = 50
        embeddings = [
            [float(i), float(i + 1), float(i + 2), float(i + 3)]
            for i in range(batch_size)
        ]
        chunk_ids = [f"chunk_{i}" for i in range(batch_size)]
        documents = [f"Document {i} content" for i in range(batch_size)]
        metadatas = [{"batch_index": i, "test_batch": True} for i in range(batch_size)]

        # Should handle batch operations
        result = db.add_embeddings(embeddings, chunk_ids, documents, metadatas)
        assert result is True
        assert db.get_count() == batch_size

        # Should handle batch search
        query_embedding = [0.0, 1.0, 2.0, 3.0]
        results = db.search(query_embedding, top_k=10)
        assert len(results) == 10  # Should return requested number