#!/usr/bin/env python3
"""
Script to rebuild the vector database with embeddings locally.
Run this when you update the synthetic_policies documents.
"""

import gc
import logging
import shutil
import sys
from pathlib import Path

# Add src to path so we can import modules.
# NOTE(review): the imports in main() use the ``src.`` package prefix, which
# resolves through the script's own directory (already on sys.path when this
# file is run directly); this entry presumably serves modules that import
# without the prefix -- confirm before removing.
sys.path.insert(0, str(Path(__file__).parent / "src"))


def main() -> int:
    """Rebuild the embeddings database for the corpus and verify the result.

    Steps:
        1. Remove the existing persisted database at ``VECTOR_DB_PERSIST_PATH``
           if that path exists.
        2. Run ``IngestionPipeline.process_directory_with_embeddings`` over
           ``CORPUS_DIRECTORY`` with embedding storage enabled.
        3. Re-open the vector database and report its count and embedding
           dimension.

    Returns:
        ``0`` when the build succeeds, ``1`` when ingestion reports zero
        processed chunks or when the stored embedding dimension differs from
        ``EMBEDDING_DIMENSION``. The value is used as the process exit code.
    """
    logging.basicConfig(level=logging.INFO)

    print("🔄 Building embeddings database...")

    # Project imports are deferred until the sys.path setup above has run.
    from src.config import (
        COLLECTION_NAME,
        CORPUS_DIRECTORY,
        DEFAULT_CHUNK_SIZE,
        DEFAULT_OVERLAP,
        EMBEDDING_DIMENSION,
        EMBEDDING_MODEL_NAME,
        RANDOM_SEED,
        VECTOR_DB_PERSIST_PATH,
    )
    from src.ingestion.ingestion_pipeline import IngestionPipeline
    from src.vector_store.vector_db import VectorDatabase

    print(f"📁 Processing corpus: {CORPUS_DIRECTORY}")
    print(f"🤖 Using model: {EMBEDDING_MODEL_NAME}")
    print(f"📊 Target dimension: {EMBEDDING_DIMENSION}")

    # Clear existing database so the rebuild starts from an empty store.
    if Path(VECTOR_DB_PERSIST_PATH).exists():
        print(f"🗑️ Clearing existing database: {VECTOR_DB_PERSIST_PATH}")
        shutil.rmtree(VECTOR_DB_PERSIST_PATH)

    # Run ingestion pipeline
    ingestion_pipeline = IngestionPipeline(
        chunk_size=DEFAULT_CHUNK_SIZE,
        overlap=DEFAULT_OVERLAP,
        seed=RANDOM_SEED,
        store_embeddings=True,
    )
    result = ingestion_pipeline.process_directory_with_embeddings(CORPUS_DIRECTORY)

    chunks_processed = result["chunks_processed"]
    embeddings_stored = result["embeddings_stored"]

    if chunks_processed == 0:
        print("❌ Ingestion failed or processed 0 chunks")
        return 1

    # Verify database by re-opening it and reading back what was persisted.
    vector_db = VectorDatabase(VECTOR_DB_PERSIST_PATH, COLLECTION_NAME)
    count = vector_db.get_count()
    dimension = vector_db.get_embedding_dimension()

    print(f"✅ Successfully processed {chunks_processed} chunks")
    print(f"🔗 Embeddings stored: {embeddings_stored}")
    print(f"📊 Database contains {count} embeddings")
    print(f"🔢 Embedding dimension: {dimension}")

    # A dimension mismatch is reported as a failed build (non-zero exit).
    if dimension != EMBEDDING_DIMENSION:
        print(f"⚠️ Warning: Expected dimension {EMBEDDING_DIMENSION}, got {dimension}")
        return 1

    print("🎉 Embeddings database ready for deployment!")
    print("💡 Don't forget to commit the data/ directory to git")

    # Clean up memory after build
    gc.collect()
    print("🧹 Memory cleanup completed")

    return 0


if __name__ == "__main__":
    sys.exit(main())