#!/usr/bin/env python3
"""
Script to rebuild the vector database with embeddings locally.
Run this when you update the synthetic_policies documents.
"""
import logging
import sys
from pathlib import Path

# Put the sibling "src" directory first on sys.path so modules under it can be
# imported when this file is run directly as a script.
sys.path.insert(0, str(Path(__file__).parent / "src"))
def main() -> int:
    """Rebuild the embeddings database for the corpus from scratch.

    Deletes any existing database directory at ``VECTOR_DB_PERSIST_PATH``,
    runs the ingestion pipeline over ``CORPUS_DIRECTORY`` with embedding
    storage enabled, then re-opens the database and compares the stored
    embedding dimension with ``EMBEDDING_DIMENSION``.

    Returns:
        0 when the build succeeds; 1 when the pipeline reports zero processed
        chunks or the stored embedding dimension differs from the configured
        one. The value is intended to be used as the process exit code.
    """
    import gc
    import shutil

    logging.basicConfig(level=logging.INFO)
    print("π Building embeddings database...")

    # Project modules are imported here, not at module top, so the
    # module-level sys.path setup has already run when they are resolved.
    from src.config import (
        COLLECTION_NAME,
        CORPUS_DIRECTORY,
        DEFAULT_CHUNK_SIZE,
        DEFAULT_OVERLAP,
        EMBEDDING_DIMENSION,
        EMBEDDING_MODEL_NAME,
        RANDOM_SEED,
        VECTOR_DB_PERSIST_PATH,
    )
    from src.ingestion.ingestion_pipeline import IngestionPipeline
    from src.vector_store.vector_db import VectorDatabase

    print(f"π Processing corpus: {CORPUS_DIRECTORY}")
    print(f"π€ Using model: {EMBEDDING_MODEL_NAME}")
    print(f"π Target dimension: {EMBEDDING_DIMENSION}")

    # Start from a clean slate: remove the persisted database directory if one
    # is left over from a previous build.
    if Path(VECTOR_DB_PERSIST_PATH).exists():
        print(f"ποΈ Clearing existing database: {VECTOR_DB_PERSIST_PATH}")
        shutil.rmtree(VECTOR_DB_PERSIST_PATH)

    # Run the ingestion pipeline over the whole corpus directory.
    ingestion_pipeline = IngestionPipeline(
        chunk_size=DEFAULT_CHUNK_SIZE,
        overlap=DEFAULT_OVERLAP,
        seed=RANDOM_SEED,
        store_embeddings=True,
    )
    result = ingestion_pipeline.process_directory_with_embeddings(CORPUS_DIRECTORY)
    chunks_processed = result["chunks_processed"]
    embeddings_stored = result["embeddings_stored"]

    if chunks_processed == 0:
        print("β Ingestion failed or processed 0 chunks")
        return 1

    # Re-open the persisted database and report what it actually contains.
    vector_db = VectorDatabase(VECTOR_DB_PERSIST_PATH, COLLECTION_NAME)
    count = vector_db.get_count()
    dimension = vector_db.get_embedding_dimension()

    print(f"β Successfully processed {chunks_processed} chunks")
    print(f"π Embeddings stored: {embeddings_stored}")
    print(f"π Database contains {count} embeddings")
    print(f"π’ Embedding dimension: {dimension}")

    # A dimension mismatch is treated as a failed build, not just a warning.
    if dimension != EMBEDDING_DIMENSION:
        print(f"β οΈ Warning: Expected dimension {EMBEDDING_DIMENSION}, got {dimension}")
        return 1

    print("π Embeddings database ready for deployment!")
    print("π‘ Don't forget to commit the data/ directory to git")

    # Clean up memory after build (only reached on the success path).
    gc.collect()
    print("π§Ή Memory cleanup completed")
    return 0
# Script entry point: run the build and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())