msse-ai-engineering / build_embeddings.py
Seth McKnight
Comprehensive memory optimizations and embedding service updates (#74)
32e4125
#!/usr/bin/env python3
"""
Script to rebuild the vector database with embeddings locally.
Run this when you update the synthetic_policies documents.
"""
import logging
import sys
from pathlib import Path
# Put the sibling src/ directory at the front of the module search path so
# project modules can be imported when this script is run directly.
sys.path.insert(0, str(Path(__file__).parent.joinpath("src")))
def main():
    """Rebuild the vector database from the corpus and verify the result.

    Deletes any existing database at ``VECTOR_DB_PERSIST_PATH``, runs the
    ingestion pipeline over ``CORPUS_DIRECTORY`` with embedding storage
    enabled, then reopens the database to report its embedding count and
    embedding dimension.

    Returns:
        int: Process exit code. ``0`` when the database was rebuilt and
        passed verification; ``1`` when no chunks were processed, when the
        rebuilt database reports zero embeddings, or when the stored
        embedding dimension differs from ``EMBEDDING_DIMENSION``.
    """
    # Standard-library helpers needed only by this entry point.
    import gc
    import shutil

    logging.basicConfig(level=logging.INFO)
    print("🔄 Building embeddings database...")

    # Project imports are deferred until call time, i.e. after the
    # module-level sys.path setup has run.
    from src.config import (
        COLLECTION_NAME,
        CORPUS_DIRECTORY,
        DEFAULT_CHUNK_SIZE,
        DEFAULT_OVERLAP,
        EMBEDDING_DIMENSION,
        EMBEDDING_MODEL_NAME,
        RANDOM_SEED,
        VECTOR_DB_PERSIST_PATH,
    )
    from src.ingestion.ingestion_pipeline import IngestionPipeline
    from src.vector_store.vector_db import VectorDatabase

    print(f"📁 Processing corpus: {CORPUS_DIRECTORY}")
    print(f"🤖 Using model: {EMBEDDING_MODEL_NAME}")
    print(f"📊 Target dimension: {EMBEDDING_DIMENSION}")

    # Clear existing database so the rebuild starts from an empty store.
    if Path(VECTOR_DB_PERSIST_PATH).exists():
        print(f"🗑️ Clearing existing database: {VECTOR_DB_PERSIST_PATH}")
        shutil.rmtree(VECTOR_DB_PERSIST_PATH)

    # Run ingestion pipeline
    ingestion_pipeline = IngestionPipeline(
        chunk_size=DEFAULT_CHUNK_SIZE,
        overlap=DEFAULT_OVERLAP,
        seed=RANDOM_SEED,
        store_embeddings=True,
    )
    result = ingestion_pipeline.process_directory_with_embeddings(CORPUS_DIRECTORY)
    chunks_processed = result["chunks_processed"]
    embeddings_stored = result["embeddings_stored"]

    if chunks_processed == 0:
        print("❌ Ingestion failed or processed 0 chunks")
        return 1

    # Verify database by reopening it and reading back what was persisted.
    vector_db = VectorDatabase(VECTOR_DB_PERSIST_PATH, COLLECTION_NAME)
    count = vector_db.get_count()
    dimension = vector_db.get_embedding_dimension()

    print(f"✅ Successfully processed {chunks_processed} chunks")
    print(f"🔗 Embeddings stored: {embeddings_stored}")
    print(f"📊 Database contains {count} embeddings")
    print(f"🔢 Embedding dimension: {dimension}")

    # An empty database must never be reported as ready for deployment,
    # whatever dimension the store reports for it.
    if count == 0:
        print("❌ Database contains no embeddings after ingestion")
        return 1

    if dimension != EMBEDDING_DIMENSION:
        print(f"⚠️ Warning: Expected dimension {EMBEDDING_DIMENSION}, got {dimension}")
        return 1

    print("🎉 Embeddings database ready for deployment!")
    print("💡 Don't forget to commit the data/ directory to git")

    # Clean up memory after build
    gc.collect()
    print("🧹 Memory cleanup completed")
    return 0
# When executed as a script, run the build and hand main()'s return value to
# the interpreter as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())