msse-ai-engineering / init_memory_optimized.py
Seth McKnight
Comprehensive memory optimizations and embedding service updates (#76)
f75da29
#!/usr/bin/env python3
"""
Memory optimization and database initialization script for Render deployment.
"""
import logging
import os
import sys
from src.utils.memory_utils import clean_memory, log_memory_usage
# Put the "src" directory that sits next to this script at the front of the
# module search path.
# NOTE(review): the `src.`-prefixed import above executes before this line, so
# it cannot depend on this entry -- confirm whether anything still needs it.
_SRC_DIR = os.path.join(os.path.dirname(__file__), "src")
sys.path.insert(0, _SRC_DIR)
def initialize_vector_store():
    """Ensure the vector store holds valid embeddings, ingesting if needed.

    Opens the vector database configured by ``VECTOR_DB_PERSIST_PATH`` and
    ``COLLECTION_NAME``. When ``has_valid_embeddings(EMBEDDING_DIMENSION)``
    is falsy, the directory ``CORPUS_DIRECTORY`` is processed through an
    ``IngestionPipeline`` created with ``store_embeddings=True``; otherwise
    the current embedding count and dimension are logged and nothing is
    rebuilt. Memory usage is logged at the start and on success, and
    ``clean_memory`` is called immediately before and after ingestion.

    Project modules are imported at call time, outside the ``try`` block, so
    an import failure propagates to the caller instead of returning ``False``.

    Returns:
        bool: ``True`` if the store was already valid or ingestion returned a
        non-empty result. ``False`` if ingestion returned an empty/falsy
        result or any exception was raised while checking or ingesting (the
        exception is logged with its traceback and not re-raised).
    """
    from src.config import (
        COLLECTION_NAME,
        CORPUS_DIRECTORY,
        DEFAULT_CHUNK_SIZE,
        DEFAULT_OVERLAP,
        EMBEDDING_DIMENSION,
        RANDOM_SEED,
        VECTOR_DB_PERSIST_PATH,
    )
    from src.ingestion.ingestion_pipeline import IngestionPipeline
    from src.vector_store.vector_db import VectorDatabase

    log_memory_usage("Vector store initialization start")

    try:
        # Open the database first so its current state can be inspected.
        vector_db = VectorDatabase(VECTOR_DB_PERSIST_PATH, COLLECTION_NAME)

        if vector_db.has_valid_embeddings(EMBEDDING_DIMENSION):
            # Nothing to rebuild: just report what is already stored.
            logging.info(
                f"Vector store is valid with {vector_db.get_count()} embeddings "
                f"of dimension {vector_db.get_embedding_dimension()}"
            )
        else:
            logging.info("Vector store needs initialization - running ingestion")

            # Free what we can before the ingestion run starts.
            clean_memory("Before ingestion")

            ingestion_pipeline = IngestionPipeline(
                chunk_size=DEFAULT_CHUNK_SIZE,
                overlap=DEFAULT_OVERLAP,
                seed=RANDOM_SEED,
                store_embeddings=True,
            )
            results = ingestion_pipeline.process_directory(CORPUS_DIRECTORY)

            # An empty or missing result means nothing was ingested.
            if not results:
                logging.error("Ingestion failed or processed 0 chunks")
                return False

            logging.info(f"Ingestion completed: {len(results)} chunks processed")
            clean_memory("After ingestion")

        log_memory_usage("Vector store initialization complete")
        return True

    except Exception as e:
        # Top-level boundary for this step: report failure via the return
        # value, but keep the traceback in the log so the cause is diagnosable.
        logging.exception("Vector store initialization failed: %s", e)
        return False
def main():
    """Run the initialization script and return a process exit code.

    Configures root logging at INFO level, logs and cleans memory at startup,
    then calls ``initialize_vector_store()``.

    Returns:
        int: ``0`` when initialization reports success, ``1`` otherwise.
    """
    logging.basicConfig(
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        level=logging.INFO,
    )

    log_memory_usage("Script start")
    clean_memory("Script startup")

    # Guard clause: bail out with a non-zero code as soon as setup fails.
    if not initialize_vector_store():
        logging.error("Initialization failed")
        return 1

    logging.info("Memory optimization and initialization completed successfully")
    log_memory_usage("Script end")
    return 0
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())