#!/usr/bin/env python3
"""
Memory optimization and database initialization script for Render deployment.

Running the script checks the persisted vector store and, when it does not
hold valid embeddings, runs the ingestion pipeline over the corpus directory.
The process exit code is 0 on success and 1 on failure.
"""

import logging
import os
import sys

from src.utils.memory_utils import clean_memory, log_memory_usage

# Put the sibling "src" directory at the front of sys.path. Note that this
# runs after the import above, so it only affects imports performed later.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src"))


def initialize_vector_store():
    """Initialize the vector store with memory management.

    Opens the vector database and asks it whether it already holds valid
    embeddings of ``EMBEDDING_DIMENSION``. If it does not, the ingestion
    pipeline is run over ``CORPUS_DIRECTORY`` with embedding storage enabled,
    with a memory clean-up before and after.

    Returns:
        bool: ``True`` when the store was already valid or ingestion returned
        a non-empty result; ``False`` when ingestion returned nothing or any
        exception was raised while checking or rebuilding the store.
    """
    # Project imports are performed at call time, as in the original script.
    from src.config import (
        COLLECTION_NAME,
        CORPUS_DIRECTORY,
        DEFAULT_CHUNK_SIZE,
        DEFAULT_OVERLAP,
        EMBEDDING_DIMENSION,
        RANDOM_SEED,
        VECTOR_DB_PERSIST_PATH,
    )
    from src.ingestion.ingestion_pipeline import IngestionPipeline
    from src.vector_store.vector_db import VectorDatabase

    log_memory_usage("Vector store initialization start")

    try:
        # Initialize vector database to check its state
        vector_db = VectorDatabase(VECTOR_DB_PERSIST_PATH, COLLECTION_NAME)

        # Fast path: embeddings exist and have the correct dimension
        if vector_db.has_valid_embeddings(EMBEDDING_DIMENSION):
            logging.info(
                f"Vector store is valid with {vector_db.get_count()} embeddings "
                f"of dimension {vector_db.get_embedding_dimension()}"
            )
            log_memory_usage("Vector store initialization complete")
            return True

        logging.info("Vector store needs initialization - running ingestion")

        # Clean memory before starting ingestion
        clean_memory("Before ingestion")

        # Run ingestion pipeline to rebuild embeddings
        ingestion_pipeline = IngestionPipeline(
            chunk_size=DEFAULT_CHUNK_SIZE,
            overlap=DEFAULT_OVERLAP,
            seed=RANDOM_SEED,
            store_embeddings=True,
        )

        # Process the corpus directory
        results = ingestion_pipeline.process_directory(CORPUS_DIRECTORY)

        # A falsy result covers both None and an empty collection.
        if not results:
            logging.error("Ingestion failed or processed 0 chunks")
            return False

        logging.info(f"Ingestion completed: {len(results)} chunks processed")

        clean_memory("After ingestion")

        log_memory_usage("Vector store initialization complete")
        return True

    except Exception as e:
        # Top-level boundary for the script: report failure via the return
        # value, but keep the traceback in the log so the cause is diagnosable.
        logging.exception(f"Vector store initialization failed: {e}")
        return False


def main():
    """Main initialization function.

    Configures root logging at INFO level, logs and cleans memory, then
    initializes the vector store.

    Returns:
        int: ``0`` when initialization succeeded, ``1`` otherwise; used as
        the process exit code.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    log_memory_usage("Script start")

    # Clean memory at start
    clean_memory("Script startup")

    # Initialize vector store
    success = initialize_vector_store()

    if not success:
        logging.error("Initialization failed")
        return 1

    logging.info("Memory optimization and initialization completed successfully")
    log_memory_usage("Script end")
    return 0


if __name__ == "__main__":
    sys.exit(main())