File size: 3,008 Bytes
f75da29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python3
"""
Memory optimization and database initialization script for Render deployment.
"""

import logging
import os
import sys

from src.utils.memory_utils import clean_memory, log_memory_usage

# Prepend the "src" directory located next to this script to the module
# search path.
# NOTE(review): this runs after the `src.utils.memory_utils` import above, so
# it cannot influence that import - confirm the intended ordering.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src"))


def initialize_vector_store():
    """Ensure the vector store holds usable embeddings, ingesting if needed.

    Opens the vector database configured by ``VECTOR_DB_PERSIST_PATH`` and
    ``COLLECTION_NAME``. When ``has_valid_embeddings(EMBEDDING_DIMENSION)``
    reports False, the ingestion pipeline is run over ``CORPUS_DIRECTORY``
    with embeddings stored; otherwise the existing store is left untouched
    and its count and embedding dimension are logged.

    Memory usage is logged at the start and on successful completion, and
    ``clean_memory`` is invoked before and after a successful ingestion run.

    Returns:
        bool: True when the store was already valid or ingestion returned a
        non-empty result. False when ingestion returned nothing, or when any
        exception was raised by the steps inside the ``try`` block (the
        exception is logged with its traceback and not re-raised).

    Note:
        The project imports below happen before the ``try`` block, so an
        ImportError from them propagates to the caller instead of being
        converted into a False return value.
    """
    from src.config import (
        COLLECTION_NAME,
        CORPUS_DIRECTORY,
        DEFAULT_CHUNK_SIZE,
        DEFAULT_OVERLAP,
        EMBEDDING_DIMENSION,
        RANDOM_SEED,
        VECTOR_DB_PERSIST_PATH,
    )
    from src.ingestion.ingestion_pipeline import IngestionPipeline
    from src.vector_store.vector_db import VectorDatabase

    log_memory_usage("Vector store initialization start")

    try:
        # Open the database first so its current state can be inspected.
        vector_db = VectorDatabase(VECTOR_DB_PERSIST_PATH, COLLECTION_NAME)

        if vector_db.has_valid_embeddings(EMBEDDING_DIMENSION):
            logging.info(
                f"Vector store is valid with {vector_db.get_count()} embeddings "
                f"of dimension {vector_db.get_embedding_dimension()}"
            )
        else:
            logging.info("Vector store needs initialization - running ingestion")

            # Free whatever can be freed before the ingestion run starts.
            clean_memory("Before ingestion")

            ingestion_pipeline = IngestionPipeline(
                chunk_size=DEFAULT_CHUNK_SIZE,
                overlap=DEFAULT_OVERLAP,
                seed=RANDOM_SEED,
                store_embeddings=True,
            )
            results = ingestion_pipeline.process_directory(CORPUS_DIRECTORY)

            # An empty or missing result means nothing was ingested.
            if not results:
                logging.error("Ingestion failed or processed 0 chunks")
                return False

            logging.info(f"Ingestion completed: {len(results)} chunks processed")
            clean_memory("After ingestion")

        log_memory_usage("Vector store initialization complete")
        return True

    except Exception as e:
        # Top-level boundary for this script: report failure through the
        # return value, but keep the traceback in the log so the failing
        # step can be identified from deployment logs.
        logging.exception(f"Vector store initialization failed: {e}")
        return False


def main():
    """Configure logging, run the vector store initialization, report the result.

    Returns:
        int: 0 when ``initialize_vector_store()`` reports success, 1 otherwise.
    """
    logging.basicConfig(
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        level=logging.INFO,
    )

    # Record the baseline, then free memory before the real work begins.
    log_memory_usage("Script start")
    clean_memory("Script startup")

    # Bail out early on failure so the success path reads straight through.
    if not initialize_vector_store():
        logging.error("Initialization failed")
        return 1

    logging.info("Memory optimization and initialization completed successfully")
    log_memory_usage("Script end")
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())