File size: 1,843 Bytes
ffa0f3d
 
dca679b
 
ffa0f3d
 
 
 
 
 
7793bb6
ffa0f3d
 
7793bb6
afecdc5
 
15f6c83
dca679b
 
afecdc5
0a7f9b4
afecdc5
 
dca679b
f75da29
 
 
f60c17f
f75da29
 
dca679b
 
 
 
48155ff
dca679b
48155ff
afecdc5
f60c17f
afecdc5
dca679b
 
 
 
 
159faf0
 
dca679b
afecdc5
 
 
7793bb6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
"""Configuration settings for the ingestion pipeline"""

import os

# Ingestion defaults.
# NOTE(review): the unit of chunk size / overlap (characters vs. tokens) is
# decided by whichever chunker reads these values - confirm there.
DEFAULT_CHUNK_SIZE: int = 1_000
DEFAULT_OVERLAP: int = 200
RANDOM_SEED: int = 42

# File extensions (leading dot included) listed as supported formats
SUPPORTED_FORMATS: set = {
    ".markdown",
    ".md",
    ".txt",
}

# Name of the corpus directory
CORPUS_DIRECTORY: str = "synthetic_policies"

# Vector Database Settings
# Backend selector, read once at import time from the environment.
# Expected values are "chroma" or "postgres"; the raw value is used as-is
# (no case or whitespace normalization is applied here).
VECTOR_STORAGE_TYPE = os.getenv("VECTOR_STORAGE_TYPE", "chroma")
VECTOR_DB_PERSIST_PATH = "data/chroma_db"  # Used for ChromaDB
DATABASE_URL = os.getenv("DATABASE_URL")  # Used for PostgreSQL; None when the variable is unset
COLLECTION_NAME = "policy_documents"
# Should match the output width of the model named by EMBEDDING_MODEL_NAME.
# This file configures sentence-transformers/all-MiniLM-L6-v2, which emits
# 384-dimensional vectors; change both values together.
EMBEDDING_DIMENSION = 384
SIMILARITY_METRIC = "cosine"

# ChromaDB client options (relevant only when the ChromaDB backend is in use).
# NOTE(review): key names presumably mirror chromadb's Settings fields -
# verify against the installed chromadb version.
CHROMA_SETTINGS: dict = {
    "anonymized_telemetry": False,  # telemetry flag: off
    "allow_reset": False,  # reset flag: off
    "is_persistent": True,  # persistence flag: on
}

# PostgreSQL backend options (relevant only when the PostgreSQL backend is in use)
POSTGRES_TABLE_NAME: str = "document_embeddings"  # table name for the PostgreSQL backend
POSTGRES_MAX_CONNECTIONS: int = 10  # connection cap; how it is enforced is up to the consumer

# Embedding model configuration
EMBEDDING_MODEL_NAME: str = "sentence-transformers/all-MiniLM-L6-v2"
EMBEDDING_BATCH_SIZE: int = 1  # smallest possible batch, chosen for very tight memory budgets
EMBEDDING_DEVICE: str = "cpu"  # CPU-only, for free-tier compatibility
# True when EMBEDDING_USE_QUANTIZED is unset or equals "true" (case-insensitive);
# any other value yields False.
EMBEDDING_USE_QUANTIZED: bool = (
    os.environ.get("EMBEDDING_USE_QUANTIZED", "true").lower() == "true"
)

# Document processing limits (kept small to bound memory use)
# NOTE(review): the unit of MAX_DOCUMENT_LENGTH (characters vs. tokens) is not
# stated here - confirm against the code that truncates documents.
MAX_DOCUMENT_LENGTH: int = 1_000  # truncation length for documents
MAX_DOCUMENTS_IN_MEMORY: int = 100  # size of the small batches documents are processed in

# Memory Management Settings


def _int_from_env(name, default):
    """Return environment variable *name* parsed as an int.

    Args:
        name: Name of the environment variable to read.
        default: Value returned when the variable is unset or blank.

    Returns:
        The parsed integer, or *default* when no usable value is present.

    Raises:
        ValueError: If the variable holds non-blank text that is not a valid
            integer. The message names the offending variable so a bad
            deployment setting is easy to trace.
    """
    raw = os.getenv(name)
    # A blank value (e.g. `MEMORY_LIMIT_MB=` in an env file) is treated as
    # unset instead of crashing the import with an anonymous int() error.
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError as err:
        raise ValueError(
            f"Environment variable {name} must be an integer, got {raw!r}"
        ) from err


# True when ENABLE_MEMORY_MONITORING is unset or equals "true" (case-insensitive)
ENABLE_MEMORY_MONITORING = os.getenv("ENABLE_MEMORY_MONITORING", "true").lower() == "true"
MEMORY_LIMIT_MB = _int_from_env("MEMORY_LIMIT_MB", 400)  # Conservative limit for 512MB instances

# Search defaults
# NOTE(review): presumably DEFAULT_TOP_K applies when a caller gives no k and
# MAX_TOP_K caps requests - confirm against the search code.
DEFAULT_TOP_K: int = 5
MAX_TOP_K: int = 20
MIN_SIMILARITY_THRESHOLD: float = 0.3  # minimum similarity threshold for search