Spaces:

sethmcknight
/

msse-ai-engineering

Sleeping

File size: 1,843 Bytes

ffa0f3d
 
dca679b
 
ffa0f3d
 
 
 
 
 
7793bb6
ffa0f3d
 
7793bb6
afecdc5
 
15f6c83
dca679b
 
afecdc5
0a7f9b4
afecdc5
 
dca679b
f75da29
 
 
f60c17f
f75da29
 
dca679b
 
 
 
48155ff
dca679b
48155ff
afecdc5
f60c17f
afecdc5
dca679b
 
 
 
 
159faf0
 
dca679b
afecdc5
 
 
7793bb6

"""Configuration settings for the ingestion pipeline"""

import os

# Ingestion defaults: fixed seed plus the default chunk size and overlap
RANDOM_SEED: int = 42
DEFAULT_CHUNK_SIZE: int = 1000
DEFAULT_OVERLAP: int = 200

# File extensions accepted as input (lower-case, leading dot included)
SUPPORTED_FORMATS: set = {".markdown", ".md", ".txt"}

# Name of the directory that holds the corpus
CORPUS_DIRECTORY: str = "synthetic_policies"

# Vector Database Settings
# Backend selector: "chroma" or "postgres". The environment value is trimmed and
# lower-cased so that e.g. " Postgres " still selects PostgreSQL, and a blank
# value falls back to "chroma" instead of producing an empty backend name.
VECTOR_STORAGE_TYPE = (
    os.getenv("VECTOR_STORAGE_TYPE", "").strip().lower() or "chroma"
)
VECTOR_DB_PERSIST_PATH = "data/chroma_db"  # Used for ChromaDB
DATABASE_URL = os.getenv("DATABASE_URL")  # Used for PostgreSQL; None when unset
COLLECTION_NAME = "policy_documents"
# Must match the output size of EMBEDDING_MODEL_NAME
# (sentence-transformers/all-MiniLM-L6-v2 produces 384-dimensional vectors).
EMBEDDING_DIMENSION = 384
SIMILARITY_METRIC = "cosine"

# ChromaDB options chosen for memory optimization (relevant only when the
# ChromaDB backend is in use)
CHROMA_SETTINGS = dict(
    anonymized_telemetry=False,
    allow_reset=False,
    is_persistent=True,
)

# PostgreSQL options (relevant only when the PostgreSQL backend is in use)
POSTGRES_TABLE_NAME: str = "document_embeddings"
POSTGRES_MAX_CONNECTIONS: int = 10

# Environment parsing helpers (private to this module)
def _env_flag(name: str, default: bool) -> bool:
    """Return the boolean value of environment variable *name*.

    An unset variable yields *default*. Otherwise the value is trimmed and
    compared case-insensitively against "1", "true", "yes" and "on"; anything
    else (including a blank value) is False.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in {"1", "true", "yes", "on"}


def _env_int(name: str, default: int) -> int:
    """Return the integer value of environment variable *name*.

    An unset or blank variable yields *default*.

    Raises:
        ValueError: if the variable is set to something that is not an
            integer; the message names the offending variable.
    """
    raw = os.getenv(name, "").strip()
    if not raw:
        return default
    try:
        return int(raw)
    except ValueError:
        raise ValueError(f"{name} must be an integer, got {raw!r}") from None


# Embedding Model Settings
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # Ultra-lightweight
EMBEDDING_BATCH_SIZE = 1  # Absolute minimum for extreme memory constraints
EMBEDDING_DEVICE = "cpu"  # Use CPU for free tier compatibility
EMBEDDING_USE_QUANTIZED = _env_flag("EMBEDDING_USE_QUANTIZED", True)

# Document Processing Settings (for memory optimization)
MAX_DOCUMENT_LENGTH = 1000  # Truncate documents to reduce memory usage
MAX_DOCUMENTS_IN_MEMORY = 100  # Process documents in small batches

# Memory Management Settings
ENABLE_MEMORY_MONITORING = _env_flag("ENABLE_MEMORY_MONITORING", True)
# Conservative limit for 512MB instances
MEMORY_LIMIT_MB = _env_int("MEMORY_LIMIT_MB", 400)

# Search settings: similarity floor, then the maximum and default top-k values
MIN_SIMILARITY_THRESHOLD: float = 0.3
MAX_TOP_K: int = 20
DEFAULT_TOP_K: int = 5