Spaces:
Sleeping
Sleeping
"""Configuration settings for the ingestion pipeline.

Every value is a module-level constant evaluated once at import time.
The following can be overridden through environment variables:
VECTOR_STORAGE_TYPE, DATABASE_URL, EMBEDDING_USE_QUANTIZED,
ENABLE_MEMORY_MONITORING and MEMORY_LIMIT_MB.
"""

import os

# Spellings accepted as "enabled" for boolean environment variables.
_TRUE_VALUES = frozenset({"1", "true", "yes", "on"})


def _env_flag(name: str, default: str = "true") -> bool:
    """Return True when environment variable *name* holds a truthy spelling.

    The value is stripped and lower-cased before comparison, so "TRUE",
    " true " and "1" all enable the flag. An unset variable uses *default*;
    any other value (including an empty string) disables the flag.
    """
    return os.getenv(name, default).strip().lower() in _TRUE_VALUES


def _env_int(name: str, default: int) -> int:
    """Return environment variable *name* parsed as an integer.

    An unset or blank variable yields *default*, so an empty entry in a
    deployment manifest does not abort the import.

    Raises:
        ValueError: if the variable is set to a non-integer value. The
            message names the variable so the bad setting is easy to find.
    """
    raw = os.getenv(name, "").strip()
    if not raw:
        return default
    try:
        return int(raw)
    except ValueError:
        raise ValueError(
            f"Environment variable {name} must be an integer, got {raw!r}"
        ) from None


# Default ingestion settings
# NOTE(review): the unit of the chunk size/overlap (characters vs. tokens) is
# not defined in this module - confirm against the chunker that consumes them.
DEFAULT_CHUNK_SIZE = 1000
DEFAULT_OVERLAP = 200
RANDOM_SEED = 42

# Supported file formats (file extensions, including the leading dot)
SUPPORTED_FORMATS = {".txt", ".md", ".markdown"}

# Corpus directory
CORPUS_DIRECTORY = "synthetic_policies"

# Vector Database Settings
# "chroma" or "postgres"; normalised so that e.g. "Postgres" or a value with
# stray whitespace still matches the documented lowercase names.
VECTOR_STORAGE_TYPE = os.getenv("VECTOR_STORAGE_TYPE", "chroma").strip().lower()
VECTOR_DB_PERSIST_PATH = "data/chroma_db"  # Used for ChromaDB
DATABASE_URL = os.getenv("DATABASE_URL")  # Used for PostgreSQL; None when unset
COLLECTION_NAME = "policy_documents"
# Must equal the output size of EMBEDDING_MODEL_NAME below
# (all-MiniLM-L6-v2 produces 384-dimensional vectors).
EMBEDDING_DIMENSION = 384
SIMILARITY_METRIC = "cosine"

# ChromaDB Configuration for Memory Optimization (when using ChromaDB)
CHROMA_SETTINGS = {
    "anonymized_telemetry": False,
    "allow_reset": False,
    "is_persistent": True,
}

# PostgreSQL Configuration (when using PostgreSQL)
POSTGRES_TABLE_NAME = "document_embeddings"
POSTGRES_MAX_CONNECTIONS = 10

# Embedding Model Settings
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # Ultra-lightweight
EMBEDDING_BATCH_SIZE = 1  # Absolute minimum for extreme memory constraints
EMBEDDING_DEVICE = "cpu"  # Use CPU for free tier compatibility
EMBEDDING_USE_QUANTIZED = _env_flag("EMBEDDING_USE_QUANTIZED")  # enabled by default

# Document Processing Settings (for memory optimization)
MAX_DOCUMENT_LENGTH = 1000  # Truncate documents to reduce memory usage
MAX_DOCUMENTS_IN_MEMORY = 100  # Process documents in small batches

# Memory Management Settings
ENABLE_MEMORY_MONITORING = _env_flag("ENABLE_MEMORY_MONITORING")  # enabled by default
MEMORY_LIMIT_MB = _env_int("MEMORY_LIMIT_MB", 400)  # Conservative limit for 512MB instances

# Search Settings
DEFAULT_TOP_K = 5
MAX_TOP_K = 20
MIN_SIMILARITY_THRESHOLD = 0.3