Spaces:
Sleeping
Sleeping
File size: 1,843 Bytes
ffa0f3d dca679b ffa0f3d 7793bb6 ffa0f3d 7793bb6 afecdc5 15f6c83 dca679b afecdc5 0a7f9b4 afecdc5 dca679b f75da29 f60c17f f75da29 dca679b 48155ff dca679b 48155ff afecdc5 f60c17f afecdc5 dca679b 159faf0 dca679b afecdc5 7793bb6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
"""Configuration settings for the ingestion pipeline"""
import os
# Default ingestion settings
# NOTE(review): units of chunk size / overlap (characters vs. tokens) are not
# visible from this module -- confirm against the chunker.
DEFAULT_CHUNK_SIZE: int = 1_000
DEFAULT_OVERLAP: int = 200
RANDOM_SEED: int = 42

# Supported file formats (file extensions, including the leading dot)
SUPPORTED_FORMATS: set = {".markdown", ".md", ".txt"}

# Corpus directory (relative path)
CORPUS_DIRECTORY: str = "synthetic_policies"
# Vector Database Settings
# Backend selector: "chroma" or "postgres". The environment value is trimmed
# and lower-cased so inputs such as "Postgres" or "chroma " still match the
# documented names; an unset or blank variable falls back to "chroma".
VECTOR_STORAGE_TYPE = (os.getenv("VECTOR_STORAGE_TYPE") or "").strip().lower() or "chroma"
VECTOR_DB_PERSIST_PATH = "data/chroma_db"  # Used for ChromaDB
DATABASE_URL = os.getenv("DATABASE_URL")  # Used for PostgreSQL; None when unset
COLLECTION_NAME = "policy_documents"
# Must equal the output size of EMBEDDING_MODEL_NAME
# (sentence-transformers/all-MiniLM-L6-v2 produces 384-dimensional vectors).
EMBEDDING_DIMENSION = 384
SIMILARITY_METRIC = "cosine"
# ChromaDB configuration for memory optimization; only relevant when ChromaDB
# is the selected vector store.
# NOTE(review): presumably forwarded to the ChromaDB client settings --
# confirm at the call site.
CHROMA_SETTINGS = dict(
    anonymized_telemetry=False,
    allow_reset=False,
    is_persistent=True,
)
# PostgreSQL configuration; only relevant when PostgreSQL is the selected
# vector store.
POSTGRES_MAX_CONNECTIONS: int = 10
POSTGRES_TABLE_NAME: str = "document_embeddings"
# Helpers for reading typed values from the environment. They are private to
# this module and exist so that a malformed variable cannot break import.
def _env_flag(name, default):
    """Return environment variable *name* interpreted as a boolean.

    The value is trimmed and compared case-insensitively against
    "1", "true", "yes" and "on"; any other value (including a blank one)
    is False. *default* is returned only when the variable is unset.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in {"1", "true", "yes", "on"}


def _env_int(name, default):
    """Return environment variable *name* parsed as an int.

    *default* is returned when the variable is unset or blank. A value that
    is not a valid integer also yields *default*, with a RuntimeWarning so
    the misconfiguration is visible instead of raising at import time.
    """
    raw = os.getenv(name)
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError:
        import warnings

        warnings.warn(
            "Ignoring invalid integer %r for %s; using %d" % (raw, name, default),
            RuntimeWarning,
            stacklevel=2,
        )
        return default


# Embedding Model Settings
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # Ultra-lightweight
EMBEDDING_BATCH_SIZE = 1  # Absolute minimum for extreme memory constraints
EMBEDDING_DEVICE = "cpu"  # Use CPU for free tier compatibility
EMBEDDING_USE_QUANTIZED = _env_flag("EMBEDDING_USE_QUANTIZED", True)

# Document Processing Settings (for memory optimization)
MAX_DOCUMENT_LENGTH = 1000  # Truncate documents to reduce memory usage
MAX_DOCUMENTS_IN_MEMORY = 100  # Process documents in small batches

# Memory Management Settings
ENABLE_MEMORY_MONITORING = _env_flag("ENABLE_MEMORY_MONITORING", True)
# Conservative limit for 512MB instances
MEMORY_LIMIT_MB = _env_int("MEMORY_LIMIT_MB", 400)
# Search Settings
# NOTE(review): presumably DEFAULT_TOP_K is used when a caller gives no
# result count and MAX_TOP_K caps it -- confirm in the search code.
MAX_TOP_K: int = 20
DEFAULT_TOP_K: int = 5
MIN_SIMILARITY_THRESHOLD: float = 0.3
|