Spaces:

sethmcknight
/

msse-ai-engineering

Sleeping

sethmcknight

fix(chroma): enable persistence and quantized embeddings by default

f60c17f about 2 months ago

1.84 kB

	"""Configuration settings for the ingestion pipeline"""

	import os

	# Ingestion defaults: chunk overlap and chunk size.
	# NOTE(review): units (characters vs. tokens) are decided by the chunker - confirm there.
	DEFAULT_OVERLAP = 200
	DEFAULT_CHUNK_SIZE = 1000
	# Fixed seed constant.
	RANDOM_SEED = 42

	# File extensions (lower-case, leading dot included) listed as supported
	SUPPORTED_FORMATS = {
	    ".txt",
	    ".md",
	    ".markdown",
	}

	# Name of the corpus directory (a relative path)
	CORPUS_DIRECTORY = "synthetic_policies"

	# Vector Database Settings
	# Backend selector: "chroma" or "postgres". The environment value is trimmed
	# and lower-cased, and a missing/blank value falls back to "chroma", so
	# inputs such as " Postgres " or an empty export still select a documented backend.
	VECTOR_STORAGE_TYPE = os.getenv("VECTOR_STORAGE_TYPE", "").strip().lower() or "chroma"
	VECTOR_DB_PERSIST_PATH = "data/chroma_db"  # Used for ChromaDB
	DATABASE_URL = os.getenv("DATABASE_URL")  # Used for PostgreSQL; None when the variable is unset
	COLLECTION_NAME = "policy_documents"
	# Should match the vector width produced by EMBEDDING_MODEL_NAME; the MiniLM
	# sentence-transformers models (all-MiniLM-L6-v2, paraphrase-MiniLM-L3-v2) emit 384-d vectors.
	EMBEDDING_DIMENSION = 384
	SIMILARITY_METRIC = "cosine"

	# ChromaDB options (relevant only when ChromaDB is the selected backend).
	# Built with dict(...), so each keyword name below becomes a string key.
	CHROMA_SETTINGS = dict(
	    anonymized_telemetry=False,  # telemetry switched off
	    allow_reset=False,  # reset not allowed
	    is_persistent=True,  # persistence switched on
	)

	# PostgreSQL constants (relevant only when PostgreSQL is the selected backend)
	POSTGRES_MAX_CONNECTIONS = 10  # connection cap - presumably the pool size; confirm in the DB layer
	POSTGRES_TABLE_NAME = "document_embeddings"  # name of the embeddings table

	# Embedding Model Settings
	EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # Ultra-lightweight
	EMBEDDING_BATCH_SIZE = 1  # Absolute minimum for extreme memory constraints
	EMBEDDING_DEVICE = "cpu"  # Use CPU for free tier compatibility
	# Quantized embeddings are on unless the environment says otherwise. The value
	# is trimmed and matched case-insensitively against the common truthy
	# spellings, so "1", "yes", "on" or " true " are not silently read as False.
	EMBEDDING_USE_QUANTIZED = os.getenv("EMBEDDING_USE_QUANTIZED", "true").strip().lower() in {
	    "1",
	    "true",
	    "yes",
	    "on",
	}

	# Document-processing limits, kept small to hold memory usage down
	MAX_DOCUMENTS_IN_MEMORY = 100  # documents are handled in small batches of this size
	MAX_DOCUMENT_LENGTH = 1000  # documents are truncated to this length (units not stated here - confirm)

	# Memory Management Settings
	# Monitoring is on unless the environment says otherwise. The value is trimmed
	# and matched case-insensitively against the common truthy spellings, so
	# "1", "yes", "on" or " true " are not silently read as False.
	ENABLE_MEMORY_MONITORING = os.getenv("ENABLE_MEMORY_MONITORING", "true").strip().lower() in {
	    "1",
	    "true",
	    "yes",
	    "on",
	}
	# Conservative limit for 512MB instances. A blank/empty variable is treated
	# as unset (400) instead of crashing the import with int("") -> ValueError;
	# a non-numeric value still raises ValueError so misconfiguration stays visible.
	MEMORY_LIMIT_MB = int(os.getenv("MEMORY_LIMIT_MB", "").strip() or "400")

	# Search settings: similarity floor, then the maximum and default values of k
	MIN_SIMILARITY_THRESHOLD = 0.3
	MAX_TOP_K = 20
	DEFAULT_TOP_K = 5