# my-gradio-app / data_mining / mining_mentalchat.py
# Author: Nguyen Trong Lap
# Commit eeb0f9c — "Recreate history without binary blobs"
"""
MentalChat16K Dataset - Download & Process
Downloads and processes mental health counseling conversations into ChromaDB
Dataset: ShenLab/MentalChat16K (16K conversations, 33 topics)
"""
from datasets import load_dataset
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
import os
def download_mentalchat():
    """Fetch the MentalChat16K dataset from HuggingFace and persist it as CSV.

    Writes the 'train' split to data_mining/datasets/mentalchat16k.csv.

    Returns:
        bool: True if the CSV was written, False on any download/conversion error.
    """
    print("📥 Downloading MentalChat16K dataset...")
    print(" Source: ShenLab/MentalChat16K")
    print(" Coverage: 33 mental health topics")
    try:
        # Pull the dataset from the HuggingFace hub.
        hf_dataset = load_dataset("ShenLab/MentalChat16K")

        # Make sure the destination directory exists before writing.
        os.makedirs("data_mining/datasets", exist_ok=True)

        # Flatten the train split into a DataFrame and dump it to CSV.
        frame = hf_dataset['train'].to_pandas()
        output_path = "data_mining/datasets/mentalchat16k.csv"
        frame.to_csv(output_path, index=False)

        # Report what landed on disk (size in MB).
        file_size = os.path.getsize(output_path) / (1024 * 1024)
        print(f"✅ Downloaded: {output_path}")
        print(f"📊 Records: {len(frame)}")
        print(f"📊 File size: {file_size:.2f} MB")
        return True
    except Exception as e:
        # Boundary handler for a best-effort script step: log and signal failure.
        print(f"❌ Download failed: {e}")
        return False
def process_mentalchat():
    """Embed the MentalChat16K CSV and index it into a persistent ChromaDB.

    Reads data_mining/datasets/mentalchat16k.csv (produced by
    download_mentalchat), builds one document per conversation, embeds the
    documents with a Vietnamese SBERT model, and stores them in the
    "mental_health" collection under data_mining/output/mental_health_chroma.

    Returns:
        bool: True on success, False if the CSV is missing or no usable
        text column can be found.
    """
    print("\n🔨 Processing MentalChat16K dataset...")

    # Load the previously downloaded dataset.
    csv_path = "data_mining/datasets/mentalchat16k.csv"
    if not os.path.exists(csv_path):
        print(f"❌ Dataset not found: {csv_path}")
        return False
    df = pd.read_csv(csv_path)
    print(f"📊 Loaded {len(df)} records")

    # Initialize the sentence embedder (Vietnamese SBERT).
    print("🤖 Loading embedding model...")
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

    # Initialize a persistent ChromaDB client and the target collection.
    print("💾 Initializing ChromaDB...")
    os.makedirs("data_mining/output", exist_ok=True)
    client = chromadb.PersistentClient(path="data_mining/output/mental_health_chroma")
    collection = client.get_or_create_collection(
        name="mental_health",
        metadata={"hnsw:space": "cosine"}  # cosine distance for the HNSW index
    )

    print("📝 Processing conversations...")

    # Determine which column holds the text; synthesize one for the
    # instruction/input/output format.
    if 'instruction' in df.columns and 'output' in df.columns:
        print(" Detected instruction-based format")

        def _to_dialogue(row):
            # Guard against a missing 'input' column AND NaN cells: float NaN
            # is truthy and would otherwise render as the literal "nan".
            extra = row.get('input', '')
            if pd.isna(extra):
                extra = ''
            return f"User: {row['instruction']}\n{extra}\n\nAssistant: {row['output']}"

        df['text'] = df.apply(_to_dialogue, axis=1)
        text_column = 'text'
    else:
        # Fall back to the first known single-text column.
        text_column = None
        for col in ['conversation', 'text', 'Context', 'Question', 'Response']:
            if col in df.columns:
                text_column = col
                break
        if not text_column:
            print(f"❌ Could not find text column. Available: {df.columns.tolist()}")
            return False
        print(f" Using column: '{text_column}'")

    processed = 0
    batch_size = 100
    for start in range(0, len(df), batch_size):
        batch = df.iloc[start:start + batch_size]

        ids = []
        documents = []
        metadatas = []
        for _, row in batch.iterrows():
            text = str(row[text_column])
            # Skip degenerate/empty entries.
            if len(text) < 10:
                continue
            ids.append(f"mental_{processed:05d}")
            documents.append(text)
            metadatas.append({
                'domain': 'mental_health',
                'agent': 'MentalHealthAgent',
                'source': 'MentalChat16K',
                'index': processed
            })
            processed += 1

        if ids:
            # Encode the whole batch in one call — SentenceTransformer batches
            # internally, which is far faster than one encode() per row.
            embeddings = embedder.encode(documents).tolist()
            collection.add(
                ids=ids,
                embeddings=embeddings,
                documents=documents,
                metadatas=metadatas
            )

        # Progress every 1000 source rows (i.e. every 10 batches of 100).
        if (start + batch_size) % 1000 == 0:
            print(f" Processed {min(start + batch_size, len(df))}/{len(df)} records...")

    print(f"✅ Processed {processed} conversations")
    print(f"💾 Database saved to: data_mining/output/mental_health_chroma/")

    # Report the on-disk footprint of the ChromaDB directory.
    db_path = "data_mining/output/mental_health_chroma"
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(db_path):
        for filename in filenames:
            total_size += os.path.getsize(os.path.join(dirpath, filename))
    print(f"📊 Database size: {total_size / (1024 * 1024):.2f} MB")
    return True
def main():
    """Run the full pipeline: download the dataset, then build the ChromaDB.

    Returns:
        bool: True when both steps succeed, False as soon as either fails.
    """
    banner = "=" * 60
    print(banner)
    print("MentalChat16K Dataset - Download & Process")
    print(banner)

    # Run the two pipeline stages in order; stop at the first failure.
    for step in (download_mentalchat, process_mentalchat):
        if not step():
            return False

    print("\n" + banner)
    print("✅ MentalChat16K dataset ready!")
    print(banner)
    return True
if __name__ == "__main__":
    # raise SystemExit instead of exit(): exit() is the `site`-module
    # interactive helper and is not guaranteed to exist in every run mode.
    raise SystemExit(0 if main() else 1)