# my-gradio-app / data_mining / mining_mentalchat.py
# Author: Nguyen Trong Lap
# Commit eeb0f9c — "Recreate history without binary blobs"
"""
MentalChat16K Dataset - Download & Process
Downloads and processes mental health counseling conversations into ChromaDB
Dataset: ShenLab/MentalChat16K (16K conversations, 33 topics)
"""
from datasets import load_dataset
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
import os
def download_mentalchat():
    """Fetch the MentalChat16K dataset from HuggingFace and persist it as CSV.

    Writes the 'train' split to data_mining/datasets/mentalchat16k.csv.

    Returns:
        bool: True if the CSV was written, False on any download/conversion error.
    """
    print("📥 Downloading MentalChat16K dataset...")
    print(" Source: ShenLab/MentalChat16K")
    print(" Coverage: 33 mental health topics")
    try:
        # Pull the dataset from the HuggingFace hub.
        hf_dataset = load_dataset("ShenLab/MentalChat16K")

        # Make sure the destination directory exists before writing.
        os.makedirs("data_mining/datasets", exist_ok=True)

        # Flatten the train split into a DataFrame and dump it to CSV.
        frame = hf_dataset['train'].to_pandas()
        output_path = "data_mining/datasets/mentalchat16k.csv"
        frame.to_csv(output_path, index=False)

        # Report what landed on disk (size in MB).
        file_size = os.path.getsize(output_path) / (1024 * 1024)
        print(f"✅ Downloaded: {output_path}")
        print(f"📊 Records: {len(frame)}")
        print(f"📊 File size: {file_size:.2f} MB")
        return True
    except Exception as e:
        # Boundary handler for a best-effort script step: log and signal failure.
        print(f"❌ Download failed: {e}")
        return False
def process_mentalchat():
    """Embed the MentalChat16K CSV and index it into a persistent ChromaDB.

    Reads data_mining/datasets/mentalchat16k.csv (produced by
    download_mentalchat), builds one document per conversation, embeds the
    documents with a Vietnamese SBERT model, and stores them in the
    "mental_health" collection under data_mining/output/mental_health_chroma.

    Returns:
        bool: True on success, False if the CSV is missing or no usable
        text column can be found.
    """
    print("\n🔨 Processing MentalChat16K dataset...")

    # Load the previously downloaded dataset.
    csv_path = "data_mining/datasets/mentalchat16k.csv"
    if not os.path.exists(csv_path):
        print(f"❌ Dataset not found: {csv_path}")
        return False
    df = pd.read_csv(csv_path)
    print(f"📊 Loaded {len(df)} records")

    # Initialize the sentence embedder (Vietnamese SBERT).
    print("🤖 Loading embedding model...")
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

    # Initialize a persistent ChromaDB client and the target collection.
    print("💾 Initializing ChromaDB...")
    os.makedirs("data_mining/output", exist_ok=True)
    client = chromadb.PersistentClient(path="data_mining/output/mental_health_chroma")
    collection = client.get_or_create_collection(
        name="mental_health",
        metadata={"hnsw:space": "cosine"}  # cosine distance for the HNSW index
    )

    print("📝 Processing conversations...")

    # Determine which column holds the text; synthesize one for the
    # instruction/input/output format.
    if 'instruction' in df.columns and 'output' in df.columns:
        print(" Detected instruction-based format")

        def _to_dialogue(row):
            # Guard against a missing 'input' column AND NaN cells: float NaN
            # is truthy and would otherwise render as the literal "nan".
            extra = row.get('input', '')
            if pd.isna(extra):
                extra = ''
            return f"User: {row['instruction']}\n{extra}\n\nAssistant: {row['output']}"

        df['text'] = df.apply(_to_dialogue, axis=1)
        text_column = 'text'
    else:
        # Fall back to the first known single-text column.
        text_column = None
        for col in ['conversation', 'text', 'Context', 'Question', 'Response']:
            if col in df.columns:
                text_column = col
                break
        if not text_column:
            print(f"❌ Could not find text column. Available: {df.columns.tolist()}")
            return False
        print(f" Using column: '{text_column}'")

    processed = 0
    batch_size = 100
    for start in range(0, len(df), batch_size):
        batch = df.iloc[start:start + batch_size]

        ids = []
        documents = []
        metadatas = []
        for _, row in batch.iterrows():
            text = str(row[text_column])
            # Skip degenerate/empty entries.
            if len(text) < 10:
                continue
            ids.append(f"mental_{processed:05d}")
            documents.append(text)
            metadatas.append({
                'domain': 'mental_health',
                'agent': 'MentalHealthAgent',
                'source': 'MentalChat16K',
                'index': processed
            })
            processed += 1

        if ids:
            # Encode the whole batch in one call — SentenceTransformer batches
            # internally, which is far faster than one encode() per row.
            embeddings = embedder.encode(documents).tolist()
            collection.add(
                ids=ids,
                embeddings=embeddings,
                documents=documents,
                metadatas=metadatas
            )

        # Progress every 1000 source rows (i.e. every 10 batches of 100).
        if (start + batch_size) % 1000 == 0:
            print(f" Processed {min(start + batch_size, len(df))}/{len(df)} records...")

    print(f"✅ Processed {processed} conversations")
    print(f"💾 Database saved to: data_mining/output/mental_health_chroma/")

    # Report the on-disk footprint of the ChromaDB directory.
    db_path = "data_mining/output/mental_health_chroma"
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(db_path):
        for filename in filenames:
            total_size += os.path.getsize(os.path.join(dirpath, filename))
    print(f"📊 Database size: {total_size / (1024 * 1024):.2f} MB")
    return True
def main():
    """Run the full pipeline: download the dataset, then build the ChromaDB.

    Returns:
        bool: True when both steps succeed, False as soon as either fails.
    """
    banner = "=" * 60
    print(banner)
    print("MentalChat16K Dataset - Download & Process")
    print(banner)

    # Run the two pipeline stages in order; stop at the first failure.
    for step in (download_mentalchat, process_mentalchat):
        if not step():
            return False

    print("\n" + banner)
    print("✅ MentalChat16K dataset ready!")
    print(banner)
    return True
if __name__ == "__main__":
    # raise SystemExit instead of exit(): exit() is the `site`-module
    # interactive helper and is not guaranteed to exist in every run mode.
    raise SystemExit(0 if main() else 1)