File size: 5,472 Bytes
eeb0f9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
"""
MentalChat16K Dataset - Download & Process
Downloads and processes mental health counseling conversations into ChromaDB
Dataset: ShenLab/MentalChat16K (16K conversations, 33 topics)
"""

from datasets import load_dataset
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
import os

def download_mentalchat():
    """Fetch the MentalChat16K dataset from HuggingFace and save it as CSV.

    Writes the 'train' split to data_mining/datasets/mentalchat16k.csv.

    Returns:
        bool: True on success, False if the download or save fails.
    """
    print("📥 Downloading MentalChat16K dataset...")
    print("   Source: ShenLab/MentalChat16K")
    print("   Coverage: 33 mental health topics")

    try:
        # Pull the dataset from the HuggingFace hub (cached locally by `datasets`).
        dataset = load_dataset("ShenLab/MentalChat16K")

        os.makedirs("data_mining/datasets", exist_ok=True)

        output_path = "data_mining/datasets/mentalchat16k.csv"
        frame = dataset['train'].to_pandas()
        frame.to_csv(output_path, index=False)

        # Report the on-disk size in megabytes.
        size_mb = os.path.getsize(output_path) / (1024 * 1024)

        print(f"✅ Downloaded: {output_path}")
        print(f"📊 Records: {len(frame)}")
        print(f"📊 File size: {size_mb:.2f} MB")

        return True

    except Exception as e:
        # Best-effort: report and signal failure rather than crashing the script.
        print(f"❌ Download failed: {e}")
        return False

def process_mentalchat():
    """Embed the MentalChat16K CSV and store it in a persistent ChromaDB.

    Reads data_mining/datasets/mentalchat16k.csv, builds one text document
    per row (combining instruction/input/output when that format is
    detected), embeds each document with a sentence-transformer, and adds
    documents + embeddings + metadata to the "mental_health" collection at
    data_mining/output/mental_health_chroma.

    Returns:
        bool: True on success; False if the CSV is missing or no usable
        text column can be found.
    """
    print("\n🔨 Processing MentalChat16K dataset...")

    # Load dataset produced by download_mentalchat()
    csv_path = "data_mining/datasets/mentalchat16k.csv"
    if not os.path.exists(csv_path):
        print(f"❌ Dataset not found: {csv_path}")
        return False

    df = pd.read_csv(csv_path)
    print(f"📊 Loaded {len(df)} records")

    # Initialize embedder
    print("🤖 Loading embedding model...")
    # NOTE(review): this is a Vietnamese SBERT model while MentalChat16K is
    # English-language — confirm the model choice is intentional (e.g. the
    # downstream app queries in Vietnamese).
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

    # Initialize ChromaDB (persistent on-disk store)
    print("💾 Initializing ChromaDB...")
    os.makedirs("data_mining/output", exist_ok=True)
    client = chromadb.PersistentClient(path="data_mining/output/mental_health_chroma")

    # Idempotent: reuse the collection if it already exists.
    collection = client.get_or_create_collection(
        name="mental_health",
        metadata={"hnsw:space": "cosine"}
    )

    print("📝 Processing conversations...")

    # Determine which column holds the text; synthesize one for the
    # instruction/input/output format.
    if 'instruction' in df.columns and 'output' in df.columns:
        print("   Detected instruction-based format")
        df['text'] = df.apply(lambda row:
            f"User: {row['instruction']}\n{row.get('input', '')}\n\nAssistant: {row['output']}",
            axis=1
        )
        text_column = 'text'
    else:
        # Fall back to the first known text-bearing column that exists.
        text_column = next(
            (col for col in ['conversation', 'text', 'Context', 'Question', 'Response']
             if col in df.columns),
            None
        )
        if not text_column:
            print(f"❌ Could not find text column. Available: {df.columns.tolist()}")
            return False

    print(f"   Using column: '{text_column}'")

    processed = 0
    batch_size = 100

    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i+batch_size]

        ids = []
        documents = []
        metadatas = []

        for _, row in batch.iterrows():
            text = str(row[text_column])

            # Skip degenerate rows (also filters out str(NaN) == 'nan').
            if len(text) < 10:
                continue

            ids.append(f"mental_{processed:05d}")
            documents.append(text)
            metadatas.append({
                'domain': 'mental_health',
                'agent': 'MentalHealthAgent',
                'source': 'MentalChat16K',
                'index': processed
            })

            processed += 1

        if ids:
            # Encode the whole batch in ONE call — SentenceTransformer
            # batches internally, which is far faster than one encode()
            # per row as before.
            embeddings = embedder.encode(documents).tolist()
            collection.add(
                ids=ids,
                embeddings=embeddings,
                documents=documents,
                metadatas=metadatas
            )

        # Progress line every 1000 source rows (batch_size divides 1000).
        if (i + batch_size) % 1000 == 0:
            print(f"  Processed {min(i + batch_size, len(df))}/{len(df)} records...")

    print(f"✅ Processed {processed} conversations")
    print(f"💾 Database saved to: data_mining/output/mental_health_chroma/")

    # Sum the size of every file under the DB directory for reporting.
    db_path = "data_mining/output/mental_health_chroma"
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(db_path):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            total_size += os.path.getsize(filepath)

    print(f"📊 Database size: {total_size / (1024 * 1024):.2f} MB")

    return True

def main():
    """Run the full pipeline (download, then process).

    Returns:
        bool: True when both stages succeed, False otherwise.
    """
    banner = "=" * 60
    print(banner)
    print("MentalChat16K Dataset - Download & Process")
    print(banner)

    # Short-circuit: processing only runs when the download succeeded.
    if not (download_mentalchat() and process_mentalchat()):
        return False

    print("\n" + banner)
    print("✅ MentalChat16K dataset ready!")
    print(banner)
    return True

if __name__ == "__main__":
    # Exit status 0 on success, 1 on any failure (for shell pipelines).
    raise SystemExit(0 if main() else 1)