| """ | |
| MentalChat16K Dataset - Download & Process | |
| Downloads and processes mental health counseling conversations into ChromaDB | |
| Dataset: ShenLab/MentalChat16K (16K conversations, 33 topics) | |
| """ | |
| from datasets import load_dataset | |
| import pandas as pd | |
| import chromadb | |
| from sentence_transformers import SentenceTransformer | |
| import os | |

def download_mentalchat():
    """Download the MentalChat16K dataset from HuggingFace."""
    print("Downloading MentalChat16K dataset...")
    print("  Source: ShenLab/MentalChat16K")
    print("  Coverage: 33 mental health topics")
    try:
        # Load dataset from HuggingFace
        dataset = load_dataset("ShenLab/MentalChat16K")

        # Create output directory
        os.makedirs("data_mining/datasets", exist_ok=True)

        # Convert to pandas DataFrame
        df = dataset['train'].to_pandas()

        # Save to CSV
        output_path = "data_mining/datasets/mentalchat16k.csv"
        df.to_csv(output_path, index=False)

        # Check file size
        file_size = os.path.getsize(output_path) / (1024 * 1024)  # MB

        print(f"Downloaded: {output_path}")
        print(f"Records: {len(df)}")
        print(f"File size: {file_size:.2f} MB")
        return True
    except Exception as e:
        print(f"Download failed: {e}")
        return False

def process_mentalchat():
    """Process the MentalChat16K dataset and build the ChromaDB collection."""
    print("\nProcessing MentalChat16K dataset...")

    # Load dataset
    csv_path = "data_mining/datasets/mentalchat16k.csv"
    if not os.path.exists(csv_path):
        print(f"Dataset not found: {csv_path}")
        return False
    df = pd.read_csv(csv_path)
    print(f"Loaded {len(df)} records")

    # Initialize embedder
    print("Loading embedding model...")
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')
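    # Note: 'keepitreal/vietnamese-sbert' is a Vietnamese sentence-BERT model,
    # while the MentalChat16K text itself is English; presumably the model was
    # chosen to match the language of the queries this collection will serve.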
    # Initialize ChromaDB
    print("Initializing ChromaDB...")
    os.makedirs("data_mining/output", exist_ok=True)
    client = chromadb.PersistentClient(path="data_mining/output/mental_health_chroma")

    # Create collection
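    # "hnsw:space": "cosine" makes Chroma's HNSW index rank neighbors by cosine
    # distance, the usual metric for sentence-transformer embeddings.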
    collection = client.get_or_create_collection(
        name="mental_health",
        metadata={"hnsw:space": "cosine"}
    )
    # Process conversations
    print("Processing conversations...")

    # Determine column names and combine if needed
    if 'instruction' in df.columns and 'output' in df.columns:
        # Instruction-based format: instruction + input + output
        print("  Detected instruction-based format")
        df['text'] = df.apply(
            lambda row: f"User: {row['instruction']}\n{row.get('input', '')}\n\nAssistant: {row['output']}",
            axis=1
        )
        text_column = 'text'
    else:
        # Fall back to an existing text column
        text_column = None
        for col in ['conversation', 'text', 'Context', 'Question', 'Response']:
            if col in df.columns:
                text_column = col
                break
        if not text_column:
            print(f"Could not find text column. Available: {df.columns.tolist()}")
            return False
    print(f"  Using column: '{text_column}'")
    processed = 0
    batch_size = 100
    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i + batch_size]

        ids = []
        embeddings = []
        documents = []
        metadatas = []

        for idx, row in batch.iterrows():
            text = str(row[text_column])
            if len(text) < 10:
                continue

            embedding = embedder.encode(text)
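            # encode() returns a single numpy vector here; passing a list of texts
            # in one call is also supported and typically faster, but the per-row
            # form is kept for simplicity.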
            ids.append(f"mental_{processed:05d}")
            embeddings.append(embedding.tolist())
            documents.append(text)
            metadatas.append({
                'domain': 'mental_health',
                'agent': 'MentalHealthAgent',
                'source': 'MentalChat16K',
                'index': processed
            })
            processed += 1

        if ids:
            collection.add(
                ids=ids,
                embeddings=embeddings,
                documents=documents,
                metadatas=metadatas
            )
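        # With PersistentClient, added records are written to the on-disk store
        # automatically; no explicit save call is needed.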
        if (i + batch_size) % 1000 == 0:
            print(f"  Processed {min(i + batch_size, len(df))}/{len(df)} records...")
| print(f"β Processed {processed} conversations") | |
| print(f"πΎ Database saved to: data_mining/output/mental_health_chroma/") | |
| # Get database size | |
| db_path = "data_mining/output/mental_health_chroma" | |
| total_size = 0 | |
| for dirpath, dirnames, filenames in os.walk(db_path): | |
| for filename in filenames: | |
| filepath = os.path.join(dirpath, filename) | |
| total_size += os.path.getsize(filepath) | |
| print(f"π Database size: {total_size / (1024 * 1024):.2f} MB") | |
| return True | |
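
# Illustrative sketch (not part of the original pipeline): one way downstream code
# might query the collection built above. The function name, the n_results default,
# and the way hits are printed are assumptions for illustration only.
def query_mental_health(question, n_results=3):
    """Embed a question and retrieve the most similar counseling conversations."""
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')
    client = chromadb.PersistentClient(path="data_mining/output/mental_health_chroma")
    collection = client.get_collection(name="mental_health")
    results = collection.query(
        query_embeddings=[embedder.encode(question).tolist()],
        n_results=n_results
    )
    # Results are returned per query; index [0] corresponds to the single query above
    for doc, dist in zip(results['documents'][0], results['distances'][0]):
        print(f"distance={dist:.4f}\n{doc[:200]}...\n")
    return results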

def main():
    """Download and process the MentalChat16K dataset."""
    print("=" * 60)
    print("MentalChat16K Dataset - Download & Process")
    print("=" * 60)

    if not download_mentalchat():
        return False
    if not process_mentalchat():
        return False

    print("\n" + "=" * 60)
    print("MentalChat16K dataset ready!")
    print("=" * 60)
    return True


if __name__ == "__main__":
    success = main()
    exit(0 if success else 1)