# NOTE(review): the lines below were scraping artifacts from the hosting page
# (site header, "Runtime error" banners, file size, commit hash, and a
# line-number gutter dump). They are not part of the script and have been
# commented out so the module parses.
"""
MentalChat16K Dataset - Download & Process
Downloads and processes mental health counseling conversations into ChromaDB
Dataset: ShenLab/MentalChat16K (16K conversations, 33 topics)
"""
from datasets import load_dataset
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
import os
def download_mentalchat():
    """Download the MentalChat16K dataset from HuggingFace and save it as CSV.

    Writes the 'train' split to data_mining/datasets/mentalchat16k.csv,
    creating the directory if needed.

    Returns:
        bool: True on success, False if the download or the save failed.
    """
    print("π₯ Downloading MentalChat16K dataset...")
    print(" Source: ShenLab/MentalChat16K")
    print(" Coverage: 33 mental health topics")
    try:
        # Load dataset from HuggingFace
        dataset = load_dataset("ShenLab/MentalChat16K")
        # Create output directory
        os.makedirs("data_mining/datasets", exist_ok=True)
        # Convert to pandas DataFrame
        df = dataset['train'].to_pandas()
        # Save to CSV
        output_path = "data_mining/datasets/mentalchat16k.csv"
        df.to_csv(output_path, index=False)
        # Check file size
        file_size = os.path.getsize(output_path) / (1024 * 1024)  # MB
        print(f"β Downloaded: {output_path}")
        print(f"π Records: {len(df)}")
        print(f"π File size: {file_size:.2f} MB")
        return True
    except Exception as e:
        # Broad catch is deliberate: any failure (network, disk, schema)
        # is reported and signalled to the caller via the return value.
        print(f"β Download failed: {e}")
        return False
def process_mentalchat():
    """Process the downloaded MentalChat16K CSV into a ChromaDB collection.

    Reads data_mining/datasets/mentalchat16k.csv, embeds each conversation
    with a Vietnamese SBERT model, and stores documents plus metadata in a
    persistent ChromaDB collection at data_mining/output/mental_health_chroma.

    Returns:
        bool: True on success, False if the CSV is missing or no usable
        text column could be identified.
    """
    print("\nπ¨ Processing MentalChat16K dataset...")
    # Load dataset; guard clause if download_mentalchat() was never run.
    csv_path = "data_mining/datasets/mentalchat16k.csv"
    if not os.path.exists(csv_path):
        print(f"β Dataset not found: {csv_path}")
        return False
    df = pd.read_csv(csv_path)
    print(f"π Loaded {len(df)} records")
    # Initialize embedder
    print("π€ Loading embedding model...")
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')
    # Initialize ChromaDB
    print("πΎ Initializing ChromaDB...")
    os.makedirs("data_mining/output", exist_ok=True)
    client = chromadb.PersistentClient(path="data_mining/output/mental_health_chroma")
    # Create collection (cosine distance for the HNSW index)
    collection = client.get_or_create_collection(
        name="mental_health",
        metadata={"hnsw:space": "cosine"}
    )
    # Process conversations
    print("π Processing conversations...")
    # Determine column names and combine if needed
    if 'instruction' in df.columns and 'output' in df.columns:
        # New format: instruction + input + output -> single chat transcript
        print(" Detected instruction-based format")
        df['text'] = df.apply(
            lambda row:
            f"User: {row['instruction']}\n{row.get('input', '')}\n\nAssistant: {row['output']}",
            axis=1
        )
        text_column = 'text'
    else:
        # Fall back to the first known text-bearing column that exists.
        text_column = None
        for col in ['conversation', 'text', 'Context', 'Question', 'Response']:
            if col in df.columns:
                text_column = col
                break
        if not text_column:
            print(f"β Could not find text column. Available: {df.columns.tolist()}")
            return False
    print(f" Using column: '{text_column}'")
    processed = 0
    batch_size = 100
    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i + batch_size]
        ids = []
        embeddings = []
        documents = []
        metadatas = []
        for idx, row in batch.iterrows():
            text = str(row[text_column])
            # Skip degenerate/near-empty entries.
            if len(text) < 10:
                continue
            embedding = embedder.encode(text)
            ids.append(f"mental_{processed:05d}")
            embeddings.append(embedding.tolist())
            documents.append(text)
            metadatas.append({
                'domain': 'mental_health',
                'agent': 'MentalHealthAgent',
                'source': 'MentalChat16K',
                'index': processed
            })
            processed += 1
        if ids:
            collection.add(
                ids=ids,
                embeddings=embeddings,
                documents=documents,
                metadatas=metadatas
            )
        # Progress report every 1000 rows (every 10 batches at batch_size=100).
        if (i + batch_size) % 1000 == 0:
            print(f" Processed {min(i + batch_size, len(df))}/{len(df)} records...")
    print(f"β Processed {processed} conversations")
    print(f"πΎ Database saved to: data_mining/output/mental_health_chroma/")
    # Get database size by walking the persisted directory tree.
    db_path = "data_mining/output/mental_health_chroma"
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(db_path):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            total_size += os.path.getsize(filepath)
    print(f"π Database size: {total_size / (1024 * 1024):.2f} MB")
    return True
def main():
    """Run the full pipeline: download the dataset, then build the ChromaDB.

    Returns:
        bool: True if both stages succeeded, False on the first failure.
    """
    print("=" * 60)
    print("MentalChat16K Dataset - Download & Process")
    print("=" * 60)
    if not download_mentalchat():
        return False
    if not process_mentalchat():
        return False
    print("\n" + "=" * 60)
    print("β MentalChat16K dataset ready!")
    print("=" * 60)
    return True
if __name__ == "__main__":
    # sys.exit is preferred over the site-injected exit() helper, which is
    # meant for interactive sessions and is absent under `python -S`.
    import sys
    success = main()
    sys.exit(0 if success else 1)
|