File size: 5,472 Bytes
eeb0f9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
"""
MentalChat16K Dataset - Download & Process
Downloads and processes mental health counseling conversations into ChromaDB
Dataset: ShenLab/MentalChat16K (16K conversations, 33 topics)
"""

from datasets import load_dataset
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
import os

def download_mentalchat():
    """Fetch the MentalChat16K dataset from HuggingFace and save it as CSV.

    Writes the 'train' split to data_mining/datasets/mentalchat16k.csv.

    Returns:
        bool: True on success, False if the download or save fails.
    """
    print("📥 Downloading MentalChat16K dataset...")
    print("   Source: ShenLab/MentalChat16K")
    print("   Coverage: 33 mental health topics")

    try:
        # Pull the dataset from the HuggingFace hub (cached locally by `datasets`).
        dataset = load_dataset("ShenLab/MentalChat16K")

        os.makedirs("data_mining/datasets", exist_ok=True)

        output_path = "data_mining/datasets/mentalchat16k.csv"
        frame = dataset['train'].to_pandas()
        frame.to_csv(output_path, index=False)

        # Report the on-disk size in megabytes.
        size_mb = os.path.getsize(output_path) / (1024 * 1024)

        print(f"✅ Downloaded: {output_path}")
        print(f"📊 Records: {len(frame)}")
        print(f"📊 File size: {size_mb:.2f} MB")

        return True

    except Exception as e:
        # Best-effort: report and signal failure rather than crashing the script.
        print(f"❌ Download failed: {e}")
        return False

def process_mentalchat():
    """Embed the MentalChat16K CSV and store it in a persistent ChromaDB.

    Reads data_mining/datasets/mentalchat16k.csv, builds one text document
    per row (combining instruction/input/output when that format is
    detected), embeds each document with a sentence-transformer, and adds
    documents + embeddings + metadata to the "mental_health" collection at
    data_mining/output/mental_health_chroma.

    Returns:
        bool: True on success; False if the CSV is missing or no usable
        text column can be found.
    """
    print("\n🔨 Processing MentalChat16K dataset...")

    # Load dataset produced by download_mentalchat()
    csv_path = "data_mining/datasets/mentalchat16k.csv"
    if not os.path.exists(csv_path):
        print(f"❌ Dataset not found: {csv_path}")
        return False

    df = pd.read_csv(csv_path)
    print(f"📊 Loaded {len(df)} records")

    # Initialize embedder
    print("🤖 Loading embedding model...")
    # NOTE(review): this is a Vietnamese SBERT model while MentalChat16K is
    # English-language — confirm the model choice is intentional (e.g. the
    # downstream app queries in Vietnamese).
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

    # Initialize ChromaDB (persistent on-disk store)
    print("💾 Initializing ChromaDB...")
    os.makedirs("data_mining/output", exist_ok=True)
    client = chromadb.PersistentClient(path="data_mining/output/mental_health_chroma")

    # Idempotent: reuse the collection if it already exists.
    collection = client.get_or_create_collection(
        name="mental_health",
        metadata={"hnsw:space": "cosine"}
    )

    print("📝 Processing conversations...")

    # Determine which column holds the text; synthesize one for the
    # instruction/input/output format.
    if 'instruction' in df.columns and 'output' in df.columns:
        print("   Detected instruction-based format")
        df['text'] = df.apply(lambda row:
            f"User: {row['instruction']}\n{row.get('input', '')}\n\nAssistant: {row['output']}",
            axis=1
        )
        text_column = 'text'
    else:
        # Fall back to the first known text-bearing column that exists.
        text_column = next(
            (col for col in ['conversation', 'text', 'Context', 'Question', 'Response']
             if col in df.columns),
            None
        )
        if not text_column:
            print(f"❌ Could not find text column. Available: {df.columns.tolist()}")
            return False

    print(f"   Using column: '{text_column}'")

    processed = 0
    batch_size = 100

    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i+batch_size]

        ids = []
        documents = []
        metadatas = []

        for _, row in batch.iterrows():
            text = str(row[text_column])

            # Skip degenerate rows (also filters out str(NaN) == 'nan').
            if len(text) < 10:
                continue

            ids.append(f"mental_{processed:05d}")
            documents.append(text)
            metadatas.append({
                'domain': 'mental_health',
                'agent': 'MentalHealthAgent',
                'source': 'MentalChat16K',
                'index': processed
            })

            processed += 1

        if ids:
            # Encode the whole batch in ONE call — SentenceTransformer
            # batches internally, which is far faster than one encode()
            # per row as before.
            embeddings = embedder.encode(documents).tolist()
            collection.add(
                ids=ids,
                embeddings=embeddings,
                documents=documents,
                metadatas=metadatas
            )

        # Progress line every 1000 source rows (batch_size divides 1000).
        if (i + batch_size) % 1000 == 0:
            print(f"  Processed {min(i + batch_size, len(df))}/{len(df)} records...")

    print(f"✅ Processed {processed} conversations")
    print(f"💾 Database saved to: data_mining/output/mental_health_chroma/")

    # Sum the size of every file under the DB directory for reporting.
    db_path = "data_mining/output/mental_health_chroma"
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(db_path):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            total_size += os.path.getsize(filepath)

    print(f"📊 Database size: {total_size / (1024 * 1024):.2f} MB")

    return True

def main():
    """Run the full pipeline (download, then process).

    Returns:
        bool: True when both stages succeed, False otherwise.
    """
    banner = "=" * 60
    print(banner)
    print("MentalChat16K Dataset - Download & Process")
    print(banner)

    # Short-circuit: processing only runs when the download succeeded.
    if not (download_mentalchat() and process_mentalchat()):
        return False

    print("\n" + banner)
    print("✅ MentalChat16K dataset ready!")
    print(banner)
    return True

if __name__ == "__main__":
    # Exit status 0 on success, 1 on any failure (for shell pipelines).
    raise SystemExit(0 if main() else 1)