File size: 12,882 Bytes
eeb0f9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
#!/bin/bash
# Setup RAG system - One command to rule them all
# Usage: bash scripts/setup_rag.sh
#
# Every path used by this script is relative to the current working directory.

set -e  # Exit on the first unhandled command failure

# ANSI colour escape sequences; `echo -e` expands them when a message is printed.
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Title banner (box-drawing characters and emoji are UTF-8).
echo -e "${BLUE}"
echo "╔════════════════════════════════════════════════════════════╗"
echo "║       🏥 HeoCare RAG System Setup (HuggingFace)          ║"
echo "╚════════════════════════════════════════════════════════════╝"
echo -e "${NC}"

# 0. Cleanup old files and databases

#######################################
# Remove artefacts left behind by earlier runs so the setup starts clean.
# Deletes, relative to the current directory:
#   - data_mining: every *.pdf file and every *.md file except README.md
#   - data_mining/datasets and data_mining/output
#   - the (non-hidden) contents of rag/vector_store
#   - every __pycache__ directory, plus *.pyc files when such a directory exists
# All deletions are best effort: errors are silenced on purpose so that a
# missing directory never aborts the script under `set -e`.
# Globals:   BLUE, YELLOW, GREEN, NC (read)
# Arguments: none
# Returns:   0
#######################################
cleanup_workspace() {
    echo -e "${BLUE}🧹 Cleaning up old files and databases...${NC}"

    # The grouping is written out explicitly (instead of relying on find's
    # implicit AND/OR precedence) and restricted to regular files, so the
    # check selects exactly what the delete below removes.
    if find data_mining -type f \( -name "*.pdf" -o \( -name "*.md" ! -name "README.md" \) \) 2>/dev/null | grep -q .; then
        echo -e "${YELLOW}   Removing old PDF/MD files...${NC}"
        find data_mining -type f \( -name "*.pdf" -o \( -name "*.md" ! -name "README.md" \) \) -delete 2>/dev/null || true
        echo -e "${GREEN}   ✅ Old documents removed${NC}"
    fi

    # Clear temporary datasets and output folders
    if [ -d "data_mining/datasets" ] || [ -d "data_mining/output" ]; then
        echo -e "${YELLOW}   Clearing temporary folders...${NC}"
        rm -rf data_mining/datasets data_mining/output 2>/dev/null || true
        echo -e "${GREEN}   ✅ Temporary folders cleared${NC}"
    fi

    # Clear old vector stores (will be regenerated).
    # NOTE(review): because this empties rag/vector_store on every run, the
    # later "already exists, skipping" checks on its sub-directories cannot
    # match during a normal run - confirm that a full rebuild is intended.
    if [ -d "rag/vector_store" ]; then
        echo -e "${YELLOW}   Clearing old vector stores...${NC}"
        rm -rf rag/vector_store/* 2>/dev/null || true
        echo -e "${GREEN}   ✅ Old vector stores cleared${NC}"
    fi

    # Clear Python cache. -prune stops find from descending into a directory
    # that is about to be removed.
    if find . -type d -name "__pycache__" 2>/dev/null | grep -q .; then
        echo -e "${YELLOW}   Clearing Python cache...${NC}"
        find . -type d -name "__pycache__" -prune -exec rm -rf {} + 2>/dev/null || true
        find . -type f -name "*.pyc" -delete 2>/dev/null || true
        echo -e "${GREEN}   ✅ Python cache cleared${NC}"
    fi

    echo -e "${GREEN}✅ Cleanup complete!${NC}"
}

cleanup_workspace

# 1. Check Python
# python3 must be on PATH; its reported version is kept in PYTHON_VERSION.
echo -e "${BLUE}🐍 Checking Python...${NC}"
if ! command -v python3 &> /dev/null; then
    # Fatal diagnostics go to stderr so they are not lost when stdout is redirected.
    echo -e "${RED}❌ Python3 not found!${NC}" >&2
    echo "Please install Python 3.8 or higher" >&2
    exit 1
fi
PYTHON_VERSION=$(python3 --version)
echo -e "${GREEN}✅ ${PYTHON_VERSION}${NC}"

# 2. Check pip
# Either pip3 or pip is acceptable; the install step tries them in that order.
echo -e "\n${BLUE}📦 Checking pip...${NC}"
if ! command -v pip3 &> /dev/null && ! command -v pip &> /dev/null; then
    echo -e "${RED}❌ pip not found!${NC}" >&2
    exit 1
fi
echo -e "${GREEN}✅ pip found${NC}"

# 3. Install dependencies
echo -e "\n${BLUE}📦 Installing dependencies...${NC}"
echo -e "${YELLOW}This may take a few minutes...${NC}"

# Decide once what to install, so the pip3 -> pip fallback below cannot drift
# between two hand-copied package lists.
if [ -f "requirements.txt" ]; then
    pip_targets=(-r requirements.txt)
    installed_msg="Dependencies installed from requirements.txt"
else
    echo -e "${YELLOW}⚠️  requirements.txt not found, installing core packages...${NC}"
    pip_targets=(langchain langchain-chroma langchain-huggingface chromadb tqdm beautifulsoup4 requests)
    installed_msg="Core dependencies installed"
fi

# Try pip3 first and fall back to pip. If both fail, `set -e` aborts the script.
pip3 install -q "${pip_targets[@]}" || pip install -q "${pip_targets[@]}"
echo -e "${GREEN}✅ ${installed_msg}${NC}"

# 4. Create directories
# `mkdir -p` makes this step safe to repeat: existing directories are left alone.
echo -e "\n${BLUE}📁 Creating directories...${NC}"
mkdir -p rag/vector_store
mkdir -p data_mining/datasets data_mining/output
mkdir -p chroma_db
echo -e "${GREEN}✅ Directories created${NC}"

# 5-10. Build the per-domain vector stores
#
# Every dataset follows the same recipe, implemented once in build_vector_store:
# run a mining script, move the store(s) it writes under data_mining/output
# into rag/vector_store, then delete the temporary data_mining folders.

#######################################
# Run one mining script and publish the vector store(s) it produces.
# A dataset that cannot be built is reported and skipped; it never aborts the
# script, even under `set -e`.
# Globals:   YELLOW, GREEN, NC (read)
# Arguments:
#   $1   - path of the mining script to run
#   $2   - message shown when every target store already exists
#          (", skipping..." is appended)
#   $3   - message shown on success
#   $4   - message shown on failure (", continuing..." is appended)
#   $5.. - one or more "source:target" pairs: `source` is a directory name
#          under data_mining/output, `target` a directory name under
#          rag/vector_store
# Outputs:   progress messages on stdout
# Returns:   0 always
#######################################
build_vector_store() {
    local script=$1 exists_msg=$2 ready_msg=$3 fail_msg=$4
    shift 4

    # Skip only when *all* targets are present; a partially built set is redone.
    local pair all_present=1
    for pair in "$@"; do
        [ -d "rag/vector_store/${pair#*:}" ] || all_present=0
    done
    if [ "$all_present" -eq 1 ]; then
        echo -e "${YELLOW}⚠️  ${exists_msg}, skipping...${NC}"
        return 0
    fi

    mkdir -p data_mining/datasets data_mining/output rag/vector_store

    # Running the interpreter inside the `if` condition keeps `set -e` from
    # aborting the whole script when both python3 and python fail; the failure
    # is handled by the else branch instead.
    local ok=1 src dst
    if python3 "$script" || python "$script"; then
        for pair in "$@"; do
            src="data_mining/output/${pair%%:*}"
            dst="rag/vector_store/${pair#*:}"
            if [ -d "$src" ]; then
                # Drop any stale target first; otherwise mv would nest the new
                # store inside the existing directory.
                rm -rf -- "$dst"
                mv -- "$src" "$dst" || ok=0
            else
                ok=0
            fi
        done
    else
        ok=0
    fi

    if [ "$ok" -eq 1 ]; then
        echo -e "${GREEN}✅ ${ready_msg}${NC}"
    else
        echo -e "${YELLOW}⚠️  ${fail_msg}, continuing...${NC}"
    fi

    # Cleanup temporary folders whether or not the build succeeded.
    rm -rf data_mining/datasets data_mining/output
    return 0
}

# 5. Setup ViMedical Vietnamese Disease Dataset
echo -e "\n${BLUE}🏥 Setting up ViMedical Vietnamese Disease Dataset...${NC}"
echo -e "${YELLOW}This will download and process 603 Vietnamese diseases...${NC}"
build_vector_store data_mining/mining_vimedical.py \
    "ViMedical database already exists" \
    "ViMedical dataset ready (603 diseases)" \
    "ViMedical setup failed" \
    medical_chroma:medical_diseases

# 6. Setup MentalChat16K Mental Health Dataset
echo -e "\n${BLUE}🧠 Setting up MentalChat16K Mental Health Dataset...${NC}"
echo -e "${YELLOW}This will download and process 16K mental health conversations...${NC}"
build_vector_store data_mining/mining_mentalchat.py \
    "Mental Health database already exists" \
    "Mental Health dataset ready (16K conversations)" \
    "Mental Health setup failed" \
    mental_health_chroma:mental_health

# 7. Setup Nutrition Dataset (Dietary Profiles)
echo -e "\n${BLUE}🥗 Setting up Nutrition Dataset (Dietary Profiles)...${NC}"
echo -e "${YELLOW}This will download 50 dietary profiles...${NC}"
build_vector_store data_mining/mining_nutrition.py \
    "Nutrition database already exists" \
    "Nutrition profiles ready (50 profiles)" \
    "Nutrition setup failed" \
    nutrition_chroma:nutrition

# 7b. Setup Vietnamese Food Nutrition Database
echo -e "\n${BLUE}🍜 Setting up Vietnamese Food Nutrition Database...${NC}"
echo -e "${YELLOW}This will create 73 Vietnamese foods with nutrition facts...${NC}"
build_vector_store data_mining/mining_vietnamese_nutrition.py \
    "Vietnamese nutrition database already exists" \
    "Vietnamese food nutrition ready (73 foods)" \
    "Vietnamese nutrition setup failed" \
    vietnamese_nutrition_chroma:vietnamese_nutrition

# 8. Setup Fitness Dataset
echo -e "\n${BLUE}💪 Setting up Fitness Dataset...${NC}"
echo -e "${YELLOW}This will download and process gym exercises...${NC}"
build_vector_store data_mining/mining_fitness.py \
    "Fitness database already exists" \
    "Fitness dataset ready" \
    "Fitness setup failed" \
    fitness_chroma:fitness

# 9. Setup COVID-19 Dataset (DEPRECATED - Skipped)
echo -e "\n${BLUE}🦠 COVID-19 Dataset...${NC}"
echo -e "${YELLOW}⏭️  Skipping (dataset deprecated, already have Medical Q&A)${NC}"

# 10. Setup Vietnamese Medical Q&A Dataset
# One mining script produces two stores; both must exist for the step to be skipped.
echo -e "\n${BLUE}💬 Setting up Vietnamese Medical Q&A Dataset...${NC}"
echo -e "${YELLOW}This will download and process 9.3K medical Q&A pairs from HuggingFace...${NC}"
build_vector_store data_mining/mining_medical_qa.py \
    "Medical Q&A databases already exist" \
    "Medical Q&A datasets ready (Symptom + General Health)" \
    "Medical Q&A setup failed" \
    symptom_qa_chroma:symptom_qa \
    general_health_qa_chroma:general_health_qa

# 11. Verify RAG
# Best effort: try the status script with python3, then with python. Their
# stderr is discarded on purpose; if neither run succeeds the step is only
# reported as skipped and the setup carries on.
echo -e "\n${BLUE}✅ Verifying RAG system...${NC}"
if ! python3 scripts/check_rag_status.py 2>/dev/null \
    && ! python scripts/check_rag_status.py 2>/dev/null; then
    echo "⚠️  Verification skipped"
fi

# 12. Generate Training Data (DISABLED - Not needed without fine-tuning)
# The original step is kept below, commented out, for reference only.
# NOTE(review): if it is ever re-enabled under `set -e`, the `$?` test after
# `python3 ... || python ...` cannot observe a failure - the script exits first.
#
# echo -e "\n${BLUE}🤖 Generating synthetic training data...${NC}"
# echo -e "${YELLOW}This will create ~200 conversations for fine-tuning...${NC}"
# 
# if [ -d "fine_tuning/training_data" ] && [ "$(ls -A fine_tuning/training_data 2>/dev/null)" ]; then
#     echo -e "${YELLOW}⚠️  Training data already exists, skipping generation...${NC}"
# else
#     python3 scripts/generate_training_data.py || python scripts/generate_training_data.py
#     if [ $? -eq 0 ]; then
#         echo -e "${GREEN}✅ Training data generated!${NC}"
#     else
#         echo -e "${YELLOW}⚠️  Training data generation failed, continuing...${NC}"
#     fi
# fi

# 13. Fine-tune Models (DISABLED - Custom API doesn't support fine-tuning)
# Fine-tuning requires the official OpenAI API, which costs money and is not necessary.
# The app works well with base model + RAG without fine-tuning.
# 
# echo -e "\n${BLUE}🎓 Fine-tuning agents...${NC}"
# echo -e "${YELLOW}This will fine-tune all agents with synthetic data (takes 30-60 min, costs ~\$2)${NC}"
# echo -e "${YELLOW}Do you want to fine-tune now? (y/N)${NC}"
# read -t 10 -n 1 -r FINETUNE_CHOICE || FINETUNE_CHOICE="n"
# echo
# 
# if [[ $FINETUNE_CHOICE =~ ^[Yy]$ ]]; then
#     echo -e "${BLUE}🚀 Starting fine-tuning...${NC}"
#     python3 scripts/auto_finetune.py || python scripts/auto_finetune.py
#     if [ $? -eq 0 ]; then
#         echo -e "${GREEN}✅ Fine-tuning complete!${NC}"
#     else
#         echo -e "${YELLOW}⚠️  Fine-tuning failed, check errors above${NC}"
#     fi
# else
#     echo -e "${YELLOW}⏭️  Skipping fine-tuning (you can run it later with: python scripts/auto_finetune.py)${NC}"
# fi

# Tell the user why steps 12 and 13 did not run.
for notice_line in \
    "\n${YELLOW}ℹ️  Training data generation and fine-tuning are disabled${NC}" \
    "${YELLOW}   Reason: Custom API doesn't support fine-tuning (404 error)${NC}" \
    "${YELLOW}   App works well with base model + RAG without fine-tuning${NC}"; do
    echo -e "${notice_line}"
done

# Done
# Closing banner and summary. The summary is static text: it is printed as-is
# and does not inspect which of the earlier steps actually succeeded.
echo -e "\n${GREEN}"
echo "╔════════════════════════════════════════════════════════════╗"
echo "║          🎉 Setup Complete!                               ║"
echo "╚════════════════════════════════════════════════════════════╝"
echo -e "${NC}"

echo -e "${BLUE}📊 What was set up:${NC}"
echo "  ✅ RAG databases (6 specialized databases, ~160 MB)"
echo "     - ViMedical Diseases (603 diseases)"
echo "     - Mental Health (16K conversations)"
echo "     - Nutrition Plans"
echo "     - Vietnamese Food (73 items)"
echo "     - Fitness Exercises (1.66K)"
echo "     - Medical Q&A (9.3K pairs)"
echo ""

echo -e "${BLUE}🚀 Next steps:${NC}"
echo "  1. python app.py"
echo "  2. Open http://localhost:7860 in your browser"
echo ""

echo -e "${BLUE}💡 Tips:${NC}"
echo "  - Check RAG status: python scripts/check_rag_status.py"
echo "  - App works with base model + RAG (no fine-tuning needed)"
echo ""