Spaces:
Runtime error
Runtime error
File size: 12,882 Bytes
eeb0f9c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 |
#!/bin/bash
# One-shot bootstrap for the RAG system.
# Usage: bash scripts/setup_rag.sh

# Stop at the first command that fails outside a guarded context.
set -e

# ANSI colour escape sequences; they are rendered by `echo -e`.
NC='\033[0m' # reset (No Color)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'

# Title banner: switch to blue, print the three box lines, reset the colour.
echo -e "${BLUE}"
printf '%s\n' \
  "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ" \
  "β π₯ HeoCare RAG System Setup (HuggingFace) β" \
  "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
echo -e "${NC}"
# 0. Cleanup old files and databases
# Every deletion below is best-effort: stderr is discarded and `|| true`
# keeps a failed removal from aborting the script.
echo -e "${BLUE}π§Ή Cleaning up old files and databases...${NC}"

# Leftover PDF / Markdown documents under data_mining (README.md is kept).
if find data_mining -name "*.pdf" -o -name "*.md" ! -name "README.md" 2>/dev/null \
  | grep -q .; then
  echo -e "${YELLOW} Removing old PDF/MD files...${NC}"
  find data_mining -type f -name "*.pdf" -delete 2>/dev/null || true
  find data_mining -type f -name "*.md" ! -name "README.md" -delete 2>/dev/null || true
  echo -e "${GREEN} β
Old documents removed${NC}"
fi

# Scratch folders used by the mining scripts.
if [[ -d data_mining/datasets || -d data_mining/output ]]; then
  echo -e "${YELLOW} Clearing temporary folders...${NC}"
  rm -rf data_mining/datasets data_mining/output 2>/dev/null || true
  echo -e "${GREEN} β
Temporary folders cleared${NC}"
fi

# Contents of the vector store directory (the directory itself is kept);
# the later steps regenerate them.
if [[ -d rag/vector_store ]]; then
  echo -e "${YELLOW} Clearing old vector stores...${NC}"
  rm -rf rag/vector_store/* 2>/dev/null || true
  echo -e "${GREEN} β
Old vector stores cleared${NC}"
fi

# Python bytecode caches anywhere below the current directory.
if [[ -d __pycache__ ]] || find . -type d -name "__pycache__" 2>/dev/null | grep -q .; then
  echo -e "${YELLOW} Clearing Python cache...${NC}"
  find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
  find . -type f -name "*.pyc" -delete 2>/dev/null || true
  echo -e "${GREEN} β
Python cache cleared${NC}"
fi

echo -e "${GREEN}β
Cleanup complete!${NC}"
# 1. Check Python
# python3 must be on PATH; otherwise the script stops with status 1.
echo -e "${BLUE}π Checking Python...${NC}"
command -v python3 >/dev/null 2>&1 || {
  echo -e "${RED}β Python3 not found!${NC}"
  echo "Please install Python 3.8 or higher"
  exit 1
}
PYTHON_VERSION="$(python3 --version)"
echo -e "${GREEN}β
${PYTHON_VERSION}${NC}"

# 2. Check pip
# Either pip3 or pip is enough; the script stops only when both are missing.
echo -e "\n${BLUE}π¦ Checking pip...${NC}"
command -v pip3 >/dev/null 2>&1 || command -v pip >/dev/null 2>&1 || {
  echo -e "${RED}β pip not found!${NC}"
  exit 1
}
echo -e "${GREEN}β
pip found${NC}"
# 3. Install dependencies
echo -e "\n${BLUE}π¦ Installing dependencies...${NC}"
echo -e "${YELLOW}This may take a few minutes...${NC}"

# Use requirements.txt when present, otherwise a fixed list of core
# packages. Either way pip3 is tried first and plain pip is the fallback.
if [[ ! -f requirements.txt ]]; then
  echo -e "${YELLOW}β οΈ requirements.txt not found, installing core packages...${NC}"
  core_packages=(
    langchain langchain-chroma langchain-huggingface
    chromadb tqdm beautifulsoup4 requests
  )
  pip3 install -q "${core_packages[@]}" || pip install -q "${core_packages[@]}"
  echo -e "${GREEN}β
Core dependencies installed${NC}"
else
  pip3 install -q -r requirements.txt || pip install -q -r requirements.txt
  echo -e "${GREEN}β
Dependencies installed from requirements.txt${NC}"
fi

# 4. Create directories
echo -e "\n${BLUE}π Creating directories...${NC}"
mkdir -p rag/vector_store
mkdir -p data_mining/datasets data_mining/output
mkdir -p chroma_db
echo -e "${GREEN}β
Directories created${NC}"
# 5. Setup ViMedical Vietnamese Disease Dataset
# Runs data_mining/mining_vimedical.py and moves its output directory
# data_mining/output/medical_chroma to rag/vector_store/medical_diseases.
# Skipped when that target directory already exists. A failure here is
# reported and the script carries on with the next dataset.
echo -e "\n${BLUE}π₯ Setting up ViMedical Vietnamese Disease Dataset...${NC}"
echo -e "${YELLOW}This will download and process 603 Vietnamese diseases...${NC}"
if [[ -d rag/vector_store/medical_diseases ]]; then
  echo -e "${YELLOW}β οΈ ViMedical database already exists, skipping...${NC}"
else
  # Scratch directories for the mining script.
  mkdir -p data_mining/datasets data_mining/output
  # The miner (python3, then python as fallback) and the move are tested
  # directly by `if`. With `set -e`, a failure outside a condition aborts
  # the whole script, so a separate `$?` check afterwards would never reach
  # the "continuing" branch or the cleanup below.
  if { python3 data_mining/mining_vimedical.py || python data_mining/mining_vimedical.py; } \
    && mkdir -p rag/vector_store \
    && mv data_mining/output/medical_chroma rag/vector_store/medical_diseases; then
    echo -e "${GREEN}β
ViMedical dataset ready (603 diseases)${NC}"
  else
    echo -e "${YELLOW}β οΈ ViMedical setup failed, continuing...${NC}"
  fi
  # Remove the scratch directories whether or not the import succeeded.
  rm -rf data_mining/datasets data_mining/output
fi
# 6. Setup MentalChat16K Mental Health Dataset
# Runs data_mining/mining_mentalchat.py and moves its output directory
# data_mining/output/mental_health_chroma to rag/vector_store/mental_health.
# Skipped when that target directory already exists. A failure here is
# reported and the script carries on with the next dataset.
echo -e "\n${BLUE}π§ Setting up MentalChat16K Mental Health Dataset...${NC}"
echo -e "${YELLOW}This will download and process 16K mental health conversations...${NC}"
if [[ -d rag/vector_store/mental_health ]]; then
  echo -e "${YELLOW}β οΈ Mental Health database already exists, skipping...${NC}"
else
  # Scratch directories for the mining script.
  mkdir -p data_mining/datasets data_mining/output
  # The miner (python3, then python as fallback) and the move are tested
  # directly by `if`. With `set -e`, a failure outside a condition aborts
  # the whole script, so a separate `$?` check afterwards would never reach
  # the "continuing" branch or the cleanup below.
  if { python3 data_mining/mining_mentalchat.py || python data_mining/mining_mentalchat.py; } \
    && mkdir -p rag/vector_store \
    && mv data_mining/output/mental_health_chroma rag/vector_store/mental_health; then
    echo -e "${GREEN}β
Mental Health dataset ready (16K conversations)${NC}"
  else
    echo -e "${YELLOW}β οΈ Mental Health setup failed, continuing...${NC}"
  fi
  # Remove the scratch directories whether or not the import succeeded.
  rm -rf data_mining/datasets data_mining/output
fi
# 7. Setup Nutrition Dataset (Dietary Profiles)
# Runs data_mining/mining_nutrition.py and moves its output directory
# data_mining/output/nutrition_chroma to rag/vector_store/nutrition.
# Skipped when that target directory already exists. A failure here is
# reported and the script carries on with the next dataset.
echo -e "\n${BLUE}π₯ Setting up Nutrition Dataset (Dietary Profiles)...${NC}"
echo -e "${YELLOW}This will download 50 dietary profiles...${NC}"
if [[ -d rag/vector_store/nutrition ]]; then
  echo -e "${YELLOW}β οΈ Nutrition database already exists, skipping...${NC}"
else
  # Scratch directories for the mining script.
  mkdir -p data_mining/datasets data_mining/output
  # The miner (python3, then python as fallback) and the move are tested
  # directly by `if`. With `set -e`, a failure outside a condition aborts
  # the whole script, so a separate `$?` check afterwards would never reach
  # the "continuing" branch or the cleanup below.
  if { python3 data_mining/mining_nutrition.py || python data_mining/mining_nutrition.py; } \
    && mkdir -p rag/vector_store \
    && mv data_mining/output/nutrition_chroma rag/vector_store/nutrition; then
    echo -e "${GREEN}β
Nutrition profiles ready (50 profiles)${NC}"
  else
    echo -e "${YELLOW}β οΈ Nutrition setup failed, continuing...${NC}"
  fi
  # Remove the scratch directories whether or not the import succeeded.
  rm -rf data_mining/datasets data_mining/output
fi
# 7b. Setup Vietnamese Food Nutrition Database
# Runs data_mining/mining_vietnamese_nutrition.py and moves its output
# directory data_mining/output/vietnamese_nutrition_chroma to
# rag/vector_store/vietnamese_nutrition. Skipped when that target directory
# already exists. A failure here is reported and the script carries on.
echo -e "\n${BLUE}π Setting up Vietnamese Food Nutrition Database...${NC}"
echo -e "${YELLOW}This will create 73 Vietnamese foods with nutrition facts...${NC}"
if [[ -d rag/vector_store/vietnamese_nutrition ]]; then
  echo -e "${YELLOW}β οΈ Vietnamese nutrition database already exists, skipping...${NC}"
else
  # Scratch directories for the mining script.
  mkdir -p data_mining/datasets data_mining/output
  # The miner (python3, then python as fallback) and the move are tested
  # directly by `if`. With `set -e`, a failure outside a condition aborts
  # the whole script, so a separate `$?` check afterwards would never reach
  # the "continuing" branch or the cleanup below.
  if { python3 data_mining/mining_vietnamese_nutrition.py || python data_mining/mining_vietnamese_nutrition.py; } \
    && mkdir -p rag/vector_store \
    && mv data_mining/output/vietnamese_nutrition_chroma rag/vector_store/vietnamese_nutrition; then
    echo -e "${GREEN}β
Vietnamese food nutrition ready (73 foods)${NC}"
  else
    echo -e "${YELLOW}β οΈ Vietnamese nutrition setup failed, continuing...${NC}"
  fi
  # Remove the scratch directories whether or not the import succeeded.
  rm -rf data_mining/datasets data_mining/output
fi
# 8. Setup Fitness Dataset
# Runs data_mining/mining_fitness.py and moves its output directory
# data_mining/output/fitness_chroma to rag/vector_store/fitness.
# Skipped when that target directory already exists. A failure here is
# reported and the script carries on with the next dataset.
echo -e "\n${BLUE}πͺ Setting up Fitness Dataset...${NC}"
echo -e "${YELLOW}This will download and process gym exercises...${NC}"
if [[ -d rag/vector_store/fitness ]]; then
  echo -e "${YELLOW}β οΈ Fitness database already exists, skipping...${NC}"
else
  # Scratch directories for the mining script.
  mkdir -p data_mining/datasets data_mining/output
  # The miner (python3, then python as fallback) and the move are tested
  # directly by `if`. With `set -e`, a failure outside a condition aborts
  # the whole script, so a separate `$?` check afterwards would never reach
  # the "continuing" branch or the cleanup below.
  if { python3 data_mining/mining_fitness.py || python data_mining/mining_fitness.py; } \
    && mkdir -p rag/vector_store \
    && mv data_mining/output/fitness_chroma rag/vector_store/fitness; then
    echo -e "${GREEN}β
Fitness dataset ready${NC}"
  else
    echo -e "${YELLOW}β οΈ Fitness setup failed, continuing...${NC}"
  fi
  # Remove the scratch directories whether or not the import succeeded.
  rm -rf data_mining/datasets data_mining/output
fi
# 9. Setup COVID-19 Dataset (DEPRECATED - Skipped)
# Nothing is built for this step; it only prints a notice.
echo -e "\n${BLUE}π¦ COVID-19 Dataset...${NC}"
echo -e "${YELLOW}βοΈ Skipping (dataset deprecated, already have Medical Q&A)${NC}"

# 10. Setup Vietnamese Medical Q&A Dataset
# Runs data_mining/mining_medical_qa.py and moves its two output directories
# (symptom_qa_chroma, general_health_qa_chroma) to rag/vector_store/symptom_qa
# and rag/vector_store/general_health_qa. Skipped only when BOTH targets
# already exist. A failure here is reported and the script carries on.
echo -e "\n${BLUE}π¬ Setting up Vietnamese Medical Q&A Dataset...${NC}"
echo -e "${YELLOW}This will download and process 9.3K medical Q&A pairs from HuggingFace...${NC}"
if [[ -d rag/vector_store/symptom_qa && -d rag/vector_store/general_health_qa ]]; then
  echo -e "${YELLOW}β οΈ Medical Q&A databases already exist, skipping...${NC}"
else
  # Scratch directories for the mining script.
  mkdir -p data_mining/datasets data_mining/output
  # The miner (python3, then python as fallback) and the moves are tested
  # directly by `if`. With `set -e`, a failure outside a condition aborts
  # the whole script, so a separate `$?` check afterwards would never reach
  # the "continuing" branch or the cleanup below.
  #
  # This branch also runs when exactly one of the two stores already exists.
  # `mv` onto an existing directory would nest the new store inside it, so
  # stale targets are removed first, but only once both fresh outputs are
  # confirmed present.
  if { python3 data_mining/mining_medical_qa.py || python data_mining/mining_medical_qa.py; } \
    && [[ -d data_mining/output/symptom_qa_chroma && -d data_mining/output/general_health_qa_chroma ]] \
    && mkdir -p rag/vector_store \
    && rm -rf rag/vector_store/symptom_qa rag/vector_store/general_health_qa \
    && mv data_mining/output/symptom_qa_chroma rag/vector_store/symptom_qa \
    && mv data_mining/output/general_health_qa_chroma rag/vector_store/general_health_qa; then
    echo -e "${GREEN}β
Medical Q&A datasets ready (Symptom + General Health)${NC}"
  else
    echo -e "${YELLOW}β οΈ Medical Q&A setup failed, continuing...${NC}"
  fi
  # Remove the scratch directories whether or not the import succeeded.
  rm -rf data_mining/datasets data_mining/output
fi
# 11. Verify RAG
echo -e "\n${BLUE}β
Verifying RAG system...${NC}"
# Run the status checker with python3, then with python. Its stderr is
# discarded, and when both attempts fail only a notice is printed, so this
# step never stops the script.
if ! python3 scripts/check_rag_status.py 2>/dev/null; then
  if ! python scripts/check_rag_status.py 2>/dev/null; then
    echo "β οΈ Verification skipped"
  fi
fi
# 12. Generate Training Data (DISABLED - Not needed without fine-tuning)
# echo -e "\n${BLUE}π€ Generating synthetic training data...${NC}"
# echo -e "${YELLOW}This will create ~200 conversations for fine-tuning...${NC}"
#
# if [ -d "fine_tuning/training_data" ] && [ "$(ls -A fine_tuning/training_data 2>/dev/null)" ]; then
# echo -e "${YELLOW}β οΈ Training data already exists, skipping generation...${NC}"
# else
# python3 scripts/generate_training_data.py || python scripts/generate_training_data.py
# if [ $? -eq 0 ]; then
# echo -e "${GREEN}✅ Training data generated!${NC}"
# else
# echo -e "${YELLOW}β οΈ Training data generation failed, continuing...${NC}"
# fi
# fi
# 13. Fine-tune Models (DISABLED - Custom API doesn't support fine-tuning)
# Fine-tuning requires OpenAI official API, which costs money and is not necessary
# The app works well with base model + RAG without fine-tuning
#
# echo -e "\n${BLUE}π Fine-tuning agents...${NC}"
# echo -e "${YELLOW}This will fine-tune all agents with synthetic data (takes 30-60 min, costs ~\$2)${NC}"
# echo -e "${YELLOW}Do you want to fine-tune now? (y/N)${NC}"
# read -t 10 -n 1 -r FINETUNE_CHOICE || FINETUNE_CHOICE="n"
# echo
#
# if [[ $FINETUNE_CHOICE =~ ^[Yy]$ ]]; then
# echo -e "${BLUE}π Starting fine-tuning...${NC}"
# python3 scripts/auto_finetune.py || python scripts/auto_finetune.py
# if [ $? -eq 0 ]; then
# echo -e "${GREEN}✅ Fine-tuning complete!${NC}"
# else
# echo -e "${YELLOW}β οΈ Fine-tuning failed, check errors above${NC}"
# fi
# else
# echo -e "${YELLOW}βοΈ Skipping fine-tuning (you can run it later with: python scripts/auto_finetune.py)${NC}"
# fi
# Tell the user that steps 12 and 13 are disabled and why; each line is
# printed in yellow and followed by a colour reset.
for notice_line in \
  "\n${YELLOW}βΉοΈ Training data generation and fine-tuning are disabled" \
  "${YELLOW} Reason: Custom API doesn't support fine-tuning (404 error)" \
  "${YELLOW} App works well with base model + RAG without fine-tuning"; do
  echo -e "${notice_line}${NC}"
done
# Done
# Closing banner in green, then a plain-text summary of what was built,
# the next steps and a few tips.
echo -e "\n${GREEN}"
printf '%s\n' \
  "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ" \
  "β π Setup Complete! β" \
  "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
echo -e "${NC}"

echo -e "${BLUE}π What was set up:${NC}"
printf '%s\n' \
  " β
RAG databases (6 specialized databases, ~160 MB)" \
  " - ViMedical Diseases (603 diseases)" \
  " - Mental Health (16K conversations)" \
  " - Nutrition Plans" \
  " - Vietnamese Food (73 items)" \
  " - Fitness Exercises (1.66K)" \
  " - Medical Q&A (9.3K pairs)" \
  ""

echo -e "${BLUE}π Next steps:${NC}"
printf '%s\n' \
  " 1. python app.py" \
  " 2. Open http://localhost:7860 in your browser" \
  ""

echo -e "${BLUE}π‘ Tips:${NC}"
printf '%s\n' \
  " - Check RAG status: python scripts/check_rag_status.py" \
  " - App works with base model + RAG (no fine-tuning needed)" \
  ""
|