Spaces:

lapnt3
/

my-gradio-app

Runtime error

File size: 5,336 Bytes

eeb0f9c

"""
ViMedical Disease Dataset - Download & Process
Downloads the Vietnamese medical disease dataset and processes it into ChromaDB
Dataset: PB3002/ViMedical_Disease (603 diseases, 12K+ examples)
"""

import requests
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
import os
import re

def download_vimedical():
    """Download the ViMedical disease CSV from HuggingFace.

    The file is fetched from the ``PB3002/ViMedical_Disease`` dataset
    repository and written to ``data_mining/datasets/vimedical_disease.csv``.
    The target directory is created when it does not exist yet.

    Returns:
        bool: ``True`` when the file was downloaded and saved; ``False`` when
        the HTTP request or any filesystem step failed (the error is printed).
    """

    print("📥 Downloading ViMedical Disease dataset...")

    # HuggingFace dataset URL
    url = "https://huggingface.co/datasets/PB3002/ViMedical_Disease/resolve/main/ViMedical_Disease.csv"
    output_path = "data_mining/datasets/vimedical_disease.csv"

    try:
        # Directory creation lives inside the try block so that a filesystem
        # error is reported through the False return like any other failure.
        os.makedirs("data_mining/datasets", exist_ok=True)

        # Download; raise_for_status turns HTTP 4xx/5xx into an exception.
        response = requests.get(url, timeout=60)
        response.raise_for_status()

        # Save
        with open(output_path, 'wb') as f:
            f.write(response.content)

        # Check file size
        file_size = os.path.getsize(output_path) / (1024 * 1024)  # MB
    except (requests.RequestException, OSError) as e:
        # Only network and filesystem failures are expected here; anything
        # else is a programming error and is allowed to propagate.
        print(f"❌ Download failed: {e}")
        return False

    print(f"✅ Downloaded: {output_path}")
    print(f"📊 File size: {file_size:.2f} MB")

    return True

def extract_symptoms(question):
    """Extract the symptom description from a patient question.

    At most one known leading phrase (e.g. ``'Tôi bị '``) and at most one
    known trailing question (e.g. ``'. Tôi bị bệnh gì?'``) are removed; the
    remaining text is returned with surrounding whitespace trimmed.

    Args:
        question: Raw question text. Non-string values (``None``, or the float
            NaN that pandas yields for empty cells) are treated as carrying no
            symptom text.

    Returns:
        str: The symptom description, or ``""`` when nothing is left or the
        input is not a string.
    """
    if not isinstance(question, str):
        return ""

    # Remove common prefixes
    prefixes = (
        'Tôi đang có triệu chứng như ',
        'Tôi thường xuyên ',
        'Tôi cảm thấy ',
        'Tôi bị ',
        'Tôi hay ',
        'Tôi có ',
    )

    # Remove question suffix
    suffixes = (
        '. Tôi bị bệnh gì?',
        '. Tôi có thể bị gì?',
        '. Đó là bệnh gì?',
    )

    # Drop leading whitespace first so it cannot hide a prefix.
    symptom = question.lstrip()
    for prefix in prefixes:
        if symptom.startswith(prefix):
            symptom = symptom[len(prefix):]
            break

    # Likewise drop trailing whitespace/newlines before looking for a suffix.
    symptom = symptom.rstrip()
    for suffix in suffixes:
        if symptom.endswith(suffix):
            symptom = symptom[:-len(suffix)]
            break

    return symptom.strip()

def process_vimedical():
    """Build the ChromaDB disease collection from the downloaded ViMedical CSV.

    Reads ``data_mining/datasets/vimedical_disease.csv`` and groups its rows by
    the ``Disease`` column. For each disease it builds one document made of
    the disease name plus up to 10 symptom descriptions extracted from the
    ``Question`` column. Each document is embedded with the
    ``keepitreal/vietnamese-sbert`` model and written to the persistent
    ``medical_diseases`` collection under ``data_mining/output/medical_chroma``.

    Returns:
        bool: ``True`` on success; ``False`` when the CSV file is missing or
        does not contain the ``Disease`` and ``Question`` columns.
    """

    print("\n🔨 Processing ViMedical dataset...")

    csv_path = "data_mining/datasets/vimedical_disease.csv"
    db_path = "data_mining/output/medical_chroma"

    # Load dataset
    if not os.path.exists(csv_path):
        print(f"❌ Dataset not found: {csv_path}")
        return False

    df = pd.read_csv(csv_path)
    print(f"📊 Loaded {len(df)} records")

    missing_columns = [c for c in ('Disease', 'Question') if c not in df.columns]
    if missing_columns:
        print(f"❌ Dataset is missing required columns: {missing_columns}")
        return False

    # Empty cells are read as NaN (a float); drop those rows so that the
    # string handling below never sees a non-text question.
    complete = df.dropna(subset=['Disease', 'Question'])
    if len(complete) != len(df):
        print(f"⚠️ Skipped {len(df) - len(complete)} incomplete records")
    df = complete

    print(f"📊 Unique diseases: {df['Disease'].nunique()}")

    # Initialize embedder
    print("🤖 Loading embedding model...")
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

    # Initialize ChromaDB
    print("💾 Initializing ChromaDB...")
    os.makedirs("data_mining/output", exist_ok=True)
    client = chromadb.PersistentClient(path=db_path)

    # Create collection
    collection = client.get_or_create_collection(
        name="medical_diseases",
        metadata={"hnsw:space": "cosine"}
    )

    # Group by disease
    print("📝 Processing diseases...")
    disease_groups = df.groupby('Disease')
    total_diseases = len(disease_groups)

    processed = 0
    for disease_name, group in disease_groups:
        # Extract symptoms from all questions, discarding empty results
        extracted = (extract_symptoms(str(q)) for q in group['Question'])
        symptoms = [s for s in extracted if s]

        # Create document text
        doc_text = f"Bệnh: {disease_name}\n\nTriệu chứng:\n"
        doc_text += "\n".join(f"- {s}" for s in symptoms[:10])  # Limit to 10 examples

        # Generate embedding
        embedding = embedder.encode(doc_text)

        # The collection is opened with get_or_create, so on a re-run these
        # ids already exist; upsert overwrites them instead of colliding.
        collection.upsert(
            ids=[f"disease_{processed:04d}"],
            embeddings=[embedding.tolist()],
            documents=[doc_text],
            metadatas=[{
                'disease_name': disease_name,
                'num_examples': len(symptoms),
                'source': 'ViMedical_Disease'
            }]
        )

        processed += 1
        if processed % 50 == 0:
            print(f"  Processed {processed}/{total_diseases} diseases...")

    print(f"✅ Processed {processed} diseases")
    print(f"💾 Database saved to: {db_path}/")

    # Get database size by summing every file under the database directory
    total_size = sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _dirnames, filenames in os.walk(db_path)
        for filename in filenames
    )

    print(f"📊 Database size: {total_size / (1024 * 1024):.2f} MB")

    return True

def main():
    """Run the full pipeline: download the dataset, then process it.

    Returns:
        bool: ``True`` when both steps succeed; ``False`` as soon as one of
        them reports failure (processing is not attempted after a failed
        download).
    """
    banner = "=" * 60

    print(banner)
    print("ViMedical Disease Dataset - Download & Process")
    print(banner)

    # Step 1 (download) must succeed before step 2 (process) is attempted;
    # `and` short-circuits, so a failed download skips processing.
    pipeline_ok = download_vimedical() and process_vimedical()
    if not pipeline_ok:
        return False

    print("\n" + banner)
    print("✅ ViMedical dataset ready!")
    print(banner)
    return True

if __name__ == "__main__":
    # Exit with status 0 on success and 1 on failure. SystemExit is raised
    # directly because the `exit` helper is injected by the `site` module for
    # interactive use and is not guaranteed to exist (e.g. under `python -S`).
    success = main()
    raise SystemExit(0 if success else 1)