"""
ViMedical Disease Dataset - Download & Process
Downloads and processes Vietnamese medical disease dataset into ChromaDB
Dataset: PB3002/ViMedical_Disease (603 diseases, 12K+ examples)
"""
import requests
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
import os
import re


def download_vimedical():
    """Download ViMedical dataset from HuggingFace"""
    print("📥 Downloading ViMedical Disease dataset...")

    # HuggingFace dataset URL
    url = "https://huggingface.co/datasets/PB3002/ViMedical_Disease/resolve/main/ViMedical_Disease.csv"

    # Create datasets directory
    os.makedirs("data_mining/datasets", exist_ok=True)
    output_path = "data_mining/datasets/vimedical_disease.csv"

    try:
        # Download
        response = requests.get(url, timeout=60)
        response.raise_for_status()

        # Save
        with open(output_path, 'wb') as f:
            f.write(response.content)

        # Check file size
        file_size = os.path.getsize(output_path) / (1024 * 1024)  # MB
        print(f"✅ Downloaded: {output_path}")
        print(f"📊 File size: {file_size:.2f} MB")
        return True
    except Exception as e:
        print(f"❌ Download failed: {e}")
        return False
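

# Note: an alternative way to fetch the same data (a sketch only, not part of the
# original pipeline) is to go through the `datasets` library instead of a raw CSV
# download; the "train" split name is an assumption about the repo layout:
#
#     from datasets import load_dataset
#     ds = load_dataset("PB3002/ViMedical_Disease", split="train")
#     ds.to_pandas().to_csv("data_mining/datasets/vimedical_disease.csv", index=False)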


def extract_symptoms(question):
    """Extract the symptom description from a question string"""
    # Remove common question prefixes (roughly: "I have symptoms such as ",
    # "I frequently ", "I feel ", "I suffer from ", "I often ", "I have ")
    prefixes = [
        'Tôi đang có triệu chứng như ',
        'Tôi thường xuyên ',
        'Tôi cảm thấy ',
        'Tôi bị ',
        'Tôi hay ',
        'Tôi có '
    ]
    symptom = question
    for prefix in prefixes:
        if symptom.startswith(prefix):
            symptom = symptom[len(prefix):]
            break

    # Remove question suffixes (roughly: ". What disease do I have?",
    # ". What could I have?", ". What disease is that?")
    suffixes = [
        '. Tôi bị bệnh gì?',
        '. Tôi có thể bị gì?',
        '. Đó là bệnh gì?'
    ]
    for suffix in suffixes:
        if symptom.endswith(suffix):
            symptom = symptom[:-len(suffix)]
            break

    return symptom.strip()
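

# Illustrative example (hypothetical input, not taken from the dataset):
#   extract_symptoms("Tôi đang có triệu chứng như ho khan và sốt nhẹ. Tôi bị bệnh gì?")
#   -> "ho khan và sốt nhẹ"   (a dry cough and a mild fever)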


def process_vimedical():
    """Process ViMedical dataset and build ChromaDB"""
    print("\n🔨 Processing ViMedical dataset...")

    # Load dataset
    csv_path = "data_mining/datasets/vimedical_disease.csv"
    if not os.path.exists(csv_path):
        print(f"❌ Dataset not found: {csv_path}")
        return False

    df = pd.read_csv(csv_path)
    print(f"📊 Loaded {len(df)} records")
    print(f"📊 Unique diseases: {df['Disease'].nunique()}")

    # Initialize embedder
    print("🤖 Loading embedding model...")
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

    # Initialize ChromaDB
    print("💾 Initializing ChromaDB...")
    os.makedirs("data_mining/output", exist_ok=True)
    client = chromadb.PersistentClient(path="data_mining/output/medical_chroma")

    # Create collection
    collection = client.get_or_create_collection(
        name="medical_diseases",
        metadata={"hnsw:space": "cosine"}
    )

    # Group by disease
    print("📝 Processing diseases...")
    disease_groups = df.groupby('Disease')

    processed = 0
    for disease_name, group in disease_groups:
        # Extract symptoms from all questions
        symptoms = []
        for question in group['Question']:
            symptom = extract_symptoms(question)
            if symptom:
                symptoms.append(symptom)

        # Create document text ("Bệnh" = disease, "Triệu chứng" = symptoms)
        doc_text = f"Bệnh: {disease_name}\n\nTriệu chứng:\n"
        doc_text += "\n".join(f"- {s}" for s in symptoms[:10])  # Limit to 10 examples

        # Generate embedding
        embedding = embedder.encode(doc_text)

        # Add to ChromaDB
        collection.add(
            ids=[f"disease_{processed:04d}"],
            embeddings=[embedding.tolist()],
            documents=[doc_text],
            metadatas=[{
                'disease_name': disease_name,
                'num_examples': len(symptoms),
                'source': 'ViMedical_Disease'
            }]
        )

        processed += 1
        if processed % 50 == 0:
            print(f" Processed {processed}/{len(disease_groups)} diseases...")

    print(f"✅ Processed {processed} diseases")
    print("💾 Database saved to: data_mining/output/medical_chroma/")

    # Get database size
    db_path = "data_mining/output/medical_chroma"
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(db_path):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            total_size += os.path.getsize(filepath)
    print(f"📊 Database size: {total_size / (1024 * 1024):.2f} MB")

    return True
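

# The sketch below is illustrative only (not part of the original pipeline): it shows
# one way the resulting collection could be queried with a free-text symptom
# description. The function name `query_diseases` and its defaults are assumptions.
def query_diseases(symptom_text, n_results=3):
    """Return the diseases whose stored symptom profiles are closest to symptom_text"""
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')
    client = chromadb.PersistentClient(path="data_mining/output/medical_chroma")
    collection = client.get_or_create_collection(name="medical_diseases")

    # Embed the query the same way the documents were embedded
    query_embedding = embedder.encode(symptom_text)
    results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=n_results
    )
    # results['metadatas'][0] holds one metadata dict per returned document
    return [m['disease_name'] for m in results['metadatas'][0]]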


def main():
    """Main function - download and process"""
    print("=" * 60)
    print("ViMedical Disease Dataset - Download & Process")
    print("=" * 60)

    # Step 1: Download
    if not download_vimedical():
        return False

    # Step 2: Process
    if not process_vimedical():
        return False

    print("\n" + "=" * 60)
    print("✅ ViMedical dataset ready!")
    print("=" * 60)
    return True


if __name__ == "__main__":
    success = main()
    exit(0 if success else 1)