#!/usr/bin/env python3
"""
Mining Script: Vietnamese Medical Q&A Dataset
Downloads and processes hungnm/vietnamese-medical-qa from HuggingFace
Splits into 2 collections: symptom_qa and general_health_qa
"""
import sys
import pandas as pd
from pathlib import Path

def download_medical_qa():
    """Download Vietnamese Medical Q&A dataset from HuggingFace"""
    try:
        from datasets import load_dataset

        print("πŸ“₯ Downloading Vietnamese Medical Q&A from HuggingFace...")
        print(" Source: hungnm/vietnamese-medical-qa")
        print(" Size: ~9,335 Q&A pairs")

        # Download dataset
        dataset = load_dataset("hungnm/vietnamese-medical-qa")
        df = dataset['train'].to_pandas()
        print(f"βœ… Downloaded: {len(df)} Q&A pairs")

        # Save to CSV
        output_dir = Path("data_mining/datasets")
        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / "vietnamese_medical_qa.csv"
        df.to_csv(output_path, index=False, encoding='utf-8')
        print(f"πŸ’Ύ Saved to: {output_path}")
        return df
    except ImportError:
        print("❌ Error: 'datasets' library not installed")
        print(" Install with: pip install datasets")
        return None
    except Exception as e:
        print(f"❌ Error downloading dataset: {e}")
        return None
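
# Note: the processing step below assumes the downloaded frame exposes
# 'question' and 'answer' columns (those names are used in the iterrows()
# loop in process_medical_qa); adjust there if the dataset schema differs.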

def is_symptom_question(question):
    """
    Classify if question is about SPECIFIC SYMPTOMS

    Returns:
        bool: True if symptom question, False if general health question
    """
    if not question or not isinstance(question, str):
        return False
    question_lower = question.lower()

    # Symptom keywords (high priority - user describing active symptoms).
    # NOTE: matching is plain substring search, so very short keywords
    # (e.g. 'ho' = cough) can over-match inside longer words.
    symptom_keywords = [
        # Pain
        'bα»‹ Δ‘au', 'Δ‘au', 'nhα»©c', 'tα»©c', 'Δ‘au nhα»©c',
        # Infection/Fever
        'bα»‹ sα»‘t', 'sα»‘t', 'viΓͺm', 'nhiα»…m trΓΉng', 'mα»§', 'sΖ°ng',
        # Digestive
        'buα»“n nΓ΄n', 'nΓ΄n', 'tiΓͺu chαΊ£y', 'tΓ‘o bΓ³n', 'Δ‘αΊ§y hΖ‘i',
        'ợ hΖ‘i', 'ợ chua', 'khΓ³ tiΓͺu',
        # Respiratory
        'ho', 'khΓ³ thở', 'nghαΊΉt mΕ©i', 'chαΊ£y nΖ°α»›c mΕ©i',
        'Δ‘au họng', 'khΓ n giọng',
        # Neurological
        'chΓ³ng mαΊ·t', 'hoa mαΊ―t', 'mαΊ₯t thΔƒng bαΊ±ng', 'Δ‘au Δ‘αΊ§u',
        # Skin
        'ngα»©a', 'phΓ‘t ban', 'nα»•i mαΊ©n', 'đỏ',
        # General symptoms
        'mệt mỏi', 'yαΊΏu', 'khΓ΄ng khỏe', 'bα»‹ α»‘m', 'khΓ³ chα»‹u'
    ]

    # General health keywords (prevention, knowledge, advice)
    general_keywords = [
        # Prevention
        'lΓ m sao để khΓ΄ng', 'phΓ²ng ngα»«a', 'trΓ‘nh', 'cΓ‘ch phΓ²ng',
        'lΓ m thαΊΏ nΓ o để', 'cΓ‘ch nΓ o để',
        # Knowledge questions
        'lΓ  gΓ¬', 'cΓ³ phαΊ£i', 'cΓ³ nΓͺn', 'nΓͺn khΓ΄ng',
        'tαΊ‘i sao', 'nguyΓͺn nhΓ’n', 'cΓ³ thể',
        # Advice/Recommendations
        'nΓͺn lΓ m gΓ¬', 'nΓͺn Δƒn gΓ¬', 'cΓ³ tα»‘t khΓ΄ng',
        'cΓ³ được khΓ΄ng', 'khuyΓͺn'
    ]

    # Count keyword matches
    symptom_score = sum(1 for kw in symptom_keywords if kw in question_lower)
    general_score = sum(1 for kw in general_keywords if kw in question_lower)

    # Decision logic
    if symptom_score > general_score:
        return True   # Symptom question
    elif general_score > symptom_score:
        return False  # General health question
    else:
        # Tie-breaker: "bα»‹" (roughly "suffering from") suggests an active condition
        return 'bα»‹' in question_lower
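
# Worked example of the scoring above (illustrative, not executed):
#   "TΓ΄i bα»‹ Δ‘au Δ‘αΊ§u vΓ  sα»‘t" matches 'bα»‹ Δ‘au', 'Δ‘au', 'Δ‘au Δ‘αΊ§u', 'sα»‘t'
#   -> symptom_score = 4 > general_score = 0 -> True (symptom question)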

def process_medical_qa():
    """Process and split into 2 ChromaDB collections"""
    try:
        from sentence_transformers import SentenceTransformer
        import chromadb

        print("\nπŸ”„ Processing Vietnamese Medical Q&A...")

        # Load CSV
        csv_path = Path("data_mining/datasets/vietnamese_medical_qa.csv")
        if not csv_path.exists():
            print(f"❌ Error: {csv_path} not found")
            return False
        df = pd.read_csv(csv_path, encoding='utf-8')
        print(f"πŸ“Š Loaded: {len(df)} Q&A pairs")

        # Initialize embedding model
        print("πŸ€– Loading embedding model: keepitreal/vietnamese-sbert...")
        embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

        # Output directory for both ChromaDB stores
        output_dir = Path("data_mining/output")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Split data
        symptom_data = []
        general_data = []
        print("πŸ” Classifying questions...")
        for idx, row in df.iterrows():
            question = str(row['question'])
            answer = str(row['answer'])
            # Combine Q&A into one document ("CÒu hỏi" = question, "Trả lời" = answer)
            text = f"CÒu hỏi: {question}\n\nTrả lời: {answer}"
            # Classify
            if is_symptom_question(question):
                symptom_data.append({
                    'id': f'symptom_qa_{idx}',
                    'text': text,
                    'question': question,
                    'answer': answer,
                    'type': 'symptom'
                })
            else:
                general_data.append({
                    'id': f'general_qa_{idx}',
                    'text': text,
                    'question': question,
                    'answer': answer,
                    'type': 'general'
                })

        print("βœ… Classification complete:")
        print(f" - Symptom Q&A: {len(symptom_data)} ({len(symptom_data)/len(df)*100:.1f}%)")
        print(f" - General Health Q&A: {len(general_data)} ({len(general_data)/len(df)*100:.1f}%)")

        # Create ChromaDB collections
        # 1. Symptom Q&A Collection
        print("\nπŸ“¦ Creating Symptom Q&A ChromaDB...")
        symptom_client = chromadb.PersistentClient(path=str(output_dir / "symptom_qa_chroma"))
        symptom_collection = symptom_client.get_or_create_collection(
            name="symptom_qa",
            metadata={"description": "Vietnamese Medical Q&A - Symptom Questions"}
        )

        # Batch insert symptom data
        batch_size = 100
        for i in range(0, len(symptom_data), batch_size):
            batch = symptom_data[i:i+batch_size]
            ids = [item['id'] for item in batch]
            texts = [item['text'] for item in batch]
            metadatas = [{
                'type': item['type'],
                'domain': 'symptom',
                'agent': 'SymptomAgent',
                'source': 'vietnamese-medical-qa'
            } for item in batch]
            # Generate embeddings
            embeddings = embedder.encode(texts, show_progress_bar=False)
            symptom_collection.add(
                ids=ids,
                embeddings=embeddings.tolist(),
                documents=texts,
                metadatas=metadatas
            )
            # Progress update every 500 records (5 batches)
            if (i + batch_size) % 500 == 0:
                print(f" Processed {min(i+batch_size, len(symptom_data))}/{len(symptom_data)} symptom Q&A...")
        print(f"βœ… Symptom Q&A ChromaDB created: {len(symptom_data)} records")

        # 2. General Health Q&A Collection
        print("\nπŸ“¦ Creating General Health Q&A ChromaDB...")
        general_client = chromadb.PersistentClient(path=str(output_dir / "general_health_qa_chroma"))
        general_collection = general_client.get_or_create_collection(
            name="general_health_qa",
            metadata={"description": "Vietnamese Medical Q&A - General Health Questions"}
        )

        # Batch insert general data
        for i in range(0, len(general_data), batch_size):
            batch = general_data[i:i+batch_size]
            ids = [item['id'] for item in batch]
            texts = [item['text'] for item in batch]
            metadatas = [{
                'type': item['type'],
                'domain': 'general_health',
                'agent': 'GeneralHealthAgent',
                'source': 'vietnamese-medical-qa'
            } for item in batch]
            # Generate embeddings
            embeddings = embedder.encode(texts, show_progress_bar=False)
            general_collection.add(
                ids=ids,
                embeddings=embeddings.tolist(),
                documents=texts,
                metadatas=metadatas
            )
            # Progress update every 500 records (5 batches)
            if (i + batch_size) % 500 == 0:
                print(f" Processed {min(i+batch_size, len(general_data))}/{len(general_data)} general Q&A...")
        print(f"βœ… General Health Q&A ChromaDB created: {len(general_data)} records")

        print("\nβœ… Processing complete!")
        print(f" Output: {output_dir}")
        print(f" - symptom_qa_chroma/ ({len(symptom_data)} records)")
        print(f" - general_health_qa_chroma/ ({len(general_data)} records)")
        return True
    except ImportError as e:
        print(f"❌ Error: Missing library - {e}")
        print(" Install with: pip install sentence-transformers chromadb")
        return False
    except Exception as e:
        print(f"❌ Error processing dataset: {e}")
        import traceback
        traceback.print_exc()
        return False
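
# Minimal retrieval sketch against the collections built above (illustrative
# only, not executed by this script; assumes the same embedding model):
#   client = chromadb.PersistentClient(path="data_mining/output/symptom_qa_chroma")
#   collection = client.get_collection("symptom_qa")
#   query_vec = embedder.encode(["TΓ΄i bα»‹ Δ‘au Δ‘αΊ§u"]).tolist()
#   hits = collection.query(query_embeddings=query_vec, n_results=3)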

def main():
    """Main execution"""
    print("=" * 60)
    print("Vietnamese Medical Q&A Dataset Mining")
    print("Source: hungnm/vietnamese-medical-qa (HuggingFace)")
    print("=" * 60)

    # Step 1: Download
    df = download_medical_qa()
    if df is None:
        print("\n❌ Download failed!")
        return False

    # Step 2: Process
    success = process_medical_qa()
    if not success:
        print("\n❌ Processing failed!")
        return False

    print("\n" + "=" * 60)
    print("βœ… SUCCESS! Vietnamese Medical Q&A ready for RAG system")
    print("=" * 60)
    return True


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)