#!/usr/bin/env python3
"""
Mining Script: Vietnamese Medical Q&A Dataset
Downloads and processes hungnm/vietnamese-medical-qa from HuggingFace
Splits into 2 collections: symptom_qa and general_health_qa
"""
import sys
import pandas as pd
from pathlib import Path

def download_medical_qa():
    """Download Vietnamese Medical Q&A dataset from HuggingFace"""
    try:
        from datasets import load_dataset

        print("πŸ“₯ Downloading Vietnamese Medical Q&A from HuggingFace...")
        print(" Source: hungnm/vietnamese-medical-qa")
        print(" Size: ~9,335 Q&A pairs")

        # Download dataset
        dataset = load_dataset("hungnm/vietnamese-medical-qa")
        df = dataset['train'].to_pandas()
        print(f"βœ… Downloaded: {len(df)} Q&A pairs")

        # Save to CSV
        output_dir = Path("data_mining/datasets")
        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / "vietnamese_medical_qa.csv"
        df.to_csv(output_path, index=False, encoding='utf-8')
        print(f"πŸ’Ύ Saved to: {output_path}")
        return df
    except ImportError:
        print("❌ Error: 'datasets' library not installed")
        print(" Install with: pip install datasets")
        return None
    except Exception as e:
        print(f"❌ Error downloading dataset: {e}")
        return None
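
# Note: the processing step below assumes the downloaded frame exposes
# 'question' and 'answer' columns (those names are used in the iterrows()
# loop in process_medical_qa); adjust there if the dataset schema differs.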

def is_symptom_question(question):
    """
    Classify if question is about SPECIFIC SYMPTOMS

    Returns:
        bool: True if symptom question, False if general health question
    """
    if not question or not isinstance(question, str):
        return False
    question_lower = question.lower()

    # Symptom keywords (high priority - user describing active symptoms).
    # NOTE: matching is plain substring search, so very short keywords
    # (e.g. 'ho' = cough) can over-match inside longer words.
    symptom_keywords = [
        # Pain
        'bα»‹ Δ‘au', 'Δ‘au', 'nhα»©c', 'tα»©c', 'Δ‘au nhα»©c',
        # Infection/Fever
        'bα»‹ sα»‘t', 'sα»‘t', 'viΓͺm', 'nhiα»…m trΓΉng', 'mα»§', 'sΖ°ng',
        # Digestive
        'buα»“n nΓ΄n', 'nΓ΄n', 'tiΓͺu chαΊ£y', 'tΓ‘o bΓ³n', 'Δ‘αΊ§y hΖ‘i',
        'ợ hΖ‘i', 'ợ chua', 'khΓ³ tiΓͺu',
        # Respiratory
        'ho', 'khΓ³ thở', 'nghαΊΉt mΕ©i', 'chαΊ£y nΖ°α»›c mΕ©i',
        'Δ‘au họng', 'khΓ n giọng',
        # Neurological
        'chΓ³ng mαΊ·t', 'hoa mαΊ―t', 'mαΊ₯t thΔƒng bαΊ±ng', 'Δ‘au Δ‘αΊ§u',
        # Skin
        'ngα»©a', 'phΓ‘t ban', 'nα»•i mαΊ©n', 'đỏ',
        # General symptoms
        'mệt mỏi', 'yαΊΏu', 'khΓ΄ng khỏe', 'bα»‹ α»‘m', 'khΓ³ chα»‹u'
    ]

    # General health keywords (prevention, knowledge, advice)
    general_keywords = [
        # Prevention
        'lΓ m sao để khΓ΄ng', 'phΓ²ng ngα»«a', 'trΓ‘nh', 'cΓ‘ch phΓ²ng',
        'lΓ m thαΊΏ nΓ o để', 'cΓ‘ch nΓ o để',
        # Knowledge questions
        'lΓ  gΓ¬', 'cΓ³ phαΊ£i', 'cΓ³ nΓͺn', 'nΓͺn khΓ΄ng',
        'tαΊ‘i sao', 'nguyΓͺn nhΓ’n', 'cΓ³ thể',
        # Advice/Recommendations
        'nΓͺn lΓ m gΓ¬', 'nΓͺn Δƒn gΓ¬', 'cΓ³ tα»‘t khΓ΄ng',
        'cΓ³ được khΓ΄ng', 'khuyΓͺn'
    ]

    # Count keyword matches
    symptom_score = sum(1 for kw in symptom_keywords if kw in question_lower)
    general_score = sum(1 for kw in general_keywords if kw in question_lower)

    # Decision logic
    if symptom_score > general_score:
        return True   # Symptom question
    elif general_score > symptom_score:
        return False  # General health question
    else:
        # Tie-breaker: "bα»‹" (roughly "suffering from") suggests an active condition
        return 'bα»‹' in question_lower
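
# Worked example of the scoring above (illustrative, not executed):
#   "TΓ΄i bα»‹ Δ‘au Δ‘αΊ§u vΓ  sα»‘t" matches 'bα»‹ Δ‘au', 'Δ‘au', 'Δ‘au Δ‘αΊ§u', 'sα»‘t'
#   -> symptom_score = 4 > general_score = 0 -> True (symptom question)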

def process_medical_qa():
    """Process and split into 2 ChromaDB collections"""
    try:
        from sentence_transformers import SentenceTransformer
        import chromadb

        print("\nπŸ”„ Processing Vietnamese Medical Q&A...")

        # Load CSV
        csv_path = Path("data_mining/datasets/vietnamese_medical_qa.csv")
        if not csv_path.exists():
            print(f"❌ Error: {csv_path} not found")
            return False
        df = pd.read_csv(csv_path, encoding='utf-8')
        print(f"πŸ“Š Loaded: {len(df)} Q&A pairs")

        # Initialize embedding model
        print("πŸ€– Loading embedding model: keepitreal/vietnamese-sbert...")
        embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

        # Output directory for both ChromaDB stores
        output_dir = Path("data_mining/output")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Split data
        symptom_data = []
        general_data = []
        print("πŸ” Classifying questions...")
        for idx, row in df.iterrows():
            question = str(row['question'])
            answer = str(row['answer'])
            # Combine Q&A into one document ("CÒu hỏi" = question, "Trả lời" = answer)
            text = f"CÒu hỏi: {question}\n\nTrả lời: {answer}"
            # Classify
            if is_symptom_question(question):
                symptom_data.append({
                    'id': f'symptom_qa_{idx}',
                    'text': text,
                    'question': question,
                    'answer': answer,
                    'type': 'symptom'
                })
            else:
                general_data.append({
                    'id': f'general_qa_{idx}',
                    'text': text,
                    'question': question,
                    'answer': answer,
                    'type': 'general'
                })

        print("βœ… Classification complete:")
        print(f" - Symptom Q&A: {len(symptom_data)} ({len(symptom_data)/len(df)*100:.1f}%)")
        print(f" - General Health Q&A: {len(general_data)} ({len(general_data)/len(df)*100:.1f}%)")

        # Create ChromaDB collections
        # 1. Symptom Q&A Collection
        print("\nπŸ“¦ Creating Symptom Q&A ChromaDB...")
        symptom_client = chromadb.PersistentClient(path=str(output_dir / "symptom_qa_chroma"))
        symptom_collection = symptom_client.get_or_create_collection(
            name="symptom_qa",
            metadata={"description": "Vietnamese Medical Q&A - Symptom Questions"}
        )

        # Batch insert symptom data
        batch_size = 100
        for i in range(0, len(symptom_data), batch_size):
            batch = symptom_data[i:i+batch_size]
            ids = [item['id'] for item in batch]
            texts = [item['text'] for item in batch]
            metadatas = [{
                'type': item['type'],
                'domain': 'symptom',
                'agent': 'SymptomAgent',
                'source': 'vietnamese-medical-qa'
            } for item in batch]
            # Generate embeddings
            embeddings = embedder.encode(texts, show_progress_bar=False)
            symptom_collection.add(
                ids=ids,
                embeddings=embeddings.tolist(),
                documents=texts,
                metadatas=metadatas
            )
            # Progress update every 500 records (5 batches)
            if (i + batch_size) % 500 == 0:
                print(f" Processed {min(i+batch_size, len(symptom_data))}/{len(symptom_data)} symptom Q&A...")
        print(f"βœ… Symptom Q&A ChromaDB created: {len(symptom_data)} records")

        # 2. General Health Q&A Collection
        print("\nπŸ“¦ Creating General Health Q&A ChromaDB...")
        general_client = chromadb.PersistentClient(path=str(output_dir / "general_health_qa_chroma"))
        general_collection = general_client.get_or_create_collection(
            name="general_health_qa",
            metadata={"description": "Vietnamese Medical Q&A - General Health Questions"}
        )

        # Batch insert general data
        for i in range(0, len(general_data), batch_size):
            batch = general_data[i:i+batch_size]
            ids = [item['id'] for item in batch]
            texts = [item['text'] for item in batch]
            metadatas = [{
                'type': item['type'],
                'domain': 'general_health',
                'agent': 'GeneralHealthAgent',
                'source': 'vietnamese-medical-qa'
            } for item in batch]
            # Generate embeddings
            embeddings = embedder.encode(texts, show_progress_bar=False)
            general_collection.add(
                ids=ids,
                embeddings=embeddings.tolist(),
                documents=texts,
                metadatas=metadatas
            )
            # Progress update every 500 records (5 batches)
            if (i + batch_size) % 500 == 0:
                print(f" Processed {min(i+batch_size, len(general_data))}/{len(general_data)} general Q&A...")
        print(f"βœ… General Health Q&A ChromaDB created: {len(general_data)} records")

        print("\nβœ… Processing complete!")
        print(f" Output: {output_dir}")
        print(f" - symptom_qa_chroma/ ({len(symptom_data)} records)")
        print(f" - general_health_qa_chroma/ ({len(general_data)} records)")
        return True
    except ImportError as e:
        print(f"❌ Error: Missing library - {e}")
        print(" Install with: pip install sentence-transformers chromadb")
        return False
    except Exception as e:
        print(f"❌ Error processing dataset: {e}")
        import traceback
        traceback.print_exc()
        return False
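
# Minimal retrieval sketch against the collections built above (illustrative
# only, not executed by this script; assumes the same embedding model):
#   client = chromadb.PersistentClient(path="data_mining/output/symptom_qa_chroma")
#   collection = client.get_collection("symptom_qa")
#   query_vec = embedder.encode(["TΓ΄i bα»‹ Δ‘au Δ‘αΊ§u"]).tolist()
#   hits = collection.query(query_embeddings=query_vec, n_results=3)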

def main():
    """Main execution"""
    print("=" * 60)
    print("Vietnamese Medical Q&A Dataset Mining")
    print("Source: hungnm/vietnamese-medical-qa (HuggingFace)")
    print("=" * 60)

    # Step 1: Download
    df = download_medical_qa()
    if df is None:
        print("\n❌ Download failed!")
        return False

    # Step 2: Process
    success = process_medical_qa()
    if not success:
        print("\n❌ Processing failed!")
        return False

    print("\n" + "=" * 60)
    print("βœ… SUCCESS! Vietnamese Medical Q&A ready for RAG system")
    print("=" * 60)
    return True


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)