Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| """ | |
| Mining Script: Vietnamese Medical Q&A Dataset | |
| Downloads and processes hungnm/vietnamese-medical-qa from HuggingFace | |
| Splits into 2 collections: symptom_qa and general_health_qa | |
| """ | |
import re
import sys
import unicodedata
from pathlib import Path

import pandas as pd
def download_medical_qa():
    """Download the Vietnamese Medical Q&A dataset and cache it as a CSV file.

    Loads ``hungnm/vietnamese-medical-qa`` through the HuggingFace
    ``datasets`` library, converts its ``train`` split to a pandas DataFrame
    and writes it (UTF-8, without the index column) to
    ``data_mining/datasets/vietnamese_medical_qa.csv``. The target directory
    is created when it does not exist.

    Returns:
        The downloaded DataFrame, or ``None`` when the ``datasets`` library
        cannot be imported or the download/save step fails. Failures are
        reported on stdout rather than raised.
    """
    # Import in a dedicated ``try`` so that only a failing import of
    # ``datasets`` itself produces the "not installed" hint. An ImportError
    # raised later, inside load_dataset(), is reported by the generic handler
    # below with its real message.
    try:
        from datasets import load_dataset
    except ImportError:
        print("❌ Error: 'datasets' library not installed")
        print(" Install with: pip install datasets")
        return None

    try:
        print("📥 Downloading Vietnamese Medical Q&A from HuggingFace...")
        print(" Source: hungnm/vietnamese-medical-qa")
        print(" Size: ~9,335 Q&A pairs")

        # Download dataset
        dataset = load_dataset("hungnm/vietnamese-medical-qa")
        df = dataset['train'].to_pandas()
        print(f"✅ Downloaded: {len(df)} Q&A pairs")

        # Save to CSV
        output_dir = Path("data_mining/datasets")
        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / "vietnamese_medical_qa.csv"
        df.to_csv(output_path, index=False, encoding='utf-8')
        print(f"💾 Saved to: {output_path}")
        return df
    except Exception as e:
        # Top-level boundary of this step: report the failure and signal it
        # to the caller through the ``None`` return value.
        print(f"❌ Error downloading dataset: {e}")
        return None
# Keywords suggesting the asker is describing an active symptom.
# Overlapping entries (e.g. 'đau' and 'đau đầu') are intentional: each entry
# that occurs in the question contributes one point.
_SYMPTOM_KEYWORDS = (
    # Pain
    'bị đau', 'đau', 'nhức', 'tức', 'đau nhức',
    # Infection/Fever
    'bị sốt', 'sốt', 'viêm', 'nhiễm trùng', 'mủ', 'sưng',
    # Digestive
    'buồn nôn', 'nôn', 'tiêu chảy', 'táo bón', 'đầy hơi',
    'ợ hơi', 'ợ chua', 'khó tiêu',
    # Respiratory
    'ho', 'khó thở', 'nghẹt mũi', 'chảy nước mũi',
    'đau họng', 'khàn giọng',
    # Neurological
    'chóng mặt', 'hoa mắt', 'mất thăng bằng', 'đau đầu',
    # Skin
    'ngứa', 'phát ban', 'nổi mẩn', 'đỏ',
    # General symptoms
    'mệt mỏi', 'yếu', 'không khỏe', 'bị ốm', 'khó chịu',
)

# Keywords suggesting a prevention / knowledge / advice question.
_GENERAL_KEYWORDS = (
    # Prevention
    'làm sao để không', 'phòng ngừa', 'tránh', 'cách phòng',
    'làm thế nào để', 'cách nào để',
    # Knowledge questions
    'là gì', 'có phải', 'có nên', 'nên không',
    'tại sao', 'nguyên nhân', 'có thể',
    # Advice/Recommendations
    'nên làm gì', 'nên ăn gì', 'có tốt không',
    'có được không', 'khuyên',
)


def _compile_keywords(keywords):
    """Return one whole-syllable regex per distinct keyword.

    Duplicates are dropped so that no keyword is counted twice. The
    ``(?<!\\w)`` / ``(?!\\w)`` guards stop a keyword from matching inside a
    longer syllable (e.g. 'ho' inside 'cho' or 'khoa').
    """
    return tuple(
        re.compile(
            rf"(?<!\w){re.escape(unicodedata.normalize('NFC', keyword))}(?!\w)"
        )
        for keyword in dict.fromkeys(keywords)
    )


_SYMPTOM_PATTERNS = _compile_keywords(_SYMPTOM_KEYWORDS)
_GENERAL_PATTERNS = _compile_keywords(_GENERAL_KEYWORDS)
# 'bị' marks that the asker has/suffers a condition; used only to break ties.
_CONDITION_PATTERN = _compile_keywords(('bị',))[0]


def is_symptom_question(question) -> bool:
    """Classify whether a question describes SPECIFIC SYMPTOMS.

    The question is NFC-normalised, lower-cased and whitespace-collapsed,
    then scored against two keyword lists. Each distinct keyword that occurs
    as whole syllable(s) adds one point to its list's score.

    Args:
        question: The question text. Anything that is not a non-empty
            ``str`` is treated as "not a symptom question".

    Returns:
        bool: True if symptom question, False if general health question.
        On equal scores the result is True only when the question contains
        the word 'bị'.
    """
    if not question or not isinstance(question, str):
        return False

    # Normalise so that decomposed (NFD) input and irregular spacing match
    # the precomposed, single-space keywords.
    normalized = unicodedata.normalize('NFC', question).lower()
    normalized = " ".join(normalized.split())

    # Count keyword matches
    symptom_score = sum(
        1 for pattern in _SYMPTOM_PATTERNS if pattern.search(normalized)
    )
    general_score = sum(
        1 for pattern in _GENERAL_PATTERNS if pattern.search(normalized)
    )

    # Decision logic
    if symptom_score != general_score:
        return symptom_score > general_score
    # Tie-breaker: "bị" indicates having a condition
    return _CONDITION_PATTERN.search(normalized) is not None
def _split_by_question_type(df):
    """Build the symptom / general record lists from the Q&A rows of *df*.

    Rows whose ``question`` or ``answer`` cell is missing are skipped:
    ``str()`` on a missing value would otherwise store the literal text
    "nan" as a document.

    Returns:
        A ``(symptom_data, general_data, skipped)`` tuple: two lists of
        record dicts (``id``, ``text``, ``question``, ``answer``, ``type``)
        and the number of skipped rows.
    """
    symptom_data = []
    general_data = []
    skipped = 0
    for idx, row in df.iterrows():
        if pd.isna(row['question']) or pd.isna(row['answer']):
            skipped += 1
            continue
        question = str(row['question'])
        answer = str(row['answer'])
        # Combine Q&A into the single document that gets embedded
        text = f"Câu hỏi: {question}\n\nTrả lời: {answer}"
        if is_symptom_question(question):
            kind, bucket = 'symptom', symptom_data
        else:
            kind, bucket = 'general', general_data
        bucket.append({
            'id': f'{kind}_qa_{idx}',
            'text': text,
            'question': question,
            'answer': answer,
            'type': kind,
        })
    return symptom_data, general_data, skipped


def _add_records(collection, embedder, records, *, domain, agent, label,
                 batch_size=100):
    """Embed *records* in batches and add them to *collection*.

    Each record is stored with its ``id``, its ``text`` as the document, the
    embedding produced by ``embedder.encode`` and a metadata dict carrying
    the record ``type`` plus the given *domain* and *agent*. *label* is only
    used in the progress message.
    """
    total = len(records)
    for start in range(0, total, batch_size):
        batch = records[start:start + batch_size]
        texts = [item['text'] for item in batch]
        # Generate embeddings
        embeddings = embedder.encode(texts, show_progress_bar=False)
        collection.add(
            ids=[item['id'] for item in batch],
            embeddings=embeddings.tolist(),
            documents=texts,
            metadatas=[{
                'type': item['type'],
                'domain': domain,
                'agent': agent,
                'source': 'vietnamese-medical-qa',
            } for item in batch],
        )
        processed = start + batch_size
        if processed % 500 == 0:
            print(f" Processed {min(processed, total)}/{total} {label} Q&A...")


def process_medical_qa():
    """Process the cached CSV and split it into 2 ChromaDB collections.

    Reads ``data_mining/datasets/vietnamese_medical_qa.csv``, classifies each
    row with ``is_symptom_question``, embeds the combined question/answer
    text with the ``keepitreal/vietnamese-sbert`` model and stores the
    records in two persistent ChromaDB databases under
    ``data_mining/output``: ``symptom_qa_chroma`` (collection ``symptom_qa``)
    and ``general_health_qa_chroma`` (collection ``general_health_qa``).

    Returns:
        ``True`` on success. ``False`` when a required library is missing,
        the CSV file does not exist, it contains no usable rows, or any
        other error occurs (the error is printed, not raised).
    """
    # Keep the import check separate so that only a missing library triggers
    # the "pip install" hint.
    try:
        from sentence_transformers import SentenceTransformer
        import chromadb
    except ImportError as e:
        print(f"❌ Error: Missing library - {e}")
        print(" Install with: pip install sentence-transformers chromadb")
        return False

    try:
        print("\n🔄 Processing Vietnamese Medical Q&A...")

        # Load CSV
        csv_path = Path("data_mining/datasets/vietnamese_medical_qa.csv")
        if not csv_path.exists():
            print(f"❌ Error: {csv_path} not found")
            return False
        df = pd.read_csv(csv_path, encoding='utf-8')
        print(f"📊 Loaded: {len(df)} Q&A pairs")

        # Initialize embedding model
        print("🤖 Loading embedding model: keepitreal/vietnamese-sbert...")
        embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

        # Initialize ChromaDB output location
        output_dir = Path("data_mining/output")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Split data
        print("🔍 Classifying questions...")
        symptom_data, general_data, skipped = _split_by_question_type(df)
        if skipped:
            print(f" Skipped {skipped} rows with a missing question or answer")
        total = len(symptom_data) + len(general_data)
        if total == 0:
            # Guard the percentage computation below against division by zero.
            print(f"❌ Error: no usable Q&A rows in {csv_path}")
            return False
        print("✅ Classification complete:")
        print(f" - Symptom Q&A: {len(symptom_data)} ({len(symptom_data)/total*100:.1f}%)")
        print(f" - General Health Q&A: {len(general_data)} ({len(general_data)/total*100:.1f}%)")

        # 1. Symptom Q&A Collection
        print("\n📦 Creating Symptom Q&A ChromaDB...")
        symptom_client = chromadb.PersistentClient(path=str(output_dir / "symptom_qa_chroma"))
        symptom_collection = symptom_client.get_or_create_collection(
            name="symptom_qa",
            metadata={"description": "Vietnamese Medical Q&A - Symptom Questions"},
        )
        _add_records(
            symptom_collection, embedder, symptom_data,
            domain='symptom', agent='SymptomAgent', label='symptom',
        )
        print(f"✅ Symptom Q&A ChromaDB created: {len(symptom_data)} records")

        # 2. General Health Q&A Collection
        print("\n📦 Creating General Health Q&A ChromaDB...")
        general_client = chromadb.PersistentClient(path=str(output_dir / "general_health_qa_chroma"))
        general_collection = general_client.get_or_create_collection(
            name="general_health_qa",
            metadata={"description": "Vietnamese Medical Q&A - General Health Questions"},
        )
        _add_records(
            general_collection, embedder, general_data,
            domain='general_health', agent='GeneralHealthAgent', label='general',
        )
        print(f"✅ General Health Q&A ChromaDB created: {len(general_data)} records")

        print("\n✅ Processing complete!")
        print(f" Output: {output_dir}")
        print(f" - symptom_qa_chroma/ ({len(symptom_data)} records)")
        print(f" - general_health_qa_chroma/ ({len(general_data)} records)")
        return True
    except Exception as e:
        # Top-level boundary of this step: print the error with its traceback
        # and report failure through the return value.
        print(f"❌ Error processing dataset: {e}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """Run the whole pipeline: download the dataset, then index it.

    Returns:
        ``True`` when both steps succeed; ``False`` as soon as one of them
        reports failure (a message is printed in that case).
    """
    banner = "=" * 60
    print(banner)
    print("Vietnamese Medical Q&A Dataset Mining")
    print("Source: hungnm/vietnamese-medical-qa (HuggingFace)")
    print(banner)

    # Step 1: Download
    # Only success/failure matters here: the processing step reads the CSV
    # file from disk rather than taking the DataFrame as an argument.
    if download_medical_qa() is None:
        print("\n❌ Download failed!")
        return False

    # Step 2: Process
    if not process_medical_qa():
        print("\n❌ Processing failed!")
        return False

    print("\n" + banner)
    print("✅ SUCCESS! Vietnamese Medical Q&A ready for RAG system")
    print(banner)
    return True
if __name__ == "__main__":
    # Script entry point: map main()'s boolean result onto the process exit
    # status (0 = success, 1 = failure).
    success = main()
    exit_code = 0 if success else 1
    sys.exit(exit_code)