""" Nutrition Dataset - Download & Process Downloads and processes dietary recommendation data into ChromaDB Dataset: issai/LLM_for_Dietary_Recommendation_System (50 patient profiles) """ from datasets import load_dataset import pandas as pd import chromadb from sentence_transformers import SentenceTransformer import os def download_nutrition(): """Download Dietary Recommendation dataset from HuggingFace""" print("šŸ“„ Downloading Dietary Recommendation dataset...") print(" Source: issai/LLM_for_Dietary_Recommendation_System") try: dataset = load_dataset("issai/LLM_for_Dietary_Recommendation_System") os.makedirs("data_mining/datasets", exist_ok=True) df = dataset['train'].to_pandas() output_path = "data_mining/datasets/nutrition_diet.csv" df.to_csv(output_path, index=False) file_size = os.path.getsize(output_path) / (1024 * 1024) print(f"āœ… Downloaded: {output_path}") print(f"šŸ“Š Records: {len(df)}") print(f"šŸ“Š File size: {file_size:.2f} MB") return True except Exception as e: print(f"āŒ Download failed: {e}") return False def process_nutrition(): """Process Nutrition dataset and build ChromaDB""" print("\nšŸ”Ø Processing Nutrition dataset...") csv_path = "data_mining/datasets/nutrition_diet.csv" if not os.path.exists(csv_path): print(f"āŒ Dataset not found: {csv_path}") return False df = pd.read_csv(csv_path) print(f"šŸ“Š Loaded {len(df)} records") print("šŸ¤– Loading embedding model...") embedder = SentenceTransformer('keepitreal/vietnamese-sbert') print("šŸ’¾ Initializing ChromaDB...") os.makedirs("data_mining/output", exist_ok=True) client = chromadb.PersistentClient(path="data_mining/output/nutrition_chroma") collection = client.get_or_create_collection( name="nutrition", metadata={"hnsw:space": "cosine"} ) print("šŸ“ Processing nutrition data...") text_columns = [] for col in ['profile', 'recommendation', 'diet_plan', 'text', 'content']: if col in df.columns: text_columns.append(col) if not text_columns: text_columns = df.columns.tolist() print(f" Using columns: {text_columns}") processed = 0 for idx, row in df.iterrows(): text_parts = [] for col in text_columns: value = str(row[col]) if value and value != 'nan' and len(value) > 5: text_parts.append(f"{col}: {value}") text = "\n".join(text_parts) if len(text) < 20: continue embedding = embedder.encode(text) collection.add( ids=[f"nutrition_{processed:05d}"], embeddings=[embedding.tolist()], documents=[text], metadatas=[{ 'domain': 'nutrition', 'agent': 'NutritionAgent', 'source': 'LLM_Dietary_Recommendation', 'index': processed }] ) processed += 1 if (processed % 10) == 0: print(f" Processed {processed}/{len(df)} records...") print(f"āœ… Processed {processed} nutrition records") print(f"šŸ’¾ Database saved to: data_mining/output/nutrition_chroma/") db_path = "data_mining/output/nutrition_chroma" total_size = 0 for dirpath, dirnames, filenames in os.walk(db_path): for filename in filenames: filepath = os.path.join(dirpath, filename) total_size += os.path.getsize(filepath) print(f"šŸ“Š Database size: {total_size / (1024 * 1024):.2f} MB") return True def main(): """Main function - download and process""" print("=" * 60) print("Nutrition Dataset - Download & Process") print("=" * 60) if not download_nutrition(): return False if not process_nutrition(): return False print("\n" + "=" * 60) print("āœ… Nutrition dataset ready!") print("=" * 60) return True if __name__ == "__main__": success = main() exit(0 if success else 1)