"""
Fitness Dataset - Download & Process

Downloads and processes gym exercise data into ChromaDB.
Dataset: onurSakar/GYM-Exercise (1.66K exercises)
"""
import os

import chromadb
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
def download_fitness():
    """Download GYM Exercise dataset from HuggingFace"""
    print("π₯ Downloading GYM Exercise dataset...")
    print("   Source: onurSakar/GYM-Exercise")
    csv_path = "data_mining/datasets/gym_exercise.csv"
    try:
        # Pull the dataset and flatten the train split to a DataFrame.
        dataset = load_dataset("onurSakar/GYM-Exercise")
        os.makedirs("data_mining/datasets", exist_ok=True)
        frame = dataset['train'].to_pandas()
        frame.to_csv(csv_path, index=False)
        size_mb = os.path.getsize(csv_path) / (1024 * 1024)
    except Exception as err:
        # Best-effort script boundary: report and signal failure to main().
        print(f"β Download failed: {err}")
        return False
    print(f"β Downloaded: {csv_path}")
    print(f"π Records: {len(frame)}")
    print(f"π File size: {size_mb:.2f} MB")
    return True
def process_fitness():
    """Process Fitness dataset and build ChromaDB.

    Reads the CSV produced by download_fitness(), turns each row into a
    "column: value" text document, embeds it, and stores it in a persistent
    ChromaDB collection under data_mining/output/fitness_chroma.

    Returns:
        bool: True on success, False if the CSV is missing.
    """
    print("\nπ¨ Processing Fitness dataset...")
    csv_path = "data_mining/datasets/gym_exercise.csv"
    if not os.path.exists(csv_path):
        print(f"β Dataset not found: {csv_path}")
        return False
    df = pd.read_csv(csv_path)
    print(f"π Loaded {len(df)} records")
    print("π€ Loading embedding model...")
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')
    print("πΎ Initializing ChromaDB...")
    os.makedirs("data_mining/output", exist_ok=True)
    client = chromadb.PersistentClient(path="data_mining/output/fitness_chroma")
    collection = client.get_or_create_collection(
        name="fitness",
        metadata={"hnsw:space": "cosine"}
    )
    print("π Processing fitness data...")
    # Build all documents first: one "column: value" line per usable cell.
    documents = []
    for _, row in df.iterrows():
        parts = []
        for col in df.columns:
            value = str(row[col])
            # Skip empty / NaN / trivially short cells.
            if value and value != 'nan' and len(value) > 2:
                parts.append(f"{col}: {value}")
        text = "\n".join(parts)
        if len(text) >= 10:  # drop near-empty rows
            documents.append(text)
    # Batch encode + batch add: one model forward pass and one DB call per
    # chunk instead of one per record (the original looped row by row).
    batch_size = 100
    processed = 0
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        embeddings = embedder.encode(batch)
        collection.add(
            ids=[f"fitness_{processed + i:05d}" for i in range(len(batch))],
            embeddings=[emb.tolist() for emb in embeddings],
            documents=batch,
            metadatas=[{
                'domain': 'fitness',
                'agent': 'FitnessAgent',
                'source': 'GYM_Exercise',
                'index': processed + i
            } for i in range(len(batch))]
        )
        processed += len(batch)
        print(f"   Processed {processed}/{len(df)} records...")
    print(f"β Processed {processed} fitness records")
    print(f"πΎ Database saved to: data_mining/output/fitness_chroma/")
    # Report on-disk size of the persisted database.
    db_path = "data_mining/output/fitness_chroma"
    total_size = 0
    for dirpath, _, filenames in os.walk(db_path):
        for filename in filenames:
            total_size += os.path.getsize(os.path.join(dirpath, filename))
    print(f"π Database size: {total_size / (1024 * 1024):.2f} MB")
    return True
def main():
    """Main function - download and process"""
    banner = "=" * 60
    print(banner)
    print("Fitness Dataset - Download & Process")
    print(banner)
    # Short-circuit: processing only runs if the download succeeded.
    if not (download_fitness() and process_fitness()):
        return False
    print("\n" + banner)
    print("β Fitness dataset ready!")
    print(banner)
    return True
if __name__ == "__main__":
    success = main()
    # raise SystemExit instead of exit(): the exit() builtin is injected by
    # the `site` module and is absent under `python -S` and frozen builds.
    raise SystemExit(0 if success else 1)