"""
Fitness Dataset - Download & Process
Downloads and processes gym exercise data into ChromaDB
Dataset: onurSakar/GYM-Exercise (1.66K exercises)
"""
import os
import sys

import chromadb
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer


def download_fitness():
    """Download the GYM Exercise dataset from HuggingFace and save it as CSV."""
    print("📥 Downloading GYM Exercise dataset...")
    print("   Source: onurSakar/GYM-Exercise")
    try:
        dataset = load_dataset("onurSakar/GYM-Exercise")
        os.makedirs("data_mining/datasets", exist_ok=True)
        df = dataset['train'].to_pandas()
        output_path = "data_mining/datasets/gym_exercise.csv"
        df.to_csv(output_path, index=False)
        file_size = os.path.getsize(output_path) / (1024 * 1024)
        print(f"✅ Downloaded: {output_path}")
        print(f"📊 Records: {len(df)}")
        print(f"📊 File size: {file_size:.2f} MB")
        return True
    except Exception as e:
        print(f"❌ Download failed: {e}")
        return False


def process_fitness():
    """Process the Fitness dataset and build a ChromaDB collection."""
    print("\n🔨 Processing Fitness dataset...")
    csv_path = "data_mining/datasets/gym_exercise.csv"
    if not os.path.exists(csv_path):
        print(f"❌ Dataset not found: {csv_path}")
        return False
    df = pd.read_csv(csv_path)
    print(f"📊 Loaded {len(df)} records")

    print("🤖 Loading embedding model...")
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

    print("💾 Initializing ChromaDB...")
    os.makedirs("data_mining/output", exist_ok=True)
    client = chromadb.PersistentClient(path="data_mining/output/fitness_chroma")
    collection = client.get_or_create_collection(
        name="fitness",
        metadata={"hnsw:space": "cosine"}  # use cosine distance for the HNSW index
    )

    print("📝 Processing fitness data...")
    processed = 0
    for idx, row in df.iterrows():
        # Flatten each row into "column: value" lines, skipping empty/NaN cells.
        text_parts = []
        for col in df.columns:
            value = str(row[col])
            if value and value != 'nan' and len(value) > 2:
                text_parts.append(f"{col}: {value}")
        text = "\n".join(text_parts)
        if len(text) < 10:  # skip rows with no usable content
            continue
        # Embed the flattened row and store it alongside its metadata.
        embedding = embedder.encode(text)
        collection.add(
            ids=[f"fitness_{processed:05d}"],
            embeddings=[embedding.tolist()],
            documents=[text],
            metadatas=[{
                'domain': 'fitness',
                'agent': 'FitnessAgent',
                'source': 'GYM_Exercise',
                'index': processed
            }]
        )
        processed += 1
        if (processed % 100) == 0:
            print(f"   Processed {processed}/{len(df)} records...")
    print(f"✅ Processed {processed} fitness records")

    # Report the on-disk size of the persisted database.
    db_path = "data_mining/output/fitness_chroma"
    print(f"💾 Database saved to: {db_path}/")
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(db_path):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            total_size += os.path.getsize(filepath)
    print(f"📊 Database size: {total_size / (1024 * 1024):.2f} MB")
    return True
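

# A minimal retrieval sketch, not part of the original pipeline: it illustrates how
# the collection built above could be queried. The helper name `query_fitness` and
# the k=3 default are assumptions; the question must be embedded with the same
# vietnamese-sbert model used at build time so the cosine distances are comparable.
def query_fitness(question, k=3):
    """Return the k most similar fitness documents for a free-text question."""
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')
    client = chromadb.PersistentClient(path="data_mining/output/fitness_chroma")
    collection = client.get_collection(name="fitness")
    query_embedding = embedder.encode(question).tolist()
    return collection.query(query_embeddings=[query_embedding], n_results=k)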


def main():
    """Main function: download, then process."""
    print("=" * 60)
    print("Fitness Dataset - Download & Process")
    print("=" * 60)
    if not download_fitness():
        return False
    if not process_fitness():
        return False
    print("\n" + "=" * 60)
    print("✅ Fitness dataset ready!")
    print("=" * 60)
    return True


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)
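
# Example invocation (an assumption about the project layout: the relative
# data_mining/ paths above resolve only when run from the repository root):
#   python data_mining/mining_fitness.py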