# gemmaft/utils/training.py
import os
import json
import time
import streamlit as st
from huggingface_hub import HfApi, upload_file
import tempfile
def create_model_repo(repo_name, private=True):
"""
Create a new model repository on Hugging Face Hub
Args:
repo_name (str): Name of the repository
private (bool): Whether the repository should be private
Returns:
tuple: (success (bool), message (str))
"""
try:
token = st.session_state.get("hf_token")
if not token:
return False, "No Hugging Face token found"
username = st.session_state.get("hf_username", "user")
full_repo_name = f"{username}/{repo_name}"
api = HfApi(token=token)
repo_url = api.create_repo(
repo_id=full_repo_name,
private=private,
exist_ok=True
)
return True, repo_url
except Exception as e:
return False, str(e)
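
# Hedged usage sketch (from a Streamlit page that has already stored "hf_token"
# and "hf_username" in st.session_state; the repo name is illustrative):
#
#     ok, result = create_model_repo("gemma-ft-demo", private=True)
#     if ok:
#         st.success(f"Repository ready: {result}")
#     else:
#         st.error(f"Could not create repository: {result}")
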
def upload_training_config(config, repo_name):
"""
Upload a training configuration file to Hugging Face Hub
Args:
config (dict): Training configuration
repo_name (str): Repository to upload to
Returns:
tuple: (success (bool), message (str))
"""
try:
token = st.session_state.get("hf_token")
if not token:
return False, "No Hugging Face token found"
username = st.session_state.get("hf_username", "user")
repo_id = f"{username}/{repo_name}"
        # Write the config to a temporary JSON file, upload it, then remove it
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as tmp:
            json.dump(config, tmp, indent=2)
            tmp_name = tmp.name

        # Upload file to repository
        upload_file(
            path_or_fileobj=tmp_name,
            path_in_repo="training_config.json",
            repo_id=repo_id,
            token=token
        )

        # Clean up temporary file
        os.unlink(tmp_name)
return True, f"Training config uploaded to {repo_id}"
except Exception as e:
return False, str(e)
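
# Illustrative training_config layout (values are placeholders, not defaults used
# anywhere in this app); the keys mirror what the generated train.py below reads
# from training_config.json:
#
#     example_config = {
#         "model_name_or_path": "google/gemma-2b",
#         "dataset_name": "username/my-dataset",
#         "output_dir": "./results",
#         "num_train_epochs": 3,
#         "per_device_train_batch_size": 1,
#         "learning_rate": 2e-5,
#         "weight_decay": 0.01,
#         "save_strategy": "epoch",
#         "evaluation_strategy": "epoch",
#         "fp16": False,
#         "optim": "adamw_torch",
#         "logging_steps": 10,
#         "gradient_accumulation_steps": 4,
#         "max_steps": -1,
#         "warmup_steps": 100,
#         "max_grad_norm": 1.0,
#         "peft_config": {
#             "r": 16,
#             "lora_alpha": 32,
#             "lora_dropout": 0.05,
#             "bias": "none",
#             "task_type": "CAUSAL_LM",
#         },
#     }
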
def setup_training_script(repo_name, config):
"""
Generate and upload a training script to the repository
Args:
repo_name (str): Repository name
config (dict): Training configuration
Returns:
tuple: (success (bool), message (str))
"""
    # Generate a training script that runs the transformers Trainer with LoRA adapters and optional 4-bit quantization
script = """
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from datasets import load_dataset, DatasetDict
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
import json
import os
import torch
from huggingface_hub import login
import bitsandbytes  # imported up front so a missing bitsandbytes install fails early (needed for 4-bit loading)
# Load configuration
with open("training_config.json", "r") as f:
config = json.load(f)
# Login to Hugging Face
login(token=os.environ.get("HF_TOKEN"))
# Load dataset
print("Loading dataset:", config["dataset_name"])
dataset = load_dataset(config["dataset_name"])
# Prepare a train/validation split if the dataset does not already provide one
if "train" in dataset and "validation" not in dataset:
    split = dataset["train"].train_test_split(test_size=0.1)
    dataset = DatasetDict({"train": split["train"], "validation": split["test"]})
elif "train" not in dataset:
    # No train split at all: take the first available split and split it
    print("Available splits:", list(dataset.keys()))
    first_split = list(dataset.keys())[0]
    split = dataset[first_split].train_test_split(test_size=0.1)
    dataset = DatasetDict({"train": split["train"], "validation": split["test"]})
print("Dataset splits:", list(dataset.keys()))
# Print dataset sample
print("Dataset sample:", dataset["train"][0])
# Load tokenizer
print("Loading tokenizer for model:", config["model_name_or_path"])
tokenizer = AutoTokenizer.from_pretrained(config["model_name_or_path"])
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
# Load the model. bitsandbytes 4-bit quantization requires a CUDA GPU,
# so fall back to full precision when only a CPU is available.
print("Loading model...")
if torch.cuda.is_available():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        config["model_name_or_path"],
        quantization_config=bnb_config,
        device_map="auto",
        use_cache=False,  # Required for gradient checkpointing
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        config["model_name_or_path"],
        torch_dtype=torch.float32,
        use_cache=False,  # Required for gradient checkpointing
    )
# Enable gradient checkpointing for memory efficiency
model.gradient_checkpointing_enable()
# Report GPU memory usage before applying PEFT (skipped on CPU)
if torch.cuda.is_available():
    print(f"Model loaded. Memory usage: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
# Prepare model for training with LoRA
print("Setting up LoRA with rank:", config["peft_config"]["r"])
peft_config = LoraConfig(
r=config["peft_config"]["r"],
lora_alpha=config["peft_config"]["lora_alpha"],
lora_dropout=config["peft_config"]["lora_dropout"],
bias=config["peft_config"]["bias"],
task_type=config["peft_config"]["task_type"],
target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)
# Prepare the (possibly quantized) model for k-bit training and attach the LoRA adapters
print("Preparing model for training...")
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
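# With LoRA on the attention and MLP projections above, the trainable parameter
# count is typically well under 1% of the base model for low ranks (e.g. r=8-16);
# print_trainable_parameters() reports the exact figures for the configured rank.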
# Setup training arguments with CPU optimizations
print("Setting up training arguments...")
training_args = TrainingArguments(
output_dir=config["output_dir"],
num_train_epochs=config["num_train_epochs"],
per_device_train_batch_size=config["per_device_train_batch_size"],
per_device_eval_batch_size=max(1, config["per_device_train_batch_size"] // 2),
learning_rate=config["learning_rate"],
weight_decay=config["weight_decay"],
save_strategy=config["save_strategy"],
evaluation_strategy=config["evaluation_strategy"],
fp16=config["fp16"] and torch.cuda.is_available(),
optim=config["optim"],
logging_steps=config["logging_steps"],
gradient_accumulation_steps=config["gradient_accumulation_steps"],
    max_steps=config["max_steps"] if config["max_steps"] > 0 else -1,  # -1 lets num_train_epochs control training length
warmup_steps=config["warmup_steps"],
max_grad_norm=config["max_grad_norm"],
push_to_hub=True,
hub_token=os.environ.get("HF_TOKEN"),
dataloader_num_workers=0, # Lower CPU usage for smaller machines
use_cpu=not torch.cuda.is_available(), # Force CPU if no GPU
lr_scheduler_type="cosine", # Better LR scheduling for small datasets
report_to=["tensorboard"], # Enable tensorboard logging
)
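# Note: the effective batch size per device is per_device_train_batch_size multiplied
# by gradient_accumulation_steps, e.g. a device batch of 1 with 8 accumulation steps
# optimizes on 8 examples per update.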
# Define data collator
print("Setting up data collator...")
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
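# With mlm=False the collator prepares causal-LM labels by copying input_ids
# (padding positions are excluded from the loss).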
# Tokenize dataset
print("Tokenizing dataset...")
def tokenize_function(examples):
# Use a smaller max length on CPU to save memory
max_length = 256 if not torch.cuda.is_available() else 512
return tokenizer(
examples["text"],
padding="max_length",
truncation=True,
max_length=max_length
)
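# Note: padding every example to max_length keeps memory use predictable on small
# machines; dynamic padding via the data collator would be faster but yields
# variable batch shapes.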
# Show progress while tokenizing
print("Mapping tokenization function...")
tokenized_dataset = dataset.map(
tokenize_function,
batched=True,
batch_size=8, # Smaller batch size for CPU
remove_columns=dataset["train"].column_names, # Remove original columns after tokenizing
desc="Tokenizing dataset",
)
# Initialize trainer
print("Initializing trainer...")
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_dataset["train"],
eval_dataset=tokenized_dataset["validation"],
data_collator=data_collator,
)
# Start training
print("Starting training...")
try:
trainer.train()
# Save model and tokenizer
print("Saving model...")
trainer.save_model()
print("Training completed successfully!")
except Exception as e:
print(f"Error during training: {str(e)}")
# Save checkpoint even if error occurred
try:
trainer.save_model("./checkpoint-error")
print("Saved checkpoint before error")
    except Exception:
print("Could not save checkpoint")
"""
try:
token = st.session_state.get("hf_token")
if not token:
return False, "No Hugging Face token found"
username = st.session_state.get("hf_username", "user")
repo_id = f"{username}/{repo_name}"
        # Write the training script to a temporary file, upload it, then remove it
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.py') as tmp:
            tmp.write(script)
            tmp_name = tmp.name

        # Upload file to repository
        upload_file(
            path_or_fileobj=tmp_name,
            path_in_repo="train.py",
            repo_id=repo_id,
            token=token
        )

        # Clean up temporary file
        os.unlink(tmp_name)
        # Also create and upload a requirements file for the training environment
requirements = """
transformers>=4.35.0
peft>=0.7.0
bitsandbytes>=0.40.0
datasets>=2.10.0
torch>=2.0.0
tensorboard>=2.13.0
accelerate>=0.20.0
huggingface_hub>=0.15.0
scipy>=1.10.0
"""
        # Write the requirements to a temporary file, upload it, then remove it
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as tmp:
            tmp.write(requirements)
            tmp_name = tmp.name

        # Upload file to repository
        upload_file(
            path_or_fileobj=tmp_name,
            path_in_repo="requirements.txt",
            repo_id=repo_id,
            token=token
        )

        # Clean up temporary file
        os.unlink(tmp_name)
return True, f"Training script and requirements uploaded to {repo_id}"
except Exception as e:
return False, str(e)
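
# Hedged end-to-end sketch: one way the three helpers above might be chained from
# a Streamlit page. Nothing in this module calls it, and the repo name and config
# are supplied by the caller, so this is illustrative rather than the app's actual flow.
def example_launch_finetune(repo_name, config):
    """Create the repo, then upload the training config and the generated train.py."""
    ok, msg = create_model_repo(repo_name)
    if not ok:
        return False, msg
    ok, msg = upload_training_config(config, repo_name)
    if not ok:
        return False, msg
    return setup_training_script(repo_name, config)
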
def simulate_training_progress():
"""
Simulate training progress for demonstration purposes
"""
if "training_progress" not in st.session_state:
st.session_state.training_progress = {
"started": time.time(),
"current_epoch": 0,
"total_epochs": 3,
"loss": 2.5,
"learning_rate": 2e-5,
"progress": 0.0,
"status": "running"
}
# Update progress based on elapsed time (simulated)
elapsed = time.time() - st.session_state.training_progress["started"]
epoch_duration = 60 # Simulate each epoch taking 60 seconds
# Calculate current progress
total_duration = epoch_duration * st.session_state.training_progress["total_epochs"]
progress = min(elapsed / total_duration, 1.0)
# Calculate current epoch
current_epoch = min(
int(progress * st.session_state.training_progress["total_epochs"]),
st.session_state.training_progress["total_epochs"]
)
# Simulate decreasing loss
loss = max(2.5 - (progress * 2.0), 0.5)
# Update session state
st.session_state.training_progress.update({
"progress": progress,
"current_epoch": current_epoch,
"loss": loss,
"status": "completed" if progress >= 1.0 else "running"
})
return st.session_state.training_progress
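
# Hedged display sketch: how a Streamlit page might render the simulated progress
# returned above (widget choices are illustrative, not part of this module):
#
#     progress = simulate_training_progress()
#     st.progress(progress["progress"])
#     st.metric("Epoch", f"{progress['current_epoch']}/{progress['total_epochs']}")
#     st.metric("Loss", f"{progress['loss']:.3f}")
#     if progress["status"] == "completed":
#         st.success("Training simulation complete")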