import os
import json
import time
import tempfile

import pandas as pd
import streamlit as st
from huggingface_hub import HfApi, create_repo, upload_file


def create_model_repo(repo_name, private=True):
    """
    Create a new model repository on the Hugging Face Hub.

    Args:
        repo_name (str): Name of the repository
        private (bool): Whether the repository should be private

    Returns:
        tuple: (success (bool), message (str))
    """
    try:
        token = st.session_state.get("hf_token")
        if not token:
            return False, "No Hugging Face token found"

        username = st.session_state.get("hf_username", "user")
        full_repo_name = f"{username}/{repo_name}"

        api = HfApi(token=token)
        repo_url = api.create_repo(
            repo_id=full_repo_name,
            private=private,
            exist_ok=True,
        )
        return True, repo_url
    except Exception as e:
        return False, str(e)


def upload_training_config(config, repo_name):
    """
    Upload a training configuration file to the Hugging Face Hub.

    Args:
        config (dict): Training configuration
        repo_name (str): Repository to upload to

    Returns:
        tuple: (success (bool), message (str))
    """
    try:
        token = st.session_state.get("hf_token")
        if not token:
            return False, "No Hugging Face token found"

        username = st.session_state.get("hf_username", "user")
        repo_id = f"{username}/{repo_name}"

        # Write the config to a temporary file, then upload it
        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as tmp:
            json.dump(config, tmp, indent=2)
            tmp_name = tmp.name

        upload_file(
            path_or_fileobj=tmp_name,
            path_in_repo="training_config.json",
            repo_id=repo_id,
            token=token,
        )

        # Clean up the temporary file
        os.unlink(tmp_name)

        return True, f"Training config uploaded to {repo_id}"
    except Exception as e:
        return False, str(e)


def setup_training_script(repo_name, config):
    """
    Generate and upload a training script to the repository.

    Args:
        repo_name (str): Repository name
        config (dict): Training configuration

    Returns:
        tuple: (success (bool), message (str))
    """
    # Training script using the transformers Trainer with LoRA; 4-bit
    # quantization is applied only when a GPU is available, since
    # bitsandbytes quantization requires CUDA
    script = """
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset, DatasetDict
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import json
import os
import torch
from huggingface_hub import login

# Load configuration
with open("training_config.json", "r") as f:
    config = json.load(f)

# Login to Hugging Face
login(token=os.environ.get("HF_TOKEN"))

# Load dataset
print("Loading dataset:", config["dataset_name"])
dataset = load_dataset(config["dataset_name"])

# Fall back to the first available split if there is no "train" split
if "train" not in dataset:
    print("Available splits:", list(dataset.keys()))
    first_split = list(dataset.keys())[0]
    dataset = DatasetDict({"train": dataset[first_split]})

# Create a validation split if one does not already exist; train_test_split()
# names its held-out split "test", so rename it to "validation"
if "validation" not in dataset:
    split = dataset["train"].train_test_split(test_size=0.1)
    dataset = DatasetDict({"train": split["train"], "validation": split["test"]})

print("Dataset splits:", list(dataset.keys()))

# Print a dataset sample
print("Dataset sample:", dataset["train"][0])

# Load tokenizer
print("Loading tokenizer for model:", config["model_name_or_path"])
tokenizer = AutoTokenizer.from_pretrained(config["model_name_or_path"])
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
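
# NOTE (sketch): tokenize_function below assumes the dataset exposes a "text"
# column; that column name is an assumption, not something the config
# guarantees. Failing fast here gives a clearer error than a KeyError raised
# deep inside dataset.map().
if "text" not in dataset["train"].column_names:
    raise ValueError(
        f'Expected a "text" column, found: {dataset["train"].column_names}'
    )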
# Load the model, with 4-bit quantization when a GPU is available
print("Loading model...")
if torch.cuda.is_available():
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        config["model_name_or_path"],
        quantization_config=quantization_config,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        use_cache=False,  # Required for gradient checkpointing
    )
    # Print GPU memory usage before applying PEFT
    print(f"Model loaded. Memory usage: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
    # Prepare the quantized model for k-bit (LoRA) training
    model = prepare_model_for_kbit_training(model)
else:
    model = AutoModelForCausalLM.from_pretrained(
        config["model_name_or_path"],
        torch_dtype=torch.float32,
        use_cache=False,  # Required for gradient checkpointing
    )

# Enable gradient checkpointing for memory efficiency
model.gradient_checkpointing_enable()

# Set up LoRA
print("Setting up LoRA with rank:", config["peft_config"]["r"])
peft_config = LoraConfig(
    r=config["peft_config"]["r"],
    lora_alpha=config["peft_config"]["lora_alpha"],
    lora_dropout=config["peft_config"]["lora_dropout"],
    bias=config["peft_config"]["bias"],
    task_type=config["peft_config"]["task_type"],
    target_modules=[
        "q_proj", "v_proj", "k_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

print("Preparing model for training...")
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Set up training arguments with CPU-friendly defaults
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir=config["output_dir"],
    num_train_epochs=config["num_train_epochs"],
    per_device_train_batch_size=config["per_device_train_batch_size"],
    per_device_eval_batch_size=max(1, config["per_device_train_batch_size"] // 2),
    learning_rate=config["learning_rate"],
    weight_decay=config["weight_decay"],
    save_strategy=config["save_strategy"],
    evaluation_strategy=config["evaluation_strategy"],
    fp16=config["fp16"] and torch.cuda.is_available(),
    optim=config["optim"],
    logging_steps=config["logging_steps"],
    gradient_accumulation_steps=config["gradient_accumulation_steps"],
    max_steps=config["max_steps"] if config["max_steps"] > 0 else -1,  # -1 disables the step limit
    warmup_steps=config["warmup_steps"],
    max_grad_norm=config["max_grad_norm"],
    push_to_hub=True,
    hub_token=os.environ.get("HF_TOKEN"),
    dataloader_num_workers=0,  # Lower CPU usage on smaller machines
    use_cpu=not torch.cuda.is_available(),  # Force CPU if no GPU
    lr_scheduler_type="cosine",  # Better LR scheduling for small datasets
    report_to=["tensorboard"],  # Enable TensorBoard logging
)

# Define the data collator
print("Setting up data collator...")
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Tokenize the dataset
print("Tokenizing dataset...")

def tokenize_function(examples):
    # Use a smaller max length on CPU to save memory
    max_length = 256 if not torch.cuda.is_available() else 512
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=8,  # Smaller batch size for CPU
    remove_columns=dataset["train"].column_names,  # Drop the original columns after tokenizing
    desc="Tokenizing dataset",
)

# Initialize the trainer
print("Initializing trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
)
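
# (Sketch) Rough optimizer-step estimate so long CPU runs are predictable.
# This is an illustrative calculation from values already in training_args;
# it ignores drop_last and distributed-training details.
steps_per_epoch = max(
    1,
    len(tokenized_dataset["train"])
    // (training_args.per_device_train_batch_size
        * training_args.gradient_accumulation_steps),
)
print(f"Estimated optimizer steps per epoch: {steps_per_epoch}")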
# Start training
print("Starting training...")
try:
    trainer.train()

    # Save the model and tokenizer
    print("Saving model...")
    trainer.save_model()
    print("Training completed successfully!")
except Exception as e:
    print(f"Error during training: {str(e)}")
    # Save a checkpoint even if an error occurred
    try:
        trainer.save_model("./checkpoint-error")
        print("Saved checkpoint before error")
    except Exception:
        print("Could not save checkpoint")
"""

    try:
        token = st.session_state.get("hf_token")
        if not token:
            return False, "No Hugging Face token found"

        username = st.session_state.get("hf_username", "user")
        repo_id = f"{username}/{repo_name}"

        # Write the training script to a temporary file, then upload it
        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".py") as tmp:
            tmp.write(script)
            script_path = tmp.name

        upload_file(
            path_or_fileobj=script_path,
            path_in_repo="train.py",
            repo_id=repo_id,
            token=token,
        )

        # Clean up the temporary file
        os.unlink(script_path)

        # Also create and upload a CPU-friendly requirements file
        requirements = """
transformers>=4.35.0
peft>=0.7.0
bitsandbytes>=0.40.0
datasets>=2.10.0
torch>=2.0.0
tensorboard>=2.13.0
accelerate>=0.20.0
huggingface_hub>=0.15.0
scipy>=1.10.0
"""
        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as tmp:
            tmp.write(requirements)
            requirements_path = tmp.name

        upload_file(
            path_or_fileobj=requirements_path,
            path_in_repo="requirements.txt",
            repo_id=repo_id,
            token=token,
        )

        # Clean up the temporary file
        os.unlink(requirements_path)

        return True, f"Training script and requirements uploaded to {repo_id}"
    except Exception as e:
        return False, str(e)


def simulate_training_progress():
    """
    Simulate training progress for demonstration purposes.
    """
    if "training_progress" not in st.session_state:
        st.session_state.training_progress = {
            "started": time.time(),
            "current_epoch": 0,
            "total_epochs": 3,
            "loss": 2.5,
            "learning_rate": 2e-5,
            "progress": 0.0,
            "status": "running",
        }

    # Update progress based on (simulated) elapsed time
    elapsed = time.time() - st.session_state.training_progress["started"]
    epoch_duration = 60  # Simulate each epoch taking 60 seconds

    # Calculate overall progress
    total_duration = epoch_duration * st.session_state.training_progress["total_epochs"]
    progress = min(elapsed / total_duration, 1.0)

    # Calculate the current epoch
    current_epoch = min(
        int(progress * st.session_state.training_progress["total_epochs"]),
        st.session_state.training_progress["total_epochs"],
    )

    # Simulate a decreasing loss
    loss = max(2.5 - (progress * 2.0), 0.5)

    # Update session state
    st.session_state.training_progress.update({
        "progress": progress,
        "current_epoch": current_epoch,
        "loss": loss,
        "status": "completed" if progress >= 1.0 else "running",
    })

    return st.session_state.training_progress
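

# (Sketch) A minimal example of the config shape that upload_training_config()
# and the generated train.py expect. Every key below is read somewhere in the
# generated script; the concrete values are illustrative assumptions only.
EXAMPLE_CONFIG = {
    # LLaMA-family checkpoint; the script's LoRA target_modules assume
    # LLaMA-style projection names (q_proj, v_proj, ...)
    "model_name_or_path": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "dataset_name": "imdb",  # must expose a "text" column
    "output_dir": "./results",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 4,
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "save_strategy": "epoch",
    "evaluation_strategy": "epoch",
    "fp16": False,
    "optim": "adamw_torch",
    "logging_steps": 10,
    "gradient_accumulation_steps": 4,
    "max_steps": -1,  # <= 0 means no step limit
    "warmup_steps": 100,
    "max_grad_norm": 1.0,
    "peft_config": {
        "r": 8,
        "lora_alpha": 16,
        "lora_dropout": 0.05,
        "bias": "none",
        "task_type": "CAUSAL_LM",
    },
}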