import os
import json
import time
import tempfile

import pandas as pd
import streamlit as st
from huggingface_hub import HfApi, create_repo, upload_file
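
# The helpers below read the Hugging Face access token and username from
# Streamlit session state. How those values get there is outside this section;
# a minimal sketch (the env-var fallback is an assumption, only the session
# state keys match the lookups below) could be:
#
#   st.session_state.setdefault("hf_token", os.environ.get("HF_TOKEN", ""))
#   st.session_state.setdefault("hf_username", os.environ.get("HF_USERNAME", "user"))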

def create_model_repo(repo_name, private=True):
    """
    Create a new model repository on Hugging Face Hub.

    Args:
        repo_name (str): Name of the repository
        private (bool): Whether the repository should be private

    Returns:
        tuple: (success (bool), message (str))
    """
    try:
        token = st.session_state.get("hf_token")
        if not token:
            return False, "No Hugging Face token found"

        username = st.session_state.get("hf_username", "user")
        full_repo_name = f"{username}/{repo_name}"

        api = HfApi(token=token)
        repo_url = api.create_repo(
            repo_id=full_repo_name,
            private=private,
            exist_ok=True
        )
        return True, repo_url
    except Exception as e:
        return False, str(e)
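
# Usage sketch (the repository name is illustrative; requires a valid token in
# st.session_state["hf_token"]):
#
#   ok, result = create_model_repo("llama-lora-demo", private=True)
#   if ok:
#       st.success(f"Created repository: {result}")
#   else:
#       st.error(f"Repository creation failed: {result}")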

def upload_training_config(config, repo_name):
    """
    Upload a training configuration file to Hugging Face Hub.

    Args:
        config (dict): Training configuration
        repo_name (str): Repository to upload to

    Returns:
        tuple: (success (bool), message (str))
    """
    try:
        token = st.session_state.get("hf_token")
        if not token:
            return False, "No Hugging Face token found"

        username = st.session_state.get("hf_username", "user")
        repo_id = f"{username}/{repo_name}"

        # Write the config to a temporary JSON file
        with tempfile.NamedTemporaryFile("w", delete=False, suffix=".json") as tmp:
            json.dump(config, tmp, indent=2)
            tmp_name = tmp.name

        # Upload file to repository
        upload_file(
            path_or_fileobj=tmp_name,
            path_in_repo="training_config.json",
            repo_id=repo_id,
            token=token
        )

        # Clean up temporary file
        os.unlink(tmp_name)
        return True, f"Training config uploaded to {repo_id}"
    except Exception as e:
        return False, str(e)
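
# Illustrative config for upload_training_config and the generated train.py
# below. The key names are the ones the script reads; every value here is an
# example default chosen for a small CPU Space, not something prescribed by
# this app, and the model/dataset ids are only placeholders.
EXAMPLE_TRAINING_CONFIG = {
    "model_name_or_path": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # example model
    "dataset_name": "imdb",                                      # example dataset with a "text" column
    "output_dir": "./results",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 1,
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "save_strategy": "epoch",
    "evaluation_strategy": "epoch",
    "fp16": False,
    "optim": "adamw_torch",
    "logging_steps": 10,
    "gradient_accumulation_steps": 8,
    "max_steps": -1,          # -1 means "train for num_train_epochs"
    "warmup_steps": 50,
    "max_grad_norm": 1.0,
    "peft_config": {
        "r": 8,
        "lora_alpha": 16,
        "lora_dropout": 0.05,
        "bias": "none",
        "task_type": "CAUSAL_LM",
    },
}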

def setup_training_script(repo_name, config):
    """
    Generate and upload a training script to the repository.

    Args:
        repo_name (str): Repository name
        config (dict): Training configuration

    Returns:
        tuple: (success (bool), message (str))
    """
    # Create a training script that uses the transformers Trainer with LoRA and
    # 4-bit quantization (when a GPU is available) so it can also run on small
    # CPU-only machines
    script = """
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import json
import os
import torch
from huggingface_hub import login
# Note: bitsandbytes (see requirements.txt) must be installed for 4-bit loading,
# but it is used through BitsAndBytesConfig rather than imported directly.

# Load configuration
with open("training_config.json", "r") as f:
    config = json.load(f)

# Login to Hugging Face
login(token=os.environ.get("HF_TOKEN"))

# Load dataset
print("Loading dataset:", config["dataset_name"])
dataset = load_dataset(config["dataset_name"])

# Prepare a train/validation split if the dataset is not already split.
# Note: Dataset.train_test_split() names the held-out split "test".
if "train" in dataset and "validation" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.1)
elif "train" not in dataset:
    # No train split at all: report what is available, then split the first one
    print("Available splits:", list(dataset.keys()))
    first_split = list(dataset.keys())[0]
    dataset = dataset[first_split].train_test_split(test_size=0.1)

print("Dataset splits:", list(dataset.keys()))
# Print dataset sample
print("Dataset sample:", dataset["train"][0])

# Load tokenizer
print("Loading tokenizer for model:", config["model_name_or_path"])
tokenizer = AutoTokenizer.from_pretrained(config["model_name_or_path"])
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model, using 4-bit quantization when a GPU is available
# (bitsandbytes 4-bit loading requires CUDA; on CPU we fall back to full precision)
print("Loading model...")
quant_config = None
if torch.cuda.is_available():
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
model = AutoModelForCausalLM.from_pretrained(
    config["model_name_or_path"],
    device_map="auto",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    use_cache=False,  # Required for gradient checkpointing
)

# Enable gradient checkpointing for memory efficiency
model.gradient_checkpointing_enable()

# Print memory usage before PEFT (CUDA memory stats only exist when a GPU is present)
if torch.cuda.is_available():
    print(f"Model loaded. Memory usage: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")

# Prepare model for training with LoRA
print("Setting up LoRA with rank:", config["peft_config"]["r"])
peft_config = LoraConfig(
    r=config["peft_config"]["r"],
    lora_alpha=config["peft_config"]["lora_alpha"],
    lora_dropout=config["peft_config"]["lora_dropout"],
    bias=config["peft_config"]["bias"],
    task_type=config["peft_config"]["task_type"],
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

# Prepare the (possibly quantized) model for k-bit training and attach the LoRA adapters
print("Preparing model for training...")
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Setup training arguments with CPU optimizations
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir=config["output_dir"],
    num_train_epochs=config["num_train_epochs"],
    per_device_train_batch_size=config["per_device_train_batch_size"],
    per_device_eval_batch_size=max(1, config["per_device_train_batch_size"] // 2),
    learning_rate=config["learning_rate"],
    weight_decay=config["weight_decay"],
    save_strategy=config["save_strategy"],
    evaluation_strategy=config["evaluation_strategy"],
    fp16=config["fp16"] and torch.cuda.is_available(),
    optim=config["optim"],
    logging_steps=config["logging_steps"],
    gradient_accumulation_steps=config["gradient_accumulation_steps"],
    max_steps=config["max_steps"] if config["max_steps"] > 0 else -1,  # -1 means "train for num_train_epochs"
    warmup_steps=config["warmup_steps"],
    max_grad_norm=config["max_grad_norm"],
    push_to_hub=True,
    hub_token=os.environ.get("HF_TOKEN"),
    dataloader_num_workers=0,                # Lower CPU usage for smaller machines
    use_cpu=not torch.cuda.is_available(),   # Force CPU if no GPU
    lr_scheduler_type="cosine",              # Better LR scheduling for small datasets
    report_to=["tensorboard"],               # Enable tensorboard logging
)

# Define data collator
print("Setting up data collator...")
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Tokenize dataset
print("Tokenizing dataset...")
def tokenize_function(examples):
    # Use a smaller max length on CPU to save memory
    max_length = 256 if not torch.cuda.is_available() else 512
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length
    )

# Show progress while tokenizing
print("Mapping tokenization function...")
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=8,  # Smaller batch size for CPU
    remove_columns=dataset["train"].column_names,  # Remove original columns after tokenizing
    desc="Tokenizing dataset",
)

# Initialize trainer (train_test_split names its held-out split "test")
print("Initializing trainer...")
eval_split = "validation" if "validation" in tokenized_dataset else "test"
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset[eval_split],
    data_collator=data_collator,
)

# Start training
print("Starting training...")
try:
    trainer.train()
    # Save model and tokenizer
    print("Saving model...")
    trainer.save_model()
    print("Training completed successfully!")
except Exception as e:
    print(f"Error during training: {str(e)}")
    # Save checkpoint even if error occurred
    try:
        trainer.save_model("./checkpoint-error")
        print("Saved checkpoint before error")
    except Exception:
        print("Could not save checkpoint")
"""

    try:
        token = st.session_state.get("hf_token")
        if not token:
            return False, "No Hugging Face token found"

        username = st.session_state.get("hf_username", "user")
        repo_id = f"{username}/{repo_name}"

        # Write the training script to a temporary file
        with tempfile.NamedTemporaryFile("w", delete=False, suffix=".py") as tmp:
            tmp.write(script)
            tmp_name = tmp.name

        # Upload file to repository
        upload_file(
            path_or_fileobj=tmp_name,
            path_in_repo="train.py",
            repo_id=repo_id,
            token=token
        )

        # Clean up temporary file
        os.unlink(tmp_name)

        # Also create and upload a CPU-optimized requirements file
        requirements = """
transformers>=4.35.0
peft>=0.7.0
bitsandbytes>=0.40.0
datasets>=2.10.0
torch>=2.0.0
tensorboard>=2.13.0
accelerate>=0.20.0
huggingface_hub>=0.15.0
scipy>=1.10.0
"""
        with tempfile.NamedTemporaryFile("w", delete=False, suffix=".txt") as tmp:
            tmp.write(requirements)
            tmp_name = tmp.name

        # Upload file to repository
        upload_file(
            path_or_fileobj=tmp_name,
            path_in_repo="requirements.txt",
            repo_id=repo_id,
            token=token
        )

        # Clean up temporary file
        os.unlink(tmp_name)
        return True, f"Training script and requirements uploaded to {repo_id}"
    except Exception as e:
        return False, str(e)
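
# End-to-end usage sketch (the repository name and EXAMPLE_TRAINING_CONFIG are
# illustrative; all three helpers report failures as (False, message) rather
# than raising):
#
#   ok, msg = create_model_repo("llama-lora-demo")
#   if ok:
#       ok, msg = upload_training_config(EXAMPLE_TRAINING_CONFIG, "llama-lora-demo")
#   if ok:
#       ok, msg = setup_training_script("llama-lora-demo", EXAMPLE_TRAINING_CONFIG)
#   st.write(msg)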

def simulate_training_progress():
    """
    Simulate training progress for demonstration purposes.
    """
    if "training_progress" not in st.session_state:
        st.session_state.training_progress = {
            "started": time.time(),
            "current_epoch": 0,
            "total_epochs": 3,
            "loss": 2.5,
            "learning_rate": 2e-5,
            "progress": 0.0,
            "status": "running"
        }

    # Update progress based on elapsed time (simulated)
    elapsed = time.time() - st.session_state.training_progress["started"]
    epoch_duration = 60  # Simulate each epoch taking 60 seconds

    # Calculate current progress
    total_duration = epoch_duration * st.session_state.training_progress["total_epochs"]
    progress = min(elapsed / total_duration, 1.0)

    # Calculate current epoch
    current_epoch = min(
        int(progress * st.session_state.training_progress["total_epochs"]),
        st.session_state.training_progress["total_epochs"]
    )

    # Simulate decreasing loss
    loss = max(2.5 - (progress * 2.0), 0.5)

    # Update session state
    st.session_state.training_progress.update({
        "progress": progress,
        "current_epoch": current_epoch,
        "loss": loss,
        "status": "completed" if progress >= 1.0 else "running"
    })

    return st.session_state.training_progress
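
# Rendering sketch for a Streamlit page (the widget layout is illustrative and
# not part of this module; st.rerun requires a recent Streamlit release):
#
#   progress = simulate_training_progress()
#   st.progress(progress["progress"])
#   st.metric("Epoch", f'{progress["current_epoch"]}/{progress["total_epochs"]}')
#   st.metric("Loss", f'{progress["loss"]:.2f}')
#   if progress["status"] == "running":
#       time.sleep(1)
#       st.rerun()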