import os
import tempfile

import pandas as pd
import streamlit as st
from huggingface_hub import HfApi, upload_file


def create_dataset_repo(repo_name, private=True):
    """
    Create a new dataset repository on the Hugging Face Hub.

    Args:
        repo_name (str): Name of the repository
        private (bool): Whether the repository should be private

    Returns:
        tuple: (success (bool), repository URL or error message (str))
    """
    try:
        token = st.session_state.get("hf_token")
        if not token:
            return False, "No Hugging Face token found"

        username = st.session_state.get("hf_username", "user")
        full_repo_name = f"{username}/{repo_name}"

        api = HfApi(token=token)
        repo_url = api.create_repo(
            repo_id=full_repo_name,
            repo_type="dataset",
            private=private,
            exist_ok=True,
        )

        return True, repo_url
    except Exception as e:
        return False, str(e)


def upload_dataset_to_hub(file_data, file_name, repo_name):
    """
    Upload a dataset file to the Hugging Face Hub.

    Args:
        file_data (bytes or DataFrame): File content as bytes or a pandas DataFrame
        file_name (str): Name to save the file as in the repository
        repo_name (str): Repository to upload to

    Returns:
        tuple: (success (bool), message (str))
    """
    try:
        token = st.session_state.get("hf_token")
        if not token:
            return False, "No Hugging Face token found"

        username = st.session_state.get("hf_username", "user")
        repo_id = f"{username}/{repo_name}"

        # Write the data to a temporary file so it can be uploaded from disk
        suffix = os.path.splitext(file_name)[1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_name = tmp.name
            if isinstance(file_data, pd.DataFrame):
                # DataFrames are saved as JSON Lines
                file_data.to_json(tmp_name, orient="records", lines=True)
            else:
                # Otherwise, assume raw bytes
                tmp.write(file_data)

        # Upload after the file is closed so all bytes are flushed to disk
        try:
            upload_file(
                path_or_fileobj=tmp_name,
                path_in_repo=file_name,
                repo_id=repo_id,
                token=token,
                repo_type="dataset",
            )
        finally:
            # Clean up the temporary file
            os.unlink(tmp_name)

        return True, f"File uploaded to {repo_id}"
    except Exception as e:
        return False, str(e)


def prepare_training_config(model_name, hyperparams, dataset_repo, output_repo):
    """
    Prepare a training configuration for Gemma fine-tuning.

    Args:
        model_name (str): Model identifier
        hyperparams (dict): Training hyperparameters
        dataset_repo (str): Dataset repository name
        output_repo (str): Output repository name

    Returns:
        dict: Training configuration
    """
    username = st.session_state.get("hf_username", "user")

    config = {
        "model_name_or_path": model_name,
        "dataset_name": f"{username}/{dataset_repo}",
        "output_dir": f"{username}/{output_repo}",
        "num_train_epochs": hyperparams.get("epochs", 3),
        "per_device_train_batch_size": hyperparams.get("batch_size", 8),
        "learning_rate": hyperparams.get("learning_rate", 2e-5),
        "weight_decay": hyperparams.get("weight_decay", 0.01),
        "save_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "fp16": hyperparams.get("fp16", False),
        "peft_config": {
            "r": hyperparams.get("lora_rank", 8),
            "lora_alpha": hyperparams.get("lora_alpha", 32),
            "lora_dropout": hyperparams.get("lora_dropout", 0.05),
            "bias": "none",
            "task_type": "CAUSAL_LM",
        },
        "optim": "adamw_torch",
        "logging_steps": 50,
        "gradient_accumulation_steps": hyperparams.get("gradient_accumulation", 1),
        "max_steps": hyperparams.get("max_steps", -1),
        "warmup_steps": hyperparams.get("warmup_steps", 0),
        "max_grad_norm": hyperparams.get("max_grad_norm", 1.0),
    }

    return config
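

# Illustrative sketch (not invoked by the app): shows how the plain dict returned by
# prepare_training_config() could be mapped onto `peft.LoraConfig` and
# `transformers.TrainingArguments` for an actual training run. The helper name
# `build_training_objects` and the local imports are assumptions, not part of this
# module's public API.
def build_training_objects(config):
    from peft import LoraConfig
    from transformers import TrainingArguments

    # LoRA settings come straight from the nested "peft_config" section
    lora_config = LoraConfig(**config["peft_config"])

    # The remaining keys map one-to-one onto TrainingArguments fields
    training_args = TrainingArguments(
        output_dir=config["output_dir"],
        num_train_epochs=config["num_train_epochs"],
        per_device_train_batch_size=config["per_device_train_batch_size"],
        learning_rate=config["learning_rate"],
        weight_decay=config["weight_decay"],
        save_strategy=config["save_strategy"],
        # Note: newer transformers releases rename this parameter to `eval_strategy`
        evaluation_strategy=config["evaluation_strategy"],
        fp16=config["fp16"],
        optim=config["optim"],
        logging_steps=config["logging_steps"],
        gradient_accumulation_steps=config["gradient_accumulation_steps"],
        max_steps=config["max_steps"],
        warmup_steps=config["warmup_steps"],
        max_grad_norm=config["max_grad_norm"],
    )
    return lora_config, training_args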


def preprocess_dataset(df, prompt_column, response_column, model_name="google/gemma-2b"):
    """
    Preprocess a dataset for Gemma fine-tuning.

    Args:
        df (DataFrame): Dataset
        prompt_column (str): Column containing prompts/instructions
        response_column (str): Column containing responses
        model_name (str): Model identifier for the tokenizer (currently unused)

    Returns:
        DataFrame: Processed dataset with a single "text" column
    """
    # Check that the required columns exist
    if prompt_column not in df.columns or response_column not in df.columns:
        raise ValueError(
            f"Columns {prompt_column} and/or {response_column} not found in dataset"
        )

    # Simple instruction-tuning format using Gemma's chat turn markers
    df["text"] = df.apply(
        lambda row: (
            f"<start_of_turn>user\n{row[prompt_column]}<end_of_turn>\n"
            f"<start_of_turn>model\n{row[response_column]}<end_of_turn>"
        ),
        axis=1,
    )

    # Return only the processed text column
    return df[["text"]]
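

# Minimal local smoke test for preprocess_dataset (a sketch; the sample "prompt" and
# "response" column names are assumptions for illustration, not the app's fixed schema).
if __name__ == "__main__":
    sample = pd.DataFrame(
        {
            "prompt": ["What is the capital of France?"],
            "response": ["The capital of France is Paris."],
        }
    )
    processed = preprocess_dataset(sample, "prompt", "response")
    print(processed["text"].iloc[0])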