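"""Hugging Face Hub helpers for a Streamlit fine-tuning app.

Utilities to create a model repository on the Hub, upload a training
configuration and a generated LoRA training script, and simulate
training progress for the UI.
"""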
import os
import json
import time
import tempfile

import streamlit as st
from huggingface_hub import HfApi, upload_file

def create_model_repo(repo_name, private=True):
    """
    Create a new model repository on Hugging Face Hub
    
    Args:
        repo_name (str): Name of the repository
        private (bool): Whether the repository should be private
        
    Returns:
        tuple: (success (bool), message (str))
    """
    try:
        token = st.session_state.get("hf_token")
        if not token:
            return False, "No Hugging Face token found"
        
        username = st.session_state.get("hf_username", "user")
        full_repo_name = f"{username}/{repo_name}"
        
        api = HfApi(token=token)
        repo_url = api.create_repo(
            repo_id=full_repo_name,
            private=private,
            exist_ok=True
        )
        
        return True, str(repo_url)
    except Exception as e:
        return False, str(e)
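
# Usage sketch (illustrative): how a Streamlit page might call this helper.
# The button label and repository name here are assumptions, not part of
# this module.
#
#   if st.button("Create repository"):
#       ok, result = create_model_repo("my-finetuned-model", private=True)
#       if ok:
#           st.success(f"Repository ready: {result}")
#       else:
#           st.error(f"Repository creation failed: {result}")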

def upload_training_config(config, repo_name):
    """
    Upload a training configuration file to Hugging Face Hub
    
    Args:
        config (dict): Training configuration
        repo_name (str): Repository to upload to
        
    Returns:
        tuple: (success (bool), message (str))
    """
    try:
        token = st.session_state.get("hf_token")
        if not token:
            return False, "No Hugging Face token found"
            
        username = st.session_state.get("hf_username", "user")
        repo_id = f"{username}/{repo_name}"
        
        # Write the config to a temporary file, upload it, then clean up.
        # Writing through the NamedTemporaryFile handle avoids reopening an
        # open temp file, which fails on some platforms (e.g. Windows).
        with tempfile.NamedTemporaryFile('w', delete=False, suffix='.json') as tmp:
            json.dump(config, tmp, indent=2)
            tmp_name = tmp.name

        try:
            # Upload file to repository
            upload_file(
                path_or_fileobj=tmp_name,
                path_in_repo="training_config.json",
                repo_id=repo_id,
                token=token
            )
        finally:
            # Clean up temporary file
            os.unlink(tmp_name)

        return True, f"Training config uploaded to {repo_id}"
    except Exception as e:
        return False, str(e)
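
# Illustrative config (sketch): these are the keys that the generated
# train.py (see setup_training_script below) actually reads; the values are
# placeholder examples, not tuned recommendations.
#
#   example_config = {
#       "model_name_or_path": "distilgpt2",
#       "dataset_name": "imdb",
#       "output_dir": "./results",
#       "num_train_epochs": 3,
#       "per_device_train_batch_size": 4,
#       "learning_rate": 2e-5,
#       "weight_decay": 0.01,
#       "save_strategy": "epoch",
#       "evaluation_strategy": "epoch",
#       "fp16": False,
#       "optim": "adamw_torch",
#       "logging_steps": 10,
#       "gradient_accumulation_steps": 4,
#       "max_steps": -1,
#       "warmup_steps": 100,
#       "max_grad_norm": 1.0,
#       "peft_config": {
#           "r": 8,
#           "lora_alpha": 16,
#           "lora_dropout": 0.05,
#           "bias": "none",
#           "task_type": "CAUSAL_LM",
#       },
#   }
#   upload_training_config(example_config, "my-finetuned-model")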

def setup_training_script(repo_name, config):
    """
    Generate and upload a training script to the repository
    
    Args:
        repo_name (str): Repository name
        config (dict): Training configuration
        
    Returns:
        tuple: (success (bool), message (str))
    """
    # Create a training script using transformers Trainer, with 4-bit quantization when a GPU is available
    script = """
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import json
import os
import torch
from huggingface_hub import login

# Load configuration
with open("training_config.json", "r") as f:
    config = json.load(f)

# Login to Hugging Face
login(token=os.environ.get("HF_TOKEN"))

# Load dataset
print("Loading dataset:", config["dataset_name"])
dataset = load_dataset(config["dataset_name"])

# Prepare a train/validation split if the dataset does not already have one.
# Note: train_test_split names the held-out split "test".
if "train" in dataset and "validation" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.1)
elif "train" not in dataset:
    # No train split at all: report what is available, then take the first
    # split and divide it
    print("Available splits:", list(dataset.keys()))
    first_split = list(dataset.keys())[0]
    dataset = dataset[first_split].train_test_split(test_size=0.1)

print("Dataset splits:", list(dataset.keys()))

# Print dataset sample
print("Dataset sample:", dataset["train"][0])

# Load tokenizer
print("Loading tokenizer for model:", config["model_name_or_path"])
tokenizer = AutoTokenizer.from_pretrained(config["model_name_or_path"])
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model, with 4-bit quantization when a GPU is available
# (bitsandbytes quantization requires CUDA; fall back to full precision on CPU)
print("Loading model...")
if torch.cuda.is_available():
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        config["model_name_or_path"],
        quantization_config=quantization_config,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        use_cache=False,  # Required for gradient checkpointing
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        config["model_name_or_path"],
        torch_dtype=torch.float32,
        use_cache=False,  # Required for gradient checkpointing
    )

# Enable gradient checkpointing for memory efficiency
model.gradient_checkpointing_enable()

# Report GPU memory usage before applying PEFT (skipped on CPU)
if torch.cuda.is_available():
    print(f"Model loaded. Memory usage: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")

# Prepare model for training with LoRA
print("Setting up LoRA with rank:", config["peft_config"]["r"])
peft_config = LoraConfig(
    r=config["peft_config"]["r"],
    lora_alpha=config["peft_config"]["lora_alpha"],
    lora_dropout=config["peft_config"]["lora_dropout"],
    bias=config["peft_config"]["bias"],
    task_type=config["peft_config"]["task_type"],
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

# Prepare the quantized model for k-bit training (skip on CPU, where the
# model was loaded without quantization), then attach the LoRA adapters
print("Preparing model for training...")
if torch.cuda.is_available():
    model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Setup training arguments with CPU optimizations
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir=config["output_dir"],
    num_train_epochs=config["num_train_epochs"],
    per_device_train_batch_size=config["per_device_train_batch_size"],
    per_device_eval_batch_size=max(1, config["per_device_train_batch_size"] // 2),
    learning_rate=config["learning_rate"],
    weight_decay=config["weight_decay"],
    save_strategy=config["save_strategy"],
    evaluation_strategy=config["evaluation_strategy"],
    fp16=config["fp16"] and torch.cuda.is_available(),
    optim=config["optim"],
    logging_steps=config["logging_steps"],
    gradient_accumulation_steps=config["gradient_accumulation_steps"],
    max_steps=config["max_steps"] if config["max_steps"] > 0 else -1,  # -1 falls back to num_train_epochs
    warmup_steps=config["warmup_steps"],
    max_grad_norm=config["max_grad_norm"],
    push_to_hub=True,
    hub_token=os.environ.get("HF_TOKEN"),
    dataloader_num_workers=0,  # Lower CPU usage for smaller machines
    use_cpu=not torch.cuda.is_available(),  # Force CPU if no GPU
    lr_scheduler_type="cosine",  # Better LR scheduling for small datasets
    report_to=["tensorboard"],  # Enable tensorboard logging
)

# Define data collator
print("Setting up data collator...")
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Tokenize dataset
print("Tokenizing dataset...")
def tokenize_function(examples):
    # Assumes the dataset exposes a "text" column.
    # Use a smaller max length on CPU to save memory
    max_length = 256 if not torch.cuda.is_available() else 512
    return tokenizer(
        examples["text"], 
        padding="max_length", 
        truncation=True, 
        max_length=max_length
    )

# Show progress while tokenizing
print("Mapping tokenization function...")
tokenized_dataset = dataset.map(
    tokenize_function, 
    batched=True,
    batch_size=8,  # Smaller batch size for CPU
    remove_columns=dataset["train"].column_names,  # Remove original columns after tokenizing
    desc="Tokenizing dataset",
)

# Initialize trainer
print("Initializing trainer...")
# train_test_split names its held-out split "test", so fall back to it when
# there is no explicit "validation" split
eval_split = "validation" if "validation" in tokenized_dataset else "test"
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset[eval_split],
    data_collator=data_collator,
)

# Start training
print("Starting training...")
try:
    trainer.train()
    # Save model and tokenizer
    print("Saving model...")
    trainer.save_model()
    print("Training completed successfully!")
except Exception as e:
    print(f"Error during training: {str(e)}")
    # Save checkpoint even if error occurred
    try:
        trainer.save_model("./checkpoint-error")
        print("Saved checkpoint before error")
    except Exception:
        print("Could not save checkpoint")
"""
    
    try:
        token = st.session_state.get("hf_token")
        if not token:
            return False, "No Hugging Face token found"
            
        username = st.session_state.get("hf_username", "user")
        repo_id = f"{username}/{repo_name}"
        
        # Write the training script to a temporary file, upload it, then clean up
        with tempfile.NamedTemporaryFile('w', delete=False, suffix='.py') as tmp:
            tmp.write(script)
            script_path = tmp.name

        try:
            # Upload file to repository
            upload_file(
                path_or_fileobj=script_path,
                path_in_repo="train.py",
                repo_id=repo_id,
                token=token
            )
        finally:
            # Clean up temporary file
            os.unlink(script_path)

        # Also create and upload a CPU-optimized requirements file
        requirements = """
transformers>=4.35.0
peft>=0.7.0
bitsandbytes>=0.40.0
datasets>=2.10.0
torch>=2.0.0
tensorboard>=2.13.0
accelerate>=0.20.0
huggingface_hub>=0.15.0
scipy>=1.10.0
"""
        
        with tempfile.NamedTemporaryFile('w', delete=False, suffix='.txt') as tmp:
            tmp.write(requirements)
            req_path = tmp.name

        try:
            # Upload file to repository
            upload_file(
                path_or_fileobj=req_path,
                path_in_repo="requirements.txt",
                repo_id=repo_id,
                token=token
            )
        finally:
            # Clean up temporary file
            os.unlink(req_path)
        
        return True, f"Training script and requirements uploaded to {repo_id}"
    except Exception as e:
        return False, str(e)
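
# Running the generated script (sketch): once setup_training_script() has
# pushed train.py, training_config.json, and requirements.txt to the repo,
# training can be launched on any machine with the repo contents checked out:
#
#   pip install -r requirements.txt
#   HF_TOKEN=<your token> python train.py
#
# train.py reads training_config.json from its working directory and pushes
# checkpoints back to the Hub (push_to_hub=True in TrainingArguments).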

def simulate_training_progress():
    """
    Simulate training progress for demonstration purposes
    """
    if "training_progress" not in st.session_state:
        st.session_state.training_progress = {
            "started": time.time(),
            "current_epoch": 0,
            "total_epochs": 3,
            "loss": 2.5,
            "learning_rate": 2e-5,
            "progress": 0.0,
            "status": "running"
        }
    
    # Update progress based on elapsed time (simulated)
    elapsed = time.time() - st.session_state.training_progress["started"]
    epoch_duration = 60  # Simulate each epoch taking 60 seconds
    
    # Calculate current progress
    total_duration = epoch_duration * st.session_state.training_progress["total_epochs"]
    progress = min(elapsed / total_duration, 1.0)
    
    # Calculate current epoch
    current_epoch = min(
        int(progress * st.session_state.training_progress["total_epochs"]),
        st.session_state.training_progress["total_epochs"]
    )
    
    # Simulate decreasing loss
    loss = max(2.5 - (progress * 2.0), 0.5)
    
    # Update session state
    st.session_state.training_progress.update({
        "progress": progress,
        "current_epoch": current_epoch,
        "loss": loss,
        "status": "completed" if progress >= 1.0 else "running"
    })
    
    return st.session_state.training_progress
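
# Usage sketch (illustrative): rendering the simulated progress in a
# Streamlit page.
#
#   progress = simulate_training_progress()
#   st.progress(progress["progress"])
#   st.write(f"Epoch {progress['current_epoch']}/{progress['total_epochs']}")
#   st.metric("Training loss", f"{progress['loss']:.3f}")
#   if progress["status"] == "completed":
#       st.success("Training simulation complete")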