Upload 4 files
app.py  CHANGED

@@ -133,15 +133,8 @@ def train_model(epochs, batch_size, learning_rate, resume=False, progress=gr.Pro
     # Preprocess dataset
     progress(0.3, desc="Preprocessing dataset...")

-    def tokenize_function(examples):
-        # Process conversations
-        processed = preprocess_conversations(examples, tokenizer)
-        # Add labels (copy of input_ids for language modeling)
-        processed["labels"] = [ids[:] for ids in processed["input_ids"]]
-        return processed
-
     tokenized_dataset = dataset.map(
-        tokenize_function,
+        lambda x: preprocess_conversations(x, tokenizer),
         batched=True,
         remove_columns=dataset.column_names
     )
@@ -179,12 +172,44 @@ def train_model(epochs, batch_size, learning_rate, resume=False, progress=gr.Pro
         greater_is_better=False,
     )

-    #
-
-
-
-
-
+    # Custom data collator that handles labels properly
+    from dataclasses import dataclass
+    from typing import Any, Dict, List
+
+    @dataclass
+    class CustomDataCollator:
+        tokenizer: Any
+
+        def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
+            # Pad input_ids
+            max_length = max(len(f["input_ids"]) for f in features)
+
+            batch = {
+                "input_ids": [],
+                "attention_mask": [],
+                "labels": []
+            }
+
+            for f in features:
+                input_ids = f["input_ids"]
+                padding_length = max_length - len(input_ids)
+
+                # Pad input_ids and attention_mask
+                batch["input_ids"].append(input_ids + [self.tokenizer.pad_token_id] * padding_length)
+                batch["attention_mask"].append(f["attention_mask"] + [0] * padding_length)
+
+                # Labels: copy of input_ids with padding as -100 (ignored in loss)
+                batch["labels"].append(input_ids + [-100] * padding_length)
+
+            # Convert to tensors
+            import torch
+            return {
+                "input_ids": torch.tensor(batch["input_ids"], dtype=torch.long),
+                "attention_mask": torch.tensor(batch["attention_mask"], dtype=torch.long),
+                "labels": torch.tensor(batch["labels"], dtype=torch.long)
+            }
+
+    data_collator = CustomDataCollator(tokenizer=tokenizer)

     # Initialize trainer with custom loss
     trainer = CoDATrainer(
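
As a quick sanity check of the new collator, the sketch below exercises it outside the app. DummyTokenizer and the toy features are illustrative stand-ins (they are not part of app.py); the collator body mirrors the CustomDataCollator added in the diff. It shows that shorter sequences are right-padded with pad_token_id, the attention_mask gets zeros over the padding, and padded label positions become -100 so the loss ignores them.

# Illustrative standalone check (not part of app.py): DummyTokenizer stands in
# for the real tokenizer; the collator logic mirrors the one added above.
from dataclasses import dataclass
from typing import Any, Dict, List

import torch

@dataclass
class DummyTokenizer:
    pad_token_id: int = 0

@dataclass
class CustomDataCollator:
    tokenizer: Any

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        max_length = max(len(f["input_ids"]) for f in features)
        batch = {"input_ids": [], "attention_mask": [], "labels": []}
        for f in features:
            ids = f["input_ids"]
            pad = max_length - len(ids)
            batch["input_ids"].append(ids + [self.tokenizer.pad_token_id] * pad)
            batch["attention_mask"].append(f["attention_mask"] + [0] * pad)
            batch["labels"].append(ids + [-100] * pad)  # padded labels ignored by the loss
        return {k: torch.tensor(v, dtype=torch.long) for k, v in batch.items()}

features = [
    {"input_ids": [5, 6, 7, 8], "attention_mask": [1, 1, 1, 1]},
    {"input_ids": [9, 10], "attention_mask": [1, 1]},
]
batch = CustomDataCollator(tokenizer=DummyTokenizer())(features)
print(batch["input_ids"].tolist())       # [[5, 6, 7, 8], [9, 10, 0, 0]]
print(batch["attention_mask"].tolist())  # [[1, 1, 1, 1], [1, 1, 0, 0]]
print(batch["labels"].tolist())          # [[5, 6, 7, 8], [9, 10, -100, -100]]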