llaa33219 committed
Commit 0dbb2c9 · verified · Parent(s): cb69d8f

Upload 4 files

Files changed (1):
  app.py  +39 -14

app.py CHANGED
@@ -133,15 +133,8 @@ def train_model(epochs, batch_size, learning_rate, resume=False, progress=gr.Pro
     # Preprocess dataset
     progress(0.3, desc="Preprocessing dataset...")
 
-    def tokenize_function(examples):
-        # Process conversations
-        processed = preprocess_conversations(examples, tokenizer)
-        # Add labels (copy of input_ids for language modeling)
-        processed["labels"] = [ids[:] for ids in processed["input_ids"]]
-        return processed
-
     tokenized_dataset = dataset.map(
-        tokenize_function,
+        lambda x: preprocess_conversations(x, tokenizer),
         batched=True,
         remove_columns=dataset.column_names
     )
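
In this first hunk, label creation moves out of the tokenization step, so the mapped function only has to return input_ids and attention_mask; the collator added further down builds the labels. Below is a minimal sketch of that call shape, using a hypothetical preprocess_conversations_stub and the GPT-2 tokenizer purely for illustration (the real preprocess_conversations in app.py is not shown in this diff):

from datasets import Dataset
from transformers import AutoTokenizer

def preprocess_conversations_stub(examples, tokenizer):
    # Stand-in for app.py's preprocess_conversations: it only needs to return
    # input_ids and attention_mask; labels are produced later by the collator.
    enc = tokenizer(examples["text"], truncation=True, max_length=512)
    return {"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"]}

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any causal-LM tokenizer works here
dataset = Dataset.from_dict({"text": ["hello world", "a longer example sentence"]})

tokenized_dataset = dataset.map(
    lambda x: preprocess_conversations_stub(x, tokenizer),
    batched=True,
    remove_columns=dataset.column_names,
)
print(tokenized_dataset.column_names)  # ['input_ids', 'attention_mask']

Keeping the map output free of labels means the collator alone decides which positions contribute to the loss.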
@@ -179,12 +172,44 @@ def train_model(epochs, batch_size, learning_rate, resume=False, progress=gr.Pro
         greater_is_better=False,
     )
 
-    # Data collator with padding
-    data_collator = DataCollatorForLanguageModeling(
-        tokenizer=tokenizer,
-        mlm=False,
-        pad_to_multiple_of=8  # Pad to multiple of 8 for efficiency
-    )
+    # Custom data collator that handles labels properly
+    from dataclasses import dataclass
+    from typing import Any, Dict, List
+
+    @dataclass
+    class CustomDataCollator:
+        tokenizer: Any
+
+        def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
+            # Pad input_ids
+            max_length = max(len(f["input_ids"]) for f in features)
+
+            batch = {
+                "input_ids": [],
+                "attention_mask": [],
+                "labels": []
+            }
+
+            for f in features:
+                input_ids = f["input_ids"]
+                padding_length = max_length - len(input_ids)
+
+                # Pad input_ids and attention_mask
+                batch["input_ids"].append(input_ids + [self.tokenizer.pad_token_id] * padding_length)
+                batch["attention_mask"].append(f["attention_mask"] + [0] * padding_length)
+
+                # Labels: copy of input_ids with padding as -100 (ignored in loss)
+                batch["labels"].append(input_ids + [-100] * padding_length)
+
+            # Convert to tensors
+            import torch
+            return {
+                "input_ids": torch.tensor(batch["input_ids"], dtype=torch.long),
+                "attention_mask": torch.tensor(batch["attention_mask"], dtype=torch.long),
+                "labels": torch.tensor(batch["labels"], dtype=torch.long)
+            }
+
+    data_collator = CustomDataCollator(tokenizer=tokenizer)
 
     # Initialize trainer with custom loss
     trainer = CoDATrainer(
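
The new collator pads every field to the longest sequence in the batch and marks padded label positions with -100 so they are ignored by the cross-entropy loss. Below is a minimal, self-contained sketch of the same padding scheme (a condensed stand-in, not the class from the commit) applied to a toy batch, for a quick sanity check of the expected tensors:

from typing import Any, Dict, List
import torch

def collate(features: List[Dict[str, Any]], pad_token_id: int) -> Dict[str, torch.Tensor]:
    # Same padding scheme as CustomDataCollator above, condensed for a quick check
    max_length = max(len(f["input_ids"]) for f in features)
    input_ids, attention_mask, labels = [], [], []
    for f in features:
        pad = max_length - len(f["input_ids"])
        input_ids.append(f["input_ids"] + [pad_token_id] * pad)  # pad inputs with pad_token_id
        attention_mask.append(f["attention_mask"] + [0] * pad)   # mask out padded positions
        labels.append(f["input_ids"] + [-100] * pad)             # -100 is ignored by the loss
    return {k: torch.tensor(v, dtype=torch.long)
            for k, v in [("input_ids", input_ids),
                         ("attention_mask", attention_mask),
                         ("labels", labels)]}

# Toy batch of two ragged sequences
features = [
    {"input_ids": [5, 6, 7], "attention_mask": [1, 1, 1]},
    {"input_ids": [8, 9],    "attention_mask": [1, 1]},
]
batch = collate(features, pad_token_id=0)
print(batch["input_ids"].tolist())       # [[5, 6, 7], [8, 9, 0]]
print(batch["attention_mask"].tolist())  # [[1, 1, 1], [1, 1, 0]]
print(batch["labels"].tolist())          # [[5, 6, 7], [8, 9, -100]]

Labels for real tokens simply equal the inputs; the usual one-position shift for causal language modeling is applied inside the model when it computes the loss.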
 