Update app.py
app.py CHANGED
@@ -50,20 +50,6 @@ if num_args == 6:
     batch_size_for_trainer = int(arg4) # batch sizes to send to trainer
     should_produce_eval_matrix = int(arg5) # should produce matrix?
     path_to_save_trained_model_to = arg6
-
-    print(f"should train model? : {arg1}")
-    print (f"file to train on : {arg2}")
-    print (f"file to evaluate on : {arg3}")
-    print (f"batch size : {arg4}")
-    print (f"should produce eval matrix : {arg5}")
-    print (f"path to save trained model : {arg6}")
-
-    print(f"should train model? : {should_train_model}")
-    print (f"file to train on : {train_file}")
-    print (f"file to evaluate on : {test_file}")
-    print (f"batch size : {batch_size_for_trainer}")
-    print (f"should produce eval matrix : {should_produce_eval_matrix}")
-    print (f"path to save trained model : {path_to_save_trained_model_to}")
 
 else:
     print(f"Only {num_args-1} arguments after filename were passed out of 6")
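For reference, a hypothetical invocation matching the six positional arguments read above (the file names and values are placeholders, not taken from the diff):

# python app.py 1 train_data.csv eval_data.csv 16 1 ./trained_model
#   arg1: should train model ('1' = train)
#   arg2: file to train on
#   arg3: file to evaluate on
#   arg4: batch size for trainer
#   arg5: should produce eval matrix ('1' = yes)
#   arg6: path to save trained model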
@@ -101,8 +87,6 @@ if (should_train_model=='1'): #train model
 
     repo_name = "Reyad-Ahmmed/hf-data-timeframe"
 
-    # Tokenization - get Tokenizer for roberta-base (must match model - also roberta-base)
-    # tokenizer = BertTokenizer.from_pretrained('./mitra_ai_fleet_bert_tokenizer')
     tokenizer = BertTokenizer.from_pretrained(repo_name, subfolder="bert_embeddings_finetune")
     # I made sure to add all the ones in the training and eval data to this list
     # since we are training using data that only contains the left tag - we don't need right tags added to this list
@@ -112,8 +96,6 @@ if (should_train_model=='1'): #train model
 
     # Model
     model = BertForSequenceClassification.from_pretrained(repo_name, subfolder="bert_embeddings_finetune", output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cpu')
-    # model = BertForSequenceClassification.from_pretrained('./mitra_ai_fleet_bert', output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cpu')
-
 
     # Reset tokenizer size to include the new size after adding the tags to the tokenizer's tokens
     model.resize_token_embeddings(len(tokenizer))
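The comments above refer to tags that were added to the tokenizer before the embedding matrix is resized; the tag list itself sits outside this hunk. A minimal sketch of that pattern, with a hypothetical tag name, would be:

# Hypothetical sketch - the actual tag list lives elsewhere in app.py
new_tags = ["<left_tag>"]                      # tags that appear in the training/eval data
tokenizer.add_tokens(new_tags)                 # extend the tokenizer vocabulary
model.resize_token_embeddings(len(tokenizer))  # grow the embedding matrix to match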
@@ -153,8 +135,6 @@ if (should_train_model=='1'): #train model
     emotions_dataset_train = Dataset.from_dict(emotions_dict_train)
     emotions_dataset_test = Dataset.from_dict(emotions_dict_test)
 
-
-
     # Step 4: Split dataset into train and validation
     # Create top level dictionary with both datasets (will contain two keys: one for "train" whose value is the training dataset
     # and one for "validation" with test dataset)
@@ -163,12 +143,10 @@ if (should_train_model=='1'): #train model
         'validation': emotions_dataset_test
     })
 
-
     # Define the tokenize function
     def tokenize(batch):
         return tokenizer(batch["text"], padding=True, truncation=True)
 
-
    # Apply tokenization by mapping the entire dataset (both training and validation) to tokenizer function
     # this will add the "input_id" and "attention_mask" columns
     emotions_encoded = emotions_encoded.map(tokenize, batched=True)
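The emotions_encoded object being mapped above is only partially visible in this hunk; based on the "train"/"validation" keys described in the comments, its construction is presumably a DatasetDict along these lines (a sketch, not the exact code from app.py):

from datasets import Dataset, DatasetDict

# Assumed construction of the dictionary referenced just above
emotions_encoded = DatasetDict({
    'train': emotions_dataset_train,
    'validation': emotions_dataset_test
})
emotions_encoded = emotions_encoded.map(tokenize, batched=True)  # adds input_ids / attention_mask columns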
@@ -223,15 +201,6 @@ if (should_train_model=='1'): #train model
 
             return (loss, outputs) if return_outputs else loss
 
-
-    # trainer = CustomTrainer(
-    # model=model,
-    # compute_metrics=compute_metrics,
-    # args=training_args,
-    # train_dataset=emotions_encoded["train"],
-    # eval_dataset=emotions_encoded["validation"],
-    # tokenizer=tokenizer )
-
     trainer = Trainer(
         model=model,
         args=training_args,
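The Trainer call is cut off at args=training_args; together with the removed CustomTrainer block, the full instantiation presumably continues with the dataset and tokenizer arguments. A hedged sketch of such a setup (the compute_metrics wiring and the final train() call are assumptions, not shown in this diff):

from transformers import Trainer

# Sketch only - arguments beyond args=training_args are assumed, not visible in the hunk
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()  # presumably invoked later in app.py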