Update app.py
app.py CHANGED
@@ -50,20 +50,6 @@ if num_args == 6:
     batch_size_for_trainer = int(arg4) # batch sizes to send to trainer
     should_produce_eval_matrix = int(arg5) # should produce matrix?
     path_to_save_trained_model_to = arg6
-
-    print(f"should train model? : {arg1}")
-    print (f"file to train on : {arg2}")
-    print (f"file to evaluate on : {arg3}")
-    print (f"batch size : {arg4}")
-    print (f"should produce eval matrix : {arg5}")
-    print (f"path to save trained model : {arg6}")
-
-    print(f"should train model? : {should_train_model}")
-    print (f"file to train on : {train_file}")
-    print (f"file to evaluate on : {test_file}")
-    print (f"batch size : {batch_size_for_trainer}")
-    print (f"should produce eval matrix : {should_produce_eval_matrix}")
-    print (f"path to save trained model : {path_to_save_trained_model_to}")
 
 else:
     print(f"Only {num_args-1} arguments after filename were passed out of 6")
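For reference, a hypothetical invocation matching the six positional arguments read above (the file names and values are placeholders, not taken from the diff):

# python app.py 1 train_data.csv eval_data.csv 16 1 ./trained_model
#   arg1: should train model ('1' = train)
#   arg2: file to train on
#   arg3: file to evaluate on
#   arg4: batch size for trainer
#   arg5: should produce eval matrix ('1' = yes)
#   arg6: path to save trained model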
@@ -101,8 +87,6 @@ if (should_train_model=='1'): #train model
 
     repo_name = "Reyad-Ahmmed/hf-data-timeframe"
 
-    # Tokenization - get Tokenizer for roberta-base (must match model - also roberta-base)
-    # tokenizer = BertTokenizer.from_pretrained('./mitra_ai_fleet_bert_tokenizer')
     tokenizer = BertTokenizer.from_pretrained(repo_name, subfolder="bert_embeddings_finetune")
     # I made sure to add all the ones in the training and eval data to this list
     # since we are training using data that only contains the left tag - we don't need right tags added to this list
@@ -112,8 +96,6 @@ if (should_train_model=='1'): #train model
 
     # Model
     model = BertForSequenceClassification.from_pretrained(repo_name, subfolder="bert_embeddings_finetune", output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cpu')
-    # model = BertForSequenceClassification.from_pretrained('./mitra_ai_fleet_bert', output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cpu')
-
 
     # Reset tokenizer size to include the new size after adding the tags to the tokenizer's tokens
     model.resize_token_embeddings(len(tokenizer))
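The comments above refer to tags that were added to the tokenizer before the embedding matrix is resized; the tag list itself sits outside this hunk. A minimal sketch of that pattern, with a hypothetical tag name, would be:

# Hypothetical sketch - the actual tag list lives elsewhere in app.py
new_tags = ["<left_tag>"]                      # tags that appear in the training/eval data
tokenizer.add_tokens(new_tags)                 # extend the tokenizer vocabulary
model.resize_token_embeddings(len(tokenizer))  # grow the embedding matrix to match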
@@ -153,8 +135,6 @@ if (should_train_model=='1'): #train model
     emotions_dataset_train = Dataset.from_dict(emotions_dict_train)
     emotions_dataset_test = Dataset.from_dict(emotions_dict_test)
 
-
-
     # Step 4: Split dataset into train and validation
     # Create top level dictionary with both datasets (will contain two keys: one for "train" whose value is the training dataset
     # and one for "validation" with test dataset)
@@ -163,12 +143,10 @@ if (should_train_model=='1'): #train model
         'validation': emotions_dataset_test
     })
 
-
     # Define the tokenize function
     def tokenize(batch):
         return tokenizer(batch["text"], padding=True, truncation=True)
 
-
    # Apply tokenization by mapping the entire dataset (both training and validation) to tokenizer function
     # this will add the "input_id" and "attention_mask" columns
     emotions_encoded = emotions_encoded.map(tokenize, batched=True)
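The emotions_encoded object being mapped above is only partially visible in this hunk; based on the "train"/"validation" keys described in the comments, its construction is presumably a DatasetDict along these lines (a sketch, not the exact code from app.py):

from datasets import Dataset, DatasetDict

# Assumed construction of the dictionary referenced just above
emotions_encoded = DatasetDict({
    'train': emotions_dataset_train,
    'validation': emotions_dataset_test
})
emotions_encoded = emotions_encoded.map(tokenize, batched=True)  # adds input_ids / attention_mask columns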
@@ -223,15 +201,6 @@ if (should_train_model=='1'): #train model
 
             return (loss, outputs) if return_outputs else loss
 
-
-    # trainer = CustomTrainer(
-    # model=model,
-    # compute_metrics=compute_metrics,
-    # args=training_args,
-    # train_dataset=emotions_encoded["train"],
-    # eval_dataset=emotions_encoded["validation"],
-    # tokenizer=tokenizer )
-
     trainer = Trainer(
         model=model,
         args=training_args,
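The Trainer call is cut off at args=training_args; together with the removed CustomTrainer block, the full instantiation presumably continues with the dataset and tokenizer arguments. A hedged sketch of such a setup (the compute_metrics wiring and the final train() call are assumptions, not shown in this diff):

from transformers import Trainer

# Sketch only - arguments beyond args=training_args are assumed, not visible in the hunk
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()  # presumably invoked later in app.py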