Spaces:

Reyad-Ahmmed
/

HF_Python

Paused

App Files Files Community

Reyad-Ahmmed committed on Mar 17

Commit

0523aca

verified ·

1 Parent(s): caab6dc

Create app.py

Browse files

Files changed (1) hide show

app.py +421 -0

app.py ADDED Viewed

	@@ -0,0 +1,421 @@

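+# Original command-line usage: python hf-fine-tune-fleet-8.py 1 train_fleet test_fleet 1 1 saved_fleet_model
+# (this version reads the same settings from config.json as arg2..arg7 instead of sys.argv)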
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
+import torch
+from torch.utils.data import Dataset
+from torch.utils.data import DataLoader
+from transformers import RobertaTokenizer, RobertaForSequenceClassification
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, confusion_matrix
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+import sys
+import torch.nn.functional as F
+from torch.nn import CrossEntropyLoss
+from sklearn.decomposition import PCA
+import matplotlib.pyplot as plt
+import re
+from datasets import load_dataset, DatasetDict
+import time
+import pprint
+import json
+from huggingface_hub import HfApi, login, upload_folder, create_repo
+import os
+# Load configuration file
+with open('config.json', 'r') as config_file:
+    config = json.load(config_file)
+num_args = len(config)
+arg2 = config.get('arg2', '1')
+arg3 = config.get('arg3', 'train_fleet')
+arg4 = config.get('arg4', 'train_fleet')
+arg5 = config.get('arg5', '1')
+arg6 = config.get('arg6', '1')
+arg7 = config.get('arg7', 'saved_fleet_model')
+if num_args == 7:
+    # cmd args
+    # sys.argv[0] is the script name, sys.argv[1] is the first argument, etc.
+    should_train_model = arg2  # should train model?
+    train_file = arg3   # training file name
+    test_file = arg4    # eval file name
+    batch_size_for_trainer = int(arg5)  # batch sizes to send to trainer
+    should_produce_eval_matrix = int(arg6)     # should produce matrix?
+    path_to_save_trained_model_to = arg7
+    print(f"should train model? : {arg2}")
+    print (f"file to train on : {arg3}")
+    print (f"file to evaluate on : {arg4}")
+    print (f"batch size : {arg5}")
+    print (f"should produce eval matrix : {arg6}")
+    print (f"path to save trained model : {arg7}")
+    print(f"should train model? : {should_train_model}")
+    print (f"file to train on : {train_file}")
+    print (f"file to evaluate on : {test_file}")
+    print (f"batch size : {batch_size_for_trainer}")
+    print (f"should produce eval matrix : {should_produce_eval_matrix}")
+    print (f"path to save trained model : {path_to_save_trained_model_to}")
+else:
+    print(f"Only {num_args-1} arguments after filename were passed out of 6")
+    sys.exit()
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0" #only use 1 of my GPS (in case very weak ones are installed which would slow the training down)
+device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+if (should_train_model=='1'): #train model
+    #settings
+    model_save_path = path_to_save_trained_model_to
+    bias_non_fleet = 1.0
+    epochs_to_run = 15
+    file_path_train = train_file + ".csv"
+    file_path_test = test_file + ".csv"
+    # Read the CSV files into pandas DataFrames they will later by converted to DataTables and used to train and evaluate the model
+    file_train_df = pd.read_csv(file_path_train)
+    file_test_df = pd.read_csv(file_path_test)
+    #combine dataframes to get all possible labels/classifications for both training and evaluating - to get all possible labels (intents)
+    df = pd.concat([file_train_df, file_test_df], ignore_index=True)
+    sorted_labels = sorted(df['label'].unique())
+    #create labels map from unique sorted labels
+    label_mapping = {label: i for i, label in enumerate(sorted_labels)}
+    print("label mappings")
+    print(label_mapping)
+    repo_name = "Reyad-Ahmmed/hf-data-timeframe"
+    # Tokenization - get Tokenizer for roberta-base (must match model - also roberta-base)
+    # tokenizer = BertTokenizer.from_pretrained('./mitra_ai_fleet_bert_tokenizer')
+    tokenizer = BertTokenizer.from_pretrained(repo_name, subfolder="bert_embeddings_finetune")
+    # I made sure to add all the ones in the training and eval data to this list
+    # since we are training using data that only contains the left tag - we don't need right tags added to this list
+    new_tokens = ['<EMPLOYEE_FIRST_NAME>', '<EMPLOYEE_LAST_NAME>','<POINT_ADDRESS>', '<TRUCK_NAME>', '<POINT_CLASS_NAME>', '<POINT_NAME>', '<TRUCK_CLASS_NAME>', '<TRUCK_STATUS_NAME>]']
+    tokenizer.add_tokens(new_tokens)
+    # Model
+    model = BertForSequenceClassification.from_pretrained(repo_name, subfolder="bert_embeddings_finetune", output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cuda')
+    # model = BertForSequenceClassification.from_pretrained('./mitra_ai_fleet_bert', output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cuda')
+    # Reset tokenizer size to include the new size after adding the tags to the tokenizer's tokens
+    model.resize_token_embeddings(len(tokenizer))
+    #important_tokens = ["Acura-New", "TR-9012", "TR-NEW-02"]
+    from datasets import Dataset, DatasetDict
+    from sklearn.model_selection import train_test_split
+    # Step 2: Convert string labels to integers
+    # Create a mapping from unique labels (strings) to integers
+    label_to_id = {label: idx for idx, label in enumerate(sorted(df["label"].unique()))}
+    print(label_to_id)
+    # Dataframes contain prompts and label names
+    print('before converting labels to labelIds')
+    pprint.pp(file_train_df)
+    pprint.pp(file_test_df)
+    # Apply the mapping to the labels to id (will swap out the label names with label id to the dataframes)
+    file_train_df["label"] = file_train_df["label"].map(label_to_id)
+    file_test_df["label"] = file_test_df["label"].map(label_to_id)
+    print('after swapping out label names with Ids')
+    pprint.pp(file_train_df)
+    pprint.pp(file_test_df)
+    # Step 3: Convert both dataframes to dictionaries
+    emotions_dict_train = {"text": file_train_df["text"].tolist(), "label": file_train_df["label"].tolist()}
+    emotions_dict_test = {"text": file_test_df["text"].tolist(), "label": file_test_df["label"].tolist()}
+    print('dictionaries')
+    pprint.pp(emotions_dict_train)
+    pprint.pp(emotions_dict_test)
+    # convert dictionaries to datasets
+    emotions_dataset_train = Dataset.from_dict(emotions_dict_train)
+    emotions_dataset_test = Dataset.from_dict(emotions_dict_test)
+    # Step 4: Split dataset into train and validation
+    # Create top level dictionary with both datasets (will contain two keys: one for "train" whose value is the training dataset
+    # and one for "validation" with test dataset)
+    emotions_encoded = DatasetDict({
+        'train': emotions_dataset_train,
+        'validation': emotions_dataset_test
+    })
+    # Define the tokenize function
+    def tokenize(batch):
+        return tokenizer(batch["text"], padding=True, truncation=True)
+    # Apply tokenization by mapping the entire dataset (both training and validation) to tokenizer function
+    # this will add the "input_id" and "attention_mask" columns
+    emotions_encoded = emotions_encoded.map(tokenize, batched=True)
+    emotions_encoded.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
+    # Set the model to evaluation mode (this line does not run any training or eval)
+    model.eval()
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    from sklearn.metrics import accuracy_score, f1_score
+    # Define additional compute_metrics (used as part of error-analysis - produces "accuracy" metric which can be used in another program
+    # that shows any training prompts with large losses)
+    def compute_metrics(pred):
+        logits = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
+        preds = logits.argmax(-1)
+        labels = pred.label_ids
+        accuracy = (preds == labels).astype(float).mean()
+        return {"accuracy": accuracy}
+    training_args = TrainingArguments(
+        output_dir='./results',
+        num_train_epochs=epochs_to_run,
+        per_device_train_batch_size=batch_size_for_trainer,
+        per_device_eval_batch_size=batch_size_for_trainer,
+        warmup_steps=500,
+        learning_rate=2e-5,
+        weight_decay=0.02,
+        logging_dir='./logs',
+        logging_steps=10,
+        evaluation_strategy="epoch",
+    )
+    # notice the bias_non_float in next line (it is given a value at top of code)
+    # class_weights = torch.tensor([1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,bias_non_fleet,1.0,1.0])  # Replace with your actual class weights
+    # class_weights = class_weights.to('cuda' if torch.cuda.is_available() else 'cpu')
+    # This is needed b/c loss_fn is swapped out in order to use weighted loss
+    # Any class weights that are not equal to one will make the model more (if greater than one) or less (if less than one)sensitive to given label
+    class CustomTrainer(Trainer):
+        def compute_loss(self, model, inputs, return_outputs=False):
+            labels = inputs.get("labels")
+            outputs = model(**inputs)
+            logits = outputs.get("logits")
+            # Use cross-entropy loss with class weights
+            # loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
+            loss_fn = torch.nn.CrossEntropyLoss()
+            loss = loss_fn(logits, labels)
+            return (loss, outputs) if return_outputs else loss
+    # trainer = CustomTrainer(
+    #     model=model,
+    #     compute_metrics=compute_metrics,
+    #     args=training_args,
+    #     train_dataset=emotions_encoded["train"],
+    #     eval_dataset=emotions_encoded["validation"],
+    #     tokenizer=tokenizer    )
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=emotions_encoded["train"],
+        eval_dataset=emotions_encoded["validation"],
+        tokenizer=tokenizer
+    )
+    # Train the model and set timer to measure the training time
+    start_time = time.time()
+    trainer.train()
+    end_time = time.time()
+    execution_time = end_time - start_time
+    print(f"Execution Time: {execution_time:.2f} seconds")
+    # send validation prompts through the model - will be used in error-analysis matrix below
+    preds_output = trainer.predict(emotions_encoded["validation"])
+    #################This section creates a error analysis matrix
+    # Extract the logits from the predictions output
+    logits = preds_output.predictions[0] if isinstance(preds_output.predictions, tuple) else preds_output.predictions
+    # Get the predicted class by applying argmax on the logits
+    y_preds = np.argmax(logits, axis=1)    #prediction
+    y_valid = np.array(emotions_encoded["validation"]["label"]) #labels
+    from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
+    import matplotlib.pyplot as plt
+    import numpy as np
+    from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
+    #num_labels2 = len(label_mapping)
+    print("Ypreds and valids shape")
+    print(y_preds.shape, y_valid.shape)
+    # Define the function to plot the confusion matrix
+    def plot_confusion_matrix_with_text_labels(y_preds, y_true, labels):
+        # Compute confusion matrix
+        cm = confusion_matrix(y_true, y_preds,normalize="true")
+        # Plot confusion matrix
+        fig, ax = plt.subplots(figsize=(len(labels), len(labels)))
+        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
+        disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
+        # Rotate the x-axis labels to prevent overlap
+        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
+        # Ensure the plot is displayed
+        plt.title("Normalized Confusion Matrix with Text Labels")
+        plt.tight_layout()
+        plt.savefig("confusion_matrix.png")
+        plt.show()
+    # Get unique labels for validation data only - this will be shown in the matrix
+    unique_labels = sorted(set(y_valid) | set(y_preds))
+    id_to_label = {v: k for k, v in label_to_id.items()}
+    labels = [id_to_label[label] for label in unique_labels]
+    print ("unique_labels")
+    print(labels)
+    # Call the function with the correct labels
+    if(should_produce_eval_matrix == 1):
+        plot_confusion_matrix_with_text_labels(y_preds, y_valid, labels)
+    #the label mapping will be saved in the model - and retrieved by any other program using the model -
+    # for instance the pathway through this code used for inference only will retrieve this value
+    # (or like the Python program that measures poor accuracies)
+    model.config.label_mapping = label_mapping
+    # Save the model and tokenizer
+    model.save_pretrained(f"./{model_save_path}")
+    tokenizer.save_pretrained('./saved_fleet_tokenizer')
+    #for push repository
+    repo_name = "Reyad-Ahmmed/hf-data-timeframe"
+    # Your repository name
+    api_token = os.getenv("hf_token")  # Retrieve the API token from environment variable
+    if not api_token:
+        raise ValueError("API token not found. Please set the HF_API_TOKEN environment variable.")
+    # Create repository (if not already created)
+    api = HfApi()
+    create_repo(repo_id=repo_name, token=api_token, exist_ok=True)
+    # Upload the model and tokenizer to the Hugging Face repository
+    upload_folder(
+        folder_path=f"{model_save_path}",
+        path_in_repo=f"{model_save_path}",
+        repo_id=repo_name,
+        token=api_token,
+        commit_message="Push fleet model",
+        #overwrite=True  # Force overwrite existing files
+    )
+    upload_folder(
+        folder_path="saved_fleet_tokenizer",
+        path_in_repo="saved_fleet_tokenizer",
+        repo_id=repo_name,
+        token=api_token,
+        commit_message="Push fleet tokenizer",
+        #overwrite=True  # Force overwrite existing files
+    )
+else:
+    print('Load Pre-trained')
+    model_save_path = "./saved_fleet_model"
+    tokenizer_save_path = "./saved_fleet_tokenizer"
+    # RobertaTokenizer.from_pretrained(model_save_path)
+    model = AutoModelForSequenceClassification.from_pretrained(model_save_path).to('cuda')
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)
+#Define the label mappings (this must match the mapping used during training)
+label_mapping = model.config.label_mapping
+label_mapping_reverse = {value: key for key, value in label_mapping.items()}
+#Function to classify user input
+def classify_user_input():
+    while True:
+        user_input = input("Enter a command (or type 'q' to quit): ")
+        if user_input.lower() == 'q':
+            print("Exiting...")
+            break
+        # Tokenize and predict
+        input_encoding = tokenizer(user_input, padding=True, truncation=True, return_tensors="pt").to('cuda')
+        with torch.no_grad():
+            #attention_mask = input_encoding['attention_mask'].clone()
+            # Modify the attention mask to emphasize certain key tokens
+            for idx, token_id in enumerate(input_encoding['input_ids'][0]):
+                word = tokenizer.decode([token_id])
+                print(word)
+                #if word.strip() in ["point", "summarize", "oil", "maintenance"]:  # Target key tokens
+                    #attention_mask[0, idx] = 2  # Increase attention weight for these words
+                # else:
+                #     attention_mask[0, idx] = 0
+            #print (attention_mask)
+            #input_encoding['attention_mask'] = attention_mask
+            output = model(**input_encoding, output_hidden_states=True)
+            # print('start-logits')
+            # print(output.logits)
+            # print('end-logits')
+            #print(output)
+            attention = output.attentions  # Get attention scores
+            #print('atten')
+            #print(attention)
+            # Apply softmax to get the probabilities (confidence scores)
+            probabilities = F.softmax(output.logits, dim=-1)
+            # tokens = tokenizer.convert_ids_to_tokens(input_encoding['input_ids'][0].cpu().numpy())
+            # # Display the attention visualization
+            # input_text = tokenizer.convert_ids_to_tokens(input_encoding['input_ids'][0])
+            prediction = torch.argmax(output.logits, dim=1).cpu().numpy()
+            # Map prediction back to label
+            print(prediction)
+            predicted_label = label_mapping_reverse[prediction[0]]
+            print(f"Predicted intent: {predicted_label}\n")
+            # Print the confidence for each label
+            print("\nLabel Confidence Scores:")
+            for i, label in label_mapping_reverse.items():
+                confidence = probabilities[0][i].item()  # Get confidence score for each label
+                print(f"{label}: {confidence:.4f}")
+            print("\n")
+#Run the function
+classify_user_input()