Reyad-Ahmmed committed on
Commit 0523aca · verified · 1 Parent(s): caab6dc

Create app.py

Files changed (1)
  1. app.py +421 -0
app.py ADDED
@@ -0,0 +1,421 @@
+ #python hf-fine-tune-fleet-8.py 1 train_fleet test_fleet 1 1 saved_fleet_model
+
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+ from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
+ import torch
+ from torch.utils.data import Dataset
+ from torch.utils.data import DataLoader
+ from transformers import RobertaTokenizer, RobertaForSequenceClassification
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.metrics import accuracy_score, confusion_matrix
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import numpy as np
+ import sys
+ import torch.nn.functional as F
+ from torch.nn import CrossEntropyLoss
+ from sklearn.decomposition import PCA
+ import re
+ from datasets import load_dataset, DatasetDict
+ import time
+ import pprint
+ import json
+ from huggingface_hub import HfApi, login, upload_folder, create_repo
+ import os
+
+ # Load configuration file
+ with open('config.json', 'r') as config_file:
+     config = json.load(config_file)
+
+ num_args = len(config)
+
+ arg2 = config.get('arg2', '1')
+ arg3 = config.get('arg3', 'train_fleet')
+ arg4 = config.get('arg4', 'train_fleet')
+ arg5 = config.get('arg5', '1')
+ arg6 = config.get('arg6', '1')
+ arg7 = config.get('arg7', 'saved_fleet_model')
+
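+ # For reference, a minimal sketch of what config.json is assumed to look like
+ # (illustrative only - the file itself is not part of this commit; keys and defaults
+ # come from the config.get() calls above, values mirror the example command line at
+ # the top of this file). Note that the num_args == 7 check below implies the real
+ # file carries one additional entry as well:
+ #
+ # {
+ #     "arg2": "1",
+ #     "arg3": "train_fleet",
+ #     "arg4": "test_fleet",
+ #     "arg5": "1",
+ #     "arg6": "1",
+ #     "arg7": "saved_fleet_model"
+ # }
+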
+ if num_args == 7:
+     # These values were originally passed as command-line arguments (sys.argv);
+     # they are now read from config.json above.
+     should_train_model = arg2  # should train model?
+     train_file = arg3  # training file name
+     test_file = arg4  # eval file name
+     batch_size_for_trainer = int(arg5)  # batch size to send to trainer
+     should_produce_eval_matrix = int(arg6)  # should produce eval matrix?
+     path_to_save_trained_model_to = arg7
+
+     print(f"should train model? : {should_train_model}")
+     print(f"file to train on : {train_file}")
+     print(f"file to evaluate on : {test_file}")
+     print(f"batch size : {batch_size_for_trainer}")
+     print(f"should produce eval matrix : {should_produce_eval_matrix}")
+     print(f"path to save trained model : {path_to_save_trained_model_to}")
+
+ else:
+     print(f"config.json supplied {num_args} settings, but 7 are expected")
+     sys.exit()
+
+
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # only use one of my GPUs (in case very weak ones are installed, which would slow the training down)
+ device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+
+ if (should_train_model == '1'):  # train model
+
+     # settings
+     model_save_path = path_to_save_trained_model_to
+     bias_non_fleet = 1.0
+     epochs_to_run = 15
+
+     file_path_train = train_file + ".csv"
+     file_path_test = test_file + ".csv"
+
+     # Read the CSV files into pandas DataFrames; they will later be converted to Datasets
+     # and used to train and evaluate the model
+     file_train_df = pd.read_csv(file_path_train)
+     file_test_df = pd.read_csv(file_path_test)
+
+
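+     # The training and eval CSVs are assumed to carry at least a "text" column (the prompt)
+     # and a "label" column (the intent name), since those are the only columns used below.
+     # Illustrative rows only - not taken from the real data files:
+     #
+     # text,label
+     # "show fuel usage for <TRUCK_NAME> last week",truck_report
+     # "which employees visited <POINT_NAME> today",point_report
+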
+     # Combine dataframes to collect all possible labels/classifications (intents)
+     # across both the training and evaluation data
+     df = pd.concat([file_train_df, file_test_df], ignore_index=True)
+     sorted_labels = sorted(df['label'].unique())
+
+     # Create the label mapping from the unique sorted labels
+     label_mapping = {label: i for i, label in enumerate(sorted_labels)}
+     print("label mappings")
+     print(label_mapping)
+
+     repo_name = "Reyad-Ahmmed/hf-data-timeframe"
+
+     # Tokenization - get the tokenizer for the fine-tuned BERT embeddings (must match the model loaded below)
+     # tokenizer = BertTokenizer.from_pretrained('./mitra_ai_fleet_bert_tokenizer')
+     tokenizer = BertTokenizer.from_pretrained(repo_name, subfolder="bert_embeddings_finetune")
+     # All of the tags that appear in the training and eval data are added to this list.
+     # Since we train on data that only contains the left tag, we don't need right tags in this list.
+     new_tokens = ['<EMPLOYEE_FIRST_NAME>', '<EMPLOYEE_LAST_NAME>', '<POINT_ADDRESS>', '<TRUCK_NAME>', '<POINT_CLASS_NAME>', '<POINT_NAME>', '<TRUCK_CLASS_NAME>', '<TRUCK_STATUS_NAME>']
+     tokenizer.add_tokens(new_tokens)
+
+
+     # Model (loaded from the same repo/subfolder as the tokenizer above)
+     model = BertForSequenceClassification.from_pretrained(repo_name, subfolder="bert_embeddings_finetune", output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to(device)
+     # model = BertForSequenceClassification.from_pretrained('./mitra_ai_fleet_bert', output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to(device)
+
+
+     # Resize the token embeddings to the new vocabulary size after adding the tags to the tokenizer
+     model.resize_token_embeddings(len(tokenizer))
+
+     # important_tokens = ["Acura-New", "TR-9012", "TR-NEW-02"]
+
+     from datasets import Dataset, DatasetDict
+
+     # Step 2: Convert string labels to integers
+     # Create a mapping from unique labels (strings) to integers (same ordering as label_mapping above)
+     label_to_id = {label: idx for idx, label in enumerate(sorted(df["label"].unique()))}
+     print(label_to_id)
+
+     # Dataframes contain prompts and label names
+     print('before converting labels to labelIds')
+     pprint.pp(file_train_df)
+     pprint.pp(file_test_df)
+
+     # Apply the mapping to the labels (swaps out the label names for label ids in the dataframes)
+     file_train_df["label"] = file_train_df["label"].map(label_to_id)
+     file_test_df["label"] = file_test_df["label"].map(label_to_id)
+
+     print('after swapping out label names with Ids')
+     pprint.pp(file_train_df)
+     pprint.pp(file_test_df)
+
+     # Step 3: Convert both dataframes to dictionaries
+     emotions_dict_train = {"text": file_train_df["text"].tolist(), "label": file_train_df["label"].tolist()}
+     emotions_dict_test = {"text": file_test_df["text"].tolist(), "label": file_test_df["label"].tolist()}
+
+     print('dictionaries')
+     pprint.pp(emotions_dict_train)
+     pprint.pp(emotions_dict_test)
+
+     # Convert dictionaries to datasets
+     emotions_dataset_train = Dataset.from_dict(emotions_dict_train)
+     emotions_dataset_test = Dataset.from_dict(emotions_dict_test)
+
+
+     # Step 4: Build a DatasetDict holding both splits (two keys: "train" with the training
+     # dataset and "validation" with the test dataset)
+     emotions_encoded = DatasetDict({
+         'train': emotions_dataset_train,
+         'validation': emotions_dataset_test
+     })
+
+
+     # Define the tokenize function
+     def tokenize(batch):
+         return tokenizer(batch["text"], padding=True, truncation=True)
+
+
+     # Apply tokenization by mapping the tokenize function over the entire DatasetDict (both train and validation);
+     # this adds the "input_ids" and "attention_mask" columns
+     emotions_encoded = emotions_encoded.map(tokenize, batched=True)
+     emotions_encoded.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
+
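+     # After map() and set_format(), each example is expected to look roughly like
+     # {'input_ids': tensor([...]), 'attention_mask': tensor([...]), 'label': tensor(3)}
+     # (shown for illustration only; exact tensor lengths depend on per-batch padding).
+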
+     # Set the model to evaluation mode (this line does not run any training or eval)
+     model.eval()
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model.to(device)
+
+     from sklearn.metrics import accuracy_score, f1_score
+
+     # Define compute_metrics (used as part of error analysis - produces an "accuracy" metric which can be used by another
+     # program that shows any training prompts with large losses)
+     def compute_metrics(pred):
+         logits = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
+         preds = logits.argmax(-1)
+         labels = pred.label_ids
+         accuracy = (preds == labels).astype(float).mean()
+         return {"accuracy": accuracy}
+
+
+     training_args = TrainingArguments(
+         output_dir='./results',
+         num_train_epochs=epochs_to_run,
+         per_device_train_batch_size=batch_size_for_trainer,
+         per_device_eval_batch_size=batch_size_for_trainer,
+         warmup_steps=500,
+         learning_rate=2e-5,
+         weight_decay=0.02,
+         logging_dir='./logs',
+         logging_steps=10,
+         evaluation_strategy="epoch",
+     )
+
+     # Notice bias_non_fleet in the commented-out line below (it is given a value near the top of the training branch)
+     # class_weights = torch.tensor([1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,bias_non_fleet,1.0,1.0])  # Replace with your actual class weights
+     # class_weights = class_weights.to('cuda' if torch.cuda.is_available() else 'cpu')
+
+     # This subclass is needed because loss_fn is swapped out in order to use a weighted loss.
+     # Any class weight not equal to one makes the model more (if greater than one) or less (if less than one) sensitive to the given label.
+     class CustomTrainer(Trainer):
+         def compute_loss(self, model, inputs, return_outputs=False):
+             labels = inputs.get("labels")
+             outputs = model(**inputs)
+             logits = outputs.get("logits")
+
+             # Use cross-entropy loss (optionally with class weights)
+             # loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
+             loss_fn = torch.nn.CrossEntropyLoss()
+             loss = loss_fn(logits, labels)
+
+             return (loss, outputs) if return_outputs else loss
+
+
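+     # A sketch of how per-class weights could be derived from label frequencies, should the
+     # weighted-loss path above be enabled (illustrative only; it mirrors the commented-out
+     # class_weights lines rather than anything shipped in this commit):
+     #
+     # label_counts = file_train_df["label"].value_counts().sort_index()
+     # class_weights = torch.tensor((label_counts.sum() / label_counts).values, dtype=torch.float)
+     # class_weights = class_weights.to(device)
+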
+     # trainer = CustomTrainer(
+     #     model=model,
+     #     compute_metrics=compute_metrics,
+     #     args=training_args,
+     #     train_dataset=emotions_encoded["train"],
+     #     eval_dataset=emotions_encoded["validation"],
+     #     tokenizer=tokenizer)
+
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=emotions_encoded["train"],
+         eval_dataset=emotions_encoded["validation"],
+         tokenizer=tokenizer
+     )
+
+     # Train the model and set a timer to measure the training time
+     start_time = time.time()
+     trainer.train()
+     end_time = time.time()
+     execution_time = end_time - start_time
+
+     print(f"Execution Time: {execution_time:.2f} seconds")
+
+     # Send the validation prompts through the model - used in the error-analysis matrix below
+     preds_output = trainer.predict(emotions_encoded["validation"])
+
+
+     ################# This section creates an error-analysis matrix #################
+     # Extract the logits from the predictions output
+     logits = preds_output.predictions[0] if isinstance(preds_output.predictions, tuple) else preds_output.predictions
+
+     # Get the predicted class by applying argmax on the logits
+     y_preds = np.argmax(logits, axis=1)  # predictions
+     y_valid = np.array(emotions_encoded["validation"]["label"])  # labels
+
+
+     from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
+
+     # num_labels2 = len(label_mapping)
+
+     print("Ypreds and valids shape")
+     print(y_preds.shape, y_valid.shape)
+
+
+     # Define the function to plot the confusion matrix
+     def plot_confusion_matrix_with_text_labels(y_preds, y_true, labels):
+
+         # Compute the normalized confusion matrix
+         cm = confusion_matrix(y_true, y_preds, normalize="true")
+
+         # Plot confusion matrix
+         fig, ax = plt.subplots(figsize=(len(labels), len(labels)))
+         disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
+         disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
+
+         # Rotate the x-axis labels to prevent overlap
+         plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
+
+         # Save and display the plot
+         plt.title("Normalized Confusion Matrix with Text Labels")
+         plt.tight_layout()
+         plt.savefig("confusion_matrix.png")
+         plt.show()
+
+
+
+     # Get the unique labels present in the validation data and predictions - these will be shown in the matrix
+     unique_labels = sorted(set(y_valid) | set(y_preds))
+     id_to_label = {v: k for k, v in label_to_id.items()}
+     labels = [id_to_label[label] for label in unique_labels]
+
+     print("unique_labels")
+     print(labels)
+
+     # Call the function with the correct labels
+     if (should_produce_eval_matrix == 1):
+         plot_confusion_matrix_with_text_labels(y_preds, y_valid, labels)
+
+     # The label mapping will be saved in the model config and retrieved by any other program using the model -
+     # for instance the inference-only pathway through this code retrieves this value
+     # (as would a separate Python program that measures poor accuracies)
+     model.config.label_mapping = label_mapping
+
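+     # Downstream, the mapping can be read straight back off the loaded config - a minimal
+     # sketch of what a consumer is assumed to do (path_or_repo stands in for wherever the
+     # model was saved; the inference branch below does the same thing):
+     #
+     # loaded = AutoModelForSequenceClassification.from_pretrained(path_or_repo)
+     # id_to_label = {v: k for k, v in loaded.config.label_mapping.items()}
+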
+     # Save the model and tokenizer
+     model.save_pretrained(f"./{model_save_path}")
+     tokenizer.save_pretrained('./saved_fleet_tokenizer')
+
+     # Repository to push to
+     repo_name = "Reyad-Ahmmed/hf-data-timeframe"
+
+     # Retrieve the API token from the environment
+     api_token = os.getenv("hf_token")
+
+     if not api_token:
+         raise ValueError("API token not found. Please set the hf_token environment variable.")
+
+     # Create the repository (if not already created)
+     api = HfApi()
+     create_repo(repo_id=repo_name, token=api_token, exist_ok=True)
+
+     # Upload the model and tokenizer to the Hugging Face repository
+     upload_folder(
+         folder_path=f"{model_save_path}",
+         path_in_repo=f"{model_save_path}",
+         repo_id=repo_name,
+         token=api_token,
+         commit_message="Push fleet model",
+         # overwrite=True  # Force overwrite existing files
+     )
+
+     upload_folder(
+         folder_path="saved_fleet_tokenizer",
+         path_in_repo="saved_fleet_tokenizer",
+         repo_id=repo_name,
+         token=api_token,
+         commit_message="Push fleet tokenizer",
+         # overwrite=True  # Force overwrite existing files
+     )
+
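+     # Once pushed, the artifacts should be loadable straight from the Hub with the same
+     # subfolder layout used above - a minimal sketch, assuming the default folder names
+     # from config.json (the inference branch below loads from local disk instead):
+     #
+     # model = AutoModelForSequenceClassification.from_pretrained(repo_name, subfolder="saved_fleet_model")
+     # tokenizer = AutoTokenizer.from_pretrained(repo_name, subfolder="saved_fleet_tokenizer")
+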
+ else:
+     print('Load Pre-trained')
+     model_save_path = "./saved_fleet_model"
+     tokenizer_save_path = "./saved_fleet_tokenizer"
+     # RobertaTokenizer.from_pretrained(model_save_path)
+     model = AutoModelForSequenceClassification.from_pretrained(model_save_path).to(device)
+     tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)
+
+     # Retrieve the label mapping saved during training (this must match the mapping used during training)
+     label_mapping = model.config.label_mapping
+     label_mapping_reverse = {value: key for key, value in label_mapping.items()}
+
+
+     # Function to classify user input
+     def classify_user_input():
+         while True:
+             user_input = input("Enter a command (or type 'q' to quit): ")
+             if user_input.lower() == 'q':
+                 print("Exiting...")
+                 break
+
+             # Tokenize and predict
+             input_encoding = tokenizer(user_input, padding=True, truncation=True, return_tensors="pt").to(device)
+
+             with torch.no_grad():
+                 # attention_mask = input_encoding['attention_mask'].clone()
+
+                 # Modify the attention mask to emphasize certain key tokens
+                 for idx, token_id in enumerate(input_encoding['input_ids'][0]):
+                     word = tokenizer.decode([token_id])
+                     print(word)
+                     # if word.strip() in ["point", "summarize", "oil", "maintenance"]:  # Target key tokens
+                     #     attention_mask[0, idx] = 2  # Increase attention weight for these words
+                     # else:
+                     #     attention_mask[0, idx] = 0
+                 # print(attention_mask)
+                 # input_encoding['attention_mask'] = attention_mask
+
+                 output = model(**input_encoding, output_hidden_states=True)
+                 # print('start-logits')
+                 # print(output.logits)
+                 # print('end-logits')
+                 # print(output)
+                 attention = output.attentions  # Get attention scores
+                 # print('atten')
+                 # print(attention)
+
+                 # Apply softmax to get the probabilities (confidence scores)
+                 probabilities = F.softmax(output.logits, dim=-1)
+
+                 # tokens = tokenizer.convert_ids_to_tokens(input_encoding['input_ids'][0].cpu().numpy())
+                 # # Display the attention visualization
+                 # input_text = tokenizer.convert_ids_to_tokens(input_encoding['input_ids'][0])
+
+                 prediction = torch.argmax(output.logits, dim=1).cpu().numpy()
+
+                 # Map the prediction back to its label
+                 print(prediction)
+                 predicted_label = label_mapping_reverse[prediction[0]]
+
+                 print(f"Predicted intent: {predicted_label}\n")
+
+                 # Print the confidence for each label
+                 print("\nLabel Confidence Scores:")
+                 for i, label in label_mapping_reverse.items():
+                     confidence = probabilities[0][i].item()  # Get confidence score for each label
+                     print(f"{label}: {confidence:.4f}")
+                 print("\n")
+
+     # Run the function
+     classify_user_input()
+