---
license: mit
language:
- en
metrics:
- accuracy
tags:
- IT
- helpdesk
- classifier
- nlp
- natural-language
- classification
---
TinyBERT based model

### Fetching the model

```python
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm

# Load the TinyBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')
model = AutoModelForSequenceClassification.from_pretrained('huawei-noah/TinyBERT_General_4L_312D', num_labels=2)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fetch the state dict holding the fine-tuned weights. map_location remaps the
# stored tensors onto `device`, so the same call works on CPU-only machines.
state_dict = torch.hub.load_state_dict_from_url(
    "https://huggingface.co/KameronB/SITCC-Incident-Request-Classifier/resolve/main/tiny_bert_model.bin",
    map_location=device,
)
model.load_state_dict(state_dict)
model = model.to(device)
```

### Using the model

```python
def predict_description(model, tokenizer, text, max_length=512):
    """Classify a single ticket description.

    Args:
        model: Sequence-classification model to run.
        tokenizer: Tokenizer matching `model`.
        text: The ticket description to classify.
        max_length: Token length the input is padded/truncated to.

    Returns:
        A tuple `(predicted_class_id, probabilities)`: the index of the most
        probable class and the softmax probabilities as a nested list.
    """
    model.eval()  # Set the model to evaluation mode

    # Ensure model is on the correct device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Encode the input text
    inputs = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_token_type_ids=False,
        return_tensors='pt',
        truncation=True
    )

    # Move tensors to the correct device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Make prediction
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=-1)
        predicted_class_id = torch.argmax(probabilities, dim=-1).item()

    return predicted_class_id, probabilities.cpu().tolist()


# Example usage
tickets = [
    """Inquiry about the possibility of customizing Docker to better meet 
department-specific needs. Gathered requirements for desired customizations.""",
    """We've encountered a recurring problem with DEVEnv shutting down anytime we try to save documents. I looked over the error logs for any clues about what's going wrong. I'm passing this on to the team responsible for software upkeep."""
]

for row in tickets:
    prediction, probabilities = predict_description(model, tokenizer, row)
    prediction = (['INCIDENT', 'TASK'])[prediction]
    # `row` is the ticket text itself (a str), so print it directly
    print(f"{prediction} ({probabilities}) <== {row}")
```

### Additional fine-tuning

```python
# The dataset class
class TextDataset(Dataset):
    """Dataset that tokenizes one description per item and pairs it with its label."""

    def __init__(self, descriptions, labels, tokenizer, max_len):
        self.descriptions = descriptions
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, idx):
        """Return `input_ids`, `attention_mask` and `labels` tensors for item `idx`."""
        text = self.descriptions[idx]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True
        )
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# load the data
# Forward slashes work on Windows, Linux and macOS alike
df = pd.read_csv('../data/final_data.csv')
df['label'] = df['type'].astype('category').cat.codes  # Convert labels to category codes if they aren't already

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# create the training and validation sets and data loaders
print("cuda is available" if torch.cuda.is_available() else "cuda is unavailable: running on cpu")

# Split the data into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.15)

# Create PyTorch datasets
train_dataset = TextDataset(train_df['content'].tolist(), train_df['label'].tolist(), tokenizer, max_len=512)
val_dataset = TextDataset(val_df['content'].tolist(), val_df['label'].tolist(), tokenizer, max_len=512)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Train the model

# only these layers will be trained, customize this to your liking to freeze the ones you dont want to retrain
training_layers = [
    "bert.encoder.layer.3.output.dense.weight",
    "bert.encoder.layer.3.output.dense.bias",
    "bert.encoder.layer.3.output.LayerNorm.weight",
    "bert.encoder.layer.3.output.LayerNorm.bias",
    "bert.pooler.dense.weight",
    "bert.pooler.dense.bias",
    "classifier.weight",
    "classifier.bias",
]

for name, param in model.named_parameters():
    if name not in training_layers:  # Freeze every parameter that is not listed above
        param.requires_grad = False

# Make sure the model is on the GPU when one is available (no-op if it already is)
model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

# Training setup
# Only the unfrozen parameters are handed to the optimizer. eps and weight_decay
# are set explicitly to the defaults of the deprecated `transformers.AdamW`,
# which differ from the defaults of `torch.optim.AdamW`.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = AdamW(trainable_params, lr=5e-5, eps=1e-6, weight_decay=0.0)
epochs = 2

for epoch in range(epochs):
    model.train()
    progress = tqdm(train_loader, desc="Training Loss: inf")
    for batch in progress:
        batch = {k: v.to(model.device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # Refresh the bar text each step; a plain f-string in `desc=` is evaluated only once
        progress.set_description(f"Training Loss: {loss.item():.4f}")

    model.eval()
    correct = 0
    total = 0
    for batch in tqdm(val_loader, desc="Validating"):
        batch = {k: v.to(model.device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        # Count samples rather than averaging batch means, so a smaller last batch is weighted correctly
        correct += (predictions == batch['labels']).sum().item()
        total += batch['labels'].size(0)

    print(f"Validation Accuracy: {correct / max(total, 1)}")
```
DistilBERT based model

### Fetching the model

```python
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm

# Load the DistilBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('distilbert/distilbert-base-uncased', num_labels=2)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fetch the state dict holding the fine-tuned weights. map_location remaps the
# stored tensors onto `device`, so the same call works on CPU-only machines.
state_dict = torch.hub.load_state_dict_from_url(
    "https://huggingface.co/KameronB/SITCC-Incident-Request-Classifier/resolve/main/distilbert_1.bin",
    map_location=device,
)
model.load_state_dict(state_dict)
model = model.to(device)
```

### Using the model

```python
def predict_description(model, tokenizer, text, max_length=512):
    """Classify a single ticket description.

    Args:
        model: Sequence-classification model to run.
        tokenizer: Tokenizer matching `model`.
        text: The ticket description to classify.
        max_length: Token length the input is padded/truncated to.

    Returns:
        A tuple `(predicted_class_id, probabilities)`: the index of the most
        probable class and the softmax probabilities as a nested list.
    """
    model.eval()  # Set the model to evaluation mode

    # Ensure model is on the correct device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Encode the input text
    inputs = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_token_type_ids=False,
        return_tensors='pt',
        truncation=True
    )

    # Move tensors to the correct device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Make prediction
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=-1)
        predicted_class_id = torch.argmax(probabilities, dim=-1).item()

    return predicted_class_id, probabilities.cpu().tolist()


# Example usage
tickets = [
    """Inquiry about the possibility of customizing Docker to better meet 
department-specific needs. Gathered requirements for desired customizations.""",
    """We've encountered a recurring problem with DEVEnv shutting down anytime we try to save documents. I looked over the error logs for any clues about what's going wrong. I'm passing this on to the team responsible for software upkeep."""
]

for row in tickets:
    prediction, probabilities = predict_description(model, tokenizer, row)
    prediction = (['INCIDENT', 'TASK'])[prediction]
    # `row` is the ticket text itself (a str), so print it directly
    print(f"{prediction} ({probabilities}) <== {row}")
```

### Additional fine-tuning

```python
# The dataset class
class TextDataset(Dataset):
    """Dataset that tokenizes one description per item and pairs it with its label."""

    def __init__(self, descriptions, labels, tokenizer, max_len):
        self.descriptions = descriptions
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, idx):
        """Return `input_ids`, `attention_mask` and `labels` tensors for item `idx`."""
        text = self.descriptions[idx]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True
        )
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# load the data
# Forward slashes work on Windows, Linux and macOS alike
df = pd.read_csv('../data/final_data.csv')
df['label'] = df['type'].astype('category').cat.codes  # Convert labels to category codes if they aren't already

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# create the training and validation sets and data loaders
print("cuda is available" if torch.cuda.is_available() else "cuda is unavailable: running on cpu")

# Split the data into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.15)

# Create PyTorch datasets
train_dataset = TextDataset(train_df['content'].tolist(), train_df['label'].tolist(), tokenizer, max_len=512)
val_dataset = TextDataset(val_df['content'].tolist(), val_df['label'].tolist(), tokenizer, max_len=512)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Train the model

# only these layers will be trained, customize this to your liking to freeze the ones you dont want to retrain
training_layers = [
    "distilbert.transformer.layer.5.ffn.lin2.weight",
    "distilbert.transformer.layer.5.ffn.lin2.bias",
    "distilbert.transformer.layer.5.output_layer_norm.weight",
    "distilbert.transformer.layer.5.output_layer_norm.bias",
    "pre_classifier.weight",
    "pre_classifier.bias",
    "classifier.weight",
    "classifier.bias"
]

for name, param in model.named_parameters():
    if name not in training_layers:  # Freeze every parameter that is not listed above
        param.requires_grad = False

# Make sure the model is on the GPU when one is available (no-op if it already is)
model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

# Training setup
# Only the unfrozen parameters are handed to the optimizer. eps and weight_decay
# are set explicitly to the defaults of the deprecated `transformers.AdamW`,
# which differ from the defaults of `torch.optim.AdamW`.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = AdamW(trainable_params, lr=5e-5, eps=1e-6, weight_decay=0.0)
epochs = 2

for epoch in range(epochs):
    model.train()
    progress = tqdm(train_loader, desc="Training Loss: inf")
    for batch in progress:
        batch = {k: v.to(model.device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # Refresh the bar text each step; a plain f-string in `desc=` is evaluated only once
        progress.set_description(f"Training Loss: {loss.item():.4f}")

    model.eval()
    correct = 0
    total = 0
    for batch in tqdm(val_loader, desc="Validating"):
        batch = {k: v.to(model.device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        # Count samples rather than averaging batch means, so a smaller last batch is weighted correctly
        correct += (predictions == batch['labels']).sum().item()
        total += batch['labels'].size(0)

    print(f"Validation Accuracy: {correct / max(total, 1)}")
```
RoBERTa based model

### Base model

```python
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Load RoBERTa pre-trained model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fetch the state dict holding the fine-tuned weights. map_location remaps the
# stored tensors onto `device`, so the same call works on CPU-only machines.
state_dict = torch.hub.load_state_dict_from_url(
    "https://huggingface.co/KameronB/SITCC-Incident-Request-Classifier/resolve/main/pytorch_model.bin",
    map_location=device,
)
model.load_state_dict(state_dict)
model = model.to(device)
```

### Use model to make predictions

```python
def predict_description(model, tokenizer, text, max_length=512):
    """Classify a single ticket description.

    Args:
        model: Sequence-classification model to run.
        tokenizer: Tokenizer matching `model`.
        text: The ticket description to classify.
        max_length: Token length the input is padded/truncated to.

    Returns:
        The index of the most probable class.
    """
    model.eval()  # Set the model to evaluation mode

    # Ensure model is on the correct device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Encode the input text
    inputs = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_token_type_ids=False,
        return_tensors='pt',
        truncation=True
    )

    # Move tensors to the correct device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Make prediction
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=-1)
        predicted_class_id = torch.argmax(probabilities, dim=-1).item()

    return predicted_class_id


# Print the label so the result is visible when run as a script, not only in a notebook
label = (['INCIDENT', 'REQUEST'])[predict_description(model, tokenizer, """My ID card is not being detected.""")]
print(label)
```