KameronB
/

SITCC-Incident-Request-Classifier

@@ -7,10 +7,182 @@ language:
   <summary>
     TinyBERT based model
   </summary>
 </details>
 <details>
 <summary>RoBERTa based model</summary>
 ```python
 import torch
 from torch.utils.data import DataLoader, Dataset
@@ -22,14 +194,22 @@ import pandas as pd
 tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
 # Load RoBERTa pre-trained model
-model = RobertaForSequenceClassification.from_pretrained('KameronB/SITCC-Incident-Request-Classifier', num_labels=2)
-model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
 ```
 ```python
 def predict_description(model, tokenizer, text, max_length=512):

   <summary>
     TinyBERT based model
   </summary>
+### Fetching the model
+```python
+# Load the TinyBERT tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')
+model = AutoModelForSequenceClassification.from_pretrained('huawei-noah/TinyBERT_General_4L_312D', num_labels=2)
+# fetch the statedict to apply the fine-tuned weights
+state_dict = torch.hub.load_state_dict_from_url(f"https://huggingface.co/KameronB/SITCC-Incident-Request-Classifier/resolve/main/tiny_bert_model.bin")
+# if running on cpu
+# state_dict = torch.hub.load_state_dict_from_url(f"https://huggingface.co/KameronB/SITCC-Incident-Request-Classifier/resolve/main/tiny_bert_model.bin", map_location=torch.device('cpu'))
+model.load_state_dict(state_dict)
+model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
+```
+### Using the model
+```python
+# Classify one ticket description with the given model.
+#   model:      sequence-classification model whose output exposes `.logits`
+#   tokenizer:  tokenizer object providing `encode_plus`
+#   text:       the description to classify (presumably a single string - the
+#               `.item()` call below requires exactly one prediction)
+#   max_length: length the encoded input is padded / truncated to
+# Returns a tuple (predicted_class_id, probabilities): the arg-max class index
+# and the softmax probabilities converted to a plain Python list.
+def predict_description(model, tokenizer, text, max_length=512):
+    model.eval()  # Set the model to evaluation mode
+    # Use the GPU when CUDA is available, otherwise the CPU, and make sure the
+    # model lives on that device
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = model.to(device)
+    # Encode the input text as PyTorch tensors, padded and truncated to max_length
+    inputs = tokenizer.encode_plus(
+        text,
+        None,  # presumably the optional second sequence (text pair) - unused here
+        add_special_tokens=True,
+        max_length=max_length,
+        padding='max_length',
+        return_token_type_ids=False,
+        return_tensors='pt',
+        truncation=True
+    )
+    # Move the encoded tensors to the same device as the model
+    inputs = {key: value.to(device) for key, value in inputs.items()}
+    # Run the forward pass without tracking gradients
+    with torch.no_grad():
+        outputs = model(**inputs)
+        logits = outputs.logits
+        # Softmax over the last dimension turns the logits into probabilities
+        probabilities = torch.softmax(logits, dim=-1)
+        predicted_class_id = torch.argmax(probabilities, dim=-1).item()
+    # Probabilities are copied to the CPU and converted to nested Python lists
+    return predicted_class_id, probabilities.cpu().tolist()
+#Example usage
+tickets = [
+  """Inquiry about the possibility of customizing Docker to better meet department-specific needs.
+Gathered requirements for desired customizations.""",
+  """We've encountered a recurring problem with DEVEnv shutting down anytime we try to save documents.
+I looked over the error logs for any clues about what's going wrong. I'm passing this on to the team responsible for software upkeep."""
+]
+for i, row in df.sample(frac=0.01).iterrows():
+    prediction, probabilities = predict_description(model, tokenizer, row['content'])
+    prediction = (['INCIDENT', 'TASK'])[prediction]
+    print(f"{prediction} ({probabilities}) <== {row['content']}")
+```
+### Additional fine-tuning
+```python
+# The dataset class
+# Pairs a collection of descriptions with their labels; each item is tokenized
+# on access and returned as a dict of torch.long tensors.
+class TextDataset(Dataset):
+    # descriptions: indexable collection of texts
+    # labels:       values indexed by the same position as descriptions
+    # tokenizer:    tokenizer object providing `encode_plus`
+    # max_len:      length every encoded item is padded / truncated to
+    def __init__(self, descriptions, labels, tokenizer, max_len):
+        self.descriptions = descriptions
+        self.labels = labels
+        self.tokenizer = tokenizer
+        self.max_len = max_len
+    # Number of items, taken from the descriptions collection
+    def __len__(self):
+        return len(self.descriptions)
+    # Tokenize the idx-th description and return it together with its label
+    def __getitem__(self, idx):
+        text = self.descriptions[idx]
+        # No return_tensors is requested here; the values are wrapped in
+        # tensors explicitly in the returned dict below
+        inputs = self.tokenizer.encode_plus(
+            text,
+            None,  # presumably the optional second sequence (text pair) - unused here
+            add_special_tokens=True,
+            max_length=self.max_len,
+            padding='max_length',
+            return_token_type_ids=False,
+            truncation=True
+        )
+        # The 'labels' key is what the training loop passes to the model
+        # alongside the inputs via model(**batch)
+        return {
+            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
+            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
+            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
+        }
+# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+# Load the data
+# NOTE(review): backslash-separated relative path - adjust it to wherever your CSV lives
+df = pd.read_csv('..\\data\\final_data.csv')
+df['label'] = df['type'].astype('category').cat.codes  # Encode the 'type' column as integer category codes in a new 'label' column
+# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+# Create the training and validation sets and data loaders
+print( "cuda is available" if torch.cuda.is_available() else "cuda is unavailable: running on cpu")
+# Split the data into training and validation sets (15% is held out for validation)
+# NOTE(review): no random_state is passed, so the split presumably differs between runs
+train_df, val_df = train_test_split(df, test_size=0.15)
+# Create PyTorch datasets from the 'content' texts and the encoded 'label' column
+train_dataset = TextDataset(train_df['content'].tolist(), train_df['label'].tolist(), tokenizer, max_len=512)
+val_dataset = TextDataset(val_df['content'].tolist(), val_df['label'].tolist(), tokenizer, max_len=512)
+# Create data loaders; only the training loader shuffles its samples
+train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
+val_loader = DataLoader(val_dataset, batch_size=32)
+# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+# Train the model
+# Only the parameters named below stay trainable; edit this list to choose
+# which layers are fine-tuned and which ones stay frozen
+training_layers = [
+    "bert.encoder.layer.3.output.dense.weight",
+    "bert.encoder.layer.3.output.dense.bias",
+    "bert.encoder.layer.3.output.LayerNorm.weight",
+    "bert.encoder.layer.3.output.LayerNorm.bias",
+    "bert.pooler.dense.weight",
+    "bert.pooler.dense.bias",
+    "classifier.weight",
+    "classifier.bias",
+]
+for name, param in model.named_parameters():
+    if name not in training_layers:  # Freeze every parameter that is not listed in training_layers
+        param.requires_grad = False
+# Training setup
+optimizer = AdamW(model.parameters(), lr=5e-5)
+epochs = 2
+for epoch in range(epochs):
+    model.train()
+    loss_item = float('+inf')
+    for batch in tqdm(train_loader, desc=f"Training Loss: {loss_item}"):
+        batch = {k: v.to(model.device) for k, v in batch.items()}
+        outputs = model(**batch)
+        loss = outputs.loss
+        loss.backward()
+        optimizer.step()
+        optimizer.zero_grad()
+        loss_item = loss.item()
+    model.eval()
+    total_eval_accuracy = 0
+    for batch in tqdm(val_loader, desc=f"Validation Accuracy: {total_eval_accuracy}"):
+        batch = {k: v.to(model.device) for k, v in batch.items()}
+        with torch.no_grad():
+            outputs = model(**batch)
+        logits = outputs.logits
+        predictions = torch.argmax(logits, dim=-1)
+        accuracy = (predictions == batch['labels']).cpu().numpy().mean()
+        total_eval_accuracy += accuracy
+    print(f"Validation Accuracy: {total_eval_accuracy / len(val_loader)}")
+```
 </details>
 <details>
 <summary>RoBERTa based model</summary>
+### Base model
 ```python
 import torch
 from torch.utils.data import DataLoader, Dataset
 tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
 # Load RoBERTa pre-trained model
+model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
+# fetch the statedict to apply the fine-tuned weights
+state_dict = torch.hub.load_state_dict_from_url(f"https://huggingface.co/KameronB/SITCC-Incident-Request-Classifier/resolve/main/pytorch_model.bin")
+# if running on cpu
+# state_dict = torch.hub.load_state_dict_from_url(f"https://huggingface.co/KameronB/SITCC-Incident-Request-Classifier/resolve/main/pytorch_model.bin", map_location=torch.device('cpu'))
+model.load_state_dict(state_dict)
+model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
 ```
+### Use model to make predictions
 ```python
 def predict_description(model, tokenizer, text, max_length=512):