In [None]:
!pip install click==8.0.3
!pip install cloudml_hypertune==0.1.0.dev6
!pip install hypertune==0.0.0
!pip uninstall matplotlib
!pip install matplotlib==3.1.3
!pip install numpy==1.20.3
!pip install pandas==1.3.4
!pip install protobuf==3.19.3
!pip install python-dotenv==0.19.2
!pip install cikit_learn==1.0.2
!pip install torch==1.10.1
!pip install transformers==4.15.0
!pip install hopsworks

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import hopsworks
project = hopsworks.login()

Copy your Api Key (first register/login): https://c.app.hopsworks.ai/account/api/generated

Paste it here: ··········
Connected. Call `.close()` to terminate connection gracefully.

Multiple projects found. 

	 (1) liangc40
	 (2) Lab1_for_iris

Enter project to access: 1

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/5311


## Load Feature from Hopsworks

In [4]:
fs = project.get_feature_store()
try: 
  feature_view = fs.get_feature_view(name="sentimental_analysis_feature_group", version=1)
except:
  fg = fs.get_feature_group(name="sentimental_analysis_feature_group", version=1)
  query = fg.select_all()
  feature_view = fs.create_feature_view(name="sentimental_analysis_feature_group",
                                        version=1,
                                        description="Read from pre-processed sentimental analysis dataset",
                                        labels=["label"],
                                        query=query)  



Connected. Call `.close()` to terminate connection gracefully.


## Create DataLoader and TweetsDataset

In [35]:
BATCH_SIZE = 16
MAX_LEN = 160
EPOCHS = 3

In [6]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch
import numpy as np
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

class TweetsDataset(Dataset):
  def __init__(self, message, depression, tokenizer, max_len):
    self.message = message
    self.depression = depression
    self.tokenizer = tokenizer
    self.max_len = max_len
  
  def __len__(self):
    return len(self.message)
  
  def __getitem__(self, item):
    message = str(self.message[item])
    depression = self.depression[item]

    encoding = self.tokenizer.encode_plus(
      message,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      truncation=True,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'tweet_text': message,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'depression': torch.tensor(depression, dtype=torch.long)
    }

In [22]:
def create_data_loader(message, depression, tokenizer, max_len, batch_size):
  ds = TweetsDataset(
    message = message['message'].to_numpy(),
    depression = depression['label'].to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size = batch_size,
    num_workers = 9
  )

In [23]:
train_message, test_message, train_depression, test_depression = feature_view.train_test_split(0.2)

#Creating dataloaders
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
train_data_loader = create_data_loader(train_message, train_depression, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(test_message, test_depression, tokenizer, MAX_LEN, BATCH_SIZE)
data = next(iter(train_data_loader))



## Bert-Based Depression Classier Model

In [24]:
from torch import nn, optim
import torch.nn.functional as F
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from collections import defaultdict

class DepressionClassifier(nn.Module):
  def __init__(self, n_classes, pre_trained_model_name):
    super(DepressionClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(pre_trained_model_name)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    _, pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict = False #here
    )
    output = self.drop(pooled_output)
    return self.out(output)

In [25]:
class_names = ['Not Depressed', 'Depressed']
model = DepressionClassifier(len(class_names), 'bert-base-cased')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Training Functions

In [26]:
from torch import nn, optim
import torch.nn.functional as F
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from collections import defaultdict
import matplotlib.pyplot as plt

In [27]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  model = model.train()

  losses = []
  correct_predictions = 0
  
  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    depression = d["depression"].to(device)

    outputs = model(
      input_ids = input_ids,
      attention_mask = attention_mask
    )

    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, depression)

    correct_predictions += torch.sum(preds == depression)
    losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return correct_predictions.double() / n_examples, np.mean(losses)

In [28]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  model = model.eval()
  losses = []
  correct_predictions = 0

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      depression = d["depression"].to(device)

      outputs = model(
        input_ids = input_ids,
        attention_mask = attention_mask
      )
      _, preds = torch.max(outputs, dim=1)

      loss = loss_fn(outputs, depression)

      correct_predictions += torch.sum(preds == depression)
      losses.append(loss.item())

  return correct_predictions.double() / n_examples, np.mean(losses)

In [51]:
def loss_accuracy_plots(history):
    plt.figure(1)
    plt.plot(history['train_loss'])
    plt.plot(history['val_loss'])
    plt.xlabel("Epochs [-]")
    plt.ylabel("Loss [-]")
    plt.legend(['Training loss','Validation loss'])
    plt.grid()
    plt.savefig(f"/content/Training_losses_plot.jpg")
    plt.figure(2)
    plt.plot(history['train_acc'])
    plt.plot(history['val_acc'])
    plt.xlabel("Epochs [-]")
    plt.ylabel("Loss [-]")
    plt.legend(['Training accuracy','Validation accuracy'])
    plt.grid()
    plt.savefig(f"/content/Training_accuracies_plot.jpg")

## Training Data

In [31]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Jan 11 10:55:48 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0    29W /  70W |  10716MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [32]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

In [33]:
F.softmax(model(input_ids, attention_mask), dim=1)

tensor([[0.6483, 0.3517],
        [0.7467, 0.2533],
        [0.7182, 0.2818],
        [0.6410, 0.3590],
        [0.4981, 0.5019],
        [0.6323, 0.3677],
        [0.3284, 0.6716],
        [0.6354, 0.3646],
        [0.5387, 0.4613],
        [0.5530, 0.4470],
        [0.5840, 0.4160],
        [0.6082, 0.3918],
        [0.5927, 0.4073],
        [0.5545, 0.4455],
        [0.7305, 0.2695],
        [0.6892, 0.3108]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [41]:
import gc
gc.collect()

optimizer = AdamW(model.parameters(), lr = 2e-5, correct_bias = False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0,
                                                num_training_steps = total_steps)

loss_fn = nn.CrossEntropyLoss().to(device)
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)
  
  train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device, scheduler, len(train_message))
  
  print(f'Train loss {train_loss} accuracy {train_acc}')
  
  val_acc, val_loss = eval_model(model, test_data_loader, loss_fn, device, len(test_message))
  
  print(f'Val   loss {val_loss} accuracy {val_acc}')
  
  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

Epoch 1/3
----------
Train loss 0.032615248548951696 accuracy 0.9951367781155015
Val   loss 0.03613543838475535 accuracy 0.9941662615459407
Epoch 2/3
----------
Train loss 0.021585255281155413 accuracy 0.9958662613981764
Val   loss 0.008615166831007156 accuracy 0.9990277102576568
Epoch 3/3
----------
Train loss 0.003893426973731551 accuracy 0.9993920972644377
Val   loss 0.009192386632538158 accuracy 0.9985415653864851


In [55]:
from google.colab import drive
drive.mount('/content/drive')
torch.save(model.state_dict(), '/content/drive/MyDrive/data/weights.pth')

Mounted at /content/drive


In [56]:
import os
import joblib
from hsml.schema import Schema
from hsml.model_schema import ModelSchema
from sklearn.metrics import classification_report

# We will now upload our model to the Hopsworks Model Registry. First get an object for the model registry.
mr = project.get_model_registry()
    
# The contents of the directory will be saved to the model registry. Create the dir, first.
model_dir="sentimental_analysis_model"
if os.path.isdir(model_dir) == False:
  os.mkdir(model_dir)

# Save both our model and the confusion matrix to 'model_dir', whose contents will be uploaded to the model registry
joblib.dump(model, model_dir + "/sentimental_analysis_model.pkl")  


# Specify the schema of the model's input/output using the features (X_train) and labels (y_train)
input_schema = Schema(train_message)
output_schema = Schema(train_depression)
model_schema = ModelSchema(input_schema, output_schema)

# Create an entry in the model registry that includes the model's name, desc, metrics
sentimental_analysis_model = mr.python.create_model(
    name="sentimental_analysis_model", 
    model_schema=model_schema,
    description="Sentimental Analysis Predictor"
)
    
# Upload the model to the model registry, including all files in 'model_dir'
sentimental_analysis_model.save(model_dir)

Connected. Call `.close()` to terminate connection gracefully.


  0%|          | 0/6 [00:00<?, ?it/s]

Model created, explore it at https://c.app.hopsworks.ai:443/p/5311/models/sentimental_analysis_model/1


Model(name: 'sentimental_analysis_model', version: 1)