In [None]:
# Mount Google Drive into the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Installing Libraries

In [None]:
!pip install huggingface_hub transformers datasets gradio pipreqs TextBlob emot xformers
!pip install accelerate>=0.20.1
!pip install transformers[torch] accelerate -U

Collecting huggingface_hub
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m106.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.2-py3-none-any.whl (518 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.9/518.9 kB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gradio
  Downloading gradio-3.39.0-py3-none-any.whl (19.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.9/19.9 MB[0m [31m86.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pipreqs
  Downloading pipreqs-0.4.13-py2.py3-none-any.whl (33 kB)
Collecting emot
  Downloading emot-3.1-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━

In [None]:
# Authenticate with the Hugging Face Hub from inside the notebook
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Loading Libraries

In [None]:
# Import libraries
import os
import uuid
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from scipy.special import softmax
import gradio as gr

from google.colab import drive
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoTokenizer
from transformers import AutoConfig
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import IntervalStrategy
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback
from transformers import pipeline
from transformers import TrainingArguments
from transformers import Trainer
from torch import nn



In [None]:
# Disable Weights & Biases (W&B) logging
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Load the training data

# Location of the raw training CSV

url = "https://github.com/Azubi-Africa/Career_Accelerator_P5-NLP/raw/master/zindi_challenge/data/Train.csv"

# Read the CSV file into a DataFrame
df = pd.read_csv(url)

In [None]:
# Summarise column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   10001 non-null  object 
 1   safe_text  10001 non-null  object 
 2   label      10000 non-null  float64
 3   agreement  9999 non-null   float64
dtypes: float64(2), object(2)
memory usage: 312.7+ KB


In [None]:
# Count the missing values in each column
df.isnull().sum()

tweet_id     0
safe_text    0
label        1
agreement    2
dtype: int64

In [None]:
# Show the rows that have a missing value in at least one column
df[df.isnull().any(axis=1)]

Unnamed: 0,tweet_id,safe_text,label,agreement
4798,RQMQ0L2A,#lawandorderSVU,,
4799,I cannot believe in this day and age some pare...,1,0.666667,


In [None]:
# Read the full 'safe_text' value of row 4798 (one of the rows with missing values)
complete_text = df.iloc[4798]['safe_text']
# Display it untruncated
complete_text

'#lawandorderSVU '

In [None]:
# Fill in the missing 'label' and 'agreement' values of row 4798.
# 'safe_text' is deliberately left untouched: `complete_text` was read from this very
# cell of the frame, so writing it back would change nothing.
df.loc[4798, ['label', 'agreement']] = [0, 0.666667]

In [None]:
# Row 4799 was parsed with its values shifted one column to the left: the tweet text
# sits in 'tweet_id', the label in 'safe_text' and the agreement score in 'label',
# which leaves 'agreement' empty. Repair the row without losing the tweet text.
row_index = 4799

# The row carries no usable id, so a random UUID string serves as a unique stand-in
rand_tweet_id = str(uuid.uuid4())

# Guard on the empty 'agreement' cell so that re-running this cell does not shift the
# already-repaired row a second time.
if pd.isna(df.loc[row_index, 'agreement']):
    # Rescue the text before the id column is overwritten
    misplaced_text = df.loc[row_index, 'tweet_id']

    df.loc[row_index, 'tweet_id'] = rand_tweet_id
    df.loc[row_index, 'safe_text'] = misplaced_text
    df.loc[row_index, 'label'] = 1
    df.loc[row_index, 'agreement'] = 0.666667

In [None]:
# Count fully duplicated rows. Summing the duplicated rows themselves would add up
# (or, for text columns, concatenate) their values instead of counting them.
df.duplicated().sum()

tweet_id     0.0
safe_text    0.0
label        0.0
agreement    0.0
dtype: float64

## Handling the Imbalanced Data

In [None]:
# Number of rows per sentiment label
df['label'].value_counts()

 0.0    4909
 1.0    4054
-1.0    1038
Name: label, dtype: int64

In [None]:
# Balance the classes by random oversampling: every class is resampled (with
# replacement) up to the size of the largest class.
# NOTE(review): the train/eval split is taken from `balanced_df`, i.e. after
# oversampling, so copies of the same tweet can end up in both subsets and inflate the
# evaluation score. Consider splitting first and oversampling only the training part.

# Find the maximum count among all classes
max_class_count = df['label'].value_counts().max()

# Group the dataframe by 'label'
grouped = df.groupby('label')

# Sample each group to match the max_class_count; the fixed random_state makes the
# resampled frame identical on every run
balanced_df = grouped.apply(
    lambda x: x.sample(max_class_count, replace=True, random_state=22)
).reset_index(drop=True)

# The 'balanced_df' now contains an equal number of instances for each class
print(balanced_df['label'].value_counts())


-1.0    4909
 0.0    4909
 1.0    4909
Name: label, dtype: int64


In [None]:
# Split the balanced data into a training subset (80%) and an evaluation subset (20%)
# NOTE(review): the name `eval` shadows Python's built-in eval() from this cell onwards
train, eval = train_test_split(balanced_df, test_size=0.2, random_state=22)

In [None]:
# Summarise the training subset
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11781 entries, 11342 to 11125
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   11781 non-null  object 
 1   safe_text  11781 non-null  object 
 2   label      11781 non-null  float64
 3   agreement  11781 non-null  float64
dtypes: float64(2), object(2)
memory usage: 460.2+ KB


In [None]:
# Summarise the evaluation subset
eval.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2946 entries, 1591 to 6131
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   2946 non-null   object 
 1   safe_text  2946 non-null   object 
 2   label      2946 non-null   float64
 3   agreement  2946 non-null   float64
dtypes: float64(2), object(2)
memory usage: 115.1+ KB


In [None]:
# Save the train/eval subsets as CSV files on the mounted drive

# Destination folder
file_path = '/content/drive/MyDrive/LP5/Career_Accelerator_P5-NLP'

# Create the folder when it is missing so that to_csv does not fail on a fresh drive
os.makedirs(file_path, exist_ok=True)

train.to_csv(os.path.join(file_path, "train_subset.csv"), index=False)
eval.to_csv(os.path.join(file_path, "eval_subset.csv"), index=False)

In [None]:
# Load the saved CSV subsets into a dataset with 'train' and 'eval' splits.
# The files were written by DataFrame.to_csv with its default UTF-8 encoding, so they
# are read back as UTF-8; decoding them as ISO-8859-1 would garble every non-ASCII
# character in the tweets.
dataset = load_dataset('csv', data_files={
    'train': os.path.join(file_path, 'train_subset.csv'),
    'eval': os.path.join(file_path, 'eval_subset.csv')
}, encoding='utf-8')

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

In [None]:
# Define the training arguments
# NOTE(review): with 11,781 training rows and an effective batch of 4 * 16 = 64 (on a
# single device), one epoch is roughly 184 optimizer steps, so 500 warmup steps and a
# 500-step evaluation interval span most of the 3-epoch run — confirm this is intended.
training_args = TrainingArguments(
    output_dir='./results',                          # Directory where the model checkpoints and evaluation results will be stored
    evaluation_strategy=IntervalStrategy.STEPS,      # Interval for evaluating the model during training (every specified number of steps)
    save_strategy=IntervalStrategy.STEPS,            # Interval for saving the model during training (every specified number of steps)
    save_steps=500,                                  # Number of steps between two saves
    load_best_model_at_end=True,                     # Whether to load the best model at the end of training
    num_train_epochs=3,                              # Number of training epochs
    per_device_train_batch_size=4,                   # Batch size per GPU for training
    per_device_eval_batch_size=4,                    # Batch size per GPU for evaluation
    learning_rate=3e-5,                              # Learning rate
    weight_decay=0.01,                               # Weight decay
    warmup_steps=500,                                # Number of warmup steps
    logging_steps=500,                               # Number of steps between two logs
    gradient_accumulation_steps=16,                  # Number of steps to accumulate gradients before performing an optimizer step
    dataloader_num_workers=2,                        # Number of workers to use for loading data
    push_to_hub=True,                                # Whether to push the model checkpoints to the Hugging Face hub
    hub_model_id="slickdata/finetuned-Sentiment-classfication-ROBERTA-model",  # Model ID to use when pushing the model to the Hugging Face hub
    report_to="none",                                # Turn off logging integrations such as W&B (the WANDB_DISABLED variable is deprecated)
)


# Define the early stopping callback.
# It only takes effect when handed to the Trainer through its `callbacks` argument;
# setting it as an attribute on `training_args` is never read by the Trainer.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,                       # Number of evaluation calls with no improvement before stopping training
    early_stopping_threshold=0.01,                   # Minimum improvement in the metric for considering an improvement
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Instantiate the tokenizer that belongs to the pre-trained 'roberta-base' checkpoint
tokenizer_ROBERTA = AutoTokenizer.from_pretrained('roberta-base')

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

'\nThis code instantiates a tokenizer for the BERT (Bidirectional Encoder Representations from Transformers)\npre-trained model with the bert-base-cased configuration.\n\n'

In [None]:
# Define a function to transform the label values
def transform_labels(label):
    """Translate a raw sentiment label into the class id stored under 'labels'.

    Mapping: -1 (negative) -> 0, 0 (neutral) -> 1, 1 (positive) -> 2.
    Any other value falls back to class 0.

    Args:
        label: a dataset example (mapping) holding the raw value under 'label'.

    Returns:
        A dictionary with the single key 'labels'.
    """
    raw_value = label['label']
    # Walk the (raw value, class id) pairs in order and stop at the first match
    for sentiment, class_id in ((-1, 0), (0, 1), (1, 2)):
        if raw_value == sentiment:
            return {'labels': class_id}
    # No pair matched: keep the default class
    return {'labels': 0}

# Define a function to tokenize the text data
def tokenize_data(example):
    """Tokenize the 'safe_text' field of an example (or batch of examples).

    Sequences are padded to the tokenizer's maximum length and, because truncation is
    enabled, cut at that same length, so no input can exceed what the model accepts.

    Args:
        example: a mapping with a 'safe_text' entry.

    Returns:
        Whatever `tokenizer_ROBERTA` returns for the given text.
    """
    return tokenizer_ROBERTA(example['safe_text'], padding='max_length', truncation=True)

# Columns that are no longer needed once the labels and token ids exist
remove_columns = ['tweet_id', 'label', 'safe_text', 'agreement']

# Add the integer 'labels' column (a single pass is enough; mapping `transform_labels`
# a second time would only recompute the same values)
dataset_out = dataset.map(transform_labels)

# Tokenize the text in batches and drop the raw columns in the same pass
dataset_ROBERTA = dataset_out.map(tokenize_data, batched=True, remove_columns=remove_columns)

Map:   0%|          | 0/11781 [00:00<?, ? examples/s]

Map:   0%|          | 0/2946 [00:00<?, ? examples/s]

Map:   0%|          | 0/11781 [00:00<?, ? examples/s]

Map:   0%|          | 0/2946 [00:00<?, ? examples/s]

Map:   0%|          | 0/11781 [00:00<?, ? examples/s]

Map:   0%|          | 0/2946 [00:00<?, ? examples/s]

In [None]:
# Load the pre-trained 'roberta-base' model with a sequence-classification head sized
# for the 3 labels in our dataset, ready for fine-tuning
model_ROBERTA = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Shuffle the training split with a fixed seed
train_dataset_ROBERTA = dataset_ROBERTA['train'].shuffle(seed=10) #.select(range(40000)) # to select a part

In [None]:
# Shuffle the evaluation split with the same fixed seed
eval_dataset_ROBERTA = dataset_ROBERTA['eval'].shuffle(seed=10)

In [None]:
import numpy as np
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for an evaluation pass.

    Args:
        eval_pred: a pair that unpacks into (logits, labels).

    Returns:
        A dictionary with the single key "f1_macro".
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest score along the last axis
    predicted_classes = np.argmax(scores, axis=-1)
    return {"f1_macro": f1_score(references, predicted_classes, average='macro')}

In [None]:
# Build the Trainer for fine-tuning.
# The early-stopping callback must be passed here: the Trainer only uses callbacks it
# receives through its `callbacks` argument.
trainer_ROBERTA = Trainer(
    model=model_ROBERTA,
    args=training_args,
    train_dataset=train_dataset_ROBERTA,
    eval_dataset=eval_dataset_ROBERTA,
    compute_metrics=compute_metrics,   # Reports the macro F1 score at every evaluation
    callbacks=[early_stopping],        # Stop training once the monitored metric stops improving
)

/content/./results is already a clone of https://huggingface.co/slickdata/finetuned-Sentiment-classfication-ROBERTA-model. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
# Run the fine-tuning loop
trainer_ROBERTA.train()



Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


In [None]:
# Evaluate the model on the evaluation split
eval_results = trainer_ROBERTA.evaluate()

# Collect the evaluation results. The metric returned by compute_metrics ("f1_macro")
# is reported with the "eval_" prefix, like the other evaluation entries read below.
results_dict = {
    "Model": "roberta-base",
    "Loss": eval_results["eval_loss"],
    "F1 Macro": eval_results["eval_f1_macro"],
    "Runtime": eval_results["eval_runtime"],
    "Samples Per Second": eval_results["eval_samples_per_second"],
    "Steps Per Second": eval_results["eval_steps_per_second"],
    "Epoch": eval_results["epoch"]
}

# Create a pandas DataFrame from the dictionary
results_df = pd.DataFrame([results_dict])

# Select the row with the highest macro F1 score (higher is better, hence idxmax)
best_model = results_df.loc[results_df["F1 Macro"].idxmax()]

print(best_model)

NameError: ignored

In [None]:
# Push the final fine-tuned model to the Hugging Face model hub
# NOTE(review): the first positional parameter of Trainer.push_to_hub appears to be the
# commit message rather than a repository id, in which case this uploads to the
# `hub_model_id` set in the training arguments — confirm against the Trainer API.

trainer_ROBERTA.push_to_hub ("MissChloe/PQ_Roberta_Model")

In [None]:
# Upload the tokenizer files to the Hub repository
tokenizer_ROBERTA.push_to_hub ("MissChloe/PQ_Roberta_Model")

In [None]:
# Upload the model weights and config to the Hub repository
model_ROBERTA.push_to_hub("MissChloe/PQ_Roberta_Model")

In [None]:
# Load the tokenizer from the same Hub repository as the fine-tuned model (the
# tokenizer was pushed there as well), so both halves of the pipeline stay in sync
tokenizer = AutoTokenizer.from_pretrained("MissChloe/PQ_Roberta_Model")

# Wrap the fine-tuned model in a text-classification pipeline
model = pipeline("text-classification", model="MissChloe/PQ_Roberta_Model", tokenizer=tokenizer)

In [None]:
# Class id -> human-readable sentiment name
label_map = {0: "negative", 1: "neutral", 2: "positive"}

# Run the pipeline on a sample sentence
result = model("I love these covid vaccines.")

# The label is expected in the form "<prefix>_<index>": take the index after the
# underscore and replace the label with the matching class name
top_prediction = result[0]
class_index = int(top_prediction["label"].split("_")[1])
top_prediction["label"] = label_map[class_index]

# Show the prediction with its readable label and score
print(result)

In [None]:
!pip freeze >