sergey-hovhannisyan committed on
Commit
19df55a
1 Parent(s): 4838aca

final touch, finished model loading

Files changed (5)
  1. .gitignore +4 -0
  2. app.py +5 -7
  3. models/fine_tuned/.gitkeep +0 -0
  4. src/evaluate.py +37 -4
  5. src/finetune.ipynb +215 -137
.gitignore CHANGED
@@ -127,3 +127,7 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# Model training logs, checkpoints, etc.
+tokenized_encodings/
+models/
app.py CHANGED
@@ -3,23 +3,21 @@ from src.evaluate import evaluate_prompt, model_list
 
 st.title("Toxic Tweets")
 
+# description of the project
 st.info("This NLP machine learning project aims to predict the toxicity level of input tweets using fine-tuning techniques on a pre-trained language model. The project utilizes Docker containers for efficient deployment and management, while Hugging Face Spaces and Transformers provide the necessary libraries and tools for building and training the model. The model is trained on a large dataset of labeled toxic tweets to enable it to classify new input tweets as toxic or non-toxic. This project can help improve online safety by automatically flagging potentially harmful content.")
 
 # variables defined
 sentiment_model_names = model_list()
 section1, section2 = st.columns(2)
 
-# functions
+# function to predict the output
 def predict(model_name, prompt):
-    output = evaluate_prompt(model_name, prompt)[0]
-    label = output["label"]
-    score = output["score"]
+    output = evaluate_prompt(model_name, prompt)
     with section2:
-        st.write("Label:", label)
-        st.write("Score:", score)
+        st.table(output)
         st.success("Completed!")
 
-
+# main code
 with section1:
     st.header("Input")
     prompt = st.text_area("Prompt", "Eat Up Every Moment")
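
With this change, `predict` no longer unpacks a single label/score pair; it hands whatever `evaluate_prompt` returns straight to `st.table`, which renders both the list-of-dicts output of a `pipeline` and the label-indexed DataFrame produced by the fine-tuned branch. A minimal runnable sketch of the pattern follows; the stub `evaluate_prompt` and the `st.button` wiring are hypothetical stand-ins, since the hunk only shows lines 3-25 of app.py:

```python
import streamlit as st
import pandas as pd

section1, section2 = st.columns(2)

# Hypothetical stand-in for src.evaluate.evaluate_prompt, returning the same
# label-indexed DataFrame shape as the fine-tuned branch in this commit.
def evaluate_prompt(model_name, prompt):
    return pd.DataFrame({"Label": ["toxic", "insult"],
                         "Score": [92.1, 40.3]}).set_index("Label")

def predict(model_name, prompt):
    output = evaluate_prompt(model_name, prompt)
    with section2:
        st.table(output)  # accepts DataFrames as well as lists of dicts
        st.success("Completed!")

with section1:
    st.header("Input")
    prompt = st.text_area("Prompt", "Eat Up Every Moment")
    if st.button("Predict"):  # hypothetical trigger, not shown in the hunk
        predict("fine-tuned-toxic-tweets", prompt)
```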
models/fine_tuned/.gitkeep DELETED
File without changes
src/evaluate.py CHANGED
@@ -1,14 +1,47 @@
 from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
+from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
+import torch
+import numpy as np
+import pandas as pd
 
+# Public function called from app.py to evaluate the prompt
 def evaluate_prompt(model_name, prompt):
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForSequenceClassification.from_pretrained(model_name)
-
-    classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
-    return classifier(prompt)
+    # Check if the model is fine-tuned-toxic-tweets
+    if model_name == "fine-tuned-toxic-tweets":
+        return eval_fine_tuned_toxic_tweets(model_name, prompt)
+    else: # If not, use the pipeline function
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForSequenceClassification.from_pretrained(model_name)
+        classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
+        return classifier(prompt)
+
+# Private function to evaluate the prompt using the fine-tuned-toxic-tweets model
+def eval_fine_tuned_toxic_tweets(model_name, prompt):
+    # Load the model and tokenizer
+    model = DistilBertForSequenceClassification.from_pretrained("sergey-hovhannisyan/fine-tuned-toxic-tweets")
+    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
+    encoded_text = tokenizer(prompt, truncation=True, padding=True, return_tensors='pt')
+
+    with torch.no_grad():
+        output = model(**encoded_text)
+
+    # Get the labels and scores
+    labels = np.array(["toxic", "severe", "obscene", "threat", "insult", "identity hate"])
+    scores = torch.sigmoid(output.logits)*100
+    scores = scores.numpy()
+
+    # Sort the scores in descending order
+    sort_idx = np.flip(np.argsort(scores))
+    labels = labels[sort_idx]
+    scores = scores[0][sort_idx]
+
+    # Return the labels and scores as a dataframe
+    return pd.DataFrame({"Label": labels[0].tolist(), "Score": scores[0].tolist()}).set_index("Label")
 
+# List of models to choose from
 def model_list():
     sentiment_models = [
+        "fine-tuned-toxic-tweets",
         "distilbert-base-uncased-finetuned-sst-2-english",
        "bert-base-uncased",
         "albert-base-v2"]
src/finetune.ipynb CHANGED
@@ -1,140 +1,218 @@
 {
- "cells": [
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## ___Toxic Tweets Fine-Tuned Pretrained Transformer with Multi Head Classifier___"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import torch\n",
-    "from torch.utils.data import Dataset \n",
-    "from sklearn.model_selection import train_test_split\n",
-    "from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification\n",
-    "from transformers import Trainer, TrainingArguments"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Getting training dataset\n",
-    "df = pd.read_csv(\"../data/raw/train.csv\")\n",
-    "# Comments as list of strings for training texts\n",
-    "texts = df[\"comment_text\"].tolist()\n",
-    "# Labels extracted from dataframe as list of lists\n",
-    "labels = df[[\"toxic\",\"severe_toxic\",\"obscene\",\"threat\",\"insult\",\"identity_hate\"]].values.tolist()\n",
-    "# Training set split into training and validation sets\n",
-    "train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Tokenizing training and validation sets\n",
-    "class ToxicTweetsDataset(Dataset):\n",
-    "    def __init__(self, encodings, labels):\n",
-    "        self.encodings = encodings\n",
-    "        self.labels = labels\n",
-    "\n",
-    "    def __len__(self):\n",
-    "        return len(self.encodings['input_ids'])\n",
-    "\n",
-    "    def __getitem__(self, index):\n",
-    "        input_ids = self.encodings['input_ids'][index]\n",
-    "        attention_mask = self.encodings['attention_mask'][index]\n",
-    "        labels = torch.tensor(self.labels[index], dtype=torch.float32)\n",
-    "\n",
-    "        return {'input_ids': input_ids,\n",
-    "                'attention_mask': attention_mask,\n",
-    "                'labels': labels}\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Choosing base pretrained model\n",
-    "model_name = \"distilbert-base-uncased\"\n",
-    "tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)\n",
-    "\n",
-    "# Tokenizing and encoding training and validation sets\n",
-    "train_encodings = tokenizer(train_texts, truncation=True, padding=True)\n",
-    "val_encodings = tokenizer(val_texts, truncation=True, padding=True)\n",
-    "\n",
-    "# Creating datasets for training and validation\n",
-    "train_dataset = ToxicTweetsDataset(train_encodings, train_labels)\n",
-    "val_dataset = ToxicTweetsDataset(val_encodings, val_labels)\n",
-    "\n",
-    "# Setting training arguments\n",
-    "training_args = TrainingArguments(\n",
-    "    output_dir=\"../models/fine_tuned\",\n",
-    "    num_train_epochs=2, \n",
-    "    per_device_train_batch_size=16, \n",
-    "    per_device_eval_batch_size=64, \n",
-    "    warmup_steps=500,\n",
-    "    learning_rate=5e-5,\n",
-    "    weight_decay=0.01,\n",
-    "    logging_dir=\"./logs\",\n",
-    "    logging_steps=10,\n",
-    "    save_strategy=\"epoch\",\n",
-    "    save_total_limit=1\n",
-    ")\n",
-    "\n",
-    "# If GPU is available, use it, otherwise use CPU\n",
-    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
-    "\n",
-    "# Creating model\n",
-    "model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=6)\n",
-    "\n",
-    "# Creating trainer\n",
-    "trainer = Trainer(\n",
-    "    model=model, \n",
-    "    args=training_args,\n",
-    "    train_dataset=train_dataset,\n",
-    "    eval_dataset=val_dataset\n",
-    ")\n",
-    "\n",
-    "# Training model\n",
-    "trainer.train()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.2"
-  },
-  "orig_nbformat": 4
- },
- "nbformat": 4,
- "nbformat_minor": 2
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "pgTOocy70xHB"
+   },
+   "source": [
+    "## ___Toxic Tweets Fine-Tuned Pretrained Transformer with Multi Head Classifier___"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "uGpWnNct0xHC"
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "from torch.utils.data import Dataset, DataLoader\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification\n",
+    "from transformers import Trainer, TrainingArguments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "6JovL-hI8JKQ"
+   },
+   "outputs": [],
+   "source": [
+    "# If GPU is available, use it, otherwise use CPU\n",
+    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "DBhZv3Sc0xHD"
+   },
+   "outputs": [],
+   "source": [
+    "# Getting training dataset\n",
+    "df = pd.read_csv(\"../data/raw/train.csv\")\n",
+    "# Comments as list of strings for training texts\n",
+    "texts = df[\"comment_text\"].tolist()\n",
+    "# Labels extracted from dataframe as list of lists\n",
+    "labels = df[[\"toxic\",\"severe_toxic\",\"obscene\",\"threat\",\"insult\",\"identity_hate\"]].values.tolist()\n",
+    "# Training set split into training and validation sets\n",
+    "train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Q0OqpFmZ0xHD"
+   },
+   "outputs": [],
+   "source": [
+    "# Tokenizing training and validation sets\n",
+    "class ToxicTweetsDataset(Dataset):\n",
+    "    # Initialize the class variables\n",
+    "    def __init__(self, encodings, labels):\n",
+    "        self.encodings = encodings\n",
+    "        self.labels = labels\n",
+    "    # Returns the length of the dataset\n",
+    "    def __len__(self):\n",
+    "        return len(self.encodings['input_ids'])\n",
+    "    # Returns a dictionary of the tokenized text, attention mask, and labels\n",
+    "    def __getitem__(self, index):\n",
+    "        input_ids = self.encodings['input_ids'][index]\n",
+    "        attention_mask = self.encodings['attention_mask'][index]\n",
+    "        labels = torch.tensor(self.labels[index], dtype=torch.float32)\n",
+    "        return {'input_ids': input_ids,\n",
+    "                'attention_mask': attention_mask,\n",
+    "                'labels': labels}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "zRQlGBz3Dwbs"
+   },
+   "outputs": [],
+   "source": [
+    "# Choosing base pretrained model\n",
+    "model_name = \"distilbert-base-uncased\"\n",
+    "tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "AEtsC8-Y0xHE"
+   },
+   "outputs": [],
+   "source": [
+    "# Tokenizing and encoding training and validation sets\n",
+    "train_encodings = tokenizer.batch_encode_plus(train_texts, truncation=True, padding=True, return_tensors='pt')\n",
+    "val_encodings = tokenizer.batch_encode_plus(val_texts, truncation=True, padding=True, return_tensors='pt')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "bjQ_ZuJxLnb8"
+   },
+   "outputs": [],
+   "source": [
+    "# Saving encoded and tokenized data to files\n",
+    "torch.save(train_encodings, '../data/tokenized_encodings/train_encodings.pt')\n",
+    "torch.save(val_encodings, '../data/tokenized_encodings/val_encodings.pt')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "lqmHgIXwNMJQ"
+   },
+   "outputs": [],
+   "source": [
+    "# Creating training and validation datasets\n",
+    "train_encodings = torch.load('../data/tokenized_encodings/train_encodings.pt').to(device)\n",
+    "val_encodings = torch.load('../data/tokenized_encodings/val_encodings.pt').to(device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "FmEatZLqDt6p"
+   },
+   "outputs": [],
+   "source": [
+    "# Creating datasets for training and validation\n",
+    "train_dataset = ToxicTweetsDataset(train_encodings, train_labels)\n",
+    "val_dataset = ToxicTweetsDataset(val_encodings, val_labels)\n",
+    "\n",
+    "# Creating model\n",
+    "model = DistilBertForSequenceClassification.from_pretrained(model_name, problem_type=\"multi_label_classification\", num_labels=6)\n",
+    "model.to(device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "8CIjI_Q75haw"
+   },
+   "outputs": [],
+   "source": [
+    "# Setting training arguments\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir=\"../models/fine_tuned\",\n",
+    "    num_train_epochs=2, \n",
+    "    per_device_train_batch_size=16, \n",
+    "    per_device_eval_batch_size=32, \n",
+    "    warmup_steps=500,\n",
+    "    learning_rate=5e-5,\n",
+    "    weight_decay=0.01,\n",
+    "    logging_dir=\"./logs\",\n",
+    "    logging_steps=10,\n",
+    "    save_strategy=\"epoch\",\n",
+    "    save_total_limit=1,\n",
+    ")\n",
+    "\n",
+    "# Creating trainer\n",
+    "trainer = Trainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=train_dataset,\n",
+    "    eval_dataset=val_dataset,\n",
+    ")\n",
+    "\n",
+    "# Training model\n",
+    "trainer.train()"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "provenance": []
+  },
+  "gpuClass": "standard",
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }
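
The notable modeling change in the notebook is `problem_type="multi_label_classification"`: with float-valued labels, `DistilBertForSequenceClassification` computes `BCEWithLogitsLoss` over the six logits instead of a softmax cross-entropy, which is why `ToxicTweetsDataset.__getitem__` casts its labels to `float32`. A tiny sketch of the loss the Trainer ends up minimizing, with random logits and hand-written labels:

```python
import torch
from torch.nn import BCEWithLogitsLoss

logits = torch.randn(2, 6)  # batch of 2 examples, 6 toxicity heads
labels = torch.tensor([[1., 0., 1., 0., 0., 0.],   # toxic + obscene
                       [0., 0., 0., 0., 0., 0.]])  # clean tweet
loss = BCEWithLogitsLoss()(logits, labels)  # per-label binary cross-entropy
print(loss.item())
```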