{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "750fed8c",
   "metadata": {},
   "source": [
    "Setup: run the following cell first. It clones the EmTract repository, changes into it, and installs its requirements."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ccad76ec",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "D:\\Research\\FinancialMarkets\\Emotions\\Emtract\\Training\\EmTract\n"
     ]
    }
   ],
   "source": [
    "!git clone https://github.com/dvamossy/EmTract.git\n",
    "%cd EmTract\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel\n",
    "%pip install -r requirements.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2551adee",
   "metadata": {},
   "source": [
    "Text cleaner for unprocessed (raw) text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "687995ef",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Research\\FinancialMarkets\\Emotions\\Emtract\\Training\\EmTract\\emtract\\processors\\cleaning.py:68: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
      " symspell_list = pd.read_csv(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'soo well'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from emtract.processors.cleaning import clean_text\n",
    "\n",
    "# Illustrate text cleaning on a word with repeated letters\n",
    "clean_text(\"soooooo well\", segment_words=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6b81c0cd",
   "metadata": {},
   "source": [
    "Option I: classify text with the `transformers` `pipeline` helper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ca68eb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "# top_k=None asks for the score of every label; it replaces the deprecated return_all_scores=True\n",
    "classifier = pipeline(\"text-classification\", model=\"vamossyd/emtract-distilbert-base-uncased-emotion\", top_k=None)\n",
    "classifier(\"i love this!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0b9cd58f",
   "metadata": {},
   "source": [
    "Option II: tokenize the texts and predict with `Trainer`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "524cb5d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer\n",
    "\n",
    "\n",
    "# Create class for data preparation\n",
    "class SimpleDataset:\n",
    "    \"\"\"Wrap a tokenizer output so it can be indexed one example at a time.\"\"\"\n",
    "\n",
    "    def __init__(self, tokenized_texts):\n",
    "        # tokenized_texts: mapping of field name -> one entry per text\n",
    "        self.tokenized_texts = tokenized_texts\n",
    "\n",
    "    def __len__(self):\n",
    "        # one example per tokenized sequence\n",
    "        return len(self.tokenized_texts[\"input_ids\"])\n",
    "\n",
    "    def __getitem__(self, idx):\n",
    "        # the idx-th entry of every tokenizer field\n",
    "        return {k: v[idx] for k, v in self.tokenized_texts.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1f9f01f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "input_path = \"PROVIDE_PATH_TO_DATA\"\n",
    "# data = pd.read_csv(input_path)  # ASSUMING DATA IS IN CSV\n",
    "\n",
    "# If text is already cleaned:\n",
    "# texts = data.text.tolist()\n",
    "\n",
    "# Otherwise:\n",
    "# texts = data['text'].apply(clean_text).tolist()\n",
    "\n",
    "# As an example:\n",
    "texts = ['i love this', 'i do not love you', 'to the moon 🚀']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "04ce5528",
   "metadata": {},
   "outputs": [],
   "source": [
    "# If the model cannot be loaded by name, clone it with git (uncomment the line below)\n",
    "# and use emtract-distilbert-base-uncased-emotion in the model_name field of the next cell\n",
    "\n",
    "#!git clone https://huggingface.co/vamossyd/emtract-distilbert-base-uncased-emotion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "839cd230",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load tokenizer and model, create trainer\n",
    "model_name = \"vamossyd/emtract-distilbert-base-uncased-emotion\"\n",
    "# model_name = \"emtract-distilbert-base-uncased-emotion\"  # in case the model does not load\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "model = AutoModelForSequenceClassification.from_pretrained(model_name)\n",
    "trainer = Trainer(model=model)\n",
    "\n",
    "# Tokenize texts and create prediction data set\n",
    "tokenized_texts = tokenizer(texts, truncation=True, padding=True)\n",
    "pred_dataset = SimpleDataset(tokenized_texts)\n",
    "predictions = trainer.predict(pred_dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d903549",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output column name -> position of that emotion in the model's logits.\n",
    "# NOTE(review): these positions are hard-coded; confirm they agree with model.config.id2label.\n",
    "EMOTION_INDEX = {\n",
    "    'anger': 3,\n",
    "    'disgust': 4,\n",
    "    'fear': 6,\n",
    "    'happy': 1,\n",
    "    'neutral': 0,\n",
    "    'sad': 2,\n",
    "    'surprise': 5,\n",
    "}\n",
    "\n",
    "# scores raw: softmax over the label axis (subtracting the row max keeps exp() from overflowing)\n",
    "logits = predictions.predictions\n",
    "exp_shifted = np.exp(logits - logits.max(-1, keepdims=True))\n",
    "probs = exp_shifted / exp_shifted.sum(-1, keepdims=True)\n",
    "\n",
    "# predicted class per text, mapped to its label name\n",
    "preds = logits.argmax(-1)\n",
    "labels = pd.Series(preds).map(model.config.id2label)\n",
    "\n",
    "# one row per entry of texts: the text, its predicted label, then one score column per emotion\n",
    "df = pd.DataFrame({\n",
    "    'text': texts,\n",
    "    'pred_label': labels,\n",
    "    **{name: probs[:, idx] for name, idx in EMOTION_INDEX.items()},\n",
    "})\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "577f10b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# save results to csv\n",
    "output_path = \"YOUR_FILENAME_EMOTIONS.csv\"  # name your output file\n",
    "# df.to_csv(output_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ddd22317",
   "metadata": {},
   "source": [
    "Option III\n",
    "\n",
    "Batch prediction in case the data is too large to process at once."
] },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f39375b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# tqdm draws the progress bar for the batch loop; without this import the loop raises NameError\n",
    "from tqdm.auto import tqdm\n",
    "\n",
    "# Specify batch size\n",
    "batch_size = 100000\n",
    "\n",
    "# Output column name -> position of that emotion in the model's logits.\n",
    "# NOTE(review): these positions are hard-coded; confirm they agree with model.config.id2label.\n",
    "EMOTION_INDEX = {\n",
    "    'anger': 3,\n",
    "    'disgust': 4,\n",
    "    'fear': 6,\n",
    "    'happy': 1,\n",
    "    'neutral': 0,\n",
    "    'sad': 2,\n",
    "    'surprise': 5,\n",
    "}\n",
    "\n",
    "# Split the texts into batches\n",
    "text_batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]\n",
    "\n",
    "# Store the predictions (one logits array per batch, in input order)\n",
    "batch_logits = []\n",
    "\n",
    "# Iterate through batches\n",
    "for batch in tqdm(text_batches):\n",
    "    # Tokenize texts and create prediction dataset\n",
    "    tokenized_texts = tokenizer(batch, truncation=True, padding=True)\n",
    "    pred_dataset = SimpleDataset(tokenized_texts)\n",
    "    batch_logits.append(trainer.predict(pred_dataset).predictions)\n",
    "\n",
    "# Join the per-batch logits row-wise; an empty input yields an empty array instead of an error\n",
    "if batch_logits:\n",
    "    all_predictions = np.concatenate(batch_logits)\n",
    "else:\n",
    "    all_predictions = np.empty((0, len(EMOTION_INDEX)))\n",
    "\n",
    "# scores raw: softmax over the label axis (subtracting the row max keeps exp() from overflowing)\n",
    "exp_shifted = np.exp(all_predictions - all_predictions.max(-1, keepdims=True))\n",
    "probs = exp_shifted / exp_shifted.sum(-1, keepdims=True)\n",
    "\n",
    "# one row per entry of texts: the text, then one score column per emotion\n",
    "df = pd.DataFrame({\n",
    "    'text': texts,\n",
    "    **{name: probs[:, idx] for name, idx in EMOTION_INDEX.items()},\n",
    "})\n",
    "df.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}