Spaces:
Runtime error
Runtime error
File size: 2,856 Bytes
86756d8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
"""
This module contains classes for extracting positive / negative sentiment from texts.
"""
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
# ref: https://colab.research.google.com/github/chrsiebert/sentiment-roberta-large-english/blob/main/sentiment_roberta_prediction_example.ipynb
# Create class for data preparation
class SimpleDataset:
    """Minimal indexable dataset over a mapping of tokenized fields.

    ``tokenized_texts`` is a mapping from field name to a per-example
    sequence; it must contain an ``"input_ids"`` entry, which determines
    the dataset length.
    """

    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # One example per row of "input_ids".
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Pick the idx-th entry of every field to build a single example.
        example = {}
        for field, values in self.tokenized_texts.items():
            example[field] = values[idx]
        return example
class Sentiment_Extractor:
    """Add sentiment predictions to the rows of a CSV file.

    The input CSV is read with the ``cp1255`` encoding, every non-null value
    of ``text_column`` is classified with the
    ``siebert/sentiment-roberta-large-english`` model, and the scored rows
    are written to ``output_file_name`` (also ``cp1255``) with three extra
    columns: ``pred_sentiment`` (predicted class id), ``label_sentiment``
    (class name from the model config) and ``score_sentiment`` (softmax
    probability of the predicted class).

    Args:
        input_file_name: Path of the CSV file to read.
        text_column: Name of the column holding the texts to classify.
        output_file_name: Path of the CSV file to write.
    """

    def __init__(self, input_file_name, text_column, output_file_name):
        self.input_file_name = input_file_name
        self.text_column = text_column
        self.output_file_name = output_file_name

    @staticmethod
    def _attach_predictions(df_rows, logits, id2label):
        """Return a copy of ``df_rows`` with the sentiment columns appended.

        Args:
            df_rows: DataFrame holding exactly the rows that were scored,
                in the same order as ``logits``.
            logits: 2-D array-like of raw class scores, one row per example.
            id2label: Mapping from class id to class name.

        Raises:
            ValueError: If ``logits`` and ``df_rows`` differ in row count.
        """
        logits = np.asarray(logits)
        if len(logits) != len(df_rows):
            raise ValueError(
                f"got {len(logits)} predictions for {len(df_rows)} rows"
            )

        preds = logits.argmax(-1)
        # Subtract the row maximum before exponentiating so that large
        # logits cannot overflow; the softmax value is unchanged.
        exps = np.exp(logits - logits.max(axis=-1, keepdims=True))
        scores = (exps / exps.sum(axis=-1, keepdims=True)).max(axis=-1)

        # Columns are attached by position rather than by merging on the
        # text value: a merge multiplies rows whenever a text is repeated.
        df_out = df_rows.copy()
        df_out['pred_sentiment'] = preds
        df_out['label_sentiment'] = (
            pd.Series(preds, index=df_out.index).map(id2label)
        )
        df_out['score_sentiment'] = scores
        return df_out

    def run(self):
        """Read the input CSV, score its texts and write the output CSV.

        Rows whose text is missing are not scored and are left out of the
        output file.
        """
        # Read the data first so a bad path or column name fails before the
        # (large) model is loaded.
        df_pred = pd.read_csv(self.input_file_name, encoding='cp1255')
        df_rows = df_pred.dropna(subset=[self.text_column])
        pred_texts = df_rows[self.text_column].astype('str').tolist()

        if not pred_texts:
            # Nothing to classify: still emit the expected columns, without
            # loading the model.
            df_output = self._attach_predictions(df_rows, np.empty((0, 1)), {})
            df_output.to_csv(self.output_file_name, encoding='cp1255', index=False)
            return

        # Load tokenizer and model, create trainer
        model_name = "siebert/sentiment-roberta-large-english"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        trainer = Trainer(model=model)

        # Tokenize texts and create prediction data set
        tokenized_texts = tokenizer(pred_texts, truncation=True, padding=True)
        pred_dataset = SimpleDataset(tokenized_texts)

        # Run predictions; the ``predictions`` field holds the raw logits.
        predictions = trainer.predict(pred_dataset)

        df_output = self._attach_predictions(
            df_rows, predictions.predictions, model.config.id2label
        )
        df_output.to_csv(self.output_file_name, encoding='cp1255', index=False)
if __name__ == "__main__":
    # Arguments
    # INPUT_FILE_NAME is the name of the input file
    INPUT_FILE_NAME = "tagging_MMD_db_with_summarized.csv"
    # TEXT_COLUMN is the name of the text column in the input file
    # from which we extract the positive / negative sentiment by the 🤗 model.
    TEXT_COLUMN = "text"
    # OUTPUT_FILE_NAME is the name of the file the results are written to
    OUTPUT_FILE_NAME = 'tagging_MMD_db_with_sentiment.csv'
    # Run Sentiment_Extractor on the given arguments.
    # All three constructor arguments are required; they are passed by
    # keyword so the text column cannot be silently skipped or misplaced.
    obj = Sentiment_Extractor(
        input_file_name=INPUT_FILE_NAME,
        text_column=TEXT_COLUMN,
        output_file_name=OUTPUT_FILE_NAME,
    )
    obj.run()