"""
This module contains methods for extracting text sentiment from texts
"""
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
# ref: https://colab.research.google.com/github/chrsiebert/sentiment-roberta-large-english/blob/main/sentiment_roberta_prediction_example.ipynb
# Create class for data preparation
class SimpleDataset:
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts
    
    def __len__(self):
        return len(self.tokenized_texts["input_ids"])
    
    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.tokenized_texts.items()}

class Sentiment_Extractor:
    def __init__(self, input_file_name, text_column, output_file_name):
        self.input_file_name = input_file_name
        self.text_column = text_column
        self.output_file_name = output_file_name

    def run(self):
        # Load tokenizer and model, create trainer
        model_name = "siebert/sentiment-roberta-large-english"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        trainer = Trainer(model=model)
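        # Note: the Trainer is used only as a convenient batched-inference wrapper
        # via trainer.predict(); no training or fine-tuning happens in this script.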

        # Read the input CSV (cp1255 is the Windows Hebrew code page) and keep
        # only the non-empty texts for prediction
        df_pred = pd.read_csv(self.input_file_name, encoding='cp1255')
        pred_texts = df_pred[self.text_column].dropna().astype('str').tolist()

        # Tokenize texts and create prediction data set
        tokenized_texts = tokenizer(pred_texts, truncation=True, padding=True)
        pred_dataset = SimpleDataset(tokenized_texts)

        # Run predictions
        predictions = trainer.predict(pred_dataset)

        # Transform predictions into class ids, human-readable labels, and
        # softmax confidence scores for the predicted class
        preds = predictions.predictions.argmax(-1)
        labels = pd.Series(preds).map(model.config.id2label)
        probs = np.exp(predictions.predictions) / np.exp(predictions.predictions).sum(-1, keepdims=True)
        scores = probs.max(1)

        # Create a DataFrame with texts, predicted ids, labels, and scores,
        # then merge it back onto the input rows by matching the text column
        df = pd.DataFrame(
            list(zip(pred_texts, preds, labels, scores)),
            columns=['text_sentiment', 'pred_sentiment', 'label_sentiment', 'score_sentiment'],
        )
        df_output = df_pred.merge(df, left_on=self.text_column, right_on='text_sentiment')
        del df_output['text_sentiment']
        df_output.to_csv(self.output_file_name, encoding='cp1255', index=False)
        
if __name__ == "__main__":
    # Arguments
    # INPUT_FILE_NAME is the name of the input file
    INPUT_FILE_NAME = "tagging_MMD_db_with_summarized.csv"
    # TEXT_COLUMN is the name of the text column in the input file
    # from which the positive / negative sentiment is extracted with the 🤗 model.
    TEXT_COLUMN = "text"
    OUTPUT_FILE_NAME = 'tagging_MMD_db_with_sentiment.csv'

    # Run Sentiment_Extractor on the given arguments
    obj = Sentiment_Extractor(INPUT_FILE_NAME, TEXT_COLUMN, OUTPUT_FILE_NAME)
    obj.run()