import gradio as gr
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
# summary function - test for a single-function Gradio interface
def bulk_function(filename):
  # Create class for data preparation
  class SimpleDataset:
      def __init__(self, tokenized_texts):
          self.tokenized_texts = tokenized_texts
      
      def __len__(self):
          return len(self.tokenized_texts["input_ids"])
      
      def __getitem__(self, idx):
          return {k: v[idx] for k, v in self.tokenized_texts.items()}

  # load tokenizer and model, create trainer
  model_name = "j-hartmann/emotion-english-distilroberta-base"
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
  trainer = Trainer(model=model)

  # debug output: inspect the uploaded file object and its path
  print(filename, type(filename))
  print(filename.name)

  # read file lines
  with open(filename.name, "r") as f:
    lines = f.readlines()
  # expects a leading index column ("Unnamed: 0" or similar); keep only the last comma-separated field (the text)
  lines_s = [item.split("\n")[0].split(",")[-1] for item in lines]
  print(lines_s)
  print(filename)
 

  # Tokenize texts and create the prediction dataset
  tokenized_texts = tokenizer(lines_s, truncation=True, padding=True)
  pred_dataset = SimpleDataset(tokenized_texts)

  # Run predictions over the whole dataset
  predictions = trainer.predict(pred_dataset)

  # Transform logits to predicted class ids and label names
  preds = predictions.predictions.argmax(-1)
  labels = pd.Series(preds).map(model.config.id2label)
  # softmax over the logits: keep the top probability per text ...
  scores = (np.exp(predictions[0]) / np.exp(predictions[0]).sum(-1, keepdims=True)).max(1)
  # ... and the raw per-class probabilities
  temp = (np.exp(predictions[0]) / np.exp(predictions[0]).sum(-1, keepdims=True))

  # work in progress
  # container
  anger = []
  disgust = []
  fear = []
  joy = []
  neutral = []
  sadness = []
  surprise = []

  # extract per-class scores (one entry per text in lines_s); column order assumed to match model.config.id2label
  for i in range(len(lines_s)):
    anger.append(temp[i][0])
    disgust.append(temp[i][1])
    fear.append(temp[i][2])
    joy.append(temp[i][3])
    neutral.append(temp[i][4])
    sadness.append(temp[i][5])
    surprise.append(temp[i][6])

  # define output dataframe
  df = pd.DataFrame(
      list(zip(lines_s, preds, labels, scores, anger, disgust, fear, joy, neutral, sadness, surprise)),
      columns=['text', 'pred', 'label', 'score', 'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'],
  )

  # save results to csv
  YOUR_FILENAME = filename.name.split(".")[0] + "_emotion_predictions" + ".csv"  # name your output file
  df.to_csv(YOUR_FILENAME)

  # return dataframe for space output
  return df
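
# --- Gradio wiring (not shown in the snippet above) ---
# A minimal sketch of how bulk_function could be exposed as a Space UI. The
# component choices below (gr.File input, gr.Dataframe output) and the title
# are assumptions for illustration, not taken from the original file.
gr_interface = gr.Interface(
    fn=bulk_function,
    inputs=gr.File(label="Upload a CSV file (one text per row, optional index column)"),
    outputs=gr.Dataframe(label="Emotion predictions"),
    title="Bulk emotion classification",
)

gr_interface.launch()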