awacke1 commited on
Commit
f9ae357
1 Parent(s): 2b3a00c

Create new file

Browse files
Files changed (1) hide show
  1. app.py +142 -0
app.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ import spacy
5
+ from spacy import displacy
6
+
7
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
8
+
9
def linkify():
    """Build a small demo HTML table whose cells contain live hyperlinks.

    Returns:
        str: HTML markup for a two-row table ("url", "label") in which the
        "url" cells are clickable ``<a>`` anchors.
    """
    link1 = "https://stackoverflow.com/questions/71641666/hyperlink-in-streamlit-dataframe"
    link2 = "https://stackoverflow.com/questions/71731937/how-to-plot-comparison-in-streamlit-dynamically-with-multiselect"
    df = pd.DataFrame(
        {
            "url": [
                f'<a target="_blank" href="{link1}">Hyperlink in Streamlit dataframe</a>',
                f'<a target="_blank" href="{link2}">How to plot comparison in Streamlit dynamically with multiselect?</a>',
            ],
            "label": ["question", "question"],
        }
    )
    # escape=False keeps the <a> tags as live HTML instead of entity-escaping
    # them; index=False drops the numeric row index from the rendered table.
    # FIX: the original passed this HTML string to displacy.render(style="dep"),
    # but displacy expects spaCy Doc objects, not raw HTML, so that call raised
    # at runtime — the linked-table HTML is what callers actually need.
    return df.to_html(escape=False, index=False)
26
+
27
+
28
# Bulk emotion classification: read an uploaded CSV/XLSX (ID column + text
# column), score every row with a pretrained emotion model, and write the
# predictions to a new CSV whose path is returned for Gradio's file output.
def bulk_function(filename):
    """Classify the emotion of every text row in an uploaded file.

    Args:
        filename: Gradio file object; ``filename.name`` is the path of the
            uploaded ``.csv`` or ``.xlsx`` file. Expected layout (no index):
            column 1 = IDs, column 2 = texts.

    Returns:
        str: path of the CSV written with per-row ``max_label``/``max_score``
        plus one probability column per emotion class, or ``None`` when the
        file extension is unsupported (original silent-skip contract kept).
    """

    # Minimal torch-style dataset wrapping pre-tokenized texts for Trainer.
    class SimpleDataset:
        def __init__(self, tokenized_texts):
            self.tokenized_texts = tokenized_texts

        def __len__(self):
            return len(self.tokenized_texts["input_ids"])

        def __getitem__(self, idx):
            return {k: v[idx] for k, v in self.tokenized_texts.items()}

    # NOTE(fix): the original called linkify() here and then `gradio.HTML(html)`,
    # which raised NameError every run (the module is imported as `gr`, and a
    # bare component constructed inside a handler does nothing anyway) — removed.

    # load tokenizer and model, create trainer (inference only, no training args)
    model_name = "j-hartmann/emotion-english-distilroberta-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    trainer = Trainer(model=model)

    # FIX: split the extension off the *last* dot; the original
    # `filename.name.split(".")[1]` broke on names like "my.data.csv"
    # (both for type detection and when building the output filename).
    base, _, ext = filename.name.rpartition(".")
    ext = ext.lower()

    if ext == "csv":
        df_input = pd.read_csv(filename.name, index_col=False)
    elif ext == "xlsx":
        df_input = pd.read_excel(filename.name, index_col=False)
    else:
        return  # unsupported file type: preserve the original None return

    # Drop a spurious saved index column if present (handled once for both
    # formats; the original duplicated this check per branch).
    if df_input.columns[0] == "Unnamed: 0":
        df_input = df_input.drop("Unnamed: 0", axis=1)

    # expected layout: first column = IDs, second column = texts
    ids = df_input[df_input.columns[0]].to_list()
    lines_s = df_input[df_input.columns[1]].to_list()

    # Tokenize all texts and run the whole frame through the model in one pass.
    tokenized_texts = tokenizer(lines_s, truncation=True, padding=True)
    pred_dataset = SimpleDataset(tokenized_texts)
    predictions = trainer.predict(pred_dataset)

    # Softmax over logits -> per-class probabilities; argmax -> winning label.
    logits = predictions.predictions
    probs = np.exp(logits) / np.exp(logits).sum(-1, keepdims=True)
    labels = pd.Series(logits.argmax(-1)).map(model.config.id2label)
    scores_rounded = [round(float(s), 3) for s in probs.max(1)]

    # One score column per class, named from model.config.id2label instead of
    # the original seven hard-coded emotion lists — generalizes to any label
    # set while producing identical columns for this model.
    class_names = [model.config.id2label[i] for i in range(probs.shape[1])]
    class_cols = {
        name: [round(float(p), 3) for p in probs[:, i]]
        for i, name in enumerate(class_names)
    }

    df = pd.DataFrame(
        {
            df_input.columns[0]: ids,
            df_input.columns[1]: lines_s,
            "max_label": labels,
            "max_score": scores_rounded,
            **class_cols,
        }
    )

    # Save results next to the input and hand the path back to Gradio.
    output_filename = base + "_emotion_predictions.csv"
    df.to_csv(output_filename, index=False)
    return output_filename
132
+
133
# Wire the bulk classifier into a Gradio app: a single uploaded file in,
# the generated predictions CSV back out.
demo = gr.Interface(
    fn=bulk_function,
    inputs=[
        gr.inputs.File(file_count="single", type="file", label="Upload file", optional=False),
    ],
    outputs=[gr.outputs.File(label="Output file")],
    theme="huggingface",
    title="Emotion Classification from CSV",
    description="Upload csv file with 2 columns (in order): (a) ID column, (b) text column. Model: https://huggingface.co/j-hartmann/emotion-english-distilroberta-base.",
    allow_flagging=False,
)
# debug=True surfaces handler tracebacks in the console while the app runs.
demo.launch(debug=True)