Spaces:

awacke1
/

CSV2ClassifyVisualization

Runtime error

App Files Files Community

CSV2ClassifyVisualization / app.py

awacke1

Update app.py

093c4ba about 2 years ago

raw

history blame contribute delete

No virus

4.71 kB

	import gradio as gr
	import pandas as pd
	import numpy as np
	import spacy
	from spacy import displacy
	from transformers import AutoTokenizer,AutoModelForSequenceClassification, Trainer

	def linkifyHTML():
	    """Return an HTML table whose first column holds clickable hyperlinks.

	    Builds a small two-row DataFrame (columns ``url`` and ``label``) where
	    each ``url`` cell is an ``<a>`` tag pointing at a Stack Overflow
	    question, then serialises it with ``DataFrame.to_html``.

	    Returns:
	        str: the ``<table>`` markup, with the anchor tags left unescaped.
	    """
	    link1 = "https://stackoverflow.com/questions/71641666/hyperlink-in-streamlit-dataframe"
	    link2 = "https://stackoverflow.com/questions/71731937/how-to-plot-comparison-in-streamlit-dynamically-with-multiselect"
	    df = pd.DataFrame(
	        {
	            "url": [
	                f'<a target="_blank" href="{link1}">Hyperlink in Streamlit dataframe</a>',
	                f'<a target="_blank" href="{link2}">How to plot comparison in Streamlit dynamically with multiselect?</a>',
	            ],
	            "label": ["question", "question"],
	        }
	    )
	    # escape=False keeps the <a> tags as live markup instead of &lt;a ...&gt;.
	    # The table is returned as-is: displacy.render only accepts spaCy
	    # Doc/Span/dict inputs and raises ValueError when handed an HTML string.
	    return df.to_html(escape=False, index=False)


	# summary function - test for single gradio function interface
	def bulk_function(filename):
	    """Classify the emotion of every text row in an uploaded CSV/XLSX file.

	    The input table is expected to have the IDs in its first column and the
	    texts in its second column (a leading ``Unnamed: 0`` index column is
	    dropped first). Each text is scored with the
	    ``j-hartmann/emotion-english-distilroberta-base`` model and the results
	    are written next to the input as ``<input stem>_emotion_predictions.csv``.

	    Args:
	        filename: object exposing the path of the uploaded file as ``.name``.

	    Returns:
	        str | None: path of the written results CSV, or ``None`` when the
	        file extension is neither ``.csv`` nor ``.xlsx``.

	    Raises:
	        ValueError: if the table has fewer than two columns.
	    """
	    import os  # function-scope import keeps this block self-contained

	    # Create class for data preparation
	    class SimpleDataset:
	        """Minimal index-able wrapper around the tokenizer output."""

	        def __init__(self, tokenized_texts):
	            self.tokenized_texts = tokenized_texts

	        def __len__(self):
	            return len(self.tokenized_texts["input_ids"])

	        def __getitem__(self, idx):
	            return {k: v[idx] for k, v in self.tokenized_texts.items()}

	    # Output column order; index i is read from column i of the model scores,
	    # exactly as the per-emotion columns were filled before.
	    emotion_names = ("anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise")

	    path = filename.name
	    print(path)

	    # splitext looks only at the final suffix, so names such as
	    # "my.data.csv" or directories containing dots are handled correctly.
	    stem, ext = os.path.splitext(path)
	    ext = ext.lower()

	    # Validate the file type before paying for the model load.
	    if ext == ".csv":
	        df_input = pd.read_csv(path, index_col=False)
	    elif ext == ".xlsx":
	        df_input = pd.read_excel(path, index_col=False)
	    else:
	        return None

	    # Drop a stray index column left behind by a previous to_csv/to_excel.
	    if len(df_input.columns) > 0 and df_input.columns[0] == "Unnamed: 0":
	        df_input = df_input.drop("Unnamed: 0", axis=1)

	    if df_input.shape[1] < 2:
	        raise ValueError(
	            "Expected at least two columns (ID, text); got %d" % df_input.shape[1]
	        )

	    id_col, text_col = df_input.columns[0], df_input.columns[1]
	    # store ids and sentences in ordered lists
	    ids = df_input[id_col].to_list()
	    lines_s = df_input[text_col].to_list()

	    if lines_s:
	        # load tokenizer and model, create trainer
	        model_name = "j-hartmann/emotion-english-distilroberta-base"
	        tokenizer = AutoTokenizer.from_pretrained(model_name)
	        model = AutoModelForSequenceClassification.from_pretrained(model_name)
	        trainer = Trainer(model=model)

	        # Tokenize texts and create prediction data set
	        tokenized_texts = tokenizer(lines_s, truncation=True, padding=True)
	        pred_dataset = SimpleDataset(tokenized_texts)

	        # Run predictions -> predict whole df
	        predictions = trainer.predict(pred_dataset)
	        logits = np.asarray(predictions.predictions, dtype=np.float64)

	        # Softmax, computed once; subtracting the row max avoids overflow in exp.
	        exp_shifted = np.exp(logits - logits.max(axis=-1, keepdims=True))
	        probs = exp_shifted / exp_shifted.sum(axis=-1, keepdims=True)

	        # Transform predictions to labels
	        labels = [model.config.id2label[int(i)] for i in probs.argmax(axis=-1)]
	    else:
	        # No rows: nothing to predict, still emit a header-only results file.
	        probs = np.empty((0, len(emotion_names)), dtype=np.float64)
	        labels = []

	    # round scores
	    probs_rounded = np.round(probs, 3)
	    scores_rounded = probs_rounded.max(axis=1)

	    # define df
	    columns = [id_col, text_col, "max_label", "max_score", *emotion_names]
	    per_emotion = [probs_rounded[:, i] for i in range(len(emotion_names))]
	    rows = list(zip(ids, lines_s, labels, scores_rounded, *per_emotion))
	    df = pd.DataFrame(rows, columns=columns)
	    print(df)

	    # save results to csv
	    YOUR_FILENAME = stem + "_emotion_predictions" + ".csv"  # name your output file
	    df.to_csv(YOUR_FILENAME, index=False)

	    # return path of the results file for space output
	    return YOUR_FILENAME

	# Input widget: a single uploaded file handed to the callback.
	upload_input = gr.inputs.File(
	    file_count="single",
	    type="file",
	    label="Upload file",
	    optional=False,
	)
	# Output widget: the results file whose path the callback returns.
	result_output = gr.outputs.File(label="Output file")

	# Wire the callback to its widgets, then start the app.
	demo = gr.Interface(
	    bulk_function,
	    inputs=[upload_input],
	    outputs=[result_output],
	    theme="huggingface",
	    title="CSV File to Sentence Emotion Classification",
	    description="Upload csv file with 2 columns (in order): (a) ID column, (b) text column. Model: https://huggingface.co/j-hartmann/emotion-english-distilroberta-base.",
	    allow_flagging=False,
	)
	demo.launch(debug=True)