# mdj1412's picture
# Update app.py
# c239d0a
import gradio as gr
import fasttext
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
import pandas as pd
import torch
# Mapping between classifier output indices and human-readable sentiment labels.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

# Heading shown at the top of the Gradio interface.
title = "Movie Review Score Discriminator"

# Introductory text shown under the title.
# The quotes around Default are escaped on purpose: writing ""Default"" inside a
# double-quoted literal is parsed as three adjacent literals, which silently
# drops the quotation marks from the displayed text.
description = (
    "It is a program that classifies whether it is positive or negative by entering movie reviews. "
    "You can choose between the Korean version and the English version. "
    "It also provides a version called \"Default\", which determines whether it is Korean or English and predicts it."
)
class LanguageIdentification:
    """Thin wrapper around a fastText language-identification model."""

    def __init__(self):
        """Load the fastText model from ``./lid.176.ftz`` (relative to the working directory)."""
        pretrained_lang_model = "./lid.176.ftz"
        self.model = fasttext.load_model(pretrained_lang_model)

    def predict_lang(self, text):
        """Return fastText's prediction for *text*.

        Args:
            text: The string whose language should be identified.

        Returns:
            Whatever ``self.model.predict`` returns for a single string; the
            caller indexes it as ``pred[0]`` (labels such as ``'__label__en'``)
            and ``pred[1]`` (the matching probabilities).
        """
        # fastText's predict() processes one line at a time and raises
        # ValueError when the text contains a newline, so fold a multi-line
        # review into a single line before predicting.
        single_line = " ".join(text.splitlines())
        # k=200 asks for up to 200 candidate labels so that both the English
        # and the Korean label can be looked up in the result.
        predictions = self.model.predict(single_line, k=200)
        return predictions
# Shared language detector, created once at import time (this loads the
# fastText model file); `builder` calls LANGUAGE.predict_lang for 'Default' mode.
LANGUAGE = LanguageIdentification()
def tokenized_data(tokenizer, inputs):
    """Encode one text as a single-item batch of PyTorch tensors.

    Args:
        tokenizer: Object exposing ``batch_encode_plus`` (the tokenizers
            created with ``AutoTokenizer.from_pretrained`` in this file).
        inputs: The text to encode.

    Returns:
        The value returned by ``tokenizer.batch_encode_plus``; callers read
        its ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    # Every text is padded or truncated to exactly 64 tokens.
    encode_options = {
        "return_tensors": "pt",
        "padding": "max_length",
        "max_length": 64,
        "truncation": True,
    }
    single_item_batch = [inputs]
    return tokenizer.batch_encode_plus(single_item_batch, **encode_options)
# Example rows offered under the demo. Five row positions are drawn (with a
# fixed seed, so the same rows appear on every start) from the first 50 rows
# of examples.csv; column 0 is paired with the 'Eng' option and column 1 with
# the 'Kor' option.
df = pd.read_csv("examples.csv", sep="\t", index_col="Unnamed: 0")

np.random.seed(100)
idx = np.random.choice(50, size=5, replace=False)


def _tagged_examples(tag, column):
    """Return ``[tag, cell]`` pairs for the sampled rows of one CSV column."""
    return [[tag, df.iloc[row, column]] for row in idx]


eng_examples = _tagged_examples("Eng", 0)
kor_examples = _tagged_examples("Kor", 1)
examples = [*eng_examples, *kor_examples]
# English sentiment classifier: the "roberta-base" architecture and tokenizer,
# with classifier weights taken from a local checkpoint file named
# "<model name>-<step>.pt" (presumably the training step of the checkpoint).
eng_model_name = "roberta-base"
eng_step = 1900
eng_tokenizer = AutoTokenizer.from_pretrained(eng_model_name)
eng_file_name = "{}-{}.pt".format(eng_model_name, eng_step)
# map_location="cpu" lets a checkpoint that was saved from a GPU be loaded on
# a CPU-only host; inference in this file runs on CPU tensors anyway.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
eng_state_dict = torch.load(eng_file_name, map_location="cpu")
eng_model = AutoModelForSequenceClassification.from_pretrained(
    eng_model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    state_dict=eng_state_dict,
)
# Korean sentiment classifier: the "klue/roberta-small" architecture and
# tokenizer, with classifier weights taken from a local checkpoint file. The
# '/' in the model name is replaced by '_' so it can be used as a file name.
kor_model_name = "klue/roberta-small"
kor_step = 2400
kor_tokenizer = AutoTokenizer.from_pretrained(kor_model_name)
kor_file_name = "{}-{}.pt".format(kor_model_name.replace('/', '_'), kor_step)
# map_location="cpu" lets a checkpoint that was saved from a GPU be loaded on
# a CPU-only host; inference in this file runs on CPU tensors anyway.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
kor_state_dict = torch.load(kor_file_name, map_location="cpu")
kor_model = AutoModelForSequenceClassification.from_pretrained(
    kor_model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    state_dict=kor_state_dict,
)
# Word-level highlight rules, checked in this order; the first match wins.
# A word is tagged positive when its POSITIVE probability is above the
# threshold, negative when it is below the threshold.
_POSITIVE_TAGS = ((0.99, '+++'), (0.9, '++'), (0.8, '+'))
_NEGATIVE_TAGS = ((0.01, '---'), (0.1, '--'), (0.2, '-'))


def _detect_language(text):
    """Choose 'Eng' or 'Kor' for *text*; return ``(lang, percent_kor, percent_eng)``."""
    pred = LANGUAGE.predict_lang(text)
    scores = dict(zip(pred[0], pred[1]))
    p_eng = float(scores.get('__label__en', 0.0))
    p_kor = float(scores.get('__label__ko', 0.0))
    total = p_kor + p_eng
    if total <= 0:
        # Neither language was scored (e.g. empty text): fall back to English
        # and let the caller fill in the percentage.
        return 'Eng', 0, 0
    # Compare the two probabilities instead of testing mere presence: both
    # labels are normally present in a top-200 prediction, so a presence test
    # would always end up selecting Korean.
    lang = 'Kor' if p_kor > p_eng else 'Eng'
    return lang, p_kor / total, p_eng / total


def _class_probabilities(model, tokenizer, text):
    """Return the softmax class probabilities of *text* (index 1 = POSITIVE)."""
    encoded = tokenized_data(tokenizer, text)
    with torch.no_grad():
        logits = model(input_ids=encoded['input_ids'],
                       attention_mask=encoded['attention_mask']).logits
    return torch.softmax(logits, dim=1)[0]


def _sentiment_tag(positive_prob):
    """Map a POSITIVE probability to a highlight tag, or None when neutral."""
    for threshold, tag in _POSITIVE_TAGS:
        if positive_prob > threshold:
            return tag
    for threshold, tag in _NEGATIVE_TAGS:
        if positive_prob < threshold:
            return tag
    return None


def builder(Lang, Text):
    """Classify a movie review and produce the three outputs of the demo.

    Args:
        Lang: 'Eng', 'Kor' or 'Default'. With 'Default' (or None, i.e. nothing
            selected in the dropdown) the language is detected from the text.
        Text: The review to classify.

    Returns:
        A three-item list:
        1. ``{'Kor': p, 'Eng': p}`` - language shares (normalised detector
           probabilities in auto-detect mode, otherwise 1 for the chosen
           language and 0 for the other);
        2. ``{'POSITIVE': p, 'NEGATIVE': p}`` - sentiment of the whole review;
        3. a list of ``(word, tag)`` pairs where tag is one of
           '+++', '++', '+', '---', '--', '-' or None.

    Raises:
        ValueError: If *Lang* is not one of the supported options.
    """
    Text = Text or ''
    percent_kor, percent_eng = 0, 0

    # [ output_1 ] language
    if Lang is None or Lang == 'Default':
        Lang, percent_kor, percent_eng = _detect_language(Text)

    if Lang == 'Eng':
        model, tokenizer = eng_model, eng_tokenizer
        if percent_eng == 0:
            percent_eng = 1
    elif Lang == 'Kor':
        model, tokenizer = kor_model, kor_tokenizer
        if percent_kor == 0:
            percent_kor = 1
    else:
        raise ValueError("Unsupported language option: {!r}".format(Lang))

    model.eval()

    # [ output_2 ] sentiment of the whole review
    review_probs = _class_probabilities(model, tokenizer, Text)

    # [ output_3 ] per-word sentiment; empty tokens (from consecutive spaces or
    # an empty review) are skipped so they do not each cost a forward pass.
    words = [word for word in Text.split(' ') if word]
    output_analysis = [
        (word, _sentiment_tag(_class_probabilities(model, tokenizer, word)[1].item()))
        for word in words
    ]

    return [{'Kor': percent_kor, 'Eng': percent_eng},
            {id2label[1]: review_probs[1].item(), id2label[0]: review_probs[0].item()},
            output_analysis]
# Alternative kept for reference: load the hosted model directly instead of
# running `builder` locally.
# demo3 = gr.Interface.load("models/mdj1412/movie_review_score_discriminator_eng", inputs="text", outputs="text",
#                           title=title, theme="peach",
#                           allow_flagging="auto",
#                           description=description, examples=examples)

# Single-page interface around `builder`: a language dropdown and a review
# text box in, and language share / sentiment / per-word highlights out.
demo = gr.Interface(
    builder,
    inputs=[
        gr.inputs.Dropdown(['Default', 'Eng', 'Kor']),
        # Placeholder is Korean for "Please enter a review." (the literal was
        # previously mis-encoded and showed up as garbled characters).
        gr.Textbox(placeholder="리뷰를 입력하시오."),
    ],
    outputs=[
        gr.Label(num_top_classes=3, label='Lang'),
        gr.Label(num_top_classes=2, label='Result'),
        # Reds mark positive words, blues mark negative words; stronger tags
        # get more saturated colours.
        gr.HighlightedText(label="Analysis", combine_adjacent=False)
        .style(color_map={"+++": "#CF0000", "++": "#FF3232", "+": "#FFD4D4", "---": "#0004FE", "--": "#4C47FF", "-": "#BEBDFF"}),
    ],
    title=title,
    description=description,
    examples=examples,
)
# Blocks-based layout of the same app: inputs and buttons in the left column,
# the three outputs in the right column.
# The multi-line Markdown literals are kept flush-left so the rendered text
# carries no leading indentation.
with gr.Blocks() as demo1:
    gr.Markdown(
        """
<h1 align="center">
Movie Review Score Discriminator
</h1>
""")

    with gr.Accordion("Open for More!"):
        # First line is a Korean author note: "Content has not been updated
        # yet (layout reference only)". The note about the clear button not
        # working was dropped because the button is wired up below.
        gr.Markdown(
            """
내용은 아직 바꾸지 않았음 (형식만 참고)
It is a program that classifies whether it is positive or negative by entering movie reviews. \
You can choose between the Korean version and the English version. \
It also provides a version called "Default", which determines whether it is Korean or English and predicts it.
""")

    with gr.Row():
        with gr.Column():
            inputs_1 = gr.inputs.Dropdown(['Default', 'Eng', 'Kor'])
            # Korean for "Please enter a review."
            inputs_2 = gr.Textbox(placeholder="리뷰를 입력하시오.")
            with gr.Row():
                btn2 = gr.Button("클리어")  # "Clear"
                btn = gr.Button("제출하기")  # "Submit"
        with gr.Column():
            output_1 = gr.Label(num_top_classes=3, label='Lang')
            output_2 = gr.Label(num_top_classes=2, label='Result')
            output_3 = gr.HighlightedText(label="Analysis", combine_adjacent=False) \
                .style(color_map={"+++": "#CF0000", "++": "#FF3232", "+": "#FFD4D4", "---": "#0004FE", "--": "#4C47FF", "-": "#BEBDFF"})

    # The clear button previously had no handler and did nothing; reset both
    # inputs and all three outputs to their empty state.
    btn2.click(fn=lambda: (None, "", None, None, None), inputs=None,
               outputs=[inputs_1, inputs_2, output_1, output_2, output_3])
    btn.click(fn=builder, inputs=[inputs_1, inputs_2], outputs=[output_1, output_2, output_3])
    gr.Examples(examples, inputs=[inputs_1, inputs_2])
if __name__ == "__main__":
    # print(examples)
    # Launch the Interface version first, then the Blocks version.
    # NOTE(review): launch() normally blocks when run as a script, so demo1
    # presumably only starts once demo has been shut down - confirm intent.
    for app in (demo, demo1):
        app.launch()