# Source: app.py, uploaded by mdj1412 (commit 4f2c346, 3.89 kB).
import gradio as gr
import fasttext
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import random
import numpy as np
import pandas as pd
import torch
# Class-index <-> label-name mappings shared by both sentiment models.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: index for index, name in id2label.items()}

# Texts shown at the top of the Gradio page.
title = "Movie Review Score Discriminator"
description = (
    "It is a program that classifies whether it is positive or negative by entering movie reviews. "
    "You can choose between the Korean version and the English version. "
    "It also provides a version called Any, which determines whether it is Korean or English and predicts it."
)
class LanguageIdentification:
    """Thin wrapper around a pretrained fastText language-identification model."""

    def __init__(self, pretrained_lang_model="./lid.176.ftz"):
        """Load the fastText model.

        Args:
            pretrained_lang_model: Path to the fastText model file. Defaults
                to ``./lid.176.ftz`` (relative to the working directory),
                the path that was previously hard-coded.
        """
        self.model = fasttext.load_model(pretrained_lang_model)

    def predict_lang(self, text):
        """Return the model's top-2 language predictions for *text*.

        fastText's ``predict`` processes one line at a time and raises
        ``ValueError`` when the input contains a newline, so line breaks are
        replaced with spaces before the text is handed to the model.

        Args:
            text: The string whose language should be identified.

        Returns:
            Whatever ``self.model.predict(..., k=2)`` returns; per the
            fastText documentation this is a ``(labels, probabilities)`` pair.
        """
        single_line = text.replace("\r", " ").replace("\n", " ")
        return self.model.predict(single_line, k=2)  # k=2: top 2 matching languages
# Module-level language identifier; constructing it loads the fastText model
# once at import time so every request reuses the same instance.
LANGUAGE = LanguageIdentification()
def tokenized_data(tokenizer, inputs, max_length=64):
    """Encode a single text as a fixed-length batch of size one.

    Args:
        tokenizer: Object exposing ``batch_encode_plus`` (in this file, a
            tokenizer created with ``AutoTokenizer.from_pretrained``).
        inputs: The text to encode. It is wrapped in a one-element list so
            the tokenizer receives a batch containing just this text.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 64, the value that was previously hard-coded.

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns; PyTorch tensors are
        requested via ``return_tensors="pt"``.
    """
    return tokenizer.batch_encode_plus(
        [inputs],
        return_tensors="pt",
        padding="max_length",
        max_length=max_length,
        truncation=True,
    )
# Example inputs offered in the interface: for each sampled row, one
# ['Eng', <column 0>] entry followed by one ['Kor', <column 1>] entry.
examples = []
df = pd.read_csv('examples.csv', sep='\t', index_col='Unnamed: 0')
random.seed(100)  # fixed seed so the same rows are picked on every start
# Rows are drawn from positions 0..50 as before, but the upper bound is
# clamped to the last available row so a shorter file cannot raise IndexError.
last_idx = min(50, len(df) - 1)
if last_idx >= 0:
    for _ in range(15):
        idx = random.randint(0, last_idx)
        examples.extend([['Eng', df.iloc[idx, 0]], ['Kor', df.iloc[idx, 1]]])
# --- English classifier -----------------------------------------------------
# Architecture and tokenizer come from the "roberta-base" hub model; the
# weights come from the local checkpoint "<model name>-<step>.pt".
eng_model_name = "roberta-base"
eng_step = 1900  # presumably the training step of the saved checkpoint
eng_tokenizer = AutoTokenizer.from_pretrained(eng_model_name)
eng_file_name = "{}-{}.pt".format(eng_model_name, eng_step)
# map_location="cpu": inference in this file never moves tensors to another
# device, and without it a checkpoint saved from a GPU fails to load on a
# CPU-only host.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
eng_state_dict = torch.load(eng_file_name, map_location="cpu")
eng_model = AutoModelForSequenceClassification.from_pretrained(
    eng_model_name, num_labels=2, id2label=id2label, label2id=label2id,
    state_dict=eng_state_dict
)

# --- Korean classifier ------------------------------------------------------
# Same procedure for "klue/roberta-small"; the "/" in the hub name is replaced
# with "_" to form the local checkpoint file name.
kor_model_name = "klue/roberta-small"
kor_step = 2400  # presumably the training step of the saved checkpoint
kor_tokenizer = AutoTokenizer.from_pretrained(kor_model_name)
kor_file_name = "{}-{}.pt".format(kor_model_name.replace('/', '_'), kor_step)
# See the note on the English checkpoint: load onto CPU, trusted files only.
kor_state_dict = torch.load(kor_file_name, map_location="cpu")
kor_model = AutoModelForSequenceClassification.from_pretrained(
    kor_model_name, num_labels=2, id2label=id2label, label2id=label2id,
    state_dict=kor_state_dict
)
def builder(lang, text):
    """Classify a movie review as positive or negative.

    Args:
        lang: ``'Eng'``, ``'Kor'`` or ``'Any'``. With ``'Any'`` the language
            is detected first: a top prediction of ``'__label__ko'`` selects
            the Korean model, anything else falls back to the English model.
        text: The review text to classify.

    Returns:
        A dict mapping ``"POSITIVE"`` and ``"NEGATIVE"`` to their softmax
        probabilities (Python floats).

    Raises:
        ValueError: If ``lang`` is not one of the supported choices.
    """
    if lang == 'Any':
        pred = LANGUAGE.predict_lang(text)
        # Only Korean is singled out; every other detected language is
        # treated as English rather than rejected.
        lang = 'Kor' if pred[0][0] == '__label__ko' else 'Eng'

    if lang == 'Eng':
        model, tokenizer = eng_model, eng_tokenizer
    elif lang == 'Kor':
        model, tokenizer = kor_model, kor_tokenizer
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on `model`.
        raise ValueError(
            "Unsupported language {!r}; expected 'Any', 'Eng' or 'Kor'.".format(lang)
        )

    inputs = tokenized_data(tokenizer, text)

    model.eval()  # make sure the model is in evaluation mode
    with torch.no_grad():
        logits = model(input_ids=inputs['input_ids'],
                       attention_mask=inputs['attention_mask']).logits

    # Softmax over dim 1 turns the logits into class probabilities.
    output = torch.softmax(logits, dim=1)

    return {id2label[1]: output[0][1].item(), id2label[0]: output[0][0].item()}
# Gradio UI: a language dropdown plus a free-text box feed `builder`, whose
# label -> probability dict is rendered by the Label component.
# gr.Dropdown replaces the deprecated gr.inputs.Dropdown so the input uses the
# same component namespace as gr.Label below.
demo = gr.Interface(
    builder,
    inputs=[gr.Dropdown(['Any', 'Eng', 'Kor']), "text"],
    outputs=gr.Label(num_top_classes=2, label='Result', color='CadetBlue'),
    # outputs='label',
    title=title,
    description=description,
    examples=examples,
)
# demo3 = gr.Interface.load("models/mdj1412/movie_review_score_discriminator_eng", inputs="text", outputs="text",
# title=title, theme="peach",
# allow_flagging="auto",
# description=description, examples=examples)
# Launch the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    # print(examples)
    demo.launch()
    # demo3.launch()