|
import gradio as gr |
|
import fasttext |
|
|
|
from transformers import AutoModelForSequenceClassification |
|
from transformers import AutoTokenizer |
|
|
|
import numpy as np |
|
import pandas as pd |
|
import torch |
|
|
|
|
|
# Class-index <-> label-name mappings handed to both sequence classifiers.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
# Derived from id2label so the two mappings cannot drift apart.
label2id = {name: index for index, name in id2label.items()}

# Texts shown at the top of the Gradio interface.
title = "Movie Review Score Discriminator"
# Built from adjacent literals inside parentheses instead of backslash
# continuations inside one literal, so the text no longer depends on how the
# continuation lines are indented. The last piece is single-quoted so that the
# quotation marks around "Default" are actually part of the text (the former
# ""Default"" spelling was implicit string concatenation and dropped them).
description = (
    "It is a program that classifies whether it is positive or negative by entering movie reviews. "
    "You can choose between the Korean version and the English version. "
    'It also provides a version called "Default", which determines whether it is Korean or English and predicts it.'
)
|
|
|
|
|
class LanguageIdentification:
    """Thin wrapper around a fastText language-identification model."""

    def __init__(self, model_path="./lid.176.ftz"):
        """Load the fastText model stored at *model_path*.

        Args:
            model_path: Path of the fastText model file. Defaults to the
                ``lid.176.ftz`` file next to the working directory.
        """
        self.model = fasttext.load_model(model_path)

    def predict_lang(self, text, k=200):
        """Return fastText's prediction for *text*.

        Args:
            text: The text whose language should be identified.
            k: Maximum number of labels to request from the model. The default
                of 200 is presumably meant to exceed the number of languages
                the model knows, so every label comes back - verify against
                the model file in use.

        Returns:
            Whatever ``self.model.predict`` returns; callers in this file index
            it as ``(labels, probabilities)``.
        """
        # fastText's predict() raises ValueError when the input contains a
        # newline ("predict processes one line at a time"), so multi-line
        # reviews are flattened to a single line first.
        single_line = text.replace("\n", " ")
        return self.model.predict(single_line, k=k)
|
|
|
# Shared language identifier used by builder(); constructing it loads the
# fastText model from disk once, at import time.
LANGUAGE = LanguageIdentification()
|
|
|
|
|
|
|
def tokenized_data(tokenizer, inputs, max_length=64):
    """Encode a single text as a fixed-length, one-element PyTorch batch.

    Args:
        tokenizer: Object exposing ``batch_encode_plus`` (the tokenizers used
            in this file come from ``AutoTokenizer.from_pretrained``).
        inputs: The text to encode; it is wrapped in a one-element list.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 64, the value that used to be hard-coded.

    Returns:
        The value returned by ``tokenizer.batch_encode_plus``; callers in this
        file read its ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    return tokenizer.batch_encode_plus(
        [inputs],
        return_tensors="pt",
        padding="max_length",
        max_length=max_length,
        truncation=True,
    )
|
|
|
|
|
|
|
# Example rows offered in the UI: five reviews drawn with a fixed seed, each
# emitted once with the 'Eng' selector (first data column) and once with the
# 'Kor' selector (second data column).
df = pd.read_csv("examples.csv", sep="\t", index_col="Unnamed: 0")

# Fixed seed so the same five rows are picked on every start.
np.random.seed(100)
idx = np.random.choice(50, size=5, replace=False)

eng_examples = [["Eng", df.iloc[row, 0]] for row in idx]
kor_examples = [["Kor", df.iloc[row, 1]] for row in idx]
examples = [*eng_examples, *kor_examples]
|
|
|
|
|
|
|
def _load_finetuned(model_name, checkpoint_file):
    """Load the tokenizer, checkpoint weights and classifier for one language.

    Args:
        model_name: Hugging Face model identifier used for both the tokenizer
            and the classifier architecture.
        checkpoint_file: Path of the ``torch.save``-d state dict to load.

    Returns:
        A ``(tokenizer, state_dict, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # map_location="cpu" lets a checkpoint that was saved from a GPU load on a
    # CPU-only host; without it torch.load tries to restore the tensors onto
    # the device they were saved from.
    state_dict = torch.load(checkpoint_file, map_location="cpu")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        id2label=id2label,
        label2id=label2id,
        state_dict=state_dict,
    )
    return tokenizer, state_dict, model


# English classifier: checkpoint file is "<model name>-<step>.pt".
eng_model_name = "roberta-base"
eng_step = 1900
eng_file_name = "{}-{}.pt".format(eng_model_name, eng_step)
eng_tokenizer, eng_state_dict, eng_model = _load_finetuned(eng_model_name, eng_file_name)

# Korean classifier: the '/' of the hub identifier is replaced by '_' in the
# checkpoint file name.
kor_model_name = "klue/roberta-small"
kor_step = 2400
kor_file_name = "{}-{}.pt".format(kor_model_name.replace('/', '_'), kor_step)
kor_tokenizer, kor_state_dict, kor_model = _load_finetuned(kor_model_name, kor_file_name)
|
|
|
|
|
def _sentiment_tag(positive_prob):
    """Map a POSITIVE-class probability to a highlight tag (None if neutral)."""
    if positive_prob > 0.99:
        return '+++'
    if positive_prob > 0.9:
        return '++'
    if positive_prob > 0.8:
        return '+'
    if positive_prob < 0.01:
        return '---'
    if positive_prob < 0.1:
        return '--'
    if positive_prob < 0.2:
        return '-'
    return None


def _class_probabilities(model, tokenizer, text):
    """Run *model* on *text* and return the softmax over its logits (1-D)."""
    encoded = tokenized_data(tokenizer, text)
    with torch.no_grad():
        logits = model(input_ids=encoded['input_ids'],
                       attention_mask=encoded['attention_mask']).logits
    return torch.softmax(logits, dim=1)[0]


def builder(Lang, Text):
    """Classify a movie review and explain the result word by word.

    Args:
        Lang: 'Eng', 'Kor' or 'Default'. 'Default' (or None, i.e. nothing
            selected) lets the fastText model choose between English and
            Korean.
        Text: The review text.

    Returns:
        A three-element list:
        ``{'Kor': share, 'Eng': share}`` - language shares (1 for an
        explicitly selected language),
        ``{'POSITIVE': p, 'NEGATIVE': p}`` - class probabilities for the whole
        review, and
        a list of ``(word, tag)`` tuples where tag is one of
        '+++', '++', '+', '---', '--', '-' or None.

    Raises:
        ValueError: If *Lang* is not one of the supported values.
    """
    Text = Text or ''
    percent_kor, percent_eng = 0, 0

    if Lang is None or Lang == 'Default':
        # fastText rejects input containing '\n', so flatten the text first.
        pred = LANGUAGE.predict_lang(Text.replace('\n', ' '))
        labels = list(pred[0])
        # A label may be absent from the prediction; treat that as probability
        # 0 instead of leaving the variable unbound.
        p_eng = float(pred[1][labels.index('__label__en')]) if '__label__en' in labels else 0.0
        p_kor = float(pred[1][labels.index('__label__ko')]) if '__label__ko' in labels else 0.0

        total = p_kor + p_eng
        if total > 0:
            percent_kor = p_kor / total
            percent_eng = p_eng / total

        # Both labels are normally present in the prediction, so membership
        # alone cannot decide the language (the previous code always ended up
        # with 'Kor'); pick whichever of the two is more probable.
        Lang = 'Kor' if p_kor > p_eng else 'Eng'

    if Lang == 'Eng':
        model = eng_model
        tokenizer = eng_tokenizer
        if percent_eng == 0:
            percent_eng = 1
    elif Lang == 'Kor':
        model = kor_model
        tokenizer = kor_tokenizer
        if percent_kor == 0:
            percent_kor = 1
    else:
        raise ValueError(
            "Unsupported Lang {!r}; expected 'Default', 'Eng' or 'Kor'".format(Lang))

    model.eval()

    # Sentiment of the review as a whole.
    output = _class_probabilities(model, tokenizer, Text)

    # Sentiment of every word on its own, used to highlight influential words.
    # split() without an argument skips the empty strings that split(' ')
    # produced for repeated spaces.
    output_analysis = []
    for word in Text.split():
        positive_prob = _class_probabilities(model, tokenizer, word)[1].item()
        output_analysis.append((word, _sentiment_tag(positive_prob)))

    return [{'Kor': percent_kor, 'Eng': percent_eng},
            {id2label[1]: output[1].item(), id2label[0]: output[0].item()},
            output_analysis]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Simple gr.Interface front end around builder(): a language selector and a
# review text box in, language shares / sentiment / per-word analysis out.
# The placeholder text was mis-encoded and split over two lines (an
# unterminated string literal); it is restored to the Korean for
# "Enter a review.". gr.Dropdown replaces the deprecated gr.inputs.Dropdown,
# matching the gr.Textbox / gr.Label components already used here.
demo = gr.Interface(
    builder,
    inputs=[
        gr.Dropdown(['Default', 'Eng', 'Kor']),
        gr.Textbox(placeholder="리뷰를 입력하시오."),
    ],
    outputs=[
        gr.Label(num_top_classes=3, label='Lang'),
        gr.Label(num_top_classes=2, label='Result'),
        # Reddish colours mark positive words, bluish colours negative ones.
        gr.HighlightedText(label="Analysis", combine_adjacent=False)
        .style(color_map={"+++": "#CF0000", "++": "#FF3232", "+": "#FFD4D4", "---": "#0004FE", "--": "#4C47FF", "-": "#BEBDFF"}),
    ],
    title=title,
    description=description,
    examples=examples,
)
|
|
|
def fn2(a, b):
    """Return three ``None`` values, ignoring both arguments."""
    blanks = [None] * 3
    return blanks
|
|
|
with gr.Blocks() as demo1: |
|
gr.Markdown( |
|
""" |
|
<h1 align="center"> |
|
Movie Review Score Discriminator |
|
</h1> |
|
""") |
|
|
|
gr.Markdown( |
|
""" |
|
ํด๋น ์ฌ์ดํธ์์๋ ์ํ ๋ฆฌ๋ทฐ๋ฅผ ์
๋ ฅํ์ ๋, ๊ธ์ ์ ์ธ ๋ฆฌ๋ทฐ์ธ์ง ๋ถ์ ์ ์ธ ๋ฆฌ๋ทฐ์ธ์ง ํ๋ณ ํด์ค๋ค. |
|
""") |
|
|
|
|
|
with gr.Row(): |
|
gr.Markdown( |
|
""" |
|
### ์
๋ ฅ(Lang, Text)์ ๋ํ ์ค๋ช
|
|
๊ฐ์ด๋ฐ๋ฅผ ๊ธฐ์ค์ผ๋ก ์ผ์ชฝ์ ์๋ ์นธ๋ค์ ์
๋ ฅ์ ํด๋นํ๋ค. |
|
* โLangโ์ ์์ ์ด ์
๋ ฅํ ์ธ์ด๋ฅผ ์ ํํ๋ ๊ฒ์ด๋ค. ๊ธฐ์
์ด ์์ ๊ฒฝ์ฐ(Default)์๋ ํ๊ตญ์ด์ธ์ง ์์ด์ธ์ง ์ค๋ฅธ์ชฝ ๋ฐ์ค โLangโ์์ ํ๋จ ํด์ค๋ค. |
|
* โTextโ๋ ์ํ ๋ฆฌ๋ทฐ๋ฅผ ์
๋ ฅํ๋ ๊ณณ์ด๋ค. ์
๋ ฅ์ ๋ง์น๊ณ ์ ์ถํ๊ธฐ ๋ฒํผ์ ๋๋ฅด๋ฉด ๊ฒฐ๊ณผ๋ฅผ ๋ถ์ํ ์ ์๋ค. |
|
""") |
|
gr.Markdown( |
|
""" |
|
### ์ถ๋ ฅ(Lang, Result, Analysis)์ ๋ํ ์ค๋ช
|
|
๊ฐ์ด๋ฐ๋ฅผ ๊ธฐ์ค์ผ๋ก ์ค๋ฅธ์ชฝ์ ์๋ ์นธ๋ค์ ์ถ๋ ฅ์ ํด๋นํ๋ค. |
|
* โLangโ์ ์ผ์ชฝ ์
๋ ฅ์์ ์ธ์ด๋ฅผ ์ ํํ ๋, ํ๊ตญ์ด๋ฅผ ์ ํํ์ผ๋ฉด 100% Kor, ์์ด๋ฅผ ์ ํํ๋ค๋ฉด 100%% Eng ์ด๋ค. |
|
๋ง์ฝ Default๋ฅผ ์ ํํ๋ค๋ฉด ํ๊ตญ์ด์ธ์ง ์์ด์ธ์ง ํ๋จ ํด์ค๋ค. |
|
์ด ๋ฐฉ๋ฒ์ [ํด๋น ์ฌ์ดํธ](https://medium.com/@c.chaitanya/language-identification-in-python-using-fasttext-60359dc30ed0)์ ์๋ ํจํค์ง๋ฅผ ์ฌ์ฉํ๋ค. |
|
* โResultโ๋ ๋ ๋ชจ๋ธ(Kor, Eng)์ ์ด์ฉํด์ ๊ธ์ ๋ฆฌ๋ทฐ ๋๋ ๋ถ์ ๋ฆฌ๋ทฐ๋ฅผ ํ๋จํ๋ค. (๋ชจ๋ธ์ ๋ํด ๋ ์๊ณ ์ถ์ผ๋ฉด ๋ฐ์์ ํ์ธํ๊ธธ ๋ฐ๋๋ค.) |
|
* โAnalysisโ๋ ์
๋ ฅํ ๋ฆฌ๋ทฐ์ ๊ธ์ ๋๋ ๋ถ์ ์ ๊ฒฐ์ ๋ ๋, ์ด๋ค ๋จ์ด์ ์ํด์ ๊ฒฐ์ ๋์๋์ง ๊ฐ์กฐ ํด์ค๋ค. |
|
์ฆ, ์ํฅ์ ์ค ๋จ์ด์ ๋ํด์๋ ์ผ๋ง๋ ๊ธ์ ์ ์ธ(๋๋ ๋ถ์ ์ ์ธ) ์ํฅ์ ์ฃผ์๋์ง ๊ฐ์กฐ๋์ด ์๋ค. |
|
( ๊ธ์ ์ธ ๊ฒฝ์ฐ ๋ถ์์(+++, ++, +), ๋ถ์ ์ธ ๊ฒฝ์ฐ ํ๋์(---,--,-) ) |
|
""") |
|
|
|
with gr.Accordion("๋ชจ๋ธ์ ๋ํ ์ค๋ช
"): |
|
gr.Markdown( |
|
""" |
|
' | Kor | Eng |
|
:---:|:---:|:---: |
|
Model Name | klue/roberta-base | bert-base-uncased |
|
Learning Rate | 3e-05 | 5e-5 |
|
Batch Size Train | 64 | 64 |
|
Steps | 2800 | 2000 |
|
Batch Size Test | 4 | 4 |
|
Validation Accuracy | 93.55% | 95.81% |
|
Test Accuracy | 94.0% | 92.8% |
|
""") |
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
inputs_1 = gr.inputs.Dropdown(['Default', 'Eng', 'Kor'], label='Lang') |
|
inputs_2 = gr.Textbox(placeholder="๋ฆฌ๋ทฐ๋ฅผ ์
๋ ฅํ์์ค.", label='Text') |
|
with gr.Row(): |
|
btn2 = gr.Button("ํด๋ฆฌ์ด") |
|
btn = gr.Button("์ ์ถํ๊ธฐ") |
|
with gr.Column(): |
|
output_1 = gr.Label(num_top_classes=3, label='Lang') |
|
output_2 = gr.Label(num_top_classes=2, label='Result') |
|
output_3 = gr.HighlightedText(label="Analysis", combine_adjacent=False) \ |
|
.style(color_map={"+++": "#CF0000", "++": "#FF3232", "+": "#FFD4D4", "---": "#0004FE", "--": "#4C47FF", "-": "#BEBDFF"}) |
|
|
|
|
|
btn.click(fn=builder, inputs=[inputs_1, inputs_2], outputs=[output_1, output_2, output_3]) |
|
gr.Examples(examples, inputs=[inputs_1, inputs_2]) |
|
|
|
|
|
|
|
# Start the Blocks UI (demo1) only when this file is run as a script.
if __name__ == "__main__":
    demo1.launch()