File size: 3,886 Bytes
a83f80b
4f2c346
 
509d266
 
4f2c346
6671a55
81eed54
8404815
509d266
6671a55
a83f80b
9660558
 
 
 
 
7c020ac
4f2c346
 
6c5db00
 
 
 
 
 
 
 
 
 
 
 
 
7c020ac
 
9660558
509d266
 
fe7c35d
509d266
 
 
 
9660558
 
 
509d266
 
81eed54
e3822e3
509d266
e3822e3
fe7c35d
 
 
 
 
 
 
 
 
 
 
 
 
81eed54
fe7c35d
81eed54
 
fe7c35d
 
81eed54
fe7c35d
 
 
 
 
6c5db00
 
 
 
 
 
 
 
fe7c35d
 
 
6c5db00
fe7c35d
 
509d266
fe7c35d
 
509d266
 
 
 
81eed54
 
 
 
 
509d266
 
81eed54
509d266
 
 
 
6c5db00
4f2c346
e3822e3
fe7c35d
509d266
1acabbc
509d266
837f208
 
 
81eed54
7c020ac
fe7c35d
509d266
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import gradio as gr
import fasttext

from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

import random
import numpy as np
import pandas as pd
import torch



# Two-way mapping between the classifier's output indices and label names.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: index for index, name in id2label.items()}


# Texts handed to the Gradio interface as its title and description.
title = "Movie Review Score Discriminator"
# NOTE: the backslash continuations keep each following line's leading
# spaces inside the string, so the literal is a single line with wide gaps.
description = "It is a program that classifies whether it is positive or negative by entering movie reviews.  \
                You can choose between the Korean version and the English version.  \
                It also provides a version called Any, which determines whether it is Korean or English and predicts it."


class LanguageIdentification:
    """Thin wrapper around a pretrained fastText language-identification model."""

    def __init__(self, model_path="./lid.176.ftz"):
        """Load the fastText model.

        Args:
            model_path: Path of the fastText model file. The default,
                ``./lid.176.ftz``, is relative to the current working
                directory.
        """
        self.model = fasttext.load_model(model_path)

    def predict_lang(self, text):
        """Return the top-2 language predictions for *text*.

        Args:
            text: The string to identify. It may span several lines; line
                breaks are replaced by single spaces before prediction.

        Returns:
            The unmodified result of ``self.model.predict(..., k=2)`` --
            presumably a ``(labels, probabilities)`` pair with labels such
            as ``'__label__ko'`` (see the fastText documentation).
        """
        # fastText predicts one line at a time and raises ValueError when the
        # input contains a newline, so multi-line reviews must be flattened.
        single_line = " ".join(text.splitlines())
        return self.model.predict(single_line, k=2)  # top 2 matching languages

# Shared language detector, created once at import time; constructing it
# loads the fastText model file from disk.
LANGUAGE = LanguageIdentification()



def tokenized_data(tokenizer, inputs, max_length=64):
    """Tokenize one text into a fixed-length batch of PyTorch tensors.

    Args:
        tokenizer: Object exposing ``batch_encode_plus`` (here a Hugging Face
            tokenizer).
        inputs: A single text; it is wrapped in a one-element batch.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 64, the value previously hard-coded here.

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns for the
        one-element batch, requested as PyTorch tensors
        (``return_tensors="pt"``).
    """
    return tokenizer.batch_encode_plus(
        [inputs],
        return_tensors="pt",
        padding="max_length",
        max_length=max_length,
        truncation=True,
    )



# Build the [language, review] rows offered as clickable examples in the UI.
df = pd.read_csv('examples.csv', sep='\t', index_col='Unnamed: 0')
random.seed(100)  # fixed seed: the same rows are drawn on every start
# randint is inclusive at both ends, so positions 0..50 can be drawn --
# assumes examples.csv has at least 51 rows (TODO confirm).
picked_rows = [random.randint(0, 50) for _ in range(15)]
# Each drawn row contributes its column 0 tagged 'Eng', then its column 1
# tagged 'Kor'.
examples = [
    [lang_tag, df.iloc[row, column]]
    for row in picked_rows
    for lang_tag, column in (('Eng', 0), ('Kor', 1))
]


# English classifier: "roberta-base" tokenizer and architecture, with the
# weights taken from a local checkpoint file named "<model name>-<step>.pt".
eng_model_name = "roberta-base"
eng_step = 1900
eng_tokenizer = AutoTokenizer.from_pretrained(eng_model_name)
eng_file_name = "{}-{}.pt".format(eng_model_name, eng_step)
# map_location="cpu" lets a checkpoint that was saved from GPU tensors load
# on a machine without CUDA; the model is never moved off the CPU here.
eng_state_dict = torch.load(eng_file_name, map_location="cpu")
eng_model = AutoModelForSequenceClassification.from_pretrained(
    eng_model_name, num_labels=2, id2label=id2label, label2id=label2id,
    state_dict=eng_state_dict,
)


# Korean classifier: "klue/roberta-small" tokenizer and architecture, with
# the weights taken from a local checkpoint file. The "/" in the model name
# is replaced by "_" to form the file name.
kor_model_name = "klue/roberta-small"
kor_step = 2400
kor_tokenizer = AutoTokenizer.from_pretrained(kor_model_name)
kor_file_name = "{}-{}.pt".format(kor_model_name.replace('/', '_'), kor_step)
# map_location="cpu" lets a checkpoint that was saved from GPU tensors load
# on a machine without CUDA; the model is never moved off the CPU here.
kor_state_dict = torch.load(kor_file_name, map_location="cpu")
kor_model = AutoModelForSequenceClassification.from_pretrained(
    kor_model_name, num_labels=2, id2label=id2label, label2id=label2id,
    state_dict=kor_state_dict,
)


def builder(lang, text):
    """Score a movie review as positive or negative.

    Args:
        lang: ``'Eng'``, ``'Kor'`` or ``'Any'``. With ``'Any'`` the language
            is detected through ``LANGUAGE``: a top prediction of
            ``'__label__ko'`` selects the Korean model, anything else the
            English one.
        text: The review to classify.

    Returns:
        A dict mapping ``"POSITIVE"`` and ``"NEGATIVE"`` to the softmax
        probability of each class.

    Raises:
        ValueError: If ``lang`` is not one of the three supported values
            (for example ``None``).
    """
    if lang == 'Any':
        pred = LANGUAGE.predict_lang(text)
        top_labels = pred[0]
        # Everything that is not detected as Korean -- including the case
        # where the detector returns no label at all -- is treated as English.
        if len(top_labels) > 0 and top_labels[0] == '__label__ko':
            lang = 'Kor'
        else:
            lang = 'Eng'

    if lang == 'Eng':
        model, tokenizer = eng_model, eng_tokenizer
    elif lang == 'Kor':
        model, tokenizer = kor_model, kor_tokenizer
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            "Unsupported language {!r}; expected 'Any', 'Eng' or 'Kor'.".format(lang)
        )

    inputs = tokenized_data(tokenizer, text)

    model.eval()
    with torch.no_grad():
        logits = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        ).logits

    # The batch holds a single review, so row 0 carries its class probabilities.
    probabilities = torch.softmax(logits, dim=1)
    return {
        id2label[1]: probabilities[0][1].item(),
        id2label[0]: probabilities[0][0].item(),
    }



# Gradio UI: a language dropdown and a free-text box feed `builder`, whose
# label -> probability dict is rendered by the Label component.
demo = gr.Interface(
    builder,
    # gr.Dropdown replaces the deprecated gr.inputs.Dropdown, matching the
    # component-style gr.Label used for the output.
    inputs=[gr.Dropdown(['Any', 'Eng', 'Kor']), "text"],
    outputs=gr.Label(num_top_classes=2, label='Result', color='CadetBlue'),
    title=title,
    description=description,
    examples=examples,
)


# demo3 = gr.Interface.load("models/mdj1412/movie_review_score_discriminator_eng", inputs="text", outputs="text", 
#                          title=title, theme="peach",
#                          allow_flagging="auto",
#                          description=description, examples=examples)

# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    # print(examples)
    demo.launch()
    # demo3.launch()