File size: 4,273 Bytes
d3830cc
 
 
65aabc9
d3830cc
 
 
 
 
 
 
 
 
 
65aabc9
d3830cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49b7222
d3830cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""
python interactive.py
"""
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TextClassificationPipeline

# Checkpoint that is loaded at start-up.
model_name = 'momo/KcELECTRA-base_Hate_speech_Privacy_Detection'

# Checkpoints offered in the UI drop-down.
model_name_list = [
    'momo/KcELECTRA-base_Hate_speech_Privacy_Detection',
    "momo/KcBERT-base_Hate_speech_Privacy_Detection",
]

# Human-readable class names, in output-index order: ten hate-speech
# categories followed by five privacy categories.
unsmile_labels = [
    "여성/가족", "남성", "성소수자", "인종/국적", "연령", "지역", "종교",
    "기타 혐오", "악플/욕설", "clean",
    'name', 'number', 'address', 'bank', 'person',
]
# Derived from the label list so the two can never drift apart.
num_labels = len(unsmile_labels)

# Use the first GPU when one is present, otherwise fall back to the CPU
# (-1 is the pipeline's CPU device index) instead of crashing on .cuda().
device = 0 if torch.cuda.is_available() else -1

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type="multi_label_classification",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Make the pipeline report readable label names instead of LABEL_<i>.
model.config.id2label = dict(enumerate(unsmile_labels))
model.config.label2id = {label: i for i, label in enumerate(unsmile_labels)}

# The pipeline moves the model to `device` itself, so no explicit .cuda()
# call is needed. Sigmoid is applied per class because the task is
# multi-label, and scores for every class are returned.
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=device,
    return_all_scores=True,
    function_to_apply='sigmoid',
)
    
def dectection(input, text=None):
    """Classify one text and return a score for every label.

    The Gradio interface in this file supplies two values (the drop-down
    selection and the text box), so the function accepts both. It can still
    be called with the text alone.

    Args:
        input: The selected model name when ``text`` is given; otherwise the
            text to classify (single-argument call).
        text: The text to classify, or ``None`` for a single-argument call.

    Returns:
        A dict mapping each label name to its score as a float, which is the
        shape a Gradio ``label`` output renders as a confidence list.
    """
    if text is None:
        # Single-argument call: the only argument is the text itself.
        text = input
    # TODO: the model name in `input` is not used yet; only the checkpoint
    # loaded at start-up is queried.

    # For a single string the pipeline wraps its result in a one-element
    # list; element 0 holds one {'label', 'score'} entry per class.
    scores = pipe(text)[0]
    return {entry['label']: float(entry['score']) for entry in scores}

# Build the Gradio UI: a model drop-down and a free-text box are passed to
# dectection(), and its result is shown by a label output.
app = gr.Interface(
    fn=dectection,
    inputs=[gr.inputs.Dropdown(model_name_list, label="Model Name"), 'text'],
    outputs=['label'],
    title="한국어 혐오표현, 개인정보 판별기 (Korean Hate Speech and Privacy Detection)",
    description="Korean Hate Speech and Privacy Detection.",
)
# share=True additionally requests a public share link from Gradio.
app.launch(share=True)





# # global var
# MODEL_NAME = 'jason9693/SoongsilBERT-base-beep'
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# config = AutoConfig.from_pretrained(MODEL_NAME)

# MODEL_BUF = {
#     "name": MODEL_NAME,
#     "tokenizer": tokenizer,
#     "model": model,
#     "config": config
# }

# def change_model_name(name):
#     MODEL_BUF["name"] = name
#     MODEL_BUF["tokenizer"] = AutoTokenizer.from_pretrained(name)
#     MODEL_BUF["model"] = AutoModelForSequenceClassification.from_pretrained(name)
#     MODEL_BUF["config"] = AutoConfig.from_pretrained(name)


# def predict(model_name, text):
#     if model_name != MODEL_BUF["name"]:
#         change_model_name(model_name)
    
#     tokenizer = MODEL_BUF["tokenizer"]
#     model = MODEL_BUF["model"]
#     config = MODEL_BUF["config"]

#     tokenized_text = tokenizer([text], return_tensors='pt')

#     input_tokens = tokenizer.convert_ids_to_tokens(tokenized_text.input_ids[0])
#     try:
#         input_tokens = util.bytetokens_to_unicdode(input_tokens) if config.model_type in ['roberta', 'gpt', 'gpt2'] else input_tokens
#     except KeyError:
#         input_tokens = input_tokens

#     model.eval()
#     output, attention = model(**tokenized_text, output_attentions=True, return_dict=False)
#     output = F.softmax(output, dim=-1)
#     result = {}
    
#     for idx, label in enumerate(output[0].detach().numpy()):
#         result[config.id2label[idx]] = float(label)

#     fig = visualize_attention(input_tokens, attention[0][0].detach().numpy())
#     return result, fig#.logits.detach()#.numpy()#, output.attentions.detach().numpy()


# if __name__ == '__main__':
#     text = '읿딴걸 홍볿글 읿랉곭 쌑젩낄고 앉앟있냩'

#     model_name_list = [
#         'jason9693/SoongsilBERT-base-beep',
#         "beomi/beep-klue-roberta-base-hate",
#         "beomi/beep-koelectra-base-v3-discriminator-hate",
#         "beomi/beep-KcELECTRA-base-hate"
#     ]

#     #Create a gradio app with a button that calls predict()
#     app = gr.Interface(
#         fn=predict,
#         inputs=[gr.inputs.Dropdown(model_name_list, label="Model Name"), 'text'], outputs=['label', 'plot'], 
#         examples = [[MODEL_BUF["name"], text], [MODEL_BUF["name"], "4=🦀 4≠🦀"]],
#         title="한국어 혐오성 발화 분류기 (Korean Hate Speech Classifier)",
#         description="Korean Hate Speech Classifier with Several Pretrained LM\nCurrent Supported Model:\n1. SoongsilBERT\n2. KcBERT(+KLUE)\n3. KcELECTRA\n4.KoELECTRA."
#         )
#     app.launch(inline=False)