# Requires: konlpy, scikit-learn, gradio, pandas, torch, transformers
# (install via pip if missing, e.g. `pip install torch transformers konlpy`).
import pandas as pd
import torch
import gradio as gr

from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoModel, BertTokenizerFast
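
# Pipeline overview (a KeyBERT-style flow, as implemented below):
#   1. make_candidate(): extract nouns with Okt and build 2-3 word n-gram candidate phrases.
#   2. load_model(): load the Korean BERT encoder (kykim/bert-kor-base tokenizer, local ./bertmodel/ weights).
#   3. inference(): embed the document and every candidate, rank candidates by cosine similarity
#      to the document embedding, and return the top 5 as '#'-prefixed hashtags.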

# build candidate keyphrases: noun-only text -> 2-3 word n-grams
def make_candidate(prompt):
    okt = Okt()
    tokenized_doc = okt.pos(prompt)
    tokenized_nouns = ' '.join([word[0] for word in tokenized_doc if word[1] == 'Noun'])

    n_gram_range = (2, 3)

    count = CountVectorizer(ngram_range=n_gram_range).fit([tokenized_nouns])
    candidates = count.get_feature_names_out()

    return candidates
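# For example (the exact noun sequence depends on Okt's tagging), a noun-only string such as
# "국내 클래식 공연 영상" fed to CountVectorizer with ngram_range=(2, 3) yields candidates
# like "국내 클래식", "클래식 공연", "클래식 공연 영상".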


# load the saved Korean BERT encoder from ./bertmodel/ and the matching kykim/bert-kor-base tokenizer
def load_model():

    pretrained_model_name = "kykim/bert-kor-base"

    tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name)
    model = AutoModel.from_pretrained("./bertmodel/")

    return model, tokenizer


# main: embed the prompt and its candidate phrases, return the best matches as hashtags
def inference(prompt):

    candidates = make_candidate(prompt)

    model, tokenizer = load_model()
    
    # embed the full prompt once (BERT pooler output)
    input_ids = tokenizer.encode(prompt)
    input_ids = torch.tensor(input_ids).unsqueeze(0)

    with torch.no_grad():
        doc_embedding = model(input_ids)["pooler_output"]

    top_n = 5

    words = []
    distances = []

    # embed each candidate and measure its cosine similarity to the document embedding
    for word in candidates:
        input_ids = tokenizer.encode(word)
        input_ids = torch.tensor(input_ids).unsqueeze(0)
        with torch.no_grad():
            word_embedding = model(input_ids)["pooler_output"]

        distance = torch.cosine_similarity(doc_embedding, word_embedding, dim=1).item()

        words.append(word)
        distances.append(distance)

    cos_df = pd.DataFrame({'word':words, 'distance':distances})

    # sort by similarity and keep the top_n best-matching candidates
    cos_df = cos_df.sort_values(by='distance', ascending=False)
    cos_df = cos_df[:top_n]

    # keep only the nouns of each selected phrase and format it as a hashtag
    okt = Okt()
    outputs = []
    for word in cos_df["word"].values:
        tokenized_doc = okt.pos(word)
        tokenized_nouns = ' '.join([token[0] for token in tokenized_doc if token[1] == 'Noun'])
        outputs.append("#" + tokenized_nouns)

    outputs = " ".join(outputs)

    return outputs
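
# Optional local check (a sketch; assumes the ./bertmodel/ weights are present and konlpy's
# Java backend is configured). Uncomment to print hashtags without starting the web UI:
# print(inference("지난해 국내 클래식계 최고 스타로 떠오른 피아니스트 임윤찬이 미국 밴 클라이번 국제콩쿠르 결선에서 연주한 라흐마니노프 피아노 협주곡 제3번 영상이 유튜브에서 조회수 1000만회를 넘겼다."))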


demo = gr.Interface(
    fn=inference,
    inputs="text",
    outputs="text",  # hashtag string returned by inference()
    examples=[
        # Korean news example: pianist Yunchan Lim's Van Cliburn finals performance of
        # Rachmaninoff's Piano Concerto No. 3 passing 10 million views on YouTube.
        "지난해 국내 클래식계 최고 스타로 떠오른 피아니스트 임윤찬이 미국 밴 클라이번 국제콩쿠르 결선에서 연주한 라흐마니노프 피아노 협주곡 제3번 영상이 유튜브에서 조회수 1000만회를 넘겼다. 라흐마니노프 3번 연주 영상 중 단연 최고 조회수다."
    ]
    )  # pass share=True to launch() to create an externally accessible link

demo.launch()