Unggi committed on
Commit
eb1ba05
•
1 Parent(s): 44a51ab

first commit

Files changed (1)
  1. app.py +98 -0
app.py ADDED
@@ -0,0 +1,98 @@
+ import numpy as np
+ import itertools
+ import pandas as pd
+
+ from konlpy.tag import Okt
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+ import gradio as gr
+
+ # install torch and transformers at runtime via pip
+ import pip
+ pip.main(['install', 'torch'])
+ pip.main(['install', 'transformers'])
+
+ import torch
+ import transformers
+
+ from transformers import BertTokenizerFast
+ from transformers import AutoModel
+
+ def make_candidate(prompt):
+     # extract nouns from the prompt and build 2- and 3-gram keyword candidates
+     okt = Okt()
+     tokenized_doc = okt.pos(prompt)
+     tokenized_nouns = ' '.join([word[0] for word in tokenized_doc if word[1] == 'Noun'])
+
+     n_gram_range = (2, 3)
+
+     count = CountVectorizer(ngram_range=n_gram_range).fit([tokenized_nouns])
+     candidates = count.get_feature_names_out()
+
+     return candidates
+
+
+ # load the saved pretrained model and tokenizer
+ def load_model():
+     pretrained_model_name = "kykim/bert-kor-base"
+
+     tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name)
+     model = AutoModel.from_pretrained(pretrained_model_name)
+
+     return model, tokenizer
+
+
+ # main
+ def inference(prompt):
+     candidates = make_candidate(prompt)
+
+     model, tokenizer = load_model()
+
+     input_ids = tokenizer.encode(prompt)
+     input_ids = torch.tensor(input_ids).unsqueeze(0)
+
+     doc_embedding = model(input_ids)["pooler_output"]
+
+     top_n = 5
+
+     words = []
+     distances = []
+
+     # score each candidate by cosine similarity to the document embedding
+     for word in candidates:
+         input_ids = tokenizer.encode(word)
+         input_ids = torch.tensor(input_ids).unsqueeze(0)
+         word_embedding = model(input_ids)["pooler_output"]
+
+         distance = torch.cosine_similarity(doc_embedding, word_embedding, dim=1).item()
+
+         words.append(word)
+         distances.append(distance)
+
+     cos_df = pd.DataFrame({'word': words, 'distance': distances})
+
+     # sort by similarity, descending, and keep the top n keywords
+     cos_df = cos_df.sort_values(by='distance', ascending=False)
+     cos_df = cos_df[:top_n]
+
+     outputs = " ".join(["#" + s for s in cos_df["word"].values])
+
+     return outputs
+
+
+ demo = gr.Interface(
+     fn=inference,
+     inputs="text",
+     outputs="text"  # return value
+ )  # setting launch(share=True) generates an externally accessible link
+
+ demo.launch()