"""Gradio demo that suggests hashtag-style keywords for a Korean text.

Pipeline:
    1. Noun n-gram candidates are built from the prompt (Okt + CountVectorizer).
    2. The prompt and every candidate are embedded with a BERT model.
    3. The candidates most similar to the prompt are returned as hashtags.
"""
import functools

import numpy as np

from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import gradio as gr
import pandas as pd

# `pip` was imported so torch/transformers could be installed at runtime via
# pip.main([...]); those calls are disabled, the import is kept as-is.
import pip

import torch
import transformers

from transformers import BertTokenizerFast
from transformers import AutoModel

# Tokenizer checkpoint name and local directory holding the saved model.
_PRETRAINED_TOKENIZER_NAME = "kykim/bert-kor-base"
_MODEL_DIR = "./bertmodel/"

# Candidate phrases are noun 2-grams and 3-grams.
_NGRAM_RANGE = (2, 3)

# Number of best-scoring candidates turned into hashtags.
_TOP_N = 5


@functools.lru_cache(maxsize=1)
def _get_okt():
    """Return a shared Okt tagger, created on first use."""
    return Okt()


def _extract_nouns(text: str) -> str:
    """Return the tokens Okt tags as 'Noun' in *text*, joined by spaces."""
    tagged = _get_okt().pos(text)
    return ' '.join(token for token, tag in tagged if tag == 'Noun')


def make_candiadte(prompt: str) -> np.ndarray:
    """Build candidate key phrases (noun n-grams) from *prompt*.

    Args:
        prompt: Text to extract candidates from.

    Returns:
        Array of candidate phrases as produced by
        ``CountVectorizer.get_feature_names_out``. The array is empty when
        the prompt does not contain enough nouns to form a single n-gram.
    """
    nouns = _extract_nouns(prompt)
    try:
        vectorizer = CountVectorizer(ngram_range=_NGRAM_RANGE).fit([nouns])
    except ValueError:
        # fit() raises ValueError when the vocabulary would be empty; treat
        # that as "no candidates" instead of failing the whole request.
        return np.array([], dtype=object)
    return vectorizer.get_feature_names_out()


# saved_model
@functools.lru_cache(maxsize=1)
def load_model():
    """Load the embedding model and its tokenizer.

    The result is cached, so the weights are read from disk once per process
    rather than once per request.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tokenizer = BertTokenizerFast.from_pretrained(_PRETRAINED_TOKENIZER_NAME)
    model = AutoModel.from_pretrained(_MODEL_DIR)
    return model, tokenizer


def _embed(text, model, tokenizer):
    """Return the model's ``pooler_output`` for *text* as a batch of one."""
    # NOTE(review): assumes config.max_position_embeddings is the longest
    # sequence the model accepts (true for BERT) - confirm for the saved
    # model. Without truncation, over-long inputs make the forward pass fail.
    max_length = getattr(model.config, "max_position_embeddings", None)
    if max_length is None:
        token_ids = tokenizer.encode(text)
    else:
        token_ids = tokenizer.encode(
            text, truncation=True, max_length=max_length
        )
    batch = torch.tensor(token_ids).unsqueeze(0)
    return model(batch)["pooler_output"]


# main
def inference(prompt: str) -> str:
    """Return hashtag keywords for *prompt*.

    Args:
        prompt: Text to summarise as hashtags.

    Returns:
        Up to ``_TOP_N`` space-separated hashtags, best match first, or an
        empty string when no candidate phrase can be extracted.
    """
    candidates = make_candiadte(prompt)
    # `candidates` is a NumPy array, so test its length rather than its truth
    # value.
    if len(candidates) == 0:
        return ""

    model, tokenizer = load_model()

    # Gradients are never needed here; disabling autograd saves memory.
    with torch.no_grad():
        doc_embedding = _embed(prompt, model, tokenizer)
        similarities = [
            torch.cosine_similarity(
                doc_embedding, _embed(candidate, model, tokenizer), dim=1
            ).item()
            for candidate in candidates
        ]

    ranked = pd.DataFrame(
        {'word': list(candidates), 'similarity': similarities}
    ).sort_values(by='similarity', ascending=False)

    # Keep only the nouns of each selected phrase.
    hashtags = []
    for phrase in ranked['word'].values[:_TOP_N]:
        nouns = _extract_nouns(phrase)
        if nouns:  # skip phrases that would yield a bare "#"
            hashtags.append("#" + nouns)
    return " ".join(hashtags)


# Keep `demo` bound to the Interface itself: the return value of launch() is
# not an Interface, so chaining .launch() here would break the call below.
demo = gr.Interface(
    fn=inference,
    inputs="text",
    outputs="text",  # the function's return value
    examples=[
        "지난해 국내 클래식계 최고 스타로 떠오른 피아니스트 임윤찬이 미국 밴 클라이번 국제콩쿠르 결선에서 연주한 라흐마니노프 피아노 협주곡 제3번 영상이 유튜브에서 조회수 1000만회를 넘겼다. 라흐마니노프 3번 연주 영상 중 단연 최고 조회수다."
    ]
)

# Passing share=True to launch() creates a link that is reachable externally.
demo.launch()